1232 files changed, 136497 insertions, 24105 deletions
diff --git a/.github/workflows/linux_builds.yml b/.github/workflows/linux_builds.yml
index 15a7be9c4f..0982d768d6 100644
--- a/.github/workflows/linux_builds.yml
+++ b/.github/workflows/linux_builds.yml
@@ -70,15 +70,32 @@ jobs:
         run: |
           ./bin/godot.linuxbsd.opt.tools.64 --test
 
+      # Download, unzip and setup SwiftShader library [d4550ab8d3f]
+      - name: Download SwiftShader
+        run: |
+          wget https://github.com/qarmin/gtk_library_store/releases/download/3.24.0/swiftshader.zip
+          unzip swiftshader.zip
+          rm swiftshader.zip
+          curr="$(pwd)/libvk_swiftshader.so"
+          sed -i "s|PATH_TO_CHANGE|$curr|" vk_swiftshader_icd.json
+
+      # Check class reference
+      - name: Check for class reference updates
+        run: |
+          echo "Running --doctool to see if this changes the public API without updating the documentation."
+          echo -e "If a diff is shown, it means that your code/doc changes are incomplete and you should update the class reference with --doctool.\n\n"
+          VK_ICD_FILENAMES=$(pwd)/vk_swiftshader_icd.json DRI_PRIME=0 xvfb-run bin/godot.linuxbsd.opt.tools.64 --doctool . 2>&1 > /dev/null || true
+          git diff --color --exit-code
+
       - uses: actions/upload-artifact@v2
         with:
           name: ${{ github.job }}
           path: bin/*
           retention-days: 14
 
-  linux-editor-sanitizers-mono:
+  linux-editor-sanitizers:
     runs-on: "ubuntu-20.04"
-    name: Editor w/ Mono and sanitizers (target=debug, tools=yes, tests=yes, use_asan=yes, use_ubsan=yes)
+    name: Editor and sanitizers (target=debug, tools=yes, tests=yes, use_asan=yes, use_ubsan=yes)
 
     steps:
       - uses: actions/checkout@v2
@@ -94,7 +111,8 @@ jobs:
       - name: Configure dependencies
         run: |
           sudo apt-get install build-essential pkg-config libx11-dev libxcursor-dev \
-            libxinerama-dev libgl1-mesa-dev libglu-dev libasound2-dev libpulse-dev libudev-dev libxi-dev libxrandr-dev yasm
+            libxinerama-dev libgl1-mesa-dev libglu-dev libasound2-dev libpulse-dev libudev-dev libxi-dev libxrandr-dev yasm \
+            xvfb wget unzip
 
       # Upload cache on completion and check it out now
       - name: Load .scons_cache directory
@@ -126,17 +144,47 @@ jobs:
           scons --version
 
       # We should always be explicit with our flags usage here since it's gonna be sure to always set those flags
+      # [Workaround] SwiftShader doesn't support tessellation, so we comment out Godot's check for it
       - name: Compilation
         env:
           SCONS_CACHE: ${{github.workspace}}/.scons_cache/
         run: |
-          scons tools=yes tests=yes target=debug module_mono_enabled=yes mono_glue=no use_asan=yes use_ubsan=yes
+          sed -i "s|ERR_FAIL_COND_V(p_rasterization_state.patch_control_points|//ERR_FAIL_COND_V(p_rasterization_state.patch_control_points|" drivers/vulkan/rendering_device_vulkan.cpp
+          scons tools=yes tests=yes target=debug debug_symbols=no use_asan=yes use_ubsan=yes
           ls -l bin/
 
       # Execute unit tests for the editor
       - name: Unit Tests
         run: |
-          ./bin/godot.linuxbsd.tools.64s.mono --test
+          ./bin/godot.linuxbsd.tools.64s --test
+
+      # Download, unzip and setup SwiftShader library [d4550ab8d3f]
+      - name: Download SwiftShader
+        run: |
+          wget https://github.com/qarmin/gtk_library_store/releases/download/3.24.0/swiftshader.zip
+          unzip swiftshader.zip
+          rm swiftshader.zip
+          curr="$(pwd)/libvk_swiftshader.so"
+          sed -i "s|PATH_TO_CHANGE|$curr|" vk_swiftshader_icd.json
+
+      # Download and extract the zip archive with the test project; the folder is renamed so the project used can be changed easily
+      - name: Download test project
+        run: |
+          wget https://github.com/qarmin/RegressionTestProject/archive/4.0.zip
+          unzip 4.0.zip
+          mv "RegressionTestProject-4.0" "test_project"
+
+      # The editor is quite a complicated piece of software, so it is easy to introduce bugs here
+      - name: Open and close editor
+        run: |
+          VK_ICD_FILENAMES=$(pwd)/vk_swiftshader_icd.json DRI_PRIME=0 xvfb-run bin/godot.linuxbsd.tools.64s --audio-driver Dummy -e -q --path test_project 2>&1 | tee sanitizers_log.txt || true
+          misc/scripts/check_ci_log.py sanitizers_log.txt
+
+      # Run test project
+      - name: Run project
+        run: |
+          VK_ICD_FILENAMES=$(pwd)/vk_swiftshader_icd.json DRI_PRIME=0 xvfb-run bin/godot.linuxbsd.tools.64s 40 --audio-driver Dummy --path test_project 2>&1 | tee sanitizers_log.txt || true
+          misc/scripts/check_ci_log.py sanitizers_log.txt
 
   linux-template-mono:
     runs-on: "ubuntu-20.04"
diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt
index 6684978318..9e63da3fc4 100644
--- a/COPYRIGHT.txt
+++ b/COPYRIGHT.txt
@@ -89,6 +89,8 @@ Files: ./servers/physics_3d/gjk_epa.cpp
  ./servers/physics_3d/joints/slider_joint_3d_sw.h
  ./servers/physics_3d/soft_body_3d_sw.cpp
  ./servers/physics_3d/soft_body_3d_sw.h
+ ./servers/physics_3d/shape_3d_sw.cpp
+ ./servers/physics_3d/shape_3d_sw.h
 Comment: Bullet Continuous Collision Detection and Physics Library
 Copyright: 2003-2008, Erwin Coumans
  2007-2021, Juan Linietsky, Ariel Manzur.
@@ -134,10 +136,10 @@ Comment: ENet
 Copyright: 2002-2020, Lee Salzman
 License: Expat
 
-Files: ./thirdparty/etc2comp/
-Comment: Etc2Comp
-Copyright: 2015, Etc2Comp Authors
-License: Apache-2.0
+Files: ./thirdparty/etcpak/
+Comment: etcpak
+Copyright: 2013-2021, Bartosz Taudul
+License: BSD-3-clause
 
 Files: ./thirdparty/fonts/DroidSans*.ttf
 Comment: DroidSans font
@@ -196,7 +198,7 @@ License: HarfBuzz
 
 Files: ./thirdparty/icu4c/
 Comment: International Components for Unicode
-Copyright: 1991-2020, Unicode
+Copyright: 1991-2021, Unicode
 License: Unicode
 
 Files: ./thirdparty/jpeg-compressor/
@@ -261,7 +263,7 @@ License: Apache-2.0
 
 Files: ./thirdparty/meshoptimizer/
 Comment: meshoptimizer
-Copyright: 2016-2020, Arseny Kapoulkine
+Copyright: 2016-2021, Arseny Kapoulkine
 License: Expat
 
 Files: ./thirdparty/minimp3/
diff --git a/SConstruct b/SConstruct
index 3edf81129b..2d9802f293 100644
--- a/SConstruct
+++ b/SConstruct
@@ -137,6 +137,7 @@ opts.Add("extra_suffix", "Custom extra suffix added to the base filename of all
 opts.Add(BoolVariable("vsproj", "Generate a Visual Studio solution", False))
 opts.Add(BoolVariable("disable_3d", "Disable 3D nodes for a smaller executable", False))
 opts.Add(BoolVariable("disable_advanced_gui", "Disable advanced GUI nodes and behaviors", False))
+opts.Add(BoolVariable("modules_enabled_by_default", "If no, disable all modules except ones explicitly enabled", True))
 opts.Add(BoolVariable("no_editor_splash", "Don't use the custom splash screen for the editor", False))
 opts.Add("system_certs_path", "Use this path as SSL certificates default for editor (for package maintainers)", "")
 opts.Add(BoolVariable("use_precise_math_checks", "Math checks use very precise epsilon (debug option)", False))
@@ -259,16 +260,21 @@ for path in module_search_paths:
 
 # Add module options.
 for name, path in modules_detected.items():
-    enabled = True
-    sys.path.insert(0, path)
-    import config
-
-    try:
-        enabled = config.is_enabled()
-    except AttributeError:
-        pass
-    sys.path.remove(path)
-    sys.modules.pop("config")
+    if env_base["modules_enabled_by_default"]:
+        enabled = True
+
+        sys.path.insert(0, path)
+        import config
+
+        try:
+            enabled = config.is_enabled()
+        except AttributeError:
+            pass
+        sys.path.remove(path)
+        sys.modules.pop("config")
+    else:
+        enabled = False
+
     opts.Add(BoolVariable("module_" + name + "_enabled", "Enable module '%s'" % (name,), enabled))
 
 methods.write_modules(modules_detected)
diff --git a/core/config/project_settings.cpp b/core/config/project_settings.cpp
index f87dc6704e..25dd408dce 100644
--- a/core/config/project_settings.cpp
+++ b/core/config/project_settings.cpp
@@ -467,16 +467,17 @@ Error ProjectSettings::_setup(const String &p_path, const String &p_main_pack, b
 	d->change_dir(p_path);
 
 	String current_dir = d->get_current_dir();
-	String candidate = current_dir;
 	bool found = false;
 	Error err;
 
 	while (true) {
+		// Set the resource path early so things can be resolved when loading.
+		resource_path = current_dir;
+		resource_path = resource_path.replace("\\", "/"); // Windows path to Unix path just in case.
 		err = _load_settings_text_or_binary(current_dir.plus_file("project.godot"), current_dir.plus_file("project.binary"));
 		if (err == OK) {
 			// Optional, we don't mind if it fails.
 			_load_settings_text(current_dir.plus_file("override.cfg"));
-			candidate = current_dir;
 			found = true;
 			break;
 		}
@@ -493,8 +494,6 @@ Error ProjectSettings::_setup(const String &p_path, const String &p_main_pack, b
 		}
 	}
 
-	resource_path = candidate;
-	resource_path = resource_path.replace("\\", "/"); // Windows path to Unix path just in case.
 	memdelete(d);
 
 	if (!found) {
diff --git a/core/core_bind.cpp b/core/core_bind.cpp
index c3d547c2c7..84d8d0d4d3 100644
--- a/core/core_bind.cpp
+++ b/core/core_bind.cpp
@@ -1334,7 +1334,7 @@ Vector<uint8_t> _File::get_buffer(int p_length) const {
 	ERR_FAIL_COND_V(len < 0, Vector<uint8_t>());
 
 	if (len < p_length) {
-		data.resize(p_length);
+		data.resize(len);
 	}
 
 	return data;
diff --git a/core/input/input.cpp b/core/input/input.cpp
index 627944210f..2304c05bf8 100644
--- a/core/input/input.cpp
+++ b/core/input/input.cpp
@@ -1329,9 +1329,10 @@ void Input::add_joy_mapping(String p_mapping, bool p_update_existing) {
 	if (p_update_existing) {
 		Vector<String> entry = p_mapping.split(",");
 		String uid = entry[0];
-		for (int i = 0; i < joy_names.size(); i++) {
-			if (uid == joy_names[i].uid) {
-				joy_names[i].mapping = map_db.size() - 1;
+		for (Map<int, Joypad>::Element *E = joy_names.front(); E; E = E->next()) {
+			Joypad &joy = E->get();
+			if (joy.uid == uid) {
+				joy.mapping = map_db.size() - 1;
 			}
 		}
 	}
@@ -1343,9 +1344,10 @@ void Input::remove_joy_mapping(String p_guid) {
 			map_db.remove(i);
 		}
 	}
-	for (int i = 0; i < joy_names.size(); i++) {
-		if (joy_names[i].uid == p_guid) {
-			joy_names[i].mapping = -1;
+	for (Map<int, Joypad>::Element *E = joy_names.front(); E; E = E->next()) {
+		Joypad &joy = E->get();
+		if (joy.uid == p_guid) {
+			joy.mapping = -1;
 		}
 	}
 }
@@ -1361,8 +1363,13 @@ void Input::set_fallback_mapping(String p_guid) {
 
 //platforms that use the remapping system can override and call to these ones
 bool Input::is_joy_known(int p_device) {
-	int mapping = joy_names[p_device].mapping;
-	return mapping != -1 ? (mapping != fallback_mapping) : false;
+	if (joy_names.has(p_device)) {
+		int mapping = joy_names[p_device].mapping;
+		if (mapping != -1 && mapping != fallback_mapping) {
+			return true;
+		}
+	}
+	return false;
 }
 
 String Input::get_joy_guid(int p_device) const {
diff --git a/core/input/input_event.h b/core/input/input_event.h
index a1e7df5969..94aa68db33 100644
--- a/core/input/input_event.h
+++ b/core/input/input_event.h
@@ -33,7 +33,6 @@
 
 #include "core/io/resource.h"
 #include "core/math/transform_2d.h"
-#include "core/os/copymem.h"
 #include "core/string/ustring.h"
 #include "core/typedefs.h"
 
diff --git a/core/input/input_map.cpp b/core/input/input_map.cpp
index 7d85fd6492..aab4e6593c 100644
--- a/core/input/input_map.cpp
+++ b/core/input/input_map.cpp
@@ -54,8 +54,36 @@ void InputMap::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("load_from_project_settings"), &InputMap::load_from_project_settings);
 }
 
+/**
+ * Returns a nonexistent-action error message, with a suggestion of the
+ * closest matching action name appended when one is similar enough.
+ */
+String InputMap::_suggest_actions(const StringName &p_action) const {
+	List<StringName> actions = get_actions();
+	StringName closest_action;
+	float closest_similarity = 0.0;
+
+	// Find the action with the most similar name.
+	for (List<StringName>::Element *E = actions.front(); E; E = E->next()) {
+		const float similarity = String(E->get()).similarity(p_action);
+
+		if (similarity > closest_similarity) {
+			closest_action = E->get();
+			closest_similarity = similarity;
+		}
+	}
+
+	String error_message = vformat("The InputMap action \"%s\" doesn't exist.", p_action);
+
+	if (closest_similarity >= 0.4) {
+		// Only include a suggestion in the error message if it's similar enough.
+		error_message += vformat(" Did you mean \"%s\"?", closest_action);
+	}
+	return error_message;
+}
+
 void InputMap::add_action(const StringName &p_action, float p_deadzone) {
-	ERR_FAIL_COND_MSG(input_map.has(p_action), "InputMap already has action '" + String(p_action) + "'.");
+	ERR_FAIL_COND_MSG(input_map.has(p_action), "InputMap already has action \"" + String(p_action) + "\".");
 	input_map[p_action] = Action();
 	static int last_id = 1;
 	input_map[p_action].id = last_id;
@@ -64,7 +92,8 @@ void InputMap::add_action(const StringName &p_action, float p_deadzone) {
 }
 
 void InputMap::erase_action(const StringName &p_action) {
-	ERR_FAIL_COND_MSG(!input_map.has(p_action), "Request for nonexistent InputMap action '" + String(p_action) + "'.");
+	ERR_FAIL_COND_MSG(!input_map.has(p_action), _suggest_actions(p_action));
+
 	input_map.erase(p_action);
 }
 
@@ -122,20 +151,20 @@ bool InputMap::has_action(const StringName &p_action) const {
 }
 
 float InputMap::action_get_deadzone(const StringName &p_action) {
-	ERR_FAIL_COND_V_MSG(!input_map.has(p_action), 0.0f, "Request for nonexistent InputMap action '" + String(p_action) + "'.");
+	ERR_FAIL_COND_V_MSG(!input_map.has(p_action), 0.0f, _suggest_actions(p_action));
 
 	return input_map[p_action].deadzone;
 }
 
 void InputMap::action_set_deadzone(const StringName &p_action, float p_deadzone) {
-	ERR_FAIL_COND_MSG(!input_map.has(p_action), "Request for nonexistent InputMap action '" + String(p_action) + "'.");
+	ERR_FAIL_COND_MSG(!input_map.has(p_action), _suggest_actions(p_action));
 
 	input_map[p_action].deadzone = p_deadzone;
 }
 
 void InputMap::action_add_event(const StringName &p_action, const Ref<InputEvent> &p_event) {
 	ERR_FAIL_COND_MSG(p_event.is_null(), "It's not a reference to a valid InputEvent object.");
-	ERR_FAIL_COND_MSG(!input_map.has(p_action), "Request for nonexistent InputMap action '" + String(p_action) + "'.");
+	ERR_FAIL_COND_MSG(!input_map.has(p_action), _suggest_actions(p_action));
 	if (_find_event(input_map[p_action], p_event, true)) {
 		return; // Already addded.
 	}
@@ -144,12 +173,12 @@ void InputMap::action_add_event(const StringName &p_action, const Ref<InputEvent
 }
 
 bool InputMap::action_has_event(const StringName &p_action, const Ref<InputEvent> &p_event) {
-	ERR_FAIL_COND_V_MSG(!input_map.has(p_action), false, "Request for nonexistent InputMap action '" + String(p_action) + "'.");
+	ERR_FAIL_COND_V_MSG(!input_map.has(p_action), false, _suggest_actions(p_action));
 	return (_find_event(input_map[p_action], p_event, true) != nullptr);
 }
 
 void InputMap::action_erase_event(const StringName &p_action, const Ref<InputEvent> &p_event) {
-	ERR_FAIL_COND_MSG(!input_map.has(p_action), "Request for nonexistent InputMap action '" + String(p_action) + "'.");
+	ERR_FAIL_COND_MSG(!input_map.has(p_action), _suggest_actions(p_action));
 
 	List<Ref<InputEvent>>::Element *E = _find_event(input_map[p_action], p_event, true);
 	if (E) {
@@ -161,7 +190,7 @@ void InputMap::action_erase_event(const StringName &p_action, const Ref<InputEve
 }
 
 void InputMap::action_erase_events(const StringName &p_action) {
-	ERR_FAIL_COND_MSG(!input_map.has(p_action), "Request for nonexistent InputMap action '" + String(p_action) + "'.");
+	ERR_FAIL_COND_MSG(!input_map.has(p_action), _suggest_actions(p_action));
 
 	input_map[p_action].inputs.clear();
 }
@@ -193,7 +222,7 @@ bool InputMap::event_is_action(const Ref<InputEvent> &p_event, const StringName
 
 bool InputMap::event_get_action_status(const Ref<InputEvent> &p_event, const StringName &p_action, bool p_exact_match, bool *p_pressed, float *p_strength, float *p_raw_strength) const {
 	OrderedHashMap<StringName, Action>::Element E = input_map.find(p_action);
-	ERR_FAIL_COND_V_MSG(!E, false, "Request for nonexistent InputMap action '" + String(p_action) + "'.");
+	ERR_FAIL_COND_V_MSG(!E, false, _suggest_actions(p_action));
 
 	Ref<InputEventAction> input_event_action = p_event;
 	if (input_event_action.is_valid()) {
diff --git a/core/input/input_map.h b/core/input/input_map.h
index 99c71e1e53..0e0567464a 100644
--- a/core/input/input_map.h
+++ b/core/input/input_map.h
@@ -61,6 +61,7 @@ private:
 
 	Array _action_get_events(const StringName &p_action);
 	Array _get_actions();
+	String _suggest_actions(const StringName &p_action) const;
 
 protected:
 	static void _bind_methods();
diff --git a/core/io/compression.cpp b/core/io/compression.cpp
index 980234cbfc..6de626db99 100644
--- a/core/io/compression.cpp
+++ b/core/io/compression.cpp
@@ -32,7 +32,6 @@
 
 #include "core/config/project_settings.h"
 #include "core/io/zip_io.h"
-#include "core/os/copymem.h"
 
 #include "thirdparty/misc/fastlz.h"
 
@@ -44,8 +43,8 @@ int Compression::compress(uint8_t *p_dst, const uint8_t *p_src, int p_src_size,
 		case MODE_FASTLZ: {
 			if (p_src_size < 16) {
 				uint8_t src[16];
-				zeromem(&src[p_src_size], 16 - p_src_size);
-				copymem(src, p_src, p_src_size);
+				memset(&src[p_src_size], 0, 16 - p_src_size);
+				memcpy(src, p_src, p_src_size);
 				return fastlz_compress(src, 16, p_dst);
 			} else {
 				return fastlz_compress(p_src, p_src_size, p_dst);
@@ -136,7 +135,7 @@ int Compression::decompress(uint8_t *p_dst, int p_dst_max_size, const uint8_t *p
 			if (p_dst_max_size < 16) {
 				uint8_t dst[16];
 				ret_size = fastlz_decompress(p_src, p_src_size, dst, 16);
-				copymem(p_dst, dst, p_dst_max_size);
+				memcpy(p_dst, dst, p_dst_max_size);
 			} else {
 				ret_size = fastlz_decompress(p_src, p_src_size, p_dst, p_dst_max_size);
 			}
diff --git a/core/io/file_access_encrypted.cpp b/core/io/file_access_encrypted.cpp
index 8ace897f18..13377a3a25 100644
--- a/core/io/file_access_encrypted.cpp
+++ b/core/io/file_access_encrypted.cpp
@@ -31,7 +31,6 @@
 #include "file_access_encrypted.h"
 
 #include "core/crypto/crypto_core.h"
-#include "core/os/copymem.h"
 #include "core/string/print_string.h"
 #include "core/variant/variant.h"
 
@@ -151,7 +150,7 @@ void FileAccessEncrypted::_release() {
 		ERR_FAIL_COND(CryptoCore::md5(data.ptr(), data.size(), hash) != OK); // Bug?
 
 		compressed.resize(len);
-		zeromem(compressed.ptrw(), len);
+		memset(compressed.ptrw(), 0, len);
 		for (int i = 0; i < data.size(); i++) {
 			compressed.write[i] = data[i];
 		}
diff --git a/core/io/file_access_memory.cpp b/core/io/file_access_memory.cpp
index 58670d5246..af155a77a8 100644
--- a/core/io/file_access_memory.cpp
+++ b/core/io/file_access_memory.cpp
@@ -31,7 +31,6 @@
 #include "file_access_memory.h"
 
 #include "core/config/project_settings.h"
-#include "core/os/copymem.h"
 #include "core/os/dir_access.h"
 #include "core/templates/map.h"
 
@@ -149,7 +148,7 @@ int FileAccessMemory::get_buffer(uint8_t *p_dst, int p_length) const {
 		WARN_PRINT("Reading less data than requested");
 	}
 
-	copymem(p_dst, &data[pos], read);
+	memcpy(p_dst, &data[pos], read);
 	pos += p_length;
 
 	return read;
@@ -176,6 +175,6 @@ void FileAccessMemory::store_buffer(const uint8_t *p_src, int p_length) {
 		WARN_PRINT("Writing less data than requested");
 	}
 
-	copymem(&data[pos], p_src, write);
+	memcpy(&data[pos], p_src, write);
 	pos += p_length;
 }
diff --git a/core/io/file_access_pack.h b/core/io/file_access_pack.h
index 343adbe592..955108f455 100644
--- a/core/io/file_access_pack.h
+++ b/core/io/file_access_pack.h
@@ -36,6 +36,7 @@
 #include "core/string/print_string.h"
 #include "core/templates/list.h"
 #include "core/templates/map.h"
+#include "core/templates/set.h"
 
 // Godot's packed file magic header ("GDPC" in ASCII).
 #define PACK_HEADER_MAGIC 0x43504447
diff --git a/core/io/file_access_zip.cpp b/core/io/file_access_zip.cpp
index 586c988974..397b577612 100644
--- a/core/io/file_access_zip.cpp
+++ b/core/io/file_access_zip.cpp
@@ -32,7 +32,6 @@
 
 #include "file_access_zip.h"
 
-#include "core/os/copymem.h"
 #include "core/os/file_access.h"
 
 ZipArchive *ZipArchive::instance = nullptr;
@@ -120,7 +119,7 @@ unzFile ZipArchive::get_file_handle(String p_file) const {
 	ERR_FAIL_COND_V_MSG(!f, nullptr, "Cannot open file '" + packages[file.package].filename + "'.");
 
 	zlib_filefunc_def io;
-	zeromem(&io, sizeof(io));
+	memset(&io, 0, sizeof(io));
 
 	io.opaque = f;
 	io.zopen_file = godot_open;
diff --git a/core/io/http_client.cpp b/core/io/http_client.cpp
index 3863dce0f6..4b053d576c 100644
--- a/core/io/http_client.cpp
+++ b/core/io/http_client.cpp
@@ -633,7 +633,7 @@ PackedByteArray HTTPClient::read_response_body_chunk() {
 
 					ret.resize(chunk.size() - 2);
 					uint8_t *w = ret.ptrw();
-					copymem(w, chunk.ptr(), chunk.size() - 2);
+					memcpy(w, chunk.ptr(), chunk.size() - 2);
 					chunk.clear();
 				}
 
diff --git a/core/io/image.cpp b/core/io/image.cpp
index 873eb66f33..c36fa6e45f 100644
--- a/core/io/image.cpp
+++ b/core/io/image.cpp
@@ -34,7 +34,6 @@
 #include "core/io/image_loader.h"
 #include "core/io/resource_loader.h"
 #include "core/math/math_funcs.h"
-#include "core/os/copymem.h"
 #include "core/string/print_string.h"
 #include "core/templates/hash_map.h"
 
@@ -1537,7 +1536,7 @@ void Image::shrink_x2() {
 			uint8_t *w = new_img.ptrw();
 			const uint8_t *r = data.ptr();
 
-			copymem(w, &r[ofs], new_size);
+			memcpy(w, &r[ofs], new_size);
 		}
 
 		width = MAX(width / 2, 1);
@@ -1932,7 +1931,7 @@ Error Image::generate_mipmap_roughness(RoughnessChannel p_roughness_channel, con
 
 
 			uint8_t* wr = imgdata.ptrw();
-			copymem(wr.ptr(), ptr, size);
+			memcpy(wr.ptr(), ptr, size);
 			wr = uint8_t*();
 			Ref<Image> im;
 			im.instance();
@@ -1982,7 +1981,7 @@ void Image::create(int p_width, int p_height, bool p_use_mipmaps, Format p_forma
 
 	{
 		uint8_t *w = data.ptrw();
-		zeromem(w, size);
+		memset(w, 0, size);
 	}
 
 	width = p_width;
@@ -3295,7 +3294,7 @@ Ref<Image> Image::get_image_from_mipmap(int p_mipamp) const {
 	{
 		uint8_t *wr = new_data.ptrw();
 		const uint8_t *rd = data.ptr();
-		copymem(wr, rd + ofs, size);
+		memcpy(wr, rd + ofs, size);
 	}
 
 	Ref<Image> image;
@@ -3622,5 +3621,5 @@ Ref<Resource> Image::duplicate(bool p_subresources) const {
 }
 
 void Image::set_as_black() {
-	zeromem(data.ptrw(), data.size());
+	memset(data.ptrw(), 0, data.size());
 }
diff --git a/core/io/marshalls.cpp b/core/io/marshalls.cpp
index 218a612da2..0282609270 100644
--- a/core/io/marshalls.cpp
+++ b/core/io/marshalls.cpp
@@ -851,7 +851,7 @@ static void _encode_string(const String &p_string, uint8_t *&buf, int &r_len) {
 	if (buf) {
 		encode_uint32(utf8.length(), buf);
 		buf += 4;
-		copymem(buf, utf8.get_data(), utf8.length());
+		memcpy(buf, utf8.get_data(), utf8.length());
 		buf += utf8.length();
 	}
 
@@ -995,7 +995,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				if (buf) {
 					encode_uint32(utf8.length(), buf);
 					buf += 4;
-					copymem(buf, utf8.get_data(), utf8.length());
+					memcpy(buf, utf8.get_data(), utf8.length());
 					buf += pad + utf8.length();
 				}
 
@@ -1079,7 +1079,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				Transform2D val = p_variant;
 				for (int i = 0; i < 3; i++) {
 					for (int j = 0; j < 2; j++) {
-						copymem(&buf[(i * 2 + j) * 4], &val.elements[i][j], sizeof(float));
+						memcpy(&buf[(i * 2 + j) * 4], &val.elements[i][j], sizeof(float));
 					}
 				}
 			}
@@ -1130,7 +1130,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				Basis val = p_variant;
 				for (int i = 0; i < 3; i++) {
 					for (int j = 0; j < 3; j++) {
-						copymem(&buf[(i * 3 + j) * 4], &val.elements[i][j], sizeof(float));
+						memcpy(&buf[(i * 3 + j) * 4], &val.elements[i][j], sizeof(float));
 					}
 				}
 			}
@@ -1143,7 +1143,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				Transform val = p_variant;
 				for (int i = 0; i < 3; i++) {
 					for (int j = 0; j < 3; j++) {
-						copymem(&buf[(i * 3 + j) * 4], &val.basis.elements[i][j], sizeof(float));
+						memcpy(&buf[(i * 3 + j) * 4], &val.basis.elements[i][j], sizeof(float));
 					}
 				}
 
@@ -1258,7 +1258,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				if (buf) {
 					encode_uint32(utf8.length()+1,buf);
 					buf+=4;
-					copymem(buf,utf8.get_data(),utf8.length()+1);
+					memcpy(buf,utf8.get_data(),utf8.length()+1);
 				}
 
 				r_len+=4+utf8.length()+1;
@@ -1314,7 +1314,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				encode_uint32(datalen, buf);
 				buf += 4;
 				const uint8_t *r = data.ptr();
-				copymem(buf, &r[0], datalen * datasize);
+				memcpy(buf, &r[0], datalen * datasize);
 				buf += datalen * datasize;
 			}
 
@@ -1412,7 +1412,7 @@ Error encode_variant(const Variant &p_variant, uint8_t *r_buffer, int &r_len, bo
 				if (buf) {
 					encode_uint32(utf8.length() + 1, buf);
 					buf += 4;
-					copymem(buf, utf8.get_data(), utf8.length() + 1);
+					memcpy(buf, utf8.get_data(), utf8.length() + 1);
 					buf += utf8.length() + 1;
 				}
 
diff --git a/core/io/multiplayer_api.cpp b/core/io/multiplayer_api.cpp
index 94060cfe0b..8414ee7c0c 100644
--- a/core/io/multiplayer_api.cpp
+++ b/core/io/multiplayer_api.cpp
@@ -897,7 +897,7 @@ void MultiplayerAPI::_send_rpc(Node *p_from, int p_to, bool p_unreliable, bool p
 			// Special optimization when only the byte vector is sent.
 			const Vector<uint8_t> data = *p_arg[0];
 			MAKE_ROOM(ofs + data.size());
-			copymem(&(packet_cache.write[ofs]), data.ptr(), sizeof(uint8_t) * data.size());
+			memcpy(&(packet_cache.write[ofs]), data.ptr(), sizeof(uint8_t) * data.size());
 			ofs += data.size();
 		} else {
 			// Arguments
diff --git a/core/io/net_socket.h b/core/io/net_socket.h
index bc09477693..a632ad2ea7 100644
--- a/core/io/net_socket.h
+++ b/core/io/net_socket.h
@@ -67,6 +67,7 @@ public:
 
 	virtual bool is_open() const = 0;
 	virtual int get_available_bytes() const = 0;
+	virtual Error get_socket_address(IP_Address *r_ip, uint16_t *r_port) const = 0;
 
 	virtual Error set_broadcasting_enabled(bool p_enabled) = 0; // Returns OK if the socket option has been set successfully.
 	virtual void set_blocking_enabled(bool p_enabled) = 0;
diff --git a/core/io/packed_data_container.cpp b/core/io/packed_data_container.cpp
index a0b97772e6..c6354b11b7 100644
--- a/core/io/packed_data_container.cpp
+++ b/core/io/packed_data_container.cpp
@@ -317,7 +317,7 @@ Error PackedDataContainer::pack(const Variant &p_data) {
 	datalen = tmpdata.size();
 	data.resize(tmpdata.size());
 	uint8_t *w = data.ptrw();
-	copymem(w, tmpdata.ptr(), tmpdata.size());
+	memcpy(w, tmpdata.ptr(), tmpdata.size());
 
 	return OK;
 }
diff --git a/core/io/packet_peer_udp.cpp b/core/io/packet_peer_udp.cpp
index 3f46f2706e..40e4ce4f77 100644
--- a/core/io/packet_peer_udp.cpp
+++ b/core/io/packet_peer_udp.cpp
@@ -159,10 +159,11 @@ int PacketPeerUDP::get_max_packet_size() const {
 	return 512; // uhm maybe not
 }
 
-Error PacketPeerUDP::listen(int p_port, const IP_Address &p_bind_address, int p_recv_buffer_size) {
+Error PacketPeerUDP::bind(int p_port, const IP_Address &p_bind_address, int p_recv_buffer_size) {
 	ERR_FAIL_COND_V(!_sock.is_valid(), ERR_UNAVAILABLE);
 	ERR_FAIL_COND_V(_sock->is_open(), ERR_ALREADY_IN_USE);
 	ERR_FAIL_COND_V(!p_bind_address.is_valid() && !p_bind_address.is_wildcard(), ERR_INVALID_PARAMETER);
+	ERR_FAIL_COND_V_MSG(p_port < 0 || p_port > 65535, ERR_INVALID_PARAMETER, "The local port number must be between 0 and 65535 (inclusive).");
 
 	Error err;
 	IP::Type ip_type = IP::TYPE_ANY;
@@ -210,6 +211,7 @@ Error PacketPeerUDP::connect_to_host(const IP_Address &p_host, int p_port) {
 	ERR_FAIL_COND_V(udp_server, ERR_LOCKED);
 	ERR_FAIL_COND_V(!_sock.is_valid(), ERR_UNAVAILABLE);
 	ERR_FAIL_COND_V(!p_host.is_valid(), ERR_INVALID_PARAMETER);
+	ERR_FAIL_COND_V_MSG(p_port < 1 || p_port > 65535, ERR_INVALID_PARAMETER, "The remote port number must be between 1 and 65535 (inclusive).");
 
 	Error err;
 
@@ -316,7 +318,7 @@ Error PacketPeerUDP::store_packet(IP_Address p_ip, uint32_t p_port, uint8_t *p_b
 	return OK;
 }
 
-bool PacketPeerUDP::is_listening() const {
+bool PacketPeerUDP::is_bound() const {
 	return _sock.is_valid() && _sock->is_open();
 }
 
@@ -328,6 +330,12 @@ int PacketPeerUDP::get_packet_port() const {
 	return packet_port;
 }
 
+int PacketPeerUDP::get_local_port() const {
+	uint16_t local_port = 0; // Defined result even if the lookup below does not fill in a port.
+	_sock->get_socket_address(nullptr, &local_port); // Only the port is requested; no IP out-parameter.
+	return local_port;
+}
+
 void PacketPeerUDP::set_dest_address(const IP_Address &p_address, int p_port) {
 	ERR_FAIL_COND_MSG(connected, "Destination address cannot be set for connected sockets");
 	peer_addr = p_address;
@@ -335,14 +343,15 @@ void PacketPeerUDP::set_dest_address(const IP_Address &p_address, int p_port) {
 }
 
 void PacketPeerUDP::_bind_methods() {
-	ClassDB::bind_method(D_METHOD("listen", "port", "bind_address", "recv_buf_size"), &PacketPeerUDP::listen, DEFVAL("*"), DEFVAL(65536));
+	ClassDB::bind_method(D_METHOD("bind", "port", "bind_address", "recv_buf_size"), &PacketPeerUDP::bind, DEFVAL("*"), DEFVAL(65536));
 	ClassDB::bind_method(D_METHOD("close"), &PacketPeerUDP::close);
 	ClassDB::bind_method(D_METHOD("wait"), &PacketPeerUDP::wait);
-	ClassDB::bind_method(D_METHOD("is_listening"), &PacketPeerUDP::is_listening);
+	ClassDB::bind_method(D_METHOD("is_bound"), &PacketPeerUDP::is_bound);
 	ClassDB::bind_method(D_METHOD("connect_to_host", "host", "port"), &PacketPeerUDP::connect_to_host);
 	ClassDB::bind_method(D_METHOD("is_connected_to_host"), &PacketPeerUDP::is_connected_to_host);
 	ClassDB::bind_method(D_METHOD("get_packet_ip"), &PacketPeerUDP::_get_packet_ip);
 	ClassDB::bind_method(D_METHOD("get_packet_port"), &PacketPeerUDP::get_packet_port);
+	ClassDB::bind_method(D_METHOD("get_local_port"), &PacketPeerUDP::get_local_port);
 	ClassDB::bind_method(D_METHOD("set_dest_address", "host", "port"), &PacketPeerUDP::_set_dest_address);
 	ClassDB::bind_method(D_METHOD("set_broadcast_enabled", "enabled"), &PacketPeerUDP::set_broadcast_enabled);
 	ClassDB::bind_method(D_METHOD("join_multicast_group", "multicast_address", "interface_name"), &PacketPeerUDP::join_multicast_group);
diff --git a/core/io/packet_peer_udp.h b/core/io/packet_peer_udp.h
index 4bac6994fc..b9d11c465c 100644
--- a/core/io/packet_peer_udp.h
+++ b/core/io/packet_peer_udp.h
@@ -70,10 +70,10 @@ protected:
 public:
 	void set_blocking_mode(bool p_enable);
 
-	Error listen(int p_port, const IP_Address &p_bind_address = IP_Address("*"), int p_recv_buffer_size = 65536);
+	Error bind(int p_port, const IP_Address &p_bind_address = IP_Address("*"), int p_recv_buffer_size = 65536);
 	void close();
 	Error wait();
-	bool is_listening() const;
+	bool is_bound() const;
 
 	Error connect_shared_socket(Ref<NetSocket> p_sock, IP_Address p_ip, uint16_t p_port, UDPServer *ref); // Used by UDPServer
 	void disconnect_shared_socket(); // Used by UDPServer
@@ -83,6 +83,7 @@ public:
 
 	IP_Address get_packet_address() const;
 	int get_packet_port() const;
+	int get_local_port() const;
 	void set_dest_address(const IP_Address &p_address, int p_port);
 
 	Error put_packet(const uint8_t *p_buffer, int p_buffer_size) override;
diff --git a/core/io/resource.cpp b/core/io/resource.cpp
index 8560e2abc7..d46e9edafa 100644
--- a/core/io/resource.cpp
+++ b/core/io/resource.cpp
@@ -110,6 +110,12 @@ String Resource::get_name() const {
 	return name;
 }
 
+void Resource::update_configuration_warning() {
+	if (_update_configuration_warning) { // Static function-pointer hook; skipped while it is still null.
+		_update_configuration_warning(); // Forward the request to the installed hook.
+	}
+}
+
 bool Resource::editor_can_reload_from_file() {
 	return true; //by default yes
 }
@@ -320,6 +326,7 @@ void Resource::setup_local_to_scene() {
 }
 
 Node *(*Resource::_get_local_scene_func)() = nullptr;
+void (*Resource::_update_configuration_warning)() = nullptr;
 
 void Resource::set_as_translation_remapped(bool p_remapped) {
 	if (remapped_list.in_list() == p_remapped) {
diff --git a/core/io/resource.h b/core/io/resource.h
index ae18ac0c8a..75a9f928f8 100644
--- a/core/io/resource.h
+++ b/core/io/resource.h
@@ -88,7 +88,9 @@ protected:
 
 public:
 	static Node *(*_get_local_scene_func)(); //used by editor
+	static void (*_update_configuration_warning)(); //used by editor
 
+	void update_configuration_warning();
 	virtual bool editor_can_reload_from_file();
 	virtual void reset_state(); //for resources that use variable amount of properties, either via _validate_property or _get_property_list, this function needs to be implemented to correctly clear state
 	virtual Error copy_from(const Ref<Resource> &p_resource);
diff --git a/core/io/resource_importer.cpp b/core/io/resource_importer.cpp
index 5ca0eb884a..b503655edd 100644
--- a/core/io/resource_importer.cpp
+++ b/core/io/resource_importer.cpp
@@ -192,6 +192,34 @@ bool ResourceFormatImporter::recognize_path(const String &p_path, const String &
 	return FileAccess::exists(p_path + ".import");
 }
 
+Error ResourceFormatImporter::get_import_order_threads_and_importer(const String &p_path, int &r_order, bool &r_can_threads, String &r_importer) const {
+	// Reset every output first so callers see defined values when no importer is found.
+	r_order = 0;
+	r_can_threads = false;
+	r_importer = "";
+
+	Ref<ResourceImporter> found;
+	if (!FileAccess::exists(p_path + ".import")) {
+		// No ".import" file next to the source: choose the importer by file extension.
+		found = get_importer_by_extension(p_path.get_extension().to_lower());
+	} else {
+		// A ".import" file exists: use the importer it names, if it can be read.
+		PathAndType info;
+		if (_get_path_and_type(p_path, info) == OK) {
+			found = get_importer_by_name(info.importer);
+		}
+	}
+
+	if (!found.is_valid()) {
+		return ERR_INVALID_PARAMETER;
+	}
+
+	r_order = found->get_import_order();
+	r_importer = found->get_importer_name();
+	r_can_threads = found->can_import_threaded();
+	return OK;
+}
+
 int ResourceFormatImporter::get_import_order(const String &p_path) const {
 	Ref<ResourceImporter> importer;
 
diff --git a/core/io/resource_importer.h b/core/io/resource_importer.h
index eeb486073e..a14d6ba52c 100644
--- a/core/io/resource_importer.h
+++ b/core/io/resource_importer.h
@@ -72,6 +72,8 @@ public:
 
 	virtual int get_import_order(const String &p_path) const;
 
+	Error get_import_order_threads_and_importer(const String &p_path, int &r_order, bool &r_can_threads, String &r_importer) const;
+
 	String get_internal_resource_path(const String &p_path) const;
 	void get_internal_resource_path_list(const String &p_path, List<String> *r_paths);
 
@@ -126,6 +128,9 @@ public:
 	virtual String get_option_group_file() const { return String(); }
 
 	virtual Error import(const String &p_source_file, const String &p_save_path, const Map<StringName, Variant> &p_options, List<String> *r_platform_variants, List<String> *r_gen_files = nullptr, Variant *r_metadata = nullptr) = 0;
+	virtual bool can_import_threaded() const { return true; } // Default: reports threaded import as supported; subclasses may override.
+	virtual void import_threaded_begin() {} // Default implementation does nothing; subclasses may override.
+	virtual void import_threaded_end() {} // Default implementation does nothing; subclasses may override.
 
 	virtual Error import_group_file(const String &p_group_file, const Map<String, Map<StringName, Variant>> &p_source_file_options, const Map<String, String> &p_base_paths) { return ERR_UNAVAILABLE; }
 	virtual bool are_import_settings_valid(const String &p_path) const { return true; }
diff --git a/core/io/stream_peer.cpp b/core/io/stream_peer.cpp
index 8407d55196..74154321b3 100644
--- a/core/io/stream_peer.cpp
+++ b/core/io/stream_peer.cpp
@@ -433,7 +433,7 @@ Error StreamPeerBuffer::put_data(const uint8_t *p_data, int p_bytes) {
 	}
 
 	uint8_t *w = data.ptrw();
-	copymem(&w[pointer], p_data, p_bytes);
+	memcpy(&w[pointer], p_data, p_bytes);
 
 	pointer += p_bytes;
 	return OK;
@@ -466,7 +466,7 @@ Error StreamPeerBuffer::get_partial_data(uint8_t *p_buffer, int p_bytes, int &r_
 	}
 
 	const uint8_t *r = data.ptr();
-	copymem(p_buffer, r + pointer, r_received);
+	memcpy(p_buffer, r + pointer, r_received);
 
 	pointer += r_received;
 	// FIXME: return what? OK or ERR_*
diff --git a/core/io/stream_peer_tcp.cpp b/core/io/stream_peer_tcp.cpp
index 760710a9eb..9906b9e4c3 100644
--- a/core/io/stream_peer_tcp.cpp
+++ b/core/io/stream_peer_tcp.cpp
@@ -67,21 +67,40 @@ void StreamPeerTCP::accept_socket(Ref<NetSocket> p_sock, IP_Address p_host, uint
 	peer_port = p_port;
 }
 
-Error StreamPeerTCP::connect_to_host(const IP_Address &p_host, uint16_t p_port) {
+Error StreamPeerTCP::bind(int p_port, const IP_Address &p_host) { // Opens the TCP socket (non-blocking) and binds it to p_host:p_port.
 	ERR_FAIL_COND_V(!_sock.is_valid(), ERR_UNAVAILABLE);
 	ERR_FAIL_COND_V(_sock->is_open(), ERR_ALREADY_IN_USE);
-	ERR_FAIL_COND_V(!p_host.is_valid(), ERR_INVALID_PARAMETER);
+	ERR_FAIL_COND_V_MSG(p_port < 0 || p_port > 65535, ERR_INVALID_PARAMETER, "The local port number must be between 0 and 65535 (inclusive).");
 
-	Error err;
 	IP::Type ip_type = p_host.is_ipv4() ? IP::TYPE_IPV4 : IP::TYPE_IPV6;
+	if (p_host.is_wildcard()) { // A wildcard host overrides the family picked above.
+		ip_type = IP::TYPE_ANY;
+	}
+	Error err = _sock->open(NetSocket::TYPE_TCP, ip_type);
+	if (err != OK) {
+		return err; // Propagate the socket's own error code unchanged.
+	}
+	_sock->set_blocking_enabled(false); // Blocking is disabled before the bind call.
+	return _sock->bind(p_host, p_port);
+}
 
-	err = _sock->open(NetSocket::TYPE_TCP, ip_type);
-	ERR_FAIL_COND_V(err != OK, FAILED);
+Error StreamPeerTCP::connect_to_host(const IP_Address &p_host, int p_port) {
+	ERR_FAIL_COND_V(!_sock.is_valid(), ERR_UNAVAILABLE);
+	ERR_FAIL_COND_V(status != STATUS_NONE, ERR_ALREADY_IN_USE);
+	ERR_FAIL_COND_V(!p_host.is_valid(), ERR_INVALID_PARAMETER);
+	ERR_FAIL_COND_V_MSG(p_port < 1 || p_port > 65535, ERR_INVALID_PARAMETER, "The remote port number must be between 1 and 65535 (inclusive).");
 
-	_sock->set_blocking_enabled(false);
+	if (!_sock->is_open()) {
+		IP::Type ip_type = p_host.is_ipv4() ? IP::TYPE_IPV4 : IP::TYPE_IPV6;
+		Error err = _sock->open(NetSocket::TYPE_TCP, ip_type);
+		if (err != OK) {
+			return err;
+		}
+		_sock->set_blocking_enabled(false);
+	}
 
 	timeout = OS::get_singleton()->get_ticks_msec() + (((uint64_t)GLOBAL_GET("network/limits/tcp/connect_timeout_seconds")) * 1000);
-	err = _sock->connect_to_host(p_host, p_port);
+	Error err = _sock->connect_to_host(p_host, p_port);
 
 	if (err == OK) {
 		status = STATUS_CONNECTED;
@@ -300,10 +319,16 @@ IP_Address StreamPeerTCP::get_connected_host() const {
 	return peer_host;
 }
 
-uint16_t StreamPeerTCP::get_connected_port() const {
+int StreamPeerTCP::get_connected_port() const {
 	return peer_port;
 }
 
+int StreamPeerTCP::get_local_port() const {
+	uint16_t local_port = 0; // Defined fallback: the call's result is not checked, so never return an uninitialized value.
+	_sock->get_socket_address(nullptr, &local_port); // Only the port is requested (address out-parameter is nullptr).
+	return local_port;
+}
+
 Error StreamPeerTCP::_connect(const String &p_address, int p_port) {
 	IP_Address ip;
 	if (p_address.is_valid_ip_address()) {
@@ -319,11 +344,13 @@ Error StreamPeerTCP::_connect(const String &p_address, int p_port) {
 }
 
 void StreamPeerTCP::_bind_methods() {
+	ClassDB::bind_method(D_METHOD("bind", "port", "host"), &StreamPeerTCP::bind, DEFVAL("*"));
 	ClassDB::bind_method(D_METHOD("connect_to_host", "host", "port"), &StreamPeerTCP::_connect);
 	ClassDB::bind_method(D_METHOD("is_connected_to_host"), &StreamPeerTCP::is_connected_to_host);
 	ClassDB::bind_method(D_METHOD("get_status"), &StreamPeerTCP::get_status);
 	ClassDB::bind_method(D_METHOD("get_connected_host"), &StreamPeerTCP::get_connected_host);
 	ClassDB::bind_method(D_METHOD("get_connected_port"), &StreamPeerTCP::get_connected_port);
+	ClassDB::bind_method(D_METHOD("get_local_port"), &StreamPeerTCP::get_local_port);
 	ClassDB::bind_method(D_METHOD("disconnect_from_host"), &StreamPeerTCP::disconnect_from_host);
 	ClassDB::bind_method(D_METHOD("set_no_delay", "enabled"), &StreamPeerTCP::set_no_delay);
 
diff --git a/core/io/stream_peer_tcp.h b/core/io/stream_peer_tcp.h
index 10b90908d4..3bc7b252dc 100644
--- a/core/io/stream_peer_tcp.h
+++ b/core/io/stream_peer_tcp.h
@@ -65,10 +65,12 @@ protected:
 public:
 	void accept_socket(Ref<NetSocket> p_sock, IP_Address p_host, uint16_t p_port);
 
-	Error connect_to_host(const IP_Address &p_host, uint16_t p_port);
+	Error bind(int p_port, const IP_Address &p_host);
+	Error connect_to_host(const IP_Address &p_host, int p_port);
 	bool is_connected_to_host() const;
 	IP_Address get_connected_host() const;
-	uint16_t get_connected_port() const;
+	int get_connected_port() const;
+	int get_local_port() const;
 	void disconnect_from_host();
 
 	int get_available_bytes() const override;
diff --git a/core/io/tcp_server.cpp b/core/io/tcp_server.cpp
index 323d2bbd7f..348be66ba4 100644
--- a/core/io/tcp_server.cpp
+++ b/core/io/tcp_server.cpp
@@ -34,6 +34,7 @@ void TCP_Server::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("listen", "port", "bind_address"), &TCP_Server::listen, DEFVAL("*"));
 	ClassDB::bind_method(D_METHOD("is_connection_available"), &TCP_Server::is_connection_available);
 	ClassDB::bind_method(D_METHOD("is_listening"), &TCP_Server::is_listening);
+	ClassDB::bind_method(D_METHOD("get_local_port"), &TCP_Server::get_local_port);
 	ClassDB::bind_method(D_METHOD("take_connection"), &TCP_Server::take_connection);
 	ClassDB::bind_method(D_METHOD("stop"), &TCP_Server::stop);
 }
@@ -42,6 +43,7 @@ Error TCP_Server::listen(uint16_t p_port, const IP_Address &p_bind_address) {
 	ERR_FAIL_COND_V(!_sock.is_valid(), ERR_UNAVAILABLE);
 	ERR_FAIL_COND_V(_sock->is_open(), ERR_ALREADY_IN_USE);
 	ERR_FAIL_COND_V(!p_bind_address.is_valid() && !p_bind_address.is_wildcard(), ERR_INVALID_PARAMETER);
+	ERR_FAIL_COND_V_MSG(p_port < 0 || p_port > 65535, ERR_INVALID_PARAMETER, "The local port number must be between 0 and 65535 (inclusive).");
 
 	Error err;
 	IP::Type ip_type = IP::TYPE_ANY;
@@ -74,6 +76,12 @@ Error TCP_Server::listen(uint16_t p_port, const IP_Address &p_bind_address) {
 	return OK;
 }
 
+int TCP_Server::get_local_port() const {
+	uint16_t local_port = 0; // Defined fallback: the call's result is not checked, so never return an uninitialized value.
+	_sock->get_socket_address(nullptr, &local_port); // Only the port is requested (address out-parameter is nullptr).
+	return local_port;
+}
+
 bool TCP_Server::is_listening() const {
 	ERR_FAIL_COND_V(!_sock.is_valid(), false);
 
diff --git a/core/io/tcp_server.h b/core/io/tcp_server.h
index f06ddd2d99..58c04d87ec 100644
--- a/core/io/tcp_server.h
+++ b/core/io/tcp_server.h
@@ -49,6 +49,7 @@ protected:
 
 public:
 	Error listen(uint16_t p_port, const IP_Address &p_bind_address = IP_Address("*"));
+	int get_local_port() const;
 	bool is_listening() const;
 	bool is_connection_available() const;
 	Ref<StreamPeerTCP> take_connection();
diff --git a/core/io/udp_server.cpp b/core/io/udp_server.cpp
index f56fb431ef..99642f4af4 100644
--- a/core/io/udp_server.cpp
+++ b/core/io/udp_server.cpp
@@ -34,6 +34,7 @@ void UDPServer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("listen", "port", "bind_address"), &UDPServer::listen, DEFVAL("*"));
 	ClassDB::bind_method(D_METHOD("poll"), &UDPServer::poll);
 	ClassDB::bind_method(D_METHOD("is_connection_available"), &UDPServer::is_connection_available);
+	ClassDB::bind_method(D_METHOD("get_local_port"), &UDPServer::get_local_port);
 	ClassDB::bind_method(D_METHOD("is_listening"), &UDPServer::is_listening);
 	ClassDB::bind_method(D_METHOD("take_connection"), &UDPServer::take_connection);
 	ClassDB::bind_method(D_METHOD("stop"), &UDPServer::stop);
@@ -90,6 +91,7 @@ Error UDPServer::listen(uint16_t p_port, const IP_Address &p_bind_address) {
 	ERR_FAIL_COND_V(!_sock.is_valid(), ERR_UNAVAILABLE);
 	ERR_FAIL_COND_V(_sock->is_open(), ERR_ALREADY_IN_USE);
 	ERR_FAIL_COND_V(!p_bind_address.is_valid() && !p_bind_address.is_wildcard(), ERR_INVALID_PARAMETER);
+	ERR_FAIL_COND_V_MSG(p_port < 0 || p_port > 65535, ERR_INVALID_PARAMETER, "The local port number must be between 0 and 65535 (inclusive).");
 
 	Error err;
 	IP::Type ip_type = IP::TYPE_ANY;
@@ -112,11 +114,15 @@ Error UDPServer::listen(uint16_t p_port, const IP_Address &p_bind_address) {
 		stop();
 		return err;
 	}
-	bind_address = p_bind_address;
-	bind_port = p_port;
 	return OK;
 }
 
+int UDPServer::get_local_port() const {
+	uint16_t local_port = 0; // Defined fallback: the call's result is not checked, so never return an uninitialized value.
+	_sock->get_socket_address(nullptr, &local_port); // Only the port is requested (address out-parameter is nullptr).
+	return local_port;
+}
+
 bool UDPServer::is_listening() const {
 	ERR_FAIL_COND_V(!_sock.is_valid(), false);
 
@@ -176,8 +182,6 @@ void UDPServer::stop() {
 	if (_sock.is_valid()) {
 		_sock->close();
 	}
-	bind_port = 0;
-	bind_address = IP_Address();
 	List<Peer>::Element *E = peers.front();
 	while (E) {
 		E->get().peer->disconnect_shared_socket();
diff --git a/core/io/udp_server.h b/core/io/udp_server.h
index bbd2f951c9..298d4d4b63 100644
--- a/core/io/udp_server.h
+++ b/core/io/udp_server.h
@@ -53,21 +53,18 @@ protected:
 	};
 	uint8_t recv_buffer[PACKET_BUFFER_SIZE];
 
-	int bind_port = 0;
-	IP_Address bind_address;
-
 	List<Peer> peers;
 	List<Peer> pending;
 	int max_pending_connections = 16;
 
 	Ref<NetSocket> _sock;
-
 	static void _bind_methods();
 
 public:
 	void remove_peer(IP_Address p_ip, int p_port);
 	Error listen(uint16_t p_port, const IP_Address &p_bind_address = IP_Address("*"));
 	Error poll();
+	int get_local_port() const;
 	bool is_listening() const;
 	bool is_connection_available() const;
 	void set_max_pending_connections(int p_max);
diff --git a/core/io/xml_parser.cpp b/core/io/xml_parser.cpp
index d5eb32513b..a1f8e79adc 100644
--- a/core/io/xml_parser.cpp
+++ b/core/io/xml_parser.cpp
@@ -433,7 +433,7 @@ Error XMLParser::open_buffer(const Vector<uint8_t> &p_buffer) {
 
 	length = p_buffer.size();
 	data = memnew_arr(char, length + 1);
-	copymem(data, p_buffer.ptr(), length);
+	memcpy(data, p_buffer.ptr(), length);
 	data[length] = 0;
 	P = data;
 	return OK;
diff --git a/core/io/zip_io.cpp b/core/io/zip_io.cpp
index 4b4a46e198..fe46868dd0 100644
--- a/core/io/zip_io.cpp
+++ b/core/io/zip_io.cpp
@@ -30,8 +30,6 @@
 
 #include "zip_io.h"
 
-#include "core/os/copymem.h"
-
 void *zipio_open(void *data, const char *p_fname, int mode) {
 	FileAccess *&f = *(FileAccess **)data;
 
@@ -103,7 +101,7 @@ int zipio_testerror(voidpf opaque, voidpf stream) {
 
 voidpf zipio_alloc(voidpf opaque, uInt items, uInt size) {
 	voidpf ptr = memalloc(items * size);
-	zeromem(ptr, items * size);
+	memset(ptr, 0, items * size);
 	return ptr;
 }
 
diff --git a/core/math/basis.cpp b/core/math/basis.cpp
index cc2b7c6611..50299902eb 100644
--- a/core/math/basis.cpp
+++ b/core/math/basis.cpp
@@ -31,7 +31,6 @@
 #include "basis.h"
 
 #include "core/math/math_funcs.h"
-#include "core/os/copymem.h"
 #include "core/string/print_string.h"
 
 #define cofac(row1, col1, row2, col2) \
diff --git a/core/math/color.h b/core/math/color.h
index 5eb8b1119a..e404d80c8a 100644
--- a/core/math/color.h
+++ b/core/math/color.h
@@ -197,13 +197,13 @@ struct Color {
 
 	// For the binder.
 	_FORCE_INLINE_ void set_r8(int32_t r8) { r = (CLAMP(r8, 0, 255) / 255.0); }
-	_FORCE_INLINE_ int32_t get_r8() const { return int32_t(CLAMP(r * 255.0, 0.0, 255.0)); }
+	_FORCE_INLINE_ int32_t get_r8() const { return int32_t(CLAMP(Math::round(r * 255.0f), 0.0f, 255.0f)); }
 	_FORCE_INLINE_ void set_g8(int32_t g8) { g = (CLAMP(g8, 0, 255) / 255.0); }
-	_FORCE_INLINE_ int32_t get_g8() const { return int32_t(CLAMP(g * 255.0, 0.0, 255.0)); }
+	_FORCE_INLINE_ int32_t get_g8() const { return int32_t(CLAMP(Math::round(g * 255.0f), 0.0f, 255.0f)); }
 	_FORCE_INLINE_ void set_b8(int32_t b8) { b = (CLAMP(b8, 0, 255) / 255.0); }
-	_FORCE_INLINE_ int32_t get_b8() const { return int32_t(CLAMP(b * 255.0, 0.0, 255.0)); }
+	_FORCE_INLINE_ int32_t get_b8() const { return int32_t(CLAMP(Math::round(b * 255.0f), 0.0f, 255.0f)); }
 	_FORCE_INLINE_ void set_a8(int32_t a8) { a = (CLAMP(a8, 0, 255) / 255.0); }
-	_FORCE_INLINE_ int32_t get_a8() const { return int32_t(CLAMP(a * 255.0, 0.0, 255.0)); }
+	_FORCE_INLINE_ int32_t get_a8() const { return int32_t(CLAMP(Math::round(a * 255.0f), 0.0f, 255.0f)); }
 
 	_FORCE_INLINE_ void set_h(float p_h) { set_hsv(p_h, get_s(), get_v()); }
 	_FORCE_INLINE_ void set_s(float p_s) { set_hsv(get_h(), p_s, get_v()); }
diff --git a/core/math/dynamic_bvh.h b/core/math/dynamic_bvh.h
index 3fb22515a2..0b6286cd9d 100644
--- a/core/math/dynamic_bvh.h
+++ b/core/math/dynamic_bvh.h
@@ -343,7 +343,7 @@ void DynamicBVH::aabb_query(const AABB &p_box, QueryResult &r_result) {
 				if (depth > threshold) {
 					if (aux_stack.is_empty()) {
 						aux_stack.resize(ALLOCA_STACK_SIZE * 2);
-						copymem(aux_stack.ptr(), stack, ALLOCA_STACK_SIZE * sizeof(const Node *));
+						memcpy(aux_stack.ptr(), stack, ALLOCA_STACK_SIZE * sizeof(const Node *));
 					} else {
 						aux_stack.resize(aux_stack.size() * 2);
 					}
@@ -399,7 +399,7 @@ void DynamicBVH::convex_query(const Plane *p_planes, int p_plane_count, const Ve
 				if (depth > threshold) {
 					if (aux_stack.is_empty()) {
 						aux_stack.resize(ALLOCA_STACK_SIZE * 2);
-						copymem(aux_stack.ptr(), stack, ALLOCA_STACK_SIZE * sizeof(const Node *));
+						memcpy(aux_stack.ptr(), stack, ALLOCA_STACK_SIZE * sizeof(const Node *));
 					} else {
 						aux_stack.resize(aux_stack.size() * 2);
 					}
@@ -456,7 +456,7 @@ void DynamicBVH::ray_query(const Vector3 &p_from, const Vector3 &p_to, QueryResu
 				if (depth > threshold) {
 					if (aux_stack.is_empty()) {
 						aux_stack.resize(ALLOCA_STACK_SIZE * 2);
-						copymem(aux_stack.ptr(), stack, ALLOCA_STACK_SIZE * sizeof(const Node *));
+						memcpy(aux_stack.ptr(), stack, ALLOCA_STACK_SIZE * sizeof(const Node *));
 					} else {
 						aux_stack.resize(aux_stack.size() * 2);
 					}
diff --git a/core/math/face3.cpp b/core/math/face3.cpp
index beb0a8e405..20c316c322 100644
--- a/core/math/face3.cpp
+++ b/core/math/face3.cpp
@@ -169,7 +169,7 @@ Vector3 Face3::get_median_point() const {
 }
 
 real_t Face3::get_area() const {
-	return vec3_cross(vertex[0] - vertex[1], vertex[0] - vertex[2]).length();
+	return vec3_cross(vertex[0] - vertex[1], vertex[0] - vertex[2]).length() * 0.5;
 }
 
 ClockDirection Face3::get_clock_dir() const {
diff --git a/core/math/geometry_2d.cpp b/core/math/geometry_2d.cpp
index feb1fb2fb8..7b2630b4ff 100644
--- a/core/math/geometry_2d.cpp
+++ b/core/math/geometry_2d.cpp
@@ -358,7 +358,7 @@ Vector<Point2i> Geometry2D::pack_rects(const Vector<Size2i> &p_sizes, const Size
 Vector<Vector3i> Geometry2D::partial_pack_rects(const Vector<Vector2i> &p_sizes, const Size2i &p_atlas_size) {
 	Vector<stbrp_node> nodes;
 	nodes.resize(p_atlas_size.width);
-	zeromem(nodes.ptrw(), sizeof(stbrp_node) * nodes.size());
+	memset(nodes.ptrw(), 0, sizeof(stbrp_node) * nodes.size());
 
 	stbrp_context context;
 	stbrp_init_target(&context, p_atlas_size.width, p_atlas_size.height, nodes.ptrw(), p_atlas_size.width);
diff --git a/core/math/math_funcs.h b/core/math/math_funcs.h
index 267f6a4fe2..8cf13efdb6 100644
--- a/core/math/math_funcs.h
+++ b/core/math/math_funcs.h
@@ -103,6 +103,9 @@ public:
 	static _ALWAYS_INLINE_ double log(double p_x) { return ::log(p_x); }
 	static _ALWAYS_INLINE_ float log(float p_x) { return ::logf(p_x); }
 
+	static _ALWAYS_INLINE_ double log2(double p_x) { return ::log2(p_x); } // Base-2 logarithm, double overload; forwards to ::log2.
+	static _ALWAYS_INLINE_ float log2(float p_x) { return ::log2f(p_x); } // Base-2 logarithm, float overload; forwards to ::log2f.
+
 	static _ALWAYS_INLINE_ double exp(double p_x) { return ::exp(p_x); }
 	static _ALWAYS_INLINE_ float exp(float p_x) { return ::expf(p_x); }
 
diff --git a/core/math/random_pcg.cpp b/core/math/random_pcg.cpp
index 9609620469..681c2a9717 100644
--- a/core/math/random_pcg.cpp
+++ b/core/math/random_pcg.cpp
@@ -39,7 +39,7 @@ RandomPCG::RandomPCG(uint64_t p_seed, uint64_t p_inc) :
 }
 
 void RandomPCG::randomize() {
-	seed(OS::get_singleton()->get_ticks_usec() * pcg.state + PCG_DEFAULT_INC_64);
+	seed(((uint64_t)OS::get_singleton()->get_unix_time() + OS::get_singleton()->get_ticks_usec()) * pcg.state + PCG_DEFAULT_INC_64);
 }
 
 double RandomPCG::random(double p_from, double p_to) {
diff --git a/core/math/transform.cpp b/core/math/transform.cpp
index fab5d124fa..d4d7ff6d28 100644
--- a/core/math/transform.cpp
+++ b/core/math/transform.cpp
@@ -31,7 +31,6 @@
 #include "transform.h"
 
 #include "core/math/math_funcs.h"
-#include "core/os/copymem.h"
 #include "core/string/print_string.h"
 
 void Transform::affine_invert() {
diff --git a/core/object/callable_method_pointer.h b/core/object/callable_method_pointer.h
index 115797a00c..8ba01be4e4 100644
--- a/core/object/callable_method_pointer.h
+++ b/core/object/callable_method_pointer.h
@@ -32,7 +32,6 @@
 #define CALLABLE_METHOD_POINTER_H
 
 #include "core/object/object.h"
-#include "core/os/copymem.h"
 #include "core/templates/hashfuncs.h"
 #include "core/templates/simple_type.h"
 #include "core/variant/binder_common.h"
@@ -98,7 +97,7 @@ public:
 	}
 
 	CallableCustomMethodPointer(T *p_instance, void (T::*p_method)(P...)) {
-		zeromem(&data, sizeof(Data)); // Clear beforehand, may have padding bytes.
+		memset(&data, 0, sizeof(Data)); // Clear beforehand, may have padding bytes.
 		data.instance = p_instance;
 #ifdef DEBUG_ENABLED
 		data.object_id = p_instance->get_instance_id();
@@ -153,7 +152,7 @@ public:
 	}
 
 	CallableCustomMethodPointerRet(T *p_instance, R (T::*p_method)(P...)) {
-		zeromem(&data, sizeof(Data)); // Clear beforehand, may have padding bytes.
+		memset(&data, 0, sizeof(Data)); // Clear beforehand, may have padding bytes.
 		data.instance = p_instance;
 #ifdef DEBUG_ENABLED
 		data.object_id = p_instance->get_instance_id();
@@ -208,7 +207,7 @@ public:
 	}
 
 	CallableCustomMethodPointerRetC(T *p_instance, R (T::*p_method)(P...) const) {
-		zeromem(&data, sizeof(Data)); // Clear beforehand, may have padding bytes.
+		memset(&data, 0, sizeof(Data)); // Clear beforehand, may have padding bytes.
 		data.instance = p_instance;
 #ifdef DEBUG_ENABLED
 		data.object_id = p_instance->get_instance_id();
diff --git a/core/os/memory.cpp b/core/os/memory.cpp
index 5910cb0e7b..a756c1d5dd 100644
--- a/core/os/memory.cpp
+++ b/core/os/memory.cpp
@@ -31,7 +31,6 @@
 #include "memory.h"
 
 #include "core/error/error_macros.h"
-#include "core/os/copymem.h"
 #include "core/templates/safe_refcount.h"
 
 #include <stdio.h>
diff --git a/core/os/pool_allocator.cpp b/core/os/pool_allocator.cpp
index 9be3a62e2f..74e9c24e04 100644
--- a/core/os/pool_allocator.cpp
+++ b/core/os/pool_allocator.cpp
@@ -31,7 +31,6 @@
 #include "pool_allocator.h"
 
 #include "core/error/error_macros.h"
-#include "core/os/copymem.h"
 #include "core/os/memory.h"
 #include "core/os/os.h"
 #include "core/string/print_string.h"
@@ -42,7 +41,7 @@
 	do {                                                      \
 		void *_dst = &((unsigned char *)pool)[m_to_pos];      \
 		void *_src = &((unsigned char *)pool)[(m_entry).pos]; \
-		movemem(_dst, _src, aligned((m_entry).len));          \
+		memmove(_dst, _src, aligned((m_entry).len));          \
 		(m_entry).pos = m_to_pos;                             \
 	} while (0);
 
diff --git a/core/string/node_path.h b/core/string/node_path.h
index 26e15636d9..a277ab26fa 100644
--- a/core/string/node_path.h
+++ b/core/string/node_path.h
@@ -66,8 +66,6 @@ public:
 
 	void prepend_period();
 
-	NodePath get_parent() const;
-
 	_FORCE_INLINE_ uint32_t hash() const {
 		if (!data) {
 			return 0;
diff --git a/core/string/optimized_translation.cpp b/core/string/optimized_translation.cpp
index 53d0a8924d..268562d971 100644
--- a/core/string/optimized_translation.cpp
+++ b/core/string/optimized_translation.cpp
@@ -46,6 +46,7 @@ void OptimizedTranslation::generate(const Ref<Translation> &p_from) {
 	// This method compresses a Translation instance.
 	// Right now, it doesn't handle context or plurals, so Translation subclasses using plurals or context (i.e TranslationPO) shouldn't be compressed.
 #ifdef TOOLS_ENABLED
+	ERR_FAIL_COND(p_from.is_null());
 	List<StringName> keys;
 	p_from->get_message_list(&keys);
 
diff --git a/core/string/translation.cpp b/core/string/translation.cpp
index ade5f7b4d8..153f0190fd 100644
--- a/core/string/translation.cpp
+++ b/core/string/translation.cpp
@@ -835,7 +835,7 @@ Vector<String> Translation::_get_message_list() const {
 void Translation::_set_messages(const Dictionary &p_messages) {
 	List<Variant> keys;
 	p_messages.get_key_list(&keys);
-	for (auto E = keys.front(); E; E = E->next()) {
+	for (List<Variant>::Element *E = keys.front(); E; E = E->next()) {
 		translation_map[E->get()] = p_messages[E->get()];
 	}
 }
diff --git a/core/string/translation_po.cpp b/core/string/translation_po.cpp
index 2efadaa9b7..ad768f7140 100644
--- a/core/string/translation_po.cpp
+++ b/core/string/translation_po.cpp
@@ -47,14 +47,14 @@ void TranslationPO::print_translation_map() {
 
 	List<StringName> context_l;
 	translation_map.get_key_list(&context_l);
-	for (auto E = context_l.front(); E; E = E->next()) {
+	for (List<StringName>::Element *E = context_l.front(); E; E = E->next()) {
 		StringName ctx = E->get();
 		file->store_line(" ===== Context: " + String::utf8(String(ctx).utf8()) + " ===== ");
 		const HashMap<StringName, Vector<StringName>> &inner_map = translation_map[ctx];
 
 		List<StringName> id_l;
 		inner_map.get_key_list(&id_l);
-		for (auto E2 = id_l.front(); E2; E2 = E2->next()) {
+		for (List<StringName>::Element *E2 = id_l.front(); E2; E2 = E2->next()) {
 			StringName id = E2->get();
 			file->store_line("msgid: " + String::utf8(String(id).utf8()));
 			for (int i = 0; i < inner_map[id].size(); i++) {
@@ -74,7 +74,7 @@ Dictionary TranslationPO::_get_messages() const {
 
 	List<StringName> context_l;
 	translation_map.get_key_list(&context_l);
-	for (auto E = context_l.front(); E; E = E->next()) {
+	for (List<StringName>::Element *E = context_l.front(); E; E = E->next()) {
 		StringName ctx = E->get();
 		const HashMap<StringName, Vector<StringName>> &id_str_map = translation_map[ctx];
 
@@ -82,7 +82,7 @@ Dictionary TranslationPO::_get_messages() const {
 		List<StringName> id_l;
 		id_str_map.get_key_list(&id_l);
 		// Save list of id and strs associated with a context in a temporary dictionary.
-		for (auto E2 = id_l.front(); E2; E2 = E2->next()) {
+		for (List<StringName>::Element *E2 = id_l.front(); E2; E2 = E2->next()) {
 			StringName id = E2->get();
 			d2[id] = id_str_map[id];
 		}
@@ -98,14 +98,14 @@ void TranslationPO::_set_messages(const Dictionary &p_messages) {
 
 	List<Variant> context_l;
 	p_messages.get_key_list(&context_l);
-	for (auto E = context_l.front(); E; E = E->next()) {
+	for (List<Variant>::Element *E = context_l.front(); E; E = E->next()) {
 		StringName ctx = E->get();
 		const Dictionary &id_str_map = p_messages[ctx];
 
 		HashMap<StringName, Vector<StringName>> temp_map;
 		List<Variant> id_l;
 		id_str_map.get_key_list(&id_l);
-		for (auto E2 = id_l.front(); E2; E2 = E2->next()) {
+		for (List<Variant>::Element *E2 = id_l.front(); E2; E2 = E2->next()) {
 			StringName id = E2->get();
 			temp_map[id] = id_str_map[id];
 		}
@@ -121,7 +121,7 @@ Vector<String> TranslationPO::_get_message_list() const {
 	get_message_list(&msgs);
 
 	Vector<String> v;
-	for (auto E = msgs.front(); E; E = E->next()) {
+	for (List<StringName>::Element *E = msgs.front(); E; E = E->next()) {
 		v.push_back(E->get());
 	}
 
@@ -281,7 +281,7 @@ void TranslationPO::get_message_list(List<StringName> *r_messages) const {
 	List<StringName> context_l;
 	translation_map.get_key_list(&context_l);
 
-	for (auto E = context_l.front(); E; E = E->next()) {
+	for (List<StringName>::Element *E = context_l.front(); E; E = E->next()) {
 		if (String(E->get()) != "") {
 			continue;
 		}
@@ -289,7 +289,7 @@ void TranslationPO::get_message_list(List<StringName> *r_messages) const {
 		List<StringName> msgid_l;
 		translation_map[E->get()].get_key_list(&msgid_l);
 
-		for (auto E2 = msgid_l.front(); E2; E2 = E2->next()) {
+		for (List<StringName>::Element *E2 = msgid_l.front(); E2; E2 = E2->next()) {
 			r_messages->push_back(E2->get());
 		}
 	}
@@ -300,7 +300,7 @@ int TranslationPO::get_message_count() const {
 	translation_map.get_key_list(&context_l);
 
 	int count = 0;
-	for (auto E = context_l.front(); E; E = E->next()) {
+	for (List<StringName>::Element *E = context_l.front(); E; E = E->next()) {
 		count += translation_map[E->get()].size();
 	}
 	return count;
diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp
index 05c80296c2..bdb66526a4 100644
--- a/core/string/ustring.cpp
+++ b/core/string/ustring.cpp
@@ -240,6 +240,71 @@ String String::word_wrap(int p_chars_per_line) const {
 	return ret;
 }
 
+Error String::parse_url(String &r_scheme, String &r_host, int &r_port, String &r_path) const {
+	// Splits the URL into scheme (lowercased, keeps "://"), host (lowercased), port (0 when absent) and path; credentials are stripped. Returns OK, or ERR_INVALID_PARAMETER for a malformed host or port.
+	String base = *this;
+	r_scheme = ""; // Outputs are reset up front; on error they may be only partially filled.
+	r_host = "";
+	r_port = 0;
+	r_path = "";
+	int pos = base.find("://");
+	// Scheme: everything up to and including "://"; left empty when the URL has no "://".
+	if (pos != -1) {
+		r_scheme = base.substr(0, pos + 3).to_lower();
+		base = base.substr(pos + 3, base.length() - pos - 3);
+	}
+	pos = base.find("/");
+	// Path: from the first "/" after the scheme to the end of the string (the "/" is kept).
+	if (pos != -1) {
+		r_path = base.substr(pos, base.length() - pos);
+		base = base.substr(0, pos);
+	}
+	// Host: what remains at this point is "[credentials@]host[:port]".
+	pos = base.find("@");
+	if (pos != -1) {
+		// Strip credentials (everything up to and including the first "@").
+		base = base.substr(pos + 1, base.length() - pos - 1);
+	}
+	if (base.begins_with("[")) {
+		// Literal IPv6: the host is the text between "[" and the last "]", brackets excluded.
+		pos = base.rfind("]");
+		if (pos == -1) {
+			return ERR_INVALID_PARAMETER;
+		}
+		r_host = base.substr(1, pos - 1);
+		base = base.substr(pos + 1, base.length() - pos - 1);
+	} else {
+		// Anything else: allow at most one ":" (host plus optional port). get_slice_count() counts slices, so "host:port" yields 2.
+		if (base.get_slice_count(":") > 2) {
+			return ERR_INVALID_PARAMETER;
+		}
+		pos = base.rfind(":");
+		if (pos == -1) {
+			r_host = base;
+			base = "";
+		} else {
+			r_host = base.substr(0, pos);
+			base = base.substr(pos, base.length() - pos); // Keep the leading ":" so the port step below recognizes it.
+		}
+	}
+	if (r_host.is_empty()) {
+		return ERR_INVALID_PARAMETER;
+	}
+	r_host = r_host.to_lower();
+	// Port: optional; r_port stays 0 when the URL does not specify one.
+	if (base.begins_with(":")) {
+		base = base.substr(1, base.length() - 1);
+		if (!base.is_valid_integer()) {
+			return ERR_INVALID_PARAMETER;
+		}
+		r_port = base.to_int();
+		if (r_port < 1 || r_port > 65535) {
+			return ERR_INVALID_PARAMETER;
+		}
+	}
+	return OK;
+}
+
 void String::copy_from(const char *p_cstr) {
 	// copy Latin-1 encoded c-string directly
 	if (!p_cstr) {
@@ -3772,9 +3837,9 @@ String String::uri_encode() const {
 		} else {
 			char h_Val[3];
 #if defined(__GNUC__) || defined(_MSC_VER)
-			snprintf(h_Val, 3, "%hhX", ord);
+			snprintf(h_Val, 3, "%02hhX", ord);
 #else
-			sprintf(h_Val, "%hhX", ord);
+			sprintf(h_Val, "%02hhX", ord);
 #endif
 			res += "%";
 			res += h_Val;
@@ -3784,27 +3849,28 @@ String String::uri_encode() const {
 }
 
 String String::uri_decode() const {
-	String res;
-	for (int i = 0; i < length(); ++i) {
-		if (unicode_at(i) == '%' && i + 2 < length()) {
-			char32_t ord1 = unicode_at(i + 1);
+	CharString src = utf8();
+	CharString res;
+	for (int i = 0; i < src.length(); ++i) {
+		if (src[i] == '%' && i + 2 < src.length()) {
+			char ord1 = src[i + 1];
 			if ((ord1 >= '0' && ord1 <= '9') || (ord1 >= 'A' && ord1 <= 'Z')) {
-				char32_t ord2 = unicode_at(i + 2);
+				char ord2 = src[i + 2];
 				if ((ord2 >= '0' && ord2 <= '9') || (ord2 >= 'A' && ord2 <= 'Z')) {
 					char bytes[3] = { (char)ord1, (char)ord2, 0 };
 					res += (char)strtol(bytes, nullptr, 16);
 					i += 2;
 				}
 			} else {
-				res += unicode_at(i);
+				res += src[i];
 			}
-		} else if (unicode_at(i) == '+') {
+		} else if (src[i] == '+') {
 			res += ' ';
 		} else {
-			res += unicode_at(i);
+			res += src[i];
 		}
 	}
-	return String::utf8(res.ascii());
+	return String::utf8(res);
 }
 
 String String::c_unescape() const {
@@ -4765,7 +4831,7 @@ Vector<uint8_t> String::to_ascii_buffer() const {
 	size_t len = charstr.length();
 	retval.resize(len);
 	uint8_t *w = retval.ptrw();
-	copymem(w, charstr.ptr(), len);
+	memcpy(w, charstr.ptr(), len);
 
 	return retval;
 }
@@ -4781,7 +4847,7 @@ Vector<uint8_t> String::to_utf8_buffer() const {
 	size_t len = charstr.length();
 	retval.resize(len);
 	uint8_t *w = retval.ptrw();
-	copymem(w, charstr.ptr(), len);
+	memcpy(w, charstr.ptr(), len);
 
 	return retval;
 }
@@ -4797,7 +4863,7 @@ Vector<uint8_t> String::to_utf16_buffer() const {
 	size_t len = charstr.length() * sizeof(char16_t);
 	retval.resize(len);
 	uint8_t *w = retval.ptrw();
-	copymem(w, (const void *)charstr.ptr(), len);
+	memcpy(w, (const void *)charstr.ptr(), len);
 
 	return retval;
 }
@@ -4812,7 +4878,7 @@ Vector<uint8_t> String::to_utf32_buffer() const {
 	size_t len = s->length() * sizeof(char32_t);
 	retval.resize(len);
 	uint8_t *w = retval.ptrw();
-	copymem(w, (const void *)s->ptr(), len);
+	memcpy(w, (const void *)s->ptr(), len);
 
 	return retval;
 }
diff --git a/core/string/ustring.h b/core/string/ustring.h
index 1e362d7683..a56845deff 100644
--- a/core/string/ustring.h
+++ b/core/string/ustring.h
@@ -416,6 +416,7 @@ public:
 	String c_unescape() const;
 	String json_escape() const;
 	String word_wrap(int p_chars_per_line) const;
+	Error parse_url(String &r_scheme, String &r_host, int &r_port, String &r_path) const;
 
 	String property_name_encode() const;
 
diff --git a/core/templates/local_vector.h b/core/templates/local_vector.h
index ffd17b7ee9..5f22e08eb8 100644
--- a/core/templates/local_vector.h
+++ b/core/templates/local_vector.h
@@ -32,7 +32,6 @@
 #define LOCAL_VECTOR_H
 
 #include "core/error/error_macros.h"
-#include "core/os/copymem.h"
 #include "core/os/memory.h"
 #include "core/templates/sort_array.h"
 #include "core/templates/vector.h"
@@ -216,7 +215,7 @@ public:
 		Vector<T> ret;
 		ret.resize(size());
 		T *w = ret.ptrw();
-		copymem(w, data, sizeof(T) * count);
+		memcpy(w, data, sizeof(T) * count);
 		return ret;
 	}
 
@@ -224,7 +223,7 @@ public:
 		Vector<uint8_t> ret;
 		ret.resize(count * sizeof(T));
 		uint8_t *w = ret.ptrw();
-		copymem(w, data, sizeof(T) * count);
+		memcpy(w, data, sizeof(T) * count);
 		return ret;
 	}
 
diff --git a/core/templates/map.h b/core/templates/map.h
index 51a237472d..7dfee13d2c 100644
--- a/core/templates/map.h
+++ b/core/templates/map.h
@@ -32,7 +32,7 @@
 #define MAP_H
 
 #include "core/error/error_macros.h"
-#include "core/templates/set.h"
+#include "core/os/memory.h"
 
 // based on the very nice implementation of rb-trees by:
 // https://web.archive.org/web/20120507164830/http://web.mit.edu/~emin/www/source_code/red_black_tree/index.html
diff --git a/core/templates/oa_hash_map.h b/core/templates/oa_hash_map.h
index 1d4176eb10..2c7c64cd78 100644
--- a/core/templates/oa_hash_map.h
+++ b/core/templates/oa_hash_map.h
@@ -32,7 +32,6 @@
 #define OA_HASH_MAP_H
 
 #include "core/math/math_funcs.h"
-#include "core/os/copymem.h"
 #include "core/os/memory.h"
 #include "core/templates/hashfuncs.h"
 
diff --git a/core/templates/safe_refcount.h b/core/templates/safe_refcount.h
index 91a34ecd54..e9e5695f80 100644
--- a/core/templates/safe_refcount.h
+++ b/core/templates/safe_refcount.h
@@ -36,6 +36,7 @@
 #if !defined(NO_THREADS)
 
 #include <atomic>
+#include <type_traits>
 
 // Design goals for these classes:
 // - No automatic conversions or arithmetic operators,
diff --git a/core/templates/thread_work_pool.h b/core/templates/thread_work_pool.h
index 19ab1dda3a..9f7a692cc5 100644
--- a/core/templates/thread_work_pool.h
+++ b/core/templates/thread_work_pool.h
@@ -83,7 +83,7 @@ public:
 		ERR_FAIL_COND(!threads); //never initialized
 		ERR_FAIL_COND(current_work != nullptr);
 
-		index.store(0);
+		index.store(0, std::memory_order_release);
 
 		Work<C, M, U> *w = memnew((Work<C, M, U>));
 		w->instance = p_instance;
@@ -104,8 +104,15 @@ public:
 		return current_work != nullptr;
 	}
 
+	bool is_done_dispatching() const { // True once the shared index has reached current_work->max_elements.
+		ERR_FAIL_COND_V(current_work == nullptr, true); // No work in flight means nothing is left to dispatch; returning false would make a polling caller spin forever.
+		return index.load(std::memory_order_acquire) >= current_work->max_elements;
+	}
+
 	uint32_t get_work_index() const {
-		return index;
+		ERR_FAIL_COND_V(current_work == nullptr, 0); // No work in flight: report 0.
+		const uint32_t raw_index = index.load(std::memory_order_acquire);
+		return MIN(raw_index, current_work->max_elements); // Never report more than max_elements.
 	}
 
 	void end_work() {
diff --git a/core/templates/vector.h b/core/templates/vector.h
index a56a941dbc..dae8874a87 100644
--- a/core/templates/vector.h
+++ b/core/templates/vector.h
@@ -38,7 +38,6 @@
 */
 
 #include "core/error/error_macros.h"
-#include "core/os/copymem.h"
 #include "core/os/memory.h"
 #include "core/templates/cowdata.h"
 #include "core/templates/sort_array.h"
@@ -66,6 +65,7 @@ private:
 public:
 	bool push_back(T p_elem);
 	_FORCE_INLINE_ bool append(const T &p_elem) { return push_back(p_elem); } //alias
+	void fill(T p_elem);
 
 	void remove(int p_index) { _cowdata.remove(p_index); }
 	void erase(const T &p_val) {
@@ -134,7 +134,7 @@ public:
 	Vector<uint8_t> to_byte_array() const {
 		Vector<uint8_t> ret;
 		ret.resize(size() * sizeof(T));
-		copymem(ret.ptrw(), ptr(), sizeof(T) * size());
+		memcpy(ret.ptrw(), ptr(), sizeof(T) * size()); // Byte-for-byte copy of the element storage into ret.
 		return ret;
 	}
 
@@ -223,4 +223,12 @@ bool Vector<T>::push_back(T p_elem) {
 	return false;
 }
 
+template <class T>
+void Vector<T>::fill(T p_elem) {
+	// Assigns a copy of p_elem to every existing element; the element count is left unchanged.
+	for (T *cursor = ptrw(), *end = cursor + size(); cursor != end; ++cursor) {
+		*cursor = p_elem;
+	}
+}
+
 #endif // VECTOR_H
diff --git a/core/variant/array.cpp b/core/variant/array.cpp
index 2ad728ec5e..2fb2dd4a30 100644
--- a/core/variant/array.cpp
+++ b/core/variant/array.cpp
@@ -208,6 +208,11 @@ void Array::insert(int p_pos, const Variant &p_value) {
 	_p->array.insert(p_pos, p_value);
 }
 
+void Array::fill(const Variant &p_value) {
+	ERR_FAIL_COND(!_p->typed.validate(p_value, "fill")); // Stop here if the array's type validation rejects p_value.
+	_p->array.fill(p_value); // Hand the value to the backing container's fill().
+}
+
 void Array::erase(const Variant &p_value) {
 	ERR_FAIL_COND(!_p->typed.validate(p_value, "erase"));
 	_p->array.erase(p_value);
diff --git a/core/variant/array.h b/core/variant/array.h
index 6b58ed12cb..5ce977ee4b 100644
--- a/core/variant/array.h
+++ b/core/variant/array.h
@@ -74,6 +74,7 @@ public:
 
 	void insert(int p_pos, const Variant &p_value);
 	void remove(int p_pos);
+	void fill(const Variant &p_value);
 
 	Variant front() const;
 	Variant back() const;
diff --git a/core/variant/binder_common.h b/core/variant/binder_common.h
index 8c0b7907e3..830e0a5cbd 100644
--- a/core/variant/binder_common.h
+++ b/core/variant/binder_common.h
@@ -122,6 +122,18 @@ struct VariantObjectClassChecker {
 	}
 };
 
+template <typename T>
+class Ref;
+
+template <typename T>
+struct VariantObjectClassChecker<const Ref<T> &> {
+	static _FORCE_INLINE_ bool check(const Variant &p_variant) {
+		const Ref<T> typed_ref = p_variant;
+		Object *raw_object = p_variant;
+		return !raw_object || typed_ref.ptr(); // OK when there is no object, or it converts to a non-null Ref<T>.
+	}
+};
+
 template <>
 struct VariantObjectClassChecker<Node *> {
 	static _FORCE_INLINE_ bool check(const Variant &p_variant) {
@@ -233,6 +245,11 @@ void call_with_ptr_args_retc_helper(T *p_instance, R (T::*p_method)(P...) const,
 	PtrToArg<R>::encode((p_instance->*p_method)(PtrToArg<P>::convert(p_args[Is])...), r_ret);
 }
 
+template <class T, class... P, size_t... Is>
+void call_with_ptr_args_static_helper(T *p_instance, void (*p_method)(T *, P...), const void **p_args, IndexSequence<Is...>) {
+	p_method(p_instance, PtrToArg<P>::convert(p_args[Is])...); // Void variant: each raw p_args[Is] goes through PtrToArg<P>::convert and p_instance is passed first; nothing is encoded back.
+}
+
 template <class T, class R, class... P, size_t... Is>
 void call_with_ptr_args_static_retc_helper(T *p_instance, R (*p_method)(T *, P...), const void **p_args, void *r_ret, IndexSequence<Is...>) {
 	PtrToArg<R>::encode(p_method(p_instance, PtrToArg<P>::convert(p_args[Is])...), r_ret);
@@ -273,6 +290,11 @@ void call_with_validated_variant_args_static_retc_helper(T *p_instance, R (*p_me
 	VariantInternalAccessor<typename GetSimpleTypeT<R>::type_t>::set(r_ret, p_method(p_instance, (VariantInternalAccessor<typename GetSimpleTypeT<P>::type_t>::get(p_args[Is]))...));
 }
 
+template <class T, class... P, size_t... Is>
+void call_with_validated_variant_args_static_helper(T *p_instance, void (*p_method)(T *, P...), const Variant **p_args, IndexSequence<Is...>) {
+	p_method(p_instance, (VariantInternalAccessor<typename GetSimpleTypeT<P>::type_t>::get(p_args[Is]))...); // Void variant: each argument is read through VariantInternalAccessor and p_instance is passed first; no return Variant is written.
+}
+
 template <class R, class... P, size_t... Is>
 void call_with_validated_variant_args_static_method_ret_helper(R (*p_method)(P...), const Variant **p_args, Variant *r_ret, IndexSequence<Is...>) {
 	VariantInternalAccessor<typename GetSimpleTypeT<R>::type_t>::set(r_ret, p_method((VariantInternalAccessor<typename GetSimpleTypeT<P>::type_t>::get(p_args[Is]))...));
@@ -471,6 +493,11 @@ void call_with_ptr_args_retc(T *p_instance, R (T::*p_method)(P...) const, const
 	call_with_ptr_args_retc_helper<T, R, P...>(p_instance, p_method, p_args, r_ret, BuildIndexSequence<sizeof...(P)>{});
 }
 
+template <class T, class... P>
+void call_with_ptr_args_static(T *p_instance, void (*p_method)(T *, P...), const void **p_args) {
+	call_with_ptr_args_static_helper<T, P...>(p_instance, p_method, p_args, BuildIndexSequence<sizeof...(P)>{}); // Supplies an index sequence sized to the parameter pack so the helper can expand p_args.
+}
+
 template <class T, class R, class... P>
 void call_with_ptr_args_static_retc(T *p_instance, R (*p_method)(T *, P...), const void **p_args, void *r_ret) {
 	call_with_ptr_args_static_retc_helper<T, R, P...>(p_instance, p_method, p_args, r_ret, BuildIndexSequence<sizeof...(P)>{});
@@ -501,6 +528,11 @@ void call_with_validated_variant_args_retc(Variant *base, R (T::*p_method)(P...)
 	call_with_validated_variant_args_retc_helper<T, R, P...>(VariantGetInternalPtr<T>::get_ptr(base), p_method, p_args, r_ret, BuildIndexSequence<sizeof...(P)>{});
 }
 
+template <class T, class... P>
+void call_with_validated_variant_args_static(Variant *base, void (*p_method)(T *, P...), const Variant **p_args) {
+	call_with_validated_variant_args_static_helper<T, P...>(VariantGetInternalPtr<T>::get_ptr(base), p_method, p_args, BuildIndexSequence<sizeof...(P)>{}); // The instance pointer is taken from `base` via VariantGetInternalPtr<T>; void method, so no return value is stored.
+}
+
 template <class T, class R, class... P>
 void call_with_validated_variant_args_static_retc(Variant *base, R (*p_method)(T *, P...), const Variant **p_args, Variant *r_ret) {
 	call_with_validated_variant_args_static_retc_helper<T, R, P...>(VariantGetInternalPtr<T>::get_ptr(base), p_method, p_args, r_ret, BuildIndexSequence<sizeof...(P)>{});
@@ -758,6 +790,52 @@ void call_with_variant_args_retc_static_helper_dv(T *p_instance, R (*p_method)(T
 	call_with_variant_args_retc_static_helper(p_instance, p_method, args, r_ret, r_error, BuildIndexSequence<sizeof...(P)>{});
 }
 
+template <class T, class... P, size_t... Is>
+void call_with_variant_args_static_helper(T *p_instance, void (*p_method)(T *, P...), const Variant **p_args, Callable::CallError &r_error, IndexSequence<Is...>) {
+	// Void-returning variant call: cast every argument, then invoke p_method with p_instance first.
+	(void)p_args; // Otherwise unused when the parameter pack is empty.
+	r_error.error = Callable::CallError::CALL_OK;
+#ifdef DEBUG_METHODS_ENABLED
+	// The validating caster also receives the argument index and r_error (presumably to report a bad argument).
+	p_method(p_instance, VariantCasterAndValidate<P>::cast(p_args, Is, r_error)...);
+#else
+	p_method(p_instance, VariantCaster<P>::cast(*p_args[Is])...);
+#endif
+}
+
+template <class T, class... P>
+void call_with_variant_args_static_helper_dv(T *p_instance, void (*p_method)(T *, P...), const Variant **p_args, int p_argcount, const Vector<Variant> &default_values, Callable::CallError &r_error) {
+	// Void-returning counterpart of the *_retc_static_helper_dv path: completes a short argument list
+	// from default_values, then forwards to call_with_variant_args_static_helper.
+	const int32_t param_count = (int32_t)sizeof...(P);
+	const int32_t missing = param_count - (int32_t)p_argcount;
+	const int32_t dvs = default_values.size();
+
+#ifdef DEBUG_ENABLED
+	// Arity is validated in debug builds only; NOTE(review): other builds trust the caller here.
+	if ((size_t)p_argcount > sizeof...(P)) {
+		r_error.error = Callable::CallError::CALL_ERROR_TOO_MANY_ARGUMENTS;
+		r_error.argument = sizeof...(P);
+		return;
+	}
+	if (missing > dvs) {
+		r_error.error = Callable::CallError::CALL_ERROR_TOO_FEW_ARGUMENTS;
+		r_error.argument = sizeof...(P);
+		return;
+	}
+#endif
+
+	// Defaults line up with the end of the parameter list: the last default feeds the last parameter.
+	const int32_t first_default = dvs - missing;
+	const Variant *args[sizeof...(P) == 0 ? 1 : sizeof...(P)]; // Array length must never be zero.
+	for (int32_t i = 0; i < param_count; i++) {
+		args[i] = (i < p_argcount) ? p_args[i] : &default_values[i - p_argcount + first_default];
+	}
+
+	// The helper sets r_error to CALL_OK before invoking the method.
+	call_with_variant_args_static_helper(p_instance, p_method, args, r_error, BuildIndexSequence<sizeof...(P)>{});
+}
+
 template <class R, class... P>
 void call_with_variant_args_static_ret_dv(R (*p_method)(P...), const Variant **p_args, int p_argcount, Variant &r_ret, Callable::CallError &r_error, const Vector<Variant> &default_values) {
 #ifdef DEBUG_ENABLED
diff --git a/core/variant/variant_call.cpp b/core/variant/variant_call.cpp
index 8f2d252810..deaccc6304 100644
--- a/core/variant/variant_call.cpp
+++ b/core/variant/variant_call.cpp
@@ -34,6 +34,7 @@
 #include "core/crypto/crypto_core.h"
 #include "core/debugger/engine_debugger.h"
 #include "core/io/compression.h"
+#include "core/io/marshalls.h"
 #include "core/object/class_db.h"
 #include "core/os/os.h"
 #include "core/templates/local_vector.h"
@@ -73,6 +74,16 @@ static _FORCE_INLINE_ void vc_method_call(void (T::*method)(P...) const, Variant
 }
 
 template <class R, class T, class... P>
+static _FORCE_INLINE_ void vc_method_call_static(R (*method)(T *, P...), Variant *base, const Variant **p_args, int p_argcount, Variant &r_ret, const Vector<Variant> &p_defvals, Callable::CallError &r_error) {
+	call_with_variant_args_retc_static_helper_dv(VariantGetInternalPtr<T>::get_ptr(base), method, p_args, p_argcount, r_ret, p_defvals, r_error); // Overload for functions returning R: r_ret is forwarded to the returning helper.
+}
+
+template <class T, class... P>
+static _FORCE_INLINE_ void vc_method_call_static(void (*method)(T *, P...), Variant *base, const Variant **p_args, int p_argcount, Variant &r_ret, const Vector<Variant> &p_defvals, Callable::CallError &r_error) {
+	call_with_variant_args_static_helper_dv(VariantGetInternalPtr<T>::get_ptr(base), method, p_args, p_argcount, p_defvals, r_error); // Overload for void functions: r_ret is accepted but not used.
+}
+
+template <class R, class T, class... P>
 static _FORCE_INLINE_ void vc_validated_call(R (T::*method)(P...), Variant *base, const Variant **p_args, Variant *r_ret) {
 	call_with_validated_variant_args_ret(base, method, p_args, r_ret);
 }
@@ -91,6 +102,16 @@ static _FORCE_INLINE_ void vc_validated_call(void (T::*method)(P...) const, Vari
 	call_with_validated_variant_argsc(base, method, p_args);
 }
 
+template <class R, class T, class... P>
+static _FORCE_INLINE_ void vc_validated_call_static(R (*method)(T *, P...), Variant *base, const Variant **p_args, Variant *r_ret) {
+	call_with_validated_variant_args_static_retc(base, method, p_args, r_ret); // Overload for functions returning R: r_ret is forwarded.
+}
+
+template <class T, class... P>
+static _FORCE_INLINE_ void vc_validated_call_static(void (*method)(T *, P...), Variant *base, const Variant **p_args, Variant *r_ret) {
+	call_with_validated_variant_args_static(base, method, p_args); // Overload for void functions: r_ret keeps the signature uniform for FUNCTION_CLASS but is not used.
+}
+
 template <class R, class... P>
 static _FORCE_INLINE_ void vc_validated_static_call(R (*method)(P...), const Variant **p_args, Variant *r_ret) {
 	call_with_validated_variant_args_static_method_ret(method, p_args, r_ret);
@@ -122,31 +143,6 @@ static _FORCE_INLINE_ void vc_ptrcall(void (T::*method)(P...) const, void *p_bas
 }
 
 template <class R, class T, class... P>
-static _FORCE_INLINE_ void vc_change_return_type(R (T::*method)(P...), Variant *v) {
-	VariantTypeAdjust<R>::adjust(v);
-}
-
-template <class R, class T, class... P>
-static _FORCE_INLINE_ void vc_change_return_type(R (T::*method)(P...) const, Variant *v) {
-	VariantTypeAdjust<R>::adjust(v);
-}
-
-template <class T, class... P>
-static _FORCE_INLINE_ void vc_change_return_type(void (T::*method)(P...), Variant *v) {
-	VariantInternal::clear(v);
-}
-
-template <class T, class... P>
-static _FORCE_INLINE_ void vc_change_return_type(void (T::*method)(P...) const, Variant *v) {
-	VariantInternal::clear(v);
-}
-
-template <class R, class... P>
-static _FORCE_INLINE_ void vc_change_return_type(R (*method)(P...), Variant *v) {
-	VariantTypeAdjust<R>::adjust(v);
-}
-
-template <class R, class T, class... P>
 static _FORCE_INLINE_ int vc_get_argument_count(R (T::*method)(P...)) {
 	return sizeof...(P);
 }
@@ -229,6 +225,11 @@ static _FORCE_INLINE_ Variant::Type vc_get_return_type(R (*method)(P...)) {
 	return GetTypeInfo<R>::VARIANT_TYPE;
 }
 
+template <class... P>
+static _FORCE_INLINE_ Variant::Type vc_get_return_type(void (*method)(P...)) {
+	return Variant::NIL; // A void function pointer reports NIL as its return type.
+}
+
 template <class R, class T, class... P>
 static _FORCE_INLINE_ bool vc_has_return_type(R (T::*method)(P...)) {
 	return true;
@@ -302,7 +303,6 @@ static _FORCE_INLINE_ Variant::Type vc_get_base_type(void (T::*method)(P...) con
 			vc_method_call(m_method_ptr, base, p_args, p_argcount, r_ret, p_defvals, r_error);                                                                    \
 		}                                                                                                                                                         \
 		static void validated_call(Variant *base, const Variant **p_args, int p_argcount, Variant *r_ret) {                                                       \
-			vc_change_return_type(m_method_ptr, r_ret);                                                                                                           \
 			vc_validated_call(m_method_ptr, base, p_args, r_ret);                                                                                                 \
 		}                                                                                                                                                         \
 		static void ptrcall(void *p_base, const void **p_args, void *r_ret, int p_argcount) {                                                                     \
@@ -353,7 +353,6 @@ static _FORCE_INLINE_ void vc_static_ptrcall(void (*method)(P...), const void **
 			vc_static_method_call(m_method_ptr, p_args, p_argcount, r_ret, p_defvals, r_error);                                                                   \
 		}                                                                                                                                                         \
 		static void validated_call(Variant *base, const Variant **p_args, int p_argcount, Variant *r_ret) {                                                       \
-			vc_change_return_type(m_method_ptr, r_ret);                                                                                                           \
 			vc_validated_static_call(m_method_ptr, p_args, r_ret);                                                                                                \
 		}                                                                                                                                                         \
 		static void ptrcall(void *p_base, const void **p_args, void *r_ret, int p_argcount) {                                                                     \
@@ -393,45 +392,49 @@ static _FORCE_INLINE_ void vc_ptrcall(R (*method)(T *, P...), void *p_base, cons
 	call_with_ptr_args_static_retc<T, R, P...>(reinterpret_cast<T *>(p_base), method, p_args, r_ret);
 }
 
-#define FUNCTION_CLASS(m_class, m_method_name, m_method_ptr)                                                                                                          \
-	struct Method_##m_class##_##m_method_name {                                                                                                                       \
-		static void call(Variant *base, const Variant **p_args, int p_argcount, Variant &r_ret, const Vector<Variant> &p_defvals, Callable::CallError &r_error) {     \
-			call_with_variant_args_retc_static_helper_dv(VariantGetInternalPtr<m_class>::get_ptr(base), m_method_ptr, p_args, p_argcount, r_ret, p_defvals, r_error); \
-		}                                                                                                                                                             \
-		static void validated_call(Variant *base, const Variant **p_args, int p_argcount, Variant *r_ret) {                                                           \
-			vc_change_return_type(m_method_ptr, r_ret);                                                                                                               \
-			call_with_validated_variant_args_static_retc(base, m_method_ptr, p_args, r_ret);                                                                          \
-		}                                                                                                                                                             \
-		static void ptrcall(void *p_base, const void **p_args, void *r_ret, int p_argcount) {                                                                         \
-			vc_ptrcall(m_method_ptr, p_base, p_args, r_ret);                                                                                                          \
-		}                                                                                                                                                             \
-		static int get_argument_count() {                                                                                                                             \
-			return vc_get_argument_count(m_method_ptr);                                                                                                               \
-		}                                                                                                                                                             \
-		static Variant::Type get_argument_type(int p_arg) {                                                                                                           \
-			return vc_get_argument_type(m_method_ptr, p_arg);                                                                                                         \
-		}                                                                                                                                                             \
-		static Variant::Type get_return_type() {                                                                                                                      \
-			return vc_get_return_type(m_method_ptr);                                                                                                                  \
-		}                                                                                                                                                             \
-		static bool has_return_type() {                                                                                                                               \
-			return true;                                                                                                                                              \
-		}                                                                                                                                                             \
-		static bool is_const() {                                                                                                                                      \
-			return true;                                                                                                                                              \
-		}                                                                                                                                                             \
-		static bool is_static() {                                                                                                                                     \
-			return false;                                                                                                                                             \
-		}                                                                                                                                                             \
-		static bool is_vararg() {                                                                                                                                     \
-			return false;                                                                                                                                             \
-		}                                                                                                                                                             \
-		static Variant::Type get_base_type() {                                                                                                                        \
-			return GetTypeInfo<m_class>::VARIANT_TYPE;                                                                                                                \
-		}                                                                                                                                                             \
-		static StringName get_name() {                                                                                                                                \
-			return #m_method_name;                                                                                                                                    \
-		}                                                                                                                                                             \
+template <class T, class... P>
+static _FORCE_INLINE_ void vc_ptrcall(void (*method)(T *, P...), void *p_base, const void **p_args, void *r_ret) {
+	call_with_ptr_args_static<T, P...>(reinterpret_cast<T *>(p_base), method, p_args); // Overload for void functions: p_base is reinterpreted as T * and r_ret is not written.
+}
+
+#define FUNCTION_CLASS(m_class, m_method_name, m_method_ptr, m_const) /* Wraps a free function taking the instance pointer first as a Variant method. */          \
+	struct Method_##m_class##_##m_method_name {                                                                                                                   \
+		static void call(Variant *base, const Variant **p_args, int p_argcount, Variant &r_ret, const Vector<Variant> &p_defvals, Callable::CallError &r_error) { \
+			vc_method_call_static(m_method_ptr, base, p_args, p_argcount, r_ret, p_defvals, r_error);                                                             \
+		}                                                                                                                                                         \
+		static void validated_call(Variant *base, const Variant **p_args, int p_argcount, Variant *r_ret) {                                                       \
+			vc_validated_call_static(m_method_ptr, base, p_args, r_ret);                                                                                          \
+		}                                                                                                                                                         \
+		static void ptrcall(void *p_base, const void **p_args, void *r_ret, int p_argcount) {                                                                     \
+			vc_ptrcall(m_method_ptr, p_base, p_args, r_ret);                                                                                                      \
+		}                                                                                                                                                         \
+		static int get_argument_count() {                                                                                                                         \
+			return vc_get_argument_count(m_method_ptr);                                                                                                           \
+		}                                                                                                                                                         \
+		static Variant::Type get_argument_type(int p_arg) {                                                                                                       \
+			return vc_get_argument_type(m_method_ptr, p_arg);                                                                                                     \
+		}                                                                                                                                                         \
+		static Variant::Type get_return_type() {                                                                                                                  \
+			return vc_get_return_type(m_method_ptr);                                                                                                              \
+		}                                                                                                                                                         \
+		static bool has_return_type() {                                                                                                                           \
+			return vc_has_return_type_static(m_method_ptr);                                                                                                       \
+		}                                                                                                                                                         \
+		static bool is_const() {                                                                                                                                  \
+			return m_const; /* Caller states constness; a free function has no const qualifier to inspect. */                                                     \
+		}                                                                                                                                                         \
+		static bool is_static() {                                                                                                                                 \
+			return false; /* Still called on an instance (base), so not reported as static. */                                                                    \
+		}                                                                                                                                                         \
+		static bool is_vararg() {                                                                                                                                 \
+			return false;                                                                                                                                         \
+		}                                                                                                                                                         \
+		static Variant::Type get_base_type() {                                                                                                                    \
+			return GetTypeInfo<m_class>::VARIANT_TYPE;                                                                                                            \
+		}                                                                                                                                                         \
+		static StringName get_name() {                                                                                                                            \
+			return #m_method_name;                                                                                                                                \
+		}                                                                                                                                                         \
+	};
 
 #define VARARG_CLASS(m_class, m_method_name, m_method_ptr, m_has_return, m_return_type)                                                                           \
@@ -497,7 +500,7 @@ struct _VariantCall {
 			const uint8_t *r = p_instance->ptr();
 			CharString cs;
 			cs.resize(p_instance->size() + 1);
-			copymem(cs.ptrw(), r, p_instance->size());
+			memcpy(cs.ptrw(), r, p_instance->size());
 			cs[p_instance->size()] = 0;
 
 			s = cs.get_data();
@@ -590,6 +593,195 @@ struct _VariantCall {
 		return s;
 	}
 
+	static int64_t func_PackedByteArray_decode_u8(PackedByteArray *p_instance, int64_t p_offset) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > int64_t(size) - 1, 0);
+		const uint8_t *r = p_instance->ptr();
+		return r[p_offset];
+	}
+	static int64_t func_PackedByteArray_decode_s8(PackedByteArray *p_instance, int64_t p_offset) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > int64_t(size) - 1, 0);
+		const uint8_t *r = p_instance->ptr();
+		return *((const int8_t *)&r[p_offset]);
+	}
+	static int64_t func_PackedByteArray_decode_u16(PackedByteArray *p_instance, int64_t p_offset) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > (int64_t(size) - 2), 0);
+		const uint8_t *r = p_instance->ptr();
+		return decode_uint16(&r[p_offset]);
+	}
+	static int64_t func_PackedByteArray_decode_s16(PackedByteArray *p_instance, int64_t p_offset) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > (int64_t(size) - 2), 0);
+		const uint8_t *r = p_instance->ptr();
+		return (int16_t)decode_uint16(&r[p_offset]);
+	}
+	static int64_t func_PackedByteArray_decode_u32(PackedByteArray *p_instance, int64_t p_offset) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > (int64_t(size) - 4), 0);
+		const uint8_t *r = p_instance->ptr();
+		return decode_uint32(&r[p_offset]);
+	}
+	static int64_t func_PackedByteArray_decode_s32(PackedByteArray *p_instance, int64_t p_offset) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > (int64_t(size) - 4), 0);
+		const uint8_t *r = p_instance->ptr();
+		return (int32_t)decode_uint32(&r[p_offset]);
+	}
+	static int64_t func_PackedByteArray_decode_u64(PackedByteArray *p_instance, int64_t p_offset) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > (int64_t(size) - 8), 0);
+		const uint8_t *r = p_instance->ptr();
+		return (int64_t)decode_uint64(&r[p_offset]);
+	}
+	static int64_t func_PackedByteArray_decode_s64(PackedByteArray *p_instance, int64_t p_offset) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > (int64_t(size) - 8), 0);
+		const uint8_t *r = p_instance->ptr();
+		return (int64_t)decode_uint64(&r[p_offset]);
+	}
+	static double func_PackedByteArray_decode_half(PackedByteArray *p_instance, int64_t p_offset) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > (int64_t(size) - 2), 0);
+		const uint8_t *r = p_instance->ptr();
+		return Math::half_to_float(decode_uint16(&r[p_offset]));
+	}
+	static double func_PackedByteArray_decode_float(PackedByteArray *p_instance, int64_t p_offset) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > (int64_t(size) - 4), 0);
+		const uint8_t *r = p_instance->ptr();
+		return decode_float(&r[p_offset]);
+	}
+
+	static double func_PackedByteArray_decode_double(PackedByteArray *p_instance, int64_t p_offset) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > (int64_t(size) - 8), 0);
+		const uint8_t *r = p_instance->ptr();
+		return decode_double(&r[p_offset]);
+	}
+
+	static bool func_PackedByteArray_has_encoded_var(PackedByteArray *p_instance, int64_t p_offset, bool p_allow_objects) { // Returns true when decode_variant() succeeds on the bytes starting at p_offset.
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > int64_t(size), false); // `size - p_offset` below is unsigned: an offset past the end would wrap around instead of going negative.
+		const uint8_t *r = p_instance->ptr();
+		Variant ret;
+		Error err = decode_variant(ret, r + p_offset, size - p_offset, nullptr, p_allow_objects);
+		return err == OK;
+	}
+
+	static Variant func_PackedByteArray_decode_var(PackedByteArray *p_instance, int64_t p_offset, bool p_allow_objects) { // Decodes the Variant stored at p_offset; returns an empty Variant when decoding fails.
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > int64_t(size), Variant()); // `size - p_offset` below is unsigned: an offset past the end would wrap around instead of going negative.
+		const uint8_t *r = p_instance->ptr();
+		Variant ret;
+		Error err = decode_variant(ret, r + p_offset, size - p_offset, nullptr, p_allow_objects);
+		if (err != OK) {
+			ret = Variant(); // Discard whatever decode_variant() may have left in `ret` on failure.
+		}
+		return ret;
+	}
+
+	static int64_t func_PackedByteArray_decode_var_size(PackedByteArray *p_instance, int64_t p_offset, bool p_allow_objects) { // Returns the length reported by decode_variant() for the Variant at p_offset, or 0 on failure.
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0 || p_offset > int64_t(size), 0); // `size - p_offset` below is unsigned: an offset past the end would wrap around instead of going negative.
+		const uint8_t *r = p_instance->ptr();
+		Variant ret;
+		int r_size = 0; // Initialised so the value is defined even if decode_variant() does not write it.
+		Error err = decode_variant(ret, r + p_offset, size - p_offset, &r_size, p_allow_objects);
+		if (err == OK) {
+			return r_size;
+		}
+		return 0;
+	}
+
+	static void func_PackedByteArray_encode_u8(PackedByteArray *p_instance, int64_t p_offset, int64_t p_value) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND(p_offset < 0 || p_offset > int64_t(size) - 1);
+		uint8_t *w = p_instance->ptrw();
+		*((uint8_t *)&w[p_offset]) = p_value;
+	}
+	static void func_PackedByteArray_encode_s8(PackedByteArray *p_instance, int64_t p_offset, int64_t p_value) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND(p_offset < 0 || p_offset > int64_t(size) - 1);
+		uint8_t *w = p_instance->ptrw();
+		*((int8_t *)&w[p_offset]) = p_value;
+	}
+
+	static void func_PackedByteArray_encode_u16(PackedByteArray *p_instance, int64_t p_offset, int64_t p_value) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND(p_offset < 0 || p_offset > int64_t(size) - 2);
+		uint8_t *w = p_instance->ptrw();
+		encode_uint16((uint16_t)p_value, &w[p_offset]);
+	}
+	static void func_PackedByteArray_encode_s16(PackedByteArray *p_instance, int64_t p_offset, int64_t p_value) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND(p_offset < 0 || p_offset > int64_t(size) - 2);
+		uint8_t *w = p_instance->ptrw();
+		encode_uint16((int16_t)p_value, &w[p_offset]);
+	}
+
+	static void func_PackedByteArray_encode_u32(PackedByteArray *p_instance, int64_t p_offset, int64_t p_value) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND(p_offset < 0 || p_offset > int64_t(size) - 4);
+		uint8_t *w = p_instance->ptrw();
+		encode_uint32((uint32_t)p_value, &w[p_offset]);
+	}
+	static void func_PackedByteArray_encode_s32(PackedByteArray *p_instance, int64_t p_offset, int64_t p_value) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND(p_offset < 0 || p_offset > int64_t(size) - 4);
+		uint8_t *w = p_instance->ptrw();
+		encode_uint32((int32_t)p_value, &w[p_offset]);
+	}
+
+	static void func_PackedByteArray_encode_u64(PackedByteArray *p_instance, int64_t p_offset, int64_t p_value) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND(p_offset < 0 || p_offset > int64_t(size) - 8);
+		uint8_t *w = p_instance->ptrw();
+		encode_uint64((uint64_t)p_value, &w[p_offset]);
+	}
+	static void func_PackedByteArray_encode_s64(PackedByteArray *p_instance, int64_t p_offset, int64_t p_value) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND(p_offset < 0 || p_offset > int64_t(size) - 8);
+		uint8_t *w = p_instance->ptrw();
+		encode_uint64((int64_t)p_value, &w[p_offset]);
+	}
+
+	static void func_PackedByteArray_encode_half(PackedByteArray *p_instance, int64_t p_offset, double p_value) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND(p_offset < 0 || p_offset > int64_t(size) - 2);
+		uint8_t *w = p_instance->ptrw();
+		encode_uint16(Math::make_half_float(p_value), &w[p_offset]);
+	}
+	static void func_PackedByteArray_encode_float(PackedByteArray *p_instance, int64_t p_offset, double p_value) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND(p_offset < 0 || p_offset > int64_t(size) - 4);
+		uint8_t *w = p_instance->ptrw();
+		encode_float(p_value, &w[p_offset]);
+	}
+	static void func_PackedByteArray_encode_double(PackedByteArray *p_instance, int64_t p_offset, double p_value) {
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND(p_offset < 0 || p_offset > int64_t(size) - 8);
+		uint8_t *w = p_instance->ptrw();
+		encode_double(p_value, &w[p_offset]);
+	}
+	static int64_t func_PackedByteArray_encode_var(PackedByteArray *p_instance, int64_t p_offset, const Variant &p_value, bool p_allow_objects) { // Writes p_value at p_offset; returns the encoded length, or -1 on error or if it does not fit.
+		uint64_t size = p_instance->size();
+		ERR_FAIL_COND_V(p_offset < 0, -1);
+		int len = 0;
+		Error err = encode_variant(p_value, nullptr, len, p_allow_objects); // Null buffer: this call is used only to obtain the encoded length in `len`.
+		if (err != OK) {
+			return -1;
+		}
+		if (uint64_t(p_offset) > size || uint64_t(len) > size - uint64_t(p_offset)) { // Same test as `p_offset + len > size`, written so the signed addition cannot overflow.
+			return -1; // did not fit
+		}
+		uint8_t *w = p_instance->ptrw(); // Write access is requested only once the value is known to fit.
+		encode_variant(p_value, w + p_offset, len, p_allow_objects);
+
+		return len;
+	}
+
 	static void func_Callable_call(Variant *v, const Variant **p_args, int p_argcount, Variant &r_ret, Callable::CallError &r_error) {
 		Callable *callable = VariantGetInternalPtr<Callable>::get_ptr(v);
 		callable->call(p_args, p_argcount, r_ret, r_error);
@@ -1005,11 +1197,21 @@ Variant Variant::get_constant_value(Variant::Type p_type, const StringName &p_va
 
 #ifdef DEBUG_METHODS_ENABLED
 #define bind_function(m_type, m_name, m_method, m_arg_names, m_default_args) \
-	FUNCTION_CLASS(m_type, m_name, m_method);                                \
+	FUNCTION_CLASS(m_type, m_name, m_method, true);                          \
 	register_builtin_method<Method_##m_type##_##m_name>(m_arg_names, m_default_args);
 #else
 #define bind_function(m_type, m_name, m_method, m_arg_names, m_default_args) \
-	FUNCTION_CLASS(m_type, m_name, m_method);                                \
+	FUNCTION_CLASS(m_type, m_name, m_method, true);                          \
+	register_builtin_method<Method_##m_type##_##m_name>(sarray(), m_default_args);
+#endif
+
+#ifdef DEBUG_METHODS_ENABLED
+#define bind_functionnc(m_type, m_name, m_method, m_arg_names, m_default_args) \
+	FUNCTION_CLASS(m_type, m_name, m_method, false);                           \
+	register_builtin_method<Method_##m_type##_##m_name>(m_arg_names, m_default_args);
+#else
+#define bind_functionnc(m_type, m_name, m_method, m_arg_names, m_default_args) \
+	FUNCTION_CLASS(m_type, m_name, m_method, false);                           \
 	register_builtin_method<Method_##m_type##_##m_name>(sarray(), m_default_args);
 #endif
 
@@ -1445,6 +1647,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(Array, resize, sarray("size"), varray());
 	bind_method(Array, insert, sarray("position", "value"), varray());
 	bind_method(Array, remove, sarray("position"), varray());
+	bind_method(Array, fill, sarray("value"), varray());
 	bind_method(Array, erase, sarray("value"), varray());
 	bind_method(Array, front, sarray(), varray());
 	bind_method(Array, back, sarray(), varray());
@@ -1475,6 +1678,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedByteArray, append_array, sarray("array"), varray());
 	bind_method(PackedByteArray, remove, sarray("index"), varray());
 	bind_method(PackedByteArray, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedByteArray, fill, sarray("value"), varray());
 	bind_method(PackedByteArray, resize, sarray("new_size"), varray());
 	bind_method(PackedByteArray, has, sarray("value"), varray());
 	bind_method(PackedByteArray, reverse, sarray(), varray());
@@ -1491,6 +1695,34 @@ static void _register_variant_builtin_methods() {
 	bind_function(PackedByteArray, decompress, _VariantCall::func_PackedByteArray_decompress, sarray("buffer_size", "compression_mode"), varray(0));
 	bind_function(PackedByteArray, decompress_dynamic, _VariantCall::func_PackedByteArray_decompress_dynamic, sarray("max_output_size", "compression_mode"), varray(0));
 
+	bind_function(PackedByteArray, decode_u8, _VariantCall::func_PackedByteArray_decode_u8, sarray("byte_offset"), varray());
+	bind_function(PackedByteArray, decode_s8, _VariantCall::func_PackedByteArray_decode_s8, sarray("byte_offset"), varray());
+	bind_function(PackedByteArray, decode_u16, _VariantCall::func_PackedByteArray_decode_u16, sarray("byte_offset"), varray());
+	bind_function(PackedByteArray, decode_s16, _VariantCall::func_PackedByteArray_decode_s16, sarray("byte_offset"), varray());
+	bind_function(PackedByteArray, decode_u32, _VariantCall::func_PackedByteArray_decode_u32, sarray("byte_offset"), varray());
+	bind_function(PackedByteArray, decode_s32, _VariantCall::func_PackedByteArray_decode_s32, sarray("byte_offset"), varray());
+	bind_function(PackedByteArray, decode_u64, _VariantCall::func_PackedByteArray_decode_u64, sarray("byte_offset"), varray());
+	bind_function(PackedByteArray, decode_s64, _VariantCall::func_PackedByteArray_decode_s64, sarray("byte_offset"), varray());
+	bind_function(PackedByteArray, decode_half, _VariantCall::func_PackedByteArray_decode_half, sarray("byte_offset"), varray());
+	bind_function(PackedByteArray, decode_float, _VariantCall::func_PackedByteArray_decode_float, sarray("byte_offset"), varray());
+	bind_function(PackedByteArray, decode_double, _VariantCall::func_PackedByteArray_decode_double, sarray("byte_offset"), varray());
+	bind_function(PackedByteArray, has_encoded_var, _VariantCall::func_PackedByteArray_has_encoded_var, sarray("byte_offset", "allow_objects"), varray(false));
+	bind_function(PackedByteArray, decode_var, _VariantCall::func_PackedByteArray_decode_var, sarray("byte_offset", "allow_objects"), varray(false));
+	bind_function(PackedByteArray, decode_var_size, _VariantCall::func_PackedByteArray_decode_var_size, sarray("byte_offset", "allow_objects"), varray(false));
+
+	bind_functionnc(PackedByteArray, encode_u8, _VariantCall::func_PackedByteArray_encode_u8, sarray("byte_offset", "value"), varray());
+	bind_functionnc(PackedByteArray, encode_s8, _VariantCall::func_PackedByteArray_encode_s8, sarray("byte_offset", "value"), varray());
+	bind_functionnc(PackedByteArray, encode_u16, _VariantCall::func_PackedByteArray_encode_u16, sarray("byte_offset", "value"), varray());
+	bind_functionnc(PackedByteArray, encode_s16, _VariantCall::func_PackedByteArray_encode_s16, sarray("byte_offset", "value"), varray());
+	bind_functionnc(PackedByteArray, encode_u32, _VariantCall::func_PackedByteArray_encode_u32, sarray("byte_offset", "value"), varray());
+	bind_functionnc(PackedByteArray, encode_s32, _VariantCall::func_PackedByteArray_encode_s32, sarray("byte_offset", "value"), varray());
+	bind_functionnc(PackedByteArray, encode_u64, _VariantCall::func_PackedByteArray_encode_u64, sarray("byte_offset", "value"), varray());
+	bind_functionnc(PackedByteArray, encode_s64, _VariantCall::func_PackedByteArray_encode_s64, sarray("byte_offset", "value"), varray());
+	bind_functionnc(PackedByteArray, encode_half, _VariantCall::func_PackedByteArray_encode_half, sarray("byte_offset", "value"), varray());
+	bind_functionnc(PackedByteArray, encode_float, _VariantCall::func_PackedByteArray_encode_float, sarray("byte_offset", "value"), varray());
+	bind_functionnc(PackedByteArray, encode_double, _VariantCall::func_PackedByteArray_encode_double, sarray("byte_offset", "value"), varray());
+	bind_functionnc(PackedByteArray, encode_var, _VariantCall::func_PackedByteArray_encode_var, sarray("byte_offset", "value", "allow_objects"), varray(false));
+
 	/* Int32 Array */
 
 	bind_method(PackedInt32Array, size, sarray(), varray());
@@ -1501,6 +1733,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedInt32Array, append_array, sarray("array"), varray());
 	bind_method(PackedInt32Array, remove, sarray("index"), varray());
 	bind_method(PackedInt32Array, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedInt32Array, fill, sarray("value"), varray());
 	bind_method(PackedInt32Array, resize, sarray("new_size"), varray());
 	bind_method(PackedInt32Array, has, sarray("value"), varray());
 	bind_method(PackedInt32Array, reverse, sarray(), varray());
@@ -1519,6 +1752,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedInt64Array, append_array, sarray("array"), varray());
 	bind_method(PackedInt64Array, remove, sarray("index"), varray());
 	bind_method(PackedInt64Array, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedInt64Array, fill, sarray("value"), varray());
 	bind_method(PackedInt64Array, resize, sarray("new_size"), varray());
 	bind_method(PackedInt64Array, has, sarray("value"), varray());
 	bind_method(PackedInt64Array, reverse, sarray(), varray());
@@ -1537,6 +1771,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedFloat32Array, append_array, sarray("array"), varray());
 	bind_method(PackedFloat32Array, remove, sarray("index"), varray());
 	bind_method(PackedFloat32Array, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedFloat32Array, fill, sarray("value"), varray());
 	bind_method(PackedFloat32Array, resize, sarray("new_size"), varray());
 	bind_method(PackedFloat32Array, has, sarray("value"), varray());
 	bind_method(PackedFloat32Array, reverse, sarray(), varray());
@@ -1555,6 +1790,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedFloat64Array, append_array, sarray("array"), varray());
 	bind_method(PackedFloat64Array, remove, sarray("index"), varray());
 	bind_method(PackedFloat64Array, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedFloat64Array, fill, sarray("value"), varray());
 	bind_method(PackedFloat64Array, resize, sarray("new_size"), varray());
 	bind_method(PackedFloat64Array, has, sarray("value"), varray());
 	bind_method(PackedFloat64Array, reverse, sarray(), varray());
@@ -1573,6 +1809,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedStringArray, append_array, sarray("array"), varray());
 	bind_method(PackedStringArray, remove, sarray("index"), varray());
 	bind_method(PackedStringArray, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedStringArray, fill, sarray("value"), varray());
 	bind_method(PackedStringArray, resize, sarray("new_size"), varray());
 	bind_method(PackedStringArray, has, sarray("value"), varray());
 	bind_method(PackedStringArray, reverse, sarray(), varray());
@@ -1591,6 +1828,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedVector2Array, append_array, sarray("array"), varray());
 	bind_method(PackedVector2Array, remove, sarray("index"), varray());
 	bind_method(PackedVector2Array, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedVector2Array, fill, sarray("value"), varray());
 	bind_method(PackedVector2Array, resize, sarray("new_size"), varray());
 	bind_method(PackedVector2Array, has, sarray("value"), varray());
 	bind_method(PackedVector2Array, reverse, sarray(), varray());
@@ -1609,6 +1847,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedVector3Array, append_array, sarray("array"), varray());
 	bind_method(PackedVector3Array, remove, sarray("index"), varray());
 	bind_method(PackedVector3Array, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedVector3Array, fill, sarray("value"), varray());
 	bind_method(PackedVector3Array, resize, sarray("new_size"), varray());
 	bind_method(PackedVector3Array, has, sarray("value"), varray());
 	bind_method(PackedVector3Array, reverse, sarray(), varray());
@@ -1627,6 +1866,7 @@ static void _register_variant_builtin_methods() {
 	bind_method(PackedColorArray, append_array, sarray("array"), varray());
 	bind_method(PackedColorArray, remove, sarray("index"), varray());
 	bind_method(PackedColorArray, insert, sarray("at_index", "value"), varray());
+	bind_method(PackedColorArray, fill, sarray("value"), varray());
 	bind_method(PackedColorArray, resize, sarray("new_size"), varray());
 	bind_method(PackedColorArray, has, sarray("value"), varray());
 	bind_method(PackedColorArray, reverse, sarray(), varray());
diff --git a/core/variant/variant_op.cpp b/core/variant/variant_op.cpp
index e0a3cf4215..8cfa793c0e 100644
--- a/core/variant/variant_op.cpp
+++ b/core/variant/variant_op.cpp
@@ -257,6 +257,14 @@ public:
 	static void evaluate(const Variant &p_left, const Variant &p_right, Variant *r_ret, bool &r_valid) {
 		const A &a = *VariantGetInternalPtr<A>::get_ptr(&p_left);
 		const B &b = *VariantGetInternalPtr<B>::get_ptr(&p_right);
+
+#if defined(DEBUG_ENABLED)
+		if (b < 0 || a < 0) {
+			*r_ret = "Invalid operands for bit shifting. Only positive operands are supported.";
+			r_valid = false;
+			return;
+		}
+#endif
 		*r_ret = a << b;
 		r_valid = true;
 	}
@@ -276,6 +284,14 @@ public:
 	static void evaluate(const Variant &p_left, const Variant &p_right, Variant *r_ret, bool &r_valid) {
 		const A &a = *VariantGetInternalPtr<A>::get_ptr(&p_left);
 		const B &b = *VariantGetInternalPtr<B>::get_ptr(&p_right);
+
+#if defined(DEBUG_ENABLED)
+		if (b < 0 || a < 0) {
+			*r_ret = "Invalid operands for bit shifting. Only positive operands are supported.";
+			r_valid = false;
+			return;
+		}
+#endif
 		*r_ret = a >> b;
 		r_valid = true;
 	}
@@ -1365,10 +1381,10 @@ void register_op(Variant::Operator p_op, Variant::Type p_type_a, Variant::Type p
 }
 
 void Variant::_register_variant_operators() {
-	zeromem(operator_return_type_table, sizeof(operator_return_type_table));
-	zeromem(operator_evaluator_table, sizeof(operator_evaluator_table));
-	zeromem(validated_operator_evaluator_table, sizeof(validated_operator_evaluator_table));
-	zeromem(ptr_operator_evaluator_table, sizeof(ptr_operator_evaluator_table));
+	memset(operator_return_type_table, 0, sizeof(operator_return_type_table));
+	memset(operator_evaluator_table, 0, sizeof(operator_evaluator_table));
+	memset(validated_operator_evaluator_table, 0, sizeof(validated_operator_evaluator_table));
+	memset(ptr_operator_evaluator_table, 0, sizeof(ptr_operator_evaluator_table));
 
 	register_op<OperatorEvaluatorAdd<int64_t, int64_t, int64_t>>(Variant::OP_ADD, Variant::INT, Variant::INT);
 	register_op<OperatorEvaluatorAdd<double, int64_t, double>>(Variant::OP_ADD, Variant::INT, Variant::FLOAT);
diff --git a/core/variant/variant_setget.cpp b/core/variant/variant_setget.cpp
index f319631ce5..9ab8602782 100644
--- a/core/variant/variant_setget.cpp
+++ b/core/variant/variant_setget.cpp
@@ -1045,6 +1045,7 @@ void register_indexed_setters_getters() {
 	REGISTER_INDEXED_MEMBER(PackedByteArray);
 	REGISTER_INDEXED_MEMBER(PackedInt32Array);
 	REGISTER_INDEXED_MEMBER(PackedInt64Array);
+	REGISTER_INDEXED_MEMBER(PackedFloat32Array);
 	REGISTER_INDEXED_MEMBER(PackedFloat64Array);
 	REGISTER_INDEXED_MEMBER(PackedVector2Array);
 	REGISTER_INDEXED_MEMBER(PackedVector3Array);
diff --git a/doc/classes/@GlobalScope.xml b/doc/classes/@GlobalScope.xml
index 25f8f22d44..95108f1613 100644
--- a/doc/classes/@GlobalScope.xml
+++ b/doc/classes/@GlobalScope.xml
@@ -804,10 +804,7 @@
 		<method name="randomize">
 			<description>
 				Randomizes the seed (or the internal state) of the random number generator. Current implementation reseeds using a number based on time.
-				[codeblock]
-				func _ready():
-				    randomize()
-				[/codeblock]
+				[b]Note:[/b] This method is called automatically when the project is run. If you need to fix the seed to have reproducible results, use [method seed] to initialize the random number generator.
 			</description>
 		</method>
 		<method name="range_lerp">
diff --git a/doc/classes/AStar.xml b/doc/classes/AStar.xml
index e975b8ed28..fce2b90197 100644
--- a/doc/classes/AStar.xml
+++ b/doc/classes/AStar.xml
@@ -289,6 +289,7 @@
 			</argument>
 			<description>
 				Returns an array with the points that are in the path found by AStar between the given points. The array is ordered from the starting point to the ending point of the path.
+				[b]Note:[/b] This method is not thread-safe. If called from a [Thread], it will return an empty [PackedVector3Array] and will print an error message.
 			</description>
 		</method>
 		<method name="get_point_position" qualifiers="const">
diff --git a/doc/classes/AStar2D.xml b/doc/classes/AStar2D.xml
index 2a51678209..3efd2f604c 100644
--- a/doc/classes/AStar2D.xml
+++ b/doc/classes/AStar2D.xml
@@ -258,6 +258,7 @@
 			</argument>
 			<description>
 				Returns an array with the points that are in the path found by AStar2D between the given points. The array is ordered from the starting point to the ending point of the path.
+				[b]Note:[/b] This method is not thread-safe. If called from a [Thread], it will return an empty [PackedVector2Array] and will print an error message.
 			</description>
 		</method>
 		<method name="get_point_position" qualifiers="const">
diff --git a/doc/classes/AnimationNodeTimeSeek.xml b/doc/classes/AnimationNodeTimeSeek.xml
index eb5335c792..171d65fbe0 100644
--- a/doc/classes/AnimationNodeTimeSeek.xml
+++ b/doc/classes/AnimationNodeTimeSeek.xml
@@ -4,7 +4,27 @@
 		A time-seeking animation node to be used with [AnimationTree].
 	</brief_description>
 	<description>
-		This node can be used to cause a seek command to happen to any sub-children of the graph. After setting the time, this value returns to -1.
+		This node can be used to cause a seek command to happen to any sub-children of the animation graph. Use this node type to play an [Animation] from the start or a certain playback position inside the [AnimationNodeBlendTree]. After setting the time and changing the animation playback, the seek node automatically goes into sleep mode on the next process frame by setting its [code]seek_position[/code] value to [code]-1.0[/code].
+		[codeblocks]
+		[gdscript]
+		# Play child animation from the start.
+		animation_tree.set("parameters/Seek/seek_position", 0.0)
+		# Alternative syntax (same result as above).
+		animation_tree["parameters/Seek/seek_position"] = 0.0
+
+		# Play child animation from the 12-second timestamp.
+		animation_tree.set("parameters/Seek/seek_position", 12.0)
+		# Alternative syntax (same result as above).
+		animation_tree["parameters/Seek/seek_position"] = 12.0
+		[/gdscript]
+		[csharp]
+		// Play child animation from the start.
+		animationTree.Set("parameters/Seek/seek_position", 0.0);
+
+		// Play child animation from the 12-second timestamp.
+		animationTree.Set("parameters/Seek/seek_position", 12.0);
+		[/csharp]
+		[/codeblocks]
 	</description>
 	<tutorials>
 		<link title="AnimationTree">https://docs.godotengine.org/en/latest/tutorials/animation/animation_tree.html</link>
diff --git a/doc/classes/Area2D.xml b/doc/classes/Area2D.xml
index 9711a2a35b..a1e522d146 100644
--- a/doc/classes/Area2D.xml
+++ b/doc/classes/Area2D.xml
@@ -1,10 +1,10 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <class name="Area2D" inherits="CollisionObject2D" version="4.0">
 	<brief_description>
-		2D area for detection and 2D physics influence.
+		2D area for detection, as well as physics and audio influence.
 	</brief_description>
 	<description>
-		2D area that detects [CollisionObject2D] nodes overlapping, entering, or exiting. Can also alter or override local physics parameters (gravity, damping).
+		2D area that detects [CollisionObject2D] nodes overlapping, entering, or exiting. Can also alter or override local physics parameters (gravity, damping) and route audio to a custom audio bus.
 	</description>
 	<tutorials>
 		<link title="Using Area2D">https://docs.godotengine.org/en/latest/tutorials/physics/using_area_2d.html</link>
@@ -13,24 +13,6 @@
 		<link title="2D Platformer Demo">https://godotengine.org/asset-library/asset/120</link>
 	</tutorials>
 	<methods>
-		<method name="get_collision_layer_bit" qualifiers="const">
-			<return type="bool">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<description>
-				Returns an individual bit on the layer mask. Describes whether other areas will collide with this one on the given layer.
-			</description>
-		</method>
-		<method name="get_collision_mask_bit" qualifiers="const">
-			<return type="bool">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<description>
-				Returns an individual bit on the collision mask. Describes whether this area will collide with others on the given layer.
-			</description>
-		</method>
 		<method name="get_overlapping_areas" qualifiers="const">
 			<return type="Area2D[]">
 			</return>
@@ -66,28 +48,6 @@
 				The [code]body[/code] argument can either be a [PhysicsBody2D] or a [TileMap] instance (while TileMaps are not physics body themselves, they register their tiles with collision shapes as a virtual physics body).
 			</description>
 		</method>
-		<method name="set_collision_layer_bit">
-			<return type="void">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<argument index="1" name="value" type="bool">
-			</argument>
-			<description>
-				Set/clear individual bits on the layer mask. This makes getting an area in/out of only one layer easier.
-			</description>
-		</method>
-		<method name="set_collision_mask_bit">
-			<return type="void">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<argument index="1" name="value" type="bool">
-			</argument>
-			<description>
-				Set/clear individual bits on the collision mask. This makes selecting the areas scanned easier.
-			</description>
-		</method>
 	</methods>
 	<members>
 		<member name="angular_damp" type="float" setter="set_angular_damp" getter="get_angular_damp" default="1.0">
@@ -100,12 +60,6 @@
 		<member name="audio_bus_override" type="bool" setter="set_audio_bus_override" getter="is_overriding_audio_bus" default="false">
 			If [code]true[/code], the area's audio bus overrides the default audio bus.
 		</member>
-		<member name="collision_layer" type="int" setter="set_collision_layer" getter="get_collision_layer" default="1">
-			The area's physics layer(s). Collidable objects can exist in any of 32 different layers. A contact is detected if object A is in any of the layers that object B scans, or object B is in any layers that object A scans. See also [member collision_mask]. See [url=https://docs.godotengine.org/en/latest/tutorials/physics/physics_introduction.html#collision-layers-and-masks]Collision layers and masks[/url] in the documentation for more information.
-		</member>
-		<member name="collision_mask" type="int" setter="set_collision_mask" getter="get_collision_mask" default="1">
-			The physics layers this area scans to determine collision detection. See [url=https://docs.godotengine.org/en/latest/tutorials/physics/physics_introduction.html#collision-layers-and-masks]Collision layers and masks[/url] in the documentation for more information.
-		</member>
 		<member name="gravity" type="float" setter="set_gravity" getter="get_gravity" default="98.0">
 			The area's gravity intensity (ranges from -1024 to 1024). This value multiplies the gravity vector. This is useful to alter the force of gravity without altering its direction.
 		</member>
diff --git a/doc/classes/Area3D.xml b/doc/classes/Area3D.xml
index 4271769155..e69a89a836 100644
--- a/doc/classes/Area3D.xml
+++ b/doc/classes/Area3D.xml
@@ -1,34 +1,16 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <class name="Area3D" inherits="CollisionObject3D" version="4.0">
 	<brief_description>
-		General-purpose area node for detection and 3D physics influence.
+		3D area for detection, as well as physics and audio influence.
 	</brief_description>
 	<description>
-		3D area that detects [CollisionObject3D] nodes overlapping, entering, or exiting. Can also alter or override local physics parameters (gravity, damping).
+		3D area that detects [CollisionObject3D] nodes overlapping, entering, or exiting. Can also alter or override local physics parameters (gravity, damping) and route audio to custom audio buses.
 	</description>
 	<tutorials>
 		<link title="3D Platformer Demo">https://godotengine.org/asset-library/asset/125</link>
 		<link title="GUI in 3D Demo">https://godotengine.org/asset-library/asset/127</link>
 	</tutorials>
 	<methods>
-		<method name="get_collision_layer_bit" qualifiers="const">
-			<return type="bool">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<description>
-				Returns an individual bit on the layer mask.
-			</description>
-		</method>
-		<method name="get_collision_mask_bit" qualifiers="const">
-			<return type="bool">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<description>
-				Returns an individual bit on the collision mask.
-			</description>
-		</method>
 		<method name="get_overlapping_areas" qualifiers="const">
 			<return type="Area3D[]">
 			</return>
@@ -64,28 +46,6 @@
 				The [code]body[/code] argument can either be a [PhysicsBody3D] or a [GridMap] instance (while GridMaps are not physics body themselves, they register their tiles with collision shapes as a virtual physics body).
 			</description>
 		</method>
-		<method name="set_collision_layer_bit">
-			<return type="void">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<argument index="1" name="value" type="bool">
-			</argument>
-			<description>
-				Set/clear individual bits on the layer mask. This simplifies editing this [Area3D]'s layers.
-			</description>
-		</method>
-		<method name="set_collision_mask_bit">
-			<return type="void">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<argument index="1" name="value" type="bool">
-			</argument>
-			<description>
-				Set/clear individual bits on the collision mask. This simplifies editing which [Area3D] layers this [Area3D] scans.
-			</description>
-		</method>
 	</methods>
 	<members>
 		<member name="angular_damp" type="float" setter="set_angular_damp" getter="get_angular_damp" default="0.1">
@@ -98,12 +58,6 @@
 		<member name="audio_bus_override" type="bool" setter="set_audio_bus_override" getter="is_overriding_audio_bus" default="false">
 			If [code]true[/code], the area's audio bus overrides the default audio bus.
 		</member>
-		<member name="collision_layer" type="int" setter="set_collision_layer" getter="get_collision_layer" default="1">
-			The area's physics layer(s). Collidable objects can exist in any of 32 different layers. A contact is detected if object A is in any of the layers that object B scans, or object B is in any layers that object A scans. See also [member collision_mask]. See [url=https://docs.godotengine.org/en/latest/tutorials/physics/physics_introduction.html#collision-layers-and-masks]Collision layers and masks[/url] in the documentation for more information.
-		</member>
-		<member name="collision_mask" type="int" setter="set_collision_mask" getter="get_collision_mask" default="1">
-			The physics layers this area scans to determine collision detection. See [url=https://docs.godotengine.org/en/latest/tutorials/physics/physics_introduction.html#collision-layers-and-masks]Collision layers and masks[/url] in the documentation for more information.
-		</member>
 		<member name="gravity" type="float" setter="set_gravity" getter="get_gravity" default="9.8">
 			The area's gravity intensity (ranges from -1024 to 1024). This value multiplies the gravity vector. This is useful to alter the force of gravity without altering its direction.
 		</member>
diff --git a/doc/classes/Array.xml b/doc/classes/Array.xml
index 54bbe7a94b..38b74cb436 100644
--- a/doc/classes/Array.xml
+++ b/doc/classes/Array.xml
@@ -237,6 +237,27 @@
 				[b]Note:[/b] On large arrays, this method will be slower if the removed element is close to the beginning of the array (index 0). This is because all elements placed after the removed element have to be reindexed.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="Variant">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements:
+				[codeblocks]
+				[gdscript]
+				var array = []
+				array.resize(10)
+				array.fill(0) # Initialize the 10 elements to 0.
+				[/gdscript]
+				[csharp]
+				var array = new Godot.Collections.Array{};
+				array.Resize(10);
+				array.Fill(0); // Initialize the 10 elements to 0.
+				[/csharp]
+				[/codeblocks]
+			</description>
+		</method>
 		<method name="find" qualifiers="const">
 			<return type="int">
 			</return>
diff --git a/doc/classes/BaseMaterial3D.xml b/doc/classes/BaseMaterial3D.xml
index c87398ac8f..0a7b4c5dab 100644
--- a/doc/classes/BaseMaterial3D.xml
+++ b/doc/classes/BaseMaterial3D.xml
@@ -142,6 +142,7 @@
 		</member>
 		<member name="clearcoat_enabled" type="bool" setter="set_feature" getter="get_feature" default="false">
 			If [code]true[/code], clearcoat rendering is enabled. Adds a secondary transparent pass to the lighting calculation resulting in an added specular blob. This makes materials appear as if they have a clear layer on them that can be either glossy or rough.
+			[b]Note:[/b] Clearcoat rendering is not visible if the material's [member shading_mode] is [constant SHADING_MODE_UNSHADED].
 		</member>
 		<member name="clearcoat_gloss" type="float" setter="set_clearcoat_gloss" getter="get_clearcoat_gloss" default="0.5">
 			Sets the roughness of the clearcoat pass. A higher value results in a smoother clearcoat while a lower value results in a rougher clearcoat.
@@ -304,6 +305,7 @@
 		</member>
 		<member name="rim_enabled" type="bool" setter="set_feature" getter="get_feature" default="false">
 			If [code]true[/code], rim effect is enabled. Rim lighting increases the brightness at glancing angles on an object.
+			[b]Note:[/b] Rim lighting is not visible if the material's [member shading_mode] is [constant SHADING_MODE_UNSHADED].
 		</member>
 		<member name="rim_texture" type="Texture2D" setter="set_texture" getter="get_texture">
 			Texture used to set the strength of the rim lighting effect per-pixel. Multiplied by [member rim].
@@ -362,6 +364,8 @@
 		<member name="transparency" type="int" setter="set_transparency" getter="get_transparency" enum="BaseMaterial3D.Transparency" default="0">
 			If [code]true[/code], transparency is enabled on the body. See also [member blend_mode].
 		</member>
+		<member name="use_particle_trails" type="bool" setter="set_flag" getter="get_flag" default="false">
+		</member>
 		<member name="use_point_size" type="bool" setter="set_flag" getter="get_flag" default="false">
 			If [code]true[/code], render point size can be changed.
 			[b]Note:[/b] this is only effective for objects whose geometry is point-based rather than triangle-based. See also [member point_size].
@@ -653,7 +657,9 @@
 		<constant name="FLAG_SUBSURFACE_MODE_SKIN" value="18" enum="Flags">
 			Enables the skin mode for subsurface scattering which is used to improve the look of subsurface scattering when used for human skin.
 		</constant>
-		<constant name="FLAG_MAX" value="19" enum="Flags">
+		<constant name="FLAG_PARTICLE_TRAILS_MODE" value="19" enum="Flags">
+		</constant>
+		<constant name="FLAG_MAX" value="20" enum="Flags">
 			Represents the size of the [enum Flags] enum.
 		</constant>
 		<constant name="DIFFUSE_BURLEY" value="0" enum="DiffuseMode">
diff --git a/doc/classes/CapsuleMesh.xml b/doc/classes/CapsuleMesh.xml
index fab11d44cc..031abd0112 100644
--- a/doc/classes/CapsuleMesh.xml
+++ b/doc/classes/CapsuleMesh.xml
@@ -12,7 +12,8 @@
 	</methods>
 	<members>
 		<member name="mid_height" type="float" setter="set_mid_height" getter="get_mid_height" default="1.0">
-			Height of the capsule mesh from the center point.
+			Height of the middle cylindrical part of the capsule (without the hemispherical ends).
+			[b]Note:[/b] The capsule's total height is equal to [member mid_height] + 2 * [member radius].
 		</member>
 		<member name="radial_segments" type="int" setter="set_radial_segments" getter="get_radial_segments" default="64">
 			Number of radial segments on the capsule mesh.
diff --git a/doc/classes/CollisionObject2D.xml b/doc/classes/CollisionObject2D.xml
index e8f7a59e4c..7c4c75bf0f 100644
--- a/doc/classes/CollisionObject2D.xml
+++ b/doc/classes/CollisionObject2D.xml
@@ -31,6 +31,24 @@
 				Creates a new shape owner for the given object. Returns [code]owner_id[/code] of the new owner for future reference.
 			</description>
 		</method>
+		<method name="get_collision_layer_bit" qualifiers="const">
+			<return type="bool">
+			</return>
+			<argument index="0" name="bit" type="int">
+			</argument>
+			<description>
+				Returns whether or not the specified [code]bit[/code] of the [member collision_layer] is set.
+			</description>
+		</method>
+		<method name="get_collision_mask_bit" qualifiers="const">
+			<return type="bool">
+			</return>
+			<argument index="0" name="bit" type="int">
+			</argument>
+			<description>
+				Returns whether or not the specified [code]bit[/code] of the [member collision_mask] is set.
+			</description>
+		</method>
 		<method name="get_rid" qualifiers="const">
 			<return type="RID">
 			</return>
@@ -81,6 +99,30 @@
 				Removes the given shape owner.
 			</description>
 		</method>
+		<method name="set_collision_layer_bit">
+			<return type="void">
+			</return>
+			<argument index="0" name="bit" type="int">
+			</argument>
+			<argument index="1" name="value" type="bool">
+			</argument>
+			<description>
+				If [code]value[/code] is [code]true[/code], sets the specified [code]bit[/code] in the [member collision_layer].
+				If [code]value[/code] is [code]false[/code], clears the specified [code]bit[/code] in the [member collision_layer].
+			</description>
+		</method>
+		<method name="set_collision_mask_bit">
+			<return type="void">
+			</return>
+			<argument index="0" name="bit" type="int">
+			</argument>
+			<argument index="1" name="value" type="bool">
+			</argument>
+			<description>
+				If [code]value[/code] is [code]true[/code], sets the specified [code]bit[/code] in the [member collision_mask].
+				If [code]value[/code] is [code]false[/code], clears the specified [code]bit[/code] in the [member collision_mask].
+			</description>
+		</method>
 		<method name="shape_find_owner" qualifiers="const">
 			<return type="int">
 			</return>
@@ -216,6 +258,14 @@
 		</method>
 	</methods>
 	<members>
+		<member name="collision_layer" type="int" setter="set_collision_layer" getter="get_collision_layer" default="1">
+			The physics layers this CollisionObject2D is in. Collision objects can exist in one or more of 32 different layers. See also [member collision_mask].
+			[b]Note:[/b] A contact is detected if object A is in any of the layers that object B scans, or object B is in any layers that object A scans. See [url=https://docs.godotengine.org/en/latest/tutorials/physics/physics_introduction.html#collision-layers-and-masks]Collision layers and masks[/url] in the documentation for more information.
+		</member>
+		<member name="collision_mask" type="int" setter="set_collision_mask" getter="get_collision_mask" default="1">
+			The physics layers this CollisionObject2D scans. Collision objects can scan one or more of 32 different layers. See also [member collision_layer].
+			[b]Note:[/b] A contact is detected if object A is in any of the layers that object B scans, or object B is in any layers that object A scans. See [url=https://docs.godotengine.org/en/latest/tutorials/physics/physics_introduction.html#collision-layers-and-masks]Collision layers and masks[/url] in the documentation for more information.
+		</member>
 		<member name="input_pickable" type="bool" setter="set_pickable" getter="is_pickable" default="true">
 			If [code]true[/code], this object is pickable. A pickable object can detect the mouse pointer entering/leaving, and if the mouse is inside it, report input events. Requires at least one [code]collision_layer[/code] bit to be set.
 		</member>
diff --git a/doc/classes/CollisionObject3D.xml b/doc/classes/CollisionObject3D.xml
index f8e897653d..522eec5cbe 100644
--- a/doc/classes/CollisionObject3D.xml
+++ b/doc/classes/CollisionObject3D.xml
@@ -35,6 +35,24 @@
 				Creates a new shape owner for the given object. Returns [code]owner_id[/code] of the new owner for future reference.
 			</description>
 		</method>
+		<method name="get_collision_layer_bit" qualifiers="const">
+			<return type="bool">
+			</return>
+			<argument index="0" name="bit" type="int">
+			</argument>
+			<description>
+				Returns whether or not the specified [code]bit[/code] of the [member collision_layer] is set.
+			</description>
+		</method>
+		<method name="get_collision_mask_bit" qualifiers="const">
+			<return type="bool">
+			</return>
+			<argument index="0" name="bit" type="int">
+			</argument>
+			<description>
+				Returns whether or not the specified [code]bit[/code] of the [member collision_mask] is set.
+			</description>
+		</method>
 		<method name="get_rid" qualifiers="const">
 			<return type="RID">
 			</return>
@@ -67,6 +85,30 @@
 				Removes the given shape owner.
 			</description>
 		</method>
+		<method name="set_collision_layer_bit">
+			<return type="void">
+			</return>
+			<argument index="0" name="bit" type="int">
+			</argument>
+			<argument index="1" name="value" type="bool">
+			</argument>
+			<description>
+				If [code]value[/code] is [code]true[/code], sets the specified [code]bit[/code] in the [member collision_layer].
+				If [code]value[/code] is [code]false[/code], clears the specified [code]bit[/code] in the [member collision_layer].
+			</description>
+		</method>
+		<method name="set_collision_mask_bit">
+			<return type="void">
+			</return>
+			<argument index="0" name="bit" type="int">
+			</argument>
+			<argument index="1" name="value" type="bool">
+			</argument>
+			<description>
+				If [code]value[/code] is [code]true[/code], sets the specified [code]bit[/code] in the [member collision_mask].
+				If [code]value[/code] is [code]false[/code], clears the specified [code]bit[/code] in the [member collision_mask].
+			</description>
+		</method>
 		<method name="shape_find_owner" qualifiers="const">
 			<return type="int">
 			</return>
@@ -180,6 +222,14 @@
 		</method>
 	</methods>
 	<members>
+		<member name="collision_layer" type="int" setter="set_collision_layer" getter="get_collision_layer" default="1">
+			The physics layers this CollisionObject3D is in. Collision objects can exist in one or more of 32 different layers. See also [member collision_mask].
+			[b]Note:[/b] A contact is detected if object A is in any of the layers that object B scans, or object B is in any layers that object A scans. See [url=https://docs.godotengine.org/en/latest/tutorials/physics/physics_introduction.html#collision-layers-and-masks]Collision layers and masks[/url] in the documentation for more information.
+		</member>
+		<member name="collision_mask" type="int" setter="set_collision_mask" getter="get_collision_mask" default="1">
+			The physics layers this CollisionObject3D scans. Collision objects can scan one or more of 32 different layers. See also [member collision_layer].
+			[b]Note:[/b] A contact is detected if object A is in any of the layers that object B scans, or object B is in any layers that object A scans. See [url=https://docs.godotengine.org/en/latest/tutorials/physics/physics_introduction.html#collision-layers-and-masks]Collision layers and masks[/url] in the documentation for more information.
+		</member>
 		<member name="input_capture_on_drag" type="bool" setter="set_capture_input_on_drag" getter="get_capture_input_on_drag" default="false">
 			If [code]true[/code], the [CollisionObject3D] will continue to receive input events as the mouse is dragged across its shapes.
 		</member>
diff --git a/doc/classes/Color.xml b/doc/classes/Color.xml
index c33d007735..d645588af2 100644
--- a/doc/classes/Color.xml
+++ b/doc/classes/Color.xml
@@ -269,7 +269,7 @@
 			<argument index="1" name="weight" type="float">
 			</argument>
 			<description>
-				Returns the linear interpolation with another color. The interpolation factor [code]t[/code] is between 0 and 1.
+				Returns the linear interpolation with another color. The interpolation factor [code]weight[/code] is between 0 and 1.
 				[codeblocks]
 				[gdscript]
 				var c1 = Color(1.0, 0.0, 0.0)
diff --git a/doc/classes/ColorPicker.xml b/doc/classes/ColorPicker.xml
index 2fc4313e47..fddfd27573 100644
--- a/doc/classes/ColorPicker.xml
+++ b/doc/classes/ColorPicker.xml
@@ -51,6 +51,9 @@
 			If [code]true[/code], allows editing the color with Hue/Saturation/Value sliders.
 			[b]Note:[/b] Cannot be enabled if raw mode is on.
 		</member>
+		<member name="picker_shape" type="int" setter="set_picker_shape" getter="get_picker_shape" enum="ColorPicker.PickerShapeType" default="0">
+			The shape of the color space view. See [enum PickerShapeType].
+		</member>
 		<member name="presets_enabled" type="bool" setter="set_presets_enabled" getter="are_presets_enabled" default="true">
 			If [code]true[/code], the "add preset" button is enabled.
 		</member>
@@ -86,6 +89,15 @@
 		</signal>
 	</signals>
 	<constants>
+		<constant name="SHAPE_HSV_RECTANGLE" value="0" enum="PickerShapeType">
+			HSV Color Model rectangle color space.
+		</constant>
+		<constant name="SHAPE_HSV_WHEEL" value="1" enum="PickerShapeType">
+			HSV Color Model rectangle color space with a wheel.
+		</constant>
+		<constant name="SHAPE_VHS_CIRCLE" value="2" enum="PickerShapeType">
+			HSV Color Model circle color space. Use Saturation as a radius.
+		</constant>
 	</constants>
 	<theme_items>
 		<theme_item name="add_preset" type="Texture2D">
@@ -110,6 +122,8 @@
 		<theme_item name="overbright_indicator" type="Texture2D">
 			The indicator used to signalize that the color value is outside the 0-1 range.
 		</theme_item>
+		<theme_item name="picker_cursor" type="Texture2D">
+		</theme_item>
 		<theme_item name="preset_bg" type="Texture2D">
 		</theme_item>
 		<theme_item name="screen_picker" type="Texture2D">
diff --git a/doc/classes/Curve.xml b/doc/classes/Curve.xml
index 26872e1f8e..e47c420a3b 100644
--- a/doc/classes/Curve.xml
+++ b/doc/classes/Curve.xml
@@ -108,7 +108,7 @@
 				Returns the Y value for the point that would exist at the X position [code]offset[/code] along the curve.
 			</description>
 		</method>
-		<method name="interpolate_baked">
+		<method name="interpolate_baked" qualifiers="const">
 			<return type="float">
 			</return>
 			<argument index="0" name="offset" type="float">
diff --git a/doc/classes/EditorInterface.xml b/doc/classes/EditorInterface.xml
index 4d0e11fb19..a5328ce382 100644
--- a/doc/classes/EditorInterface.xml
+++ b/doc/classes/EditorInterface.xml
@@ -10,6 +10,15 @@
 	<tutorials>
 	</tutorials>
 	<methods>
+		<method name="edit_node">
+			<return type="void">
+			</return>
+			<argument index="0" name="node" type="Node">
+			</argument>
+			<description>
+				Edits the given [Node]. The node will also be selected if it's inside the scene tree.
+			</description>
+		</method>
 		<method name="edit_resource">
 			<return type="void">
 			</return>
diff --git a/doc/classes/EditorPlugin.xml b/doc/classes/EditorPlugin.xml
index 8dcffb0b74..61f1761249 100644
--- a/doc/classes/EditorPlugin.xml
+++ b/doc/classes/EditorPlugin.xml
@@ -214,7 +214,7 @@
 				[gdscript]
 				func forward_canvas_draw_over_viewport(overlay):
 				    # Draw a circle at cursor position.
-				    overlay.draw_circle(overlay.get_local_mouse_position(), 64)
+				    overlay.draw_circle(overlay.get_local_mouse_position(), 64, Color.white)
 
 				func forward_canvas_gui_input(event):
 				    if event is InputEventMouseMotion:
diff --git a/doc/classes/EditorSelection.xml b/doc/classes/EditorSelection.xml
index 1ff9744b70..63e89750c3 100644
--- a/doc/classes/EditorSelection.xml
+++ b/doc/classes/EditorSelection.xml
@@ -17,6 +17,7 @@
 			</argument>
 			<description>
 				Adds a node to the selection.
+				[b]Note:[/b] The newly selected node will not be automatically edited in the inspector. If you want to edit a node, use [method EditorInterface.edit_node].
 			</description>
 		</method>
 		<method name="clear">
diff --git a/doc/classes/Engine.xml b/doc/classes/Engine.xml
index f9d8cf574a..1147b52102 100644
--- a/doc/classes/Engine.xml
+++ b/doc/classes/Engine.xml
@@ -156,7 +156,15 @@
 	</methods>
 	<members>
 		<member name="editor_hint" type="bool" setter="set_editor_hint" getter="is_editor_hint" default="true">
-			If [code]true[/code], it is running inside the editor. Useful for tool scripts.
+			If [code]true[/code], the script is currently running inside the editor. This is useful for [code]@tool[/code] scripts to conditionally draw editor helpers, or prevent accidentally running "game" code that would affect the scene state while in the editor:
+			[codeblock]
+			if Engine.editor_hint:
+			    draw_gizmos()
+			else:
+			    simulate_physics()
+			[/codeblock]
+			See [url=https://docs.godotengine.org/en/latest/tutorials/plugins/running_code_in_the_editor.html]Running code in the editor[/url] in the documentation for more information.
+			[b]Note:[/b] To detect whether the script is run from an editor [i]build[/i] (e.g. when pressing [kbd]F5[/kbd]), use [method OS.has_feature] with the [code]"editor"[/code] argument instead. [code]OS.has_feature("editor")[/code] will evaluate to [code]true[/code] both when the code is running in the editor and when running the project from the editor, but it will evaluate to [code]false[/code] when the code is run from an exported project.
 		</member>
 		<member name="iterations_per_second" type="int" setter="set_iterations_per_second" getter="get_iterations_per_second" default="60">
 			The number of fixed iterations per second. This controls how often physics simulation and [method Node._physics_process] methods are run. This value should generally always be set to [code]60[/code] or above, as Godot doesn't interpolate the physics step. As a result, values lower than [code]60[/code] will look stuttery. This value can be increased to make input more reactive or work around tunneling issues, but keep in mind doing so will increase CPU usage.
diff --git a/doc/classes/GPUParticles3D.xml b/doc/classes/GPUParticles3D.xml
index aea106af50..e5d6581ddc 100644
--- a/doc/classes/GPUParticles3D.xml
+++ b/doc/classes/GPUParticles3D.xml
@@ -87,18 +87,22 @@
 		<member name="draw_passes" type="int" setter="set_draw_passes" getter="get_draw_passes" default="1">
 			The number of draw passes when rendering particles.
 		</member>
+		<member name="draw_skin" type="Skin" setter="set_skin" getter="get_skin">
+		</member>
 		<member name="emitting" type="bool" setter="set_emitting" getter="is_emitting" default="true">
 			If [code]true[/code], particles are being emitted.
 		</member>
 		<member name="explosiveness" type="float" setter="set_explosiveness_ratio" getter="get_explosiveness_ratio" default="0.0">
 			Time ratio between each emission. If [code]0[/code], particles are emitted continuously. If [code]1[/code], all particles are emitted simultaneously.
 		</member>
-		<member name="fixed_fps" type="int" setter="set_fixed_fps" getter="get_fixed_fps" default="0">
+		<member name="fixed_fps" type="int" setter="set_fixed_fps" getter="get_fixed_fps" default="30">
 			The particle system's frame rate is fixed to a value. For instance, changing the value to 2 will make the particles render at 2 frames per second. Note this does not slow down the simulation of the particle system itself.
 		</member>
 		<member name="fract_delta" type="bool" setter="set_fractional_delta" getter="get_fractional_delta" default="true">
 			If [code]true[/code], results in fractional delta calculation which has a smoother particles display effect.
 		</member>
+		<member name="interpolate" type="bool" setter="set_interpolate" getter="get_interpolate" default="true">
+		</member>
 		<member name="lifetime" type="float" setter="set_lifetime" getter="get_lifetime" default="1.0">
 			Amount of time each particle will exist.
 		</member>
@@ -122,6 +126,12 @@
 		</member>
 		<member name="sub_emitter" type="NodePath" setter="set_sub_emitter" getter="get_sub_emitter" default="NodePath(&quot;&quot;)">
 		</member>
+		<member name="trail_enabled" type="bool" setter="set_enable_trail" getter="is_trail_enabled" default="false">
+		</member>
+		<member name="trail_length_secs" type="float" setter="set_trail_length" getter="get_trail_length" default="0.3">
+		</member>
+		<member name="transform_align" type="int" setter="set_transform_align" getter="get_transform_align" enum="GPUParticles3D.TransformAlign" default="0">
+		</member>
 		<member name="visibility_aabb" type="AABB" setter="set_visibility_aabb" getter="get_visibility_aabb" default="AABB( -4, -4, -4, 8, 8, 8 )">
 			The [AABB] that determines the node's region which needs to be visible on screen for the particle system to be active.
 			Grow the box if particles suddenly appear/disappear when the node enters/exits the screen. The [AABB] can be grown via code or with the [b]Particles → Generate AABB[/b] editor tool.
@@ -150,5 +160,13 @@
 		<constant name="MAX_DRAW_PASSES" value="4">
 			Maximum number of draw passes supported.
 		</constant>
+		<constant name="TRANSFORM_ALIGN_DISABLED" value="0" enum="TransformAlign">
+		</constant>
+		<constant name="TRANSFORM_ALIGN_Z_BILLBOARD" value="1" enum="TransformAlign">
+		</constant>
+		<constant name="TRANSFORM_ALIGN_Y_TO_VELOCITY" value="2" enum="TransformAlign">
+		</constant>
+		<constant name="TRANSFORM_ALIGN_Z_BILLBOARD_Y_TO_VELOCITY" value="3" enum="TransformAlign">
+		</constant>
 	</constants>
 </class>
diff --git a/doc/classes/Geometry2D.xml b/doc/classes/Geometry2D.xml
index 2c0d9b54d1..13354ec19e 100644
--- a/doc/classes/Geometry2D.xml
+++ b/doc/classes/Geometry2D.xml
@@ -184,7 +184,7 @@
 			</argument>
 			<description>
 				Merges (combines) [code]polygon_a[/code] and [code]polygon_b[/code] and returns an array of merged polygons. This performs [constant OPERATION_UNION] between polygons.
-				The operation may result in an outer polygon (boundary) and inner polygon (hole) produced which could be distinguished by calling [method is_polygon_clockwise].
+				The operation may result in an outer polygon (boundary) and multiple inner polygons (holes) produced which could be distinguished by calling [method is_polygon_clockwise].
 			</description>
 		</method>
 		<method name="offset_polygon">
diff --git a/doc/classes/GeometryInstance3D.xml b/doc/classes/GeometryInstance3D.xml
index 631a30abab..b2c3bfc3ed 100644
--- a/doc/classes/GeometryInstance3D.xml
+++ b/doc/classes/GeometryInstance3D.xml
@@ -48,6 +48,8 @@
 		</member>
 		<member name="gi_mode" type="int" setter="set_gi_mode" getter="get_gi_mode" enum="GeometryInstance3D.GIMode" default="0">
 		</member>
+		<member name="ignore_occlusion_culling" type="bool" setter="set_ignore_occlusion_culling" getter="is_ignoring_occlusion_culling" default="false">
+		</member>
 		<member name="lod_bias" type="float" setter="set_lod_bias" getter="get_lod_bias" default="1.0">
 		</member>
 		<member name="lod_max_distance" type="float" setter="set_lod_max_distance" getter="get_lod_max_distance" default="0.0">
diff --git a/doc/classes/HeightMapShape3D.xml b/doc/classes/HeightMapShape3D.xml
index 6d230bdab8..f6f2a27891 100644
--- a/doc/classes/HeightMapShape3D.xml
+++ b/doc/classes/HeightMapShape3D.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <class name="HeightMapShape3D" inherits="Shape3D" version="4.0">
 	<brief_description>
-		Height map shape for 3D physics (Bullet only).
+		Height map shape for 3D physics.
 	</brief_description>
 	<description>
 		Height map shape resource, which can be added to a [PhysicsBody3D] or [Area3D].
diff --git a/doc/classes/Image.xml b/doc/classes/Image.xml
index 9d87c9bf9a..91a07f66e0 100644
--- a/doc/classes/Image.xml
+++ b/doc/classes/Image.xml
@@ -186,7 +186,8 @@
 			<return type="int" enum="Error">
 			</return>
 			<description>
-				Decompresses the image if it is compressed. Returns an error if decompress function is not available.
+				Decompresses the image if it is VRAM compressed in a supported format. Returns [constant OK] if the format is supported, otherwise [constant ERR_UNAVAILABLE].
+				[b]Note:[/b] The following formats can be decompressed: DXT, RGTC, BPTC, PVRTC1. The formats ETC1 and ETC2 are not supported.
 			</description>
 		</method>
 		<method name="detect_alpha" qualifiers="const">
diff --git a/doc/classes/LargeTexture.xml b/doc/classes/LargeTexture.xml
deleted file mode 100644
index a1d172e4b1..0000000000
--- a/doc/classes/LargeTexture.xml
+++ /dev/null
@@ -1,90 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<class name="LargeTexture" inherits="Texture2D" version="4.0">
-	<brief_description>
-		A [Texture2D] capable of storing many smaller textures with offsets.
-	</brief_description>
-	<description>
-		A [Texture2D] capable of storing many smaller textures with offsets.
-		You can dynamically add pieces ([Texture2D]s) to this [LargeTexture] using different offsets.
-	</description>
-	<tutorials>
-	</tutorials>
-	<methods>
-		<method name="add_piece">
-			<return type="int">
-			</return>
-			<argument index="0" name="ofs" type="Vector2">
-			</argument>
-			<argument index="1" name="texture" type="Texture2D">
-			</argument>
-			<description>
-				Adds [code]texture[/code] to this [LargeTexture], starting on offset [code]ofs[/code].
-			</description>
-		</method>
-		<method name="clear">
-			<return type="void">
-			</return>
-			<description>
-				Clears the [LargeTexture].
-			</description>
-		</method>
-		<method name="get_piece_count" qualifiers="const">
-			<return type="int">
-			</return>
-			<description>
-				Returns the number of pieces currently in this [LargeTexture].
-			</description>
-		</method>
-		<method name="get_piece_offset" qualifiers="const">
-			<return type="Vector2">
-			</return>
-			<argument index="0" name="idx" type="int">
-			</argument>
-			<description>
-				Returns the offset of the piece with the index [code]idx[/code].
-			</description>
-		</method>
-		<method name="get_piece_texture" qualifiers="const">
-			<return type="Texture2D">
-			</return>
-			<argument index="0" name="idx" type="int">
-			</argument>
-			<description>
-				Returns the [Texture2D] of the piece with the index [code]idx[/code].
-			</description>
-		</method>
-		<method name="set_piece_offset">
-			<return type="void">
-			</return>
-			<argument index="0" name="idx" type="int">
-			</argument>
-			<argument index="1" name="ofs" type="Vector2">
-			</argument>
-			<description>
-				Sets the offset of the piece with the index [code]idx[/code] to [code]ofs[/code].
-			</description>
-		</method>
-		<method name="set_piece_texture">
-			<return type="void">
-			</return>
-			<argument index="0" name="idx" type="int">
-			</argument>
-			<argument index="1" name="texture" type="Texture2D">
-			</argument>
-			<description>
-				Sets the [Texture2D] of the piece with index [code]idx[/code] to [code]texture[/code].
-			</description>
-		</method>
-		<method name="set_size">
-			<return type="void">
-			</return>
-			<argument index="0" name="size" type="Vector2">
-			</argument>
-			<description>
-				Sets the size of this [LargeTexture].
-			</description>
-		</method>
-	</methods>
-	<constants>
-	</constants>
-</class>
diff --git a/doc/classes/LineEdit.xml b/doc/classes/LineEdit.xml
index 360f5c451e..7adf19632e 100644
--- a/doc/classes/LineEdit.xml
+++ b/doc/classes/LineEdit.xml
@@ -12,34 +12,25 @@
 		- [kbd]Ctrl + Z[/kbd]: Undo
 		- [kbd]Ctrl + ~[/kbd]: Swap input direction.
 		- [kbd]Ctrl + Shift + Z[/kbd]: Redo
-		- [kbd]Ctrl + U[/kbd]: Delete text from the cursor position to the beginning of the line
-		- [kbd]Ctrl + K[/kbd]: Delete text from the cursor position to the end of the line
+		- [kbd]Ctrl + U[/kbd]: Delete text from the caret position to the beginning of the line
+		- [kbd]Ctrl + K[/kbd]: Delete text from the caret position to the end of the line
 		- [kbd]Ctrl + A[/kbd]: Select all text
-		- [kbd]Up Arrow[/kbd]/[kbd]Down Arrow[/kbd]: Move the cursor to the beginning/end of the line
+		- [kbd]Up Arrow[/kbd]/[kbd]Down Arrow[/kbd]: Move the caret to the beginning/end of the line
 		On macOS, some extra keyboard shortcuts are available:
-		- [kbd]Ctrl + F[/kbd]: Same as [kbd]Right Arrow[/kbd], move the cursor one character right
-		- [kbd]Ctrl + B[/kbd]: Same as [kbd]Left Arrow[/kbd], move the cursor one character left
-		- [kbd]Ctrl + P[/kbd]: Same as [kbd]Up Arrow[/kbd], move the cursor to the previous line
-		- [kbd]Ctrl + N[/kbd]: Same as [kbd]Down Arrow[/kbd], move the cursor to the next line
-		- [kbd]Ctrl + D[/kbd]: Same as [kbd]Delete[/kbd], delete the character on the right side of cursor
-		- [kbd]Ctrl + H[/kbd]: Same as [kbd]Backspace[/kbd], delete the character on the left side of the cursor
-		- [kbd]Ctrl + A[/kbd]: Same as [kbd]Home[/kbd], move the cursor to the beginning of the line
-		- [kbd]Ctrl + E[/kbd]: Same as [kbd]End[/kbd], move the cursor to the end of the line
-		- [kbd]Cmd + Left Arrow[/kbd]: Same as [kbd]Home[/kbd], move the cursor to the beginning of the line
-		- [kbd]Cmd + Right Arrow[/kbd]: Same as [kbd]End[/kbd], move the cursor to the end of the line
+		- [kbd]Ctrl + F[/kbd]: Same as [kbd]Right Arrow[/kbd], move the caret one character right
+		- [kbd]Ctrl + B[/kbd]: Same as [kbd]Left Arrow[/kbd], move the caret one character left
+		- [kbd]Ctrl + P[/kbd]: Same as [kbd]Up Arrow[/kbd], move the caret to the previous line
+		- [kbd]Ctrl + N[/kbd]: Same as [kbd]Down Arrow[/kbd], move the caret to the next line
+		- [kbd]Ctrl + D[/kbd]: Same as [kbd]Delete[/kbd], delete the character on the right side of the caret
+		- [kbd]Ctrl + H[/kbd]: Same as [kbd]Backspace[/kbd], delete the character on the left side of the caret
+		- [kbd]Ctrl + A[/kbd]: Same as [kbd]Home[/kbd], move the caret to the beginning of the line
+		- [kbd]Ctrl + E[/kbd]: Same as [kbd]End[/kbd], move the caret to the end of the line
+		- [kbd]Cmd + Left Arrow[/kbd]: Same as [kbd]Home[/kbd], move the caret to the beginning of the line
+		- [kbd]Cmd + Right Arrow[/kbd]: Same as [kbd]End[/kbd], move the caret to the end of the line
 	</description>
 	<tutorials>
 	</tutorials>
 	<methods>
-		<method name="append_at_cursor">
-			<return type="void">
-			</return>
-			<argument index="0" name="text" type="String">
-			</argument>
-			<description>
-				Adds [code]text[/code] after the cursor. If the resulting value is longer than [member max_length], nothing happens.
-			</description>
-		</method>
 		<method name="clear">
 			<return type="void">
 			</return>
@@ -54,11 +45,11 @@
 				Removes all OpenType features.
 			</description>
 		</method>
-		<method name="delete_char_at_cursor">
+		<method name="delete_char_at_caret">
 			<return type="void">
 			</return>
 			<description>
-				Deletes one character at the cursor's current position (equivalent to pressing [kbd]Delete[/kbd]).
+				Deletes one character at the caret's current position (equivalent to pressing [kbd]Delete[/kbd]).
 			</description>
 		</method>
 		<method name="delete_text">
@@ -99,7 +90,16 @@
 			<return type="int">
 			</return>
 			<description>
-				Returns the scroll offset due to [member caret_position], as a number of characters.
+				Returns the scroll offset due to [member caret_column], as a number of characters.
+			</description>
+		</method>
+		<method name="insert_text_at_caret">
+			<return type="void">
+			</return>
+			<argument index="0" name="text" type="String">
+			</argument>
+			<description>
+				Inserts [code]text[/code] at the caret. If the resulting value is longer than [member max_length], nothing happens.
 			</description>
 		</method>
 		<method name="menu_option">
@@ -159,21 +159,21 @@
 		<member name="align" type="int" setter="set_align" getter="get_align" enum="LineEdit.Align" default="0">
 			Text alignment as defined in the [enum Align] enum.
 		</member>
-		<member name="caret_blink" type="bool" setter="cursor_set_blink_enabled" getter="cursor_get_blink_enabled" default="false">
-			If [code]true[/code], the caret (visual cursor) blinks.
+		<member name="caret_blink" type="bool" setter="set_caret_blink_enabled" getter="is_caret_blink_enabled" default="false">
+			If [code]true[/code], the caret (text cursor) blinks.
 		</member>
-		<member name="caret_blink_speed" type="float" setter="cursor_set_blink_speed" getter="cursor_get_blink_speed" default="0.65">
+		<member name="caret_blink_speed" type="float" setter="set_caret_blink_speed" getter="get_caret_blink_speed" default="0.65">
 			Duration (in seconds) of a caret's blinking cycle.
 		</member>
-		<member name="caret_force_displayed" type="bool" setter="cursor_set_force_displayed" getter="cursor_get_force_displayed" default="false">
+		<member name="caret_column" type="int" setter="set_caret_column" getter="get_caret_column" default="0">
+			The caret's column position inside the [LineEdit]. When set, the text may scroll to accommodate it.
 		</member>
-		<member name="caret_mid_grapheme" type="bool" setter="set_mid_grapheme_caret_enabled" getter="get_mid_grapheme_caret_enabled" default="false">
+		<member name="caret_force_displayed" type="bool" setter="set_caret_force_displayed" getter="is_caret_force_displayed" default="false">
+		</member>
+		<member name="caret_mid_grapheme" type="bool" setter="set_caret_mid_grapheme_enabled" getter="is_caret_mid_grapheme_enabled" default="false">
 			Allow moving caret, selecting and removing the individual composite character components.
 			Note: [kbd]Backspace[/kbd] is always removing individual composite character components.
 		</member>
-		<member name="caret_position" type="int" setter="set_cursor_position" getter="get_cursor_position" default="0">
-			The cursor's position inside the [LineEdit]. When set, the text may scroll to accommodate it.
-		</member>
 		<member name="clear_button_enabled" type="bool" setter="set_clear_button_enabled" getter="is_clear_button_enabled" default="false">
 			If [code]true[/code], the [LineEdit] will show a clear button if [code]text[/code] is not empty, which can be used to clear the text quickly.
 		</member>
@@ -186,7 +186,7 @@
 		<member name="editable" type="bool" setter="set_editable" getter="is_editable" default="true">
 			If [code]false[/code], existing text cannot be modified and new text cannot be added.
 		</member>
-		<member name="expand_to_text_length" type="bool" setter="set_expand_to_text_length" getter="get_expand_to_text_length" default="false">
+		<member name="expand_to_text_length" type="bool" setter="set_expand_to_text_length_enabled" getter="is_expand_to_text_length_enabled" default="false">
 			If [code]true[/code], the [LineEdit] width will increase to stay longer than the [member text]. It will [b]not[/b] compress if the [member text] is shortened.
 		</member>
 		<member name="focus_mode" type="int" setter="set_focus_mode" getter="get_focus_mode" override="true" enum="Control.FocusMode" default="2" />
@@ -276,7 +276,7 @@
 			Copies the selected text.
 		</constant>
 		<constant name="MENU_PASTE" value="2" enum="MenuItems">
-			Pastes the clipboard text over the selected text (or at the cursor's position).
+			Pastes the clipboard text over the selected text (or at the caret's position).
 			Non-printable escape characters are automatically stripped from the OS clipboard via [method String.strip_escapes].
 		</constant>
 		<constant name="MENU_CLEAR" value="3" enum="MenuItems">
@@ -359,6 +359,9 @@
 		</constant>
 	</constants>
 	<theme_items>
+		<theme_item name="caret_color" type="Color" default="Color( 0.94, 0.94, 0.94, 1 )">
+			Color of the [LineEdit]'s caret (text cursor).
+		</theme_item>
 		<theme_item name="clear" type="Texture2D">
 			Texture for the clear button. See [member clear_button_enabled].
 		</theme_item>
@@ -368,9 +371,6 @@
 		<theme_item name="clear_button_color_pressed" type="Color" default="Color( 1, 1, 1, 1 )">
 			Color used for the clear button when it's pressed.
 		</theme_item>
-		<theme_item name="cursor_color" type="Color" default="Color( 0.94, 0.94, 0.94, 1 )">
-			Color of the [LineEdit]'s visual cursor (caret).
-		</theme_item>
 		<theme_item name="focus" type="StyleBox">
 			Background used when [LineEdit] has GUI focus.
 		</theme_item>
diff --git a/doc/classes/MeshInstance3D.xml b/doc/classes/MeshInstance3D.xml
index 82cd392cd3..b5ab296bd0 100644
--- a/doc/classes/MeshInstance3D.xml
+++ b/doc/classes/MeshInstance3D.xml
@@ -27,6 +27,13 @@
 				This helper creates a [MeshInstance3D] child node with gizmos at every vertex calculated from the mesh geometry. It's mainly used for testing.
 			</description>
 		</method>
+		<method name="create_multiple_convex_collisions">
+			<return type="void">
+			</return>
+			<description>
+				This helper creates a [StaticBody3D] child node with multiple [ConvexPolygonShape3D] collision shapes calculated from the mesh geometry via convex decomposition. It's mainly used for testing.
+			</description>
+		</method>
 		<method name="create_trimesh_collision">
 			<return type="void">
 			</return>
@@ -43,7 +50,7 @@
 				Returns the [Material] that will be used by the [Mesh] when drawing. This can return the [member GeometryInstance3D.material_override], the surface override [Material] defined in this [MeshInstance3D], or the surface [Material] defined in the [Mesh]. For example, if [member GeometryInstance3D.material_override] is used, all surfaces will return the override material.
 			</description>
 		</method>
-		<method name="get_surface_material" qualifiers="const">
+		<method name="get_surface_override_material" qualifiers="const">
 			<return type="Material">
 			</return>
 			<argument index="0" name="surface" type="int">
@@ -52,14 +59,14 @@
 				Returns the override [Material] for the specified surface of the [Mesh] resource.
 			</description>
 		</method>
-		<method name="get_surface_material_count" qualifiers="const">
+		<method name="get_surface_override_material_count" qualifiers="const">
 			<return type="int">
 			</return>
 			<description>
-				Returns the number of surface materials.
+				Returns the number of surface override materials. This is equivalent to [method Mesh.get_surface_count].
 			</description>
 		</method>
-		<method name="set_surface_material">
+		<method name="set_surface_override_material">
 			<return type="void">
 			</return>
 			<argument index="0" name="surface" type="int">
diff --git a/doc/classes/NavigationAgent2D.xml b/doc/classes/NavigationAgent2D.xml
index 1060e2de41..de81ae4d91 100644
--- a/doc/classes/NavigationAgent2D.xml
+++ b/doc/classes/NavigationAgent2D.xml
@@ -44,6 +44,12 @@
 				Returns a [Vector2] in global coordinates, that can be moved to, making sure that there are no static objects in the way. If the agent does not have a navigation path, it will return the position of the agent's parent.
 			</description>
 		</method>
+		<method name="get_rid" qualifiers="const">
+			<return type="RID">
+			</return>
+			<description>
+			</description>
+		</method>
 		<method name="get_target_location" qualifiers="const">
 			<return type="Vector2">
 			</return>
diff --git a/doc/classes/NavigationAgent3D.xml b/doc/classes/NavigationAgent3D.xml
index 00e9db0a33..8942a37774 100644
--- a/doc/classes/NavigationAgent3D.xml
+++ b/doc/classes/NavigationAgent3D.xml
@@ -44,6 +44,12 @@
 				Returns a [Vector3] in global coordinates, that can be moved to, making sure that there are no static objects in the way. If the agent does not have a navigation path, it will return the origin of the agent's parent.
 			</description>
 		</method>
+		<method name="get_rid" qualifiers="const">
+			<return type="RID">
+			</return>
+			<description>
+			</description>
+		</method>
 		<method name="get_target_location" qualifiers="const">
 			<return type="Vector3">
 			</return>
diff --git a/doc/classes/Node.xml b/doc/classes/Node.xml
index 7750d45226..523f3a0c17 100644
--- a/doc/classes/Node.xml
+++ b/doc/classes/Node.xml
@@ -37,13 +37,13 @@
 				Corresponds to the [constant NOTIFICATION_EXIT_TREE] notification in [method Object._notification] and signal [signal tree_exiting]. To get notified when the node has already left the active tree, connect to the [signal tree_exited].
 			</description>
 		</method>
-		<method name="_get_configuration_warning" qualifiers="virtual">
-			<return type="String">
+		<method name="_get_configuration_warnings" qualifiers="virtual">
+			<return type="String[]">
 			</return>
 			<description>
-				The string returned from this method is displayed as a warning in the Scene Dock if the script that overrides it is a [code]tool[/code] script.
-				Returning an empty string produces no warning.
-				Call [method update_configuration_warning] when the warning needs to be updated for this node.
+				The elements in the array returned from this method are displayed as warnings in the Scene Dock if the script that overrides it is a [code]tool[/code] script.
+				Returning an empty array produces no warnings.
+				Call [method update_configuration_warnings] when the warnings need to be updated for this node.
 			</description>
 		</method>
 		<method name="_input" qualifiers="virtual">
@@ -856,12 +856,12 @@
 				Sets whether this is an instance load placeholder. See [InstancePlaceholder].
 			</description>
 		</method>
-		<method name="update_configuration_warning">
+		<method name="update_configuration_warnings">
 			<return type="void">
 			</return>
 			<description>
 				Updates the warning displayed for this node in the Scene Dock.
-				Use [method _get_configuration_warning] to setup the warning message to display.
+				Use [method _get_configuration_warnings] to set up the warning messages to display.
 			</description>
 		</method>
 	</methods>
diff --git a/doc/classes/Occluder3D.xml b/doc/classes/Occluder3D.xml
new file mode 100644
index 0000000000..fc676c2b49
--- /dev/null
+++ b/doc/classes/Occluder3D.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<class name="Occluder3D" inherits="Resource" version="4.0">
+	<brief_description>
+	</brief_description>
+	<description>
+	</description>
+	<tutorials>
+	</tutorials>
+	<methods>
+	</methods>
+	<members>
+		<member name="indices" type="PackedInt32Array" setter="set_indices" getter="get_indices" default="PackedInt32Array(  )">
+		</member>
+		<member name="vertices" type="PackedVector3Array" setter="set_vertices" getter="get_vertices" default="PackedVector3Array(  )">
+		</member>
+	</members>
+	<constants>
+	</constants>
+</class>
diff --git a/doc/classes/OccluderInstance3D.xml b/doc/classes/OccluderInstance3D.xml
new file mode 100644
index 0000000000..76b784d21d
--- /dev/null
+++ b/doc/classes/OccluderInstance3D.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<class name="OccluderInstance3D" inherits="Node3D" version="4.0">
+	<brief_description>
+	</brief_description>
+	<description>
+	</description>
+	<tutorials>
+	</tutorials>
+	<methods>
+		<method name="get_bake_mask_bit" qualifiers="const">
+			<return type="bool">
+			</return>
+			<argument index="0" name="layer" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="set_bake_mask_bit">
+			<return type="void">
+			</return>
+			<argument index="0" name="layer" type="int">
+			</argument>
+			<argument index="1" name="enabled" type="bool">
+			</argument>
+			<description>
+			</description>
+		</method>
+	</methods>
+	<members>
+		<member name="bake_mask" type="int" setter="set_bake_mask" getter="get_bake_mask" default="4294967295">
+		</member>
+		<member name="occluder" type="Occluder3D" setter="set_occluder" getter="get_occluder">
+		</member>
+	</members>
+	<constants>
+	</constants>
+</class>
diff --git a/doc/classes/PackedByteArray.xml b/doc/classes/PackedByteArray.xml
index 21f835a53c..0652cf0aa1 100644
--- a/doc/classes/PackedByteArray.xml
+++ b/doc/classes/PackedByteArray.xml
@@ -61,6 +61,114 @@
 				Returns a new [PackedByteArray] with the data compressed. Set the compression mode using one of [enum File.CompressionMode]'s constants.
 			</description>
 		</method>
+		<method name="decode_double" qualifiers="const">
+			<return type="float">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="decode_float" qualifiers="const">
+			<return type="float">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="decode_half" qualifiers="const">
+			<return type="float">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="decode_s16" qualifiers="const">
+			<return type="int">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="decode_s32" qualifiers="const">
+			<return type="int">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="decode_s64" qualifiers="const">
+			<return type="int">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="decode_s8" qualifiers="const">
+			<return type="int">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="decode_u16" qualifiers="const">
+			<return type="int">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="decode_u32" qualifiers="const">
+			<return type="int">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="decode_u64" qualifiers="const">
+			<return type="int">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="decode_u8" qualifiers="const">
+			<return type="int">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="decode_var" qualifiers="const">
+			<return type="Variant">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="allow_objects" type="bool" default="false">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="decode_var_size" qualifiers="const">
+			<return type="int">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="allow_objects" type="bool" default="false">
+			</argument>
+			<description>
+			</description>
+		</method>
 		<method name="decompress" qualifiers="const">
 			<return type="PackedByteArray">
 			</return>
@@ -92,6 +200,137 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="encode_double">
+			<return type="void">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="value" type="float">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="encode_float">
+			<return type="void">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="value" type="float">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="encode_half">
+			<return type="void">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="value" type="float">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="encode_s16">
+			<return type="void">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="value" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="encode_s32">
+			<return type="void">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="value" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="encode_s64">
+			<return type="void">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="value" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="encode_s8">
+			<return type="void">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="value" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="encode_u16">
+			<return type="void">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="value" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="encode_u32">
+			<return type="void">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="value" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="encode_u64">
+			<return type="void">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="value" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="encode_u8">
+			<return type="void">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="value" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="encode_var">
+			<return type="int">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="value" type="Variant">
+			</argument>
+			<argument index="2" name="allow_objects" type="bool" default="false">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="int">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="get_string_from_ascii" qualifiers="const">
 			<return type="String">
 			</return>
@@ -129,6 +368,16 @@
 				Returns [code]true[/code] if the array contains [code]value[/code].
 			</description>
 		</method>
+		<method name="has_encoded_var" qualifiers="const">
+			<return type="bool">
+			</return>
+			<argument index="0" name="byte_offset" type="int">
+			</argument>
+			<argument index="1" name="allow_objects" type="bool" default="false">
+			</argument>
+			<description>
+			</description>
+		</method>
 		<method name="hex_encode" qualifiers="const">
 			<return type="String">
 			</return>
diff --git a/doc/classes/PackedColorArray.xml b/doc/classes/PackedColorArray.xml
index 38240b3154..19cfcd7c87 100644
--- a/doc/classes/PackedColorArray.xml
+++ b/doc/classes/PackedColorArray.xml
@@ -59,6 +59,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="Color">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedFloat32Array.xml b/doc/classes/PackedFloat32Array.xml
index 6be1d24b5d..ab97c9a695 100644
--- a/doc/classes/PackedFloat32Array.xml
+++ b/doc/classes/PackedFloat32Array.xml
@@ -60,6 +60,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="float">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
@@ -111,6 +120,14 @@
 			<description>
 			</description>
 		</method>
+		<method name="operator []" qualifiers="operator">
+			<return type="float">
+			</return>
+			<argument index="0" name="index" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
 		<method name="push_back">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedFloat64Array.xml b/doc/classes/PackedFloat64Array.xml
index fb7817cb41..ad20801b01 100644
--- a/doc/classes/PackedFloat64Array.xml
+++ b/doc/classes/PackedFloat64Array.xml
@@ -60,6 +60,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="float">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedInt32Array.xml b/doc/classes/PackedInt32Array.xml
index 4ee428dfbc..ff4729082e 100644
--- a/doc/classes/PackedInt32Array.xml
+++ b/doc/classes/PackedInt32Array.xml
@@ -60,6 +60,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="int">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedInt64Array.xml b/doc/classes/PackedInt64Array.xml
index 51948fcbc8..195b12b129 100644
--- a/doc/classes/PackedInt64Array.xml
+++ b/doc/classes/PackedInt64Array.xml
@@ -60,6 +60,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="int">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedStringArray.xml b/doc/classes/PackedStringArray.xml
index 9748301dae..22458832da 100644
--- a/doc/classes/PackedStringArray.xml
+++ b/doc/classes/PackedStringArray.xml
@@ -60,6 +60,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="String">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedVector2Array.xml b/doc/classes/PackedVector2Array.xml
index 1b3201b072..6c8791f988 100644
--- a/doc/classes/PackedVector2Array.xml
+++ b/doc/classes/PackedVector2Array.xml
@@ -60,6 +60,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="Vector2">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PackedVector3Array.xml b/doc/classes/PackedVector3Array.xml
index 25d854016a..85d41d7519 100644
--- a/doc/classes/PackedVector3Array.xml
+++ b/doc/classes/PackedVector3Array.xml
@@ -59,6 +59,15 @@
 				Creates a copy of the array, and returns it.
 			</description>
 		</method>
+		<method name="fill">
+			<return type="void">
+			</return>
+			<argument index="0" name="value" type="Vector3">
+			</argument>
+			<description>
+				Assigns the given value to all elements in the array. This can typically be used together with [method resize] to create an array with a given size and initialized elements.
+			</description>
+		</method>
 		<method name="has">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/PacketPeerUDP.xml b/doc/classes/PacketPeerUDP.xml
index d7cf6cc8c6..5d059ad3df 100644
--- a/doc/classes/PacketPeerUDP.xml
+++ b/doc/classes/PacketPeerUDP.xml
@@ -9,11 +9,27 @@
 	<tutorials>
 	</tutorials>
 	<methods>
+		<method name="bind">
+			<return type="int" enum="Error">
+			</return>
+			<argument index="0" name="port" type="int">
+			</argument>
+			<argument index="1" name="bind_address" type="String" default="&quot;*&quot;">
+			</argument>
+			<argument index="2" name="recv_buf_size" type="int" default="65536">
+			</argument>
+			<description>
+				Binds this [PacketPeerUDP] to the specified [code]port[/code] and [code]bind_address[/code] with a buffer size [code]recv_buf_size[/code], allowing it to receive incoming packets.
+				If [code]bind_address[/code] is set to [code]"*"[/code] (default), the peer will be bound on all available addresses (both IPv4 and IPv6).
+				If [code]bind_address[/code] is set to [code]"0.0.0.0"[/code] (for IPv4) or [code]"::"[/code] (for IPv6), the peer will be bound to all available addresses matching that IP type.
+				If [code]bind_address[/code] is set to any valid address (e.g. [code]"192.168.1.101"[/code], [code]"::1"[/code], etc), the peer will only be bound to the interface with that address (or fail if no interface with the given address exists).
+			</description>
+		</method>
 		<method name="close">
 			<return type="void">
 			</return>
 			<description>
-				Closes the UDP socket the [PacketPeerUDP] is currently listening on.
+				Closes the [PacketPeerUDP]'s underlying UDP socket.
 			</description>
 		</method>
 		<method name="connect_to_host">
@@ -28,6 +44,13 @@
 				[b]Note:[/b] Connecting to the remote peer does not help to protect from malicious attacks like IP spoofing, etc. Think about using an encryption technique like SSL or DTLS if you feel like your application is transferring sensitive information.
 			</description>
 		</method>
+		<method name="get_local_port" qualifiers="const">
+			<return type="int">
+			</return>
+			<description>
+				Returns the local port to which this peer is bound.
+			</description>
+		</method>
 		<method name="get_packet_ip" qualifiers="const">
 			<return type="String">
 			</return>
@@ -42,18 +65,18 @@
 				Returns the port of the remote peer that sent the last packet(that was received with [method PacketPeer.get_packet] or [method PacketPeer.get_var]).
 			</description>
 		</method>
-		<method name="is_connected_to_host" qualifiers="const">
+		<method name="is_bound" qualifiers="const">
 			<return type="bool">
 			</return>
 			<description>
-				Returns [code]true[/code] if the UDP socket is open and has been connected to a remote address. See [method connect_to_host].
+				Returns whether this [PacketPeerUDP] is bound to an address and can receive packets.
 			</description>
 		</method>
-		<method name="is_listening" qualifiers="const">
+		<method name="is_connected_to_host" qualifiers="const">
 			<return type="bool">
 			</return>
 			<description>
-				Returns whether this [PacketPeerUDP] is listening.
+				Returns [code]true[/code] if the UDP socket is open and has been connected to a remote address. See [method connect_to_host].
 			</description>
 		</method>
 		<method name="join_multicast_group">
@@ -80,22 +103,6 @@
 				Removes the interface identified by [code]interface_name[/code] from the multicast group specified by [code]multicast_address[/code].
 			</description>
 		</method>
-		<method name="listen">
-			<return type="int" enum="Error">
-			</return>
-			<argument index="0" name="port" type="int">
-			</argument>
-			<argument index="1" name="bind_address" type="String" default="&quot;*&quot;">
-			</argument>
-			<argument index="2" name="recv_buf_size" type="int" default="65536">
-			</argument>
-			<description>
-				Makes this [PacketPeerUDP] listen on the [code]port[/code] binding to [code]bind_address[/code] with a buffer size [code]recv_buf_size[/code].
-				If [code]bind_address[/code] is set to [code]"*"[/code] (default), the peer will listen on all available addresses (both IPv4 and IPv6).
-				If [code]bind_address[/code] is set to [code]"0.0.0.0"[/code] (for IPv4) or [code]"::"[/code] (for IPv6), the peer will listen on all available addresses matching that IP type.
-				If [code]bind_address[/code] is set to any valid address (e.g. [code]"192.168.1.101"[/code], [code]"::1"[/code], etc), the peer will only listen on the interface with that addresses (or fail if no interface with the given address exists).
-			</description>
-		</method>
 		<method name="set_broadcast_enabled">
 			<return type="void">
 			</return>
@@ -122,7 +129,7 @@
 			<return type="int" enum="Error">
 			</return>
 			<description>
-				Waits for a packet to arrive on the listening port. See [method listen].
+				Waits for a packet to arrive on the bound address. See [method bind].
 				[b]Note:[/b] [method wait] can't be interrupted once it has been called. This can be worked around by allowing the other party to send a specific "death pill" packet like this:
 		[codeblocks]
 		[gdscript]
diff --git a/doc/classes/PhysicsBody2D.xml b/doc/classes/PhysicsBody2D.xml
index 9c3c47afba..e43d3bb762 100644
--- a/doc/classes/PhysicsBody2D.xml
+++ b/doc/classes/PhysicsBody2D.xml
@@ -26,24 +26,6 @@
 				Returns an array of nodes that were added as collision exceptions for this body.
 			</description>
 		</method>
-		<method name="get_collision_layer_bit" qualifiers="const">
-			<return type="bool">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<description>
-				Returns an individual bit on the [member collision_layer].
-			</description>
-		</method>
-		<method name="get_collision_mask_bit" qualifiers="const">
-			<return type="bool">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<description>
-				Returns an individual bit on the [member collision_mask].
-			</description>
-		</method>
 		<method name="remove_collision_exception_with">
 			<return type="void">
 			</return>
@@ -53,38 +35,8 @@
 				Removes a body from the list of bodies that this body can't collide with.
 			</description>
 		</method>
-		<method name="set_collision_layer_bit">
-			<return type="void">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<argument index="1" name="value" type="bool">
-			</argument>
-			<description>
-				Sets individual bits on the [member collision_layer] bitmask. Use this if you only need to change one layer's value.
-			</description>
-		</method>
-		<method name="set_collision_mask_bit">
-			<return type="void">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<argument index="1" name="value" type="bool">
-			</argument>
-			<description>
-				Sets individual bits on the [member collision_mask] bitmask. Use this if you only need to change one layer's value.
-			</description>
-		</method>
 	</methods>
 	<members>
-		<member name="collision_layer" type="int" setter="set_collision_layer" getter="get_collision_layer" default="1">
-			The physics layers this area is in.
-			Collidable objects can exist in any of 32 different layers. These layers work like a tagging system, and are not visual. A collidable can use these layers to select with which objects it can collide, using the [member collision_mask] property.
-			A contact is detected if object A is in any of the layers that object B scans, or object B is in any layer scanned by object A. See [url=https://docs.godotengine.org/en/latest/tutorials/physics/physics_introduction.html#collision-layers-and-masks]Collision layers and masks[/url] in the documentation for more information.
-		</member>
-		<member name="collision_mask" type="int" setter="set_collision_mask" getter="get_collision_mask" default="1">
-			The physics layers this area scans for collisions. See [url=https://docs.godotengine.org/en/latest/tutorials/physics/physics_introduction.html#collision-layers-and-masks]Collision layers and masks[/url] in the documentation for more information.
-		</member>
 		<member name="input_pickable" type="bool" setter="set_pickable" getter="is_pickable" override="true" default="false" />
 	</members>
 	<constants>
diff --git a/doc/classes/PhysicsBody3D.xml b/doc/classes/PhysicsBody3D.xml
index 7de65603f9..b320d37d23 100644
--- a/doc/classes/PhysicsBody3D.xml
+++ b/doc/classes/PhysicsBody3D.xml
@@ -26,24 +26,6 @@
 				Returns an array of nodes that were added as collision exceptions for this body.
 			</description>
 		</method>
-		<method name="get_collision_layer_bit" qualifiers="const">
-			<return type="bool">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<description>
-				Returns an individual bit on the [member collision_layer].
-			</description>
-		</method>
-		<method name="get_collision_mask_bit" qualifiers="const">
-			<return type="bool">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<description>
-				Returns an individual bit on the [member collision_mask].
-			</description>
-		</method>
 		<method name="remove_collision_exception_with">
 			<return type="void">
 			</return>
@@ -53,39 +35,7 @@
 				Removes a body from the list of bodies that this body can't collide with.
 			</description>
 		</method>
-		<method name="set_collision_layer_bit">
-			<return type="void">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<argument index="1" name="value" type="bool">
-			</argument>
-			<description>
-				Sets individual bits on the [member collision_layer] bitmask. Use this if you only need to change one layer's value.
-			</description>
-		</method>
-		<method name="set_collision_mask_bit">
-			<return type="void">
-			</return>
-			<argument index="0" name="bit" type="int">
-			</argument>
-			<argument index="1" name="value" type="bool">
-			</argument>
-			<description>
-				Sets individual bits on the [member collision_mask] bitmask. Use this if you only need to change one layer's value.
-			</description>
-		</method>
 	</methods>
-	<members>
-		<member name="collision_layer" type="int" setter="set_collision_layer" getter="get_collision_layer" default="1">
-			The physics layers this area is in.
-			Collidable objects can exist in any of 32 different layers. These layers work like a tagging system, and are not visual. A collidable can use these layers to select with which objects it can collide, using the [member collision_mask] property.
-			A contact is detected if object A is in any of the layers that object B scans, or object B is in any layer scanned by object A. See [url=https://docs.godotengine.org/en/latest/tutorials/physics/physics_introduction.html#collision-layers-and-masks]Collision layers and masks[/url] in the documentation for more information.
-		</member>
-		<member name="collision_mask" type="int" setter="set_collision_mask" getter="get_collision_mask" default="1">
-			The physics layers this area scans for collisions. See [url=https://docs.godotengine.org/en/latest/tutorials/physics/physics_introduction.html#collision-layers-and-masks]Collision layers and masks[/url] in the documentation for more information.
-		</member>
-	</members>
 	<constants>
 	</constants>
 </class>
diff --git a/doc/classes/PhysicsServer2D.xml b/doc/classes/PhysicsServer2D.xml
index 701a430538..229facd08b 100644
--- a/doc/classes/PhysicsServer2D.xml
+++ b/doc/classes/PhysicsServer2D.xml
@@ -659,11 +659,9 @@
 			</return>
 			<argument index="0" name="body" type="RID">
 			</argument>
-			<argument index="1" name="receiver" type="Object">
-			</argument>
-			<argument index="2" name="method" type="StringName">
+			<argument index="1" name="callable" type="Callable">
 			</argument>
-			<argument index="3" name="userdata" type="Variant" default="null">
+			<argument index="2" name="userdata" type="Variant" default="null">
 			</argument>
 			<description>
 				Sets the function used to calculate physics for an object, if that object allows it (see [method body_set_omit_force_integration]).
diff --git a/doc/classes/PhysicsServer3D.xml b/doc/classes/PhysicsServer3D.xml
index c61347ba0b..46de9e5282 100644
--- a/doc/classes/PhysicsServer3D.xml
+++ b/doc/classes/PhysicsServer3D.xml
@@ -653,11 +653,9 @@
 			</return>
 			<argument index="0" name="body" type="RID">
 			</argument>
-			<argument index="1" name="receiver" type="Object">
-			</argument>
-			<argument index="2" name="method" type="StringName">
+			<argument index="1" name="callable" type="Callable">
 			</argument>
-			<argument index="3" name="userdata" type="Variant" default="null">
+			<argument index="2" name="userdata" type="Variant" default="null">
 			</argument>
 			<description>
 				Sets the function used to calculate physics for an object, if that object allows it (see [method body_set_omit_force_integration]).
diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml
index 5b9150ab04..005873c2ff 100644
--- a/doc/classes/ProjectSettings.xml
+++ b/doc/classes/ProjectSettings.xml
@@ -6,7 +6,8 @@
 	<description>
 		Contains global variables accessible from everywhere. Use [method get_setting], [method set_setting] or [method has_setting] to access them. Variables stored in [code]project.godot[/code] are also loaded into ProjectSettings, making this object very useful for reading custom game configuration options.
 		When naming a Project Settings property, use the full path to the setting including the category. For example, [code]"application/config/name"[/code] for the project name. Category and property names can be viewed in the Project Settings dialog.
-		[b]Overriding:[/b] Any project setting can be overridden by creating a file named [code]override.cfg[/code] in the project's root directory. This can also be used in exported projects by placing this file in the same directory as the project binary.
+		[b]Feature tags:[/b] Project settings can be overridden for specific platforms and configurations (debug, release, ...) using [url=https://docs.godotengine.org/en/latest/tutorials/export/feature_tags.html]feature tags[/url].
+		[b]Overriding:[/b] Any project setting can be overridden by creating a file named [code]override.cfg[/code] in the project's root directory. This can also be used in exported projects by placing this file in the same directory as the project binary. Overriding will still take the base project settings' [url=https://docs.godotengine.org/en/latest/tutorials/export/feature_tags.html]feature tags[/url] into account. Therefore, make sure to [i]also[/i] override the setting with the desired feature tags if you want them to override base project settings on all platforms and configurations.
 	</description>
 	<tutorials>
 		<link title="3D Physics Tests Demo">https://godotengine.org/asset-library/asset/675</link>
@@ -255,8 +256,8 @@
 			[b]Note:[/b] Changing this value will also change the user data folder's path if [member application/config/use_custom_user_dir] is [code]false[/code]. After renaming the project, you will no longer be able to access existing data in [code]user://[/code] unless you rename the old folder to match the new project name. See [url=https://docs.godotengine.org/en/latest/tutorials/io/data_paths.html]Data paths[/url] in the documentation for more information.
 		</member>
 		<member name="application/config/project_settings_override" type="String" setter="" getter="" default="&quot;&quot;">
-			Specifies a file to override project settings. For example: [code]user://custom_settings.cfg[/code].
-			[b]Note:[/b] Regardless of this setting's value, [code]res://override.cfg[/code] will still be read to override the project settings (see this class' description at the top).
+			Specifies a file to override project settings. For example: [code]user://custom_settings.cfg[/code]. See "Overriding" in the [ProjectSettings] class description at the top for more information.
+			[b]Note:[/b] Regardless of this setting's value, [code]res://override.cfg[/code] will still be read to override the project settings.
 		</member>
 		<member name="application/config/use_custom_user_dir" type="bool" setter="" getter="" default="false">
 			If [code]true[/code], the project will save user data to its own user directory (see [member application/config/custom_user_dir_name]). This setting is only effective on desktop platforms. A name must be set in the [member application/config/custom_user_dir_name] setting for this to take effect. If [code]false[/code], the project will save user data to [code](OS user data directory)/Godot/app_userdata/(project name)[/code].
@@ -749,6 +750,11 @@
 		<member name="internationalization/locale/fallback" type="String" setter="" getter="" default="&quot;en&quot;">
 			The locale to fall back to if a translation isn't available in a given language. If left empty, [code]en[/code] (English) will be used.
 		</member>
+		<member name="internationalization/locale/include_text_server_data" type="bool" setter="" getter="" default="false">
+			If [code]true[/code], text server break iteration rule sets, dictionaries and other optional data are included in the exported project.
+			[b]Note:[/b] "ICU / HarfBuzz / Graphite" text server data includes dictionaries for Burmese, Chinese, Japanese, Khmer, Lao and Thai as well as Unicode Standard Annex #29 and Unicode Standard Annex #14 word and line breaking rules. Data is about 4 MB large.
+			[b]Note:[/b] "Fallback" text server does not use additional data.
+		</member>
 		<member name="internationalization/locale/test" type="String" setter="" getter="" default="&quot;&quot;">
 			If non-empty, this locale will be used when running the project from the editor.
 		</member>
@@ -1149,7 +1155,7 @@
 		<member name="navigation/3d/default_cell_size" type="float" setter="" getter="" default="0.3">
 			Default cell size for 3D navigation maps. See [method NavigationServer3D.map_set_cell_size].
 		</member>
-		<member name="navigation/3d/default_edge_connection_margin" type="float" setter="" getter="" default="5.0">
+		<member name="navigation/3d/default_edge_connection_margin" type="float" setter="" getter="" default="0.3">
 			Default edge connection margin for 3D navigation maps. See [method NavigationServer3D.map_set_edge_connection_margin].
 		</member>
 		<member name="network/limits/debugger/max_chars_per_second" type="int" setter="" getter="" default="32768">
@@ -1350,10 +1356,6 @@
 			[b]FIXME:[/b] No longer valid after DisplayServer split:
 			In such cases, this property is not updated, so use [code]OS.get_current_video_driver[/code] to query it at run-time.
 		</member>
-		<member name="rendering/driver/rd_renderer/use_low_end_renderer" type="bool" setter="" getter="" default="false">
-		</member>
-		<member name="rendering/driver/rd_renderer/use_low_end_renderer.mobile" type="bool" setter="" getter="" default="true">
-		</member>
 		<member name="rendering/driver/threads/thread_model" type="int" setter="" getter="" default="1">
 			Thread model for rendering. Rendering on a thread can vastly improve performance, but synchronizing to the main thread can cause a bit more jitter.
 		</member>
@@ -1463,6 +1465,12 @@
 		</member>
 		<member name="rendering/mesh_lod/lod_change/threshold_pixels" type="float" setter="" getter="" default="1.0">
 		</member>
+		<member name="rendering/occlusion_culling/bvh_build_quality" type="int" setter="" getter="" default="2">
+		</member>
+		<member name="rendering/occlusion_culling/occlusion_rays_per_thread" type="int" setter="" getter="" default="512">
+		</member>
+		<member name="rendering/occlusion_culling/use_occlusion_culling" type="bool" setter="" getter="" default="false">
+		</member>
 		<member name="rendering/reflections/reflection_atlas/reflection_count" type="int" setter="" getter="" default="64">
 			Number of cubemaps to store in the reflection atlas. The number of [ReflectionProbe]s in a scene will be limited by this amount. A higher number requires more VRAM.
 		</member>
@@ -1571,6 +1579,10 @@
 		</member>
 		<member name="rendering/vulkan/descriptor_pools/max_descriptors_per_pool" type="int" setter="" getter="" default="64">
 		</member>
+		<member name="rendering/vulkan/rendering/back_end" type="int" setter="" getter="" default="0">
+		</member>
+		<member name="rendering/vulkan/rendering/back_end.mobile" type="int" setter="" getter="" default="1">
+		</member>
 		<member name="rendering/vulkan/staging_buffer/block_size_kb" type="int" setter="" getter="" default="256">
 		</member>
 		<member name="rendering/vulkan/staging_buffer/max_size_mb" type="int" setter="" getter="" default="128">
diff --git a/doc/classes/RenderingServer.xml b/doc/classes/RenderingServer.xml
index f82301bcf4..638b0bb297 100644
--- a/doc/classes/RenderingServer.xml
+++ b/doc/classes/RenderingServer.xml
@@ -1317,7 +1317,7 @@
 				Sets the scenario that the instance is in. The scenario is the 3D world that the objects will be displayed in.
 			</description>
 		</method>
-		<method name="instance_set_surface_material">
+		<method name="instance_set_surface_override_material">
 			<return type="void">
 			</return>
 			<argument index="0" name="instance" type="RID">
@@ -1327,7 +1327,7 @@
 			<argument index="2" name="material" type="RID">
 			</argument>
 			<description>
-				Sets the material of a specific surface. Equivalent to [method MeshInstance3D.set_surface_material].
+				Sets the override material of a specific surface. Equivalent to [method MeshInstance3D.set_surface_override_material].
 			</description>
 		</method>
 		<method name="instance_set_transform">
@@ -1999,6 +1999,24 @@
 				Sets the number of instances visible at a given time. If -1, all instances that have been allocated are drawn. Equivalent to [member MultiMesh.visible_instance_count].
 			</description>
 		</method>
+		<method name="occluder_create">
+			<return type="RID">
+			</return>
+			<description>
+			</description>
+		</method>
+		<method name="occluder_set_mesh">
+			<return type="void">
+			</return>
+			<argument index="0" name="arg0" type="RID">
+			</argument>
+			<argument index="1" name="arg1" type="PackedVector3Array">
+			</argument>
+			<argument index="2" name="arg2" type="PackedInt32Array">
+			</argument>
+			<description>
+			</description>
+		</method>
 		<method name="omni_light_create">
 			<return type="RID">
 			</return>
@@ -2412,6 +2430,16 @@
 				The scenario is the 3D world that all the visual instances exist in.
 			</description>
 		</method>
+		<method name="scenario_set_camera_effects">
+			<return type="void">
+			</return>
+			<argument index="0" name="scenario" type="RID">
+			</argument>
+			<argument index="1" name="effects" type="RID">
+			</argument>
+			<description>
+			</description>
+		</method>
 		<method name="scenario_set_debug">
 			<return type="void">
 			</return>
@@ -2897,6 +2925,22 @@
 				Sets the anti-aliasing mode. See [enum ViewportMSAA] for options.
 			</description>
 		</method>
+		<method name="viewport_set_occlusion_culling_build_quality">
+			<return type="void">
+			</return>
+			<argument index="0" name="quality" type="int" enum="RenderingServer.ViewportOcclusionCullingBuildQuality">
+			</argument>
+			<description>
+			</description>
+		</method>
+		<method name="viewport_set_occlusion_rays_per_thread">
+			<return type="void">
+			</return>
+			<argument index="0" name="rays_per_thread" type="int">
+			</argument>
+			<description>
+			</description>
+		</method>
 		<method name="viewport_set_parent_viewport">
 			<return type="void">
 			</return>
@@ -3002,6 +3046,16 @@
 			<description>
 			</description>
 		</method>
+		<method name="viewport_set_use_occlusion_culling">
+			<return type="void">
+			</return>
+			<argument index="0" name="viewport" type="RID">
+			</argument>
+			<argument index="1" name="enable" type="bool">
+			</argument>
+			<description>
+			</description>
+		</method>
 		<method name="viewport_set_use_xr">
 			<return type="void">
 			</return>
@@ -3454,6 +3508,8 @@
 		</constant>
 		<constant name="VIEWPORT_DEBUG_DRAW_GI_BUFFER" value="17" enum="ViewportDebugDraw">
 		</constant>
+		<constant name="VIEWPORT_DEBUG_DRAW_OCCLUDERS" value="23" enum="ViewportDebugDraw">
+		</constant>
 		<constant name="SKY_MODE_QUALITY" value="1" enum="SkyMode">
 			Uses high quality importance sampling to process the radiance map. In general, this results in much higher quality than [constant Sky.PROCESS_MODE_REALTIME] but takes much longer to generate. This should not be used if you plan on changing the sky at runtime. If you are finding that the reflection is not blurry enough and is showing sparkles or fireflies, try increasing [member ProjectSettings.rendering/reflections/sky_reflections/ggx_samples].
 		</constant>
@@ -3606,6 +3662,12 @@
 		<constant name="SCENARIO_DEBUG_SHADELESS" value="3" enum="ScenarioDebugMode">
 			Draw all objects without shading. Equivalent to setting all objects shaders to [code]unshaded[/code].
 		</constant>
+		<constant name="VIEWPORT_OCCLUSION_BUILD_QUALITY_LOW" value="0" enum="ViewportOcclusionCullingBuildQuality">
+		</constant>
+		<constant name="VIEWPORT_OCCLUSION_BUILD_QUALITY_MEDIUM" value="1" enum="ViewportOcclusionCullingBuildQuality">
+		</constant>
+		<constant name="VIEWPORT_OCCLUSION_BUILD_QUALITY_HIGH" value="2" enum="ViewportOcclusionCullingBuildQuality">
+		</constant>
 		<constant name="INSTANCE_NONE" value="0" enum="InstanceType">
 			The instance does not have a type.
 		</constant>
@@ -3638,7 +3700,9 @@
 		<constant name="INSTANCE_LIGHTMAP" value="10" enum="InstanceType">
 			The instance is a lightmap.
 		</constant>
-		<constant name="INSTANCE_MAX" value="11" enum="InstanceType">
+		<constant name="INSTANCE_OCCLUDER" value="11" enum="InstanceType">
+		</constant>
+		<constant name="INSTANCE_MAX" value="12" enum="InstanceType">
 			Represents the size of the [enum InstanceType] enum.
 		</constant>
 		<constant name="INSTANCE_GEOMETRY_MASK" value="30" enum="InstanceType">
@@ -3653,7 +3717,9 @@
 		<constant name="INSTANCE_FLAG_DRAW_NEXT_FRAME_IF_VISIBLE" value="2" enum="InstanceFlags">
 			When set, manually requests to draw geometry on next frame.
 		</constant>
-		<constant name="INSTANCE_FLAG_MAX" value="3" enum="InstanceFlags">
+		<constant name="INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING" value="3" enum="InstanceFlags">
+		</constant>
+		<constant name="INSTANCE_FLAG_MAX" value="4" enum="InstanceFlags">
 			Represents the size of the [enum InstanceFlags] enum.
 		</constant>
 		<constant name="SHADOW_CASTING_SETTING_OFF" value="0" enum="ShadowCastingSetting">
diff --git a/doc/classes/RibbonTrailMesh.xml b/doc/classes/RibbonTrailMesh.xml
new file mode 100644
index 0000000000..771f2e444b
--- /dev/null
+++ b/doc/classes/RibbonTrailMesh.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<class name="RibbonTrailMesh" inherits="PrimitiveMesh" version="4.0">
+	<brief_description>
+	</brief_description>
+	<description>
+	</description>
+	<tutorials>
+	</tutorials>
+	<methods>
+	</methods>
+	<members>
+		<member name="curve" type="Curve" setter="set_curve" getter="get_curve">
+		</member>
+		<member name="section_length" type="float" setter="set_section_length" getter="get_section_length" default="0.2">
+		</member>
+		<member name="section_segments" type="int" setter="set_section_segments" getter="get_section_segments" default="3">
+		</member>
+		<member name="sections" type="int" setter="set_sections" getter="get_sections" default="5">
+		</member>
+		<member name="shape" type="int" setter="set_shape" getter="get_shape" enum="RibbonTrailMesh.Shape" default="1">
+		</member>
+		<member name="size" type="float" setter="set_size" getter="get_size" default="1.0">
+		</member>
+	</members>
+	<constants>
+		<constant name="SHAPE_FLAT" value="0" enum="Shape">
+		</constant>
+		<constant name="SHAPE_CROSS" value="1" enum="Shape">
+		</constant>
+	</constants>
+</class>
diff --git a/doc/classes/SceneTree.xml b/doc/classes/SceneTree.xml
index 9366d7dd44..06800082cb 100644
--- a/doc/classes/SceneTree.xml
+++ b/doc/classes/SceneTree.xml
@@ -21,7 +21,8 @@
 			<argument index="1" name="method" type="StringName">
 			</argument>
 			<description>
-				Calls [code]method[/code] on each member of the given group.
+				Calls [code]method[/code] on each member of the given group. You can pass arguments to [code]method[/code] by specifying them at the end of the method call.
+				[b]Note:[/b] [method call_group] will always call methods with a one-frame delay, in a way similar to [method Object.call_deferred]. To call methods immediately, use [method call_group_flags] with the [constant GROUP_CALL_REALTIME] flag.
 			</description>
 		</method>
 		<method name="call_group_flags" qualifiers="vararg">
@@ -34,7 +35,8 @@
 			<argument index="2" name="method" type="StringName">
 			</argument>
 			<description>
-				Calls [code]method[/code] on each member of the given group, respecting the given [enum GroupCallFlags].
+				Calls [code]method[/code] on each member of the given group, respecting the given [enum GroupCallFlags]. You can pass arguments to [code]method[/code] by specifying them at the end of the method call.
+				[b]Note:[/b] Group call flags are used to control the method calling behavior. If the [constant GROUP_CALL_REALTIME] flag is present in the [code]flags[/code] argument, methods will be called immediately. If this flag isn't present in [code]flags[/code], methods will be called with a one-frame delay in a way similar to [method call_group].
 			</description>
 		</method>
 		<method name="change_scene">
diff --git a/doc/classes/StreamPeerTCP.xml b/doc/classes/StreamPeerTCP.xml
index b6d91715ee..7b7c1d7426 100644
--- a/doc/classes/StreamPeerTCP.xml
+++ b/doc/classes/StreamPeerTCP.xml
@@ -9,6 +9,18 @@
 	<tutorials>
 	</tutorials>
 	<methods>
+		<method name="bind">
+			<return type="int" enum="Error">
+			</return>
+			<argument index="0" name="port" type="int">
+			</argument>
+			<argument index="1" name="host" type="String" default="&quot;*&quot;">
+			</argument>
+			<description>
+				Opens the TCP socket, and binds it to the specified local address.
+				This method is generally not needed, and only used to force the subsequent call to [method connect_to_host] to use the specified [code]host[/code] and [code]port[/code] as source address. This can be desired in some NAT punchthrough techniques, or when forcing the source network interface.
+			</description>
+		</method>
 		<method name="connect_to_host">
 			<return type="int" enum="Error">
 			</return>
@@ -17,7 +29,7 @@
 			<argument index="1" name="port" type="int">
 			</argument>
 			<description>
-				Connects to the specified [code]host:port[/code] pair. A hostname will be resolved if valid. Returns [constant OK] on success or [constant FAILED] on failure.
+				Connects to the specified [code]host:port[/code] pair. A hostname will be resolved if valid. Returns [constant OK] on success.
 			</description>
 		</method>
 		<method name="disconnect_from_host">
@@ -41,6 +53,13 @@
 				Returns the port of this peer.
 			</description>
 		</method>
+		<method name="get_local_port" qualifiers="const">
+			<return type="int">
+			</return>
+			<description>
+				Returns the local port to which this peer is bound.
+			</description>
+		</method>
 		<method name="get_status">
 			<return type="int" enum="StreamPeerTCP.Status">
 			</return>
diff --git a/doc/classes/SubViewport.xml b/doc/classes/SubViewport.xml
index 376082f417..b6e9eda1d1 100644
--- a/doc/classes/SubViewport.xml
+++ b/doc/classes/SubViewport.xml
@@ -34,9 +34,6 @@
 		<member name="size_2d_override_stretch" type="bool" setter="set_size_2d_override_stretch" getter="is_size_2d_override_stretch_enabled" default="false">
 			If [code]true[/code], the 2D size override affects stretch as well.
 		</member>
-		<member name="xr" type="bool" setter="set_use_xr" getter="is_using_xr" default="false">
-			If [code]true[/code], the sub-viewport will be used in AR/VR process.
-		</member>
 	</members>
 	<constants>
 		<constant name="CLEAR_MODE_ALWAYS" value="0" enum="ClearMode">
diff --git a/doc/classes/TCP_Server.xml b/doc/classes/TCP_Server.xml
index 72e9ca923d..ec91d75d47 100644
--- a/doc/classes/TCP_Server.xml
+++ b/doc/classes/TCP_Server.xml
@@ -9,6 +9,13 @@
 	<tutorials>
 	</tutorials>
 	<methods>
+		<method name="get_local_port" qualifiers="const">
+			<return type="int">
+			</return>
+			<description>
+				Returns the local port this server is listening on.
+			</description>
+		</method>
 		<method name="is_connection_available" qualifiers="const">
 			<return type="bool">
 			</return>
diff --git a/doc/classes/Tabs.xml b/doc/classes/Tabs.xml
index 79fa8896e3..d784585e20 100644
--- a/doc/classes/Tabs.xml
+++ b/doc/classes/Tabs.xml
@@ -389,8 +389,6 @@
 		<theme_item name="outline_size" type="int" default="0">
 			The size of the tab text outline.
 		</theme_item>
-		<theme_item name="panel" type="StyleBox">
-		</theme_item>
 		<theme_item name="tab_disabled" type="StyleBox">
 			The style of disabled tabs.
 		</theme_item>
diff --git a/doc/classes/TubeTrailMesh.xml b/doc/classes/TubeTrailMesh.xml
new file mode 100644
index 0000000000..2782791a62
--- /dev/null
+++ b/doc/classes/TubeTrailMesh.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<class name="TubeTrailMesh" inherits="PrimitiveMesh" version="4.0">
+	<brief_description>
+	</brief_description>
+	<description>
+	</description>
+	<tutorials>
+	</tutorials>
+	<methods>
+	</methods>
+	<members>
+		<member name="curve" type="Curve" setter="set_curve" getter="get_curve">
+		</member>
+		<member name="radial_steps" type="int" setter="set_radial_steps" getter="get_radial_steps" default="8">
+		</member>
+		<member name="radius" type="float" setter="set_radius" getter="get_radius" default="1.0">
+		</member>
+		<member name="section_length" type="float" setter="set_section_length" getter="get_section_length" default="0.2">
+		</member>
+		<member name="section_rings" type="int" setter="set_section_rings" getter="get_section_rings" default="3">
+		</member>
+		<member name="sections" type="int" setter="set_sections" getter="get_sections" default="5">
+		</member>
+	</members>
+	<constants>
+	</constants>
+</class>
diff --git a/doc/classes/UDPServer.xml b/doc/classes/UDPServer.xml
index 0fc00f67f8..6f3ccb8a17 100644
--- a/doc/classes/UDPServer.xml
+++ b/doc/classes/UDPServer.xml
@@ -123,6 +123,13 @@
 	<tutorials>
 	</tutorials>
 	<methods>
+		<method name="get_local_port" qualifiers="const">
+			<return type="int">
+			</return>
+			<description>
+				Returns the local port this server is listening on.
+			</description>
+		</method>
 		<method name="is_connection_available" qualifiers="const">
 			<return type="bool">
 			</return>
@@ -145,7 +152,7 @@
 			<argument index="1" name="bind_address" type="String" default="&quot;*&quot;">
 			</argument>
 			<description>
-				Starts the server by opening a UDP socket listening on the given port. You can optionally specify a [code]bind_address[/code] to only listen for packets sent to that address. See also [method PacketPeerUDP.listen].
+				Starts the server by opening a UDP socket listening on the given port. You can optionally specify a [code]bind_address[/code] to only listen for packets sent to that address. See also [method PacketPeerUDP.bind].
 			</description>
 		</method>
 		<method name="poll">
diff --git a/doc/classes/Vector2.xml b/doc/classes/Vector2.xml
index b979425b85..94d4b1a903 100644
--- a/doc/classes/Vector2.xml
+++ b/doc/classes/Vector2.xml
@@ -229,7 +229,7 @@
 			<argument index="1" name="weight" type="float">
 			</argument>
 			<description>
-				Returns the result of the linear interpolation between this vector and [code]b[/code] by amount [code]t[/code]. [code]t[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation.
+				Returns the result of the linear interpolation between this vector and [code]to[/code] by amount [code]weight[/code]. [code]weight[/code] is in the range of 0.0 to 1.0, representing the amount of interpolation.
 			</description>
 		</method>
 		<method name="move_toward" qualifiers="const">
@@ -464,7 +464,7 @@
 			<argument index="1" name="weight" type="float">
 			</argument>
 			<description>
-				Returns the result of spherical linear interpolation between this vector and [code]b[/code], by amount [code]t[/code]. [code]t[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation.
+				Returns the result of spherical linear interpolation between this vector and [code]to[/code], by amount [code]weight[/code]. [code]weight[/code] is in the range of 0.0 to 1.0, representing the amount of interpolation.
 				[b]Note:[/b] Both vectors must be normalized.
 			</description>
 		</method>
diff --git a/doc/classes/Vector3.xml b/doc/classes/Vector3.xml
index bd568e01ec..0a86369506 100644
--- a/doc/classes/Vector3.xml
+++ b/doc/classes/Vector3.xml
@@ -204,7 +204,7 @@
 			<argument index="1" name="weight" type="float">
 			</argument>
 			<description>
-				Returns the result of the linear interpolation between this vector and [code]b[/code] by amount [code]weight[/code]. [code]weight[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation.
+				Returns the result of the linear interpolation between this vector and [code]to[/code] by amount [code]weight[/code]. [code]weight[/code] is in the range of 0.0 to 1.0, representing the amount of interpolation.
 			</description>
 		</method>
 		<method name="max_axis" qualifiers="const">
@@ -484,7 +484,7 @@
 			<argument index="1" name="weight" type="float">
 			</argument>
 			<description>
-				Returns the result of spherical linear interpolation between this vector and [code]b[/code], by amount [code]t[/code]. [code]t[/code] is on the range of 0.0 to 1.0, representing the amount of interpolation.
+				Returns the result of spherical linear interpolation between this vector and [code]to[/code], by amount [code]weight[/code]. [code]weight[/code] is in the range of 0.0 to 1.0, representing the amount of interpolation.
 				[b]Note:[/b] Both vectors must be normalized.
 			</description>
 		</method>
diff --git a/doc/classes/VideoPlayer.xml b/doc/classes/VideoPlayer.xml
index b2ab356b0d..d905ce4054 100644
--- a/doc/classes/VideoPlayer.xml
+++ b/doc/classes/VideoPlayer.xml
@@ -74,6 +74,7 @@
 		</member>
 		<member name="stream_position" type="float" setter="set_stream_position" getter="get_stream_position">
 			The current position of the stream, in seconds.
+			[b]Note:[/b] Changing this value won't have any effect as seeking is not implemented yet, except in video formats implemented by a GDNative add-on.
 		</member>
 		<member name="volume" type="float" setter="set_volume" getter="get_volume">
 			Audio volume as a linear value.
diff --git a/doc/classes/Viewport.xml b/doc/classes/Viewport.xml
index 471d21374d..1c33274cb0 100644
--- a/doc/classes/Viewport.xml
+++ b/doc/classes/Viewport.xml
@@ -267,6 +267,11 @@
 		</member>
 		<member name="use_debanding" type="bool" setter="set_use_debanding" getter="is_using_debanding" default="false">
 		</member>
+		<member name="use_occlusion_culling" type="bool" setter="set_use_occlusion_culling" getter="is_using_occlusion_culling" default="false">
+		</member>
+		<member name="use_xr" type="bool" setter="set_use_xr" getter="is_using_xr" default="false">
+			If [code]true[/code], the viewport will use the primary XR interface to render XR output. When applicable, this can result in a stereoscopic image and the resulting render being output to a headset.
+		</member>
 		<member name="world_2d" type="World2D" setter="set_world_2d" getter="get_world_2d">
 			The custom [World2D] which can be used as 2D environment source.
 		</member>
@@ -419,6 +424,8 @@
 		</constant>
 		<constant name="DEBUG_DRAW_CLUSTER_REFLECTION_PROBES" value="22" enum="DebugDraw">
 		</constant>
+		<constant name="DEBUG_DRAW_OCCLUDERS" value="23" enum="DebugDraw">
+		</constant>
 		<constant name="DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_NEAREST" value="0" enum="DefaultCanvasItemTextureFilter">
 			The texture filter reads from the nearest pixel only. The simplest and fastest method of filtering, but the texture will look pixelized.
 		</constant>
diff --git a/doc/classes/VisualShader.xml b/doc/classes/VisualShader.xml
index c29c30289a..ff00a848b9 100644
--- a/doc/classes/VisualShader.xml
+++ b/doc/classes/VisualShader.xml
@@ -228,7 +228,9 @@
 		</constant>
 		<constant name="TYPE_END" value="5" enum="Type">
 		</constant>
-		<constant name="TYPE_MAX" value="6" enum="Type">
+		<constant name="TYPE_SKY" value="6" enum="Type">
+		</constant>
+		<constant name="TYPE_MAX" value="7" enum="Type">
 			Represents the size of the [enum Type] enum.
 		</constant>
 		<constant name="NODE_ID_INVALID" value="-1">
diff --git a/doc/tools/makerst.py b/doc/tools/makerst.py
index ae3cc73098..1c6055f8ca 100755
--- a/doc/tools/makerst.py
+++ b/doc/tools/makerst.py
@@ -437,7 +437,7 @@ def make_rst_class(class_def, state, dry_run, output_dir):  # type: (ClassDef, S
         for property_def in class_def.properties.values():
             type_rst = property_def.type_name.to_rst(state)
             default = property_def.default_value
-            if property_def.overridden:
+            if default is not None and property_def.overridden:
                 ml.append((type_rst, property_def.name, default + " *(parent override)*"))
             else:
                 ref = ":ref:`{0}<class_{1}_property_{0}>`".format(property_def.name, class_name)
diff --git a/drivers/coreaudio/audio_driver_coreaudio.cpp b/drivers/coreaudio/audio_driver_coreaudio.cpp
index f40036d628..4139727422 100644
--- a/drivers/coreaudio/audio_driver_coreaudio.cpp
+++ b/drivers/coreaudio/audio_driver_coreaudio.cpp
@@ -70,7 +70,7 @@ OSStatus AudioDriverCoreAudio::output_device_address_cb(AudioObjectID inObjectID
 
 Error AudioDriverCoreAudio::init() {
 	AudioComponentDescription desc;
-	zeromem(&desc, sizeof(desc));
+	memset(&desc, 0, sizeof(desc));
 	desc.componentType = kAudioUnitType_Output;
 #ifdef OSX_ENABLED
 	desc.componentSubType = kAudioUnitSubType_HALOutput;
@@ -97,7 +97,7 @@ Error AudioDriverCoreAudio::init() {
 
 	AudioStreamBasicDescription strdesc;
 
-	zeromem(&strdesc, sizeof(strdesc));
+	memset(&strdesc, 0, sizeof(strdesc));
 	UInt32 size = sizeof(strdesc);
 	result = AudioUnitGetProperty(audio_unit, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Output, kOutputBus, &strdesc, &size);
 	ERR_FAIL_COND_V(result != noErr, FAILED);
@@ -118,7 +118,7 @@ Error AudioDriverCoreAudio::init() {
 
 	mix_rate = GLOBAL_GET("audio/driver/mix_rate");
 
-	zeromem(&strdesc, sizeof(strdesc));
+	memset(&strdesc, 0, sizeof(strdesc));
 	strdesc.mFormatID = kAudioFormatLinearPCM;
 	strdesc.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger | kLinearPCMFormatFlagIsPacked;
 	strdesc.mChannelsPerFrame = channels;
@@ -148,7 +148,7 @@ Error AudioDriverCoreAudio::init() {
 	print_verbose("CoreAudio: audio buffer frames: " + itos(buffer_frames) + " calculated latency: " + itos(buffer_frames * 1000 / mix_rate) + "ms");
 
 	AURenderCallbackStruct callback;
-	zeromem(&callback, sizeof(AURenderCallbackStruct));
+	memset(&callback, 0, sizeof(AURenderCallbackStruct));
 	callback.inputProc = &AudioDriverCoreAudio::output_callback;
 	callback.inputProcRefCon = this;
 	result = AudioUnitSetProperty(audio_unit, kAudioUnitProperty_SetRenderCallback, kAudioUnitScope_Input, kOutputBus, &callback, sizeof(callback));
@@ -173,7 +173,7 @@ OSStatus AudioDriverCoreAudio::output_callback(void *inRefCon,
 	if (!ad->active || !ad->try_lock()) {
 		for (unsigned int i = 0; i < ioData->mNumberBuffers; i++) {
 			AudioBuffer *abuf = &ioData->mBuffers[i];
-			zeromem(abuf->mData, abuf->mDataByteSize);
+			memset(abuf->mData, 0, abuf->mDataByteSize);
 		};
 		return 0;
 	};
@@ -293,7 +293,7 @@ void AudioDriverCoreAudio::finish() {
 		lock();
 
 		AURenderCallbackStruct callback;
-		zeromem(&callback, sizeof(AURenderCallbackStruct));
+		memset(&callback, 0, sizeof(AURenderCallbackStruct));
 		result = AudioUnitSetProperty(audio_unit, kAudioUnitProperty_SetRenderCallback, kAudioUnitScope_Input, kOutputBus, &callback, sizeof(callback));
 		if (result != noErr) {
 			ERR_PRINT("AudioUnitSetProperty failed");
@@ -337,7 +337,7 @@ void AudioDriverCoreAudio::finish() {
 
 Error AudioDriverCoreAudio::capture_init() {
 	AudioComponentDescription desc;
-	zeromem(&desc, sizeof(desc));
+	memset(&desc, 0, sizeof(desc));
 	desc.componentType = kAudioUnitType_Output;
 #ifdef OSX_ENABLED
 	desc.componentSubType = kAudioUnitSubType_HALOutput;
@@ -383,7 +383,7 @@ Error AudioDriverCoreAudio::capture_init() {
 #endif
 
 	AudioStreamBasicDescription strdesc;
-	zeromem(&strdesc, sizeof(strdesc));
+	memset(&strdesc, 0, sizeof(strdesc));
 	size = sizeof(strdesc);
 	result = AudioUnitGetProperty(input_unit, kAudioUnitProperty_StreamFormat, kAudioUnitScope_Output, kInputBus, &strdesc, &size);
 	ERR_FAIL_COND_V(result != noErr, FAILED);
@@ -405,7 +405,7 @@ Error AudioDriverCoreAudio::capture_init() {
 
 	mix_rate = GLOBAL_GET("audio/driver/mix_rate");
 
-	zeromem(&strdesc, sizeof(strdesc));
+	memset(&strdesc, 0, sizeof(strdesc));
 	strdesc.mFormatID = kAudioFormatLinearPCM;
 	strdesc.mFormatFlags = kLinearPCMFormatFlagIsSignedInteger | kLinearPCMFormatFlagIsPacked;
 	strdesc.mChannelsPerFrame = capture_channels;
@@ -419,7 +419,7 @@ Error AudioDriverCoreAudio::capture_init() {
 	ERR_FAIL_COND_V(result != noErr, FAILED);
 
 	AURenderCallbackStruct callback;
-	zeromem(&callback, sizeof(AURenderCallbackStruct));
+	memset(&callback, 0, sizeof(AURenderCallbackStruct));
 	callback.inputProc = &AudioDriverCoreAudio::input_callback;
 	callback.inputProcRefCon = this;
 	result = AudioUnitSetProperty(input_unit, kAudioOutputUnitProperty_SetInputCallback, kAudioUnitScope_Global, kInputBus, &callback, sizeof(callback));
@@ -436,7 +436,7 @@ void AudioDriverCoreAudio::capture_finish() {
 		lock();
 
 		AURenderCallbackStruct callback;
-		zeromem(&callback, sizeof(AURenderCallbackStruct));
+		memset(&callback, 0, sizeof(AURenderCallbackStruct));
 		OSStatus result = AudioUnitSetProperty(input_unit, kAudioOutputUnitProperty_SetInputCallback, kAudioUnitScope_Global, 0, &callback, sizeof(callback));
 		if (result != noErr) {
 			ERR_PRINT("AudioUnitSetProperty failed");
diff --git a/drivers/dummy/rasterizer_dummy.h b/drivers/dummy/rasterizer_dummy.h
index 9d6be1a802..64582eb784 100644
--- a/drivers/dummy/rasterizer_dummy.h
+++ b/drivers/dummy/rasterizer_dummy.h
@@ -175,7 +175,7 @@ public:
 
 	void gi_probe_set_quality(RS::GIProbeQuality) override {}
 
-	void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr) override {}
+	void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_occluder_debug_tex, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr) override {}
 	void render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region) override {}
 	void render_particle_collider_heightfield(RID p_collider, const Transform &p_transform, const PagedArray<GeometryInstance *> &p_instances) override {}
 
@@ -199,8 +199,6 @@ public:
 	void update() override {}
 	void sdfgi_set_debug_probe_select(const Vector3 &p_position, const Vector3 &p_dir) override {}
 
-	bool is_low_end() const override { return true; }
-
 	RasterizerSceneDummy() {}
 	~RasterizerSceneDummy() {}
 };
@@ -548,6 +546,10 @@ public:
 	void lightmap_set_probe_capture_update_speed(float p_speed) override {}
 	float lightmap_get_probe_capture_update_speed() const override { return 0; }
 
+	/* OCCLUDER */
+
+	void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) {}
+
 	/* PARTICLES */
 
 	RID particles_allocate() override { return RID(); }
diff --git a/drivers/png/image_loader_png.cpp b/drivers/png/image_loader_png.cpp
index 854c6706e6..ded6bbc53e 100644
--- a/drivers/png/image_loader_png.cpp
+++ b/drivers/png/image_loader_png.cpp
@@ -88,7 +88,7 @@ Vector<uint8_t> ImageLoaderPNG::lossless_pack_png(const Ref<Image> &p_image) {
 	{
 		// must be closed before call to image_to_png
 		uint8_t *writer = out_buffer.ptrw();
-		copymem(writer, "PNG ", 4);
+		memcpy(writer, "PNG ", 4);
 	}
 
 	Error err = PNGDriverCommon::image_to_png(p_image, out_buffer);
diff --git a/drivers/png/png_driver_common.cpp b/drivers/png/png_driver_common.cpp
index 9e848a2253..412e17c6b7 100644
--- a/drivers/png/png_driver_common.cpp
+++ b/drivers/png/png_driver_common.cpp
@@ -60,7 +60,7 @@ static bool check_error(const png_image &image) {
 
 Error png_to_image(const uint8_t *p_source, size_t p_size, bool p_force_linear, Ref<Image> p_image) {
 	png_image png_img;
-	zeromem(&png_img, sizeof(png_img));
+	memset(&png_img, 0, sizeof(png_img));
 	png_img.version = PNG_IMAGE_VERSION;
 
 	// fetch image properties
@@ -134,7 +134,7 @@ Error image_to_png(const Ref<Image> &p_image, Vector<uint8_t> &p_buffer) {
 	ERR_FAIL_COND_V(source_image->is_compressed(), FAILED);
 
 	png_image png_img;
-	zeromem(&png_img, sizeof(png_img));
+	memset(&png_img, 0, sizeof(png_img));
 	png_img.version = PNG_IMAGE_VERSION;
 	png_img.width = source_image->get_width();
 	png_img.height = source_image->get_height();
diff --git a/drivers/unix/dir_access_unix.cpp b/drivers/unix/dir_access_unix.cpp
index 34ef6f3ce6..22151b60c1 100644
--- a/drivers/unix/dir_access_unix.cpp
+++ b/drivers/unix/dir_access_unix.cpp
@@ -226,8 +226,9 @@ static void _get_drives(List<String> *list) {
 		while (getmntent_r(mtab, &mnt, strings, sizeof(strings))) {
 			if (mnt.mnt_dir != nullptr && _filter_drive(&mnt)) {
 				// Avoid duplicates
-				if (!list->find(mnt.mnt_dir)) {
-					list->push_back(mnt.mnt_dir);
+				String name = String::utf8(mnt.mnt_dir);
+				if (!list->find(name)) {
+					list->push_back(name);
 				}
 			}
 		}
@@ -240,8 +241,9 @@ static void _get_drives(List<String> *list) {
 	const char *home = getenv("HOME");
 	if (home) {
 		// Only add if it's not a duplicate
-		if (!list->find(home)) {
-			list->push_back(home);
+		String home_name = String::utf8(home);
+		if (!list->find(home_name)) {
+			list->push_back(home_name);
 		}
 
 		// Check $HOME/.config/gtk-3.0/bookmarks
@@ -254,7 +256,7 @@ static void _get_drives(List<String> *list) {
 				// Parse only file:// links
 				if (strncmp(string, "file://", 7) == 0) {
 					// Strip any unwanted edges on the strings and push_back if it's not a duplicate
-					String fpath = String(string + 7).strip_edges().split_spaces()[0].uri_decode();
+					String fpath = String::utf8(string + 7).strip_edges().split_spaces()[0].uri_decode();
 					if (!list->find(fpath)) {
 						list->push_back(fpath);
 					}
diff --git a/drivers/unix/net_socket_posix.cpp b/drivers/unix/net_socket_posix.cpp
index 19753943c8..e2ad352c10 100644
--- a/drivers/unix/net_socket_posix.cpp
+++ b/drivers/unix/net_socket_posix.cpp
@@ -106,7 +106,7 @@ size_t NetSocketPosix::_set_addr_storage(struct sockaddr_storage *p_addr, const
 		addr6->sin6_family = AF_INET6;
 		addr6->sin6_port = htons(p_port);
 		if (p_ip.is_valid()) {
-			copymem(&addr6->sin6_addr.s6_addr, p_ip.get_ipv6(), 16);
+			memcpy(&addr6->sin6_addr.s6_addr, p_ip.get_ipv6(), 16);
 		} else {
 			addr6->sin6_addr = in6addr_any;
 		}
@@ -121,7 +121,7 @@ size_t NetSocketPosix::_set_addr_storage(struct sockaddr_storage *p_addr, const
 		addr4->sin_port = htons(p_port); // short, network byte order
 
 		if (p_ip.is_valid()) {
-			copymem(&addr4->sin_addr.s_addr, p_ip.get_ipv4(), 4);
+			memcpy(&addr4->sin_addr.s_addr, p_ip.get_ipv4(), 4);
 		} else {
 			addr4->sin_addr.s_addr = INADDR_ANY;
 		}
@@ -130,18 +130,23 @@ size_t NetSocketPosix::_set_addr_storage(struct sockaddr_storage *p_addr, const
 	}
 }
 
-void NetSocketPosix::_set_ip_port(struct sockaddr_storage *p_addr, IP_Address &r_ip, uint16_t &r_port) {
+void NetSocketPosix::_set_ip_port(struct sockaddr_storage *p_addr, IP_Address *r_ip, uint16_t *r_port) {
 	if (p_addr->ss_family == AF_INET) {
 		struct sockaddr_in *addr4 = (struct sockaddr_in *)p_addr;
-		r_ip.set_ipv4((uint8_t *)&(addr4->sin_addr.s_addr));
-
-		r_port = ntohs(addr4->sin_port);
-
+		if (r_ip) {
+			r_ip->set_ipv4((uint8_t *)&(addr4->sin_addr.s_addr));
+		}
+		if (r_port) {
+			*r_port = ntohs(addr4->sin_port);
+		}
 	} else if (p_addr->ss_family == AF_INET6) {
 		struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)p_addr;
-		r_ip.set_ipv6(addr6->sin6_addr.s6_addr);
-
-		r_port = ntohs(addr6->sin6_port);
+		if (r_ip) {
+			r_ip->set_ipv6(addr6->sin6_addr.s6_addr);
+		}
+		if (r_port) {
+			*r_port = ntohs(addr6->sin6_port);
+		}
 	};
 }
 
@@ -186,13 +191,21 @@ NetSocketPosix::~NetSocketPosix() {
 NetSocketPosix::NetError NetSocketPosix::_get_socket_error() const {
 #if defined(WINDOWS_ENABLED)
 	int err = WSAGetLastError();
-
-	if (err == WSAEISCONN)
+	if (err == WSAEISCONN) {
 		return ERR_NET_IS_CONNECTED;
-	if (err == WSAEINPROGRESS || err == WSAEALREADY)
+	}
+	if (err == WSAEINPROGRESS || err == WSAEALREADY) {
 		return ERR_NET_IN_PROGRESS;
-	if (err == WSAEWOULDBLOCK)
+	}
+	if (err == WSAEWOULDBLOCK) {
 		return ERR_NET_WOULD_BLOCK;
+	}
+	if (err == WSAEADDRINUSE || err == WSAEADDRNOTAVAIL) {
+		return ERR_NET_ADDRESS_INVALID_OR_UNAVAILABLE;
+	}
+	if (err == WSAEACCES) {
+		return ERR_NET_UNAUTHORIZED;
+	}
 	print_verbose("Socket error: " + itos(err));
 	return ERR_NET_OTHER;
 #else
@@ -205,6 +218,12 @@ NetSocketPosix::NetError NetSocketPosix::_get_socket_error() const {
 	if (errno == EAGAIN || errno == EWOULDBLOCK) {
 		return ERR_NET_WOULD_BLOCK;
 	}
+	if (errno == EADDRINUSE || errno == EINVAL || errno == EADDRNOTAVAIL) {
+		return ERR_NET_ADDRESS_INVALID_OR_UNAVAILABLE;
+	}
+	if (errno == EACCES) {
+		return ERR_NET_UNAUTHORIZED;
+	}
 	print_verbose("Socket error: " + itos(errno));
 	return ERR_NET_OTHER;
 #endif
@@ -264,13 +283,13 @@ _FORCE_INLINE_ Error NetSocketPosix::_change_multicast_group(IP_Address p_ip, St
 		ERR_FAIL_COND_V(!if_ip.is_valid(), ERR_INVALID_PARAMETER);
 		struct ip_mreq greq;
 		int sock_opt = p_add ? IP_ADD_MEMBERSHIP : IP_DROP_MEMBERSHIP;
-		copymem(&greq.imr_multiaddr, p_ip.get_ipv4(), 4);
-		copymem(&greq.imr_interface, if_ip.get_ipv4(), 4);
+		memcpy(&greq.imr_multiaddr, p_ip.get_ipv4(), 4);
+		memcpy(&greq.imr_interface, if_ip.get_ipv4(), 4);
 		ret = setsockopt(_sock, level, sock_opt, (const char *)&greq, sizeof(greq));
 	} else {
 		struct ipv6_mreq greq;
 		int sock_opt = p_add ? IPV6_ADD_MEMBERSHIP : IPV6_DROP_MEMBERSHIP;
-		copymem(&greq.ipv6mr_multiaddr, p_ip.get_ipv6(), 16);
+		memcpy(&greq.ipv6mr_multiaddr, p_ip.get_ipv6(), 16);
 		greq.ipv6mr_interface = if_v6id;
 		ret = setsockopt(_sock, level, sock_opt, (const char *)&greq, sizeof(greq));
 	}
@@ -384,8 +403,8 @@ Error NetSocketPosix::bind(IP_Address p_addr, uint16_t p_port) {
 	size_t addr_size = _set_addr_storage(&addr, p_addr, p_port, _ip_type);
 
 	if (::bind(_sock, (struct sockaddr *)&addr, addr_size) != 0) {
-		_get_socket_error();
-		print_verbose("Failed to bind socket.");
+		NetError err = _get_socket_error();
+		print_verbose("Failed to bind socket. Error: " + itos(err));
 		close();
 		return ERR_UNAVAILABLE;
 	}
@@ -446,7 +465,7 @@ Error NetSocketPosix::poll(PollType p_type, int p_timeout) const {
 	FD_ZERO(&wr);
 	FD_ZERO(&ex);
 	FD_SET(_sock, &ex);
-	struct timeval timeout = { p_timeout, 0 };
+	struct timeval timeout = { p_timeout / 1000, (p_timeout % 1000) * 1000 };
 	// For blocking operation, pass nullptr  timeout pointer to select.
 	struct timeval *tp = nullptr;
 	if (p_timeout >= 0) {
@@ -716,6 +735,20 @@ int NetSocketPosix::get_available_bytes() const {
 	return len;
 }
 
+Error NetSocketPosix::get_socket_address(IP_Address *r_ip, uint16_t *r_port) const {
+	ERR_FAIL_COND_V(!is_open(), FAILED);
+
+	struct sockaddr_storage saddr;
+	socklen_t len = sizeof(saddr);
+	if (getsockname(_sock, (struct sockaddr *)&saddr, &len) != 0) {
+		_get_socket_error();
+		print_verbose("Error when reading local socket address.");
+		return FAILED;
+	}
+	_set_ip_port(&saddr, r_ip, r_port);
+	return OK;
+}
+
 Ref<NetSocket> NetSocketPosix::accept(IP_Address &r_ip, uint16_t &r_port) {
 	Ref<NetSocket> out;
 	ERR_FAIL_COND_V(!is_open(), out);
@@ -729,7 +762,7 @@ Ref<NetSocket> NetSocketPosix::accept(IP_Address &r_ip, uint16_t &r_port) {
 		return out;
 	}
 
-	_set_ip_port(&their_addr, r_ip, r_port);
+	_set_ip_port(&their_addr, &r_ip, &r_port);
 
 	NetSocketPosix *ns = memnew(NetSocketPosix);
 	ns->_set_socket(fd, _ip_type, _is_stream);
diff --git a/drivers/unix/net_socket_posix.h b/drivers/unix/net_socket_posix.h
index cc6af661c8..dbfe3a524e 100644
--- a/drivers/unix/net_socket_posix.h
+++ b/drivers/unix/net_socket_posix.h
@@ -54,7 +54,9 @@ private:
 		ERR_NET_WOULD_BLOCK,
 		ERR_NET_IS_CONNECTED,
 		ERR_NET_IN_PROGRESS,
-		ERR_NET_OTHER
+		ERR_NET_ADDRESS_INVALID_OR_UNAVAILABLE,
+		ERR_NET_UNAUTHORIZED,
+		ERR_NET_OTHER,
 	};
 
 	NetError _get_socket_error() const;
@@ -70,7 +72,7 @@ protected:
 public:
 	static void make_default();
 	static void cleanup();
-	static void _set_ip_port(struct sockaddr_storage *p_addr, IP_Address &r_ip, uint16_t &r_port);
+	static void _set_ip_port(struct sockaddr_storage *p_addr, IP_Address *r_ip, uint16_t *r_port);
 	static size_t _set_addr_storage(struct sockaddr_storage *p_addr, const IP_Address &p_ip, uint16_t p_port, IP::Type p_ip_type);
 
 	virtual Error open(Type p_sock_type, IP::Type &ip_type);
@@ -87,6 +89,7 @@ public:
 
 	virtual bool is_open() const;
 	virtual int get_available_bytes() const;
+	virtual Error get_socket_address(IP_Address *r_ip, uint16_t *r_port) const;
 
 	virtual Error set_broadcasting_enabled(bool p_enabled);
 	virtual void set_blocking_enabled(bool p_enabled);
diff --git a/drivers/vulkan/rendering_device_vulkan.cpp b/drivers/vulkan/rendering_device_vulkan.cpp
index 09e2b4546a..30cc01fd10 100644
--- a/drivers/vulkan/rendering_device_vulkan.cpp
+++ b/drivers/vulkan/rendering_device_vulkan.cpp
@@ -1600,7 +1600,7 @@ Error RenderingDeviceVulkan::_buffer_update(Buffer *p_buffer, size_t p_offset, c
 		}
 
 		//copy to staging buffer
-		copymem(((uint8_t *)data_ptr) + block_write_offset, p_data + submit_from, block_write_amount);
+		memcpy(((uint8_t *)data_ptr) + block_write_offset, p_data + submit_from, block_write_amount);
 
 		//unmap
 		vmaUnmapMemory(allocator, staging_buffer_blocks[staging_buffer_current].allocation);
@@ -2558,7 +2558,7 @@ Vector<uint8_t> RenderingDeviceVulkan::_texture_get_data_from_image(Texture *tex
 						const uint8_t *rptr = slice_read_ptr + y * layout.rowPitch;
 						uint8_t *wptr = write_ptr + y * line_width;
 
-						copymem(wptr, rptr, line_width);
+						memcpy(wptr, rptr, line_width);
 					}
 
 				} else {
@@ -2566,7 +2566,7 @@ Vector<uint8_t> RenderingDeviceVulkan::_texture_get_data_from_image(Texture *tex
 					for (uint32_t y = 0; y < height; y++) {
 						const uint8_t *rptr = slice_read_ptr + y * layout.rowPitch;
 						uint8_t *wptr = write_ptr + y * pixel_size * width;
-						copymem(wptr, rptr, (uint64_t)pixel_size * width);
+						memcpy(wptr, rptr, (uint64_t)pixel_size * width);
 					}
 				}
 			}
@@ -2699,7 +2699,7 @@ Vector<uint8_t> RenderingDeviceVulkan::texture_get_data(RID p_texture, uint32_t
 		{
 			buffer_data.resize(buffer_size);
 			uint8_t *w = buffer_data.ptrw();
-			copymem(w, buffer_mem, buffer_size);
+			memcpy(w, buffer_mem, buffer_size);
 		}
 
 		vmaUnmapMemory(allocator, tmp_buffer.allocation);
@@ -5359,7 +5359,7 @@ Vector<uint8_t> RenderingDeviceVulkan::buffer_get_data(RID p_buffer) {
 	{
 		buffer_data.resize(buffer->size);
 		uint8_t *w = buffer_data.ptrw();
-		copymem(w, buffer_mem, buffer->size);
+		memcpy(w, buffer_mem, buffer->size);
 	}
 
 	vmaUnmapMemory(allocator, tmp_buffer.allocation);
@@ -7986,7 +7986,11 @@ void RenderingDeviceVulkan::_free_rids(T &p_owner, const char *p_type) {
 	List<RID> owned;
 	p_owner.get_owned_list(&owned);
 	if (owned.size()) {
-		WARN_PRINT(itos(owned.size()) + " RIDs of type '" + p_type + "' were leaked.");
+		if (owned.size() == 1) {
+			WARN_PRINT(vformat("1 RID of type \"%s\" was leaked.", p_type));
+		} else {
+			WARN_PRINT(vformat("%d RIDs of type \"%s\" were leaked.", owned.size(), p_type));
+		}
 		for (List<RID>::Element *E = owned.front(); E; E = E->next()) {
 			free(E->get());
 		}
@@ -8199,7 +8203,11 @@ void RenderingDeviceVulkan::finalize() {
 		List<RID> owned;
 		texture_owner.get_owned_list(&owned);
 		if (owned.size()) {
-			WARN_PRINT(itos(owned.size()) + " RIDs of type 'Texture' were leaked.");
+			if (owned.size() == 1) {
+				WARN_PRINT("1 RID of type \"Texture\" was leaked.");
+			} else {
+				WARN_PRINT(vformat("%d RIDs of type \"Texture\" were leaked.", owned.size()));
+			}
 			//free shared first
 			for (List<RID>::Element *E = owned.front(); E;) {
 				List<RID>::Element *N = E->next();
diff --git a/drivers/vulkan/vulkan_context.cpp b/drivers/vulkan/vulkan_context.cpp
index 504e63392f..0a8a5c746f 100644
--- a/drivers/vulkan/vulkan_context.cpp
+++ b/drivers/vulkan/vulkan_context.cpp
@@ -41,6 +41,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <vector>
 
 #define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
 #define APP_SHORT_NAME "GodotEngine"
@@ -193,7 +194,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL VulkanContext::_debug_report_callback(
 	return VK_FALSE;
 }
 
-VkBool32 VulkanContext::_check_layers(uint32_t check_count, const char **check_names, uint32_t layer_count, VkLayerProperties *layers) {
+VkBool32 VulkanContext::_check_layers(uint32_t check_count, const char *const *check_names, uint32_t layer_count, VkLayerProperties *layers) {
 	for (uint32_t i = 0; i < check_count; i++) {
 		VkBool32 found = 0;
 		for (uint32_t j = 0; j < layer_count; j++) {
@@ -210,57 +211,55 @@ VkBool32 VulkanContext::_check_layers(uint32_t check_count, const char **check_n
 	return 1;
 }
 
-Error VulkanContext::_create_validation_layers() {
-	VkResult err;
-	const char *instance_validation_layers_alt1[] = { "VK_LAYER_KHRONOS_validation" };
-	const char *instance_validation_layers_alt2[] = { "VK_LAYER_LUNARG_standard_validation" };
-	const char *instance_validation_layers_alt3[] = { "VK_LAYER_GOOGLE_threading", "VK_LAYER_LUNARG_parameter_validation", "VK_LAYER_LUNARG_object_tracker", "VK_LAYER_LUNARG_core_validation", "VK_LAYER_GOOGLE_unique_objects" };
+// Reports the first validation layer set below that vkEnumerateInstanceLayerProperties lists in full; OK with *count == 0 if none is.
+Error VulkanContext::_get_preferred_validation_layers(uint32_t *count, const char *const **names) {
+	static const std::vector<std::vector<const char *>> instance_validation_layers_alt{
+		// Preferred set of validation layers
+		{ "VK_LAYER_KHRONOS_validation" },
 
-	uint32_t instance_layer_count = 0;
-	err = vkEnumerateInstanceLayerProperties(&instance_layer_count, nullptr);
-	ERR_FAIL_COND_V(err, ERR_CANT_CREATE);
+		// Alternative (deprecated, removed in SDK 1.1.126.0) set of validation layers
+		{ "VK_LAYER_LUNARG_standard_validation" },
 
-	VkBool32 validation_found = 0;
-	uint32_t validation_layer_count = 0;
-	const char **instance_validation_layers = nullptr;
-	if (instance_layer_count > 0) {
-		VkLayerProperties *instance_layers = (VkLayerProperties *)malloc(sizeof(VkLayerProperties) * instance_layer_count);
-		err = vkEnumerateInstanceLayerProperties(&instance_layer_count, instance_layers);
-		if (err) {
-			free(instance_layers);
-			ERR_FAIL_V(ERR_CANT_CREATE);
-		}
+		// Alternative (deprecated, removed in SDK 1.1.121.1) set of validation layers
+		{ "VK_LAYER_GOOGLE_threading", "VK_LAYER_LUNARG_parameter_validation", "VK_LAYER_LUNARG_object_tracker", "VK_LAYER_LUNARG_core_validation", "VK_LAYER_GOOGLE_unique_objects" }
+	};
 
-		validation_layer_count = ARRAY_SIZE(instance_validation_layers_alt1);
-		instance_validation_layers = instance_validation_layers_alt1;
-		validation_found = _check_layers(validation_layer_count, instance_validation_layers, instance_layer_count, instance_layers);
+	// Clear out-arguments
+	*count = 0;
+	if (names != nullptr) {
+		*names = nullptr;
+	}
 
-		// use alternative (deprecated, removed in SDK 1.1.126.0) set of validation layers
-		if (!validation_found) {
-			validation_layer_count = ARRAY_SIZE(instance_validation_layers_alt2);
-			instance_validation_layers = instance_validation_layers_alt2;
-			validation_found = _check_layers(validation_layer_count, instance_validation_layers, instance_layer_count, instance_layers);
-		}
+	VkResult err;
+	uint32_t instance_layer_count = 0; // Never read an indeterminate count, whatever the driver does.
 
-		// use alternative (deprecated, removed in SDK 1.1.121.1) set of validation layers
-		if (!validation_found) {
-			validation_layer_count = ARRAY_SIZE(instance_validation_layers_alt3);
-			instance_validation_layers = instance_validation_layers_alt3;
-			validation_found = _check_layers(validation_layer_count, instance_validation_layers, instance_layer_count, instance_layers);
-		}
+	err = vkEnumerateInstanceLayerProperties(&instance_layer_count, nullptr);
+	ERR_FAIL_COND_V(err, ERR_CANT_CREATE);
+
+	if (instance_layer_count < 1) {
+		return OK; // No layers reported: the out-arguments stay cleared.
+	}
 
+	VkLayerProperties *instance_layers = (VkLayerProperties *)malloc(sizeof(VkLayerProperties) * instance_layer_count);
+	ERR_FAIL_NULL_V(instance_layers, ERR_OUT_OF_MEMORY);
+	err = vkEnumerateInstanceLayerProperties(&instance_layer_count, instance_layers);
+	if (err) {
 		free(instance_layers);
+		ERR_FAIL_V(ERR_CANT_CREATE);
 	}
 
-	if (validation_found) {
-		enabled_layer_count = validation_layer_count;
-		for (uint32_t i = 0; i < validation_layer_count; i++) {
-			enabled_layers[i] = instance_validation_layers[i];
+	for (uint32_t i = 0; i < instance_validation_layers_alt.size(); i++) {
+		if (_check_layers(instance_validation_layers_alt[i].size(), instance_validation_layers_alt[i].data(), instance_layer_count, instance_layers)) {
+			*count = instance_validation_layers_alt[i].size();
+			if (names != nullptr) {
+				*names = instance_validation_layers_alt[i].data();
+			}
+			break;
 		}
-	} else {
-		return ERR_CANT_CREATE;
 	}
 
+	free(instance_layers);
+
 	return OK;
 }
 
@@ -301,7 +300,6 @@ Error VulkanContext::_initialize_extensions() {
 	uint32_t instance_extension_count = 0;
 
 	enabled_extension_count = 0;
-	enabled_layer_count = 0;
 	enabled_debug_utils = false;
 	enabled_debug_report = false;
 	/* Look for instance extensions */
@@ -330,7 +328,7 @@ Error VulkanContext::_initialize_extensions() {
 				extension_names[enabled_extension_count++] = _get_platform_surface_extension();
 			}
 			if (!strcmp(VK_EXT_DEBUG_REPORT_EXTENSION_NAME, instance_extensions[i].extensionName)) {
-				if (use_validation_layers) {
+				if (_use_validation_layers()) {
 					extension_names[enabled_extension_count++] = VK_EXT_DEBUG_REPORT_EXTENSION_NAME;
 					enabled_debug_report = true;
 				}
@@ -542,11 +540,6 @@ Error VulkanContext::_create_physical_device() {
 	/* obtain version */
 	_obtain_vulkan_version();
 
-	/* Look for validation layers */
-	if (use_validation_layers) {
-		_create_validation_layers();
-	}
-
 	/* initialise extensions */
 	{
 		Error err = _initialize_extensions();
@@ -567,16 +560,14 @@ Error VulkanContext::_create_physical_device() {
 		/*engineVersion*/ 0,
 		/*apiVersion*/ VK_MAKE_VERSION(vulkan_major, vulkan_minor, 0)
 	};
-	VkInstanceCreateInfo inst_info = {
-		/*sType*/ VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
-		/*pNext*/ nullptr,
-		/*flags*/ 0,
-		/*pApplicationInfo*/ &app,
-		/*enabledLayerCount*/ enabled_layer_count,
-		/*ppEnabledLayerNames*/ (const char *const *)enabled_layers,
-		/*enabledExtensionCount*/ enabled_extension_count,
-		/*ppEnabledExtensionNames*/ (const char *const *)extension_names,
-	};
+	VkInstanceCreateInfo inst_info{};
+	inst_info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
+	inst_info.pApplicationInfo = &app;
+	inst_info.enabledExtensionCount = enabled_extension_count;
+	inst_info.ppEnabledExtensionNames = (const char *const *)extension_names;
+	if (_use_validation_layers()) {
+		_get_preferred_validation_layers(&inst_info.enabledLayerCount, &inst_info.ppEnabledLayerNames);
+	}
 
 	/*
 	   * This is info for a temp callback to use during CreateInstance.
@@ -825,7 +816,7 @@ Error VulkanContext::_create_physical_device() {
 		}
 	}
 
-	/* Call with NULL data to get count */
+	/* Call with nullptr data to get count */
 	vkGetPhysicalDeviceQueueFamilyProperties(gpu, &queue_family_count, nullptr);
 	ERR_FAIL_COND_V(queue_family_count == 0, ERR_CANT_CREATE);
 
@@ -1077,6 +1068,10 @@ Error VulkanContext::_create_semaphores() {
 	return OK;
 }
 
+bool VulkanContext::_use_validation_layers() {
+	return Engine::get_singleton()->is_validation_layers_enabled(); // Asked from Engine on every call; no cached copy is kept here.
+}
+
 Error VulkanContext::_window_create(DisplayServer::WindowID p_window_id, VkSurfaceKHR p_surface, int p_width, int p_height) {
 	ERR_FAIL_COND_V(windows.has(p_window_id), ERR_INVALID_PARAMETER);
 
@@ -2008,8 +2003,6 @@ String VulkanContext::get_device_pipeline_cache_uuid() const {
 }
 
 VulkanContext::VulkanContext() {
-	use_validation_layers = Engine::get_singleton()->is_validation_layers_enabled();
-
 	command_buffer_queue.resize(1); // First one is always the setup command.
 	command_buffer_queue.write[0] = nullptr;
 }
diff --git a/drivers/vulkan/vulkan_context.h b/drivers/vulkan/vulkan_context.h
index b788181ab9..88e4f26bb1 100644
--- a/drivers/vulkan/vulkan_context.h
+++ b/drivers/vulkan/vulkan_context.h
@@ -151,9 +151,6 @@ private:
 	 */
 	bool enabled_debug_report = false;
 
-	uint32_t enabled_layer_count = 0;
-	const char *enabled_layers[MAX_LAYERS];
-
 	PFN_vkCreateDebugUtilsMessengerEXT CreateDebugUtilsMessengerEXT;
 	PFN_vkDestroyDebugUtilsMessengerEXT DestroyDebugUtilsMessengerEXT;
 	PFN_vkSubmitDebugUtilsMessageEXT SubmitDebugUtilsMessageEXT;
@@ -180,11 +177,10 @@ private:
 	VkDebugReportCallbackEXT dbg_debug_report = VK_NULL_HANDLE;
 
 	Error _obtain_vulkan_version();
-	Error _create_validation_layers();
 	Error _initialize_extensions();
 	Error _check_capabilities();
 
-	VkBool32 _check_layers(uint32_t check_count, const char **check_names, uint32_t layer_count, VkLayerProperties *layers);
+	VkBool32 _check_layers(uint32_t check_count, const char *const *check_names, uint32_t layer_count, VkLayerProperties *layers);
 	static VKAPI_ATTR VkBool32 VKAPI_CALL _debug_messenger_callback(
 			VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
 			VkDebugUtilsMessageTypeFlagsEXT messageType,
@@ -217,11 +213,12 @@ private:
 protected:
 	virtual const char *_get_platform_surface_extension() const = 0;
 
-	// Enabled via command line argument.
-	bool use_validation_layers = false;
-
 	virtual Error _window_create(DisplayServer::WindowID p_window_id, VkSurfaceKHR p_surface, int p_width, int p_height);
 
+	virtual bool _use_validation_layers();
+
+	Error _get_preferred_validation_layers(uint32_t *count, const char *const **names);
+
 	VkInstance _get_instance() {
 		return inst;
 	}
diff --git a/editor/animation_track_editor.cpp b/editor/animation_track_editor.cpp
index 4fe2d2bb2a..9db2f0a287 100644
--- a/editor/animation_track_editor.cpp
+++ b/editor/animation_track_editor.cpp
@@ -2734,7 +2734,7 @@ void AnimationTrackEdit::_gui_input(const Ref<InputEvent> &p_event) {
 		path_popup->set_size(path_rect.size);
 		path_popup->popup();
 		path->grab_focus();
-		path->set_cursor_position(path->get_text().length());
+		path->set_caret_column(path->get_text().length());
 		clicking_on_name = false;
 	}
 
diff --git a/editor/code_editor.cpp b/editor/code_editor.cpp
index ac8bef817b..1c62c3d3e1 100644
--- a/editor/code_editor.cpp
+++ b/editor/code_editor.cpp
@@ -142,7 +142,7 @@ bool FindReplaceBar::_search(uint32_t p_flags, int p_from_line, int p_from_col)
 	bool found = text_editor->search(text, p_flags, p_from_line, p_from_col, line, col);
 
 	if (found) {
-		if (!preserve_cursor) {
+		if (!preserve_cursor && !is_selection_only()) {
 			text_editor->unfold_line(line);
 			text_editor->cursor_set_line(line, false);
 			text_editor->cursor_set_column(col + text.length(), false);
@@ -488,10 +488,10 @@ void FindReplaceBar::_show_search(bool p_focus_replace, bool p_show_only) {
 	if (!get_search_text().is_empty()) {
 		if (p_focus_replace) {
 			replace_text->select_all();
-			replace_text->set_cursor_position(replace_text->get_text().length());
+			replace_text->set_caret_column(replace_text->get_text().length());
 		} else {
 			search_text->select_all();
-			search_text->set_cursor_position(search_text->get_text().length());
+			search_text->set_caret_column(search_text->get_text().length());
 		}
 
 		results_count = -1;
diff --git a/editor/debugger/editor_debugger_node.cpp b/editor/debugger/editor_debugger_node.cpp
index 3ef9548727..ded0ee3aa7 100644
--- a/editor/debugger/editor_debugger_node.cpp
+++ b/editor/debugger/editor_debugger_node.cpp
@@ -209,7 +209,7 @@ void EditorDebuggerNode::stop() {
 	// Also close all debugging sessions.
 	_for_all(tabs, [&](ScriptEditorDebugger *dbg) {
 		if (dbg->is_session_active()) {
-			dbg->stop();
+			dbg->_stop_and_notify();
 		}
 	});
 	_break_state_changed();
diff --git a/editor/debugger/editor_profiler.cpp b/editor/debugger/editor_profiler.cpp
index c4290b7cca..6befee090b 100644
--- a/editor/debugger/editor_profiler.cpp
+++ b/editor/debugger/editor_profiler.cpp
@@ -43,28 +43,34 @@ void EditorProfiler::_make_metric_ptrs(Metric &m) {
 	}
 }
 
+EditorProfiler::Metric EditorProfiler::_get_frame_metric(int index) { // `index` 0 is the oldest stored frame, `total_metrics - 1` the newest.
+	return frame_metrics[(frame_metrics.size() + last_metric - (total_metrics - 1) + index) % frame_metrics.size()]; // Wraps around the ring buffer; returned by value.
+}
+
 void EditorProfiler::add_frame_metric(const Metric &p_metric, bool p_final) {
 	++last_metric;
 	if (last_metric >= frame_metrics.size()) {
 		last_metric = 0;
 	}
 
+	total_metrics++;
+	if (total_metrics > frame_metrics.size()) {
+		total_metrics = frame_metrics.size();
+	}
+
 	frame_metrics.write[last_metric] = p_metric;
 	_make_metric_ptrs(frame_metrics.write[last_metric]);
 
 	updating_frame = true;
-	cursor_metric_edit->set_max(frame_metrics[last_metric].frame_number);
-	cursor_metric_edit->set_min(MAX(frame_metrics[last_metric].frame_number - frame_metrics.size(), 0));
+	clear_button->set_disabled(false);
+	cursor_metric_edit->set_editable(true);
+	cursor_metric_edit->set_max(p_metric.frame_number);
+	cursor_metric_edit->set_min(_get_frame_metric(0).frame_number);
 
 	if (!seeking) {
-		cursor_metric_edit->set_value(frame_metrics[last_metric].frame_number);
-		if (hover_metric != -1) {
-			hover_metric++;
-			if (hover_metric >= frame_metrics.size()) {
-				hover_metric = 0;
-			}
-		}
+		cursor_metric_edit->set_value(p_metric.frame_number);
 	}
+
 	updating_frame = false;
 
 	if (frame_delay->is_stopped()) {
@@ -83,6 +89,7 @@ void EditorProfiler::clear() {
 	metric_size = CLAMP(metric_size, 60, 1024);
 	frame_metrics.clear();
 	frame_metrics.resize(metric_size);
+	total_metrics = 0;
 	last_metric = -1;
 	variables->clear();
 	plot_sigs.clear();
@@ -93,6 +100,7 @@ void EditorProfiler::clear() {
 	cursor_metric_edit->set_min(0);
 	cursor_metric_edit->set_max(100); // Doesn't make much sense, but we can't have min == max. Doesn't hurt.
 	cursor_metric_edit->set_value(0);
+	cursor_metric_edit->set_editable(false);
 	updating_frame = false;
 	hover_metric = -1;
 	seeking = false;
@@ -187,11 +195,8 @@ void EditorProfiler::_update_plot() {
 	const bool use_self = display_time->get_selected() == DISPLAY_SELF_TIME;
 	float highest = 0;
 
-	for (int i = 0; i < frame_metrics.size(); i++) {
-		const Metric &m = frame_metrics[i];
-		if (!m.valid) {
-			continue;
-		}
+	for (int i = 0; i < total_metrics; i++) {
+		const Metric &m = _get_frame_metric(i);
 
 		for (Set<StringName>::Element *E = plot_sigs.front(); E; E = E->next()) {
 			const Map<StringName, Metric::Category *>::Element *F = m.category_ptrs.find(E->get());
@@ -220,78 +225,43 @@ void EditorProfiler::_update_plot() {
 
 		int *column = columnv.ptrw();
 
-		Map<StringName, int> plot_prev;
-		//Map<StringName,int> plot_max;
+		Map<StringName, int> prev_plots;
 
-		for (int i = 0; i < w; i++) {
+		for (int i = 0; i < total_metrics * w / frame_metrics.size() - 1; i++) {
 			for (int j = 0; j < h * 4; j++) {
 				column[j] = 0;
 			}
 
 			int current = i * frame_metrics.size() / w;
-			int next = (i + 1) * frame_metrics.size() / w;
-			if (next > frame_metrics.size()) {
-				next = frame_metrics.size();
-			}
-			if (next == current) {
-				next = current + 1; //just because for loop must work
-			}
 
 			for (Set<StringName>::Element *E = plot_sigs.front(); E; E = E->next()) {
-				int plot_pos = -1;
+				const Metric &m = _get_frame_metric(current);
 
-				for (int j = current; j < next; j++) {
-					//wrap
-					int idx = last_metric + 1 + j;
-					while (idx >= frame_metrics.size()) {
-						idx -= frame_metrics.size();
-					}
-
-					//get
-					const Metric &m = frame_metrics[idx];
-					if (!m.valid) {
-						continue; //skip because invalid
-					}
+				float value = 0;
 
-					float value = 0;
-
-					const Map<StringName, Metric::Category *>::Element *F = m.category_ptrs.find(E->get());
-					if (F) {
-						value = F->get()->total_time;
-					}
+				const Map<StringName, Metric::Category *>::Element *F = m.category_ptrs.find(E->get());
+				if (F) {
+					value = F->get()->total_time;
+				}
 
-					const Map<StringName, Metric::Category::Item *>::Element *G = m.item_ptrs.find(E->get());
-					if (G) {
-						if (use_self) {
-							value = G->get()->self;
-						} else {
-							value = G->get()->total;
-						}
+				const Map<StringName, Metric::Category::Item *>::Element *G = m.item_ptrs.find(E->get());
+				if (G) {
+					if (use_self) {
+						value = G->get()->self;
+					} else {
+						value = G->get()->total;
 					}
-
-					plot_pos = MAX(CLAMP(int(value * h / highest), 0, h - 1), plot_pos);
 				}
 
+				int plot_pos = CLAMP(int(value * h / highest), 0, h - 1);
+
 				int prev_plot = plot_pos;
-				Map<StringName, int>::Element *H = plot_prev.find(E->get());
+				Map<StringName, int>::Element *H = prev_plots.find(E->get());
 				if (H) {
 					prev_plot = H->get();
 					H->get() = plot_pos;
 				} else {
-					plot_prev[E->get()] = plot_pos;
-				}
-
-				if (plot_pos == -1 && prev_plot == -1) {
-					//don't bother drawing
-					continue;
-				}
-
-				if (prev_plot != -1 && plot_pos == -1) {
-					plot_pos = prev_plot;
-				}
-
-				if (prev_plot == -1 && plot_pos != -1) {
-					prev_plot = plot_pos;
+					prev_plots[E->get()] = plot_pos;
 				}
 
 				plot_pos = h - plot_pos - 1;
@@ -352,15 +322,13 @@ void EditorProfiler::_update_plot() {
 }
 
 void EditorProfiler::_update_frame() {
-	int cursor_metric = _get_cursor_index();
-
-	ERR_FAIL_INDEX(cursor_metric, frame_metrics.size());
+	int cursor_metric = cursor_metric_edit->get_value() - _get_frame_metric(0).frame_number;
 
 	updating_frame = true;
 	variables->clear();
 
 	TreeItem *root = variables->create_item();
-	const Metric &m = frame_metrics[cursor_metric];
+	const Metric &m = _get_frame_metric(cursor_metric);
 
 	int dtime = display_time->get_selected();
 
@@ -410,6 +378,7 @@ void EditorProfiler::_activate_pressed() {
 	if (activate->is_pressed()) {
 		activate->set_icon(get_theme_icon("Stop", "EditorIcons"));
 		activate->set_text(TTR("Stop"));
+		_clear_pressed();
 	} else {
 		activate->set_icon(get_theme_icon("Play", "EditorIcons"));
 		activate->set_text(TTR("Start"));
@@ -418,6 +387,7 @@ void EditorProfiler::_activate_pressed() {
 }
 
 void EditorProfiler::_clear_pressed() {
+	clear_button->set_disabled(true); // Re-enabled by add_frame_metric() when the next frame metric is stored.
 	clear();
 	_update_plot();
 }
@@ -430,30 +400,16 @@ void EditorProfiler::_notification(int p_what) {
 }
 
 void EditorProfiler::_graph_tex_draw() {
-	if (last_metric < 0) {
+	if (total_metrics == 0) { // Nothing recorded yet, so there is no marker to draw.
 		return;
 	}
 	if (seeking) {
-		int max_frames = frame_metrics.size();
-		int frame = cursor_metric_edit->get_value() - (frame_metrics[last_metric].frame_number - max_frames + 1);
-		if (frame < 0) {
-			frame = 0;
-		}
-
-		int cur_x = frame * graph->get_size().x / max_frames;
-
+		int frame = cursor_metric_edit->get_value() - _get_frame_metric(0).frame_number; // Offset of the selected frame from the oldest stored one.
+		int cur_x = (2 * frame + 1) * graph->get_size().x / (2 * frame_metrics.size()) + 1; // Horizontal middle of that frame's column, plus a 1-unit offset.
 		graph->draw_line(Vector2(cur_x, 0), Vector2(cur_x, graph->get_size().y), Color(1, 1, 1, 0.8));
 	}
-
-	if (hover_metric != -1 && frame_metrics[hover_metric].valid) {
-		int max_frames = frame_metrics.size();
-		int frame = frame_metrics[hover_metric].frame_number - (frame_metrics[last_metric].frame_number - max_frames + 1);
-		if (frame < 0) {
-			frame = 0;
-		}
-
-		int cur_x = frame * graph->get_size().x / max_frames;
-
+	if (hover_metric > -1 && hover_metric < total_metrics) { // Only draw the hover line over columns that hold recorded data.
+		int cur_x = (2 * hover_metric + 1) * graph->get_size().x / (2 * frame_metrics.size()) + 1; // Same column-middle formula as the seek marker.
 		graph->draw_line(Vector2(cur_x, 0), Vector2(cur_x, graph->get_size().y), Color(1, 1, 1, 0.4));
 	}
 }
@@ -484,10 +440,10 @@ void EditorProfiler::_graph_tex_input(const Ref<InputEvent> &p_ev) {
 	if (
 			(mb.is_valid() && mb->get_button_index() == MOUSE_BUTTON_LEFT && mb->is_pressed()) ||
 			(mm.is_valid())) {
-		int x = me->get_position().x;
+		int x = me->get_position().x - 1;
 		x = x * frame_metrics.size() / graph->get_size().width;
 
-		bool show_hover = x >= 0 && x < frame_metrics.size();
+		hover_metric = x;
 
 		if (x < 0) {
 			x = 0;
@@ -497,41 +453,12 @@ void EditorProfiler::_graph_tex_input(const Ref<InputEvent> &p_ev) {
 			x = frame_metrics.size() - 1;
 		}
 
-		int metric = frame_metrics.size() - x - 1;
-		metric = last_metric - metric;
-		while (metric < 0) {
-			metric += frame_metrics.size();
-		}
-
-		if (show_hover) {
-			hover_metric = metric;
-
-		} else {
-			hover_metric = -1;
-		}
-
 		if (mb.is_valid() || mm->get_button_mask() & MOUSE_BUTTON_MASK_LEFT) {
-			//cursor_metric=x;
 			updating_frame = true;
 
-			//metric may be invalid, so look for closest metric that is valid, this makes snap feel better
-			bool valid = false;
-			for (int i = 0; i < frame_metrics.size(); i++) {
-				if (frame_metrics[metric].valid) {
-					valid = true;
-					break;
-				}
-
-				metric++;
-				if (metric >= frame_metrics.size()) {
-					metric = 0;
-				}
-			}
-
-			if (valid) {
-				cursor_metric_edit->set_value(frame_metrics[metric].frame_number);
-			}
-
+			if (x < total_metrics) { // Ignore positions past the recorded data.
+				cursor_metric_edit->set_value(_get_frame_metric(x).frame_number);
+			}
 			updating_frame = false;
 
 			if (activate->is_pressed()) {
@@ -552,24 +478,6 @@ void EditorProfiler::_graph_tex_input(const Ref<InputEvent> &p_ev) {
 	}
 }
 
-int EditorProfiler::_get_cursor_index() const {
-	if (last_metric < 0) {
-		return 0;
-	}
-	if (!frame_metrics[last_metric].valid) {
-		return 0;
-	}
-
-	int diff = (frame_metrics[last_metric].frame_number - cursor_metric_edit->get_value());
-
-	int idx = last_metric - diff;
-	while (idx < 0) {
-		idx += frame_metrics.size();
-	}
-
-	return idx;
-}
-
 void EditorProfiler::disable_seeking() {
 	seeking = false;
 	graph->update();
@@ -659,6 +567,7 @@ EditorProfiler::EditorProfiler() {
 	clear_button = memnew(Button);
 	clear_button->set_text(TTR("Clear"));
 	clear_button->connect("pressed", callable_mp(this, &EditorProfiler::_clear_pressed));
+	clear_button->set_disabled(true);
 	hb->add_child(clear_button);
 
 	hb->add_child(memnew(Label(TTR("Measure:"))));
@@ -687,6 +596,8 @@ EditorProfiler::EditorProfiler() {
 
 	cursor_metric_edit = memnew(SpinBox);
 	cursor_metric_edit->set_h_size_flags(SIZE_FILL);
+	cursor_metric_edit->set_value(0);
+	cursor_metric_edit->set_editable(false);
 	hb->add_child(cursor_metric_edit);
 	cursor_metric_edit->connect("value_changed", callable_mp(this, &EditorProfiler::_cursor_metric_changed));
 
@@ -726,6 +637,7 @@ EditorProfiler::EditorProfiler() {
 
 	int metric_size = CLAMP(int(EDITOR_DEF("debugger/profiler_frame_history_size", 600)), 60, 1024);
 	frame_metrics.resize(metric_size);
+	total_metrics = 0;
 	last_metric = -1;
 	hover_metric = -1;
 
diff --git a/editor/debugger/editor_profiler.h b/editor/debugger/editor_profiler.h
index e16bde41f6..8880824b87 100644
--- a/editor/debugger/editor_profiler.h
+++ b/editor/debugger/editor_profiler.h
@@ -106,13 +106,13 @@ private:
 	SpinBox *cursor_metric_edit;
 
 	Vector<Metric> frame_metrics;
+	int total_metrics;
 	int last_metric;
 
 	int max_functions;
 
 	bool updating_frame;
 
-	//int cursor_metric;
 	int hover_metric;
 
 	float graph_height;
@@ -139,14 +139,14 @@ private:
 	void _graph_tex_draw();
 	void _graph_tex_input(const Ref<InputEvent> &p_ev);
 
-	int _get_cursor_index() const;
-
 	Color _get_color_from_signature(const StringName &p_signature) const;
 
 	void _cursor_metric_changed(double);
 
 	void _combo_changed(int);
 
+	Metric _get_frame_metric(int index);
+
 protected:
 	void _notification(int p_what);
 	static void _bind_methods();
diff --git a/editor/debugger/script_editor_debugger.cpp b/editor/debugger/script_editor_debugger.cpp
index c92e94270e..1d95161e6c 100644
--- a/editor/debugger/script_editor_debugger.cpp
+++ b/editor/debugger/script_editor_debugger.cpp
@@ -35,6 +35,8 @@
 #include "core/debugger/remote_debugger.h"
 #include "core/io/marshalls.h"
 #include "core/string/ustring.h"
+#include "core/version.h"
+#include "core/version_hash.gen.h"
 #include "editor/debugger/editor_network_profiler.h"
 #include "editor/debugger/editor_performance_profiler.h"
 #include "editor/debugger/editor_profiler.h"
@@ -1371,7 +1373,8 @@ void ScriptEditorDebugger::_error_tree_item_rmb_selected(const Vector2 &p_pos) {
 	item_menu->set_size(Size2(1, 1));
 
 	if (error_tree->is_anything_selected()) {
-		item_menu->add_icon_item(get_theme_icon("ActionCopy", "EditorIcons"), TTR("Copy Error"), 0);
+		item_menu->add_icon_item(get_theme_icon("ActionCopy", "EditorIcons"), TTR("Copy Error"), ACTION_COPY_ERROR);
+		item_menu->add_icon_item(get_theme_icon("Instance", "EditorIcons"), TTR("Open C++ Source on GitHub"), ACTION_OPEN_SOURCE);
 	}
 
 	if (item_menu->get_item_count() > 0) {
@@ -1381,30 +1384,64 @@ void ScriptEditorDebugger::_error_tree_item_rmb_selected(const Vector2 &p_pos) {
 }
 
 void ScriptEditorDebugger::_item_menu_id_pressed(int p_option) {
-	TreeItem *ti = error_tree->get_selected();
-	while (ti->get_parent() != error_tree->get_root()) {
-		ti = ti->get_parent();
-	}
+	switch (p_option) {
+		case ACTION_COPY_ERROR: {
+			TreeItem *ti = error_tree->get_selected();
+			while (ti->get_parent() != error_tree->get_root()) {
+				ti = ti->get_parent();
+			}
 
-	String type;
+			String type;
 
-	if (ti->get_icon(0) == get_theme_icon("Warning", "EditorIcons")) {
-		type = "W ";
-	} else if (ti->get_icon(0) == get_theme_icon("Error", "EditorIcons")) {
-		type = "E ";
-	}
+			if (ti->get_icon(0) == get_theme_icon("Warning", "EditorIcons")) {
+				type = "W ";
+			} else if (ti->get_icon(0) == get_theme_icon("Error", "EditorIcons")) {
+				type = "E ";
+			}
 
-	String text = ti->get_text(0) + "   ";
-	int rpad_len = text.length();
+			String text = ti->get_text(0) + "   ";
+			int rpad_len = text.length();
 
-	text = type + text + ti->get_text(1) + "\n";
-	TreeItem *ci = ti->get_children();
-	while (ci) {
-		text += "  " + ci->get_text(0).rpad(rpad_len) + ci->get_text(1) + "\n";
-		ci = ci->get_next();
-	}
+			text = type + text + ti->get_text(1) + "\n";
+			TreeItem *ci = ti->get_children();
+			while (ci) {
+				text += "  " + ci->get_text(0).rpad(rpad_len) + ci->get_text(1) + "\n";
+				ci = ci->get_next();
+			}
 
-	DisplayServer::get_singleton()->clipboard_set(text);
+			DisplayServer::get_singleton()->clipboard_set(text);
+		} break;
+
+		case ACTION_OPEN_SOURCE: {
+			TreeItem *ti = error_tree->get_selected();
+			while (ti->get_parent() != error_tree->get_root()) {
+				ti = ti->get_parent();
+			}
+
+			// We only need the first child here (C++ source stack trace).
+			TreeItem *ci = ti->get_children();
+			// Bail out instead of dereferencing a null item if the entry has no children.
+			ERR_FAIL_NULL(ci);
+
+			// Parse back the `file:line @ method()` string.
+			const Vector<String> file_line_number = ci->get_text(1).split("@")[0].strip_edges().split(":");
+			ERR_FAIL_COND_MSG(file_line_number.size() < 2, "Incorrect C++ source stack trace file:line format (please report).");
+			const String file = file_line_number[0];
+			const int line_number = file_line_number[1].to_int();
+
+			// Pick the Git reference (commit or release tag) the link points at.
+			// Use the Git commit hash when available, for greater accuracy (including for development versions).
+			String git_ref = VERSION_HASH;
+			if (git_ref.is_empty()) {
+				// Git commit hash information unavailable; fall back to tagged releases.
+				git_ref = String(VERSION_NUMBER) + "-stable";
+			}
+
+			// Construct a GitHub repository URL and open it in the user's default web browser.
+			const String url = vformat("https://github.com/godotengine/godot/blob/%s/%s#L%d", git_ref, file, line_number);
+			OS::get_singleton()->shell_open(url);
+		} break;
+	}
 }
 
 void ScriptEditorDebugger::_tab_changed(int p_tab) {
diff --git a/editor/debugger/script_editor_debugger.h b/editor/debugger/script_editor_debugger.h
index e5fb3c35a9..a5731c9f9c 100644
--- a/editor/debugger/script_editor_debugger.h
+++ b/editor/debugger/script_editor_debugger.h
@@ -74,6 +74,11 @@ private:
 		PROFILER_SCRIPTS_SERVERS
 	};
 
+	enum Actions { // Item IDs passed to the error tree's context menu and dispatched in _item_menu_id_pressed().
+		ACTION_COPY_ERROR,
+		ACTION_OPEN_SOURCE,
+	};
+
 	AcceptDialog *msgdialog;
 
 	LineEdit *clicked_ctrl;
diff --git a/editor/editor_about.cpp b/editor/editor_about.cpp
index 2ed937b6ff..d962658484 100644
--- a/editor/editor_about.cpp
+++ b/editor/editor_about.cpp
@@ -38,16 +38,15 @@
 #include "core/version_hash.gen.h"
 
 void EditorAbout::_theme_changed() {
-	Control *base = EditorNode::get_singleton()->get_gui_base();
-	Ref<Font> font = base->get_theme_font("source", "EditorFonts");
-	int font_size = base->get_theme_font_size("source_size", "EditorFonts");
+	const Ref<Font> font = get_theme_font("source", "EditorFonts"); // Looked up on this dialog itself rather than through EditorNode.
+	const int font_size = get_theme_font_size("source_size", "EditorFonts");
 	_tpl_text->add_theme_font_override("normal_font", font);
 	_tpl_text->add_theme_font_size_override("normal_font_size", font_size);
 	_tpl_text->add_theme_constant_override("line_separation", 6 * EDSCALE);
 	_license_text->add_theme_font_override("normal_font", font);
 	_license_text->add_theme_font_size_override("normal_font_size", font_size);
 	_license_text->add_theme_constant_override("line_separation", 6 * EDSCALE);
-	_logo->set_texture(base->get_theme_icon("Logo", "EditorIcons"));
+	_logo->set_texture(get_theme_icon("Logo", "EditorIcons")); // Same: no EditorNode singleton involved.
 }
 
 void EditorAbout::_notification(int p_what) {
diff --git a/editor/editor_about.h b/editor/editor_about.h
index efb7245e78..2823220a8a 100644
--- a/editor/editor_about.h
+++ b/editor/editor_about.h
@@ -44,6 +44,10 @@
 
 #include "editor_scale.h"
 
+/**
+ * NOTE: Do not assume the EditorNode singleton to be available in this class' methods.
+ * EditorAbout is also used from the project manager where EditorNode isn't initialized.
+ */
 class EditorAbout : public AcceptDialog {
 	GDCLASS(EditorAbout, AcceptDialog);
 
diff --git a/editor/editor_export.cpp b/editor/editor_export.cpp
index a5ebfbfb8a..a368a9618e 100644
--- a/editor/editor_export.cpp
+++ b/editor/editor_export.cpp
@@ -1051,14 +1051,28 @@ Error EditorExportPlatform::export_project_files(const Ref<EditorExportPreset> &
 		}
 	}
 
-	// Store text server data if exists.
+	// Store text server data if it is supported.
 	if (TS->has_feature(TextServer::FEATURE_USE_SUPPORT_DATA)) {
-		String ts_data = "res://" + TS->get_support_data_filename();
-		if (FileAccess::exists(ts_data)) {
-			Vector<uint8_t> array = FileAccess::get_file_as_array(ts_data);
-			err = p_func(p_udata, ts_data, array, idx, total, enc_in_filters, enc_ex_filters, key);
-			if (err != OK) {
-				return err;
+		bool use_data = ProjectSettings::get_singleton()->get("internationalization/locale/include_text_server_data");
+		if (use_data) {
+			// Try using user provided data file.
+			String ts_data = "res://" + TS->get_support_data_filename();
+			if (FileAccess::exists(ts_data)) {
+				Vector<uint8_t> array = FileAccess::get_file_as_array(ts_data);
+				err = p_func(p_udata, ts_data, array, idx, total, enc_in_filters, enc_ex_filters, key);
+				if (err != OK) {
+					return err;
+				}
+			} else {
+				// Use default text server data.
+				String icu_data_file = EditorSettings::get_singleton()->get_cache_dir().plus_file("tmp_icu_data");
+				TS->save_support_data(icu_data_file);
+				Vector<uint8_t> array = FileAccess::get_file_as_array(icu_data_file);
+				err = p_func(p_udata, ts_data, array, idx, total, enc_in_filters, enc_ex_filters, key);
+				DirAccess::remove_file_or_error(icu_data_file);
+				if (err != OK) {
+					return err;
+				}
 			}
 		}
 	}
diff --git a/editor/editor_file_system.cpp b/editor/editor_file_system.cpp
index fb0dc57501..495bdd42f7 100644
--- a/editor/editor_file_system.cpp
+++ b/editor/editor_file_system.cpp
@@ -68,6 +68,11 @@ int EditorFileSystemDirectory::find_dir_index(const String &p_dir) const {
 	return -1;
 }
 
+void EditorFileSystemDirectory::force_update() {
+	// We set modified_time to 0 to force `EditorFileSystem::_scan_fs_changes` to search for changes in the directory.
+	modified_time = 0;
+}
+
 int EditorFileSystemDirectory::get_subdir_count() const {
 	return subdirs.size();
 }
@@ -854,9 +859,11 @@ void EditorFileSystem::_scan_fs_changes(EditorFileSystemDirectory *p_dir, const
 
 		//then scan files and directories and check what's different
 
-		DirAccess *da = DirAccess::create(DirAccess::ACCESS_RESOURCES);
+		DirAccessRef da = DirAccess::create(DirAccess::ACCESS_RESOURCES);
+
+		Error ret = da->change_dir(cd);
+		ERR_FAIL_COND_MSG(ret != OK, "Cannot change to '" + cd + "' folder.");
 
-		da->change_dir(cd);
 		da->list_dir_begin();
 		while (true) {
 			String f = da->get_next();
@@ -944,7 +951,6 @@ void EditorFileSystem::_scan_fs_changes(EditorFileSystemDirectory *p_dir, const
 		}
 
 		da->list_dir_end();
-		memdelete(da);
 	}
 
 	for (int i = 0; i < p_dir->files.size(); i++) {
@@ -1922,6 +1928,11 @@ void EditorFileSystem::reimport_file_with_custom_parameters(const String &p_file
 	_reimport_file(p_file, &p_custom_params, p_importer);
 }
 
+void EditorFileSystem::_reimport_thread(uint32_t p_index, ImportThreadData *p_import_data) {
+	p_import_data->max_index = MAX(p_import_data->reimport_from + int(p_index), p_import_data->max_index); // Highest file index started so far; polled by reimport_files() to advance the progress dialog. NOTE(review): unsynchronized read-modify-write - confirm this is safe if work items run concurrently.
+	_reimport_file(p_import_data->reimport_files[p_import_data->reimport_from + p_index].path); // p_index is relative to reimport_from.
+}
+
 void EditorFileSystem::reimport_files(const Vector<String> &p_files) {
 	{
 		// Ensure that ProjectSettings::IMPORTED_FILES_PATH exists.
@@ -1939,7 +1950,8 @@ void EditorFileSystem::reimport_files(const Vector<String> &p_files) {
 	importing = true;
 	EditorProgress pr("reimport", TTR("(Re)Importing Assets"), p_files.size());
 
-	Vector<ImportFile> files;
+	Vector<ImportFile> reimport_files;
+
 	Set<String> groups_to_reimport;
 
 	for (int i = 0; i < p_files.size(); i++) {
@@ -1957,8 +1969,8 @@ void EditorFileSystem::reimport_files(const Vector<String> &p_files) {
 			//it's a regular file
 			ImportFile ifile;
 			ifile.path = p_files[i];
-			ifile.order = ResourceFormatImporter::get_singleton()->get_import_order(p_files[i]);
-			files.push_back(ifile);
+			ResourceFormatImporter::get_singleton()->get_import_order_threads_and_importer(p_files[i], ifile.order, ifile.threaded, ifile.importer);
+			reimport_files.push_back(ifile);
 		}
 
 		//group may have changed, so also update group reference
@@ -1969,11 +1981,51 @@ void EditorFileSystem::reimport_files(const Vector<String> &p_files) {
 		}
 	}
 
-	files.sort();
+	reimport_files.sort();
 
-	for (int i = 0; i < files.size(); i++) {
-		pr.step(files[i].path.get_file(), i);
-		_reimport_file(files[i].path);
+	bool use_threads = GLOBAL_GET("editor/import/use_multiple_threads");
+
+	// `from` is the index of the first file of the current run of consecutive threaded files sharing one importer.
+	int from = 0;
+	for (int i = 0; i < reimport_files.size(); i++) {
+		if (use_threads && reimport_files[i].threaded) {
+			if (i + 1 == reimport_files.size() || reimport_files[i + 1].importer != reimport_files[from].importer) {
+				if (from - i == 0) {
+					//single file, do not use threads
+					pr.step(reimport_files[i].path.get_file(), i);
+					_reimport_file(reimport_files[i].path);
+				} else {
+					Ref<ResourceImporter> importer = ResourceFormatImporter::get_singleton()->get_importer_by_name(reimport_files[from].importer);
+					ERR_CONTINUE(!importer.is_valid());
+
+					importer->import_threaded_begin();
+
+					ImportThreadData data;
+					data.max_index = from;
+					data.reimport_from = from;
+					data.reimport_files = reimport_files.ptr();
+
+					import_threads.begin_work(i - from + 1, this, &EditorFileSystem::_reimport_thread, &data);
+					int current_index = from - 1;
+					do {
+						if (current_index < data.max_index) {
+							current_index = data.max_index;
+							pr.step(reimport_files[current_index].path.get_file(), current_index);
+						}
+						OS::get_singleton()->delay_usec(1);
+					} while (!import_threads.is_done_dispatching());
+
+					import_threads.end_work();
+					importer->import_threaded_end();
+				}
+				from = i + 1;
+			}
+		} else {
+			pr.step(reimport_files[i].path.get_file(), i);
+			_reimport_file(reimport_files[i].path);
+			// Advance the run start too: a following threaded run must not reach back over this already-imported file.
+			from = i + 1;
+		}
+	}
 
 	//reimport groups
@@ -2111,7 +2163,7 @@ void EditorFileSystem::_update_extensions() {
 EditorFileSystem::EditorFileSystem() {
 	ResourceLoader::import = _resource_import;
 	reimport_on_missing_imported_files = GLOBAL_DEF("editor/import/reimport_missing_imported_files", true);
-
+	GLOBAL_DEF("editor/import/use_multiple_threads", true);
 	singleton = this;
 	filesystem = memnew(EditorFileSystemDirectory); //like, empty
 	filesystem->parent = nullptr;
@@ -2138,7 +2190,9 @@ EditorFileSystem::EditorFileSystem() {
 	first_scan = true;
 	scan_changes_pending = false;
 	revalidate_import_files = false;
+	import_threads.init();
 }
 
 EditorFileSystem::~EditorFileSystem() {
+	import_threads.finish();
 }
diff --git a/editor/editor_file_system.h b/editor/editor_file_system.h
index 6f4f058503..855c320856 100644
--- a/editor/editor_file_system.h
+++ b/editor/editor_file_system.h
@@ -36,7 +36,9 @@
 #include "core/os/thread_safe.h"
 #include "core/templates/safe_refcount.h"
 #include "core/templates/set.h"
+#include "core/templates/thread_work_pool.h"
 #include "scene/main/node.h"
+
 class FileAccess;
 
 struct EditorProgressBG;
@@ -100,6 +102,8 @@ public:
 	int find_file_index(const String &p_file) const;
 	int find_dir_index(const String &p_dir) const;
 
+	void force_update();
+
 	EditorFileSystemDirectory();
 	~EditorFileSystemDirectory();
 };
@@ -214,9 +218,11 @@ class EditorFileSystem : public Node {
 
 	struct ImportFile {
 		String path;
+		String importer; // Importer name; breaks ties in operator< and delimits threaded runs in reimport_files().
+		bool threaded = false; // Checked by reimport_files() to decide whether this file may go through the thread pool.
 		int order = 0;
 		bool operator<(const ImportFile &p_if) const {
-			return order < p_if.order;
+			return order == p_if.order ? (importer < p_if.importer) : (order < p_if.order); // Sort by order first; within one order, files sharing an importer end up adjacent.
 		}
 	};
 
@@ -236,6 +242,16 @@ class EditorFileSystem : public Node {
 
 	Set<String> group_file_cache;
 
+	ThreadWorkPool import_threads;
+
+	struct ImportThreadData { // Shared state handed to _reimport_thread() for one threaded import run.
+		const ImportFile *reimport_files; // Start of the sorted file array; indexed with reimport_from + work index.
+		int reimport_from; // Index of the first file of the run within reimport_files.
+		int max_index = 0; // Highest file index started so far. NOTE(review): plain int updated from _reimport_thread() - confirm no synchronization is needed.
+	};
+
+	void _reimport_thread(uint32_t p_index, ImportThreadData *p_import_data);
+
 protected:
 	void _notification(int p_what);
 	static void _bind_methods();
diff --git a/editor/editor_help.cpp b/editor/editor_help.cpp
index a747652a2f..6039f64b7c 100644
--- a/editor/editor_help.cpp
+++ b/editor/editor_help.cpp
@@ -1801,7 +1801,7 @@ void FindBar::popup_search() {
 
 	if (!search_text->get_text().is_empty()) {
 		search_text->select_all();
-		search_text->set_cursor_position(search_text->get_text().length());
+		search_text->set_caret_column(search_text->get_text().length());
 		if (grabbed_focus) {
 			_search();
 		}
diff --git a/editor/editor_help_search.cpp b/editor/editor_help_search.cpp
index a1ff87fe2e..23226ffa9b 100644
--- a/editor/editor_help_search.cpp
+++ b/editor/editor_help_search.cpp
@@ -123,7 +123,7 @@ void EditorHelpSearch::_notification(int p_what) {
 				if (search->work()) {
 					// Search done.
 
-					// Only point to the perfect match if it's a new search, and not just reopening a old one.
+					// Only point to the match if it's a new search, and not just reopening an old one.
 					if (!old_search) {
 						results_tree->ensure_cursor_is_visible();
 					} else {
@@ -310,6 +310,7 @@ bool EditorHelpSearch::Runner::_phase_match_classes_init() {
 	iterator_doc = EditorHelp::get_doc_data()->class_list.front();
 	matches.clear();
 	matched_item = nullptr;
+	match_highest_score = 0;
 
 	return true;
 }
@@ -460,16 +461,20 @@ bool EditorHelpSearch::Runner::_match_string(const String &p_term, const String
 }
 
 void EditorHelpSearch::Runner::_match_item(TreeItem *p_item, const String &p_text) {
-	if (!matched_item) {
-		if (search_flags & SEARCH_CASE_SENSITIVE) {
-			if (p_text.casecmp_to(term) == 0) {
-				matched_item = p_item;
-			}
-		} else {
-			if (p_text.nocasecmp_to(term) == 0) {
-				matched_item = p_item;
-			}
-		}
+	float inverse_length = 1.f / float(MAX(1, p_text.length())); // Length is clamped: an empty text would otherwise give an infinite or NaN score.
+
+	// Favor types where search term is a substring close to the start of the type.
+	float w = 0.5f;
+	int pos = p_text.findn(term);
+	float score = (pos > -1) ? 1.0f - w * MIN(1, 3 * pos * inverse_length) : MAX(0.f, .9f - w);
+
+	// Favor shorter items: they resemble the search term more.
+	w = 0.1f;
+	score *= (1 - w) + w * (term.length() * inverse_length);
+
+	if (match_highest_score == 0 || score > match_highest_score) { // A highest score of 0 means nothing has been recorded yet.
+		matched_item = p_item;
+		match_highest_score = score;
 	}
 }
 
diff --git a/editor/editor_help_search.h b/editor/editor_help_search.h
index 0e236d523d..350a02315f 100644
--- a/editor/editor_help_search.h
+++ b/editor/editor_help_search.h
@@ -124,6 +124,7 @@ class EditorHelpSearch::Runner : public Reference {
 	TreeItem *root_item = nullptr;
 	Map<String, TreeItem *> class_items;
 	TreeItem *matched_item = nullptr;
+	float match_highest_score = 0;
 
 	bool _is_class_disabled_by_feature_profile(const StringName &p_class);
 
diff --git a/editor/editor_node.cpp b/editor/editor_node.cpp
index 9ca46cfcc0..6137617564 100644
--- a/editor/editor_node.cpp
+++ b/editor/editor_node.cpp
@@ -143,6 +143,7 @@
 #include "editor/plugins/multimesh_editor_plugin.h"
 #include "editor/plugins/navigation_polygon_editor_plugin.h"
 #include "editor/plugins/node_3d_editor_plugin.h"
+#include "editor/plugins/occluder_instance_3d_editor_plugin.h"
 #include "editor/plugins/ot_features_plugin.h"
 #include "editor/plugins/packed_scene_translation_parser_plugin.h"
 #include "editor/plugins/path_2d_editor_plugin.h"
@@ -5892,6 +5893,8 @@ EditorNode::EditorNode() {
 	EDITOR_DEF("interface/inspector/resources_to_open_in_new_inspector", "Script,MeshLibrary,TileSet");
 	EDITOR_DEF("interface/inspector/default_color_picker_mode", 0);
 	EditorSettings::get_singleton()->add_property_hint(PropertyInfo(Variant::INT, "interface/inspector/default_color_picker_mode", PROPERTY_HINT_ENUM, "RGB,HSV,RAW", PROPERTY_USAGE_DEFAULT));
+	EDITOR_DEF("interface/inspector/default_color_picker_shape", (int32_t)ColorPicker::SHAPE_VHS_CIRCLE);
+	EditorSettings::get_singleton()->add_property_hint(PropertyInfo(Variant::INT, "interface/inspector/default_color_picker_shape", PROPERTY_HINT_ENUM, "HSV Rectangle,HSV Rectangle Wheel,VHS Circle", PROPERTY_USAGE_DEFAULT));
 	EDITOR_DEF("run/auto_save/save_before_running", true);
 
 	theme_base = memnew(Control);
@@ -6798,6 +6801,7 @@ EditorNode::EditorNode() {
 	add_editor_plugin(memnew(TextureRegionEditorPlugin(this)));
 	add_editor_plugin(memnew(GIProbeEditorPlugin(this)));
 	add_editor_plugin(memnew(BakedLightmapEditorPlugin(this)));
+	add_editor_plugin(memnew(OccluderInstance3DEditorPlugin(this)));
 	add_editor_plugin(memnew(Path2DEditorPlugin(this)));
 	add_editor_plugin(memnew(Path3DEditorPlugin(this)));
 	add_editor_plugin(memnew(Line2DEditorPlugin(this)));
diff --git a/editor/editor_plugin.cpp b/editor/editor_plugin.cpp
index 064271fce8..eabcbacd9a 100644
--- a/editor/editor_plugin.cpp
+++ b/editor/editor_plugin.cpp
@@ -160,6 +160,10 @@ void EditorInterface::edit_resource(const Ref<Resource> &p_resource) {
 	EditorNode::get_singleton()->edit_resource(p_resource);
 }
 
+void EditorInterface::edit_node(Node *p_node) {
+	EditorNode::get_singleton()->edit_node(p_node); // Forwards to the editor singleton; registered as "edit_node" in _bind_methods().
+}
+
 void EditorInterface::open_scene_from_path(const String &scene_path) {
 	if (EditorNode::get_singleton()->is_changing_scene()) {
 		return;
@@ -312,6 +316,7 @@ void EditorInterface::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_base_control"), &EditorInterface::get_base_control);
 	ClassDB::bind_method(D_METHOD("get_editor_scale"), &EditorInterface::get_editor_scale);
 	ClassDB::bind_method(D_METHOD("edit_resource", "resource"), &EditorInterface::edit_resource);
+	ClassDB::bind_method(D_METHOD("edit_node", "node"), &EditorInterface::edit_node);
 	ClassDB::bind_method(D_METHOD("open_scene_from_path", "scene_filepath"), &EditorInterface::open_scene_from_path);
 	ClassDB::bind_method(D_METHOD("reload_scene_from_path", "scene_filepath"), &EditorInterface::reload_scene_from_path);
 	ClassDB::bind_method(D_METHOD("play_main_scene"), &EditorInterface::play_main_scene);
diff --git a/editor/editor_plugin.h b/editor/editor_plugin.h
index b0713c641b..67b163eabf 100644
--- a/editor/editor_plugin.h
+++ b/editor/editor_plugin.h
@@ -71,6 +71,7 @@ public:
 
 	Control *get_editor_main_control();
 	void edit_resource(const Ref<Resource> &p_resource);
+	void edit_node(Node *p_node);
 	void open_scene_from_path(const String &scene_path);
 	void reload_scene_from_path(const String &scene_path);
 
diff --git a/editor/editor_properties.cpp b/editor/editor_properties.cpp
index fa44239e32..652deb1804 100644
--- a/editor/editor_properties.cpp
+++ b/editor/editor_properties.cpp
@@ -38,7 +38,7 @@
 #include "scene/main/window.h"
 #include "scene/resources/font.h"
 
-///////////////////// NULL /////////////////////////
+///////////////////// Nil /////////////////////////
 
 void EditorPropertyNil::update_property() {
 }
@@ -2190,6 +2190,9 @@ void EditorPropertyColor::_picker_created() {
 	} else if (default_color_mode == 2) {
 		picker->get_picker()->set_raw_mode(true);
 	}
+
+	int picker_shape = EDITOR_GET("interface/inspector/default_color_picker_shape");
+	picker->get_picker()->set_picker_shape((ColorPicker::PickerShapeType)picker_shape);
 }
 
 void EditorPropertyColor::_picker_opening() {
diff --git a/editor/editor_themes.cpp b/editor/editor_themes.cpp
index 35cf330714..7cc9ebd63e 100644
--- a/editor/editor_themes.cpp
+++ b/editor/editor_themes.cpp
@@ -963,7 +963,7 @@ Ref<Theme> create_editor_theme(const Ref<Theme> p_theme) {
 	theme->set_color("read_only", "LineEdit", font_disabled_color);
 	theme->set_color("font_color", "LineEdit", font_color);
 	theme->set_color("font_selected_color", "LineEdit", mono_color);
-	theme->set_color("cursor_color", "LineEdit", font_color);
+	theme->set_color("caret_color", "LineEdit", font_color);
 	theme->set_color("selection_color", "LineEdit", selection_color);
 	theme->set_color("clear_button_color", "LineEdit", font_color);
 	theme->set_color("clear_button_color_pressed", "LineEdit", accent_color);
@@ -1282,6 +1282,7 @@ Ref<Theme> create_editor_theme(const Ref<Theme> p_theme) {
 	theme->set_icon("preset_bg", "ColorPicker", theme->get_icon("GuiMiniCheckerboard", "EditorIcons"));
 	theme->set_icon("overbright_indicator", "ColorPicker", theme->get_icon("OverbrightIndicator", "EditorIcons"));
 	theme->set_icon("bar_arrow", "ColorPicker", theme->get_icon("ColorPickerBarArrow", "EditorIcons"));
+	theme->set_icon("picker_cursor", "ColorPicker", theme->get_icon("PickerCursor", "EditorIcons"));
 
 	theme->set_icon("bg", "ColorPickerButton", theme->get_icon("GuiMiniCheckerboard", "EditorIcons"));
 
diff --git a/editor/editor_translation_parser.cpp b/editor/editor_translation_parser.cpp
index fd36372dde..49d5cf1fd3 100644
--- a/editor/editor_translation_parser.cpp
+++ b/editor/editor_translation_parser.cpp
@@ -105,7 +105,7 @@ void EditorTranslationParser::get_recognized_extensions(List<String> *r_extensio
 	for (int i = 0; i < temp.size(); i++) {
 		extensions.insert(temp[i]);
 	}
-	for (auto E = extensions.front(); E; E = E->next()) {
+	for (Set<String>::Element *E = extensions.front(); E; E = E->next()) {
 		r_extensions->push_back(E->get());
 	}
 }
diff --git a/editor/export_template_manager.cpp b/editor/export_template_manager.cpp
index 781d21c370..0f5c01be0e 100644
--- a/editor/export_template_manager.cpp
+++ b/editor/export_template_manager.cpp
@@ -116,13 +116,14 @@ void ExportTemplateManager::_update_template_list() {
 	}
 
 	for (Set<String>::Element *E = templates.back(); E; E = E->prev()) {
-		HBoxContainer *hbc = memnew(HBoxContainer);
-		Label *version = memnew(Label);
-		version->set_modulate(current->get_theme_color("disabled_font_color", "Editor"));
 		String text = E->get();
 		if (text == current_version) {
-			text += " " + TTR("(Current)");
+			continue;
 		}
+
+		HBoxContainer *hbc = memnew(HBoxContainer);
+		Label *version = memnew(Label);
+		version->set_modulate(current->get_theme_color("disabled_font_color", "Editor"));
 		version->set_text(text);
 		version->set_h_size_flags(Control::SIZE_EXPAND_FILL);
 		hbc->add_child(version);
@@ -653,7 +654,7 @@ ExportTemplateManager::ExportTemplateManager() {
 	main_vb->add_margin_child(TTR("Current Version:"), current_hb, false);
 
 	installed_scroll = memnew(ScrollContainer);
-	main_vb->add_margin_child(TTR("Installed Versions:"), installed_scroll, true);
+	main_vb->add_margin_child(TTR("Other Installed Versions:"), installed_scroll, true);
 
 	installed_vb = memnew(VBoxContainer);
 	installed_scroll->add_child(installed_vb);
diff --git a/editor/filesystem_dock.cpp b/editor/filesystem_dock.cpp
index 899070f036..09424647fe 100644
--- a/editor/filesystem_dock.cpp
+++ b/editor/filesystem_dock.cpp
@@ -1465,6 +1465,10 @@ void FileSystemDock::_folder_removed(String p_folder) {
 	}
 
 	current_path->set_text(path);
+	EditorFileSystemDirectory *efd = EditorFileSystem::get_singleton()->get_filesystem_path(path);
+	if (efd) {
+		efd->force_update();
+	}
 }
 
 void FileSystemDock::_rename_operation_confirm() {
diff --git a/editor/icons/FontSize.svg b/editor/icons/FontSize.svg
new file mode 100644
index 0000000000..e608d89b6a
--- /dev/null
+++ b/editor/icons/FontSize.svg
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg width="100%" height="100%" viewBox="0 0 16 16" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" xmlns:serif="http://www.serif.com/" style="fill-rule:evenodd;clip-rule:evenodd;stroke-linejoin:round;stroke-miterlimit:2;"><g id="SmallerT"><rect x="1.047" y="7.127" width="6.025" height="1.2" style="fill:#e0e0e0;fill-rule:nonzero;"/><rect x="3.452" y="7.127" width="1.214" height="6.508" style="fill:#e0e0e0;fill-rule:nonzero;"/><rect x="2.238" y="13.171" width="3.643" height="0.465" style="fill:#e0e0e0;fill-rule:nonzero;"/><path d="M1.477,7.127l0,2.4l-0.43,0l-0,-2.4l0.43,0Z" style="fill:#e0e0e0;fill-rule:nonzero;"/><path d="M7.071,7.127l0,2.4l-0.43,0l0,-2.4l0.43,0Z" style="fill:#e0e0e0;fill-rule:nonzero;"/><path d="M1.477,8.327l0,1.2c0,-0.658 0.389,-1.2 0.861,-1.2l-0.861,0Z" style="fill:#e0e0e0;fill-rule:nonzero;"/><path d="M5.78,8.327c0.473,0 0.861,0.542 0.861,1.2l0,-1.2l-0.861,0Z" style="fill:#e0e0e0;fill-rule:nonzero;"/><path d="M2.238,13.171c0.666,-0 1.214,-0.42 1.214,-0.93l0,0.93l-1.214,-0Z" style="fill:#e0e0e0;fill-rule:nonzero;"/><path d="M5.88,13.171c-0.666,-0 -1.214,-0.42 -1.214,-0.93l0,0.93l1.214,-0Z" style="fill:#e0e0e0;fill-rule:nonzero;"/></g><g id="BiggerT"><rect x="4.563" y="2.873" width="10.773" height="1.539" style="fill:#e0e0e0;fill-rule:nonzero;"/><rect x="9.18" y="2.873" width="1.539" height="10.773" style="fill:#e0e0e0;fill-rule:nonzero;"/><rect x="7.641" y="12.877" width="4.617" height="0.769" style="fill:#e0e0e0;fill-rule:nonzero;"/><path d="M5.332,2.873l0,3.078l-0.769,0l-0,-3.078l0.769,0Z" style="fill:#e0e0e0;fill-rule:nonzero;"/><path d="M15.336,2.873l-0,3.078l-0.77,0l0,-3.078l0.77,0Z" style="fill:#e0e0e0;fill-rule:nonzero;"/><path d="M5.332,4.412l0,1.539c0,-0.844 0.695,-1.539 1.539,-1.539l-1.539,0Z" 
style="fill:#e0e0e0;fill-rule:nonzero;"/><path d="M13.027,4.412c0.844,0 1.539,0.695 1.539,1.539l0,-1.539l-1.539,0Z" style="fill:#e0e0e0;fill-rule:nonzero;"/><path d="M7.641,12.877c0.844,-0 1.539,-0.695 1.539,-1.539l-0,1.539l-1.539,-0Z" style="fill:#e0e0e0;fill-rule:nonzero;"/><path d="M12.258,12.877c-0.845,-0 -1.539,-0.695 -1.539,-1.539l-0,1.539l1.539,-0Z" style="fill:#e0e0e0;fill-rule:nonzero;"/></g></svg>
diff --git a/editor/icons/LargeTexture.svg b/editor/icons/LargeTexture.svg
deleted file mode 100644
index 137a761e1d..0000000000
--- a/editor/icons/LargeTexture.svg
+++ /dev/null
@@ -1 +0,0 @@
-<svg height="16" viewBox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg"><path d="m1 1v1 2h1v-2h2v-1zm11 0v1h2v2h1v-3zm-3 5v1h-1v1h-2v1h-1v1h-1v1h2 2 2 2v-2h-1v-1-1h-1v-1zm-8 6v2 1h3v-1h-2v-2zm13 0v2h-2v1h3v-1-2z" fill="#e0e0e0" fill-opacity=".99608"/></svg>
diff --git a/editor/icons/PickerCursor.svg b/editor/icons/PickerCursor.svg
new file mode 100644
index 0000000000..88ee3f55ce
--- /dev/null
+++ b/editor/icons/PickerCursor.svg
@@ -0,0 +1 @@
+<svg height="16" viewBox="0 0 16 16" width="16" xmlns="http://www.w3.org/2000/svg"><path d="m8 2a6 6 0 0 0 -6 6 6 6 0 0 0 6 6 6 6 0 0 0 6-6 6 6 0 0 0 -6-6zm0 1a5 5 0 0 1 5 5 5 5 0 0 1 -5 5 5 5 0 0 1 -5-5 5 5 0 0 1 5-5z" fill="#fff"/><path d="m8 3a5 5 0 0 0 -5 5 5 5 0 0 0 5 5 5 5 0 0 0 5-5 5 5 0 0 0 -5-5zm-.0605469 1a4 4 0 0 1 .0605469 0 4 4 0 0 1 4 4 4 4 0 0 1 -4 4 4 4 0 0 1 -4-4 4 4 0 0 1 3.9394531-4z"/></svg>
diff --git a/editor/icons/ThemeRemoveAllItems.svg b/editor/icons/ThemeRemoveAllItems.svg
new file mode 100644
index 0000000000..47ed624d04
--- /dev/null
+++ b/editor/icons/ThemeRemoveAllItems.svg
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg width="100%" height="100%" viewBox="0 0 16 16" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" xmlns:serif="http://www.serif.com/" style="fill-rule:evenodd;clip-rule:evenodd;stroke-linejoin:round;stroke-miterlimit:2;"><path d="M8,1.745c-0.595,0 -1.084,0.489 -1.084,1.084l0,3.699l-3.851,-1.927c-0.163,-0.08 -0.343,-0.119 -0.525,-0.112c-0.395,0.015 -0.752,0.244 -0.929,0.597c-0.076,0.151 -0.115,0.317 -0.115,0.485c0,0.41 0.233,0.786 0.599,0.97l3.481,1.74l-3.481,1.74c-0.366,0.184 -0.599,0.56 -0.599,0.97c0,0.168 0.039,0.334 0.115,0.485c0.183,0.367 0.559,0.599 0.969,0.599c0.168,0 0.334,-0.039 0.485,-0.114l3.851,-1.927l0,3.111c0,0.594 0.489,1.084 1.084,1.084c0.595,-0 1.084,-0.49 1.084,-1.084l-0,-3.111l3.851,1.927c0.151,0.075 0.317,0.114 0.485,0.114c0.41,0 0.786,-0.232 0.969,-0.599c0.076,-0.151 0.115,-0.317 0.115,-0.485c-0,-0.41 -0.233,-0.786 -0.599,-0.97l-3.481,-1.74l3.481,-1.74c0.366,-0.184 0.599,-0.56 0.599,-0.97c-0,-0.168 -0.039,-0.334 -0.115,-0.485c-0.182,-0.364 -0.554,-0.596 -0.961,-0.599c-0.171,-0.001 -0.34,0.038 -0.493,0.114l-3.851,1.927l-0,-3.699c-0,-0.595 -0.489,-1.084 -1.084,-1.084Z" style="fill:#a5efac;"/><path d="M8,1.745c-0,0 -0,1.783 -0,1.783l-1.084,0l0,-0.699c0,-0.595 0.489,-1.084 1.084,-1.084Z" style="fill:#ff7070;fill-rule:nonzero;"/><path d="M1.528,5.312l2.957,-0l-1.42,-0.711c-0.163,-0.08 -0.343,-0.119 -0.525,-0.112c-0.395,0.015 -0.752,0.244 -0.929,0.597c-0.036,0.072 -0.064,0.148 -0.083,0.226Zm5.388,-1.784l1.084,0l-0,1.784l-1.084,-0l0,-1.784Z" style="fill:#ffeb70;fill-rule:nonzero;"/><path d="M6.916,5.312l1.084,-0l-0,1.783l-4.796,0l-1.109,-0.554c-0.366,-0.184 -0.599,-0.56 -0.599,-0.97c0,-0.088 0.011,-0.175 0.032,-0.259l2.957,-0l2.431,1.216l0,-1.216Z" style="fill:#9dff70;fill-rule:nonzero;"/><path 
d="M3.204,7.095l4.796,0l-0,1.783l-3.619,0l1.195,-0.597l-2.372,-1.186Z" style="fill:#70ffb9;fill-rule:nonzero;"/><path d="M4.381,8.878l3.619,0l-0,1.784l-1.084,-0l0,-0.628l-1.255,0.628l-4.114,-0c0.088,-0.274 0.283,-0.508 0.548,-0.641l2.286,-1.143Z" style="fill:#70deff;fill-rule:nonzero;"/><path d="M6.916,12.445l1.084,0l-0,1.784l-0,-0c-0.595,-0.001 -1.084,-0.49 -1.084,-1.084l0,-0.7Z" style="fill:#ff70ac;fill-rule:nonzero;"/><path d="M6.916,10.662l1.084,-0l-0,1.783l-1.084,0l0,-1.783Zm-1.255,-0l-4.114,-0c-0.033,0.105 -0.051,0.216 -0.051,0.329c0,0.168 0.039,0.334 0.115,0.485c0.183,0.367 0.559,0.599 0.969,0.599c0.168,0 0.334,-0.039 0.485,-0.114l2.596,-1.299Z" style="fill:#9f70ff;fill-rule:nonzero;"/></svg>
diff --git a/editor/icons/ThemeRemoveCustomItems.svg b/editor/icons/ThemeRemoveCustomItems.svg
new file mode 100644
index 0000000000..bb8a8bd026
--- /dev/null
+++ b/editor/icons/ThemeRemoveCustomItems.svg
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg width="100%" height="100%" viewBox="0 0 16 16" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" xmlns:serif="http://www.serif.com/" style="fill-rule:evenodd;clip-rule:evenodd;stroke-linejoin:round;stroke-miterlimit:2;"><path d="M11.299,3c0.772,0.513 1.42,1.199 1.888,2l-2.553,0c-0.706,-0.621 -1.629,-1 -2.634,-1c-1.005,0 -1.928,0.379 -2.634,1l-2.553,0c0.468,-0.801 1.116,-1.487 1.888,-2l6.598,-0Z" style="fill:#ffeb70;fill-rule:nonzero;"/><path d="M5.366,5c-0.593,0.522 -1.033,1.216 -1.238,2l-2.043,0c0.122,-0.717 0.373,-1.392 0.728,-2l2.553,-0Zm7.821,-0c0.355,0.608 0.606,1.283 0.728,2l-2.043,0c-0.205,-0.784 -0.645,-1.478 -1.238,-2l2.553,-0Z" style="fill:#9dff70;fill-rule:nonzero;"/><path d="M13.915,7c0.056,0.326 0.085,0.66 0.085,1c-0,0.34 -0.029,0.674 -0.085,1l-2.043,0c0.083,-0.32 0.128,-0.655 0.128,-1c0,-0.345 -0.045,-0.68 -0.128,-1l2.043,-0Zm-9.787,0c-0.083,0.32 -0.128,0.655 -0.128,1c0,0.345 0.045,0.68 0.128,1l-2.043,-0c-0.056,-0.326 -0.085,-0.66 -0.085,-1c0,-0.34 0.029,-0.674 0.085,-1l2.043,0Z" style="fill:#70ffb9;fill-rule:nonzero;"/><path d="M4.128,9c0.205,0.784 0.645,1.478 1.238,2l-2.553,0c-0.355,-0.608 -0.606,-1.283 -0.728,-2l2.043,0Zm9.787,0c-0.122,0.717 -0.373,1.392 -0.728,2l-2.553,0c0.593,-0.522 1.033,-1.216 1.238,-2l2.043,0Z" style="fill:#70deff;fill-rule:nonzero;"/><path d="M11.299,13l-6.598,0c0.949,0.631 2.084,1 3.299,1c1.215,0 2.35,-0.369 3.299,-1Z" style="fill:#ff70ac;fill-rule:nonzero;"/><path d="M13.187,11c-0.468,0.801 -1.116,1.487 -1.888,2l-6.598,0c-0.772,-0.513 -1.42,-1.199 -1.888,-2l2.553,0c0.706,0.621 1.629,1 2.634,1c1.005,0 1.928,-0.379 2.634,-1l2.553,0Z" style="fill:#9f70ff;fill-rule:nonzero;"/><path d="M4.701,3l6.598,0c-0.949,-0.631 -2.084,-1 -3.299,-1c-1.215,0 -2.35,0.369 -3.299,1Z" 
style="fill:#ff7070;fill-rule:nonzero;"/></svg>
diff --git a/editor/import/resource_importer_scene.cpp b/editor/import/resource_importer_scene.cpp
index 9041b815ca..96002400f3 100644
--- a/editor/import/resource_importer_scene.cpp
+++ b/editor/import/resource_importer_scene.cpp
@@ -1136,7 +1136,7 @@ Ref<Animation> ResourceImporterScene::import_animation_from_other_importer(Edito
 	return importer->import_animation(p_path, p_flags, p_bake_fps);
 }
 
-void ResourceImporterScene::_generate_meshes(Node *p_node, const Dictionary &p_mesh_data, bool p_generate_lods, bool p_create_shadow_meshes, LightBakeMode p_light_bake_mode, float p_lightmap_texel_size, const Vector<uint8_t> &p_src_lightmap_cache, Vector<uint8_t> &r_dst_lightmap_cache) {
+void ResourceImporterScene::_generate_meshes(Node *p_node, const Dictionary &p_mesh_data, bool p_generate_lods, bool p_create_shadow_meshes, LightBakeMode p_light_bake_mode, float p_lightmap_texel_size, const Vector<uint8_t> &p_src_lightmap_cache, Vector<Vector<uint8_t>> &r_lightmap_caches) {
 	EditorSceneImporterMeshNode3D *src_mesh_node = Object::cast_to<EditorSceneImporterMeshNode3D>(p_node);
 	if (src_mesh_node) {
 		//is mesh
@@ -1216,7 +1216,28 @@ void ResourceImporterScene::_generate_meshes(Node *p_node, const Dictionary &p_m
 						n = n->get_parent_spatial();
 					}
 
-					//use xf as transform for mesh, and bake it
+					Vector<uint8_t> lightmap_cache;
+					src_mesh_node->get_mesh()->lightmap_unwrap_cached(xf, p_lightmap_texel_size, p_src_lightmap_cache, lightmap_cache);
+
+					if (!lightmap_cache.is_empty()) {
+						// r_lightmap_caches is kept sorted by MD5 and free of duplicates.
+						String new_md5 = String::md5(lightmap_cache.ptr()); // MD5 is stored at the beginning of the cache data
+
+						// Find the first stored cache whose MD5 is not smaller than the new one.
+						int insert_at = 0;
+						bool duplicate = false;
+						for (; insert_at < r_lightmap_caches.size(); insert_at++) {
+							String md5 = String::md5(r_lightmap_caches[insert_at].ptr());
+							if (!(md5 < new_md5)) {
+								duplicate = (new_md5 == md5);
+								break;
+							}
+						}
+						// insert_at == size() when the new MD5 sorts last (or the list is empty): insert() then appends.
+						if (!duplicate) {
+							r_lightmap_caches.insert(insert_at, lightmap_cache);
+						}
+					}
 				}
 
 				if (save_to_file != String()) {
@@ -1241,7 +1262,7 @@ void ResourceImporterScene::_generate_meshes(Node *p_node, const Dictionary &p_m
 			if (mesh.is_valid()) {
 				mesh_node->set_mesh(mesh);
 				for (int i = 0; i < mesh->get_surface_count(); i++) {
-					mesh_node->set_surface_material(i, src_mesh_node->get_surface_material(i));
+					mesh_node->set_surface_override_material(i, src_mesh_node->get_surface_material(i));
 				}
 			}
 		}
@@ -1265,7 +1286,7 @@ void ResourceImporterScene::_generate_meshes(Node *p_node, const Dictionary &p_m
 	}
 
 	for (int i = 0; i < p_node->get_child_count(); i++) {
-		_generate_meshes(p_node->get_child(i), p_mesh_data, p_generate_lods, p_create_shadow_meshes, p_light_bake_mode, p_lightmap_texel_size, p_src_lightmap_cache, r_dst_lightmap_cache);
+		_generate_meshes(p_node->get_child(i), p_mesh_data, p_generate_lods, p_create_shadow_meshes, p_light_bake_mode, p_lightmap_texel_size, p_src_lightmap_cache, r_lightmap_caches);
 	}
 }
 
@@ -1433,7 +1454,7 @@ Error ResourceImporterScene::import(const String &p_source_file, const String &p
 	float lightmap_texel_size = MAX(0.001, texel_size);
 
 	Vector<uint8_t> src_lightmap_cache;
-	Vector<uint8_t> dst_lightmap_cache;
+	Vector<Vector<uint8_t>> mesh_lightmap_caches;
 
 	{
 		src_lightmap_cache = FileAccess::get_file_as_array(p_source_file + ".unwrap_cache", &err);
@@ -1446,124 +1467,20 @@ Error ResourceImporterScene::import(const String &p_source_file, const String &p
 	if (subresources.has("meshes")) {
 		mesh_data = subresources["meshes"];
 	}
-	_generate_meshes(scene, mesh_data, gen_lods, create_shadow_meshes, LightBakeMode(light_bake_mode), lightmap_texel_size, src_lightmap_cache, dst_lightmap_cache);
+	_generate_meshes(scene, mesh_data, gen_lods, create_shadow_meshes, LightBakeMode(light_bake_mode), lightmap_texel_size, src_lightmap_cache, mesh_lightmap_caches);
 
-	if (dst_lightmap_cache.size()) {
+	if (mesh_lightmap_caches.size()) {
 		FileAccessRef f = FileAccess::open(p_source_file + ".unwrap_cache", FileAccess::WRITE);
 		if (f) {
-			f->store_buffer(dst_lightmap_cache.ptr(), dst_lightmap_cache.size());
-		}
-	}
-	err = OK;
-
-#if 0
-	if (light_bake_mode == 2 /* || generate LOD */) {
-		Map<Ref<ArrayMesh>, Transform> meshes;
-		_find_meshes(scene, meshes);
-
-		String file_id = src_path.get_file();
-		String cache_file_path = base_path.plus_file(file_id + ".unwrap_cache");
-
-		Vector<unsigned char> cache_data;
-
-		if (FileAccess::exists(cache_file_path)) {
-			Error err2;
-			FileAccess *file = FileAccess::open(cache_file_path, FileAccess::READ, &err2);
-
-			if (err2) {
-				if (file) {
-					memdelete(file);
-				}
-			} else {
-				int cache_size = file->get_len();
-				cache_data.resize(cache_size);
-				file->get_buffer(cache_data.ptrw(), cache_size);
+			f->store_32(mesh_lightmap_caches.size());
+			for (int i = 0; i < mesh_lightmap_caches.size(); i++) {
+				// Each cache blob starts with its own MD5, so no separate hash is written.
+				f->store_buffer(mesh_lightmap_caches[i].ptr(), mesh_lightmap_caches[i].size());
 			}
-		}
-
-		Map<String, unsigned int> used_unwraps;
-
-		EditorProgress progress2("gen_lightmaps", TTR("Generating Lightmaps"), meshes.size());
-		int step = 0;
-		for (Map<Ref<ArrayMesh>, Transform>::Element *E = meshes.front(); E; E = E->next()) {
-			Ref<ArrayMesh> mesh = E->key();
-			String name = mesh->get_name();
-			if (name == "") { //should not happen but..
-				name = "Mesh " + itos(step);
-			}
-
-			progress2.step(TTR("Generating for Mesh: ") + name + " (" + itos(step) + "/" + itos(meshes.size()) + ")", step);
-
-			int *ret_cache_data = (int *)cache_data.ptrw();
-			unsigned int ret_cache_size = cache_data.size();
-			bool ret_used_cache = true; // Tell the unwrapper to use the cache
-			Error err2 = mesh->lightmap_unwrap_cached(ret_cache_data, ret_cache_size, ret_used_cache, E->get(), texel_size);
-
-			if (err2 != OK) {
-				EditorNode::add_io_error("Mesh '" + name + "' failed lightmap generation. Please fix geometry.");
-			} else {
-`				String hash = String::md5((unsigned char *)ret_cache_data);
-				used_unwraps.insert(hash, ret_cache_size);
-
-				if (!ret_used_cache) {
-					// Cache was not used, add the generated entry to the current cache
-					if (cache_data.is_empty()) {
-						cache_data.resize(4 + ret_cache_size);
-						int *data = (int *)cache_data.ptrw();
-						data[0] = 1;
-						memcpy(&data[1], ret_cache_data, ret_cache_size);
-					} else {
-						int current_size = cache_data.size();
-						cache_data.resize(cache_data.size() + ret_cache_size);
-							unsigned char *ptrw = cache_data.ptrw();
-						memcpy(&ptrw[current_size], ret_cache_data, ret_cache_size);
-						int *data = (int *)ptrw;
-						data[0] += 1;
-					}
-				}
-			}
-			step++;
-		}
-
-		Error err2;
-		FileAccess *file = FileAccess::open(cache_file_path, FileAccess::WRITE, &err2);
-
-		if (err2) {
-			if (file) {
-				memdelete(file);
-			}
-		} else {
-			// Store number of entries
-			file->store_32(used_unwraps.size());
-
-			// Store cache entries
-			const int *cache = (int *)cache_data.ptr();
-			unsigned int r_idx = 1;
-			for (int i = 0; i < cache[0]; ++i) {
-				unsigned char *entry_start = (unsigned char *)&cache[r_idx];
-				String entry_hash = String::md5(entry_start);
-				if (used_unwraps.has(entry_hash)) {
-					unsigned int entry_size = used_unwraps[entry_hash];
-					file->store_buffer(entry_start, entry_size);
-				}
-
-				r_idx += 4; // hash
-				r_idx += 2; // size hint
-
-				int vertex_count = cache[r_idx];
-				r_idx += 1; // vertex count
-				r_idx += vertex_count; // vertex
-				r_idx += vertex_count * 2; // uvs
-
-				int index_count = cache[r_idx];
-				r_idx += 1; // index count
-				r_idx += index_count; // indices
-			}
-
-			file->close();
+			f->close();
 		}
 	}
-#endif
+	err = OK;
 
 	progress.step(TTR("Running Custom Script..."), 2);
 
diff --git a/editor/import/resource_importer_scene.h b/editor/import/resource_importer_scene.h
index 6c6af57c4c..8cb84abce2 100644
--- a/editor/import/resource_importer_scene.h
+++ b/editor/import/resource_importer_scene.h
@@ -119,7 +119,7 @@ class ResourceImporterScene : public ResourceImporter {
 	};
 
 	void _replace_owner(Node *p_node, Node *p_scene, Node *p_new_owner);
-	void _generate_meshes(Node *p_node, const Dictionary &p_mesh_data, bool p_generate_lods, bool p_create_shadow_meshes, LightBakeMode p_light_bake_mode, float p_lightmap_texel_size, const Vector<uint8_t> &p_src_lightmap_cache, Vector<uint8_t> &r_dst_lightmap_cache);
+	void _generate_meshes(Node *p_node, const Dictionary &p_mesh_data, bool p_generate_lods, bool p_create_shadow_meshes, LightBakeMode p_light_bake_mode, float p_lightmap_texel_size, const Vector<uint8_t> &p_src_lightmap_cache, Vector<Vector<uint8_t>> &r_lightmap_caches);
 	void _add_shapes(Node *p_node, const List<Ref<Shape3D>> &p_shapes);
 
 public:
@@ -173,6 +173,8 @@ public:
 	virtual bool has_advanced_options() const override;
 	virtual void show_advanced_options(const String &p_path) override;
 
+	virtual bool can_import_threaded() const override { return false; }
+
 	ResourceImporterScene();
 };
 
diff --git a/editor/import/scene_importer_mesh.cpp b/editor/import/scene_importer_mesh.cpp
index 28fdd4ddbd..bc7e8a1626 100644
--- a/editor/import/scene_importer_mesh.cpp
+++ b/editor/import/scene_importer_mesh.cpp
@@ -583,7 +583,7 @@ Ref<NavigationMesh> EditorSceneImporterMesh::create_navigation_mesh() {
 	return nm;
 }
 
-extern bool (*array_mesh_lightmap_unwrap_callback)(float p_texel_size, const float *p_vertices, const float *p_normals, int p_vertex_count, const int *p_indices, int p_index_count, float **r_uv, int **r_vertex, int *r_vertex_count, int **r_index, int *r_index_count, int *r_size_hint_x, int *r_size_hint_y, int *&r_cache_data, unsigned int &r_cache_size, bool &r_used_cache);
+extern bool (*array_mesh_lightmap_unwrap_callback)(float p_texel_size, const float *p_vertices, const float *p_normals, int p_vertex_count, const int *p_indices, int p_index_count, const uint8_t *p_cache_data, bool *r_use_cache, uint8_t **r_mesh_cache, int *r_mesh_cache_size, float **r_uv, int **r_vertex, int *r_vertex_count, int **r_index, int *r_index_count, int *r_size_hint_x, int *r_size_hint_y);
 
 struct EditorSceneImporterMeshLightmapSurface {
 	Ref<Material> material;
@@ -593,22 +593,24 @@ struct EditorSceneImporterMeshLightmapSurface {
 	String name;
 };
 
-Error EditorSceneImporterMesh::lightmap_unwrap_cached(int *&r_cache_data, unsigned int &r_cache_size, bool &r_used_cache, const Transform &p_base_transform, float p_texel_size) {
+Error EditorSceneImporterMesh::lightmap_unwrap_cached(const Transform &p_base_transform, float p_texel_size, const Vector<uint8_t> &p_src_cache, Vector<uint8_t> &r_dst_cache) {
 	ERR_FAIL_COND_V(!array_mesh_lightmap_unwrap_callback, ERR_UNCONFIGURED);
 	ERR_FAIL_COND_V_MSG(blend_shapes.size() != 0, ERR_UNAVAILABLE, "Can't unwrap mesh with blend shapes.");
 
-	Vector<float> vertices;
-	Vector<float> normals;
-	Vector<int> indices;
-	Vector<float> uv;
-	Vector<Pair<int, int>> uv_indices;
+	LocalVector<float> vertices;
+	LocalVector<float> normals;
+	LocalVector<int> indices;
+	LocalVector<float> uv;
+	LocalVector<Pair<int, int>> uv_indices;
 
 	Vector<EditorSceneImporterMeshLightmapSurface> lightmap_surfaces;
 
 	// Keep only the scale
-	Transform transform = p_base_transform;
-	transform.origin = Vector3();
-	transform.looking_at(Vector3(1, 0, 0), Vector3(0, 1, 0));
+	Basis basis = p_base_transform.get_basis();
+	Vector3 scale = Vector3(basis.get_axis(0).length(), basis.get_axis(1).length(), basis.get_axis(2).length());
+
+	Transform transform;
+	transform.scale(scale);
 
 	Basis normal_basis = transform.basis.inverse().transposed();
 
@@ -623,15 +625,10 @@ Error EditorSceneImporterMesh::lightmap_unwrap_cached(int *&r_cache_data, unsign
 
 		SurfaceTool::create_vertex_array_from_triangle_arrays(arrays, s.vertices, &s.format);
 
-		Vector<Vector3> rvertices = arrays[Mesh::ARRAY_VERTEX];
+		PackedVector3Array rvertices = arrays[Mesh::ARRAY_VERTEX];
 		int vc = rvertices.size();
-		const Vector3 *r = rvertices.ptr();
-
-		Vector<Vector3> rnormals = arrays[Mesh::ARRAY_NORMAL];
-
-		ERR_FAIL_COND_V_MSG(rnormals.size() == 0, ERR_UNAVAILABLE, "Normals are required for lightmap unwrap.");
 
-		const Vector3 *rn = rnormals.ptr();
+		PackedVector3Array rnormals = arrays[Mesh::ARRAY_NORMAL];
 
 		int vertex_ofs = vertices.size() / 3;
 
@@ -640,24 +637,29 @@ Error EditorSceneImporterMesh::lightmap_unwrap_cached(int *&r_cache_data, unsign
 		uv_indices.resize(vertex_ofs + vc);
 
 		for (int j = 0; j < vc; j++) {
-			Vector3 v = transform.xform(r[j]);
-			Vector3 n = normal_basis.xform(rn[j]).normalized();
-
-			vertices.write[(j + vertex_ofs) * 3 + 0] = v.x;
-			vertices.write[(j + vertex_ofs) * 3 + 1] = v.y;
-			vertices.write[(j + vertex_ofs) * 3 + 2] = v.z;
-			normals.write[(j + vertex_ofs) * 3 + 0] = n.x;
-			normals.write[(j + vertex_ofs) * 3 + 1] = n.y;
-			normals.write[(j + vertex_ofs) * 3 + 2] = n.z;
-			uv_indices.write[j + vertex_ofs] = Pair<int, int>(i, j);
+			Vector3 v = transform.xform(rvertices[j]);
+			Vector3 n = normal_basis.xform(rnormals[j]).normalized();
+
+			vertices[(j + vertex_ofs) * 3 + 0] = v.x;
+			vertices[(j + vertex_ofs) * 3 + 1] = v.y;
+			vertices[(j + vertex_ofs) * 3 + 2] = v.z;
+			normals[(j + vertex_ofs) * 3 + 0] = n.x;
+			normals[(j + vertex_ofs) * 3 + 1] = n.y;
+			normals[(j + vertex_ofs) * 3 + 2] = n.z;
+			uv_indices[j + vertex_ofs] = Pair<int, int>(i, j);
 		}
 
-		Vector<int> rindices = arrays[Mesh::ARRAY_INDEX];
+		PackedInt32Array rindices = arrays[Mesh::ARRAY_INDEX];
 		int ic = rindices.size();
 
+		float eps = 1.19209290e-7F; // Taken from xatlas.h
 		if (ic == 0) {
 			for (int j = 0; j < vc / 3; j++) {
-				if (Face3(r[j * 3 + 0], r[j * 3 + 1], r[j * 3 + 2]).is_degenerate()) {
+				Vector3 p0 = transform.xform(rvertices[j * 3 + 0]);
+				Vector3 p1 = transform.xform(rvertices[j * 3 + 1]);
+				Vector3 p2 = transform.xform(rvertices[j * 3 + 2]);
+
+				if ((p0 - p1).length_squared() < eps || (p1 - p2).length_squared() < eps || (p2 - p0).length_squared() < eps) {
 					continue;
 				}
 
@@ -667,15 +669,18 @@ Error EditorSceneImporterMesh::lightmap_unwrap_cached(int *&r_cache_data, unsign
 			}
 
 		} else {
-			const int *ri = rindices.ptr();
-
 			for (int j = 0; j < ic / 3; j++) {
-				if (Face3(r[ri[j * 3 + 0]], r[ri[j * 3 + 1]], r[ri[j * 3 + 2]]).is_degenerate()) {
+				Vector3 p0 = transform.xform(rvertices[rindices[j * 3 + 0]]);
+				Vector3 p1 = transform.xform(rvertices[rindices[j * 3 + 1]]);
+				Vector3 p2 = transform.xform(rvertices[rindices[j * 3 + 2]]);
+
+				if ((p0 - p1).length_squared() < eps || (p1 - p2).length_squared() < eps || (p2 - p0).length_squared() < eps) {
 					continue;
 				}
-				indices.push_back(vertex_ofs + ri[j * 3 + 0]);
-				indices.push_back(vertex_ofs + ri[j * 3 + 1]);
-				indices.push_back(vertex_ofs + ri[j * 3 + 2]);
+
+				indices.push_back(vertex_ofs + rindices[j * 3 + 0]);
+				indices.push_back(vertex_ofs + rindices[j * 3 + 1]);
+				indices.push_back(vertex_ofs + rindices[j * 3 + 2]);
 			}
 		}
 
@@ -684,6 +689,9 @@ Error EditorSceneImporterMesh::lightmap_unwrap_cached(int *&r_cache_data, unsign
 
 	//unwrap
 
+	bool use_cache = true; // Used to request cache generation and to know if cache was used
+	uint8_t *gen_cache;
+	int gen_cache_size;
 	float *gen_uvs;
 	int *gen_vertices;
 	int *gen_indices;
@@ -692,7 +700,7 @@ Error EditorSceneImporterMesh::lightmap_unwrap_cached(int *&r_cache_data, unsign
 	int size_x;
 	int size_y;
 
-	bool ok = array_mesh_lightmap_unwrap_callback(p_texel_size, vertices.ptr(), normals.ptr(), vertices.size() / 3, indices.ptr(), indices.size(), &gen_uvs, &gen_vertices, &gen_vertex_count, &gen_indices, &gen_index_count, &size_x, &size_y, r_cache_data, r_cache_size, r_used_cache);
+	bool ok = array_mesh_lightmap_unwrap_callback(p_texel_size, vertices.ptr(), normals.ptr(), vertices.size() / 3, indices.ptr(), indices.size(), p_src_cache.ptr(), &use_cache, &gen_cache, &gen_cache_size, &gen_uvs, &gen_vertices, &gen_vertex_count, &gen_indices, &gen_index_count, &size_x, &size_y);
 
 	if (!ok) {
 		return ERR_CANT_CREATE;
@@ -702,7 +710,7 @@ Error EditorSceneImporterMesh::lightmap_unwrap_cached(int *&r_cache_data, unsign
 	clear();
 
 	//create surfacetools for each surface..
-	Vector<Ref<SurfaceTool>> surfaces_tools;
+	LocalVector<Ref<SurfaceTool>> surfaces_tools;
 
 	for (int i = 0; i < lightmap_surfaces.size(); i++) {
 		Ref<SurfaceTool> st;
@@ -714,11 +722,12 @@ Error EditorSceneImporterMesh::lightmap_unwrap_cached(int *&r_cache_data, unsign
 	}
 
 	print_verbose("Mesh: Gen indices: " + itos(gen_index_count));
+
 	//go through all indices
 	for (int i = 0; i < gen_index_count; i += 3) {
-		ERR_FAIL_INDEX_V(gen_vertices[gen_indices[i + 0]], uv_indices.size(), ERR_BUG);
-		ERR_FAIL_INDEX_V(gen_vertices[gen_indices[i + 1]], uv_indices.size(), ERR_BUG);
-		ERR_FAIL_INDEX_V(gen_vertices[gen_indices[i + 2]], uv_indices.size(), ERR_BUG);
+		ERR_FAIL_INDEX_V(gen_vertices[gen_indices[i + 0]], (int)uv_indices.size(), ERR_BUG);
+		ERR_FAIL_INDEX_V(gen_vertices[gen_indices[i + 1]], (int)uv_indices.size(), ERR_BUG);
+		ERR_FAIL_INDEX_V(gen_vertices[gen_indices[i + 2]], (int)uv_indices.size(), ERR_BUG);
 
 		ERR_FAIL_COND_V(uv_indices[gen_vertices[gen_indices[i + 0]]].first != uv_indices[gen_vertices[gen_indices[i + 1]]].first || uv_indices[gen_vertices[gen_indices[i + 0]]].first != uv_indices[gen_vertices[gen_indices[i + 2]]].first, ERR_BUG);
 
@@ -728,49 +737,54 @@ Error EditorSceneImporterMesh::lightmap_unwrap_cached(int *&r_cache_data, unsign
 			SurfaceTool::Vertex v = lightmap_surfaces[surface].vertices[uv_indices[gen_vertices[gen_indices[i + j]]].second];
 
 			if (lightmap_surfaces[surface].format & Mesh::ARRAY_FORMAT_COLOR) {
-				surfaces_tools.write[surface]->set_color(v.color);
+				surfaces_tools[surface]->set_color(v.color);
 			}
 			if (lightmap_surfaces[surface].format & Mesh::ARRAY_FORMAT_TEX_UV) {
-				surfaces_tools.write[surface]->set_uv(v.uv);
+				surfaces_tools[surface]->set_uv(v.uv);
 			}
 			if (lightmap_surfaces[surface].format & Mesh::ARRAY_FORMAT_NORMAL) {
-				surfaces_tools.write[surface]->set_normal(v.normal);
+				surfaces_tools[surface]->set_normal(v.normal);
 			}
 			if (lightmap_surfaces[surface].format & Mesh::ARRAY_FORMAT_TANGENT) {
 				Plane t;
 				t.normal = v.tangent;
 				t.d = v.binormal.dot(v.normal.cross(v.tangent)) < 0 ? -1 : 1;
-				surfaces_tools.write[surface]->set_tangent(t);
+				surfaces_tools[surface]->set_tangent(t);
 			}
 			if (lightmap_surfaces[surface].format & Mesh::ARRAY_FORMAT_BONES) {
-				surfaces_tools.write[surface]->set_bones(v.bones);
+				surfaces_tools[surface]->set_bones(v.bones);
 			}
 			if (lightmap_surfaces[surface].format & Mesh::ARRAY_FORMAT_WEIGHTS) {
-				surfaces_tools.write[surface]->set_weights(v.weights);
+				surfaces_tools[surface]->set_weights(v.weights);
 			}
 
 			Vector2 uv2(gen_uvs[gen_indices[i + j] * 2 + 0], gen_uvs[gen_indices[i + j] * 2 + 1]);
-			surfaces_tools.write[surface]->set_uv2(uv2);
+			surfaces_tools[surface]->set_uv2(uv2);
 
-			surfaces_tools.write[surface]->add_vertex(v.vertex);
+			surfaces_tools[surface]->add_vertex(v.vertex);
 		}
 	}
 
 	//generate surfaces
-
-	for (int i = 0; i < surfaces_tools.size(); i++) {
-		surfaces_tools.write[i]->index();
-		Array arrays = surfaces_tools.write[i]->commit_to_arrays();
-		add_surface(surfaces_tools.write[i]->get_primitive(), arrays, Array(), Dictionary(), surfaces_tools.write[i]->get_material(), surfaces_tools.write[i]->get_meta("name"));
+	for (unsigned int i = 0; i < surfaces_tools.size(); i++) {
+		surfaces_tools[i]->index();
+		Array arrays = surfaces_tools[i]->commit_to_arrays();
+		add_surface(surfaces_tools[i]->get_primitive(), arrays, Array(), Dictionary(), surfaces_tools[i]->get_material(), surfaces_tools[i]->get_meta("name"));
 	}
 
 	set_lightmap_size_hint(Size2(size_x, size_y));
 
-	if (!r_used_cache) {
-		//free stuff
-		::free(gen_vertices);
-		::free(gen_indices);
-		::free(gen_uvs);
+	if (gen_cache_size > 0) {
+		r_dst_cache.resize(gen_cache_size);
+		memcpy(r_dst_cache.ptrw(), gen_cache, gen_cache_size);
+		memfree(gen_cache);
+	}
+
+	if (!use_cache) {
+		// Cache was not used, free the buffers
+		memfree(gen_vertices);
+		memfree(gen_indices);
+		memfree(gen_uvs);
 	}
 
 	return OK;
diff --git a/editor/import/scene_importer_mesh.h b/editor/import/scene_importer_mesh.h
index 3326fab55d..b3e8137e0a 100644
--- a/editor/import/scene_importer_mesh.h
+++ b/editor/import/scene_importer_mesh.h
@@ -105,7 +105,7 @@ public:
 	Vector<Ref<Shape3D>> convex_decompose() const;
 	Ref<Shape3D> create_trimesh_shape() const;
 	Ref<NavigationMesh> create_navigation_mesh();
-	Error lightmap_unwrap_cached(int *&r_cache_data, unsigned int &r_cache_size, bool &r_used_cache, const Transform &p_base_transform, float p_texel_size);
+	Error lightmap_unwrap_cached(const Transform &p_base_transform, float p_texel_size, const Vector<uint8_t> &p_src_cache, Vector<uint8_t> &r_dst_cache);
 
 	void set_lightmap_size_hint(const Size2i &p_size);
 	Size2i get_lightmap_size_hint() const;
diff --git a/editor/localization_editor.cpp b/editor/localization_editor.cpp
index 0e68af06f0..161f1dde0d 100644
--- a/editor/localization_editor.cpp
+++ b/editor/localization_editor.cpp
@@ -37,24 +37,6 @@
 #include "scene/gui/control.h"
 
 void LocalizationEditor::_notification(int p_what) {
-	if (p_what == NOTIFICATION_TEXT_SERVER_CHANGED) {
-		ts_name->set_text(TTR("Text server: ") + TS->get_name());
-
-		FileAccessRef file_check = FileAccess::create(FileAccess::ACCESS_RESOURCES);
-		if (TS->has_feature(TextServer::FEATURE_USE_SUPPORT_DATA)) {
-			if (file_check->file_exists("res://" + TS->get_support_data_filename())) {
-				ts_data_status->set_text(TTR("Support data: ") + TTR("Installed"));
-				ts_install->set_disabled(true);
-			} else {
-				ts_data_status->set_text(TTR("Support data: ") + TTR("Not installed"));
-				ts_install->set_disabled(false);
-			}
-		} else {
-			ts_data_status->set_text(TTR("Support data: ") + TTR("Not supported"));
-			ts_install->set_disabled(false);
-		}
-		ts_data_info->set_text(TTR("Info: ") + TS->get_support_data_info());
-	}
 	if (p_what == NOTIFICATION_ENTER_TREE) {
 		translation_list->connect("button_pressed", callable_mp(this, &LocalizationEditor::_translation_delete));
 		translation_pot_list->connect("button_pressed", callable_mp(this, &LocalizationEditor::_pot_delete));
@@ -649,26 +631,6 @@ void LocalizationEditor::update_translations() {
 	updating_translations = false;
 }
 
-void LocalizationEditor::_install_ts_data() {
-	if (TS->has_feature(TextServer::FEATURE_USE_SUPPORT_DATA)) {
-		TS->save_support_data("res://" + TS->get_support_data_filename());
-	}
-
-	FileAccessRef file_check = FileAccess::create(FileAccess::ACCESS_RESOURCES);
-	if (TS->has_feature(TextServer::FEATURE_USE_SUPPORT_DATA)) {
-		if (file_check->file_exists("res://" + TS->get_support_data_filename())) {
-			ts_data_status->set_text(TTR("Support data: ") + TTR("Installed"));
-			ts_install->set_disabled(true);
-		} else {
-			ts_data_status->set_text(TTR("Support data: ") + TTR("Not installed"));
-			ts_install->set_disabled(false);
-		}
-	} else {
-		ts_data_status->set_text(TTR("Support data: ") + TTR("Not supported"));
-		ts_install->set_disabled(false);
-	}
-}
-
 void LocalizationEditor::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("update_translations"), &LocalizationEditor::update_translations);
 
@@ -838,37 +800,4 @@ LocalizationEditor::LocalizationEditor() {
 		pot_file_open_dialog->connect("files_selected", callable_mp(this, &LocalizationEditor::_pot_add));
 		add_child(pot_file_open_dialog);
 	}
-
-	{
-		VBoxContainer *tvb = memnew(VBoxContainer);
-		tvb->set_name(TTR("Text Server Data"));
-		translations->add_child(tvb);
-
-		ts_name = memnew(Label(TTR("Text server: ") + TS->get_name()));
-		tvb->add_child(ts_name);
-
-		ts_data_status = memnew(Label(TTR("Support data: ")));
-		tvb->add_child(ts_data_status);
-
-		ts_data_info = memnew(Label(TTR("Info: ") + TS->get_support_data_info()));
-		tvb->add_child(ts_data_info);
-
-		ts_install = memnew(Button(TTR("Install support data...")));
-		ts_install->connect("pressed", callable_mp(this, &LocalizationEditor::_install_ts_data));
-		tvb->add_child(ts_install);
-
-		FileAccessRef file_check = FileAccess::create(FileAccess::ACCESS_RESOURCES);
-		if (TS->has_feature(TextServer::FEATURE_USE_SUPPORT_DATA)) {
-			if (file_check->file_exists("res://" + TS->get_support_data_filename())) {
-				ts_data_status->set_text(TTR("Support data: ") + TTR("Installed"));
-				ts_install->set_disabled(true);
-			} else {
-				ts_data_status->set_text(TTR("Support data: ") + TTR("Not installed"));
-				ts_install->set_disabled(false);
-			}
-		} else {
-			ts_data_status->set_text(TTR("Support data: ") + TTR("Not supported"));
-			ts_install->set_disabled(false);
-		}
-	}
 }
diff --git a/editor/localization_editor.h b/editor/localization_editor.h
index 6e0d7ce61f..23cea06fbe 100644
--- a/editor/localization_editor.h
+++ b/editor/localization_editor.h
@@ -58,11 +58,6 @@ class LocalizationEditor : public VBoxContainer {
 	Vector<TreeItem *> translation_filter_treeitems;
 	Vector<int> translation_locales_idxs_remap;
 
-	Label *ts_name;
-	Label *ts_data_status;
-	Label *ts_data_info;
-	Button *ts_install;
-
 	Tree *translation_pot_list;
 	EditorFileDialog *pot_file_open_dialog;
 	EditorFileDialog *pot_generate_dialog;
@@ -94,8 +89,6 @@ class LocalizationEditor : public VBoxContainer {
 	void _pot_generate(const String &p_file);
 	void _update_pot_file_extensions();
 
-	void _install_ts_data();
-
 protected:
 	void _notification(int p_what);
 	static void _bind_methods();
diff --git a/editor/node_3d_editor_gizmos.cpp b/editor/node_3d_editor_gizmos.cpp
index 7dcabafece..afafd7d195 100644
--- a/editor/node_3d_editor_gizmos.cpp
+++ b/editor/node_3d_editor_gizmos.cpp
@@ -47,6 +47,7 @@
 #include "scene/3d/listener_3d.h"
 #include "scene/3d/mesh_instance_3d.h"
 #include "scene/3d/navigation_region_3d.h"
+#include "scene/3d/occluder_instance_3d.h"
 #include "scene/3d/physics_joint_3d.h"
 #include "scene/3d/position_3d.h"
 #include "scene/3d/ray_cast_3d.h"
@@ -176,6 +177,7 @@ void EditorNode3DGizmo::Instance::create_instance(Node3D *p_base, bool p_hidden)
 	RS::get_singleton()->instance_geometry_set_cast_shadows_setting(instance, RS::SHADOW_CASTING_SETTING_OFF);
 	int layer = p_hidden ? 0 : 1 << Node3DEditorViewport::GIZMO_EDIT_LAYER;
 	RS::get_singleton()->instance_set_layer_mask(instance, layer); //gizmos are 26
+	RS::get_singleton()->instance_geometry_set_flag(instance, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 }
 
 void EditorNode3DGizmo::add_mesh(const Ref<ArrayMesh> &p_mesh, bool p_billboard, const Ref<SkinReference> &p_skin_reference, const Ref<Material> &p_material) {
@@ -1464,6 +1466,44 @@ void MeshInstance3DGizmoPlugin::redraw(EditorNode3DGizmo *p_gizmo) {
 }
 
 /////
+
+OccluderInstance3DGizmoPlugin::OccluderInstance3DGizmoPlugin() { // Sets up the material that redraw() later fetches by name.
+	create_material("line_material", EDITOR_DEF("editors/3d_gizmos/gizmo_colors/occluder", Color(0.8, 0.5, 1))); // Color is read through the "gizmo_colors/occluder" editor setting; Color(0.8, 0.5, 1) is the value passed as its default.
+}
+
+bool OccluderInstance3DGizmoPlugin::has_gizmo(Node3D *p_spatial) { // Tells whether this plugin applies to p_spatial.
+	return Object::cast_to<OccluderInstance3D>(p_spatial) != nullptr; // True exactly when the node casts to OccluderInstance3D.
+}
+
+String OccluderInstance3DGizmoPlugin::get_gizmo_name() const { // Returns the fixed name string of this gizmo plugin.
+	return "OccluderInstance3D"; // NOTE(review): presumably the label shown for this gizmo in the editor UI -- confirm against the base class.
+}
+
+int OccluderInstance3DGizmoPlugin::get_priority() const { // Returns the constant priority of this gizmo plugin.
+	return -1; // NOTE(review): presumably ranks this plugin below default-priority ones -- confirm against EditorNode3DGizmoPlugin.
+}
+
+void OccluderInstance3DGizmoPlugin::redraw(EditorNode3DGizmo *p_gizmo) {
+	// Resolve the node first, then drop whatever this gizmo drew previously.
+	OccluderInstance3D *instance = Object::cast_to<OccluderInstance3D>(p_gizmo->get_spatial_node());
+	p_gizmo->clear();
+	// No valid occluder resource assigned: leave the gizmo empty.
+	Ref<Occluder3D> occluder = instance->get_occluder();
+	if (!occluder.is_valid()) {
+		return;
+	}
+
+	Vector<Vector3> debug_lines = occluder->get_debug_lines();
+	if (debug_lines.is_empty()) {
+		return; // Nothing to draw and nothing to register as collision segments.
+	}
+	Ref<Material> line_material = get_material("line_material", p_gizmo);
+	p_gizmo->add_lines(debug_lines, line_material);
+	p_gizmo->add_collision_segments(debug_lines);
+}
+
+/////
+
 Sprite3DGizmoPlugin::Sprite3DGizmoPlugin() {
 }
 
diff --git a/editor/node_3d_editor_gizmos.h b/editor/node_3d_editor_gizmos.h
index 6f98d3a08c..95344176ad 100644
--- a/editor/node_3d_editor_gizmos.h
+++ b/editor/node_3d_editor_gizmos.h
@@ -100,6 +100,18 @@ public:
 	MeshInstance3DGizmoPlugin();
 };
 
+class OccluderInstance3DGizmoPlugin : public EditorNode3DGizmoPlugin { // Gizmo plugin that draws the debug lines of an OccluderInstance3D's occluder.
+	GDCLASS(OccluderInstance3DGizmoPlugin, EditorNode3DGizmoPlugin);
+
+public:
+	bool has_gizmo(Node3D *p_spatial) override; // True when p_spatial casts to OccluderInstance3D.
+	String get_gizmo_name() const override; // Returns "OccluderInstance3D".
+	int get_priority() const override; // Returns -1.
+	void redraw(EditorNode3DGizmo *p_gizmo) override; // Clears the gizmo, then adds the occluder's debug lines and matching collision segments.
+
+	OccluderInstance3DGizmoPlugin(); // Creates the "line_material" used by redraw().
+};
+
 class Sprite3DGizmoPlugin : public EditorNode3DGizmoPlugin {
 	GDCLASS(Sprite3DGizmoPlugin, EditorNode3DGizmoPlugin);
 
diff --git a/editor/plugins/animation_blend_space_1d_editor.cpp b/editor/plugins/animation_blend_space_1d_editor.cpp
index 025fcaf818..f7c0ebcfaf 100644
--- a/editor/plugins/animation_blend_space_1d_editor.cpp
+++ b/editor/plugins/animation_blend_space_1d_editor.cpp
@@ -698,7 +698,7 @@ AnimationNodeBlendSpace1DEditor::AnimationNodeBlendSpace1DEditor() {
 		max_value->set_step(0.01);
 
 		label_value = memnew(LineEdit);
-		label_value->set_expand_to_text_length(true);
+		label_value->set_expand_to_text_length_enabled(true);
 
 		// now add
 
diff --git a/editor/plugins/animation_blend_space_2d_editor.cpp b/editor/plugins/animation_blend_space_2d_editor.cpp
index af9c391174..e719df53d5 100644
--- a/editor/plugins/animation_blend_space_2d_editor.cpp
+++ b/editor/plugins/animation_blend_space_2d_editor.cpp
@@ -942,7 +942,7 @@ AnimationNodeBlendSpace2DEditor::AnimationNodeBlendSpace2DEditor() {
 		left_vbox->add_spacer();
 		label_y = memnew(LineEdit);
 		left_vbox->add_child(label_y);
-		label_y->set_expand_to_text_length(true);
+		label_y->set_expand_to_text_length_enabled(true);
 		left_vbox->add_spacer();
 		min_y_value = memnew(SpinBox);
 		left_vbox->add_child(min_y_value);
@@ -978,7 +978,7 @@ AnimationNodeBlendSpace2DEditor::AnimationNodeBlendSpace2DEditor() {
 		bottom_vbox->add_spacer();
 		label_x = memnew(LineEdit);
 		bottom_vbox->add_child(label_x);
-		label_x->set_expand_to_text_length(true);
+		label_x->set_expand_to_text_length_enabled(true);
 		bottom_vbox->add_spacer();
 		max_x_value = memnew(SpinBox);
 		bottom_vbox->add_child(max_x_value);
diff --git a/editor/plugins/animation_blend_tree_editor_plugin.cpp b/editor/plugins/animation_blend_tree_editor_plugin.cpp
index fdbbe5184b..48fb507bb1 100644
--- a/editor/plugins/animation_blend_tree_editor_plugin.cpp
+++ b/editor/plugins/animation_blend_tree_editor_plugin.cpp
@@ -136,7 +136,7 @@ void AnimationNodeBlendTreeEditor::_update_graph() {
 		if (String(E->get()) != "output") {
 			LineEdit *name = memnew(LineEdit);
 			name->set_text(E->get());
-			name->set_expand_to_text_length(true);
+			name->set_expand_to_text_length_enabled(true);
 			node->add_child(name);
 			node->set_slot(0, false, 0, Color(), true, 0, get_theme_color("font_color", "Label"));
 			name->connect("text_entered", callable_mp(this, &AnimationNodeBlendTreeEditor::_node_renamed), varray(agnode), CONNECT_DEFERRED);
diff --git a/editor/plugins/canvas_item_editor_plugin.cpp b/editor/plugins/canvas_item_editor_plugin.cpp
index b678197037..6ac47595dc 100644
--- a/editor/plugins/canvas_item_editor_plugin.cpp
+++ b/editor/plugins/canvas_item_editor_plugin.cpp
@@ -804,11 +804,15 @@ void CanvasItemEditor::_find_canvas_items_in_rect(const Rect2 &p_rect, Node *p_n
 
 bool CanvasItemEditor::_select_click_on_item(CanvasItem *item, Point2 p_click_pos, bool p_append) {
 	bool still_selected = true;
-	if (p_append) {
+	if (p_append && !editor_selection->get_selected_node_list().is_empty()) {
 		if (editor_selection->is_selected(item)) {
 			// Already in the selection, remove it from the selected nodes
 			editor_selection->remove_node(item);
 			still_selected = false;
+
+			if (editor_selection->get_selected_node_list().size() == 1) {
+				editor->push_item(editor_selection->get_selected_node_list()[0]);
+			}
 		} else {
 			// Add the item to the selection
 			editor_selection->add_node(item);
@@ -2589,6 +2593,9 @@ bool CanvasItemEditor::_gui_input_select(const Ref<InputEvent> &p_event) {
 				}
 
 				_find_canvas_items_in_rect(Rect2(bsfrom, bsto - bsfrom), scene, &selitems);
+				if (selitems.size() == 1 && editor_selection->get_selected_node_list().is_empty()) {
+					editor->push_item(selitems[0]);
+				}
 				for (List<CanvasItem *>::Element *E = selitems.front(); E; E = E->next()) {
 					editor_selection->add_node(E->get());
 				}
@@ -5378,9 +5385,6 @@ void CanvasItemEditor::_focus_selection(int p_op) {
 			rect = rect.merge(canvas_item_rect);
 		}
 	};
-	if (count == 0) {
-		return;
-	}
 
 	if (p_op == VIEW_CENTER_TO_SELECTION) {
 		center = rect.position + rect.size / 2;
@@ -6514,8 +6518,7 @@ bool CanvasItemEditorViewport::can_drop_data(const Point2 &p_point, const Varian
 						   type == "CurveTexture" ||
 						   type == "GradientTexture" ||
 						   type == "StreamTexture2D" ||
-						   type == "AtlasTexture" ||
-						   type == "LargeTexture") {
+						   type == "AtlasTexture") {
 					Ref<Texture2D> texture = Ref<Texture2D>(Object::cast_to<Texture2D>(*res));
 					if (!texture.is_valid()) {
 						continue;
diff --git a/editor/plugins/editor_preview_plugins.cpp b/editor/plugins/editor_preview_plugins.cpp
index d3e5854786..a319a595c7 100644
--- a/editor/plugins/editor_preview_plugins.cpp
+++ b/editor/plugins/editor_preview_plugins.cpp
@@ -81,7 +81,6 @@ bool EditorTexturePreviewPlugin::generate_small_preview_automatically() const {
 Ref<Texture2D> EditorTexturePreviewPlugin::generate(const RES &p_from, const Size2 &p_size) const {
 	Ref<Image> img;
 	Ref<AtlasTexture> atex = p_from;
-	Ref<LargeTexture> ltex = p_from;
 	if (atex.is_valid()) {
 		Ref<Texture2D> tex = atex->get_atlas();
 		if (!tex.is_valid()) {
@@ -94,8 +93,6 @@ Ref<Texture2D> EditorTexturePreviewPlugin::generate(const RES &p_from, const Siz
 		}
 
 		img = atlas->get_rect(atex->get_region());
-	} else if (ltex.is_valid()) {
-		img = ltex->to_image();
 	} else {
 		Ref<Texture2D> tex = p_from;
 		if (tex.is_valid()) {
@@ -507,6 +504,7 @@ Ref<Texture2D> EditorScriptPreviewPlugin::generate(const RES &p_from, const Size
 	Color keyword_color = EditorSettings::get_singleton()->get("text_editor/highlighting/keyword_color");
 	Color text_color = EditorSettings::get_singleton()->get("text_editor/highlighting/text_color");
 	Color symbol_color = EditorSettings::get_singleton()->get("text_editor/highlighting/symbol_color");
+	Color comment_color = EditorSettings::get_singleton()->get("text_editor/highlighting/comment_color");
 
 	if (bg_color.a == 0) {
 		bg_color = Color(0, 0, 0, 0);
@@ -526,33 +524,42 @@ Ref<Texture2D> EditorScriptPreviewPlugin::generate(const RES &p_from, const Size
 
 	bool prev_is_text = false;
 	bool in_keyword = false;
+	bool in_comment = false;
 	for (int i = 0; i < code.length(); i++) {
 		char32_t c = code[i];
 		if (c > 32) {
 			if (col < thumbnail_size) {
 				Color color = text_color;
 
-				if (c != '_' && ((c >= '!' && c <= '/') || (c >= ':' && c <= '@') || (c >= '[' && c <= '`') || (c >= '{' && c <= '~') || c == '\t')) {
-					//make symbol a little visible
-					color = symbol_color;
-					in_keyword = false;
-				} else if (!prev_is_text && _is_text_char(c)) {
-					int pos = i;
+				if (c == '#') {
+					in_comment = true;
+				}
 
-					while (_is_text_char(code[pos])) {
-						pos++;
-					}
-					String word = code.substr(i, pos - i);
-					if (keywords.has(word)) {
-						in_keyword = true;
+				if (in_comment) {
+					color = comment_color;
+				} else {
+					if (c != '_' && ((c >= '!' && c <= '/') || (c >= ':' && c <= '@') || (c >= '[' && c <= '`') || (c >= '{' && c <= '~') || c == '\t')) {
+						//make symbol a little visible
+						color = symbol_color;
+						in_keyword = false;
+					} else if (!prev_is_text && _is_text_char(c)) {
+						int pos = i;
+
+						while (_is_text_char(code[pos])) {
+							pos++;
+						}
+						String word = code.substr(i, pos - i);
+						if (keywords.has(word)) {
+							in_keyword = true;
+						}
+
+					} else if (!_is_text_char(c)) {
+						in_keyword = false;
 					}
 
-				} else if (!_is_text_char(c)) {
-					in_keyword = false;
-				}
-
-				if (in_keyword) {
-					color = keyword_color;
+					if (in_keyword) {
+						color = keyword_color;
+					}
 				}
 
 				Color ul = color;
@@ -562,11 +569,14 @@ Ref<Texture2D> EditorScriptPreviewPlugin::generate(const RES &p_from, const Size
 
 				prev_is_text = _is_text_char(c);
 			}
+			col++;
 		} else {
 			prev_is_text = false;
 			in_keyword = false;
 
 			if (c == '\n') {
+				in_comment = false;
+
 				col = x0;
 				line++;
 				if (line >= available_height / 2) {
@@ -574,9 +584,10 @@ Ref<Texture2D> EditorScriptPreviewPlugin::generate(const RES &p_from, const Size
 				}
 			} else if (c == '\t') {
 				col += 3;
+			} else {
+				col++;
 			}
 		}
-		col++;
 	}
 
 	post_process_preview(img);
diff --git a/editor/plugins/gpu_particles_3d_editor_plugin.cpp b/editor/plugins/gpu_particles_3d_editor_plugin.cpp
index 433a5ae51c..89d6aaa5f9 100644
--- a/editor/plugins/gpu_particles_3d_editor_plugin.cpp
+++ b/editor/plugins/gpu_particles_3d_editor_plugin.cpp
@@ -346,7 +346,7 @@ void GPUParticles3DEditor::_generate_emission_points() {
 
 	{
 		uint8_t *iw = point_img.ptrw();
-		zeromem(iw, w * h * 3 * sizeof(float));
+		memset(iw, 0, w * h * 3 * sizeof(float));
 		const Vector3 *r = points.ptr();
 		float *wf = (float *)iw;
 		for (int i = 0; i < point_count; i++) {
@@ -374,7 +374,7 @@ void GPUParticles3DEditor::_generate_emission_points() {
 
 		{
 			uint8_t *iw = point_img2.ptrw();
-			zeromem(iw, w * h * 3 * sizeof(float));
+			memset(iw, 0, w * h * 3 * sizeof(float));
 			const Vector3 *r = normals.ptr();
 			float *wf = (float *)iw;
 			for (int i = 0; i < point_count; i++) {
diff --git a/editor/plugins/mesh_library_editor_plugin.cpp b/editor/plugins/mesh_library_editor_plugin.cpp
index f8932cd534..6f1f243444 100644
--- a/editor/plugins/mesh_library_editor_plugin.cpp
+++ b/editor/plugins/mesh_library_editor_plugin.cpp
@@ -93,7 +93,7 @@ void MeshLibraryEditor::_import_scene(Node *p_scene, Ref<MeshLibrary> p_library,
 
 		mesh = mesh->duplicate();
 		for (int j = 0; j < mesh->get_surface_count(); ++j) {
-			Ref<Material> mat = mi->get_surface_material(j);
+			Ref<Material> mat = mi->get_surface_override_material(j);
 
 			if (mat.is_valid()) {
 				mesh->surface_set_material(j, mat);
diff --git a/editor/plugins/node_3d_editor_plugin.cpp b/editor/plugins/node_3d_editor_plugin.cpp
index cbe0133034..023d91be30 100644
--- a/editor/plugins/node_3d_editor_plugin.cpp
+++ b/editor/plugins/node_3d_editor_plugin.cpp
@@ -1279,7 +1279,7 @@ void Node3DEditorViewport::_sinput(const Ref<InputEvent> &p_event) {
 					clicked = ObjectID();
 					clicked_includes_current = false;
 
-					if ((spatial_editor->get_tool_mode() == Node3DEditor::TOOL_MODE_SELECT && b->get_control()) || spatial_editor->get_tool_mode() == Node3DEditor::TOOL_MODE_ROTATE) {
+					if ((spatial_editor->get_tool_mode() == Node3DEditor::TOOL_MODE_SELECT && b->get_command()) || spatial_editor->get_tool_mode() == Node3DEditor::TOOL_MODE_ROTATE) {
 						/* HANDLE ROTATION */
 						if (get_selected_count() == 0) {
 							break; //bye
@@ -2215,6 +2215,12 @@ void Node3DEditorViewport::scale_cursor_distance(real_t scale) {
 		cursor.distance = CLAMP(cursor.distance * scale, min_distance, max_distance);
 	}
 
+	if (cursor.distance == max_distance || cursor.distance == min_distance) {
+		zoom_failed_attempts_count++;
+	} else {
+		zoom_failed_attempts_count = 0;
+	}
+
 	zoom_indicator_delay = ZOOM_FREELOOK_INDICATOR_DELAY_S;
 	surface->update();
 }
@@ -2362,6 +2368,9 @@ void Node3DEditorViewport::_project_settings_changed() {
 	viewport->set_screen_space_aa(Viewport::ScreenSpaceAA(ssaa_mode));
 	const bool use_debanding = GLOBAL_GET("rendering/anti_aliasing/quality/use_debanding");
 	viewport->set_use_debanding(use_debanding);
+
+	const bool use_occlusion_culling = GLOBAL_GET("rendering/occlusion_culling/use_occlusion_culling");
+	viewport->set_use_occlusion_culling(use_occlusion_culling);
 }
 
 void Node3DEditorViewport::_notification(int p_what) {
@@ -2396,6 +2405,7 @@ void Node3DEditorViewport::_notification(int p_what) {
 			zoom_indicator_delay -= delta;
 			if (zoom_indicator_delay <= 0) {
 				surface->update();
+				zoom_limit_label->hide();
 			}
 		}
 
@@ -2535,6 +2545,8 @@ void Node3DEditorViewport::_notification(int p_what) {
 				cpu_time += cpu_time_history[i];
 			}
 			cpu_time /= FRAME_TIME_HISTORY;
+			// Prevent unrealistically low values.
+			cpu_time = MAX(0.01, cpu_time);
 
 			gpu_time_history[gpu_time_history_index] = RS::get_singleton()->viewport_get_measured_render_time_gpu(viewport->get_viewport_rid());
 			gpu_time_history_index = (gpu_time_history_index + 1) % FRAME_TIME_HISTORY;
@@ -2543,16 +2555,19 @@ void Node3DEditorViewport::_notification(int p_what) {
 				gpu_time += gpu_time_history[i];
 			}
 			gpu_time /= FRAME_TIME_HISTORY;
+			// Prevent division by zero for the FPS counter (and unrealistically low values).
+			// This limits the reported FPS to 100000.
+			gpu_time = MAX(0.01, gpu_time);
 
 			// Color labels depending on performance level ("good" = green, "OK" = yellow, "bad" = red).
 			// Middle point is at 15 ms.
-			cpu_time_label->set_text(vformat(TTR("CPU Time: %s ms"), String::num(cpu_time, 1)));
+			cpu_time_label->set_text(vformat(TTR("CPU Time: %s ms"), rtos(cpu_time).pad_decimals(1)));
 			cpu_time_label->add_theme_color_override(
 					"font_color",
 					frame_time_gradient->get_color_at_offset(
 							Math::range_lerp(cpu_time, 0, 30, 0, 1)));
 
-			gpu_time_label->set_text(vformat(TTR("GPU Time: %s ms"), String::num(gpu_time, 1)));
+			gpu_time_label->set_text(vformat(TTR("GPU Time: %s ms"), rtos(gpu_time).pad_decimals(1)));
 			// Middle point is at 15 ms.
 			gpu_time_label->add_theme_color_override(
 					"font_color",
@@ -2770,6 +2785,7 @@ void Node3DEditorViewport::_draw() {
 
 			} else {
 				// Show zoom
+				zoom_limit_label->set_visible(zoom_failed_attempts_count > 15);
 
 				real_t min_distance = MAX(camera->get_near() * 4, ZOOM_FREELOOK_MIN);
 				real_t max_distance = MIN(camera->get_far() / 4, ZOOM_FREELOOK_MAX);
@@ -3058,7 +3074,8 @@ void Node3DEditorViewport::_menu_option(int p_option) {
 		case VIEW_DISPLAY_DEBUG_CLUSTER_OMNI_LIGHTS:
 		case VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS:
 		case VIEW_DISPLAY_DEBUG_CLUSTER_DECALS:
-		case VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES: {
+		case VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES:
+		case VIEW_DISPLAY_DEBUG_OCCLUDERS: {
 			static const int display_options[] = {
 				VIEW_DISPLAY_NORMAL,
 				VIEW_DISPLAY_WIREFRAME,
@@ -3084,6 +3101,7 @@ void Node3DEditorViewport::_menu_option(int p_option) {
 				VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS,
 				VIEW_DISPLAY_DEBUG_CLUSTER_DECALS,
 				VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES,
+				VIEW_DISPLAY_DEBUG_OCCLUDERS,
 				VIEW_MAX
 			};
 			static const Viewport::DebugDraw debug_draw_modes[] = {
@@ -3111,6 +3129,7 @@ void Node3DEditorViewport::_menu_option(int p_option) {
 				Viewport::DEBUG_DRAW_CLUSTER_SPOT_LIGHTS,
 				Viewport::DEBUG_DRAW_CLUSTER_DECALS,
 				Viewport::DEBUG_DRAW_CLUSTER_REFLECTION_PROBES,
+				Viewport::DEBUG_DRAW_OCCLUDERS,
 			};
 
 			int idx = 0;
@@ -3160,6 +3179,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
 		RS::get_singleton()->instance_set_visible(move_gizmo_instance[i], false);
 		RS::get_singleton()->instance_geometry_set_cast_shadows_setting(move_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
 		RS::get_singleton()->instance_set_layer_mask(move_gizmo_instance[i], layer);
+		RS::get_singleton()->instance_geometry_set_flag(move_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 
 		move_plane_gizmo_instance[i] = RS::get_singleton()->instance_create();
 		RS::get_singleton()->instance_set_base(move_plane_gizmo_instance[i], spatial_editor->get_move_plane_gizmo(i)->get_rid());
@@ -3167,6 +3187,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
 		RS::get_singleton()->instance_set_visible(move_plane_gizmo_instance[i], false);
 		RS::get_singleton()->instance_geometry_set_cast_shadows_setting(move_plane_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
 		RS::get_singleton()->instance_set_layer_mask(move_plane_gizmo_instance[i], layer);
+		RS::get_singleton()->instance_geometry_set_flag(move_plane_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 
 		rotate_gizmo_instance[i] = RS::get_singleton()->instance_create();
 		RS::get_singleton()->instance_set_base(rotate_gizmo_instance[i], spatial_editor->get_rotate_gizmo(i)->get_rid());
@@ -3174,6 +3195,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
 		RS::get_singleton()->instance_set_visible(rotate_gizmo_instance[i], false);
 		RS::get_singleton()->instance_geometry_set_cast_shadows_setting(rotate_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
 		RS::get_singleton()->instance_set_layer_mask(rotate_gizmo_instance[i], layer);
+		RS::get_singleton()->instance_geometry_set_flag(rotate_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 
 		scale_gizmo_instance[i] = RS::get_singleton()->instance_create();
 		RS::get_singleton()->instance_set_base(scale_gizmo_instance[i], spatial_editor->get_scale_gizmo(i)->get_rid());
@@ -3181,6 +3203,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
 		RS::get_singleton()->instance_set_visible(scale_gizmo_instance[i], false);
 		RS::get_singleton()->instance_geometry_set_cast_shadows_setting(scale_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
 		RS::get_singleton()->instance_set_layer_mask(scale_gizmo_instance[i], layer);
+		RS::get_singleton()->instance_geometry_set_flag(scale_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 
 		scale_plane_gizmo_instance[i] = RS::get_singleton()->instance_create();
 		RS::get_singleton()->instance_set_base(scale_plane_gizmo_instance[i], spatial_editor->get_scale_plane_gizmo(i)->get_rid());
@@ -3188,6 +3211,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
 		RS::get_singleton()->instance_set_visible(scale_plane_gizmo_instance[i], false);
 		RS::get_singleton()->instance_geometry_set_cast_shadows_setting(scale_plane_gizmo_instance[i], RS::SHADOW_CASTING_SETTING_OFF);
 		RS::get_singleton()->instance_set_layer_mask(scale_plane_gizmo_instance[i], layer);
+		RS::get_singleton()->instance_geometry_set_flag(scale_plane_gizmo_instance[i], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 	}
 
 	// Rotation white outline
@@ -3197,6 +3221,7 @@ void Node3DEditorViewport::_init_gizmo_instance(int p_idx) {
 	RS::get_singleton()->instance_set_visible(rotate_gizmo_instance[3], false);
 	RS::get_singleton()->instance_geometry_set_cast_shadows_setting(rotate_gizmo_instance[3], RS::SHADOW_CASTING_SETTING_OFF);
 	RS::get_singleton()->instance_set_layer_mask(rotate_gizmo_instance[3], layer);
+	RS::get_singleton()->instance_geometry_set_flag(rotate_gizmo_instance[3], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 }
 
 void Node3DEditorViewport::_finish_gizmo_instances() {
@@ -3547,10 +3572,6 @@ void Node3DEditorViewport::reset() {
 }
 
 void Node3DEditorViewport::focus_selection() {
-	if (!get_selected_count()) {
-		return;
-	}
-
 	Vector3 center;
 	int count = 0;
 
@@ -3647,9 +3668,9 @@ Vector3 Node3DEditorViewport::_get_instance_position(const Point2 &p_pos) const
 AABB Node3DEditorViewport::_calculate_spatial_bounds(const Node3D *p_parent, bool p_exclude_top_level_transform) {
 	AABB bounds;
 
-	const MeshInstance3D *mesh_instance = Object::cast_to<MeshInstance3D>(p_parent);
-	if (mesh_instance) {
-		bounds = mesh_instance->get_aabb();
+	const VisualInstance3D *visual_instance = Object::cast_to<VisualInstance3D>(p_parent);
+	if (visual_instance) {
+		bounds = visual_instance->get_aabb();
 	}
 
 	for (int i = 0; i < p_parent->get_child_count(); i++) {
@@ -4034,6 +4055,7 @@ Node3DEditorViewport::Node3DEditorViewport(Node3DEditor *p_spatial_editor, Edito
 	display_submenu->add_radio_check_item(TTR("Spot Light Cluster"), VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS);
 	display_submenu->add_radio_check_item(TTR("Decal Cluster"), VIEW_DISPLAY_DEBUG_CLUSTER_DECALS);
 	display_submenu->add_radio_check_item(TTR("Reflection Probe Cluster"), VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES);
+	display_submenu->add_radio_check_item(TTR("Occlusion Culling Buffer"), VIEW_DISPLAY_DEBUG_OCCLUDERS);
 
 	display_submenu->set_name("display_advanced");
 	view_menu->get_popup()->add_submenu_item(TTR("Display Advanced..."), "display_advanced", VIEW_DISPLAY_ADVANCED);
@@ -4132,6 +4154,15 @@ Node3DEditorViewport::Node3DEditorViewport(Node3DEditor *p_spatial_editor, Edito
 	locked_label->set_text(TTR("View Rotation Locked"));
 	locked_label->hide();
 
+	zoom_limit_label = memnew(Label);
+	zoom_limit_label->set_anchors_and_offsets_preset(LayoutPreset::PRESET_BOTTOM_LEFT);
+	zoom_limit_label->set_offset(Side::SIDE_TOP, -28 * EDSCALE);
+	zoom_limit_label->set_text(TTR("To zoom further, change the camera's clipping planes (View -> Settings...)"));
+	zoom_limit_label->set_name("ZoomLimitMessageLabel");
+	zoom_limit_label->add_theme_color_override("font_color", Color(1, 1, 1, 1));
+	zoom_limit_label->hide();
+	surface->add_child(zoom_limit_label);
+
 	frame_time_gradient = memnew(Gradient);
 	// The color is set when the theme changes.
 	frame_time_gradient->add_point(0.5, Color());
@@ -4607,6 +4638,7 @@ Object *Node3DEditor::_get_editor_data(Object *p_what) {
 			si->sbox_instance,
 			RS::SHADOW_CASTING_SETTING_OFF);
 	RS::get_singleton()->instance_set_layer_mask(si->sbox_instance, 1 << Node3DEditorViewport::MISC_TOOL_LAYER);
+	RS::get_singleton()->instance_geometry_set_flag(si->sbox_instance, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 	si->sbox_instance_xray = RenderingServer::get_singleton()->instance_create2(
 			selection_box_xray->get_rid(),
 			sp->get_world_3d()->get_scenario());
@@ -4614,6 +4646,7 @@ Object *Node3DEditor::_get_editor_data(Object *p_what) {
 			si->sbox_instance_xray,
 			RS::SHADOW_CASTING_SETTING_OFF);
 	RS::get_singleton()->instance_set_layer_mask(si->sbox_instance_xray, 1 << Node3DEditorViewport::MISC_TOOL_LAYER);
+	RS::get_singleton()->instance_geometry_set_flag(si->sbox_instance_xray, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 
 	return si;
 }
@@ -5385,6 +5418,7 @@ void Node3DEditor::_init_indicators() {
 
 		origin_instance = RenderingServer::get_singleton()->instance_create2(origin, get_tree()->get_root()->get_world_3d()->get_scenario());
 		RS::get_singleton()->instance_set_layer_mask(origin_instance, 1 << Node3DEditorViewport::GIZMO_GRID_LAYER);
+		RS::get_singleton()->instance_geometry_set_flag(origin_instance, RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 
 		RenderingServer::get_singleton()->instance_geometry_set_cast_shadows_setting(origin_instance, RS::SHADOW_CASTING_SETTING_OFF);
 	}
@@ -5946,6 +5980,7 @@ void Node3DEditor::_init_grid() {
 		RenderingServer::get_singleton()->instance_set_visible(grid_instance[c], grid_visible[a]);
 		RenderingServer::get_singleton()->instance_geometry_set_cast_shadows_setting(grid_instance[c], RS::SHADOW_CASTING_SETTING_OFF);
 		RS::get_singleton()->instance_set_layer_mask(grid_instance[c], 1 << Node3DEditorViewport::GIZMO_GRID_LAYER);
+		RS::get_singleton()->instance_geometry_set_flag(grid_instance[c], RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, true);
 	}
 }
 
@@ -6447,6 +6482,7 @@ void Node3DEditor::_register_all_gizmos() {
 	add_gizmo_plugin(Ref<Light3DGizmoPlugin>(memnew(Light3DGizmoPlugin)));
 	add_gizmo_plugin(Ref<AudioStreamPlayer3DGizmoPlugin>(memnew(AudioStreamPlayer3DGizmoPlugin)));
 	add_gizmo_plugin(Ref<MeshInstance3DGizmoPlugin>(memnew(MeshInstance3DGizmoPlugin)));
+	add_gizmo_plugin(Ref<OccluderInstance3DGizmoPlugin>(memnew(OccluderInstance3DGizmoPlugin)));
 	add_gizmo_plugin(Ref<SoftBody3DGizmoPlugin>(memnew(SoftBody3DGizmoPlugin)));
 	add_gizmo_plugin(Ref<Sprite3DGizmoPlugin>(memnew(Sprite3DGizmoPlugin)));
 	add_gizmo_plugin(Ref<Skeleton3DGizmoPlugin>(memnew(Skeleton3DGizmoPlugin)));
@@ -7322,6 +7358,7 @@ void EditorNode3DGizmoPlugin::create_material(const String &p_name, const Color
 		material->set_shading_mode(StandardMaterial3D::SHADING_MODE_UNSHADED);
 		material->set_transparency(StandardMaterial3D::TRANSPARENCY_ALPHA);
 		material->set_render_priority(StandardMaterial3D::RENDER_PRIORITY_MIN + 1);
+		material->set_cull_mode(StandardMaterial3D::CULL_DISABLED);
 
 		if (p_use_vertex_color) {
 			material->set_flag(StandardMaterial3D::FLAG_ALBEDO_FROM_VERTEX_COLOR, true);
diff --git a/editor/plugins/node_3d_editor_plugin.h b/editor/plugins/node_3d_editor_plugin.h
index ff4a941b06..33f4c32471 100644
--- a/editor/plugins/node_3d_editor_plugin.h
+++ b/editor/plugins/node_3d_editor_plugin.h
@@ -221,6 +221,7 @@ class Node3DEditorViewport : public Control {
 		VIEW_DISPLAY_DEBUG_CLUSTER_SPOT_LIGHTS,
 		VIEW_DISPLAY_DEBUG_CLUSTER_DECALS,
 		VIEW_DISPLAY_DEBUG_CLUSTER_REFLECTION_PROBES,
+		VIEW_DISPLAY_DEBUG_OCCLUDERS,
 
 		VIEW_LOCK_ROTATION,
 		VIEW_CINEMATIC_PREVIEW,
@@ -295,6 +296,7 @@ private:
 	Label *info_label;
 	Label *cinema_label;
 	Label *locked_label;
+	Label *zoom_limit_label;
 
 	VBoxContainer *top_right_vbox;
 	ViewportRotationControl *rotation_control;
@@ -418,6 +420,7 @@ private:
 	void scale_freelook_speed(real_t scale);
 
 	real_t zoom_indicator_delay;
+	int zoom_failed_attempts_count = 0;
 
 	RID move_gizmo_instance[3], move_plane_gizmo_instance[3], rotate_gizmo_instance[4], scale_gizmo_instance[3], scale_plane_gizmo_instance[3];
 
diff --git a/editor/plugins/occluder_instance_3d_editor_plugin.cpp b/editor/plugins/occluder_instance_3d_editor_plugin.cpp
new file mode 100644
index 0000000000..0821f140b3
--- /dev/null
+++ b/editor/plugins/occluder_instance_3d_editor_plugin.cpp
@@ -0,0 +1,117 @@
+/*************************************************************************/
+/*  occluder_instance_3d_editor_plugin.cpp                               */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "occluder_instance_3d_editor_plugin.h"
+
+// Bakes occluders into `p_file`; if bake() reports that no save path is known, asks for one via the file dialog.
+void OccluderInstance3DEditorPlugin::_bake_select_file(const String &p_file) {
+	if (occluder_instance) {
+		OccluderInstance3D::BakeError err;
+		if (get_tree()->get_edited_scene_root() && get_tree()->get_edited_scene_root() == occluder_instance) {
+			err = occluder_instance->bake(occluder_instance, p_file);
+		} else {
+			err = occluder_instance->bake(occluder_instance->get_parent(), p_file);
+		}
+
+		switch (err) {
+			case OccluderInstance3D::BAKE_ERROR_NO_SAVE_PATH: {
+				String scene_path = occluder_instance->get_filename();
+				// get_owner() may return null (e.g. a node without an owner); never dereference it unchecked.
+				Node *owner = occluder_instance->get_owner();
+				if (scene_path == String() && owner) {
+					scene_path = owner->get_filename();
+				}
+				if (scene_path == String()) {
+					EditorNode::get_singleton()->show_warning(TTR("Can't determine a save path for the occluder.\nSave your scene and try again."));
+					break;
+				}
+				scene_path = scene_path.get_basename() + ".occ";
+
+				file_dialog->set_current_path(scene_path);
+				file_dialog->popup_file_dialog();
+			} break;
+			case OccluderInstance3D::BAKE_ERROR_NO_MESHES: {
+				EditorNode::get_singleton()->show_warning(TTR("No meshes to bake."));
+			} break;
+			default: {
+			}
+		}
+	}
+}
+
+void OccluderInstance3DEditorPlugin::_bake() {
+	_bake_select_file(String()); // Empty path: no bake file has been chosen yet.
+}
+
+// Stores the object to operate on when it is an OccluderInstance3D. Any other
+// object (i.e. a failed cast) leaves the current target unchanged.
+void OccluderInstance3DEditorPlugin::edit(Object *p_object) {
+	OccluderInstance3D *target = Object::cast_to<OccluderInstance3D>(p_object);
+	if (target) {
+		occluder_instance = target;
+	}
+}
+
+bool OccluderInstance3DEditorPlugin::handles(Object *p_object) const {
+	return p_object->is_class("OccluderInstance3D"); // Claim only objects that is_class() reports as OccluderInstance3D.
+}
+
+// Toggles the bake button together with the plugin: the button is shown when
+// `p_visible` is true and hidden when it is false.
+void OccluderInstance3DEditorPlugin::make_visible(bool p_visible) {
+	// set_visible(true/false) is equivalent to calling show()/hide(), which
+	// is the same choice an explicit branch on `p_visible` would make.
+	bake->set_visible(p_visible);
+}
+
+void OccluderInstance3DEditorPlugin::_bind_methods() {
+	ClassDB::bind_method("_bake", &OccluderInstance3DEditorPlugin::_bake); // Registered by name for the string-based Callable(this, "_bake") in the constructor.
+}
+
+OccluderInstance3DEditorPlugin::OccluderInstance3DEditorPlugin(EditorNode *p_node) { // Builds the hidden bake button and its save dialog.
+	editor = p_node;
+	bake = memnew(Button);
+	bake->set_flat(true);
+	bake->set_icon(editor->get_gui_base()->get_theme_icon("Bake", "EditorIcons"));
+	bake->set_text(TTR("Bake Occluders"));
+	bake->hide(); // Stays hidden until make_visible(true) is called.
+	bake->connect("pressed", Callable(this, "_bake")); // Looked up by name; _bake is registered in _bind_methods().
+	add_control_to_container(CONTAINER_SPATIAL_EDITOR_MENU, bake);
+	occluder_instance = nullptr; // Nothing is being edited yet.
+	// Save dialog for picking the bake output file; choosing a file calls _bake_select_file().
+	file_dialog = memnew(EditorFileDialog);
+	file_dialog->set_file_mode(EditorFileDialog::FILE_MODE_SAVE_FILE);
+	file_dialog->add_filter("*.occ ; Occluder3D");
+	file_dialog->set_title(TTR("Select occluder bake file:")); // NOTE(review): keep after set_file_mode(); it may reset the title - confirm.
+	file_dialog->connect("file_selected", callable_mp(this, &OccluderInstance3DEditorPlugin::_bake_select_file));
+	bake->add_child(file_dialog);
+}
+
+OccluderInstance3DEditorPlugin::~OccluderInstance3DEditorPlugin() { // Empty: no explicit cleanup is performed here.
+}
diff --git a/editor/plugins/occluder_instance_3d_editor_plugin.h b/editor/plugins/occluder_instance_3d_editor_plugin.h
new file mode 100644
index 0000000000..161b17811c
--- /dev/null
+++ b/editor/plugins/occluder_instance_3d_editor_plugin.h
@@ -0,0 +1,66 @@
+/*************************************************************************/
+/*  occluder_instance_3d_editor_plugin.h                                 */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef OCCLUDER_INSTANCE_3D_EDITOR_PLUGIN_H
+#define OCCLUDER_INSTANCE_3D_EDITOR_PLUGIN_H
+
+#include "editor/editor_node.h"
+#include "editor/editor_plugin.h"
+#include "scene/3d/occluder_instance_3d.h"
+#include "scene/resources/material.h"
+
+class OccluderInstance3DEditorPlugin : public EditorPlugin { // Editor plugin that adds a "Bake Occluders" button for OccluderInstance3D nodes.
+	GDCLASS(OccluderInstance3DEditorPlugin, EditorPlugin);
+
+	OccluderInstance3D *occluder_instance; // Node being edited; nullptr after construction, assigned by edit().
+
+	Button *bake; // "Bake Occluders" button added to the spatial editor menu.
+	EditorNode *editor; // Editor passed to the constructor.
+
+	EditorFileDialog *file_dialog; // Save dialog used to choose the bake output file.
+
+	void _bake_select_file(const String &p_file); // Runs the bake with `p_file` and reports bake errors.
+	void _bake(); // Button handler: starts a bake with an empty path.
+
+protected:
+	static void _bind_methods();
+
+public:
+	virtual String get_name() const override { return "OccluderInstance3D"; }
+	bool has_main_screen() const override { return false; }
+	virtual void edit(Object *p_object) override; // Stores `p_object` when it is an OccluderInstance3D.
+	virtual bool handles(Object *p_object) const override; // True for objects whose class is OccluderInstance3D.
+	virtual void make_visible(bool p_visible) override; // Shows or hides the bake button.
+
+	OccluderInstance3DEditorPlugin(EditorNode *p_node);
+	~OccluderInstance3DEditorPlugin();
+};
+
+#endif
diff --git a/editor/plugins/script_text_editor.cpp b/editor/plugins/script_text_editor.cpp
index 3534809891..c982207224 100644
--- a/editor/plugins/script_text_editor.cpp
+++ b/editor/plugins/script_text_editor.cpp
@@ -1721,6 +1721,9 @@ void ScriptTextEditor::_enable_code_editor() {
 		color_picker->set_raw_mode(true);
 	}
 
+	int picker_shape = EDITOR_GET("interface/inspector/default_color_picker_shape");
+	color_picker->set_picker_shape((ColorPicker::PickerShapeType)picker_shape);
+
 	quick_open = memnew(ScriptEditorQuickOpen);
 	quick_open->connect("goto_line", callable_mp(this, &ScriptTextEditor::_goto_line));
 	add_child(quick_open);
diff --git a/editor/plugins/shader_editor_plugin.cpp b/editor/plugins/shader_editor_plugin.cpp
index 8f8a4b3054..ed3b746678 100644
--- a/editor/plugins/shader_editor_plugin.cpp
+++ b/editor/plugins/shader_editor_plugin.cpp
@@ -205,7 +205,7 @@ void ShaderTextEditor::_code_complete_script(const String &p_code, List<ScriptCo
 	ShaderLanguage sl;
 	String calltip;
 
-	sl.complete(p_code, ShaderTypes::get_singleton()->get_functions(RenderingServer::ShaderMode(shader->get_mode())), ShaderTypes::get_singleton()->get_modes(RenderingServer::ShaderMode(shader->get_mode())), ShaderTypes::get_singleton()->get_types(), _get_global_variable_type, r_options, calltip);
+	sl.complete(p_code, ShaderTypes::get_singleton()->get_functions(RenderingServer::ShaderMode(shader->get_mode())), ShaderTypes::get_singleton()->get_modes(RenderingServer::ShaderMode(shader->get_mode())), ShaderLanguage::VaryingFunctionNames(), ShaderTypes::get_singleton()->get_types(), _get_global_variable_type, r_options, calltip);
 
 	get_text_editor()->set_code_hint(calltip);
 }
@@ -219,7 +219,7 @@ void ShaderTextEditor::_validate_script() {
 
 	ShaderLanguage sl;
 
-	Error err = sl.compile(code, ShaderTypes::get_singleton()->get_functions(RenderingServer::ShaderMode(shader->get_mode())), ShaderTypes::get_singleton()->get_modes(RenderingServer::ShaderMode(shader->get_mode())), ShaderTypes::get_singleton()->get_types(), _get_global_variable_type);
+	Error err = sl.compile(code, ShaderTypes::get_singleton()->get_functions(RenderingServer::ShaderMode(shader->get_mode())), ShaderTypes::get_singleton()->get_modes(RenderingServer::ShaderMode(shader->get_mode())), ShaderLanguage::VaryingFunctionNames(), ShaderTypes::get_singleton()->get_types(), _get_global_variable_type);
 
 	if (err != OK) {
 		String error_text = "error(" + itos(sl.get_error_line()) + "): " + sl.get_error_text();
diff --git a/editor/plugins/skeleton_3d_editor_plugin.cpp b/editor/plugins/skeleton_3d_editor_plugin.cpp
index ad60984ad1..404ef62eca 100644
--- a/editor/plugins/skeleton_3d_editor_plugin.cpp
+++ b/editor/plugins/skeleton_3d_editor_plugin.cpp
@@ -515,6 +515,8 @@ void Skeleton3DEditor::_joint_tree_selection_changed() {
 		rest_editor->set_target(bone_path + "rest");
 		custom_pose_editor->set_target(bone_path + "custom_pose");
 
+		_update_properties();
+
 		pose_editor->set_visible(true);
 		rest_editor->set_visible(true);
 		custom_pose_editor->set_visible(true);
diff --git a/editor/plugins/texture_editor_plugin.cpp b/editor/plugins/texture_editor_plugin.cpp
index 253f8878d2..ecf7370834 100644
--- a/editor/plugins/texture_editor_plugin.cpp
+++ b/editor/plugins/texture_editor_plugin.cpp
@@ -143,7 +143,7 @@ TextureEditor::~TextureEditor() {
 
 //
 bool EditorInspectorPluginTexture::can_handle(Object *p_object) {
-	return Object::cast_to<ImageTexture>(p_object) != nullptr || Object::cast_to<AtlasTexture>(p_object) != nullptr || Object::cast_to<StreamTexture2D>(p_object) != nullptr || Object::cast_to<LargeTexture>(p_object) != nullptr || Object::cast_to<AnimatedTexture>(p_object) != nullptr;
+	return Object::cast_to<ImageTexture>(p_object) != nullptr || Object::cast_to<AtlasTexture>(p_object) != nullptr || Object::cast_to<StreamTexture2D>(p_object) != nullptr || Object::cast_to<AnimatedTexture>(p_object) != nullptr;
 }
 
 void EditorInspectorPluginTexture::parse_begin(Object *p_object) {
diff --git a/editor/plugins/theme_editor_plugin.cpp b/editor/plugins/theme_editor_plugin.cpp
index dfa8c04145..c765aa0319 100644
--- a/editor/plugins/theme_editor_plugin.cpp
+++ b/editor/plugins/theme_editor_plugin.cpp
@@ -31,12 +31,696 @@
 #include "theme_editor_plugin.h"
 
 #include "core/os/file_access.h"
+#include "core/os/keyboard.h"
 #include "core/version.h"
 #include "editor/editor_scale.h"
 #include "scene/gui/progress_bar.h"
 
+// Handler for "about_to_popup": rebuilds the type list for the edited theme (errors out if no theme is set).
+void ThemeItemEditorDialog::_dialog_about_to_show() {
+	ERR_FAIL_COND(edited_theme.is_null());
+	_update_edit_types();
+}
+
+// Rebuilds the list of types found in the edited theme and the "add from class" options, keeps
+// the previously edited type selected when it still exists, then syncs the toolbar and item tree.
+void ThemeItemEditorDialog::_update_edit_types() {
+	// Gather the edited theme's types, sorted with StringName::AlphCompare.
+	List<StringName> theme_types;
+	edited_theme->get_type_list(&theme_types);
+	theme_types.sort_custom<StringName::AlphCompare>();
+
+	edit_type_list->clear();
+	bool item_reselected = false;
+	int e_idx = 0;
+	for (List<StringName>::Element *E = theme_types.front(); E; E = E->next(), e_idx++) {
+		const StringName &type_name = E->get();
+
+		// The empty type name gets the "NodeDisabled" icon directly instead of a class icon lookup.
+		Ref<Texture2D> item_icon;
+		if (type_name == "") {
+			item_icon = get_theme_icon("NodeDisabled", "EditorIcons");
+		} else {
+			item_icon = EditorNode::get_singleton()->get_class_icon(type_name, "NodeDisabled");
+		}
+		edit_type_list->add_item(type_name, item_icon);
+
+		if (type_name == edited_item_type) {
+			edit_type_list->select(e_idx);
+			item_reselected = true;
+		}
+	}
+
+	// The previously edited type is gone: forget it and fall back to the first entry, if any.
+	if (!item_reselected) {
+		edited_item_type = "";
+		if (edit_type_list->get_item_count() > 0) {
+			edit_type_list->select(0);
+		}
+	}
+
+	// Offer every type of the default theme in the "add from class" dropdown.
+	Ref<Theme> base_theme = Theme::get_default();
+	List<StringName> default_types;
+	base_theme->get_type_list(&default_types);
+	default_types.sort_custom<StringName::AlphCompare>();
+
+	edit_add_class_options->clear();
+	for (List<StringName>::Element *E = default_types.front(); E; E = E->next()) {
+		edit_add_class_options->add_item(E->get());
+	}
+
+	// Work out which type, if any, ended up selected.
+	String selected_type = "";
+	Vector<int> selected_ids = edit_type_list->get_selected_items();
+	const bool nothing_selected = selected_ids.size() == 0;
+	if (!nothing_selected) {
+		selected_type = edit_type_list->get_item_text(selected_ids[0]);
+	}
+
+	// The add/remove toolbar is only usable while a type is selected.
+	edit_items_add_color->set_disabled(nothing_selected);
+	edit_items_add_constant->set_disabled(nothing_selected);
+	edit_items_add_font->set_disabled(nothing_selected);
+	edit_items_add_font_size->set_disabled(nothing_selected);
+	edit_items_add_icon->set_disabled(nothing_selected);
+	edit_items_add_stylebox->set_disabled(nothing_selected);
+
+	edit_items_remove_class->set_disabled(nothing_selected);
+	edit_items_remove_custom->set_disabled(nothing_selected);
+	edit_items_remove_all->set_disabled(nothing_selected);
+
+	// Show the items of the selected type (this also records it as the edited type).
+	_update_edit_item_tree(selected_type);
+}
+
+// Handler for the type list's "item_selected" signal: shows the items of the newly selected type.
+void ThemeItemEditorDialog::_edited_type_selected(int p_item_idx) {
+	_update_edit_item_tree(edit_type_list->get_item_text(p_item_idx));
+}
+
+// Records p_item_type as the edited type and rebuilds the item tree with the items the edited theme defines for it.
+void ThemeItemEditorDialog::_update_edit_item_tree(String p_item_type) {
+	edited_item_type = p_item_type;
+	edit_items_tree->clear();
+	TreeItem *root = edit_items_tree->create_item();
+
+	List<StringName> names;
+	// Colors. Every section below adds one category row plus one row per item; a category without items is left out.
+	{
+		names.clear();
+		edited_theme->get_color_list(p_item_type, &names);
+
+		if (names.size() > 0) {
+			TreeItem *color_root = edit_items_tree->create_item(root);
+			color_root->set_metadata(0, Theme::DATA_TYPE_COLOR); // Read back in _item_tree_button_pressed() to identify the data type.
+			color_root->set_icon(0, get_theme_icon("Color", "EditorIcons"));
+			color_root->set_text(0, TTR("Colors"));
+			color_root->add_button(0, get_theme_icon("Clear", "EditorIcons"), ITEMS_TREE_REMOVE_DATA_TYPE, false, TTR("Remove All Color Items"));
+
+			names.sort_custom<StringName::AlphCompare>();
+			for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
+				TreeItem *item = edit_items_tree->create_item(color_root);
+				item->set_text(0, E->get());
+				item->add_button(0, get_theme_icon("Edit", "EditorIcons"), ITEMS_TREE_RENAME_ITEM, false, TTR("Rename Item"));
+				item->add_button(0, get_theme_icon("Remove", "EditorIcons"), ITEMS_TREE_REMOVE_ITEM, false, TTR("Remove Item"));
+			}
+		}
+	}
+	// Constants: same layout as the color category.
+	{
+		names.clear();
+		edited_theme->get_constant_list(p_item_type, &names);
+
+		if (names.size() > 0) {
+			TreeItem *constant_root = edit_items_tree->create_item(root);
+			constant_root->set_metadata(0, Theme::DATA_TYPE_CONSTANT);
+			constant_root->set_icon(0, get_theme_icon("MemberConstant", "EditorIcons"));
+			constant_root->set_text(0, TTR("Constants"));
+			constant_root->add_button(0, get_theme_icon("Clear", "EditorIcons"), ITEMS_TREE_REMOVE_DATA_TYPE, false, TTR("Remove All Constant Items"));
+
+			names.sort_custom<StringName::AlphCompare>();
+			for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
+				TreeItem *item = edit_items_tree->create_item(constant_root);
+				item->set_text(0, E->get());
+				item->add_button(0, get_theme_icon("Edit", "EditorIcons"), ITEMS_TREE_RENAME_ITEM, false, TTR("Rename Item"));
+				item->add_button(0, get_theme_icon("Remove", "EditorIcons"), ITEMS_TREE_REMOVE_ITEM, false, TTR("Remove Item"));
+			}
+		}
+	}
+	// Fonts.
+	{
+		names.clear();
+		edited_theme->get_font_list(p_item_type, &names);
+
+		if (names.size() > 0) {
+			TreeItem *font_root = edit_items_tree->create_item(root);
+			font_root->set_metadata(0, Theme::DATA_TYPE_FONT);
+			font_root->set_icon(0, get_theme_icon("Font", "EditorIcons"));
+			font_root->set_text(0, TTR("Fonts"));
+			font_root->add_button(0, get_theme_icon("Clear", "EditorIcons"), ITEMS_TREE_REMOVE_DATA_TYPE, false, TTR("Remove All Font Items"));
+
+			names.sort_custom<StringName::AlphCompare>();
+			for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
+				TreeItem *item = edit_items_tree->create_item(font_root);
+				item->set_text(0, E->get());
+				item->add_button(0, get_theme_icon("Edit", "EditorIcons"), ITEMS_TREE_RENAME_ITEM, false, TTR("Rename Item"));
+				item->add_button(0, get_theme_icon("Remove", "EditorIcons"), ITEMS_TREE_REMOVE_ITEM, false, TTR("Remove Item"));
+			}
+		}
+	}
+	// Font sizes.
+	{
+		names.clear();
+		edited_theme->get_font_size_list(p_item_type, &names);
+
+		if (names.size() > 0) {
+			TreeItem *font_size_root = edit_items_tree->create_item(root);
+			font_size_root->set_metadata(0, Theme::DATA_TYPE_FONT_SIZE);
+			font_size_root->set_icon(0, get_theme_icon("FontSize", "EditorIcons"));
+			font_size_root->set_text(0, TTR("Font Sizes"));
+			font_size_root->add_button(0, get_theme_icon("Clear", "EditorIcons"), ITEMS_TREE_REMOVE_DATA_TYPE, false, TTR("Remove All Font Size Items"));
+
+			names.sort_custom<StringName::AlphCompare>();
+			for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
+				TreeItem *item = edit_items_tree->create_item(font_size_root);
+				item->set_text(0, E->get());
+				item->add_button(0, get_theme_icon("Edit", "EditorIcons"), ITEMS_TREE_RENAME_ITEM, false, TTR("Rename Item"));
+				item->add_button(0, get_theme_icon("Remove", "EditorIcons"), ITEMS_TREE_REMOVE_ITEM, false, TTR("Remove Item"));
+			}
+		}
+	}
+	// Icons.
+	{
+		names.clear();
+		edited_theme->get_icon_list(p_item_type, &names);
+
+		if (names.size() > 0) {
+			TreeItem *icon_root = edit_items_tree->create_item(root);
+			icon_root->set_metadata(0, Theme::DATA_TYPE_ICON);
+			icon_root->set_icon(0, get_theme_icon("ImageTexture", "EditorIcons"));
+			icon_root->set_text(0, TTR("Icons"));
+			icon_root->add_button(0, get_theme_icon("Clear", "EditorIcons"), ITEMS_TREE_REMOVE_DATA_TYPE, false, TTR("Remove All Icon Items"));
+
+			names.sort_custom<StringName::AlphCompare>();
+			for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
+				TreeItem *item = edit_items_tree->create_item(icon_root);
+				item->set_text(0, E->get());
+				item->add_button(0, get_theme_icon("Edit", "EditorIcons"), ITEMS_TREE_RENAME_ITEM, false, TTR("Rename Item"));
+				item->add_button(0, get_theme_icon("Remove", "EditorIcons"), ITEMS_TREE_REMOVE_ITEM, false, TTR("Remove Item"));
+			}
+		}
+	}
+	// Styleboxes.
+	{
+		names.clear();
+		edited_theme->get_stylebox_list(p_item_type, &names);
+
+		if (names.size() > 0) {
+			TreeItem *stylebox_root = edit_items_tree->create_item(root);
+			stylebox_root->set_metadata(0, Theme::DATA_TYPE_STYLEBOX);
+			stylebox_root->set_icon(0, get_theme_icon("StyleBoxFlat", "EditorIcons"));
+			stylebox_root->set_text(0, TTR("Styleboxes"));
+			stylebox_root->add_button(0, get_theme_icon("Clear", "EditorIcons"), ITEMS_TREE_REMOVE_DATA_TYPE, false, TTR("Remove All StyleBox Items"));
+
+			names.sort_custom<StringName::AlphCompare>();
+			for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
+				TreeItem *item = edit_items_tree->create_item(stylebox_root);
+				item->set_text(0, E->get());
+				item->add_button(0, get_theme_icon("Edit", "EditorIcons"), ITEMS_TREE_RENAME_ITEM, false, TTR("Rename Item"));
+				item->add_button(0, get_theme_icon("Remove", "EditorIcons"), ITEMS_TREE_REMOVE_ITEM, false, TTR("Remove Item"));
+			}
+		}
+	}
+}
+
+// Handler for the item tree's "button_pressed" signal. Dispatches on the button id (rename an
+// item, remove an item, remove a whole category) and rebuilds the tree afterwards.
+// p_column is not used.
+void ThemeItemEditorDialog::_item_tree_button_pressed(Object *p_item, int p_column, int p_id) {
+	TreeItem *item = Object::cast_to<TreeItem>(p_item);
+	if (!item) {
+		return; // Not a tree row; nothing to act on.
+	}
+
+	if (p_id == ITEMS_TREE_RENAME_ITEM) {
+		// Item rows carry no data type of their own; it is stored on their category (parent) row.
+		int data_type = item->get_parent()->get_metadata(0);
+		_open_rename_theme_item_dialog((Theme::DataType)data_type, item->get_text(0));
+	} else if (p_id == ITEMS_TREE_REMOVE_ITEM) {
+		int data_type = item->get_parent()->get_metadata(0);
+		edited_theme->clear_theme_item((Theme::DataType)data_type, item->get_text(0), edited_item_type);
+	} else if (p_id == ITEMS_TREE_REMOVE_DATA_TYPE) {
+		// This button sits on the category row, which holds the data type directly.
+		int data_type = item->get_metadata(0);
+		_remove_data_type_items((Theme::DataType)data_type, edited_item_type);
+	}
+
+	// Rebuild so that removals are reflected right away.
+	_update_edit_item_tree(edited_item_type);
+}
+
+// Adds to the edited theme every item name the default theme defines for the class chosen in the dropdown.
+// Icons, styleboxes and fonts are added as empty references; font sizes, colors and constants copy the default values.
+void ThemeItemEditorDialog::_add_class_type_items() {
+	const String type_name = edit_add_class_options->get_item_text(edit_add_class_options->get_selected());
+	Ref<Theme> default_theme = Theme::get_default();
+	List<StringName> names;
+
+	// Icons.
+	default_theme->get_icon_list(type_name, &names);
+	for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
+		edited_theme->set_icon(E->get(), type_name, Ref<Texture2D>());
+	}
+
+	// Styleboxes.
+	names.clear();
+	default_theme->get_stylebox_list(type_name, &names);
+	for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
+		edited_theme->set_stylebox(E->get(), type_name, Ref<StyleBox>());
+	}
+
+	// Fonts.
+	names.clear();
+	default_theme->get_font_list(type_name, &names);
+	for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
+		edited_theme->set_font(E->get(), type_name, Ref<Font>());
+	}
+
+	// Font sizes.
+	names.clear();
+	default_theme->get_font_size_list(type_name, &names);
+	for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
+		edited_theme->set_font_size(E->get(), type_name, default_theme->get_font_size(E->get(), type_name));
+	}
+
+	// Colors.
+	names.clear();
+	default_theme->get_color_list(type_name, &names);
+	for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
+		edited_theme->set_color(E->get(), type_name, default_theme->get_color(E->get(), type_name));
+	}
+
+	// Constants.
+	names.clear();
+	default_theme->get_constant_list(type_name, &names);
+	for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
+		edited_theme->set_constant(E->get(), type_name, default_theme->get_constant(E->get(), type_name));
+	}
+
+	_update_edit_types();
+}
+
+void ThemeItemEditorDialog::_add_custom_type() { // Handler for the "Add" button next to the custom type field.
+	edited_theme->add_icon_type(edit_add_custom_value->get_text()); // The typed-in name is registered once per data type.
+	edited_theme->add_stylebox_type(edit_add_custom_value->get_text());
+	edited_theme->add_font_type(edit_add_custom_value->get_text());
+	edited_theme->add_font_size_type(edit_add_custom_value->get_text());
+	edited_theme->add_color_type(edit_add_custom_value->get_text());
+	edited_theme->add_constant_type(edit_add_custom_value->get_text());
+	_update_edit_types(); // Rebuild the type list so the new type shows up.
+}
+
+void ThemeItemEditorDialog::_add_theme_item(Theme::DataType p_data_type, String p_item_name, String p_item_type) { // Adds a placeholder item of the given data type to the edited theme.
+	switch (p_data_type) {
+		case Theme::DATA_TYPE_COLOR:
+			edited_theme->set_color(p_item_name, p_item_type, Color());
+			break;
+		case Theme::DATA_TYPE_CONSTANT:
+			edited_theme->set_constant(p_item_name, p_item_type, 0);
+			break;
+		case Theme::DATA_TYPE_FONT:
+			edited_theme->set_font(p_item_name, p_item_type, Ref<Font>()); // Resource items start as empty references.
+			break;
+		case Theme::DATA_TYPE_FONT_SIZE:
+			edited_theme->set_font_size(p_item_name, p_item_type, -1);
+			break;
+		case Theme::DATA_TYPE_ICON:
+			edited_theme->set_icon(p_item_name, p_item_type, Ref<Texture2D>());
+			break;
+		case Theme::DATA_TYPE_STYLEBOX:
+			edited_theme->set_stylebox(p_item_name, p_item_type, Ref<StyleBox>());
+			break;
+		case Theme::DATA_TYPE_MAX:
+			break; // Can't happen, but silences warning.
+	}
+}
+
+// Clears every item of the given data type that the edited theme lists for p_item_type.
+void ThemeItemEditorDialog::_remove_data_type_items(Theme::DataType p_data_type, String p_item_type) {
+	List<StringName> item_names;
+	edited_theme->get_theme_item_list(p_data_type, p_item_type, &item_names);
+	for (List<StringName>::Element *I = item_names.front(); I; I = I->next()) {
+		edited_theme->clear_theme_item(p_data_type, I->get(), p_item_type);
+	}
+}
+
+// Removes from the currently edited type every item whose name the default theme also lists for
+// that type, then rebuilds the item tree.
+void ThemeItemEditorDialog::_remove_class_items() {
+	for (int dt = 0; dt < Theme::DATA_TYPE_MAX; dt++) {
+		const Theme::DataType data_type = (Theme::DataType)dt;
+		// The default theme's item names decide which entries are removed here.
+		List<StringName> default_names;
+		Theme::get_default()->get_theme_item_list(data_type, edited_item_type, &default_names);
+		for (List<StringName>::Element *E = default_names.front(); E; E = E->next()) {
+			if (edited_theme->has_theme_item_nocheck(data_type, E->get(), edited_item_type)) {
+				edited_theme->clear_theme_item(data_type, E->get(), edited_item_type);
+			}
+		}
+	}
+
+	_update_edit_item_tree(edited_item_type);
+}
+
+// Removes from the currently edited type every item the default theme does not list for it, then rebuilds the item tree.
+void ThemeItemEditorDialog::_remove_custom_items() {
+	for (int dt = 0; dt < Theme::DATA_TYPE_MAX; dt++) {
+		const Theme::DataType data_type = (Theme::DataType)dt;
+		List<StringName> own_names;
+		edited_theme->get_theme_item_list(data_type, edited_item_type, &own_names);
+		for (List<StringName>::Element *E = own_names.front(); E; E = E->next()) {
+			// Items that the default theme also has are kept.
+			if (Theme::get_default()->has_theme_item_nocheck(data_type, E->get(), edited_item_type)) {
+				continue;
+			}
+			edited_theme->clear_theme_item(data_type, E->get(), edited_item_type);
+		}
+	}
+
+	_update_edit_item_tree(edited_item_type);
+}
+
+// Removes every item of every data type from the currently edited type, then rebuilds the item tree.
+void ThemeItemEditorDialog::_remove_all_items() {
+	for (int dt = 0; dt < Theme::DATA_TYPE_MAX; dt++) {
+		const Theme::DataType data_type = (Theme::DataType)dt;
+
+		// Snapshot the names first, then clear them one by one.
+		List<StringName> item_names;
+		edited_theme->get_theme_item_list(data_type, edited_item_type, &item_names);
+		for (List<StringName>::Element *I = item_names.front(); I; I = I->next()) {
+			edited_theme->clear_theme_item(data_type, I->get(), edited_item_type);
+		}
+	}
+
+	_update_edit_item_tree(edited_item_type);
+}
+
+// Opens the item name dialog in "create" mode for the given data type; the item itself is added in _confirm_edit_theme_item().
+void ThemeItemEditorDialog::_open_add_theme_item_dialog(int p_data_type) {
+	ERR_FAIL_INDEX_MSG(p_data_type, Theme::DATA_TYPE_MAX, "Theme item data type is out of bounds.");
+
+	item_popup_mode = CREATE_THEME_ITEM;
+	edit_item_data_type = (Theme::DataType)p_data_type;
+	switch (edit_item_data_type) {
+		case Theme::DATA_TYPE_COLOR: {
+			edit_theme_item_dialog->set_title(TTR("Add Color Item"));
+		} break;
+		case Theme::DATA_TYPE_CONSTANT: {
+			edit_theme_item_dialog->set_title(TTR("Add Constant Item"));
+		} break;
+		case Theme::DATA_TYPE_FONT: {
+			edit_theme_item_dialog->set_title(TTR("Add Font Item"));
+		} break;
+		case Theme::DATA_TYPE_FONT_SIZE: {
+			edit_theme_item_dialog->set_title(TTR("Add Font Size Item"));
+		} break;
+		case Theme::DATA_TYPE_ICON: {
+			edit_theme_item_dialog->set_title(TTR("Add Icon Item"));
+		} break;
+		case Theme::DATA_TYPE_STYLEBOX: {
+			edit_theme_item_dialog->set_title(TTR("Add Stylebox Item"));
+		} break;
+		case Theme::DATA_TYPE_MAX:
+			break; // Can't happen, but silences warning.
+	}
+
+	edit_theme_item_old_vb->hide(); // The "Old Name" block is only shown when renaming.
+	theme_item_name->clear();
+	edit_theme_item_dialog->popup_centered(Size2(380, 110) * EDSCALE);
+	theme_item_name->grab_focus();
+}
+
+// Opens the item name dialog in "rename" mode, pre-filled with the current name; the rename happens in _confirm_edit_theme_item().
+void ThemeItemEditorDialog::_open_rename_theme_item_dialog(Theme::DataType p_data_type, String p_item_name) {
+	ERR_FAIL_INDEX_MSG(p_data_type, Theme::DATA_TYPE_MAX, "Theme item data type is out of bounds.");
+
+	item_popup_mode = RENAME_THEME_ITEM;
+	edit_item_data_type = p_data_type;
+	edit_item_old_name = p_item_name;
+	switch (edit_item_data_type) {
+		case Theme::DATA_TYPE_COLOR: {
+			edit_theme_item_dialog->set_title(TTR("Rename Color Item"));
+		} break;
+		case Theme::DATA_TYPE_CONSTANT: {
+			edit_theme_item_dialog->set_title(TTR("Rename Constant Item"));
+		} break;
+		case Theme::DATA_TYPE_FONT: {
+			edit_theme_item_dialog->set_title(TTR("Rename Font Item"));
+		} break;
+		case Theme::DATA_TYPE_FONT_SIZE: {
+			edit_theme_item_dialog->set_title(TTR("Rename Font Size Item"));
+		} break;
+		case Theme::DATA_TYPE_ICON: {
+			edit_theme_item_dialog->set_title(TTR("Rename Icon Item"));
+		} break;
+		case Theme::DATA_TYPE_STYLEBOX: {
+			edit_theme_item_dialog->set_title(TTR("Rename Stylebox Item"));
+		} break;
+		case Theme::DATA_TYPE_MAX:
+			break; // Can't happen, but silences warning.
+	}
+
+	edit_theme_item_old_vb->show(); // Display the current name above the input field.
+	theme_item_old_name->set_text(p_item_name);
+	theme_item_name->set_text(p_item_name);
+	edit_theme_item_dialog->popup_centered(Size2(380, 140) * EDSCALE);
+	theme_item_name->grab_focus();
+}
+
+// Applies the pending create/rename request using the text of the name field, then rebuilds the item tree.
+void ThemeItemEditorDialog::_confirm_edit_theme_item() {
+	if (item_popup_mode == CREATE_THEME_ITEM) {
+		_add_theme_item(edit_item_data_type, theme_item_name->get_text(), edited_item_type);
+	} else if (item_popup_mode == RENAME_THEME_ITEM) {
+		edited_theme->rename_theme_item(edit_item_data_type, edit_item_old_name, theme_item_name->get_text(), edited_item_type);
+	}
+	// Reset the request state set by the _open_*_theme_item_dialog() functions.
+	item_popup_mode = ITEM_POPUP_MODE_MAX;
+	edit_item_data_type = Theme::DATA_TYPE_MAX;
+	edit_item_old_name = "";
+	_update_edit_item_tree(edited_item_type);
+}
+
+// Keyboard handling for the item name field: Enter (main or keypad) confirms and closes the
+// name dialog, Escape closes it without applying anything.
+void ThemeItemEditorDialog::_edit_theme_item_gui_input(const Ref<InputEvent> &p_event) {
+	Ref<InputEventKey> k = p_event;
+	// Only key presses are of interest; everything else is left alone.
+	if (k.is_null() || !k->is_pressed()) {
+		return;
+	}
+
+	switch (k->get_keycode()) {
+		case KEY_KP_ENTER:
+		case KEY_ENTER: {
+			_confirm_edit_theme_item();
+			edit_theme_item_dialog->hide();
+			edit_theme_item_dialog->set_input_as_handled();
+		} break;
+		case KEY_ESCAPE: {
+			edit_theme_item_dialog->hide();
+			edit_theme_item_dialog->set_input_as_handled();
+		} break;
+	}
+}
+
+// On entering the tree, hooks the dialog's own "about_to_popup" signal. On entering the tree and
+// on every theme change, (re)assigns the toolbar button icons from the "EditorIcons" theme type.
+void ThemeItemEditorDialog::_notification(int p_what) {
+	if (p_what == NOTIFICATION_ENTER_TREE) {
+		connect("about_to_popup", callable_mp(this, &ThemeItemEditorDialog::_dialog_about_to_show));
+	}
+	if (p_what != NOTIFICATION_ENTER_TREE && p_what != NOTIFICATION_THEME_CHANGED) {
+		return;
+	}
+	edit_items_add_color->set_icon(get_theme_icon("Color", "EditorIcons"));
+	edit_items_add_constant->set_icon(get_theme_icon("MemberConstant", "EditorIcons"));
+	edit_items_add_font->set_icon(get_theme_icon("Font", "EditorIcons"));
+	edit_items_add_font_size->set_icon(get_theme_icon("FontSize", "EditorIcons"));
+	edit_items_add_icon->set_icon(get_theme_icon("ImageTexture", "EditorIcons"));
+	edit_items_add_stylebox->set_icon(get_theme_icon("StyleBoxFlat", "EditorIcons"));
+
+	edit_items_remove_class->set_icon(get_theme_icon("Control", "EditorIcons"));
+	edit_items_remove_custom->set_icon(get_theme_icon("ThemeRemoveCustomItems", "EditorIcons"));
+	edit_items_remove_all->set_icon(get_theme_icon("ThemeRemoveAllItems", "EditorIcons"));
+}
+
+void ThemeItemEditorDialog::set_edited_theme(const Ref<Theme> &p_theme) {
+	edited_theme = p_theme; // Only stored here; the lists are rebuilt from it in _dialog_about_to_show().
+}
+
+// Builds the dialog UI: a split view with the type list on one side and the item toolbar and tree on the other, plus a nested name dialog.
+ThemeItemEditorDialog::ThemeItemEditorDialog() {
+	set_title(TTR("Edit Theme Items"));
+	HSplitContainer *edit_dialog_hs = memnew(HSplitContainer);
+	add_child(edit_dialog_hs);
+	// First split child: the list of types plus the controls for adding new ones.
+	VBoxContainer *edit_dialog_side_vb = memnew(VBoxContainer);
+	edit_dialog_side_vb->set_custom_minimum_size(Size2(200.0, 0.0) * EDSCALE);
+	edit_dialog_hs->add_child(edit_dialog_side_vb);
+
+	Label *edit_type_label = memnew(Label);
+	edit_type_label->set_text(TTR("Types:"));
+	edit_dialog_side_vb->add_child(edit_type_label);
+
+	edit_type_list = memnew(ItemList);
+	edit_type_list->set_v_size_flags(Control::SIZE_EXPAND_FILL);
+	edit_dialog_side_vb->add_child(edit_type_list);
+	edit_type_list->connect("item_selected", callable_mp(this, &ThemeItemEditorDialog::_edited_type_selected));
+
+	Label *edit_add_class_label = memnew(Label);
+	edit_add_class_label->set_text(TTR("Add Type from Class:"));
+	edit_dialog_side_vb->add_child(edit_add_class_label);
+
+	HBoxContainer *edit_add_class = memnew(HBoxContainer);
+	edit_dialog_side_vb->add_child(edit_add_class);
+	edit_add_class_options = memnew(OptionButton);
+	edit_add_class_options->set_h_size_flags(Control::SIZE_EXPAND_FILL);
+	edit_add_class->add_child(edit_add_class_options);
+	Button *edit_add_class_button = memnew(Button);
+	edit_add_class_button->set_text(TTR("Add"));
+	edit_add_class->add_child(edit_add_class_button);
+	edit_add_class_button->connect("pressed", callable_mp(this, &ThemeItemEditorDialog::_add_class_type_items));
+
+	Label *edit_add_custom_label = memnew(Label);
+	edit_add_custom_label->set_text(TTR("Add Custom Type:"));
+	edit_dialog_side_vb->add_child(edit_add_custom_label);
+
+	HBoxContainer *edit_add_custom = memnew(HBoxContainer);
+	edit_dialog_side_vb->add_child(edit_add_custom);
+	edit_add_custom_value = memnew(LineEdit);
+	edit_add_custom_value->set_h_size_flags(Control::SIZE_EXPAND_FILL);
+	edit_add_custom->add_child(edit_add_custom_value);
+	Button *edit_add_custom_button = memnew(Button);
+	edit_add_custom_button->set_text(TTR("Add"));
+	edit_add_custom->add_child(edit_add_custom_button);
+	edit_add_custom_button->connect("pressed", callable_mp(this, &ThemeItemEditorDialog::_add_custom_type));
+	// Second split child: the add/remove toolbar followed by the item tree.
+	VBoxContainer *edit_items_vb = memnew(VBoxContainer);
+	edit_items_vb->set_h_size_flags(Control::SIZE_EXPAND_FILL);
+	edit_dialog_hs->add_child(edit_items_vb);
+
+	HBoxContainer *edit_items_toolbar = memnew(HBoxContainer);
+	edit_items_vb->add_child(edit_items_toolbar);
+
+	Label *edit_items_toolbar_add_label = memnew(Label);
+	edit_items_toolbar_add_label->set_text(TTR("Add:"));
+	edit_items_toolbar->add_child(edit_items_toolbar_add_label);
+
+	edit_items_add_color = memnew(Button);
+	edit_items_add_color->set_tooltip(TTR("Add Color Item"));
+	edit_items_add_color->set_flat(true);
+	edit_items_add_color->set_disabled(true); // Toolbar buttons start disabled; _update_edit_types() enables them once a type is selected.
+	edit_items_toolbar->add_child(edit_items_add_color);
+	edit_items_add_color->connect("pressed", callable_mp(this, &ThemeItemEditorDialog::_open_add_theme_item_dialog), varray(Theme::DATA_TYPE_COLOR));
+
+	edit_items_add_constant = memnew(Button);
+	edit_items_add_constant->set_tooltip(TTR("Add Constant Item"));
+	edit_items_add_constant->set_flat(true);
+	edit_items_add_constant->set_disabled(true);
+	edit_items_toolbar->add_child(edit_items_add_constant);
+	edit_items_add_constant->connect("pressed", callable_mp(this, &ThemeItemEditorDialog::_open_add_theme_item_dialog), varray(Theme::DATA_TYPE_CONSTANT));
+
+	edit_items_add_font = memnew(Button);
+	edit_items_add_font->set_tooltip(TTR("Add Font Item"));
+	edit_items_add_font->set_flat(true);
+	edit_items_add_font->set_disabled(true);
+	edit_items_toolbar->add_child(edit_items_add_font);
+	edit_items_add_font->connect("pressed", callable_mp(this, &ThemeItemEditorDialog::_open_add_theme_item_dialog), varray(Theme::DATA_TYPE_FONT));
+
+	edit_items_add_font_size = memnew(Button);
+	edit_items_add_font_size->set_tooltip(TTR("Add Font Size Item"));
+	edit_items_add_font_size->set_flat(true);
+	edit_items_add_font_size->set_disabled(true);
+	edit_items_toolbar->add_child(edit_items_add_font_size);
+	edit_items_add_font_size->connect("pressed", callable_mp(this, &ThemeItemEditorDialog::_open_add_theme_item_dialog), varray(Theme::DATA_TYPE_FONT_SIZE));
+
+	edit_items_add_icon = memnew(Button);
+	edit_items_add_icon->set_tooltip(TTR("Add Icon Item"));
+	edit_items_add_icon->set_flat(true);
+	edit_items_add_icon->set_disabled(true);
+	edit_items_toolbar->add_child(edit_items_add_icon);
+	edit_items_add_icon->connect("pressed", callable_mp(this, &ThemeItemEditorDialog::_open_add_theme_item_dialog), varray(Theme::DATA_TYPE_ICON));
+
+	edit_items_add_stylebox = memnew(Button);
+	edit_items_add_stylebox->set_tooltip(TTR("Add StyleBox Item"));
+	edit_items_add_stylebox->set_flat(true);
+	edit_items_add_stylebox->set_disabled(true);
+	edit_items_toolbar->add_child(edit_items_add_stylebox);
+	edit_items_add_stylebox->connect("pressed", callable_mp(this, &ThemeItemEditorDialog::_open_add_theme_item_dialog), varray(Theme::DATA_TYPE_STYLEBOX));
+
+	edit_items_toolbar->add_child(memnew(VSeparator));
+
+	Label *edit_items_toolbar_remove_label = memnew(Label);
+	edit_items_toolbar_remove_label->set_text(TTR("Remove:"));
+	edit_items_toolbar->add_child(edit_items_toolbar_remove_label);
+
+	edit_items_remove_class = memnew(Button);
+	edit_items_remove_class->set_tooltip(TTR("Remove Class Items"));
+	edit_items_remove_class->set_flat(true);
+	edit_items_remove_class->set_disabled(true);
+	edit_items_toolbar->add_child(edit_items_remove_class);
+	edit_items_remove_class->connect("pressed", callable_mp(this, &ThemeItemEditorDialog::_remove_class_items));
+
+	edit_items_remove_custom = memnew(Button);
+	edit_items_remove_custom->set_tooltip(TTR("Remove Custom Items"));
+	edit_items_remove_custom->set_flat(true);
+	edit_items_remove_custom->set_disabled(true);
+	edit_items_toolbar->add_child(edit_items_remove_custom);
+	edit_items_remove_custom->connect("pressed", callable_mp(this, &ThemeItemEditorDialog::_remove_custom_items));
+
+	edit_items_remove_all = memnew(Button);
+	edit_items_remove_all->set_tooltip(TTR("Remove All Items"));
+	edit_items_remove_all->set_flat(true);
+	edit_items_remove_all->set_disabled(true);
+	edit_items_toolbar->add_child(edit_items_remove_all);
+	edit_items_remove_all->connect("pressed", callable_mp(this, &ThemeItemEditorDialog::_remove_all_items));
+	// Item tree; it is filled by _update_edit_item_tree() and its row buttons are handled by _item_tree_button_pressed().
+	edit_items_tree = memnew(Tree);
+	edit_items_tree->set_v_size_flags(Control::SIZE_EXPAND_FILL);
+	edit_items_tree->set_hide_root(true);
+	edit_items_tree->set_columns(1);
+	edit_items_vb->add_child(edit_items_tree);
+	edit_items_tree->connect("button_pressed", callable_mp(this, &ThemeItemEditorDialog::_item_tree_button_pressed));
+	// Nested dialog asking for an item name; used for both adding and renaming a single item.
+	edit_theme_item_dialog = memnew(ConfirmationDialog);
+	edit_theme_item_dialog->set_title(TTR("Add Theme Item"));
+	add_child(edit_theme_item_dialog);
+	VBoxContainer *edit_theme_item_vb = memnew(VBoxContainer);
+	edit_theme_item_dialog->add_child(edit_theme_item_vb);
+	// "Old Name" block; hidden when adding an item and shown when renaming one.
+	edit_theme_item_old_vb = memnew(VBoxContainer);
+	edit_theme_item_vb->add_child(edit_theme_item_old_vb);
+	Label *edit_theme_item_old = memnew(Label);
+	edit_theme_item_old->set_text(TTR("Old Name:"));
+	edit_theme_item_old_vb->add_child(edit_theme_item_old);
+	theme_item_old_name = memnew(Label);
+	edit_theme_item_old_vb->add_child(theme_item_old_name);
+
+	Label *edit_theme_item_label = memnew(Label);
+	edit_theme_item_label->set_text(TTR("Name:"));
+	edit_theme_item_vb->add_child(edit_theme_item_label);
+	theme_item_name = memnew(LineEdit);
+	edit_theme_item_vb->add_child(theme_item_name);
+	theme_item_name->connect("gui_input", callable_mp(this, &ThemeItemEditorDialog::_edit_theme_item_gui_input));
+	edit_theme_item_dialog->connect("confirmed", callable_mp(this, &ThemeItemEditorDialog::_confirm_edit_theme_item));
+}
+
 void ThemeEditor::edit(const Ref<Theme> &p_theme) {
 	theme = p_theme;
+	theme_edit_dialog->set_edited_theme(p_theme);
 	main_panel->set_theme(p_theme);
 	main_container->set_theme(p_theme);
 }
@@ -58,55 +742,6 @@ void ThemeEditor::_refresh_interval() {
 	_propagate_redraw(main_container);
 }
 
-void ThemeEditor::_type_menu_cbk(int p_option) {
-	type_edit->set_text(type_menu->get_popup()->get_item_text(p_option));
-}
-
-void ThemeEditor::_name_menu_about_to_show() {
-	String fromtype = type_edit->get_text();
-	List<StringName> names;
-
-	if (popup_mode == POPUP_ADD) {
-		switch (type_select->get_selected()) {
-			case 0:
-				Theme::get_default()->get_icon_list(fromtype, &names);
-				break;
-			case 1:
-				Theme::get_default()->get_stylebox_list(fromtype, &names);
-				break;
-			case 2:
-				Theme::get_default()->get_font_list(fromtype, &names);
-				break;
-			case 3:
-				Theme::get_default()->get_font_size_list(fromtype, &names);
-				break;
-			case 4:
-				Theme::get_default()->get_color_list(fromtype, &names);
-				break;
-			case 5:
-				Theme::get_default()->get_constant_list(fromtype, &names);
-				break;
-		}
-	} else if (popup_mode == POPUP_REMOVE) {
-		theme->get_icon_list(fromtype, &names);
-		theme->get_stylebox_list(fromtype, &names);
-		theme->get_font_list(fromtype, &names);
-		theme->get_font_size_list(fromtype, &names);
-		theme->get_color_list(fromtype, &names);
-		theme->get_constant_list(fromtype, &names);
-	}
-
-	name_menu->get_popup()->clear();
-	name_menu->get_popup()->set_size(Size2());
-	for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
-		name_menu->get_popup()->add_item(E->get());
-	}
-}
-
-void ThemeEditor::_name_menu_cbk(int p_option) {
-	name_edit->set_text(name_menu->get_popup()->get_item_text(p_option));
-}
-
 struct _TECategory {
 	template <class T>
 	struct RefItem {
@@ -335,296 +970,71 @@ void ThemeEditor::_save_template_cbk(String fname) {
 	memdelete(file);
 }
 
-void ThemeEditor::_dialog_cbk() {
-	switch (popup_mode) {
-		case POPUP_ADD: {
-			switch (type_select->get_selected()) {
-				case 0:
-					theme->set_icon(name_edit->get_text(), type_edit->get_text(), Ref<Texture2D>());
-					break;
-				case 1:
-					theme->set_stylebox(name_edit->get_text(), type_edit->get_text(), Ref<StyleBox>());
-					break;
-				case 2:
-					theme->set_font(name_edit->get_text(), type_edit->get_text(), Ref<Font>());
-					break;
-				case 3:
-					theme->set_font_size(name_edit->get_text(), type_edit->get_text(), -1);
-					break;
-				case 4:
-					theme->set_color(name_edit->get_text(), type_edit->get_text(), Color());
-					break;
-				case 5:
-					theme->set_constant(name_edit->get_text(), type_edit->get_text(), 0);
-					break;
-			}
-
-		} break;
-		case POPUP_CLASS_ADD: {
-			StringName fromtype = type_edit->get_text();
-			List<StringName> names;
-
-			{
-				names.clear();
-				Theme::get_default()->get_icon_list(fromtype, &names);
-				for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
-					theme->set_icon(E->get(), fromtype, Ref<Texture2D>());
-				}
-			}
-			{
-				names.clear();
-				Theme::get_default()->get_stylebox_list(fromtype, &names);
-				for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
-					theme->set_stylebox(E->get(), fromtype, Ref<StyleBox>());
-				}
-			}
-			{
-				names.clear();
-				Theme::get_default()->get_font_list(fromtype, &names);
-				for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
-					theme->set_font(E->get(), fromtype, Ref<Font>());
-				}
-			}
-			{
-				names.clear();
-				Theme::get_default()->get_font_size_list(fromtype, &names);
-				for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
-					theme->set_font_size(E->get(), fromtype, Theme::get_default()->get_font_size(E->get(), fromtype));
-				}
-			}
-			{
-				names.clear();
-				Theme::get_default()->get_color_list(fromtype, &names);
-				for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
-					theme->set_color(E->get(), fromtype, Theme::get_default()->get_color(E->get(), fromtype));
-				}
-			}
-			{
-				names.clear();
-				Theme::get_default()->get_constant_list(fromtype, &names);
-				for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
-					theme->set_constant(E->get(), fromtype, Theme::get_default()->get_constant(E->get(), fromtype));
-				}
-			}
-		} break;
-		case POPUP_REMOVE: {
-			switch (type_select->get_selected()) {
-				case 0:
-					theme->clear_icon(name_edit->get_text(), type_edit->get_text());
-					break;
-				case 1:
-					theme->clear_stylebox(name_edit->get_text(), type_edit->get_text());
-					break;
-				case 2:
-					theme->clear_font(name_edit->get_text(), type_edit->get_text());
-					break;
-				case 3:
-					theme->clear_font_size(name_edit->get_text(), type_edit->get_text());
-					break;
-				case 4:
-					theme->clear_color(name_edit->get_text(), type_edit->get_text());
-					break;
-				case 5:
-					theme->clear_constant(name_edit->get_text(), type_edit->get_text());
-					break;
-			}
-
-		} break;
-		case POPUP_CLASS_REMOVE: {
-			StringName fromtype = type_edit->get_text();
-			List<StringName> names;
-
-			{
-				names.clear();
-				Theme::get_default()->get_icon_list(fromtype, &names);
-				for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
-					theme->clear_icon(E->get(), fromtype);
-				}
-			}
-			{
-				names.clear();
-				Theme::get_default()->get_stylebox_list(fromtype, &names);
-				for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
-					theme->clear_stylebox(E->get(), fromtype);
-				}
-			}
-			{
-				names.clear();
-				Theme::get_default()->get_font_list(fromtype, &names);
-				for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
-					theme->clear_font(E->get(), fromtype);
-				}
-			}
-			{
-				names.clear();
-				Theme::get_default()->get_font_size_list(fromtype, &names);
-				for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
-					theme->clear_font_size(E->get(), fromtype);
-				}
-			}
-			{
-				names.clear();
-				Theme::get_default()->get_color_list(fromtype, &names);
-				for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
-					theme->clear_color(E->get(), fromtype);
-				}
-			}
-			{
-				names.clear();
-				Theme::get_default()->get_constant_list(fromtype, &names);
-				for (List<StringName>::Element *E = names.front(); E; E = E->next()) {
-					theme->clear_constant(E->get(), fromtype);
-				}
-			}
-
-		} break;
-	}
-}
-
-void ThemeEditor::_theme_menu_cbk(int p_option) {
-	if (p_option == POPUP_CREATE_EMPTY || p_option == POPUP_CREATE_EDITOR_EMPTY || p_option == POPUP_IMPORT_EDITOR_THEME) {
-		bool import = (p_option == POPUP_IMPORT_EDITOR_THEME);
-
-		Ref<Theme> base_theme;
-
-		if (p_option == POPUP_CREATE_EMPTY) {
-			base_theme = Theme::get_default();
-		} else {
-			base_theme = EditorNode::get_singleton()->get_theme_base()->get_theme();
-		}
-
-		{
-			List<StringName> types;
-			base_theme->get_type_list(&types);
-
-			for (List<StringName>::Element *T = types.front(); T; T = T->next()) {
-				StringName type = T->get();
-
-				List<StringName> icons;
-				base_theme->get_icon_list(type, &icons);
-
-				for (List<StringName>::Element *E = icons.front(); E; E = E->next()) {
-					theme->set_icon(E->get(), type, import ? base_theme->get_icon(E->get(), type) : Ref<Texture2D>());
-				}
-
-				List<StringName> styleboxs;
-				base_theme->get_stylebox_list(type, &styleboxs);
-
-				for (List<StringName>::Element *E = styleboxs.front(); E; E = E->next()) {
-					theme->set_stylebox(E->get(), type, import ? base_theme->get_stylebox(E->get(), type) : Ref<StyleBox>());
-				}
-
-				List<StringName> fonts;
-				base_theme->get_font_list(type, &fonts);
-
-				for (List<StringName>::Element *E = fonts.front(); E; E = E->next()) {
-					theme->set_font(E->get(), type, Ref<Font>());
-				}
-
-				List<StringName> font_sizes;
-				base_theme->get_font_size_list(type, &font_sizes);
-
-				for (List<StringName>::Element *E = font_sizes.front(); E; E = E->next()) {
-					theme->set_font_size(E->get(), type, base_theme->get_font_size(E->get(), type));
-				}
-
-				List<StringName> colors;
-				base_theme->get_color_list(type, &colors);
-
-				for (List<StringName>::Element *E = colors.front(); E; E = E->next()) {
-					theme->set_color(E->get(), type, import ? base_theme->get_color(E->get(), type) : Color());
-				}
-
-				List<StringName> constants;
-				base_theme->get_constant_list(type, &constants);
-
-				for (List<StringName>::Element *E = constants.front(); E; E = E->next()) {
-					theme->set_constant(E->get(), type, base_theme->get_constant(E->get(), type));
-				}
-			}
-		}
-		return;
-	}
+void ThemeEditor::_theme_create_menu_cbk(int p_option) { // Handles "id_pressed" from the theme_create_menu popup; p_option is a CreatePopupMode id.
+	bool import = (p_option == POPUP_IMPORT_EDITOR_THEME); // When false, icons, styleboxes and colors are added as empty defaults.
 
 	Ref<Theme> base_theme;
 
-	name_select_label->show();
-	name_hbc->show();
-	type_select_label->show();
-	type_select->show();
-
-	if (p_option == POPUP_ADD) { // Add.
-
-		add_del_dialog->set_title(TTR("Add Item"));
-		add_del_dialog->get_ok_button()->set_text(TTR("Add"));
-		add_del_dialog->popup_centered(Size2(490, 85) * EDSCALE);
-
+	if (p_option == POPUP_CREATE_EMPTY) { // Only this option enumerates the default theme.
 		base_theme = Theme::get_default();
+	} else {
+		base_theme = EditorNode::get_singleton()->get_theme_base()->get_theme(); // Both editor options read from the editor's base theme.
+	}
 
-	} else if (p_option == POPUP_CLASS_ADD) { // Add.
-
-		add_del_dialog->set_title(TTR("Add All Items"));
-		add_del_dialog->get_ok_button()->set_text(TTR("Add All"));
-		add_del_dialog->popup_centered(Size2(240, 85) * EDSCALE);
-
-		base_theme = Theme::get_default();
+	{ // NOTE(review): theme is dereferenced inside this block without a null check in this function - confirm callers guarantee it is set.
+		List<StringName> types;
+		base_theme->get_type_list(&types);
 
-		name_select_label->hide();
-		name_hbc->hide();
-		type_select_label->hide();
-		type_select->hide();
+		for (List<StringName>::Element *T = types.front(); T; T = T->next()) { // For every type listed by base_theme, mirror its item names into theme.
+			StringName type = T->get();
 
-	} else if (p_option == POPUP_REMOVE) {
-		add_del_dialog->set_title(TTR("Remove Item"));
-		add_del_dialog->get_ok_button()->set_text(TTR("Remove"));
-		add_del_dialog->popup_centered(Size2(490, 85) * EDSCALE);
+			List<StringName> icons;
+			base_theme->get_icon_list(type, &icons);
 
-		base_theme = theme;
+			for (List<StringName>::Element *E = icons.front(); E; E = E->next()) {
+				theme->set_icon(E->get(), type, import ? base_theme->get_icon(E->get(), type) : Ref<Texture2D>()); // Base icon only when importing, otherwise an empty reference.
+			}
 
-	} else if (p_option == POPUP_CLASS_REMOVE) {
-		add_del_dialog->set_title(TTR("Remove All Items"));
-		add_del_dialog->get_ok_button()->set_text(TTR("Remove All"));
-		add_del_dialog->popup_centered(Size2(240, 85) * EDSCALE);
+			List<StringName> styleboxs;
+			base_theme->get_stylebox_list(type, &styleboxs);
 
-		base_theme = Theme::get_default();
+			for (List<StringName>::Element *E = styleboxs.front(); E; E = E->next()) {
+				theme->set_stylebox(E->get(), type, import ? base_theme->get_stylebox(E->get(), type) : Ref<StyleBox>()); // Base stylebox only when importing, otherwise an empty reference.
+			}
 
-		name_select_label->hide();
-		name_hbc->hide();
-		type_select_label->hide();
-		type_select->hide();
-	}
-	popup_mode = p_option;
+			List<StringName> fonts;
+			base_theme->get_font_list(type, &fonts);
 
-	ERR_FAIL_COND(theme.is_null());
+			for (List<StringName>::Element *E = fonts.front(); E; E = E->next()) {
+				theme->set_font(E->get(), type, Ref<Font>()); // Fonts are always added as empty references, regardless of import.
+			}
 
-	List<StringName> types;
-	base_theme->get_type_list(&types);
+			List<StringName> font_sizes;
+			base_theme->get_font_size_list(type, &font_sizes);
 
-	type_menu->get_popup()->clear();
+			for (List<StringName>::Element *E = font_sizes.front(); E; E = E->next()) {
+				theme->set_font_size(E->get(), type, base_theme->get_font_size(E->get(), type)); // Font sizes are always copied from base_theme, regardless of import.
+			}
 
-	if (p_option == 0 || p_option == 1) { // Add.
+			List<StringName> colors;
+			base_theme->get_color_list(type, &colors);
 
-		List<StringName> new_types;
-		theme->get_type_list(&new_types);
-		for (List<StringName>::Element *F = new_types.front(); F; F = F->next()) {
-			bool found = false;
-			for (List<StringName>::Element *E = types.front(); E; E = E->next()) {
-				if (E->get() == F->get()) {
-					found = true;
-					break;
-				}
+			for (List<StringName>::Element *E = colors.front(); E; E = E->next()) {
+				theme->set_color(E->get(), type, import ? base_theme->get_color(E->get(), type) : Color()); // Base color only when importing, otherwise a default-constructed Color.
 			}
 
-			if (!found) {
-				types.push_back(F->get());
+			List<StringName> constants;
+			base_theme->get_constant_list(type, &constants);
+
+			for (List<StringName>::Element *E = constants.front(); E; E = E->next()) {
+				theme->set_constant(E->get(), type, base_theme->get_constant(E->get(), type)); // Constants are always copied from base_theme, regardless of import.
 			}
 		}
 	}
+}
 
-	types.sort_custom<StringName::AlphCompare>();
-	for (List<StringName>::Element *E = types.front(); E; E = E->next()) {
-		type_menu->get_popup()->add_item(E->get());
-	}
+void ThemeEditor::_theme_edit_button_cbk() { // Handler for theme_edit_button's "pressed" signal.
+	theme_edit_dialog->popup_centered(Size2(800, 640) * EDSCALE); // Show the item editor dialog centered; the 800x640 size is multiplied by EDSCALE.
 }
 
 void ThemeEditor::_notification(int p_what) {
@@ -636,9 +1046,6 @@ void ThemeEditor::_notification(int p_what) {
 				_refresh_interval();
 			}
 		} break;
-		case NOTIFICATION_THEME_CHANGED: {
-			theme_menu->set_icon(get_theme_icon("Theme", "EditorIcons"));
-		} break;
 	}
 }
 
@@ -646,27 +1053,28 @@ void ThemeEditor::_bind_methods() {
 }
 
 ThemeEditor::ThemeEditor() {
-	time_left = 0;
-
 	HBoxContainer *top_menu = memnew(HBoxContainer);
 	add_child(top_menu);
 
 	top_menu->add_child(memnew(Label(TTR("Preview:"))));
 	top_menu->add_spacer(false);
 
-	theme_menu = memnew(MenuButton);
-	theme_menu->set_text(TTR("Edit Theme"));
-	theme_menu->set_tooltip(TTR("Theme editing menu."));
-	theme_menu->get_popup()->add_item(TTR("Add Item"), POPUP_ADD);
-	theme_menu->get_popup()->add_item(TTR("Add Class Items"), POPUP_CLASS_ADD);
-	theme_menu->get_popup()->add_item(TTR("Remove Item"), POPUP_REMOVE);
-	theme_menu->get_popup()->add_item(TTR("Remove Class Items"), POPUP_CLASS_REMOVE);
-	theme_menu->get_popup()->add_separator();
-	theme_menu->get_popup()->add_item(TTR("Create Empty Template"), POPUP_CREATE_EMPTY);
-	theme_menu->get_popup()->add_item(TTR("Create Empty Editor Template"), POPUP_CREATE_EDITOR_EMPTY);
-	theme_menu->get_popup()->add_item(TTR("Create From Current Editor Theme"), POPUP_IMPORT_EDITOR_THEME);
-	top_menu->add_child(theme_menu);
-	theme_menu->get_popup()->connect("id_pressed", callable_mp(this, &ThemeEditor::_theme_menu_cbk));
+	theme_create_menu = memnew(MenuButton);
+	theme_create_menu->set_text(TTR("Create Theme..."));
+	theme_create_menu->set_tooltip(TTR("Create a new Theme."));
+	theme_create_menu->get_popup()->add_item(TTR("Empty Template"), POPUP_CREATE_EMPTY);
+	theme_create_menu->get_popup()->add_separator();
+	theme_create_menu->get_popup()->add_item(TTR("Empty Editor Template"), POPUP_CREATE_EDITOR_EMPTY);
+	theme_create_menu->get_popup()->add_item(TTR("From Current Editor Theme"), POPUP_IMPORT_EDITOR_THEME);
+	top_menu->add_child(theme_create_menu);
+	theme_create_menu->get_popup()->connect("id_pressed", callable_mp(this, &ThemeEditor::_theme_create_menu_cbk));
+
+	theme_edit_button = memnew(Button);
+	theme_edit_button->set_text(TTR("Edit Theme Items"));
+	theme_edit_button->set_tooltip(TTR("Customize Theme items."));
+	theme_edit_button->set_flat(true);
+	theme_edit_button->connect("pressed", callable_mp(this, &ThemeEditor::_theme_edit_button_cbk));
+	top_menu->add_child(theme_edit_button);
 
 	ScrollContainer *scroll = memnew(ScrollContainer);
 	add_child(scroll);
@@ -849,66 +1257,9 @@ ThemeEditor::ThemeEditor() {
 
 	main_hb->add_theme_constant_override("separation", 20 * EDSCALE);
 
-	////////
-
-	add_del_dialog = memnew(ConfirmationDialog);
-	add_del_dialog->hide();
-	add_child(add_del_dialog);
-
-	VBoxContainer *dialog_vbc = memnew(VBoxContainer);
-	add_del_dialog->add_child(dialog_vbc);
-
-	Label *l = memnew(Label);
-	l->set_text(TTR("Type:"));
-	dialog_vbc->add_child(l);
-
-	type_hbc = memnew(HBoxContainer);
-	dialog_vbc->add_child(type_hbc);
-
-	type_edit = memnew(LineEdit);
-	type_edit->set_h_size_flags(SIZE_EXPAND_FILL);
-	type_hbc->add_child(type_edit);
-	type_menu = memnew(MenuButton);
-	type_menu->set_flat(false);
-	type_menu->set_text("...");
-	type_hbc->add_child(type_menu);
-
-	type_menu->get_popup()->connect("id_pressed", callable_mp(this, &ThemeEditor::_type_menu_cbk));
-
-	l = memnew(Label);
-	l->set_text(TTR("Name:"));
-	dialog_vbc->add_child(l);
-	name_select_label = l;
-
-	name_hbc = memnew(HBoxContainer);
-	dialog_vbc->add_child(name_hbc);
-
-	name_edit = memnew(LineEdit);
-	name_edit->set_h_size_flags(SIZE_EXPAND_FILL);
-	name_hbc->add_child(name_edit);
-	name_menu = memnew(MenuButton);
-	type_menu->set_flat(false);
-	name_menu->set_text("...");
-	name_hbc->add_child(name_menu);
-
-	name_menu->get_popup()->connect("about_to_popup", callable_mp(this, &ThemeEditor::_name_menu_about_to_show));
-	name_menu->get_popup()->connect("id_pressed", callable_mp(this, &ThemeEditor::_name_menu_cbk));
-
-	type_select_label = memnew(Label);
-	type_select_label->set_text(TTR("Data Type:"));
-	dialog_vbc->add_child(type_select_label);
-
-	type_select = memnew(OptionButton);
-	type_select->add_item(TTR("Icon"));
-	type_select->add_item(TTR("Style"));
-	type_select->add_item(TTR("Font"));
-	type_select->add_item(TTR("Font Size"));
-	type_select->add_item(TTR("Color"));
-	type_select->add_item(TTR("Constant"));
-
-	dialog_vbc->add_child(type_select);
-
-	add_del_dialog->get_ok_button()->connect("pressed", callable_mp(this, &ThemeEditor::_dialog_cbk));
+	theme_edit_dialog = memnew(ThemeItemEditorDialog);
+	theme_edit_dialog->hide();
+	add_child(theme_edit_dialog);
 
 	file_dialog = memnew(EditorFileDialog);
 	file_dialog->add_filter("*.theme ; " + TTR("Theme File"));
diff --git a/editor/plugins/theme_editor_plugin.h b/editor/plugins/theme_editor_plugin.h
index ab199f8e51..0a840aecd7 100644
--- a/editor/plugins/theme_editor_plugin.h
+++ b/editor/plugins/theme_editor_plugin.h
@@ -41,6 +41,77 @@
 
 #include "editor/editor_node.h"
 
+class ThemeItemEditorDialog : public AcceptDialog { // Dialog that ThemeEditor pops up from its "Edit Theme Items" button.
+	GDCLASS(ThemeItemEditorDialog, AcceptDialog);
+
+	Ref<Theme> edited_theme; // NOTE(review): presumably stored by set_edited_theme(); the setter body is not in this header - confirm.
+
+	ItemList *edit_type_list;
+	OptionButton *edit_add_class_options;
+	LineEdit *edit_add_custom_value;
+	String edited_item_type;
+
+	Button *edit_items_add_color; // NOTE(review): by their names, the six edit_items_add_* buttons cover one theme data type each - confirm in the constructor.
+	Button *edit_items_add_constant;
+	Button *edit_items_add_font;
+	Button *edit_items_add_font_size;
+	Button *edit_items_add_icon;
+	Button *edit_items_add_stylebox;
+	Button *edit_items_remove_class;
+	Button *edit_items_remove_custom;
+	Button *edit_items_remove_all;
+	Tree *edit_items_tree;
+
+	enum ItemsTreeAction { // NOTE(review): presumably the p_id values received by _item_tree_button_pressed() - confirm.
+		ITEMS_TREE_RENAME_ITEM,
+		ITEMS_TREE_REMOVE_ITEM,
+		ITEMS_TREE_REMOVE_DATA_TYPE,
+	};
+
+	ConfirmationDialog *edit_theme_item_dialog; // Its "confirmed" signal is connected to _confirm_edit_theme_item().
+	VBoxContainer *edit_theme_item_old_vb;
+	Label *theme_item_old_name;
+	LineEdit *theme_item_name; // Its "gui_input" signal is connected to _edit_theme_item_gui_input().
+
+	enum ItemPopupMode {
+		CREATE_THEME_ITEM,
+		RENAME_THEME_ITEM,
+		ITEM_POPUP_MODE_MAX // Also used as the initial value of item_popup_mode.
+	};
+
+	ItemPopupMode item_popup_mode = ITEM_POPUP_MODE_MAX; // Starts at the MAX enumerator, i.e. neither create nor rename.
+	String edit_item_old_name;
+	Theme::DataType edit_item_data_type = Theme::DATA_TYPE_MAX; // Starts at Theme::DATA_TYPE_MAX until a data type is assigned.
+
+	void _dialog_about_to_show();
+	void _update_edit_types();
+	void _edited_type_selected(int p_item_idx);
+
+	void _update_edit_item_tree(String p_item_type);
+	void _item_tree_button_pressed(Object *p_item, int p_column, int p_id);
+
+	void _add_class_type_items();
+	void _add_custom_type();
+	void _add_theme_item(Theme::DataType p_data_type, String p_item_name, String p_item_type);
+	void _remove_data_type_items(Theme::DataType p_data_type, String p_item_type);
+	void _remove_class_items();
+	void _remove_custom_items();
+	void _remove_all_items();
+
+	void _open_add_theme_item_dialog(int p_data_type); // NOTE(review): takes int while the rename variant takes Theme::DataType - presumably a Theme::DataType value; confirm.
+	void _open_rename_theme_item_dialog(Theme::DataType p_data_type, String p_item_name);
+	void _confirm_edit_theme_item(); // Callback for edit_theme_item_dialog's "confirmed" signal.
+	void _edit_theme_item_gui_input(const Ref<InputEvent> &p_event); // Callback for theme_item_name's "gui_input" signal.
+
+protected:
+	void _notification(int p_what);
+
+public:
+	void set_edited_theme(const Ref<Theme> &p_theme); // Called by ThemeEditor::edit() with the theme being edited.
+
+	ThemeItemEditorDialog();
+};
+
 class ThemeEditor : public VBoxContainer {
 	GDCLASS(ThemeEditor, VBoxContainer);
 
@@ -50,40 +121,23 @@ class ThemeEditor : public VBoxContainer {
 
 	EditorFileDialog *file_dialog;
 
-	double time_left;
-
-	MenuButton *theme_menu;
-	ConfirmationDialog *add_del_dialog;
-	HBoxContainer *type_hbc;
-	MenuButton *type_menu;
-	LineEdit *type_edit;
-	HBoxContainer *name_hbc;
-	MenuButton *name_menu;
-	LineEdit *name_edit;
-	OptionButton *type_select;
-	Label *type_select_label;
-	Label *name_select_label;
-
-	enum PopupMode {
-		POPUP_ADD,
-		POPUP_CLASS_ADD,
-		POPUP_REMOVE,
-		POPUP_CLASS_REMOVE,
+	double time_left = 0;
+
+	Button *theme_edit_button;
+	MenuButton *theme_create_menu;
+	ThemeItemEditorDialog *theme_edit_dialog;
+
+	enum CreatePopupMode { // Item ids of the theme_create_menu popup, received by _theme_create_menu_cbk().
 		POPUP_CREATE_EMPTY,
 		POPUP_CREATE_EDITOR_EMPTY,
-		POPUP_IMPORT_EDITOR_THEME
+		POPUP_IMPORT_EDITOR_THEME, // The only option that copies icon, stylebox and color values from the base theme.
 	};
 
-	int popup_mode;
-
 	Tree *test_tree;
 
 	void _save_template_cbk(String fname);
-	void _dialog_cbk();
-	void _type_menu_cbk(int p_option);
-	void _name_menu_about_to_show();
-	void _name_menu_cbk(int p_option);
-	void _theme_menu_cbk(int p_option);
+	void _theme_edit_button_cbk();
+	void _theme_create_menu_cbk(int p_option);
 	void _propagate_redraw(Control *p_at);
 	void _refresh_interval();
 
diff --git a/editor/plugins/tile_map_editor_plugin.cpp b/editor/plugins/tile_map_editor_plugin.cpp
index bd721244ea..1d6ff92e0c 100644
--- a/editor/plugins/tile_map_editor_plugin.cpp
+++ b/editor/plugins/tile_map_editor_plugin.cpp
@@ -573,6 +573,7 @@ void TileMapEditor::_update_palette() {
 		entries2.sort_custom<SwapComparator>();
 
 		Ref<Texture2D> tex = tileset->tile_get_texture(sel_tile);
+		Color modulate = tileset->tile_get_modulate(sel_tile);
 
 		for (int i = 0; i < entries2.size(); i++) {
 			manual_palette->add_item(String());
@@ -588,6 +589,7 @@ void TileMapEditor::_update_palette() {
 				}
 
 				manual_palette->set_item_icon(manual_palette->get_item_count() - 1, tex);
+				manual_palette->set_item_icon_modulate(manual_palette->get_item_count() - 1, modulate);
 			}
 
 			manual_palette->set_item_metadata(manual_palette->get_item_count() - 1, entries2[i]);
@@ -658,11 +660,15 @@ Vector<Vector2> TileMapEditor::_bucket_fill(const Point2i &p_start, bool erase,
 	}
 
 	// Check if the tile variation is the same
-	Vector2 prev_position = node->get_cell_autotile_coord(p_start.x, p_start.y);
 	if (ids.size() == 1 && ids[0] == prev_id) {
 		int current = manual_palette->get_current();
-		Vector2 position = manual_palette->get_item_metadata(current);
-		if (prev_position == position) {
+		if (current == -1) {
+			// Same ID, no variation selected, nothing to change
+			return Vector<Vector2>();
+		}
+		Vector2 prev_autotile_coord = node->get_cell_autotile_coord(p_start.x, p_start.y);
+		Vector2 autotile_coord = manual_palette->get_item_metadata(current);
+		if (autotile_coord == prev_autotile_coord) {
 			// Same ID and variation, nothing to change
 			return Vector<Vector2>();
 		}
diff --git a/editor/plugins/tile_set_editor_plugin.cpp b/editor/plugins/tile_set_editor_plugin.cpp
index feaf609557..f683c4b10d 100644
--- a/editor/plugins/tile_set_editor_plugin.cpp
+++ b/editor/plugins/tile_set_editor_plugin.cpp
@@ -319,7 +319,7 @@ void TileSetEditor::_notification(int p_what) {
 			tool_editmode[EDITMODE_NAVIGATION]->set_icon(get_theme_icon("Navigation2D", "EditorIcons"));
 			tool_editmode[EDITMODE_BITMASK]->set_icon(get_theme_icon("PackedDataContainer", "EditorIcons"));
 			tool_editmode[EDITMODE_PRIORITY]->set_icon(get_theme_icon("MaterialPreviewLight1", "EditorIcons"));
-			tool_editmode[EDITMODE_ICON]->set_icon(get_theme_icon("LargeTexture", "EditorIcons"));
+			tool_editmode[EDITMODE_ICON]->set_icon(get_theme_icon("Image", "EditorIcons"));
 			tool_editmode[EDITMODE_Z_INDEX]->set_icon(get_theme_icon("Sort", "EditorIcons"));
 
 			scroll->add_theme_style_override("bg", get_theme_stylebox("bg", "Tree"));
diff --git a/editor/plugins/visual_shader_editor_plugin.cpp b/editor/plugins/visual_shader_editor_plugin.cpp
index b2fa9c540e..acc77bd098 100644
--- a/editor/plugins/visual_shader_editor_plugin.cpp
+++ b/editor/plugins/visual_shader_editor_plugin.cpp
@@ -353,6 +353,11 @@ void VisualShaderGraphPlugin::add_node(VisualShader::Type p_type, int p_id) {
 	bool is_expression = !expression_node.is_null();
 	String expression = "";
 
+	VisualShaderNodeCustom *custom_node = Object::cast_to<VisualShaderNodeCustom>(vsnode.ptr());
+	if (custom_node) {
+		custom_node->_set_initialized(true);
+	}
+
 	GraphNode *node = memnew(GraphNode);
 	register_link(p_type, p_id, vsnode.ptr(), node);
 
@@ -1133,16 +1138,24 @@ void VisualShaderEditor::_update_options_menu() {
 }
 
 void VisualShaderEditor::_set_mode(int p_which) {
-	if (p_which == VisualShader::MODE_PARTICLES) {
+	if (p_which == VisualShader::MODE_SKY) { // Sky mode: only the sky type selector stays visible.
+		edit_type_standart->set_visible(false);
+		edit_type_particles->set_visible(false);
+		edit_type_sky->set_visible(true);
+		edit_type = edit_type_sky; // edit_type points at whichever selector this branch made visible.
+		mode = MODE_FLAGS_SKY; // mode is tested by get_current_shader_type() and _mode_selected() to pick the type offset.
+	} else if (p_which == VisualShader::MODE_PARTICLES) {
 		edit_type_standart->set_visible(false);
 		edit_type_particles->set_visible(true);
+		edit_type_sky->set_visible(false);
 		edit_type = edit_type_particles;
-		particles_mode = true;
+		mode = MODE_FLAGS_PARTICLES;
 	} else {
 		edit_type_particles->set_visible(false);
 		edit_type_standart->set_visible(true);
+		edit_type_sky->set_visible(false);
 		edit_type = edit_type_standart;
-		particles_mode = false;
+		mode = MODE_FLAGS_SPATIAL_CANVASITEM; // Fallback for every p_which other than sky and particles.
 	}
 	visual_shader->set_shader_type(get_current_shader_type());
 }
@@ -1303,8 +1316,10 @@ void VisualShaderEditor::_update_graph() {
 
 VisualShader::Type VisualShaderEditor::get_current_shader_type() const {
 	VisualShader::Type type;
-	if (particles_mode) {
+	if (mode & MODE_FLAGS_PARTICLES) {
 		type = VisualShader::Type(edit_type->get_selected() + 3);
+	} else if (mode & MODE_FLAGS_SKY) {
+		type = VisualShader::Type(edit_type->get_selected() + 6);
 	} else {
 		type = VisualShader::Type(edit_type->get_selected());
 	}
@@ -1772,8 +1787,15 @@ void VisualShaderEditor::_port_edited() {
 	ERR_FAIL_COND(!vsn.is_valid());
 
 	undo_redo->create_action(TTR("Set Input Default Port"));
-	undo_redo->add_do_method(vsn.ptr(), "set_input_port_default_value", editing_port, value);
-	undo_redo->add_undo_method(vsn.ptr(), "set_input_port_default_value", editing_port, vsn->get_input_port_default_value(editing_port));
+
+	Ref<VisualShaderNodeCustom> custom = Object::cast_to<VisualShaderNodeCustom>(vsn.ptr());
+	if (custom.is_valid()) {
+		undo_redo->add_do_method(custom.ptr(), "_set_input_port_default_value", editing_port, value);
+		undo_redo->add_undo_method(custom.ptr(), "_set_input_port_default_value", editing_port, vsn->get_input_port_default_value(editing_port));
+	} else {
+		undo_redo->add_do_method(vsn.ptr(), "set_input_port_default_value", editing_port, value);
+		undo_redo->add_undo_method(vsn.ptr(), "set_input_port_default_value", editing_port, vsn->get_input_port_default_value(editing_port));
+	}
 	undo_redo->add_do_method(graph_plugin.ptr(), "set_input_port_default_value", type, editing_node, editing_port, value);
 	undo_redo->add_undo_method(graph_plugin.ptr(), "set_input_port_default_value", type, editing_node, editing_port, vsn->get_input_port_default_value(editing_port));
 	undo_redo->commit_action();
@@ -3025,7 +3047,14 @@ void VisualShaderEditor::_paste_nodes(bool p_use_custom_position, const Vector2
 }
 
 void VisualShaderEditor::_mode_selected(int p_id) {
-	visual_shader->set_shader_type(particles_mode ? VisualShader::Type(p_id + 3) : VisualShader::Type(p_id));
+	int offset = 0; // Added to p_id before the cast to VisualShader::Type; stays 0 when neither flag below is set.
+	if (mode & MODE_FLAGS_PARTICLES) {
+		offset = 3; // Same offset get_current_shader_type() applies in particles mode.
+	} else if (mode & MODE_FLAGS_SKY) {
+		offset = 6; // Same offset get_current_shader_type() applies in sky mode.
+	}
+
+	visual_shader->set_shader_type(VisualShader::Type(p_id + offset));
 	_update_options_menu();
 	_update_graph();
 }
@@ -3398,7 +3427,7 @@ void VisualShaderEditor::_update_preview() {
 
 	ShaderLanguage sl;
 
-	Error err = sl.compile(code, ShaderTypes::get_singleton()->get_functions(RenderingServer::ShaderMode(visual_shader->get_mode())), ShaderTypes::get_singleton()->get_modes(RenderingServer::ShaderMode(visual_shader->get_mode())), ShaderTypes::get_singleton()->get_types(), _get_global_variable_type);
+	Error err = sl.compile(code, ShaderTypes::get_singleton()->get_functions(RenderingServer::ShaderMode(visual_shader->get_mode())), ShaderTypes::get_singleton()->get_modes(RenderingServer::ShaderMode(visual_shader->get_mode())), ShaderLanguage::VaryingFunctionNames(), ShaderTypes::get_singleton()->get_types(), _get_global_variable_type);
 
 	for (int i = 0; i < preview_text->get_line_count(); i++) {
 		preview_text->set_line_as_marked(i, false);
@@ -3531,10 +3560,17 @@ VisualShaderEditor::VisualShaderEditor() {
 	edit_type_particles->select(0);
 	edit_type_particles->connect("item_selected", callable_mp(this, &VisualShaderEditor::_mode_selected));
 
+	edit_type_sky = memnew(OptionButton);
+	edit_type_sky->add_item(TTR("Sky"));
+	edit_type_sky->select(0);
+	edit_type_sky->connect("item_selected", callable_mp(this, &VisualShaderEditor::_mode_selected));
+
 	edit_type = edit_type_standart;
 
 	graph->get_zoom_hbox()->add_child(edit_type_particles);
 	graph->get_zoom_hbox()->move_child(edit_type_particles, 0);
+	graph->get_zoom_hbox()->add_child(edit_type_sky);
+	graph->get_zoom_hbox()->move_child(edit_type_sky, 0);
 	graph->get_zoom_hbox()->add_child(edit_type_standart);
 	graph->get_zoom_hbox()->move_child(edit_type_standart, 0);
 
@@ -3671,7 +3707,7 @@ VisualShaderEditor::VisualShaderEditor() {
 
 	comment_title_change_popup = memnew(PopupPanel);
 	comment_title_change_edit = memnew(LineEdit);
-	comment_title_change_edit->set_expand_to_text_length(true);
+	comment_title_change_edit->set_expand_to_text_length_enabled(true);
 	comment_title_change_edit->connect("text_changed", callable_mp(this, &VisualShaderEditor::_comment_title_text_changed));
 	comment_title_change_edit->connect("text_entered", callable_mp(this, &VisualShaderEditor::_comment_title_text_entered));
 	comment_title_change_popup->add_child(comment_title_change_edit);
@@ -3782,6 +3818,7 @@ VisualShaderEditor::VisualShaderEditor() {
 	const String input_param_for_vertex_and_fragment_shader_modes = TTR("'%s' input parameter for vertex and fragment shader modes.");
 	const String input_param_for_fragment_and_light_shader_modes = TTR("'%s' input parameter for fragment and light shader modes.");
 	const String input_param_for_fragment_shader_mode = TTR("'%s' input parameter for fragment shader mode.");
+	const String input_param_for_sky_shader_mode = TTR("'%s' input parameter for sky shader mode.");
 	const String input_param_for_light_shader_mode = TTR("'%s' input parameter for light shader mode.");
 	const String input_param_for_vertex_shader_mode = TTR("'%s' input parameter for vertex shader mode.");
 	const String input_param_for_emit_shader_mode = TTR("'%s' input parameter for emit shader mode.");
@@ -3911,35 +3948,35 @@ VisualShaderEditor::VisualShaderEditor() {
 
 	// SKY INPUTS
 
-	add_options.push_back(AddOption("AtCubeMapPass", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "at_cubemap_pass"), "at_cubemap_pass", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("AtHalfResPass", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "at_half_res_pass"), "at_half_res_pass", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("AtQuarterResPass", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "at_quarter_res_pass"), "at_quarter_res_pass", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("EyeDir", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "eyedir"), "eyedir", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("HalfResColor", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "half_res_color"), "half_res_color", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("HalfResAlpha", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "half_res_alpha"), "half_res_alpha", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light0Color", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light0_color"), "light0_color", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light0Direction", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light0_direction"), "light0_direction", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light0Enabled", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light0_enabled"), "light0_enabled", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light0Energy", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light0_energy"), "light0_energy", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light1Color", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light1_color"), "light1_color", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light1Direction", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light1_direction"), "light1_direction", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light1Enabled", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light1_enabled"), "light1_enabled", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light1Energy", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light1_energy"), "light1_energy", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light2Color", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light2_color"), "light2_color", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light2Direction", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light2_direction"), "light2_direction", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light2Enabled", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light2_enabled"), "light2_enabled", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light2Energy", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light2_energy"), "light2_energy", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light3Color", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light3_color"), "light3_color", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light3Direction", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light3_direction"), "light3_direction", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light3Enabled", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light3_enabled"), "light3_enabled", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Light3Energy", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "light3_energy"), "light3_energy", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Position", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "position"), "position", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("QuarterResColor", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "quarter_res_color"), "quarter_res_color", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("QuarterResAlpha", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "quarter_res_alpha"), "quarter_res_alpha", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Radiance", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "radiance"), "radiance", VisualShaderNode::PORT_TYPE_SAMPLER, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("ScreenUV", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "screen_uv"), "screen_uv", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("SkyCoords", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "sky_coords"), "sky_coords", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
-	add_options.push_back(AddOption("Time", "Input", "Fragment", "VisualShaderNodeInput", vformat(input_param_for_fragment_shader_mode, "time"), "time", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_FRAGMENT, Shader::MODE_SKY));
+	add_options.push_back(AddOption("AtCubeMapPass", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "at_cubemap_pass"), "at_cubemap_pass", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("AtHalfResPass", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "at_half_res_pass"), "at_half_res_pass", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("AtQuarterResPass", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "at_quarter_res_pass"), "at_quarter_res_pass", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("EyeDir", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "eyedir"), "eyedir", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("HalfResColor", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "half_res_color"), "half_res_color", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("HalfResAlpha", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "half_res_alpha"), "half_res_alpha", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light0Color", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light0_color"), "light0_color", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light0Direction", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light0_direction"), "light0_direction", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light0Enabled", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light0_enabled"), "light0_enabled", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light0Energy", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light0_energy"), "light0_energy", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light1Color", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light1_color"), "light1_color", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light1Direction", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light1_direction"), "light1_direction", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light1Enabled", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light1_enabled"), "light1_enabled", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light1Energy", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light1_energy"), "light1_energy", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light2Color", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light2_color"), "light2_color", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light2Direction", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light2_direction"), "light2_direction", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light2Enabled", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light2_enabled"), "light2_enabled", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light2Energy", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light2_energy"), "light2_energy", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light3Color", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light3_color"), "light3_color", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light3Direction", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light3_direction"), "light3_direction", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light3Enabled", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light3_enabled"), "light3_enabled", VisualShaderNode::PORT_TYPE_BOOLEAN, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Light3Energy", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "light3_energy"), "light3_energy", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Position", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "position"), "position", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("QuarterResColor", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "quarter_res_color"), "quarter_res_color", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("QuarterResAlpha", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "quarter_res_alpha"), "quarter_res_alpha", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Radiance", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "radiance"), "radiance", VisualShaderNode::PORT_TYPE_SAMPLER, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("ScreenUV", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "screen_uv"), "screen_uv", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("SkyCoords", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "sky_coords"), "sky_coords", VisualShaderNode::PORT_TYPE_VECTOR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
+	add_options.push_back(AddOption("Time", "Input", "Sky", "VisualShaderNodeInput", vformat(input_param_for_sky_shader_mode, "time"), "time", VisualShaderNode::PORT_TYPE_SCALAR, TYPE_FLAGS_SKY, Shader::MODE_SKY));
 
 	// SCALAR
 
diff --git a/editor/plugins/visual_shader_editor_plugin.h b/editor/plugins/visual_shader_editor_plugin.h
index 517dc6056f..6d57d38cab 100644
--- a/editor/plugins/visual_shader_editor_plugin.h
+++ b/editor/plugins/visual_shader_editor_plugin.h
@@ -141,6 +141,7 @@ class VisualShaderEditor : public VBoxContainer {
 	OptionButton *edit_type = nullptr;
 	OptionButton *edit_type_standart;
 	OptionButton *edit_type_particles;
+	OptionButton *edit_type_sky;
 
 	PanelContainer *error_panel;
 	Label *error_label;
@@ -169,7 +170,14 @@ class VisualShaderEditor : public VBoxContainer {
 
 	bool preview_first = true;
 	bool preview_showed = false;
-	bool particles_mode;
+
+	enum ShaderModeFlags { // Values for the `mode` member declared right after this enum.
+		MODE_FLAGS_SPATIAL_CANVASITEM = 1, // Initial value of `mode`.
+		MODE_FLAGS_SKY = 2,
+		MODE_FLAGS_PARTICLES = 4 // Each enumerator is a distinct power of two.
+	};
+
+	int mode = MODE_FLAGS_SPATIAL_CANVASITEM;
 
 	enum TypeFlags {
 		TYPE_FLAGS_VERTEX = 1,
@@ -183,6 +191,10 @@ class VisualShaderEditor : public VBoxContainer {
 		TYPE_FLAGS_END = 4
 	};
 
+	enum SkyTypeFlags { // Type flags passed by the Shader::MODE_SKY input options.
+		TYPE_FLAGS_SKY = 1,
+	};
+
 	enum ToolsMenuOptions {
 		EXPAND_ALL,
 		COLLAPSE_ALL
diff --git a/editor/pot_generator.cpp b/editor/pot_generator.cpp
index 497cc0cbdc..b58b7e4cac 100644
--- a/editor/pot_generator.cpp
+++ b/editor/pot_generator.cpp
@@ -39,7 +39,7 @@ POTGenerator *POTGenerator::singleton = nullptr;
 
 #ifdef DEBUG_POT
 void POTGenerator::_print_all_translation_strings() {
-	for (auto E = all_translation_strings.front(); E; E = E.next()) {
+	for (OrderedHashMap<String, Vector<POTGenerator::MsgidData>>::Element E = all_translation_strings.front(); E; E = E.next()) {
 		Vector<MsgidData> v_md = all_translation_strings[E.key()];
 		for (int i = 0; i < v_md.size(); i++) {
 			print_line("++++++");
diff --git a/editor/project_manager.cpp b/editor/project_manager.cpp
index d3def86bd1..e51e8ee82e 100644
--- a/editor/project_manager.cpp
+++ b/editor/project_manager.cpp
@@ -296,7 +296,7 @@ private:
 		String sp = _test_path();
 		if (sp != "") {
 			// If the project name is empty or default, infer the project name from the selected folder name
-			if (project_name->get_text() == "" || project_name->get_text() == TTR("New Game Project")) {
+			if (project_name->get_text().strip_edges() == "" || project_name->get_text().strip_edges() == TTR("New Game Project")) {
 				sp = sp.replace("\\", "/");
 				int lidx = sp.rfind("/");
 
@@ -380,16 +380,17 @@ private:
 	}
 
 	void _create_folder() {
-		if (project_name->get_text() == "" || created_folder_path != "" || project_name->get_text().ends_with(".") || project_name->get_text().ends_with(" ")) {
-			set_message(TTR("Invalid Project Name."), MESSAGE_WARNING);
+		const String project_name_no_edges = project_name->get_text().strip_edges();
+		if (project_name_no_edges == "" || created_folder_path != "" || project_name_no_edges.ends_with(".")) {
+			set_message(TTR("Invalid project name."), MESSAGE_WARNING);
 			return;
 		}
 
 		DirAccess *d = DirAccess::create(DirAccess::ACCESS_FILESYSTEM);
 		if (d->change_dir(project_path->get_text()) == OK) {
-			if (!d->dir_exists(project_name->get_text())) {
-				if (d->make_dir(project_name->get_text()) == OK) {
-					d->change_dir(project_name->get_text());
+			if (!d->dir_exists(project_name_no_edges)) {
+				if (d->make_dir(project_name_no_edges) == OK) {
+					d->change_dir(project_name_no_edges);
 					String dir_str = d->get_current_dir();
 					project_path->set_text(dir_str);
 					_path_text_changed(dir_str);
@@ -415,7 +416,7 @@ private:
 
 		_test_path();
 
-		if (p_text == "") {
+		if (p_text.strip_edges() == "") {
 			set_message(TTR("It would be a good idea to name your project."), MESSAGE_ERROR);
 		}
 	}
@@ -442,7 +443,7 @@ private:
 				set_message(vformat(TTR("Couldn't load project.godot in project path (error %d). It may be missing or corrupted."), err), MESSAGE_ERROR);
 			} else {
 				ProjectSettings::CustomMap edited_settings;
-				edited_settings["application/config/name"] = project_name->get_text();
+				edited_settings["application/config/name"] = project_name->get_text().strip_edges();
 
 				if (current->save_custom(dir2.plus_file("project.godot"), edited_settings, Vector<String>(), true) != OK) {
 					set_message(TTR("Couldn't edit project.godot in project path."), MESSAGE_ERROR);
@@ -483,7 +484,7 @@ private:
 						initial_settings["rendering/textures/vram_compression/import_etc2"] = false;
 						initial_settings["rendering/textures/vram_compression/import_etc"] = true;
 					}
-					initial_settings["application/config/name"] = project_name->get_text();
+					initial_settings["application/config/name"] = project_name->get_text().strip_edges();
 					initial_settings["application/config/icon"] = "res://icon.png";
 					initial_settings["rendering/environment/defaults/default_environment"] = "res://default_env.tres";
 
@@ -1851,6 +1852,9 @@ void ProjectManager::_notification(int p_what) {
 		case NOTIFICATION_WM_CLOSE_REQUEST: {
 			_dim_window();
 		} break;
+		case NOTIFICATION_WM_ABOUT: {
+			_show_about();
+		} break;
 	}
 }
 
@@ -2254,6 +2258,10 @@ void ProjectManager::_erase_missing_projects() {
 	erase_missing_ask->popup_centered();
 }
 
+void ProjectManager::_show_about() { // Called from the About button and from NOTIFICATION_WM_ABOUT.
+	about->popup_centered(Size2(780, 500) * EDSCALE); // Centered popup; 780x500 base size multiplied by EDSCALE.
+}
+
 void ProjectManager::_language_selected(int p_id) {
 	String lang = language_btn->get_item_metadata(p_id);
 	EditorSettings::get_singleton()->set("interface/editor/editor_language", lang);
@@ -2443,12 +2451,7 @@ ProjectManager::ProjectManager() {
 	}
 
 	// TRANSLATORS: This refers to the application where users manage their Godot projects.
-	if (TS->is_locale_right_to_left(TranslationServer::get_singleton()->get_tool_locale())) {
-		// For RTL languages, embed translated part of the title (using control characters) to ensure correct order.
-		DisplayServer::get_singleton()->window_set_title(VERSION_NAME + String(" - ") + String::chr(0x202B) + TTR("Project Manager") + String::chr(0x202C) + String::chr(0x200E) + " - " + String::chr(0xA9) + " 2007-2021 Juan Linietsky, Ariel Manzur & Godot Contributors");
-	} else {
-		DisplayServer::get_singleton()->window_set_title(VERSION_NAME + String(" - ") + TTR("Project Manager") + " - " + String::chr(0xA9) + " 2007-2021 Juan Linietsky, Ariel Manzur & Godot Contributors");
-	}
+	DisplayServer::get_singleton()->window_set_title(VERSION_NAME + String(" - ") + TTR("Project Manager"));
 
 	FileDialog::set_default_show_hidden_files(EditorSettings::get_singleton()->get("filesystem/file_dialog/show_hidden_files"));
 
@@ -2582,6 +2585,13 @@ ProjectManager::ProjectManager() {
 		erase_missing_btn->set_text(TTR("Remove Missing"));
 		erase_missing_btn->connect("pressed", callable_mp(this, &ProjectManager::_erase_missing_projects));
 		tree_vb->add_child(erase_missing_btn);
+
+		tree_vb->add_spacer();
+
+		about_btn = memnew(Button);
+		about_btn->set_text(TTR("About"));
+		about_btn->connect("pressed", callable_mp(this, &ProjectManager::_show_about));
+		tree_vb->add_child(about_btn);
 	}
 
 	{
@@ -2715,6 +2725,9 @@ ProjectManager::ProjectManager() {
 		open_templates->get_ok_button()->set_text(TTR("Open Asset Library"));
 		open_templates->connect("confirmed", callable_mp(this, &ProjectManager::_open_asset_library));
 		add_child(open_templates);
+
+		about = memnew(EditorAbout);
+		add_child(about);
 	}
 
 	_load_recent_projects();
diff --git a/editor/project_manager.h b/editor/project_manager.h
index d13315c022..a66b7c4ab6 100644
--- a/editor/project_manager.h
+++ b/editor/project_manager.h
@@ -31,6 +31,7 @@
 #ifndef PROJECT_MANAGER_H
 #define PROJECT_MANAGER_H
 
+#include "editor/editor_about.h"
 #include "editor/plugins/asset_library_editor_plugin.h"
 #include "scene/gui/dialogs.h"
 #include "scene/gui/file_dialog.h"
@@ -62,6 +63,7 @@ class ProjectManager : public Control {
 	Button *rename_btn;
 	Button *erase_btn;
 	Button *erase_missing_btn;
+	Button *about_btn;
 
 	EditorAssetLibrary *asset_library;
 
@@ -78,6 +80,7 @@ class ProjectManager : public Control {
 	ConfirmationDialog *multi_scan_ask;
 	ConfirmationDialog *ask_update_settings;
 	ConfirmationDialog *open_templates;
+	EditorAbout *about;
 
 	HBoxContainer *settings_hb;
 
@@ -100,6 +103,7 @@ class ProjectManager : public Control {
 	void _erase_missing_projects();
 	void _erase_project_confirm();
 	void _erase_missing_projects_confirm();
+	void _show_about();
 	void _update_project_buttons();
 	void _language_selected(int p_id);
 	void _restart_confirm();
diff --git a/editor/project_settings_editor.cpp b/editor/project_settings_editor.cpp
index de7996eaa2..faec3355ac 100644
--- a/editor/project_settings_editor.cpp
+++ b/editor/project_settings_editor.cpp
@@ -102,10 +102,9 @@ void ProjectSettingsEditor::_add_setting() {
 	String setting = _get_setting_name();
 
 	// Initialize the property with the default value for the given type.
-	// The type list starts at 1 (as we exclude Nil), so add 1 to the selected value.
 	Callable::CallError ce;
 	Variant value;
-	Variant::construct(Variant::Type(type->get_selected() + 1), value, nullptr, 0, ce);
+	Variant::construct(Variant::Type(type->get_selected_id()), value, nullptr, 0, ce);
 
 	undo_redo->create_action(TTR("Add Project Setting"));
 	undo_redo->add_do_property(ps, setting, value);
@@ -584,7 +583,7 @@ ProjectSettingsEditor::ProjectSettingsEditor(EditorData *p_data) {
 			// There's no point in adding Nil types, and Object types
 			// can't be serialized correctly in the project settings.
 			if (i != Variant::NIL && i != Variant::OBJECT) {
-				type->add_item(Variant::get_type_name(Variant::Type(i)));
+				type->add_item(Variant::get_type_name(Variant::Type(i)), i);
 			}
 		}
 
diff --git a/editor/property_editor.cpp b/editor/property_editor.cpp
index 0a4f432e4a..1a010b9168 100644
--- a/editor/property_editor.cpp
+++ b/editor/property_editor.cpp
@@ -832,6 +832,9 @@ bool CustomPropertyEditor::edit(Object *p_owner, const String &p_name, Variant::
 				} else if (default_color_mode == 2) {
 					color_picker->set_raw_mode(true);
 				}
+
+				int picker_shape = EDITOR_GET("interface/inspector/default_color_picker_shape");
+				color_picker->set_picker_shape((ColorPicker::PickerShapeType)picker_shape);
 			}
 
 			color_picker->show();
diff --git a/editor/rename_dialog.cpp b/editor/rename_dialog.cpp
index b51524b299..0f15d4b119 100644
--- a/editor/rename_dialog.cpp
+++ b/editor/rename_dialog.cpp
@@ -632,7 +632,7 @@ void RenameDialog::_insert_text(String text) {
 
 	if (_is_main_field(focus_owner)) {
 		focus_owner->selection_delete();
-		focus_owner->append_at_cursor(text);
+		focus_owner->insert_text_at_caret(text);
 		_update_preview();
 	}
 }
diff --git a/editor/scene_tree_dock.cpp b/editor/scene_tree_dock.cpp
index c62e9cbe5f..49c231de69 100644
--- a/editor/scene_tree_dock.cpp
+++ b/editor/scene_tree_dock.cpp
@@ -33,6 +33,7 @@
 #include "core/config/project_settings.h"
 #include "core/input/input.h"
 #include "core/io/resource_saver.h"
+#include "core/object/message_queue.h"
 #include "core/os/keyboard.h"
 #include "editor/debugger/editor_debugger_node.h"
 #include "editor/editor_feature_profile.h"
@@ -140,7 +141,11 @@ void SceneTreeDock::instance_scenes(const Vector<String> &p_files, Node *p_paren
 		parent = scene_tree->get_selected();
 	}
 
-	if (!parent || !edited_scene) {
+	if (!parent) {
+		parent = edited_scene;
+	}
+
+	if (!parent) {
 		if (p_files.size() == 1) {
 			accept->set_text(TTR("No parent to instance a child at."));
 		} else {
@@ -3012,7 +3017,16 @@ void SceneTreeDock::_bind_methods() {
 	ADD_SIGNAL(MethodInfo("node_created", PropertyInfo(Variant::OBJECT, "node", PROPERTY_HINT_RESOURCE_TYPE, "Node")));
 }
 
+SceneTreeDock *SceneTreeDock::singleton = nullptr;
+
+void SceneTreeDock::_update_configuration_warning() { // Static hook assigned to Resource::_update_configuration_warning in the constructor.
+	if (singleton) { // Set in the constructor, reset to nullptr in the destructor.
+		MessageQueue::get_singleton()->push_callable(callable_mp(singleton->scene_tree, &SceneTreeEditor::update_warning)); // Pushed onto the MessageQueue instead of being called directly.
+	}
+}
+
 SceneTreeDock::SceneTreeDock(EditorNode *p_editor, Node *p_scene_root, EditorSelection *p_editor_selection, EditorData &p_editor_data) {
+	singleton = this;
 	set_name("Scene");
 	editor = p_editor;
 	edited_scene = nullptr;
@@ -3095,6 +3109,7 @@ SceneTreeDock::SceneTreeDock(EditorNode *p_editor, Node *p_scene_root, EditorSel
 	edit_remote->set_h_size_flags(SIZE_EXPAND_FILL);
 	edit_remote->set_text(TTR("Remote"));
 	edit_remote->set_toggle_mode(true);
+	edit_remote->set_tooltip(TTR("If selected, the Remote scene tree dock will cause the project to stutter every time it updates.\nSwitch back to the Local scene tree dock to improve performance."));
 	edit_remote->connect("pressed", callable_mp(this, &SceneTreeDock::_remote_tree_selected));
 
 	edit_local = memnew(Button);
@@ -3202,9 +3217,12 @@ SceneTreeDock::SceneTreeDock(EditorNode *p_editor, Node *p_scene_root, EditorSel
 	EDITOR_DEF("interface/editors/show_scene_tree_root_selection", true);
 	EDITOR_DEF("interface/editors/derive_script_globals_by_name", true);
 	EDITOR_DEF("_use_favorites_root_selection", false);
+
+	Resource::_update_configuration_warning = _update_configuration_warning;
 }
 
 SceneTreeDock::~SceneTreeDock() {
+	singleton = nullptr;
 	if (!node_clipboard.is_empty()) {
 		_clear_clipboard();
 	}
diff --git a/editor/scene_tree_dock.h b/editor/scene_tree_dock.h
index aa62c93cb5..53f31375f8 100644
--- a/editor/scene_tree_dock.h
+++ b/editor/scene_tree_dock.h
@@ -244,6 +244,9 @@ class SceneTreeDock : public VBoxContainer {
 	bool profile_allow_editing;
 	bool profile_allow_script_editing;
 
+	static SceneTreeDock *singleton;
+	static void _update_configuration_warning();
+
 protected:
 	void _notification(int p_what);
 	static void _bind_methods();
diff --git a/editor/scene_tree_editor.cpp b/editor/scene_tree_editor.cpp
index b5e9aec854..f979f61196 100644
--- a/editor/scene_tree_editor.cpp
+++ b/editor/scene_tree_editor.cpp
@@ -120,7 +120,7 @@ void SceneTreeEditor::_cell_button_pressed(Object *p_item, int p_column, int p_i
 		}
 		undo_redo->commit_action();
 	} else if (p_id == BUTTON_WARNING) {
-		String config_err = n->get_configuration_warning();
+		String config_err = n->get_configuration_warnings_as_string();
 		if (config_err == String()) {
 			return;
 		}
@@ -252,9 +252,9 @@ bool SceneTreeEditor::_add_nodes(Node *p_node, TreeItem *p_parent, bool p_scroll
 
 	if (can_rename) { //should be can edit..
 
-		String warning = p_node->get_configuration_warning();
+		String warning = p_node->get_configuration_warnings_as_string();
 		if (!warning.is_empty()) {
-			item->add_button(0, get_theme_icon("NodeWarning", "EditorIcons"), BUTTON_WARNING, false, TTR("Node configuration warning:") + "\n" + p_node->get_configuration_warning());
+			item->add_button(0, get_theme_icon("NodeWarning", "EditorIcons"), BUTTON_WARNING, false, TTR("Node configuration warning:") + "\n" + warning);
 		}
 
 		int num_connections = p_node->get_persistent_signal_connection_count();
@@ -277,7 +277,7 @@ bool SceneTreeEditor::_add_nodes(Node *p_node, TreeItem *p_parent, bool p_scroll
 		}
 
 		Ref<Texture2D> icon_temp;
-		auto signal_temp = BUTTON_SIGNALS;
+		SceneTreeEditorButton signal_temp = BUTTON_SIGNALS;
 		if (num_connections >= 1 && num_groups >= 1) {
 			icon_temp = get_theme_icon("SignalsAndGroups", "EditorIcons");
 		} else if (num_connections >= 1) {
@@ -665,7 +665,7 @@ void SceneTreeEditor::_notification(int p_what) {
 			get_tree()->connect("tree_process_mode_changed", callable_mp(this, &SceneTreeEditor::_tree_process_mode_changed));
 			get_tree()->connect("node_removed", callable_mp(this, &SceneTreeEditor::_node_removed));
 			get_tree()->connect("node_renamed", callable_mp(this, &SceneTreeEditor::_node_renamed));
-			get_tree()->connect("node_configuration_warning_changed", callable_mp(this, &SceneTreeEditor::_warning_changed));
+			get_tree()->connect("node_configuration_warning_changed", callable_mp(this, &SceneTreeEditor::_warning_changed), varray(), CONNECT_DEFERRED);
 
 			tree->connect("item_collapsed", callable_mp(this, &SceneTreeEditor::_cell_collapsed));
 
@@ -1102,6 +1102,9 @@ void SceneTreeEditor::_rmb_select(const Vector2 &p_pos) {
 	emit_signal("rmb_pressed", tree->get_screen_transform().xform(p_pos));
 }
 
+void SceneTreeEditor::update_warning() { // Public wrapper around _warning_changed().
+	_warning_changed(nullptr); // No specific node is passed.
+}
 void SceneTreeEditor::_warning_changed(Node *p_for_node) {
 	//should use a timer
 	update_timer->start();
diff --git a/editor/scene_tree_editor.h b/editor/scene_tree_editor.h
index fd5157f04f..acd49e8d92 100644
--- a/editor/scene_tree_editor.h
+++ b/editor/scene_tree_editor.h
@@ -43,7 +43,7 @@ class SceneTreeEditor : public Control {
 
 	EditorSelection *editor_selection;
 
-	enum {
+	enum SceneTreeEditorButton {
 		BUTTON_SUBSCENE = 0,
 		BUTTON_VISIBILITY = 1,
 		BUTTON_SCRIPT = 2,
@@ -157,6 +157,8 @@ public:
 
 	Tree *get_scene_tree() { return tree; }
 
+	void update_warning();
+
 	SceneTreeEditor(bool p_label = true, bool p_can_rename = false, bool p_can_open_instance = false);
 	~SceneTreeEditor();
 };
diff --git a/editor/script_create_dialog.cpp b/editor/script_create_dialog.cpp
index b707f6c353..f3addd8904 100644
--- a/editor/script_create_dialog.cpp
+++ b/editor/script_create_dialog.cpp
@@ -87,8 +87,8 @@ void ScriptCreateDialog::_path_hbox_sorted() {
 
 		// First set cursor to the end of line to scroll LineEdit view
 		// to the right and then set the actual cursor position.
-		file_path->set_cursor_position(file_path->get_text().length());
-		file_path->set_cursor_position(filename_start_pos);
+		file_path->set_caret_column(file_path->get_text().length());
+		file_path->set_caret_column(filename_start_pos);
 
 		file_path->grab_focus();
 	}
@@ -238,6 +238,14 @@ String ScriptCreateDialog::_validate_path(const String &p_path, bool p_file_must
 	return "";
 }
 
+String ScriptCreateDialog::_get_class_name() const { // Returns the class name to use for the script being created.
+	if (has_named_classes) {
+		return class_name->get_text(); // Taken as-is from the class_name input field.
+	} else {
+		return ProjectSettings::get_singleton()->localize_path(file_path->get_text()).get_file().get_basename(); // Otherwise derived from the path field: localized path -> file component -> base name.
+	}
+}
+
 void ScriptCreateDialog::_class_name_changed(const String &p_name) {
 	if (_validate_class(class_name->get_text())) {
 		is_class_name_valid = true;
@@ -287,13 +295,7 @@ void ScriptCreateDialog::ok_pressed() {
 }
 
 void ScriptCreateDialog::_create_new() {
-	String cname_param;
-
-	if (has_named_classes) {
-		cname_param = class_name->get_text();
-	} else {
-		cname_param = ProjectSettings::get_singleton()->localize_path(file_path->get_text()).get_file().get_basename();
-	}
+	String cname_param = _get_class_name();
 
 	Ref<Script> scr;
 	if (script_template != "") {
@@ -555,7 +557,7 @@ void ScriptCreateDialog::_file_selected(const String &p_file) {
 		String filename = p.get_file().get_basename();
 		int select_start = p.rfind(filename);
 		file_path->select(select_start, select_start + filename.length());
-		file_path->set_cursor_position(select_start + filename.length());
+		file_path->set_caret_column(select_start + filename.length());
 		file_path->grab_focus();
 	}
 }
@@ -687,6 +689,10 @@ void ScriptCreateDialog::_update_dialog() {
 
 	builtin_warning_label->set_visible(is_built_in);
 
+	// Check if the script name is the same as the parent class.
+	// This warning isn't relevant if the script is built-in.
+	script_name_warning_label->set_visible(!is_built_in && _get_class_name() == parent_name->get_text());
+
 	if (is_built_in) {
 		get_ok_button()->set_text(TTR("Create"));
 		parent_name->set_editable(true);
@@ -768,6 +774,14 @@ ScriptCreateDialog::ScriptCreateDialog() {
 	builtin_warning_label->set_autowrap(true);
 	builtin_warning_label->hide();
 
+	script_name_warning_label = memnew(Label);
+	script_name_warning_label->set_text(
+			TTR("Warning: Having the script name be the same as a built-in type is usually not desired."));
+	vb->add_child(script_name_warning_label);
+	script_name_warning_label->add_theme_color_override("font_color", Color(1, 0.85, 0.4));
+	script_name_warning_label->set_autowrap(true);
+	script_name_warning_label->hide();
+
 	status_panel = memnew(PanelContainer);
 	status_panel->set_h_size_flags(Control::SIZE_FILL);
 	status_panel->add_child(vb);
diff --git a/editor/script_create_dialog.h b/editor/script_create_dialog.h
index e898b6f927..d6417b9d33 100644
--- a/editor/script_create_dialog.h
+++ b/editor/script_create_dialog.h
@@ -50,6 +50,7 @@ class ScriptCreateDialog : public ConfirmationDialog {
 	Label *error_label;
 	Label *path_error_label;
 	Label *builtin_warning_label;
+	Label *script_name_warning_label;
 	PanelContainer *status_panel;
 	LineEdit *parent_name;
 	Button *parent_browse_button;
@@ -110,6 +111,7 @@ class ScriptCreateDialog : public ConfirmationDialog {
 	bool _validate_parent(const String &p_string);
 	bool _validate_class(const String &p_string);
 	String _validate_path(const String &p_path, bool p_file_must_exist);
+	String _get_class_name() const;
 	void _class_name_changed(const String &p_name);
 	void _parent_name_changed(const String &p_parent);
 	void _template_changed(int p_template = 0);
diff --git a/editor/settings_config_dialog.cpp b/editor/settings_config_dialog.cpp
index 3be2136a20..81af4996ed 100644
--- a/editor/settings_config_dialog.cpp
+++ b/editor/settings_config_dialog.cpp
@@ -393,9 +393,10 @@ void EditorSettingsDialog::_shortcut_button_pressed(Object *p_item, int p_column
 	TreeItem *ti = Object::cast_to<TreeItem>(p_item);
 	ERR_FAIL_COND(!ti);
 
+	button_idx = p_idx;
+
 	if (ti->get_metadata(0) == "Common") {
 		// Editing a Built-in action, which can have multiple bindings.
-		button_idx = p_idx;
 		editing_action = true;
 		current_action = ti->get_text(0);
 
diff --git a/editor/translations/af.po b/editor/translations/af.po
index a60466f417..1b952d51b6 100644
--- a/editor/translations/af.po
+++ b/editor/translations/af.po
@@ -7616,6 +7616,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10926,6 +10931,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/ar.po b/editor/translations/ar.po
index f4b65c0065..9051e2cf82 100644
--- a/editor/translations/ar.po
+++ b/editor/translations/ar.po
@@ -48,12 +48,15 @@
 # bruvzg <bruvzg13@gmail.com>, 2020.
 # StarlkYT <mrsstarlkps4@gmail.com>, 2020, 2021.
 # Games Toon <xxtvgoodxx@gmail.com>, 2021.
+# Kareem Abduljaleel <karemjaleel34@gmail.com>, 2021.
+# ILG - Game <moegypt277@gmail.com>, 2021.
+# Hatim Jamal <hatimjamal8@gmail.com>, 2021.
 msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
 "POT-Creation-Date: \n"
-"PO-Revision-Date: 2021-03-07 06:04+0000\n"
-"Last-Translator: StarlkYT <mrsstarlkps4@gmail.com>\n"
+"PO-Revision-Date: 2021-04-16 07:52+0000\n"
+"Last-Translator: Hatim Jamal <hatimjamal8@gmail.com>\n"
 "Language-Team: Arabic <https://hosted.weblate.org/projects/godot-engine/"
 "godot/ar/>\n"
 "Language: ar\n"
@@ -62,7 +65,7 @@ msgstr ""
 "Content-Transfer-Encoding: 8bit\n"
 "Plural-Forms: nplurals=6; plural=n==0 ? 0 : n==1 ? 1 : n==2 ? 2 : n%100>=3 "
 "&& n%100<=10 ? 3 : n%100>=11 ? 4 : 5;\n"
-"X-Generator: Weblate 4.5.1\n"
+"X-Generator: Weblate 4.6-dev\n"
 
 #: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
 #: modules/visual_script/visual_script_builtin_funcs.cpp
@@ -275,7 +278,7 @@ msgstr "تكرار الرسوم المتحركة"
 #: editor/animation_track_editor.cpp
 #: modules/visual_script/visual_script_editor.cpp
 msgid "Functions:"
-msgstr "الإعدادات:"
+msgstr "المهام:"
 
 #: editor/animation_track_editor.cpp
 msgid "Audio Clips:"
@@ -385,7 +388,7 @@ msgstr "حذف مسار التحريك"
 
 #: editor/animation_track_editor.cpp
 msgid "Create NEW track for %s and insert key?"
-msgstr "أنشئ مسار جديد لـ %s و إدخال مفتاح؟"
+msgstr "أنشئ مسار جديد ل %s و إدخال مفتاح؟"
 
 #: editor/animation_track_editor.cpp
 msgid "Create %d NEW tracks and insert keys?"
@@ -998,11 +1001,11 @@ msgstr "الوصف:"
 
 #: editor/dependency_editor.cpp
 msgid "Search Replacement For:"
-msgstr "البحث عن بديل لـ:"
+msgstr "البحث عن بديل ل:"
 
 #: editor/dependency_editor.cpp
 msgid "Dependencies For:"
-msgstr "تبعيات لـ:"
+msgstr "تبعيات ل:"
 
 #: editor/dependency_editor.cpp
 msgid ""
@@ -1394,7 +1397,7 @@ msgstr "تحريك مسار الصوت"
 
 #: editor/editor_audio_buses.cpp
 msgid "Save Audio Bus Layout As..."
-msgstr "حفظ تخطيط مسار الصوت كـ…"
+msgstr "حفظ تخطيط مسار الصوت ك…"
 
 #: editor/editor_audio_buses.cpp
 msgid "Location for New Layout..."
@@ -1703,7 +1706,7 @@ msgstr "رصيف العُقد"
 
 #: editor/editor_feature_profile.cpp
 msgid "FileSystem Dock"
-msgstr "قوائم نظام الملفات"
+msgstr "إرساء نظام الملفات"
 
 #: editor/editor_feature_profile.cpp
 msgid "Import Dock"
@@ -1788,7 +1791,7 @@ msgstr "جديد"
 #: editor/editor_feature_profile.cpp editor/editor_node.cpp
 #: editor/project_manager.cpp
 msgid "Import"
-msgstr "إستيراد"
+msgstr "استيراد"
 
 #: editor/editor_feature_profile.cpp editor/project_export.cpp
 msgid "Export"
@@ -2018,7 +2021,7 @@ msgstr "التعليمات على الإنترنت"
 
 #: editor/editor_help.cpp
 msgid "Properties"
-msgstr "خصائص"
+msgstr "خاصيات"
 
 #: editor/editor_help.cpp
 msgid "override:"
@@ -2453,7 +2456,7 @@ msgstr "يتطلب حفظ المشهد توافر عُقدة رئيسة."
 
 #: editor/editor_node.cpp
 msgid "Save Scene As..."
-msgstr "حفظ المشهد كـ…"
+msgstr "حفظ المشهد ك…"
 
 #: editor/editor_node.cpp editor/scene_tree_dock.cpp
 msgid "This operation can't be done without a scene."
@@ -2554,15 +2557,12 @@ msgid "Unable to enable addon plugin at: '%s' parsing of config failed."
 msgstr "غير قادر علي تفعيل إضافة البرنامج المُساعد في: '%s' تحميل الظبط فشل."
 
 #: editor/editor_node.cpp
-#, fuzzy
 msgid "Unable to find script field for addon plugin at: '%s'."
-msgstr ""
-"غير قادر علي إيجاد منطقة النص البرمجي من أجل إضافة البرنامج في: 'res://"
-"addons/%s'."
+msgstr "غير قادر على إيجاد منطقة النص البرمجي من أجل إضافة البرنامج في: '%s'."
 
 #: editor/editor_node.cpp
 msgid "Unable to load addon script from path: '%s'."
-msgstr "غير قادر علي تحميل النص البرمجي للإضافة من المسار:  '%s'."
+msgstr "غير قادر علي تحميل النص البرمجي للإضافة من المسار: '%s'."
 
 #: editor/editor_node.cpp
 msgid ""
@@ -2851,6 +2851,12 @@ msgid ""
 "mobile device).\n"
 "You don't need to enable it to use the GDScript debugger locally."
 msgstr ""
+"عند تمكين هذا الخيار ، سيؤدي استخدام النشر بنقرة واحدة إلى إجراء المحاولة "
+"القابلة للتنفيذ للاتصال ب IP الخاص بهذا الكمبيوتر بحيث يمكن تصحيح أخطاء "
+"المشروع الجاري تشغيله.\n"
+"الغرض من هذا الخيار هو استخدامه لتصحيح الأخطاء عن بُعد (عادةً باستخدام جهاز "
+"محمول).\n"
+"لا تحتاج إلى تمكينه لاستخدام مصحح أخطاء GDScript محليًا."
 
 #: editor/editor_node.cpp
 #, fuzzy
@@ -2858,7 +2864,6 @@ msgid "Small Deploy with Network Filesystem"
 msgstr "نشر مصغر مع نظام شبكات الملفات"
 
 #: editor/editor_node.cpp
-#, fuzzy
 msgid ""
 "When this option is enabled, using one-click deploy for Android will only "
 "export an executable without the project data.\n"
@@ -2878,7 +2883,6 @@ msgid "Visible Collision Shapes"
 msgstr "أشكال إصطدام ظاهرة"
 
 #: editor/editor_node.cpp
-#, fuzzy
 msgid ""
 "When this option is enabled, collision shapes and raycast nodes (for 2D and "
 "3D) will be visible in the running project."
@@ -3078,7 +3082,7 @@ msgstr "نظام الملفات"
 
 #: editor/editor_node.cpp
 msgid "Inspector"
-msgstr "المُراقب"
+msgstr "مُتفحص"
 
 #: editor/editor_node.cpp
 msgid "Expand Bottom Panel"
@@ -3086,7 +3090,7 @@ msgstr "توسيع التبويب السفلي"
 
 #: editor/editor_node.cpp
 msgid "Output"
-msgstr "الخرج"
+msgstr "المخرجات"
 
 #: editor/editor_node.cpp
 msgid "Don't Save"
@@ -3693,7 +3697,7 @@ msgstr "الحالة: إستيراد الملف فشل. من فضلك أصلح �
 #: editor/filesystem_dock.cpp
 msgid ""
 "Importing has been disabled for this file, so it can't be opened for editing."
-msgstr ""
+msgstr "تم تعطيل الاستيراد لهذا الملف ، لذا لا يمكن فتحه للتحرير."
 
 #: editor/filesystem_dock.cpp
 msgid "Cannot move/rename resources root."
@@ -3740,6 +3744,8 @@ msgid ""
 "\n"
 "Do you wish to overwrite them?"
 msgstr ""
+"تتعارض الملفات أو المجلدات التالية مع العناصر الموجودة في الموقع الهدف\n"
+"هل ترغب في الكتابة عليها؟"
 
 #: editor/filesystem_dock.cpp
 msgid "Renaming file:"
@@ -3771,7 +3777,7 @@ msgstr "فتح المَشاهِد"
 
 #: editor/filesystem_dock.cpp
 msgid "Instance"
-msgstr "نموذج"
+msgstr "كائن"
 
 #: editor/filesystem_dock.cpp
 msgid "Add to Favorites"
@@ -4087,9 +4093,8 @@ msgid "Select Importer"
 msgstr "تحديد الوضع"
 
 #: editor/import_defaults_editor.cpp
-#, fuzzy
 msgid "Importer:"
-msgstr "إستيراد"
+msgstr "المستورد:"
 
 #: editor/import_defaults_editor.cpp
 #, fuzzy
@@ -4098,7 +4103,7 @@ msgstr "تحميل الإفتراضي"
 
 #: editor/import_dock.cpp
 msgid "Keep File (No Import)"
-msgstr ""
+msgstr "الاحتفاظ بالملف (بدون استيراد)"
 
 #: editor/import_dock.cpp
 msgid "%d Files"
@@ -4180,7 +4185,7 @@ msgstr "إفتح في المساعدة"
 
 #: editor/inspector_dock.cpp
 msgid "Create a new resource in memory and edit it."
-msgstr "إنشاء مورد جديد في الذاكرة وتعديله."
+msgstr "انشاء مورد جديد فى الذاكرة و تعديله."
 
 #: editor/inspector_dock.cpp
 msgid "Load an existing resource from disk and edit it."
@@ -4208,7 +4213,7 @@ msgstr "خصائص العنصر."
 
 #: editor/inspector_dock.cpp
 msgid "Filter properties"
-msgstr "تصفية الخصائص"
+msgstr "خصائص التصفية"
 
 #: editor/inspector_dock.cpp
 msgid "Changes may be lost!"
@@ -4335,7 +4340,7 @@ msgstr "إضافة نقطة العقدة"
 #: editor/plugins/animation_blend_space_1d_editor.cpp
 #: editor/plugins/animation_blend_space_2d_editor.cpp
 msgid "Add Animation Point"
-msgstr "أضف نقطة حركة"
+msgstr "أضف نقطة الرسوم المتحركة"
 
 #: editor/plugins/animation_blend_space_1d_editor.cpp
 msgid "Remove BlendSpace1D Point"
@@ -4580,7 +4585,7 @@ msgstr "مسح الرسم المتحرك"
 
 #: editor/plugins/animation_player_editor_plugin.cpp
 msgid "Invalid animation name!"
-msgstr "إسم الرسم المتحرك خاطئ!"
+msgstr "اسم حركة خاطئ!"
 
 #: editor/plugins/animation_player_editor_plugin.cpp
 msgid "Animation name already exists!"
@@ -4805,7 +4810,7 @@ msgstr "لم يتم تعيين موارد التشغيل في المسار: %s."
 
 #: editor/plugins/animation_state_machine_editor.cpp
 msgid "Node Removed"
-msgstr "مُسِحت العقدة"
+msgstr "تمت إزالة الكائن"
 
 #: editor/plugins/animation_state_machine_editor.cpp
 msgid "Transition Removed"
@@ -5222,32 +5227,33 @@ msgstr "لا يمكن انشاء خرائط الضوء, تاكد من ان ال�
 
 #: editor/plugins/baked_lightmap_editor_plugin.cpp
 msgid "Failed determining lightmap size. Maximum lightmap size too small?"
-msgstr ""
+msgstr "فشل تحديد حجم الخريطة الضوئية. الحجم الأقصى للخريطة المضيئة صغير جدًا؟"
 
 #: editor/plugins/baked_lightmap_editor_plugin.cpp
 msgid ""
 "Some mesh is invalid. Make sure the UV2 channel values are contained within "
 "the [0.0,1.0] square region."
 msgstr ""
+"بعض الشبكات غير صالحة. تأكد من احتواء قيم قناة UV2 داخل منطقة مربعة "
+"[0.0،1.0]."
 
 #: editor/plugins/baked_lightmap_editor_plugin.cpp
 msgid ""
 "Godot editor was built without ray tracing support, lightmaps can't be baked."
-msgstr ""
+msgstr "تم تجميع محرر Godot دون دعم لتتبع الأشعة. لا يمكن بناء خرائط الإضاءة."
 
 #: editor/plugins/baked_lightmap_editor_plugin.cpp
 msgid "Bake Lightmaps"
 msgstr "إعداد خرائط الضوء"
 
 #: editor/plugins/baked_lightmap_editor_plugin.cpp
-#, fuzzy
 msgid "Select lightmap bake file:"
-msgstr "حدد ملف القالب"
+msgstr "حدد ملف الخريطة الضوئية:"
 
 #: editor/plugins/camera_editor_plugin.cpp
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Preview"
-msgstr "استعراض"
+msgstr "عرض"
 
 #: editor/plugins/canvas_item_editor_plugin.cpp
 msgid "Configure Snap"
@@ -5311,7 +5317,7 @@ msgstr "إنشاء موجه عمودي وأفقي جديد"
 
 #: editor/plugins/canvas_item_editor_plugin.cpp
 msgid "Set CanvasItem \"%s\" Pivot Offset to (%d, %d)"
-msgstr ""
+msgstr "تعيين إزاحة CanvasItem \"%s\" المحورية إلى (%d, %d)"
 
 #: editor/plugins/canvas_item_editor_plugin.cpp
 #, fuzzy
@@ -5330,7 +5336,7 @@ msgstr "تحريك العنصر القماشي"
 
 #: editor/plugins/canvas_item_editor_plugin.cpp
 msgid "Scale Node2D \"%s\" to (%s, %s)"
-msgstr ""
+msgstr "تعديل حجم العقدة Node2D \"%s\" إلى (%s, %s)"
 
 #: editor/plugins/canvas_item_editor_plugin.cpp
 msgid "Resize Control \"%s\" to (%d, %d)"
@@ -5785,11 +5791,11 @@ msgstr "إخلاء الوضع"
 
 #: editor/plugins/canvas_item_editor_plugin.cpp
 msgid "Multiply grid step by 2"
-msgstr "ضاعف خطوة الشبكة بـ 2"
+msgstr "ضاعف خطوة الشبكة ب 2"
 
 #: editor/plugins/canvas_item_editor_plugin.cpp
 msgid "Divide grid step by 2"
-msgstr "قسم خطوة الشبكة بـ 2"
+msgstr "قسم خطوة الشبكة ب 2"
 
 #: editor/plugins/canvas_item_editor_plugin.cpp
 msgid "Pan View"
@@ -6095,7 +6101,7 @@ msgstr "أنشئ الحد"
 
 #: editor/plugins/mesh_instance_editor_plugin.cpp
 msgid "Mesh"
-msgstr "مجسم"
+msgstr "مجسّم"
 
 #: editor/plugins/mesh_instance_editor_plugin.cpp
 msgid "Create Trimesh Static Body"
@@ -6711,7 +6717,7 @@ msgstr "تمكين المحاذاة"
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
 msgid "Grid"
-msgstr "الشبكة"
+msgstr "شبكة"
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
 msgid "Show Grid"
@@ -7556,6 +7562,11 @@ msgstr "تدوير الرؤية مقفول"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -8339,7 +8350,7 @@ msgstr "الإطباق Occlusion"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Navigation"
-msgstr "التنقل"
+msgstr "تنقل"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Bitmask"
@@ -8391,7 +8402,7 @@ msgstr "نسخ قناع-البِت."
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Paste bitmask."
-msgstr "لصق قناع-البِت"
+msgstr "لصق قناع البِت"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Erase bitmask."
@@ -10507,14 +10518,12 @@ msgid "Replace:"
 msgstr "إستبدال:"
 
 #: editor/rename_dialog.cpp
-#, fuzzy
 msgid "Prefix:"
-msgstr "بادئة"
+msgstr "بادئة:"
 
 #: editor/rename_dialog.cpp
-#, fuzzy
 msgid "Suffix:"
-msgstr "لاحقة"
+msgstr "لاحقة:"
 
 #: editor/rename_dialog.cpp
 msgid "Use Regular Expressions"
@@ -10561,9 +10570,8 @@ msgid "Per-level Counter"
 msgstr "العداد وفق-المستوى"
 
 #: editor/rename_dialog.cpp
-#, fuzzy
 msgid "If set, the counter restarts for each group of child nodes."
-msgstr "إذا تم تحديده فإن العداد سيعيد البدء لكل مجموعة من العُقد الأبناء"
+msgstr "إذا تم تحديده فإن العداد سيعيد البدء لكل مجموعة من العُقد الأبناء."
 
 #: editor/rename_dialog.cpp
 msgid "Initial value for the counter"
@@ -10622,9 +10630,8 @@ msgid "Reset"
 msgstr "إعادة تعيين"
 
 #: editor/rename_dialog.cpp
-#, fuzzy
 msgid "Regular Expression Error:"
-msgstr "خطأ ذو علاقة بالتعبير الاعتيادي Regular Expression"
+msgstr "خطأ في التعبير العادي:"
 
 #: editor/rename_dialog.cpp
 msgid "At character %s"
@@ -10693,9 +10700,8 @@ msgid "Instance Child Scene"
 msgstr "نمذجة المشهد الابن"
 
 #: editor/scene_tree_dock.cpp
-#, fuzzy
 msgid "Can't paste root node into the same scene."
-msgstr "لا يمكن تنفيذ الإجراء على عُقدة من مشهد أجنبي!"
+msgstr "لا يمكن لصق عقدة الجذر في نفس المشهد."
 
 #: editor/scene_tree_dock.cpp
 #, fuzzy
@@ -10770,7 +10776,7 @@ msgstr "لا يمكن تنفيذ هذا الإجراء على المشاهد ا�
 
 #: editor/scene_tree_dock.cpp
 msgid "Save New Scene As..."
-msgstr "احفظ المشهد الجديد كـ..."
+msgstr "احفظ المشهد الجديد ك..."
 
 #: editor/scene_tree_dock.cpp
 msgid ""
@@ -10946,6 +10952,13 @@ msgid "Remote"
 msgstr "عن بعد"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "محلي"
 
@@ -11456,11 +11469,11 @@ msgstr "مكتبة GDNativeLibrary"
 
 #: modules/gdnative/gdnative_library_singleton_editor.cpp
 msgid "Enabled GDNative Singleton"
-msgstr "تمكين نمط البرمجة Singleton لـ GDNative"
+msgstr "تمكين نمط البرمجة Singleton ل GDNative"
 
 #: modules/gdnative/gdnative_library_singleton_editor.cpp
 msgid "Disabled GDNative Singleton"
-msgstr "تعطيل نمط البرمجة Singleton لـ GDNative"
+msgstr "تعطيل نمط البرمجة Singleton ل GDNative"
 
 #: modules/gdnative/gdnative_library_singleton_editor.cpp
 msgid "Library"
@@ -11669,9 +11682,8 @@ msgid "Post processing"
 msgstr "المعالجة-اللاحقة Post-Process"
 
 #: modules/lightmapper_cpu/lightmapper_cpu.cpp
-#, fuzzy
 msgid "Plotting lightmaps"
-msgstr "تخطيط الإضاءات:"
+msgstr "تخطيط الإضاءات"
 
 #: modules/mono/csharp_script.cpp
 msgid "Class name can't be a reserved keyword"
@@ -12260,21 +12272,21 @@ msgid ""
 "\"Degrees Of Freedom\" is only valid when \"Xr Mode\" is \"Oculus Mobile VR"
 "\"."
 msgstr ""
-"\"Degrees Of Freedom\" تكون صالحة فقط عندما يكون وضع الـ \"Xr Mode\"هو "
+"\"Degrees Of Freedom\" تكون صالحة فقط عندما يكون وضع ال \"Xr Mode\"هو "
 "\"Oculus Mobile VR\"."
 
 #: platform/android/export/export.cpp
 msgid ""
 "\"Hand Tracking\" is only valid when \"Xr Mode\" is \"Oculus Mobile VR\"."
 msgstr ""
-"\"Hand Tracking\" تكون صالحة فقط عندما يكون وضع الـ \"Xr Mode\"هو \"Oculus "
+"\"Hand Tracking\" تكون صالحة فقط عندما يكون وضع ال \"Xr Mode\"هو \"Oculus "
 "Mobile VR\"."
 
 #: platform/android/export/export.cpp
 msgid ""
 "\"Focus Awareness\" is only valid when \"Xr Mode\" is \"Oculus Mobile VR\"."
 msgstr ""
-"\"Focus Awareness\" تكون صالحة فقط عندما يكون وضع الـ \"Xr Mode\"هو \"Oculus "
+"\"Focus Awareness\" تكون صالحة فقط عندما يكون وضع ال \"Xr Mode\"هو \"Oculus "
 "Mobile VR\"."
 
 #: platform/android/export/export.cpp
@@ -12725,9 +12737,8 @@ msgid "Finding meshes and lights"
 msgstr ""
 
 #: scene/3d/baked_lightmap.cpp
-#, fuzzy
 msgid "Preparing geometry (%d/%d)"
-msgstr "توزيع الأشكال الهندسية..."
+msgstr "تحضير الهندسة (%d/%d)"
 
 #: scene/3d/baked_lightmap.cpp
 #, fuzzy
@@ -12788,8 +12799,8 @@ msgid ""
 "A shape must be provided for CollisionShape to function. Please create a "
 "shape resource for it."
 msgstr ""
-"يجب توفير شكل لـ CollisionShape2D بإحدى الأشكال (من نوع Shape2D) لتعمل "
-"بالشكل المطلوب. الرجاء تكوين و ضبط الشكل لها."
+"يجب توفير شكل ل CollisionShape2D بإحدى الأشكال (من نوع Shape2D) لتعمل بالشكل "
+"المطلوب. الرجاء تكوين و ضبط الشكل لها."
 
 #: scene/3d/collision_shape.cpp
 msgid ""
@@ -13084,9 +13095,8 @@ msgid "Must use a valid extension."
 msgstr "يجب أن يستخدم صيغة صحيحة."
 
 #: scene/gui/graph_edit.cpp
-#, fuzzy
 msgid "Enable grid minimap."
-msgstr "تمكين المحاذاة"
+msgstr "تفعيل الخريطة المصغرة للشبكة."
 
 #: scene/gui/popup.cpp
 msgid ""
@@ -13099,8 +13109,7 @@ msgstr ""
 
 #: scene/gui/range.cpp
 msgid "If \"Exp Edit\" is enabled, \"Min Value\" must be greater than 0."
-msgstr ""
-"إذا تم تفعيل الـ\"Exp Edit\" يجب على ان يكون \"Min Value\" اعلى من صفر."
+msgstr "إذا تم تفعيل ال\"Exp Edit\" يجب على ان يكون \"Min Value\" اعلى من صفر."
 
 #: scene/gui/scroll_container.cpp
 msgid ""
diff --git a/editor/translations/bg.po b/editor/translations/bg.po
index cb2b9e1bd2..548d71df18 100644
--- a/editor/translations/bg.po
+++ b/editor/translations/bg.po
@@ -7358,6 +7358,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10565,6 +10570,13 @@ msgid "Remote"
 msgstr "Отдалечен"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/bn.po b/editor/translations/bn.po
index c8d082fbd5..21144a829b 100644
--- a/editor/translations/bn.po
+++ b/editor/translations/bn.po
@@ -8058,6 +8058,11 @@ msgstr "তথ্য দেখুন"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11574,6 +11579,13 @@ msgid "Remote"
 msgstr "অপসারণ করুন"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 #, fuzzy
 msgid "Local"
 msgstr "ঘটনাস্থল"
diff --git a/editor/translations/br.po b/editor/translations/br.po
index 29f9cd2d79..4d03911bbe 100644
--- a/editor/translations/br.po
+++ b/editor/translations/br.po
@@ -7285,6 +7285,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10474,6 +10479,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/ca.po b/editor/translations/ca.po
index ca28ea5eaf..01e60b0fac 100644
--- a/editor/translations/ca.po
+++ b/editor/translations/ca.po
@@ -7691,6 +7691,11 @@ msgid "View Rotation Locked"
 msgstr "Rotació de la Vista Bloquejada"
 
 #: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
 #, fuzzy
 msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
@@ -11241,6 +11246,13 @@ msgid "Remote"
 msgstr "Remot"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Local"
 
diff --git a/editor/translations/cs.po b/editor/translations/cs.po
index 17e44a4863..79163c835f 100644
--- a/editor/translations/cs.po
+++ b/editor/translations/cs.po
@@ -29,8 +29,8 @@ msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
 "POT-Creation-Date: \n"
-"PO-Revision-Date: 2021-04-05 14:28+0000\n"
-"Last-Translator: ProfJack <profjackcz@gmail.com>\n"
+"PO-Revision-Date: 2021-04-21 07:35+0000\n"
+"Last-Translator: Zbyněk <zbynek.fiala@gmail.com>\n"
 "Language-Team: Czech <https://hosted.weblate.org/projects/godot-engine/godot/"
 "cs/>\n"
 "Language: cs\n"
@@ -38,7 +38,7 @@ msgstr ""
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
 "Plural-Forms: nplurals=3; plural=(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2;\n"
-"X-Generator: Weblate 4.6-dev\n"
+"X-Generator: Weblate 4.7-dev\n"
 
 #: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
 #: modules/visual_script/visual_script_builtin_funcs.cpp
@@ -7104,7 +7104,7 @@ msgstr "Malá písmena"
 
 #: editor/plugins/script_text_editor.cpp editor/plugins/text_editor.cpp
 msgid "Capitalize"
-msgstr "Velká písmena"
+msgstr "Velká Písmena"
 
 #: editor/plugins/script_text_editor.cpp editor/plugins/text_editor.cpp
 msgid "Syntax Highlighter"
@@ -7168,7 +7168,7 @@ msgstr "Duplikovat dolů"
 
 #: editor/plugins/script_text_editor.cpp
 msgid "Complete Symbol"
-msgstr "Kompletní symbol"
+msgstr "Doplnit symbol"
 
 #: editor/plugins/script_text_editor.cpp
 msgid "Evaluate Selection"
@@ -7533,6 +7533,11 @@ msgstr "Rotace pohledu uzamknuta"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10899,6 +10904,13 @@ msgid "Remote"
 msgstr "Vzdálený"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Místní"
 
diff --git a/editor/translations/da.po b/editor/translations/da.po
index 7de7e428c5..01d6dbc42e 100644
--- a/editor/translations/da.po
+++ b/editor/translations/da.po
@@ -7800,6 +7800,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11178,6 +11183,13 @@ msgid "Remote"
 msgstr "Fjern"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/de.po b/editor/translations/de.po
index f70522a365..9b49a15db4 100644
--- a/editor/translations/de.po
+++ b/editor/translations/de.po
@@ -7647,6 +7647,11 @@ msgstr "Sichtrotation gesperrt"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11045,6 +11050,13 @@ msgid "Remote"
 msgstr "Fern"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Lokal"
 
diff --git a/editor/translations/editor.pot b/editor/translations/editor.pot
index 2c0294e8b8..1c44e9dd5c 100644
--- a/editor/translations/editor.pot
+++ b/editor/translations/editor.pot
@@ -7263,6 +7263,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10452,6 +10457,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/el.po b/editor/translations/el.po
index 2aa33c39aa..4648f83a72 100644
--- a/editor/translations/el.po
+++ b/editor/translations/el.po
@@ -7610,6 +7610,11 @@ msgstr "Κλείδωμα Περιστροφής"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11014,6 +11019,13 @@ msgid "Remote"
 msgstr "Απομακρυσμένο"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Τοπικό"
 
diff --git a/editor/translations/eo.po b/editor/translations/eo.po
index 3cb57c4089..3fe7877be0 100644
--- a/editor/translations/eo.po
+++ b/editor/translations/eo.po
@@ -11,18 +11,20 @@
 # Cristian Yepez <cristianyepez@gmail.com>, 2020.
 # BinotaLIU <me@binota.org>, 2020.
 # Jakub Fabijan <animatorzPolski@gmail.com>, 2021.
+# mourning20s <mourning20s@protonmail.com>, 2021.
+# Manuel González <mgoopazo@gmail.com>, 2021.
 msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
-"PO-Revision-Date: 2021-02-21 10:51+0000\n"
-"Last-Translator: Jakub Fabijan <animatorzPolski@gmail.com>\n"
+"PO-Revision-Date: 2021-04-19 22:33+0000\n"
+"Last-Translator: mourning20s <mourning20s@protonmail.com>\n"
 "Language-Team: Esperanto <https://hosted.weblate.org/projects/godot-engine/"
 "godot/eo/>\n"
 "Language: eo\n"
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8-bit\n"
 "Plural-Forms: nplurals=2; plural=n != 1;\n"
-"X-Generator: Weblate 4.5\n"
+"X-Generator: Weblate 4.7-dev\n"
 
 #: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
 #: modules/visual_script/visual_script_builtin_funcs.cpp
@@ -505,7 +507,7 @@ msgstr "Averto: Redaktanti importis animadon"
 
 #: editor/animation_track_editor.cpp
 msgid "Select an AnimationPlayer node to create and edit animations."
-msgstr ""
+msgstr "Selektu AnimationPlayer nodo por krei kaj redakti animadoj."
 
 #: editor/animation_track_editor.cpp
 msgid "Only show tracks from nodes selected in tree."
@@ -688,19 +690,16 @@ msgid "Line Number:"
 msgstr "Lineo-Numeron:"
 
 #: editor/code_editor.cpp
-#, fuzzy
 msgid "%d replaced."
-msgstr "Anstataŭigi..."
+msgstr "%d anstataŭiĝis."
 
 #: editor/code_editor.cpp editor/editor_help.cpp
-#, fuzzy
 msgid "%d match."
-msgstr "Trovis %d matĉo(j)n."
+msgstr "%d rekono."
 
 #: editor/code_editor.cpp editor/editor_help.cpp
-#, fuzzy
 msgid "%d matches."
-msgstr "Trovis %d matĉo(j)n."
+msgstr "%d rekonoj."
 
 #: editor/code_editor.cpp editor/find_in_files.cpp
 msgid "Match Case"
@@ -729,7 +728,7 @@ msgstr "Norma"
 
 #: editor/code_editor.cpp editor/plugins/script_editor_plugin.cpp
 msgid "Toggle Scripts Panel"
-msgstr ""
+msgstr "Baskuli Skriptoj Panelo"
 
 #: editor/code_editor.cpp editor/plugins/canvas_item_editor_plugin.cpp
 #: editor/plugins/texture_region_editor_plugin.cpp
@@ -760,17 +759,15 @@ msgid "Method in target node must be specified."
 msgstr "Metodo en celo nodo devas esti specifita."
 
 #: editor/connections_dialog.cpp
-#, fuzzy
 msgid "Method name must be a valid identifier."
-msgstr "Metodo en celo nodo devas esti specifita."
+msgstr "La nomo de la metodo devas esti valida identigilo."
 
 #: editor/connections_dialog.cpp
-#, fuzzy
 msgid ""
 "Target method not found. Specify a valid method or attach a script to the "
 "target node."
 msgstr ""
-"Celo metodo maltrovita. Indiku ekzistanta metodo aŭ ligu la skripto al celo "
+"Cela metodo maltrovitas. Specifu valida metodo aŭ ligu skripto al la cela "
 "nodo."
 
 #: editor/connections_dialog.cpp
@@ -816,7 +813,7 @@ msgstr "Aldona argumentoj de alvoko:"
 
 #: editor/connections_dialog.cpp
 msgid "Receiver Method:"
-msgstr ""
+msgstr "Ricevila metodo:"
 
 #: editor/connections_dialog.cpp
 msgid "Advanced"
@@ -898,20 +895,19 @@ msgstr "Redakti Konekton:"
 
 #: editor/connections_dialog.cpp
 msgid "Are you sure you want to remove all connections from the \"%s\" signal?"
-msgstr ""
+msgstr "Ĉu vi certe volas forigi ĉiajn konektojn el la \"%s\" signalo?"
 
 #: editor/connections_dialog.cpp editor/editor_help.cpp editor/node_dock.cpp
 msgid "Signals"
 msgstr "Signaloj"
 
 #: editor/connections_dialog.cpp
-#, fuzzy
 msgid "Filter signals"
-msgstr "Filtri nodojn"
+msgstr "Filtri signalojn"
 
 #: editor/connections_dialog.cpp
 msgid "Are you sure you want to remove all connections from this signal?"
-msgstr ""
+msgstr "Ĉu vi certe volas forigi ĉiajn konektojn el ĉi tiu signalo?"
 
 #: editor/connections_dialog.cpp
 msgid "Disconnect All"
@@ -944,7 +940,7 @@ msgstr "Favoritaj:"
 
 #: editor/create_dialog.cpp editor/editor_file_dialog.cpp
 msgid "Recent:"
-msgstr ""
+msgstr "Lastatempe:"
 
 #: editor/create_dialog.cpp editor/plugins/script_editor_plugin.cpp
 #: editor/property_selector.cpp editor/quick_open.cpp editor/rename_dialog.cpp
@@ -968,23 +964,27 @@ msgstr "Priskribo:"
 
 #: editor/dependency_editor.cpp
 msgid "Search Replacement For:"
-msgstr ""
+msgstr "Serĉi anstataŭigo por:"
 
 #: editor/dependency_editor.cpp
 msgid "Dependencies For:"
-msgstr ""
+msgstr "Dependoj por:"
 
 #: editor/dependency_editor.cpp
 msgid ""
 "Scene '%s' is currently being edited.\n"
 "Changes will only take effect when reloaded."
 msgstr ""
+"La sceno '%s' redaktadas nune.\n"
+"Ŝanĝoj nur efektiviĝos je reŝargo."
 
 #: editor/dependency_editor.cpp
 msgid ""
 "Resource '%s' is in use.\n"
 "Changes will only take effect when reloaded."
 msgstr ""
+"La risurco '%s' en uzo.\n"
+"Ŝanĝoj nur efektiviĝos je reŝargo."
 
 #: editor/dependency_editor.cpp
 #: modules/gdnative/gdnative_library_editor_plugin.cpp
@@ -1006,15 +1006,15 @@ msgstr "Dependecoj:"
 
 #: editor/dependency_editor.cpp
 msgid "Fix Broken"
-msgstr ""
+msgstr "Ripari difekta"
 
 #: editor/dependency_editor.cpp
 msgid "Dependency Editor"
-msgstr ""
+msgstr "Redaktilo de Dependoj"
 
 #: editor/dependency_editor.cpp
 msgid "Search Replacement Resource:"
-msgstr ""
+msgstr "Serĉi anstataŭiga risurco:"
 
 #: editor/dependency_editor.cpp editor/editor_file_dialog.cpp
 #: editor/editor_help_search.cpp editor/editor_node.cpp
@@ -1024,17 +1024,19 @@ msgstr ""
 #: modules/visual_script/visual_script_property_selector.cpp
 #: scene/gui/file_dialog.cpp
 msgid "Open"
-msgstr ""
+msgstr "Malfermi"
 
 #: editor/dependency_editor.cpp
 msgid "Owners Of:"
-msgstr ""
+msgstr "Proprietuloj de:"
 
 #: editor/dependency_editor.cpp
 msgid ""
 "Remove selected files from the project? (no undo)\n"
 "You can find the removed files in the system trash to restore them."
 msgstr ""
+"Forigi selektajn dosierojn el la projekto? (ne malfaro)\n"
+"Vi povas trovi la forigajn dosierojn en la sistema rubujo por restaŭri ilin."
 
 #: editor/dependency_editor.cpp
 msgid ""
@@ -1043,46 +1045,49 @@ msgid ""
 "Remove them anyway? (no undo)\n"
 "You can find the removed files in the system trash to restore them."
 msgstr ""
+"La forigotaj dosieroj bezonas por ke aliaj risurcoj funkciadi.\n"
+"Forigu ilin iel? (ne malfaro)\n"
+"Vi povas trovi la forigajn dosierojn en la sistema rubujo por restaŭri ilin."
 
 #: editor/dependency_editor.cpp
 msgid "Cannot remove:"
-msgstr ""
+msgstr "Ne povas forigi:"
 
 #: editor/dependency_editor.cpp
 msgid "Error loading:"
-msgstr ""
+msgstr "Eraro dum ŝargado:"
 
 #: editor/dependency_editor.cpp
 msgid "Load failed due to missing dependencies:"
-msgstr ""
+msgstr "Ŝargado malplenumis pro mankaj dependoj:"
 
 #: editor/dependency_editor.cpp editor/editor_node.cpp
 msgid "Open Anyway"
-msgstr ""
+msgstr "Malfermi iel"
 
 #: editor/dependency_editor.cpp
 msgid "Which action should be taken?"
-msgstr ""
+msgstr "Kiu ago devu preni?"
 
 #: editor/dependency_editor.cpp
 msgid "Fix Dependencies"
-msgstr ""
+msgstr "Ripari dependojn"
 
 #: editor/dependency_editor.cpp
 msgid "Errors loading!"
-msgstr ""
+msgstr "Eraroj dum ŝargado!"
 
 #: editor/dependency_editor.cpp
 msgid "Permanently delete %d item(s)? (No undo!)"
-msgstr ""
+msgstr "Permanente forigi %d elemento(j)n? (Ne malfaro!)"
 
 #: editor/dependency_editor.cpp
 msgid "Show Dependencies"
-msgstr ""
+msgstr "Vidigi Dependojn"
 
 #: editor/dependency_editor.cpp
 msgid "Orphan Resource Explorer"
-msgstr ""
+msgstr "Esploranto de orfaj risurcoj"
 
 #: editor/dependency_editor.cpp editor/editor_audio_buses.cpp
 #: editor/editor_file_dialog.cpp editor/editor_node.cpp
@@ -1090,98 +1095,98 @@ msgstr ""
 #: editor/plugins/sprite_frames_editor_plugin.cpp editor/project_export.cpp
 #: editor/project_settings_editor.cpp editor/scene_tree_dock.cpp
 msgid "Delete"
-msgstr ""
+msgstr "Forigi"
 
 #: editor/dependency_editor.cpp
 msgid "Owns"
-msgstr ""
+msgstr "Posede"
 
 #: editor/dependency_editor.cpp
 msgid "Resources Without Explicit Ownership:"
-msgstr ""
+msgstr "Risurcoj sen eksplicita proprieto:"
 
 #: editor/dictionary_property_edit.cpp
 msgid "Change Dictionary Key"
-msgstr ""
+msgstr "Ŝanĝi la vortaran ŝlosilon"
 
 #: editor/dictionary_property_edit.cpp
 msgid "Change Dictionary Value"
-msgstr ""
+msgstr "Ŝanĝi la vortaran valoron"
 
 #: editor/editor_about.cpp
 msgid "Thanks from the Godot community!"
-msgstr ""
+msgstr "Dankon de la komunumo de Godot!"
 
 #: editor/editor_about.cpp
 msgid "Godot Engine contributors"
-msgstr ""
+msgstr "Kontribuantoj de Godot Engine"
 
 #: editor/editor_about.cpp
 msgid "Project Founders"
-msgstr ""
+msgstr "Fondintoj de la Projekto"
 
 #: editor/editor_about.cpp
 msgid "Lead Developer"
-msgstr ""
+msgstr "Ĉefprogramisto"
 
 #. TRANSLATORS: This refers to a job title.
 #. The trailing space is used to distinguish with the project list application,
 #. you do not have to keep it in your translation.
 #: editor/editor_about.cpp
 msgid "Project Manager "
-msgstr ""
+msgstr "Projektadministrilo "
 
 #: editor/editor_about.cpp
 msgid "Developers"
-msgstr ""
+msgstr "Programistoj"
 
 #: editor/editor_about.cpp
 msgid "Authors"
-msgstr ""
+msgstr "Aŭtoroj"
 
 #: editor/editor_about.cpp
 msgid "Platinum Sponsors"
-msgstr ""
+msgstr "Platenaj Sponsoroj"
 
 #: editor/editor_about.cpp
 msgid "Gold Sponsors"
-msgstr ""
+msgstr "Oraj Sponsoroj"
 
 #: editor/editor_about.cpp
 msgid "Silver Sponsors"
-msgstr ""
+msgstr "Arĝentaj Sponsoroj"
 
 #: editor/editor_about.cpp
 msgid "Bronze Sponsors"
-msgstr ""
+msgstr "Bronzaj Sponsoroj"
 
 #: editor/editor_about.cpp
 msgid "Mini Sponsors"
-msgstr ""
+msgstr "Minisponsoroj"
 
 #: editor/editor_about.cpp
 msgid "Gold Donors"
-msgstr ""
+msgstr "Oraj Donacantoj"
 
 #: editor/editor_about.cpp
 msgid "Silver Donors"
-msgstr ""
+msgstr "Arĝentaj Donacantoj"
 
 #: editor/editor_about.cpp
 msgid "Bronze Donors"
-msgstr ""
+msgstr "Bronzaj Donacantoj"
 
 #: editor/editor_about.cpp
 msgid "Donors"
-msgstr ""
+msgstr "Donacantoj"
 
 #: editor/editor_about.cpp
 msgid "License"
-msgstr ""
+msgstr "Permesilo"
 
 #: editor/editor_about.cpp
 msgid "Third-party Licenses"
-msgstr ""
+msgstr "Permesiloj de eksteraj liverantoj"
 
 #: editor/editor_about.cpp
 msgid ""
@@ -1190,181 +1195,185 @@ msgid ""
 "is an exhaustive list of all such third-party components with their "
 "respective copyright statements and license terms."
 msgstr ""
+"Godot Engine fidas al multe liberaj kaj malfermitkodaj bibliotekoj de "
+"ekstera liveranto, ĉio kongruas kun la kondiĉoj de ĝia MIT-permesilo. La "
+"jenoj estas elĉerpa listo de ĉiom tiaj komponantoj de ekstera liveranto kun "
+"iliaj kopirajtaj atentigoj kaj permesilaj kondiĉoj respektive."
 
 #: editor/editor_about.cpp
 msgid "All Components"
-msgstr ""
+msgstr "Ĉiaj komponantoj"
 
 #: editor/editor_about.cpp
 msgid "Components"
-msgstr ""
+msgstr "Komponantoj"
 
 #: editor/editor_about.cpp
 msgid "Licenses"
-msgstr ""
+msgstr "Permesiloj"
 
 #: editor/editor_asset_installer.cpp editor/project_manager.cpp
 msgid "Error opening package file, not in ZIP format."
-msgstr ""
+msgstr "Eraro dum malfermi la pakaĵan dosieron, ne de ZIP formato."
 
 #: editor/editor_asset_installer.cpp
 msgid "%s (Already Exists)"
-msgstr ""
+msgstr "%s (jam ekzistante)"
 
 #: editor/editor_asset_installer.cpp
 msgid "Uncompressing Assets"
-msgstr ""
+msgstr "Maldensigas havaĵojn"
 
 #: editor/editor_asset_installer.cpp editor/project_manager.cpp
 msgid "The following files failed extraction from package:"
-msgstr ""
+msgstr "La jenaj dosieroj malplenumis malkompaktigi el la pakaĵo:"
 
 #: editor/editor_asset_installer.cpp
 msgid "And %s more files."
-msgstr ""
+msgstr "Kaj %s pli dosieroj."
 
 #: editor/editor_asset_installer.cpp editor/project_manager.cpp
 msgid "Package installed successfully!"
-msgstr ""
+msgstr "Pakaĵo instalis sukcese!"
 
 #: editor/editor_asset_installer.cpp
 #: editor/plugins/asset_library_editor_plugin.cpp
 msgid "Success!"
-msgstr "Sukcesis!"
+msgstr "Sukcese!"
 
 #: editor/editor_asset_installer.cpp
 msgid "Package Contents:"
-msgstr ""
+msgstr "Enhavo de pakaĵo:"
 
 #: editor/editor_asset_installer.cpp editor/editor_node.cpp
 msgid "Install"
-msgstr ""
+msgstr "Instali"
 
 #: editor/editor_asset_installer.cpp
 msgid "Package Installer"
-msgstr ""
+msgstr "Pakaĵa instalilo"
 
 #: editor/editor_audio_buses.cpp
 msgid "Speakers"
-msgstr ""
+msgstr "Laŭtparoliloj"
 
 #: editor/editor_audio_buses.cpp
 msgid "Add Effect"
-msgstr ""
+msgstr "Aldoni efekton"
 
 #: editor/editor_audio_buses.cpp
 msgid "Rename Audio Bus"
-msgstr ""
+msgstr "Renomi aŭdia buso"
 
 #: editor/editor_audio_buses.cpp
 msgid "Change Audio Bus Volume"
-msgstr ""
+msgstr "Ŝanĝi la laŭtecon de la aŭdia buso"
 
 #: editor/editor_audio_buses.cpp
 msgid "Toggle Audio Bus Solo"
-msgstr ""
+msgstr "Baskuli la sola reĝimo de la aŭdia buso"
 
 #: editor/editor_audio_buses.cpp
 msgid "Toggle Audio Bus Mute"
-msgstr ""
+msgstr "Baskuli la muta reĝimo de la aŭdia buso"
 
 #: editor/editor_audio_buses.cpp
+#, fuzzy
 msgid "Toggle Audio Bus Bypass Effects"
-msgstr ""
+msgstr "Baskuli preterpasajn efektojn de aŭdia buso"
 
 #: editor/editor_audio_buses.cpp
 msgid "Select Audio Bus Send"
-msgstr ""
+msgstr "Elekti senditon de aŭdia buso"
 
 #: editor/editor_audio_buses.cpp
 msgid "Add Audio Bus Effect"
-msgstr ""
+msgstr "Aldoni efekton de aŭdia buso"
 
 #: editor/editor_audio_buses.cpp
 msgid "Move Bus Effect"
-msgstr ""
+msgstr "Movi busan efekton"
 
 #: editor/editor_audio_buses.cpp
 msgid "Delete Bus Effect"
-msgstr ""
+msgstr "Forigi busan efekton"
 
 #: editor/editor_audio_buses.cpp
 msgid "Drag & drop to rearrange."
-msgstr ""
+msgstr "Ŝovi kaj demeti por rearanĝi."
 
 #: editor/editor_audio_buses.cpp
 msgid "Solo"
-msgstr ""
+msgstr "Solo"
 
 #: editor/editor_audio_buses.cpp
 msgid "Mute"
-msgstr ""
+msgstr "Mute"
 
 #: editor/editor_audio_buses.cpp
 msgid "Bypass"
-msgstr ""
+msgstr "Preterpase"
 
 #: editor/editor_audio_buses.cpp
 msgid "Bus options"
-msgstr ""
+msgstr "Busaj agordoj"
 
 #: editor/editor_audio_buses.cpp editor/filesystem_dock.cpp
 #: editor/plugins/animation_player_editor_plugin.cpp editor/scene_tree_dock.cpp
 msgid "Duplicate"
-msgstr ""
+msgstr "Duobligi"
 
 #: editor/editor_audio_buses.cpp
 msgid "Reset Volume"
-msgstr ""
+msgstr "Rekomencigi la laŭtecon"
 
 #: editor/editor_audio_buses.cpp
 msgid "Delete Effect"
-msgstr ""
+msgstr "Forigi la efekton"
 
 #: editor/editor_audio_buses.cpp
 msgid "Audio"
-msgstr ""
+msgstr "Aŭdio"
 
 #: editor/editor_audio_buses.cpp
 msgid "Add Audio Bus"
-msgstr ""
+msgstr "Aldoni aŭdian buson"
 
 #: editor/editor_audio_buses.cpp
 msgid "Master bus can't be deleted!"
-msgstr ""
+msgstr "La ĉefa buso ne forigeblas!"
 
 #: editor/editor_audio_buses.cpp
 msgid "Delete Audio Bus"
-msgstr ""
+msgstr "Forigi aŭdian buson"
 
 #: editor/editor_audio_buses.cpp
 msgid "Duplicate Audio Bus"
-msgstr ""
+msgstr "Duobligi aŭdian buson"
 
 #: editor/editor_audio_buses.cpp
 msgid "Reset Bus Volume"
-msgstr ""
+msgstr "Rekomencigi la laŭtecon de buso"
 
 #: editor/editor_audio_buses.cpp
 msgid "Move Audio Bus"
-msgstr ""
+msgstr "Movi aŭdian buson"
 
 #: editor/editor_audio_buses.cpp
 msgid "Save Audio Bus Layout As..."
-msgstr ""
+msgstr "Konservi aranĝon de aŭdia buso kiel..."
 
 #: editor/editor_audio_buses.cpp
 msgid "Location for New Layout..."
-msgstr ""
+msgstr "Dosierlokigo por nova aranĝo..."
 
 #: editor/editor_audio_buses.cpp
 msgid "Open Audio Bus Layout"
-msgstr ""
+msgstr "Malfermi aranĝon de aŭdia buso"
 
 #: editor/editor_audio_buses.cpp
-#, fuzzy
 msgid "There is no '%s' file."
-msgstr "Neniu '%s' dosiero."
+msgstr "Estas neniu dosiero '%s'."
 
 #: editor/editor_audio_buses.cpp editor/plugins/canvas_item_editor_plugin.cpp
 msgid "Layout"
@@ -1372,12 +1381,11 @@ msgstr "Aranĝo"
 
 #: editor/editor_audio_buses.cpp
 msgid "Invalid file, not an audio bus layout."
-msgstr ""
+msgstr "Malvalida dosiero, ne estas aranĝo de aŭdia buso."
 
 #: editor/editor_audio_buses.cpp
-#, fuzzy
 msgid "Error saving file: %s"
-msgstr "Eraro dum ŝargante tiparon."
+msgstr "Eraris konservi dosieron: %s"
 
 #: editor/editor_audio_buses.cpp
 msgid "Add Bus"
@@ -1385,97 +1393,97 @@ msgstr "Aldoni Buso"
 
 #: editor/editor_audio_buses.cpp
 msgid "Add a new Audio Bus to this layout."
-msgstr ""
+msgstr "Aldonu novan Aŭdobuson al ĉi tiu aranĝo."
 
 #: editor/editor_audio_buses.cpp editor/editor_properties.cpp
 #: editor/plugins/animation_player_editor_plugin.cpp editor/property_editor.cpp
 #: editor/script_create_dialog.cpp
 msgid "Load"
-msgstr "Ŝarĝi"
+msgstr "Ŝargi"
 
 #: editor/editor_audio_buses.cpp
 msgid "Load an existing Bus Layout."
-msgstr ""
+msgstr "Ŝargi ekzistante busan aranĝon."
 
 #: editor/editor_audio_buses.cpp
 msgid "Save As"
-msgstr ""
+msgstr "Konservi kiel"
 
 #: editor/editor_audio_buses.cpp
 msgid "Save this Bus Layout to a file."
-msgstr ""
+msgstr "Konservi ĉi tiun busan aranĝon al dosiero."
 
 #: editor/editor_audio_buses.cpp editor/import_dock.cpp
 msgid "Load Default"
-msgstr ""
+msgstr "Ŝargi defaŭlto"
 
 #: editor/editor_audio_buses.cpp
 msgid "Load the default Bus Layout."
-msgstr ""
+msgstr "Ŝargi la defaŭlta busaranĝo."
 
 #: editor/editor_audio_buses.cpp
 msgid "Create a new Bus Layout."
-msgstr ""
+msgstr "Krei nova busaranĝo."
 
 #: editor/editor_autoload_settings.cpp
 msgid "Invalid name."
-msgstr ""
+msgstr "Malvalida nomo."
 
 #: editor/editor_autoload_settings.cpp
 msgid "Valid characters:"
-msgstr ""
+msgstr "Validaj signoj:"
 
 #: editor/editor_autoload_settings.cpp
 msgid "Must not collide with an existing engine class name."
-msgstr ""
+msgstr "Ne devu konflikti kun la nomo de motora klaso ekzistante."
 
 #: editor/editor_autoload_settings.cpp
 msgid "Must not collide with an existing built-in type name."
-msgstr ""
+msgstr "Ne devu konflikti kun la nomo de enkonstruita tipo ekzistante."
 
 #: editor/editor_autoload_settings.cpp
 msgid "Must not collide with an existing global constant name."
-msgstr ""
+msgstr "Ne devu konflikti kun la nomo de malloka konstanto ekzistante."
 
 #: editor/editor_autoload_settings.cpp
 msgid "Keyword cannot be used as an autoload name."
-msgstr ""
+msgstr "Ŝlosilvorto ne povas uzi kiel aŭtoŝarga nomo."
 
 #: editor/editor_autoload_settings.cpp
 msgid "Autoload '%s' already exists!"
-msgstr ""
+msgstr "Aŭtoŝarga '%s' jam ekzistas!"
 
 #: editor/editor_autoload_settings.cpp
 msgid "Rename Autoload"
-msgstr ""
+msgstr "Renomi aŭtoŝargon"
 
 #: editor/editor_autoload_settings.cpp
 msgid "Toggle AutoLoad Globals"
-msgstr ""
+msgstr "Baskuli aŭtoŝargajn mallokojn"
 
 #: editor/editor_autoload_settings.cpp
 msgid "Move Autoload"
-msgstr ""
+msgstr "Movi aŭtoŝargon"
 
 #: editor/editor_autoload_settings.cpp
 msgid "Remove Autoload"
-msgstr ""
+msgstr "Forigi aŭtoŝargon"
 
 #: editor/editor_autoload_settings.cpp editor/editor_plugin_settings.cpp
 msgid "Enable"
-msgstr ""
+msgstr "Ŝaltita"
 
 #: editor/editor_autoload_settings.cpp
 msgid "Rearrange Autoloads"
-msgstr ""
+msgstr "Rearanĝi aŭtoŝargojn"
 
 #: editor/editor_autoload_settings.cpp
 msgid "Can't add autoload:"
-msgstr ""
+msgstr "Ne aldoneblas aŭtoŝargon:"
 
 #: editor/editor_autoload_settings.cpp
 msgid "Add AutoLoad"
-msgstr ""
+msgstr "Aldoni aŭtoŝargon"
 
 #: editor/editor_autoload_settings.cpp editor/editor_file_dialog.cpp
 #: editor/editor_plugin_settings.cpp
@@ -1486,7 +1494,7 @@ msgstr "Vojo:"
 
 #: editor/editor_autoload_settings.cpp
 msgid "Node Name:"
-msgstr ""
+msgstr "Nomo de nodo:"
 
 #: editor/editor_autoload_settings.cpp editor/editor_help_search.cpp
 #: editor/editor_profiler.cpp editor/project_manager.cpp
@@ -1496,41 +1504,39 @@ msgstr "Nomo"
 
 #: editor/editor_autoload_settings.cpp
 msgid "Singleton"
-msgstr ""
+msgstr "Unuopo"
 
 #: editor/editor_data.cpp editor/inspector_dock.cpp
 msgid "Paste Params"
-msgstr ""
+msgstr "Alglui parametroj"
 
 #: editor/editor_data.cpp
-#, fuzzy
 msgid "Updating Scene"
-msgstr "Aktualas scenon"
+msgstr "Aktualigas la scenon"
 
 #: editor/editor_data.cpp
 msgid "Storing local changes..."
-msgstr ""
+msgstr "Memoras lokajn ŝanĝojn..."
 
 #: editor/editor_data.cpp
-#, fuzzy
 msgid "Updating scene..."
-msgstr "Aktualas scenon..."
+msgstr "Aktualigas la scenon..."
 
 #: editor/editor_data.cpp editor/editor_properties.cpp
 msgid "[empty]"
-msgstr ""
+msgstr "[malplena]"
 
 #: editor/editor_data.cpp
 msgid "[unsaved]"
-msgstr ""
+msgstr "[ne konservis]"
 
 #: editor/editor_dir_dialog.cpp
 msgid "Please select a base directory first."
-msgstr ""
+msgstr "Bonvolu selekti bazan dosierujon unue."
 
 #: editor/editor_dir_dialog.cpp
 msgid "Choose a Directory"
-msgstr ""
+msgstr "Elekti dosierujon"
 
 #: editor/editor_dir_dialog.cpp editor/editor_file_dialog.cpp
 #: editor/filesystem_dock.cpp editor/project_manager.cpp
@@ -1552,31 +1558,35 @@ msgstr "Ne povis krei dosierujon."
 
 #: editor/editor_dir_dialog.cpp
 msgid "Choose"
-msgstr ""
+msgstr "Elekti"
 
 #: editor/editor_export.cpp
 msgid "Storing File:"
-msgstr ""
+msgstr "Memoras dosieron:"
 
 #: editor/editor_export.cpp
 msgid "No export template found at the expected path:"
-msgstr ""
+msgstr "Ne eksporta ŝablono trovis al la atenda dosierindiko:"
 
 #: editor/editor_export.cpp
 msgid "Packing"
-msgstr ""
+msgstr "Pakas"
 
 #: editor/editor_export.cpp
 msgid ""
 "Target platform requires 'ETC' texture compression for GLES2. Enable 'Import "
 "Etc' in Project Settings."
 msgstr ""
+"Cela platformo bezonas 'ETC' teksturan densigon por GLES2. Ebligu 'Import "
+"Etc' en projektaj agordoj."
 
 #: editor/editor_export.cpp
 msgid ""
 "Target platform requires 'ETC2' texture compression for GLES3. Enable "
 "'Import Etc 2' in Project Settings."
 msgstr ""
+"Cela platformo bezonas 'ETC2' teksturan densigon por GLES3. Ebligu 'Import "
+"Etc 2' en projektaj agordoj."
 
 #: editor/editor_export.cpp
 msgid ""
@@ -1585,18 +1595,26 @@ msgid ""
 "Enable 'Import Etc' in Project Settings, or disable 'Driver Fallback "
 "Enabled'."
 msgstr ""
+"Cela platformo bezonas 'ETC' teksturan densigon por la retrodefaŭlta pelilo "
+"de GLES2.\n"
+"Ebligu 'Import Etc' en projektaj agordoj, aŭ malŝalti 'Driver Fallback "
+"Enabled'."
 
 #: editor/editor_export.cpp
 msgid ""
 "Target platform requires 'PVRTC' texture compression for GLES2. Enable "
 "'Import Pvrtc' in Project Settings."
 msgstr ""
+"Cela platformo bezonas 'PVRTC' teksturan densigon por GLES2. Ebligu 'Import "
+"Pvrtc' en projektaj agordoj."
 
 #: editor/editor_export.cpp
 msgid ""
 "Target platform requires 'ETC2' or 'PVRTC' texture compression for GLES3. "
 "Enable 'Import Etc 2' or 'Import Pvrtc' in Project Settings."
 msgstr ""
+"Cela platformo bezonas 'ETC2' aŭ 'PVRTC' teksturan densigon por GLES3. "
+"Ebligu 'Import Etc 2' aŭ 'Import Pvrtc' en projektaj agordoj."
 
 #: editor/editor_export.cpp
 msgid ""
@@ -1605,6 +1623,10 @@ msgid ""
 "Enable 'Import Pvrtc' in Project Settings, or disable 'Driver Fallback "
 "Enabled'."
 msgstr ""
+"Cela platformo bezonas 'PVRTC' teksturan densigon por la retrodefaŭlta "
+"pelilo de GLES2.\n"
+"Ebligu 'Import Pvrtc' en projektaj agordoj, aŭ malŝalti 'Driver Fallback "
+"Enabled'."
 
 #: editor/editor_export.cpp platform/android/export/export.cpp
 #: platform/iphone/export/export.cpp platform/javascript/export/export.cpp
@@ -1624,10 +1646,9 @@ msgid "Template file not found:"
 msgstr "Ŝablonan dosieron ne trovitis:"
 
 #: editor/editor_export.cpp
-#, fuzzy
 msgid "On 32-bit exports the embedded PCK cannot be bigger than 4 GiB."
 msgstr ""
-"Sur 32-bita eksportoj la enigita PCK ne eblos esti pli granda ol 4 GiB."
+"Sur 32-bita eksportoj la enigita PCK ne eblas esti pli granda ol 4 GiB."
 
 #: editor/editor_feature_profile.cpp
 msgid "3D Editor"
@@ -1643,92 +1664,89 @@ msgstr "Biblioteko de havaĵoj"
 
 #: editor/editor_feature_profile.cpp
 msgid "Scene Tree Editing"
-msgstr ""
+msgstr "Redaktado de scena arbo"
 
 #: editor/editor_feature_profile.cpp
 msgid "Node Dock"
-msgstr ""
+msgstr "Doko de nodo"
 
 #: editor/editor_feature_profile.cpp
-#, fuzzy
 msgid "FileSystem Dock"
-msgstr "Dosiersistemo"
+msgstr "Doko de dosiersistemo"
 
 #: editor/editor_feature_profile.cpp
 msgid "Import Dock"
-msgstr "Enporti dokon"
+msgstr "Doko de enporto"
 
 #: editor/editor_feature_profile.cpp
 msgid "Erase profile '%s'? (no undo)"
-msgstr ""
+msgstr "Viŝi profilon '%s'? (ne malfaro)"
 
 #: editor/editor_feature_profile.cpp
 msgid "Profile must be a valid filename and must not contain '.'"
-msgstr ""
+msgstr "Profilo devas esti valida dosiernomo kaj devas ne enhavi '.'"
 
 #: editor/editor_feature_profile.cpp
 msgid "Profile with this name already exists."
-msgstr ""
+msgstr "Profilo kun tia nomo jam ekzistas."
 
 #: editor/editor_feature_profile.cpp
 msgid "(Editor Disabled, Properties Disabled)"
-msgstr ""
+msgstr "(Redaktilo malŝaltita, Atributoj malŝaltitaj)"
 
 #: editor/editor_feature_profile.cpp
 msgid "(Properties Disabled)"
-msgstr ""
+msgstr "(Atributoj malŝaltitaj)"
 
 #: editor/editor_feature_profile.cpp
 msgid "(Editor Disabled)"
-msgstr ""
+msgstr "(Redaktilo malŝaltita)"
 
 #: editor/editor_feature_profile.cpp
 msgid "Class Options:"
-msgstr ""
+msgstr "Agordoj de klaso:"
 
 #: editor/editor_feature_profile.cpp
 msgid "Enable Contextual Editor"
-msgstr ""
+msgstr "Ŝalti kuntekstan redaktilon"
 
 #: editor/editor_feature_profile.cpp
 msgid "Enabled Properties:"
-msgstr ""
+msgstr "Ŝaltitaj atributoj:"
 
 #: editor/editor_feature_profile.cpp
 msgid "Enabled Features:"
-msgstr ""
+msgstr "Ŝaltitaj eblecoj:"
 
 #: editor/editor_feature_profile.cpp
 msgid "Enabled Classes:"
-msgstr ""
+msgstr "Ŝaltitaj klasoj:"
 
 #: editor/editor_feature_profile.cpp
-#, fuzzy
 msgid "File '%s' format is invalid, import aborted."
-msgstr "Dosierformo de la '%s' estas malvalida, enporto ĉesiĝis."
+msgstr "La dosierformo '%s' estas malvalida, enporto ĉesigis."
 
 #: editor/editor_feature_profile.cpp
 msgid ""
 "Profile '%s' already exists. Remove it first before importing, import "
 "aborted."
-msgstr ""
+msgstr "Profilo '%s' jam ekzistas. Forigu ĝin antaŭ enporti. Enporto ĉesigis."
 
 #: editor/editor_feature_profile.cpp
 msgid "Error saving profile to path: '%s'."
-msgstr ""
+msgstr "Eraras konservi profilon al dosierindiko: '%s'."
 
 #: editor/editor_feature_profile.cpp
 msgid "Unset"
-msgstr ""
+msgstr "Malagordi"
 
 #: editor/editor_feature_profile.cpp
 msgid "Current Profile:"
-msgstr ""
+msgstr "Aktuala profilo:"
 
 #: editor/editor_feature_profile.cpp
-#, fuzzy
 msgid "Make Current"
-msgstr "Nuntempigi"
+msgstr "Farigi aktuale"
 
 #: editor/editor_feature_profile.cpp
 #: editor/plugins/animation_player_editor_plugin.cpp
@@ -1747,35 +1765,35 @@ msgstr "Eksporti"
 
 #: editor/editor_feature_profile.cpp
 msgid "Available Profiles:"
-msgstr ""
+msgstr "Disponeblaj profiloj:"
 
 #: editor/editor_feature_profile.cpp
 msgid "Class Options"
-msgstr ""
+msgstr "Agordoj de klaso"
 
 #: editor/editor_feature_profile.cpp
 msgid "New profile name:"
-msgstr ""
+msgstr "Nomo de nova profilo:"
 
 #: editor/editor_feature_profile.cpp
 msgid "Erase Profile"
-msgstr ""
+msgstr "Viŝi profilon"
 
 #: editor/editor_feature_profile.cpp
 msgid "Godot Feature Profile"
-msgstr ""
+msgstr "Profilo de funkciaro de Godot"
 
 #: editor/editor_feature_profile.cpp
 msgid "Import Profile(s)"
-msgstr ""
+msgstr "Enporti profilo(j)n"
 
 #: editor/editor_feature_profile.cpp
 msgid "Export Profile"
-msgstr ""
+msgstr "Eksporti profilon"
 
 #: editor/editor_feature_profile.cpp
 msgid "Manage Editor Feature Profiles"
-msgstr ""
+msgstr "Administri profilojn de funkciaro de redaktilo"
 
 #: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
 msgid "Select Current Folder"
@@ -1791,7 +1809,7 @@ msgstr "Elekti ĉi tiun dosierujon"
 
 #: editor/editor_file_dialog.cpp editor/filesystem_dock.cpp
 msgid "Copy Path"
-msgstr "Kopii vojo"
+msgstr "Kopii dosierindikon"
 
 #: editor/editor_file_dialog.cpp editor/filesystem_dock.cpp
 msgid "Open in File Manager"
@@ -1848,73 +1866,71 @@ msgstr "Konservi dosieron"
 
 #: editor/editor_file_dialog.cpp
 msgid "Go Back"
-msgstr ""
+msgstr "Posteniri"
 
 #: editor/editor_file_dialog.cpp
 msgid "Go Forward"
-msgstr ""
+msgstr "Antaŭeniri"
 
 #: editor/editor_file_dialog.cpp
 msgid "Go Up"
-msgstr ""
+msgstr "Supreniri"
 
 #: editor/editor_file_dialog.cpp
 msgid "Toggle Hidden Files"
-msgstr ""
+msgstr "Baskuli kaŝitaj dosieroj"
 
 #: editor/editor_file_dialog.cpp
 msgid "Toggle Favorite"
-msgstr ""
+msgstr "Baskuli favorata"
 
 #: editor/editor_file_dialog.cpp
 msgid "Toggle Mode"
-msgstr ""
+msgstr "Baskuli reĝimon"
 
 #: editor/editor_file_dialog.cpp
 msgid "Focus Path"
-msgstr ""
+msgstr "Fokusi al dosierindiko"
 
 #: editor/editor_file_dialog.cpp
 msgid "Move Favorite Up"
-msgstr ""
+msgstr "Suprentiri favoraton"
 
 #: editor/editor_file_dialog.cpp
 msgid "Move Favorite Down"
-msgstr ""
+msgstr "Subentiri favoraton"
 
 #: editor/editor_file_dialog.cpp
-#, fuzzy
 msgid "Go to previous folder."
-msgstr "Iri al Antaŭa Paŝo"
+msgstr "Iri al antaŭa dosierujo."
 
 #: editor/editor_file_dialog.cpp
-#, fuzzy
 msgid "Go to next folder."
-msgstr "Iri al Neksta Paŝo"
+msgstr "Iri al sekva dosierujo."
 
 #: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
 msgid "Go to parent folder."
-msgstr ""
+msgstr "Iri al superdosierujo."
 
 #: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
 msgid "Refresh files."
-msgstr ""
+msgstr "Aktualigi dosieroj."
 
 #: editor/editor_file_dialog.cpp
 msgid "(Un)favorite current folder."
-msgstr ""
+msgstr "(Mal)favoratigi aktualan dosierujon."
 
 #: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
 msgid "Toggle the visibility of hidden files."
-msgstr ""
+msgstr "Baskuli videblo de kaŝitaj dosieroj."
 
 #: editor/editor_file_dialog.cpp editor/filesystem_dock.cpp
 msgid "View items as a grid of thumbnails."
-msgstr ""
+msgstr "Vidigi elementoj kiel krado de miniaturoj."
 
 #: editor/editor_file_dialog.cpp editor/filesystem_dock.cpp
 msgid "View items as a list."
-msgstr ""
+msgstr "Vidigi elementoj kiel listo."
 
 #: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
 msgid "Directories & Files:"
@@ -1924,110 +1940,109 @@ msgstr "Dosierujoj kaj dosieroj:"
 #: editor/plugins/style_box_editor_plugin.cpp
 #: editor/plugins/theme_editor_plugin.cpp editor/rename_dialog.cpp
 msgid "Preview:"
-msgstr ""
+msgstr "Antaŭrigardo:"
 
 #: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
 msgid "File:"
-msgstr ""
+msgstr "Dosiero:"
 
 #: editor/editor_file_system.cpp
 msgid "ScanSources"
-msgstr ""
+msgstr "Esplori fontoj"
 
 #: editor/editor_file_system.cpp
 msgid ""
 "There are multiple importers for different types pointing to file %s, import "
 "aborted"
 msgstr ""
+"Estas pluraj enportiloj por malsamaj tipoj almontri dosieron %s, enporto "
+"ĉesigis"
 
 #: editor/editor_file_system.cpp
 msgid "(Re)Importing Assets"
-msgstr ""
+msgstr "(Re)enportas havaĵoj"
 
 #: editor/editor_help.cpp editor/plugins/spatial_editor_plugin.cpp
 msgid "Top"
-msgstr ""
+msgstr "Supro"
 
 #: editor/editor_help.cpp
 msgid "Class:"
-msgstr ""
+msgstr "Klaso:"
 
 #: editor/editor_help.cpp editor/scene_tree_editor.cpp
 #: editor/script_create_dialog.cpp
 msgid "Inherits:"
-msgstr ""
+msgstr "Heredato:"
 
 #: editor/editor_help.cpp
 msgid "Inherited by:"
-msgstr ""
+msgstr "Heredadas de:"
 
 #: editor/editor_help.cpp
-#, fuzzy
 msgid "Description"
-msgstr "Priskribo:"
+msgstr "Priskribo"
 
 #: editor/editor_help.cpp
 msgid "Online Tutorials"
-msgstr ""
+msgstr "Retaj Instruiloj"
 
 #: editor/editor_help.cpp
 msgid "Properties"
-msgstr ""
+msgstr "Atributoj"
 
 #: editor/editor_help.cpp
+#, fuzzy
 msgid "override:"
-msgstr ""
+msgstr "redifino:"
 
 #: editor/editor_help.cpp
 msgid "default:"
-msgstr ""
+msgstr "defaŭlto:"
 
 #: editor/editor_help.cpp
 msgid "Methods"
-msgstr ""
+msgstr "Metodoj"
 
 #: editor/editor_help.cpp
 msgid "Theme Properties"
-msgstr ""
+msgstr "Etosaj Atributoj"
 
 #: editor/editor_help.cpp
 msgid "Enumerations"
-msgstr ""
+msgstr "Enumeracioj"
 
 #: editor/editor_help.cpp
 msgid "Constants"
-msgstr ""
+msgstr "Konstantoj"
 
 #: editor/editor_help.cpp
 msgid "Property Descriptions"
-msgstr ""
+msgstr "Priskribo de Atributoj"
 
 #: editor/editor_help.cpp
-#, fuzzy
 msgid "(value)"
-msgstr "Valoro:"
+msgstr "(valoro)"
 
 #: editor/editor_help.cpp
-#, fuzzy
 msgid ""
 "There is currently no description for this property. Please help us by "
 "[color=$color][url=$url]contributing one[/url][/color]!"
 msgstr ""
-"Tie aktuale ne estas priskribon por ĉi tiun eco. Bonvolu helpi nin per "
-"[color=$color][url=$url]kontribui oni[/url][/color]!"
+"Estas aktuale ne priskribo por ĉi tiu atributo. Bonvolu helpi nin per [color="
+"$color][url=$url]kontribui unu[/url][/color]!"
 
 #: editor/editor_help.cpp
 msgid "Method Descriptions"
-msgstr ""
+msgstr "Metodaj Priskriboj"
 
 #: editor/editor_help.cpp
-#, fuzzy
 msgid ""
 "There is currently no description for this method. Please help us by [color="
 "$color][url=$url]contributing one[/url][/color]!"
 msgstr ""
-"Tie aktuale ne estas priskribon por ĉi tiun metodo. Bonvolu helpi nin per "
-"[color=$color][url=$url]kontribui oni[/url][/color]!"
+"Estas aktuale ne priskribo por ĉi tiu metodo. Bonvolu helpi nin per [color="
+"$color][url=$url]kontribui unu[/url][/color]!"
 
 #: editor/editor_help_search.cpp editor/editor_node.cpp
 #: editor/plugins/script_editor_plugin.cpp
@@ -2035,93 +2050,89 @@ msgid "Search Help"
 msgstr "Serĉi helpon"
 
 #: editor/editor_help_search.cpp
-#, fuzzy
 msgid "Case Sensitive"
-msgstr "Fermi scenon"
+msgstr "Uskleciva"
 
 #: editor/editor_help_search.cpp
-#, fuzzy
 msgid "Show Hierarchy"
-msgstr "Montri helpantoj"
+msgstr "Montri hierarĥion"
 
 #: editor/editor_help_search.cpp
 msgid "Display All"
-msgstr ""
+msgstr "Vidigi tutan"
 
 #: editor/editor_help_search.cpp
 msgid "Classes Only"
-msgstr ""
+msgstr "Nur klasoj"
 
 #: editor/editor_help_search.cpp
 msgid "Methods Only"
-msgstr ""
+msgstr "Nur metodoj"
 
 #: editor/editor_help_search.cpp
 msgid "Signals Only"
-msgstr ""
+msgstr "Nur signaloj"
 
 #: editor/editor_help_search.cpp
 msgid "Constants Only"
-msgstr ""
+msgstr "Nur konstantoj"
 
 #: editor/editor_help_search.cpp
 msgid "Properties Only"
-msgstr ""
+msgstr "Nur atributoj"
 
 #: editor/editor_help_search.cpp
 msgid "Theme Properties Only"
-msgstr ""
+msgstr "Nur etosaj atributoj"
 
 #: editor/editor_help_search.cpp
 msgid "Member Type"
-msgstr ""
+msgstr "Tipo de membro"
 
 #: editor/editor_help_search.cpp
 msgid "Class"
-msgstr ""
+msgstr "Klaso"
 
 #: editor/editor_help_search.cpp
-#, fuzzy
 msgid "Method"
-msgstr "Iru al metodo"
+msgstr "Metodo"
 
 #: editor/editor_help_search.cpp editor/plugins/script_text_editor.cpp
 msgid "Signal"
-msgstr ""
+msgstr "Signalo"
 
 #: editor/editor_help_search.cpp editor/plugins/theme_editor_plugin.cpp
 msgid "Constant"
-msgstr ""
+msgstr "Konstanto"
 
 #: editor/editor_help_search.cpp
-#, fuzzy
 msgid "Property"
-msgstr "Atributo Vojeto"
+msgstr "Atributo"
 
 #: editor/editor_help_search.cpp
-#, fuzzy
 msgid "Theme Property"
-msgstr "Renomi projekton"
+msgstr "Etosa atributo"
 
 #: editor/editor_inspector.cpp editor/project_settings_editor.cpp
 msgid "Property:"
-msgstr ""
+msgstr "Atributo:"
 
 #: editor/editor_inspector.cpp
 msgid "Set"
-msgstr ""
+msgstr "Agordi"
 
 #: editor/editor_inspector.cpp
 msgid "Set Multiple:"
-msgstr ""
+msgstr "Agordi pluroblan:"
 
 #: editor/editor_log.cpp
+#, fuzzy
 msgid "Output:"
-msgstr ""
+msgstr "Eligo:"
 
 #: editor/editor_log.cpp editor/plugins/tile_map_editor_plugin.cpp
 msgid "Copy Selection"
-msgstr ""
+msgstr "Kopii elektaron"
 
 #: editor/editor_log.cpp editor/editor_network_profiler.cpp
 #: editor/editor_profiler.cpp editor/editor_properties.cpp
@@ -2131,96 +2142,102 @@ msgstr ""
 #: modules/gdnative/gdnative_library_editor_plugin.cpp scene/gui/line_edit.cpp
 #: scene/gui/text_edit.cpp
 msgid "Clear"
-msgstr ""
+msgstr "Vakigi"
 
 #: editor/editor_log.cpp
 msgid "Clear Output"
-msgstr ""
+msgstr "Vakigi eligon"
 
 #: editor/editor_network_profiler.cpp editor/editor_node.cpp
 #: editor/editor_profiler.cpp
 msgid "Stop"
-msgstr ""
+msgstr "Halti"
 
 #: editor/editor_network_profiler.cpp editor/editor_profiler.cpp
 #: editor/plugins/animation_state_machine_editor.cpp editor/rename_dialog.cpp
 msgid "Start"
-msgstr ""
+msgstr "Komenci"
 
 #: editor/editor_network_profiler.cpp
 msgid "%s/s"
-msgstr ""
+msgstr "%s/s"
 
 #: editor/editor_network_profiler.cpp
 msgid "Down"
-msgstr ""
+msgstr "Elŝuta"
 
 #: editor/editor_network_profiler.cpp
 msgid "Up"
-msgstr ""
+msgstr "Alŝuta"
 
 #: editor/editor_network_profiler.cpp editor/editor_node.cpp
 msgid "Node"
-msgstr ""
+msgstr "Nodo"
 
 #: editor/editor_network_profiler.cpp
 msgid "Incoming RPC"
-msgstr ""
+msgstr "Envena RPC"
 
 #: editor/editor_network_profiler.cpp
 msgid "Incoming RSET"
-msgstr ""
+msgstr "Envena RSET"
 
 #: editor/editor_network_profiler.cpp
 msgid "Outgoing RPC"
-msgstr ""
+msgstr "Elira RPC"
 
 #: editor/editor_network_profiler.cpp
 msgid "Outgoing RSET"
-msgstr ""
+msgstr "Elira RSET"
 
 #: editor/editor_node.cpp editor/project_manager.cpp
 msgid "New Window"
-msgstr ""
+msgstr "Nova Fenestro"
 
 #: editor/editor_node.cpp
 msgid "Imported resources can't be saved."
-msgstr ""
+msgstr "Enportitaj risurcoj ne povas konservi."
 
 #: editor/editor_node.cpp editor/plugins/script_editor_plugin.cpp
 #: scene/gui/dialogs.cpp
 msgid "OK"
-msgstr ""
+msgstr "Bone"
 
 #: editor/editor_node.cpp editor/plugins/animation_player_editor_plugin.cpp
 msgid "Error saving resource!"
-msgstr ""
+msgstr "Eraras konservi risurcon!"
 
 #: editor/editor_node.cpp
+#, fuzzy
 msgid ""
 "This resource can't be saved because it does not belong to the edited scene. "
 "Make it unique first."
 msgstr ""
+"Ĉi tiu risurco ne konserveblas, ĉar ĝi ne apartenas la redaktita sceno. "
+"Farigu ĝin unikan unue."
 
 #: editor/editor_node.cpp editor/plugins/animation_player_editor_plugin.cpp
 msgid "Save Resource As..."
-msgstr ""
+msgstr "Konservi risurcon kiel..."
 
 #: editor/editor_node.cpp
 msgid "Can't open file for writing:"
-msgstr ""
+msgstr "Ne malfermeblas dosieron por skribi:"
 
 #: editor/editor_node.cpp
+#, fuzzy
 msgid "Requested file format unknown:"
-msgstr ""
+msgstr "Petitan dosierformon senkonatas:"
 
 #: editor/editor_node.cpp
+#, fuzzy
 msgid "Error while saving."
-msgstr ""
+msgstr "Eraro dum la konservo."
 
 #: editor/editor_node.cpp editor/plugins/script_editor_plugin.cpp
+#, fuzzy
 msgid "Can't open '%s'. The file could have been moved or deleted."
-msgstr ""
+msgstr "Ne malfermeblas '%s'. La dosiero estus movita aŭ forigita."
 
 #: editor/editor_node.cpp
 msgid "Error while parsing '%s'."
@@ -2252,7 +2269,7 @@ msgstr ""
 
 #: editor/editor_node.cpp
 msgid "This operation can't be done without a tree root."
-msgstr ""
+msgstr "Ĉi tiu operacio ne farigeblas sen arbradiko."
 
 #: editor/editor_node.cpp
 msgid ""
@@ -2631,7 +2648,7 @@ msgstr ""
 
 #: editor/editor_node.cpp
 msgid "Scene"
-msgstr ""
+msgstr "Sceno"
 
 #: editor/editor_node.cpp
 msgid "Go to previously opened scene."
@@ -7396,6 +7413,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10190,9 +10212,8 @@ msgid "Batch Rename"
 msgstr ""
 
 #: editor/rename_dialog.cpp
-#, fuzzy
 msgid "Replace:"
-msgstr "Anstataŭigi: "
+msgstr "Anstataŭigi:"
 
 #: editor/rename_dialog.cpp
 msgid "Prefix:"
@@ -10479,19 +10500,19 @@ msgstr ""
 
 #: editor/scene_tree_dock.cpp
 msgid "2D Scene"
-msgstr ""
+msgstr "2D Sceno"
 
 #: editor/scene_tree_dock.cpp
 msgid "3D Scene"
-msgstr ""
+msgstr "3D Sceno"
 
 #: editor/scene_tree_dock.cpp
 msgid "User Interface"
-msgstr ""
+msgstr "Uzanta Interfaco"
 
 #: editor/scene_tree_dock.cpp
 msgid "Other Node"
-msgstr ""
+msgstr "Alia Nodo"
 
 #: editor/scene_tree_dock.cpp
 msgid "Can't operate on nodes from a foreign scene!"
@@ -10503,19 +10524,19 @@ msgstr ""
 
 #: editor/scene_tree_dock.cpp
 msgid "Attach Script"
-msgstr ""
+msgstr "Alligi Skripto"
 
 #: editor/scene_tree_dock.cpp
 msgid "Cut Node(s)"
-msgstr ""
+msgstr "Eltondi nodo(j)n"
 
 #: editor/scene_tree_dock.cpp
 msgid "Remove Node(s)"
-msgstr ""
+msgstr "Forigi nodo(j)n"
 
 #: editor/scene_tree_dock.cpp
 msgid "Change type of node(s)"
-msgstr ""
+msgstr "Ŝanĝi la tipo de nodo(j)n"
 
 #: editor/scene_tree_dock.cpp
 msgid ""
@@ -10533,23 +10554,24 @@ msgstr ""
 
 #: editor/scene_tree_dock.cpp
 msgid "Sub-Resources"
-msgstr ""
+msgstr "Subrisurcoj"
 
 #: editor/scene_tree_dock.cpp
 msgid "Clear Inheritance"
-msgstr ""
+msgstr "Vakigi heredadon"
 
 #: editor/scene_tree_dock.cpp
+#, fuzzy
 msgid "Editable Children"
-msgstr ""
+msgstr "Redakteblaj infanoj"
 
 #: editor/scene_tree_dock.cpp
 msgid "Load As Placeholder"
-msgstr ""
+msgstr "Ŝargi kiel lokokupilo"
 
 #: editor/scene_tree_dock.cpp
 msgid "Open Documentation"
-msgstr ""
+msgstr "Malfermi dokumentaro"
 
 #: editor/scene_tree_dock.cpp
 msgid ""
@@ -10560,31 +10582,34 @@ msgstr ""
 
 #: editor/scene_tree_dock.cpp
 msgid "Add Child Node"
-msgstr ""
+msgstr "Aldoni infanon nodon"
 
 #: editor/scene_tree_dock.cpp
+#, fuzzy
 msgid "Expand/Collapse All"
-msgstr ""
+msgstr "(Mal)etendi tutan"
 
 #: editor/scene_tree_dock.cpp
 msgid "Change Type"
-msgstr ""
+msgstr "Ŝanĝi tipon"
 
 #: editor/scene_tree_dock.cpp
+#, fuzzy
 msgid "Reparent to New Node"
-msgstr ""
+msgstr "Repatri al nova nodo"
 
 #: editor/scene_tree_dock.cpp
 msgid "Make Scene Root"
-msgstr ""
+msgstr "Farigi scena radiko"
 
 #: editor/scene_tree_dock.cpp
+#, fuzzy
 msgid "Merge From Scene"
-msgstr ""
+msgstr "Kunigi el sceno"
 
 #: editor/scene_tree_dock.cpp editor/script_editor_debugger.cpp
 msgid "Save Branch as Scene"
-msgstr ""
+msgstr "Konservi la branĉon kiel sceno"
 
 #: editor/scene_tree_dock.cpp editor/script_editor_debugger.cpp
 msgid "Copy Node Path"
@@ -10592,11 +10617,11 @@ msgstr ""
 
 #: editor/scene_tree_dock.cpp
 msgid "Delete (No Confirm)"
-msgstr ""
+msgstr "Forigi (ne konfirmo)"
 
 #: editor/scene_tree_dock.cpp
 msgid "Add/Create a New Node."
-msgstr ""
+msgstr "Aldoni/Krei nova nodo."
 
 #: editor/scene_tree_dock.cpp
 msgid ""
@@ -10614,83 +10639,102 @@ msgstr ""
 
 #: editor/scene_tree_dock.cpp
 msgid "Remote"
+msgstr "Fora"
+
+#: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
 msgid "Local"
-msgstr ""
+msgstr "Loka"
 
 #: editor/scene_tree_dock.cpp
 msgid "Clear Inheritance? (No Undo!)"
-msgstr ""
+msgstr "Vakigi heredadon? (Ne malfaro!)"
 
 #: editor/scene_tree_editor.cpp
 msgid "Toggle Visible"
-msgstr ""
+msgstr "Baskuli videblon"
 
 #: editor/scene_tree_editor.cpp
 msgid "Unlock Node"
-msgstr ""
+msgstr "Malŝlosi nodon"
 
 #: editor/scene_tree_editor.cpp
 msgid "Button Group"
-msgstr ""
+msgstr "Grupo de butono"
 
 #: editor/scene_tree_editor.cpp
 msgid "(Connecting From)"
-msgstr ""
+msgstr "(Konektas el)"
 
 #: editor/scene_tree_editor.cpp
 msgid "Node configuration warning:"
-msgstr ""
+msgstr "Agorda averto de nodo:"
 
 #: editor/scene_tree_editor.cpp
 msgid ""
 "Node has %s connection(s) and %s group(s).\n"
 "Click to show signals dock."
 msgstr ""
+"La nodo havas %s konekto(j)n kaj %s grupo(j)n.\n"
+"Alklaku por vidigi la dokon de signaloj."
 
 #: editor/scene_tree_editor.cpp
 msgid ""
 "Node has %s connection(s).\n"
 "Click to show signals dock."
 msgstr ""
+"La nodo havas %s konekto(j)n.\n"
+"Alklaku por vidigi la dokon de signaloj."
 
 #: editor/scene_tree_editor.cpp
 msgid ""
 "Node is in %s group(s).\n"
 "Click to show groups dock."
 msgstr ""
+"La nodo havas %s grupo(j)n.\n"
+"Alklaku por vidigi la dokon de grupoj."
 
 #: editor/scene_tree_editor.cpp
 msgid "Open Script:"
-msgstr ""
+msgstr "Malfermi skripton:"
 
 #: editor/scene_tree_editor.cpp
 msgid ""
 "Node is locked.\n"
 "Click to unlock it."
 msgstr ""
+"Nodo ŝlosis.\n"
+"Alklaku por malŝlosi ĝin."
 
 #: editor/scene_tree_editor.cpp
 msgid ""
 "Children are not selectable.\n"
 "Click to make selectable."
 msgstr ""
+"Infanoj ne estas selektebla.\n"
+"Alklaku por farigi selekteblan."
 
 #: editor/scene_tree_editor.cpp
 msgid "Toggle Visibility"
-msgstr ""
+msgstr "Baskuli videblon"
 
 #: editor/scene_tree_editor.cpp
 msgid ""
 "AnimationPlayer is pinned.\n"
 "Click to unpin."
 msgstr ""
+"AnimationPlayer estas kejlita.\n"
+"Alklaku por malkejli."
 
 #: editor/scene_tree_editor.cpp
 msgid "Invalid node name, the following characters are not allowed:"
-msgstr ""
+msgstr "Malvalida nomo de nodo, la jenaj signoj ne permesas:"
 
 #: editor/scene_tree_editor.cpp
 msgid "Rename Node"
@@ -10698,39 +10742,39 @@ msgstr "Renomi nodon"
 
 #: editor/scene_tree_editor.cpp
 msgid "Scene Tree (Nodes):"
-msgstr ""
+msgstr "Scena arbo (nodoj):"
 
 #: editor/scene_tree_editor.cpp
 msgid "Node Configuration Warning!"
-msgstr ""
+msgstr "Agorda averto de nodo!"
 
 #: editor/scene_tree_editor.cpp
 msgid "Select a Node"
-msgstr ""
+msgstr "Elektu nodon"
 
 #: editor/script_create_dialog.cpp
 msgid "Path is empty."
-msgstr ""
+msgstr "La dosierindiko estas malplena."
 
 #: editor/script_create_dialog.cpp
 msgid "Filename is empty."
-msgstr ""
+msgstr "La dosiernomo estas malplena."
 
 #: editor/script_create_dialog.cpp
 msgid "Path is not local."
-msgstr ""
+msgstr "La dosierindiko ne estas loka."
 
 #: editor/script_create_dialog.cpp
 msgid "Invalid base path."
-msgstr "Nevalida dosierindiko."
+msgstr "Malvalida baza dosierindiko."
 
 #: editor/script_create_dialog.cpp
 msgid "A directory with the same name exists."
-msgstr ""
+msgstr "Dosierujo ekzistas kun la sama nomo."
 
 #: editor/script_create_dialog.cpp
 msgid "File does not exist."
-msgstr ""
+msgstr "Dosiero ne ekzistas."
 
 #: editor/script_create_dialog.cpp
 msgid "Invalid extension."
@@ -10738,39 +10782,40 @@ msgstr "Nevalida kromprogramo."
 
 #: editor/script_create_dialog.cpp
 msgid "Wrong extension chosen."
-msgstr ""
+msgstr "Elektinta kromprogramo eraras."
 
 #: editor/script_create_dialog.cpp
 msgid "Error loading template '%s'"
-msgstr ""
+msgstr "Eraris ŝargi ŝablono '%s'"
 
 #: editor/script_create_dialog.cpp
 msgid "Error - Could not create script in filesystem."
-msgstr "Eraro - Ne povis krei skripton en dosiersistemo."
+msgstr "Eraro - Ne povis krei skripton en la dosiersistemo."
 
 #: editor/script_create_dialog.cpp
 msgid "Error loading script from %s"
-msgstr ""
+msgstr "Eraris ŝargi skripton de %s"
 
 #: editor/script_create_dialog.cpp
+#, fuzzy
 msgid "Overrides"
-msgstr ""
+msgstr "Redifinoj"
 
 #: editor/script_create_dialog.cpp
 msgid "N/A"
-msgstr ""
+msgstr "Neaplikebla"
 
 #: editor/script_create_dialog.cpp
 msgid "Open Script / Choose Location"
-msgstr ""
+msgstr "Malfermi skripton / Elekti lokon"
 
 #: editor/script_create_dialog.cpp
 msgid "Open Script"
-msgstr ""
+msgstr "Malfermi skripton"
 
 #: editor/script_create_dialog.cpp
 msgid "File exists, it will be reused."
-msgstr ""
+msgstr "Dosiero ekzistas, ĝi reuziĝos."
 
 #: editor/script_create_dialog.cpp
 msgid "Invalid path."
@@ -10778,212 +10823,208 @@ msgstr "Nevalida dosierindiko."
 
 #: editor/script_create_dialog.cpp
 msgid "Invalid class name."
-msgstr ""
+msgstr "Malvalida nomo de klaso."
 
 #: editor/script_create_dialog.cpp
 msgid "Invalid inherited parent name or path."
-msgstr ""
+msgstr "Malvalida nomo aŭ dosierindiko de heredita gepatro."
 
 #: editor/script_create_dialog.cpp
 msgid "Script path/name is valid."
-msgstr ""
+msgstr "La dosierindiko/nomo de skripto estas valida."
 
 #: editor/script_create_dialog.cpp
 msgid "Allowed: a-z, A-Z, 0-9, _ and ."
-msgstr ""
+msgstr "Permesite: a-z, A-Z, 0-9, _ kaj ."
 
 #: editor/script_create_dialog.cpp
 msgid "Built-in script (into scene file)."
-msgstr ""
+msgstr "Enkonstruita skripto (en scena dosiero)."
 
 #: editor/script_create_dialog.cpp
 msgid "Will create a new script file."
-msgstr ""
+msgstr "Kreos novan dosieron de skripto."
 
 #: editor/script_create_dialog.cpp
 msgid "Will load an existing script file."
-msgstr ""
+msgstr "Ŝargos ekzistitan dosieron de skripto."
 
 #: editor/script_create_dialog.cpp
-#, fuzzy
 msgid "Script file already exists."
-msgstr "Grupa nomo jam ekzistas."
+msgstr "La skripta dosiero jam ekzistas."
 
 #: editor/script_create_dialog.cpp
 msgid ""
 "Note: Built-in scripts have some limitations and can't be edited using an "
 "external editor."
 msgstr ""
+"Rimarko: Enkonstruitaj skriptoj havas iom limiĝoj kaj ne redakteblas uzi "
+"ekstera redaktilo."
 
 #: editor/script_create_dialog.cpp
-#, fuzzy
 msgid "Class Name:"
-msgstr "Nomo:"
+msgstr "Klasa nomo:"
 
 #: editor/script_create_dialog.cpp
-#, fuzzy
 msgid "Template:"
-msgstr "Ŝablonoj"
+msgstr "Ŝablono:"
 
 #: editor/script_create_dialog.cpp
-#, fuzzy
 msgid "Built-in Script:"
-msgstr "Konektu al skripto:"
+msgstr "Enkonstruita skripto:"
 
 #: editor/script_create_dialog.cpp
 msgid "Attach Node Script"
-msgstr ""
+msgstr "Alligi Noda Skripto"
 
 #: editor/script_editor_debugger.cpp
 msgid "Remote "
-msgstr ""
+msgstr "Fora "
 
 #: editor/script_editor_debugger.cpp
 msgid "Bytes:"
-msgstr ""
+msgstr "Bitokoj:"
 
 #: editor/script_editor_debugger.cpp
-#, fuzzy
 msgid "Warning:"
-msgstr "Avertoj"
+msgstr "Averto:"
 
 #: editor/script_editor_debugger.cpp
-#, fuzzy
 msgid "Error:"
-msgstr "Spegulo"
+msgstr "Eraro:"
 
 #: editor/script_editor_debugger.cpp
 msgid "C++ Error"
-msgstr ""
+msgstr "C++ Eraro"
 
 #: editor/script_editor_debugger.cpp
 msgid "C++ Error:"
-msgstr ""
+msgstr "C++ Eraro:"
 
 #: editor/script_editor_debugger.cpp
 msgid "C++ Source"
-msgstr ""
+msgstr "C++ Fonto"
 
 #: editor/script_editor_debugger.cpp
-#, fuzzy
 msgid "Source:"
-msgstr "Rimedo"
+msgstr "Fonto:"
 
 #: editor/script_editor_debugger.cpp
 msgid "C++ Source:"
-msgstr ""
+msgstr "C++ Fonto:"
 
 #: editor/script_editor_debugger.cpp
 msgid "Stack Trace"
-msgstr ""
+msgstr "Stakspuro"
 
 #: editor/script_editor_debugger.cpp
 msgid "Errors"
-msgstr ""
+msgstr "Eraroj"
 
 #: editor/script_editor_debugger.cpp
 msgid "Child process connected."
-msgstr ""
+msgstr "Infana procezo konektis."
 
 #: editor/script_editor_debugger.cpp
 msgid "Copy Error"
-msgstr ""
+msgstr "Kopii eraro"
 
 #: editor/script_editor_debugger.cpp
 msgid "Video RAM"
-msgstr ""
+msgstr "Videomemoro"
 
 #: editor/script_editor_debugger.cpp
 msgid "Skip Breakpoints"
-msgstr ""
+msgstr "Pasi preter paŭzpunktojn"
 
 #: editor/script_editor_debugger.cpp
 msgid "Inspect Previous Instance"
-msgstr ""
+msgstr "Inspekti antaŭan ekzemplon"
 
 #: editor/script_editor_debugger.cpp
 msgid "Inspect Next Instance"
-msgstr ""
+msgstr "Inspekti sekvan ekzemplon"
 
 #: editor/script_editor_debugger.cpp
 msgid "Stack Frames"
-msgstr ""
+msgstr "Stakaj Framoj"
 
 #: editor/script_editor_debugger.cpp
 msgid "Profiler"
-msgstr ""
+msgstr "Profililo"
 
 #: editor/script_editor_debugger.cpp
 msgid "Network Profiler"
-msgstr ""
+msgstr "Reta Profililo"
 
 #: editor/script_editor_debugger.cpp
 msgid "Monitor"
-msgstr ""
+msgstr "Monitoro"
 
 #: editor/script_editor_debugger.cpp
 msgid "Value"
-msgstr ""
+msgstr "Valoro"
 
 #: editor/script_editor_debugger.cpp
 msgid "Monitors"
-msgstr ""
+msgstr "Monitoroj"
 
 #: editor/script_editor_debugger.cpp
 msgid "Pick one or more items from the list to display the graph."
-msgstr ""
+msgstr "Elektu unu aŭ pli elementojn de la listo por vidigi la diagramon."
 
 #: editor/script_editor_debugger.cpp
 msgid "List of Video Memory Usage by Resource:"
-msgstr ""
+msgstr "Listo de la Uzo de Videomemoro per Risurco:"
 
 #: editor/script_editor_debugger.cpp
 msgid "Total:"
-msgstr ""
+msgstr "Totalo:"
 
 #: editor/script_editor_debugger.cpp
 msgid "Export list to a CSV file"
-msgstr ""
+msgstr "Eksporti liston al CSV dosiero"
 
 #: editor/script_editor_debugger.cpp
 msgid "Resource Path"
-msgstr ""
+msgstr "Risurca Vojo"
 
 #: editor/script_editor_debugger.cpp
 msgid "Type"
-msgstr ""
+msgstr "Tipo"
 
 #: editor/script_editor_debugger.cpp
 msgid "Format"
-msgstr ""
+msgstr "Formo"
 
 #: editor/script_editor_debugger.cpp
 msgid "Usage"
-msgstr ""
+msgstr "Uzo"
 
 #: editor/script_editor_debugger.cpp
 msgid "Misc"
-msgstr ""
+msgstr "Diversa"
 
 #: editor/script_editor_debugger.cpp
 msgid "Clicked Control:"
-msgstr ""
+msgstr "Alklakita stirilo:"
 
 #: editor/script_editor_debugger.cpp
 msgid "Clicked Control Type:"
-msgstr ""
+msgstr "Tipo de alklakita stirilo:"
 
 #: editor/script_editor_debugger.cpp
+#, fuzzy
 msgid "Live Edit Root:"
-msgstr ""
+msgstr "Senpere redakta radiko:"
 
 #: editor/script_editor_debugger.cpp
 msgid "Set From Tree"
-msgstr ""
+msgstr "Agordi de la Arbo"
 
 #: editor/script_editor_debugger.cpp
 msgid "Export measures as CSV"
-msgstr ""
+msgstr "Eksporti mezurojn en CSV"
 
 #: editor/settings_config_dialog.cpp
 msgid "Erase Shortcut"
diff --git a/editor/translations/es.po b/editor/translations/es.po
index 7fc20c2f14..e83d33e9fa 100644
--- a/editor/translations/es.po
+++ b/editor/translations/es.po
@@ -59,12 +59,13 @@
 # A <kaieltroll@gmail.com>, 2021.
 # Lucasdelpiero <lucasdelpiero98@gmail.com>, 2021.
 # SteamGoblin <SteamGoblin860@gmail.com>, 2021.
+# Francisco C <pruebasfrancisco17@gmail.com>, 2021.
 msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
 "POT-Creation-Date: \n"
-"PO-Revision-Date: 2021-03-31 03:53+0000\n"
-"Last-Translator: Javier Ocampos <xavier.ocampos@gmail.com>\n"
+"PO-Revision-Date: 2021-04-19 22:33+0000\n"
+"Last-Translator: Francisco C <pruebasfrancisco17@gmail.com>\n"
 "Language-Team: Spanish <https://hosted.weblate.org/projects/godot-engine/"
 "godot/es/>\n"
 "Language: es\n"
@@ -72,7 +73,7 @@ msgstr ""
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
 "Plural-Forms: nplurals=2; plural=n != 1;\n"
-"X-Generator: Weblate 4.6-dev\n"
+"X-Generator: Weblate 4.7-dev\n"
 
 #: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
 #: modules/visual_script/visual_script_builtin_funcs.cpp
@@ -1825,7 +1826,7 @@ msgstr "Nuevo"
 #: editor/editor_feature_profile.cpp editor/editor_node.cpp
 #: editor/project_manager.cpp
 msgid "Import"
-msgstr "Importación"
+msgstr "Importar"
 
 #: editor/editor_feature_profile.cpp editor/project_export.cpp
 msgid "Export"
@@ -7644,6 +7645,11 @@ msgstr "Bloquear Rotación de Vista"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11037,6 +11043,13 @@ msgid "Remote"
 msgstr "Remoto"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Local"
 
diff --git a/editor/translations/es_AR.po b/editor/translations/es_AR.po
index ef65c1d220..d9e193da4e 100644
--- a/editor/translations/es_AR.po
+++ b/editor/translations/es_AR.po
@@ -7590,6 +7590,11 @@ msgstr "Rotación de Vista Trabada"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10982,6 +10987,13 @@ msgid "Remote"
 msgstr "Remoto"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Local"
 
diff --git a/editor/translations/et.po b/editor/translations/et.po
index d1f68d4402..de0b0360ee 100644
--- a/editor/translations/et.po
+++ b/editor/translations/et.po
@@ -7321,6 +7321,11 @@ msgstr "Vaateakna pöördenurk on lukustatud"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10523,6 +10528,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/eu.po b/editor/translations/eu.po
index 0fda17a8d5..421a255054 100644
--- a/editor/translations/eu.po
+++ b/editor/translations/eu.po
@@ -7287,6 +7287,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10488,6 +10493,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/fa.po b/editor/translations/fa.po
index 4b2ad80f34..4a1906f4e3 100644
--- a/editor/translations/fa.po
+++ b/editor/translations/fa.po
@@ -7604,6 +7604,11 @@ msgstr "بومی‌سازی"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11000,6 +11005,13 @@ msgid "Remote"
 msgstr "از راه دور"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "محلی"
 
diff --git a/editor/translations/fi.po b/editor/translations/fi.po
index c60ab24d1d..dd1d5da4e8 100644
--- a/editor/translations/fi.po
+++ b/editor/translations/fi.po
@@ -7543,6 +7543,11 @@ msgstr "Näkymän kierto lukittu"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10925,6 +10930,13 @@ msgid "Remote"
 msgstr "Etäinen"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Paikallinen"
 
diff --git a/editor/translations/fil.po b/editor/translations/fil.po
index ef48d8b143..99964e6ee8 100644
--- a/editor/translations/fil.po
+++ b/editor/translations/fil.po
@@ -7285,6 +7285,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10480,6 +10485,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/fr.po b/editor/translations/fr.po
index 3e6dc5fb35..1ce05a4b93 100644
--- a/editor/translations/fr.po
+++ b/editor/translations/fr.po
@@ -7678,6 +7678,11 @@ msgstr "Rotation de la vue verrouillée"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11078,6 +11083,13 @@ msgid "Remote"
 msgstr "Distant"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Local"
 
diff --git a/editor/translations/ga.po b/editor/translations/ga.po
index 3bedee8314..8a6c2a3f0b 100644
--- a/editor/translations/ga.po
+++ b/editor/translations/ga.po
@@ -7279,6 +7279,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10473,6 +10478,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/gl.po b/editor/translations/gl.po
index 519fc06c8d..771b0f07e6 100644
--- a/editor/translations/gl.po
+++ b/editor/translations/gl.po
@@ -7480,6 +7480,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10776,6 +10781,13 @@ msgid "Remote"
 msgstr "Remoto"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Local"
 
diff --git a/editor/translations/he.po b/editor/translations/he.po
index 30a3212661..2966711057 100644
--- a/editor/translations/he.po
+++ b/editor/translations/he.po
@@ -23,7 +23,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
 "POT-Creation-Date: \n"
-"PO-Revision-Date: 2021-02-21 10:51+0000\n"
+"PO-Revision-Date: 2021-04-11 22:02+0000\n"
 "Last-Translator: Omer I.S. <omeritzicschwartz@gmail.com>\n"
 "Language-Team: Hebrew <https://hosted.weblate.org/projects/godot-engine/"
 "godot/he/>\n"
@@ -33,7 +33,7 @@ msgstr ""
 "Content-Transfer-Encoding: 8bit\n"
 "Plural-Forms: nplurals=4; plural=(n == 1) ? 0 : ((n == 2) ? 1 : ((n > 10 && "
 "n % 10 == 0) ? 2 : 3));\n"
-"X-Generator: Weblate 4.5\n"
+"X-Generator: Weblate 4.6-dev\n"
 
 #: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
 #: modules/visual_script/visual_script_builtin_funcs.cpp
@@ -1248,9 +1248,8 @@ msgid "Success!"
 msgstr "הצלחה!"
 
 #: editor/editor_asset_installer.cpp
-#, fuzzy
 msgid "Package Contents:"
-msgstr "מתקין החבילות"
+msgstr "תוכן החבילה:"
 
 #: editor/editor_asset_installer.cpp editor/editor_node.cpp
 msgid "Install"
@@ -3211,9 +3210,8 @@ msgid "Status:"
 msgstr "מצב:"
 
 #: editor/editor_plugin_settings.cpp
-#, fuzzy
 msgid "Edit:"
-msgstr "עריכה"
+msgstr "עריכה:"
 
 #: editor/editor_profiler.cpp
 msgid "Measure:"
@@ -7610,6 +7608,11 @@ msgstr "הצגת מידע"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10973,6 +10976,13 @@ msgid "Remote"
 msgstr "מרוחק"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "מקומי"
 
diff --git a/editor/translations/hi.po b/editor/translations/hi.po
index 8425dd284f..6c465ad015 100644
--- a/editor/translations/hi.po
+++ b/editor/translations/hi.po
@@ -7461,6 +7461,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10737,6 +10742,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/hr.po b/editor/translations/hr.po
index 861f3e6c1c..dc71edeec3 100644
--- a/editor/translations/hr.po
+++ b/editor/translations/hr.po
@@ -5,11 +5,11 @@
 # Unlimited Creativity <marinosah1@gmail.com>, 2019.
 # Patik <patrikfs5@gmail.com>, 2019.
 # Nikola Bunjevac <nikola.bunjevac@gmail.com>, 2019, 2020.
-# LeoClose <leoclose575@gmail.com>, 2020.
+# LeoClose <leoclose575@gmail.com>, 2020, 2021.
 msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
-"PO-Revision-Date: 2020-11-17 11:07+0000\n"
+"PO-Revision-Date: 2021-04-11 22:02+0000\n"
 "Last-Translator: LeoClose <leoclose575@gmail.com>\n"
 "Language-Team: Croatian <https://hosted.weblate.org/projects/godot-engine/"
 "godot/hr/>\n"
@@ -18,7 +18,7 @@ msgstr ""
 "Content-Transfer-Encoding: 8-bit\n"
 "Plural-Forms: nplurals=3; plural=n%10==1 && n%100!=11 ? 0 : n%10>=2 && n"
 "%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2;\n"
-"X-Generator: Weblate 4.4-dev\n"
+"X-Generator: Weblate 4.6-dev\n"
 
 #: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
 #: modules/visual_script/visual_script_builtin_funcs.cpp
@@ -2855,7 +2855,7 @@ msgstr ""
 
 #: editor/editor_node.cpp editor/plugins/asset_library_editor_plugin.cpp
 msgid "Community"
-msgstr ""
+msgstr "Zajednica"
 
 #: editor/editor_node.cpp
 msgid "About"
@@ -7292,6 +7292,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -9918,7 +9923,7 @@ msgstr ""
 
 #: editor/project_settings_editor.cpp editor/settings_config_dialog.cpp
 msgid "General"
-msgstr ""
+msgstr "Općenito"
 
 #: editor/project_settings_editor.cpp
 msgid "Override For..."
@@ -10496,6 +10501,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/hu.po b/editor/translations/hu.po
index 448c79c7f1..2ef5783de9 100644
--- a/editor/translations/hu.po
+++ b/editor/translations/hu.po
@@ -7487,6 +7487,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10681,6 +10686,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/id.po b/editor/translations/id.po
index 6657101598..e97e193d0f 100644
--- a/editor/translations/id.po
+++ b/editor/translations/id.po
@@ -7559,6 +7559,11 @@ msgstr "Rotasi Tampilan Terkunci"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10953,6 +10958,13 @@ msgid "Remote"
 msgstr "Remot"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Lokal"
 
diff --git a/editor/translations/is.po b/editor/translations/is.po
index 9ae40b5085..fd9e23d91b 100644
--- a/editor/translations/is.po
+++ b/editor/translations/is.po
@@ -7351,6 +7351,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10593,6 +10598,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/it.po b/editor/translations/it.po
index a0fb10367a..d1b39155c9 100644
--- a/editor/translations/it.po
+++ b/editor/translations/it.po
@@ -61,8 +61,8 @@ msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
 "POT-Creation-Date: \n"
-"PO-Revision-Date: 2021-03-24 23:44+0000\n"
-"Last-Translator: Alessandro Mandelli <mandelli.alessandro@ngi.it>\n"
+"PO-Revision-Date: 2021-04-16 07:52+0000\n"
+"Last-Translator: Marco Galli <mrcgll98@gmail.com>\n"
 "Language-Team: Italian <https://hosted.weblate.org/projects/godot-engine/"
 "godot/it/>\n"
 "Language: it\n"
@@ -70,7 +70,7 @@ msgstr ""
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
 "Plural-Forms: nplurals=2; plural=n != 1;\n"
-"X-Generator: Weblate 4.5.2-dev\n"
+"X-Generator: Weblate 4.6-dev\n"
 
 #: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
 #: modules/visual_script/visual_script_builtin_funcs.cpp
@@ -2064,7 +2064,7 @@ msgstr "Metodi"
 
 #: editor/editor_help.cpp
 msgid "Theme Properties"
-msgstr "Proprietà Tema"
+msgstr "Proprietà del tema"
 
 #: editor/editor_help.cpp
 msgid "Enumerations"
@@ -2076,7 +2076,7 @@ msgstr "Costanti"
 
 #: editor/editor_help.cpp
 msgid "Property Descriptions"
-msgstr "Descrizioni Proprietà"
+msgstr "Descrizioni delle proprietà"
 
 #: editor/editor_help.cpp
 msgid "(value)"
@@ -2092,7 +2092,7 @@ msgstr ""
 
 #: editor/editor_help.cpp
 msgid "Method Descriptions"
-msgstr "Descrizioni Metodo"
+msgstr "Descrizioni del metodo"
 
 #: editor/editor_help.cpp
 msgid ""
@@ -7635,6 +7635,11 @@ msgstr "Rotazione Vista Bloccata"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11034,6 +11039,13 @@ msgid "Remote"
 msgstr "Remoto"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Locale"
 
diff --git a/editor/translations/ja.po b/editor/translations/ja.po
index 6c6340e9b8..2d694989fc 100644
--- a/editor/translations/ja.po
+++ b/editor/translations/ja.po
@@ -7559,6 +7559,11 @@ msgstr "ビューの回転を固定中"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10933,6 +10938,13 @@ msgid "Remote"
 msgstr "リモート"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "ローカル"
 
diff --git a/editor/translations/ka.po b/editor/translations/ka.po
index 6d7d40a6ad..1894b0e156 100644
--- a/editor/translations/ka.po
+++ b/editor/translations/ka.po
@@ -7520,6 +7520,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10809,6 +10814,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/km.po b/editor/translations/km.po
new file mode 100644
index 0000000000..9e167dfe2c
--- /dev/null
+++ b/editor/translations/km.po
@@ -0,0 +1,12486 @@
+# LANGUAGE translation of the Godot Engine editor.
+# Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.
+# Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).
+# This file is distributed under the same license as the Godot source code.
+#
+# Withuse <withuse@gmail.com>, 2021.
+msgid ""
+msgstr ""
+"Project-Id-Version: Godot Engine editor\n"
+"PO-Revision-Date: 2021-04-19 22:33+0000\n"
+"Last-Translator: Withuse <withuse@gmail.com>\n"
+"Language-Team: Khmer (Central) <https://hosted.weblate.org/projects/godot-"
+"engine/godot/km/>\n"
+"Language: km\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8-bit\n"
+"Plural-Forms: nplurals=1; plural=0;\n"
+"X-Generator: Weblate 4.7-dev\n"
+
+#: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
+#: modules/visual_script/visual_script_builtin_funcs.cpp
+msgid "Invalid type argument to convert(), use TYPE_* constants."
+msgstr "ប្រភេទ argument មិនត្រឹមត្រូវដើម្បី convert() សូមប្រើ TYPE_* constants."
+
+#: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
+msgid "Expected a string of length 1 (a character)."
+msgstr "តម្រូវអោយមាន string យ៉ាងតឹច១អក្សរ (មួយ character)."
+
+#: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
+#: modules/mono/glue/gd_glue.cpp
+#: modules/visual_script/visual_script_builtin_funcs.cpp
+msgid "Not enough bytes for decoding bytes, or invalid format."
+msgstr "ចំនួន bytes សម្រាប់ decoding bytes មិនគ្រប់គ្រាន់ ឬ format មិនត្រឹមត្រូវ."
+
+#: core/math/expression.cpp
+msgid "Invalid input %i (not passed) in expression"
+msgstr "ការបញ្ចូល %i មានបញ្ហា (មិនបានបញ្ចូល) ក្នុង expression"
+
+#: core/math/expression.cpp
+msgid "self can't be used because instance is null (not passed)"
+msgstr "self មិនអាចប្រើបានទេ ព្រោះ instance វា null (មិនបានបញ្ចូល)"
+
+#: core/math/expression.cpp
+msgid "Invalid operands to operator %s, %s and %s."
+msgstr "operands មិនអាចប្រើជាមួយ operator %s, %s និង %s បានទេ."
+
+#: core/math/expression.cpp
+msgid "Invalid index of type %s for base type %s"
+msgstr "index នៃ type %s សម្រាប់ base type %s មិនត្រឺមត្រូវទេ"
+
+#: core/math/expression.cpp
+msgid "Invalid named index '%s' for base type %s"
+msgstr "មិនអាចដាក់ឈ្មោះ index '%s' សម្រាប់ base type %s បានទេ"
+
+#: core/math/expression.cpp
+msgid "Invalid arguments to construct '%s'"
+msgstr "arguments ដែលប្រើសំរាប់រៀប '%s' មិនត្រឹមត្រូវទេ"
+
+#: core/math/expression.cpp
+msgid "On call to '%s':"
+msgstr "កំពុងហៅទៅកាន់ '%s':"
+
+#: core/ustring.cpp
+msgid "B"
+msgstr "B"
+
+#: core/ustring.cpp
+msgid "KiB"
+msgstr "KB"
+
+#: core/ustring.cpp
+msgid "MiB"
+msgstr "MB"
+
+#: core/ustring.cpp
+msgid "GiB"
+msgstr "GB"
+
+#: core/ustring.cpp
+msgid "TiB"
+msgstr "TB"
+
+#: core/ustring.cpp
+msgid "PiB"
+msgstr "PB"
+
+#: core/ustring.cpp
+msgid "EiB"
+msgstr "EB"
+
+#: editor/animation_bezier_editor.cpp
+msgid "Free"
+msgstr "Free"
+
+#: editor/animation_bezier_editor.cpp
+msgid "Balanced"
+msgstr "មានតុល្យភាព"
+
+#: editor/animation_bezier_editor.cpp
+msgid "Mirror"
+msgstr "កញ្ចក់"
+
+#: editor/animation_bezier_editor.cpp editor/editor_profiler.cpp
+msgid "Time:"
+msgstr "ពេលវេលា:"
+
+#: editor/animation_bezier_editor.cpp
+msgid "Value:"
+msgstr "តម្លៃ:"
+
+#: editor/animation_bezier_editor.cpp
+msgid "Insert Key Here"
+msgstr "បញ្ចូល Key នៅទីនេះ"
+
+#: editor/animation_bezier_editor.cpp
+msgid "Duplicate Selected Key(s)"
+msgstr "Key(s) ដែលបានជ្រើសស្ទួន"
+
+#: editor/animation_bezier_editor.cpp
+msgid "Delete Selected Key(s)"
+msgstr "លុប Key(s) ដែលបានជ្រើស"
+
+#: editor/animation_bezier_editor.cpp
+msgid "Add Bezier Point"
+msgstr "បន្ថែម Bezier Point"
+
+#: editor/animation_bezier_editor.cpp
+msgid "Move Bezier Points"
+msgstr "ផ្លាស់ទី Bezier Points"
+
+#: editor/animation_bezier_editor.cpp editor/animation_track_editor.cpp
+msgid "Anim Duplicate Keys"
+msgstr "Anim Keys ស្ទួន"
+
+#: editor/animation_bezier_editor.cpp editor/animation_track_editor.cpp
+msgid "Anim Delete Keys"
+msgstr "លុប Anim Delete Keys"
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Change Keyframe Time"
+msgstr "Anim ផ្លាស់ប្តូរ Keyframe ពេលវេលា"
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Change Transition"
+msgstr "Anim ផ្លាស់ប្តូរ Transition"
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Change Transform"
+msgstr "Anim ផ្លាស់ប្តូរ Transform"
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Change Keyframe Value"
+msgstr "Anim ផ្លាស់ប្តូរតម្លៃ Keyframe"
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Change Call"
+msgstr "Anim ផ្លាស់ប្តូរ Call"
+
+#: editor/animation_track_editor.cpp
+#, fuzzy
+msgid "Anim Multi Change Keyframe Time"
+msgstr "Anim ផ្លាស់ប្តូរ Keyframe ពេលវេលាច្រើន"
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Multi Change Transition"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Multi Change Transform"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Multi Change Keyframe Value"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Multi Change Call"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Change Animation Length"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Change Animation Loop"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Property Track"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "3D Transform Track"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Call Method Track"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Bezier Curve Track"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Audio Playback Track"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Animation Playback Track"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Animation length (frames)"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Animation length (seconds)"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Add Track"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Animation Looping"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Functions:"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Audio Clips:"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Clips:"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Change Track Path"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Toggle this track on/off."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Update Mode (How this property is set)"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Interpolation Mode"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Loop Wrap Mode (Interpolate end with beginning on loop)"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Remove this track."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Time (s): "
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Toggle Track Enabled"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Continuous"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Discrete"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Trigger"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Capture"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Nearest"
+msgstr ""
+
+#: editor/animation_track_editor.cpp editor/plugins/curve_editor_plugin.cpp
+#: editor/property_editor.cpp
+msgid "Linear"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Cubic"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Clamp Loop Interp"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Wrap Loop Interp"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Insert Key"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Duplicate Key(s)"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Delete Key(s)"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Change Animation Update Mode"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Change Animation Interpolation Mode"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Change Animation Loop Mode"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Remove Anim Track"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Create NEW track for %s and insert key?"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Create %d NEW tracks and insert keys?"
+msgstr ""
+
+#: editor/animation_track_editor.cpp editor/create_dialog.cpp
+#: editor/editor_audio_buses.cpp editor/editor_feature_profile.cpp
+#: editor/editor_plugin_settings.cpp editor/plugin_config_dialog.cpp
+#: editor/plugins/abstract_polygon_2d_editor.cpp
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+#: editor/plugins/particles_editor_plugin.cpp
+#: editor/plugins/visual_shader_editor_plugin.cpp
+#: editor/script_create_dialog.cpp
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Create"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Insert"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "AnimationPlayer can't animate itself, only other players."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Create & Insert"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Insert Track & Key"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Insert Key"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Change Animation Step"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Rearrange Tracks"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Transform tracks only apply to Spatial-based nodes."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid ""
+"Audio tracks can only point to nodes of type:\n"
+"-AudioStreamPlayer\n"
+"-AudioStreamPlayer2D\n"
+"-AudioStreamPlayer3D"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Animation tracks can only point to AnimationPlayer nodes."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "An animation player can't animate itself, only other players."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Not possible to add a new track without a root"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Invalid track for Bezier (no suitable sub-properties)"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Add Bezier Track"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Track path is invalid, so can't add a key."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Track is not of type Spatial, can't insert key"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Add Transform Track Key"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Add Track Key"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Track path is invalid, so can't add a method key."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Add Method Track Key"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Method not found in object: "
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Move Keys"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Clipboard is empty"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Paste Tracks"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Anim Scale Keys"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid ""
+"This option does not work for Bezier editing, as it's only a single track."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid ""
+"This animation belongs to an imported scene, so changes to imported tracks "
+"will not be saved.\n"
+"\n"
+"To enable the ability to add custom tracks, navigate to the scene's import "
+"settings and set\n"
+"\"Animation > Storage\" to \"Files\", enable \"Animation > Keep Custom Tracks"
+"\", then re-import.\n"
+"Alternatively, use an import preset that imports animations to separate "
+"files."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Warning: Editing imported animation"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Select an AnimationPlayer node to create and edit animations."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Only show tracks from nodes selected in tree."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Group tracks by node or display them as plain list."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Snap:"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Animation step value."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Seconds"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "FPS"
+msgstr ""
+
+#: editor/animation_track_editor.cpp editor/editor_properties.cpp
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+#: editor/plugins/script_text_editor.cpp
+#: editor/plugins/shader_editor_plugin.cpp editor/plugins/text_editor.cpp
+#: editor/plugins/tile_set_editor_plugin.cpp editor/project_manager.cpp
+#: editor/project_settings_editor.cpp editor/property_editor.cpp
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Edit"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Animation properties."
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Copy Tracks"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Scale Selection"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Scale From Cursor"
+msgstr ""
+
+#: editor/animation_track_editor.cpp modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Duplicate Selection"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Duplicate Transposed"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Delete Selection"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Go to Next Step"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Go to Previous Step"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Optimize Animation"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Clean-Up Animation"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Pick the node that will be animated:"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Use Bezier Curves"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Anim. Optimizer"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Max. Linear Error:"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Max. Angular Error:"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Max Optimizable Angle:"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Optimize"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Remove invalid keys"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Remove unresolved and empty tracks"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Clean-up all animations"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Clean-Up Animation(s) (NO UNDO!)"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Clean-Up"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Scale Ratio:"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Select Tracks to Copy"
+msgstr ""
+
+#: editor/animation_track_editor.cpp editor/editor_log.cpp
+#: editor/editor_properties.cpp
+#: editor/plugins/animation_player_editor_plugin.cpp
+#: editor/plugins/script_text_editor.cpp
+#: editor/plugins/sprite_frames_editor_plugin.cpp editor/property_editor.cpp
+#: editor/scene_tree_dock.cpp scene/gui/line_edit.cpp scene/gui/text_edit.cpp
+msgid "Copy"
+msgstr ""
+
+#: editor/animation_track_editor.cpp
+msgid "Select All/None"
+msgstr ""
+
+#: editor/animation_track_editor_plugins.cpp
+msgid "Add Audio Track Clip"
+msgstr ""
+
+#: editor/animation_track_editor_plugins.cpp
+msgid "Change Audio Track Clip Start Offset"
+msgstr ""
+
+#: editor/animation_track_editor_plugins.cpp
+msgid "Change Audio Track Clip End Offset"
+msgstr ""
+
+#: editor/array_property_edit.cpp
+msgid "Resize Array"
+msgstr ""
+
+#: editor/array_property_edit.cpp
+msgid "Change Array Value Type"
+msgstr ""
+
+#: editor/array_property_edit.cpp
+msgid "Change Array Value"
+msgstr ""
+
+#: editor/code_editor.cpp
+msgid "Go to Line"
+msgstr ""
+
+#: editor/code_editor.cpp
+msgid "Line Number:"
+msgstr ""
+
+#: editor/code_editor.cpp
+msgid "%d replaced."
+msgstr ""
+
+#: editor/code_editor.cpp editor/editor_help.cpp
+msgid "%d match."
+msgstr ""
+
+#: editor/code_editor.cpp editor/editor_help.cpp
+msgid "%d matches."
+msgstr ""
+
+#: editor/code_editor.cpp editor/find_in_files.cpp
+msgid "Match Case"
+msgstr ""
+
+#: editor/code_editor.cpp editor/find_in_files.cpp
+msgid "Whole Words"
+msgstr ""
+
+#: editor/code_editor.cpp
+msgid "Replace"
+msgstr ""
+
+#: editor/code_editor.cpp
+msgid "Replace All"
+msgstr ""
+
+#: editor/code_editor.cpp
+msgid "Selection Only"
+msgstr ""
+
+#: editor/code_editor.cpp editor/plugins/script_text_editor.cpp
+#: editor/plugins/text_editor.cpp
+msgid "Standard"
+msgstr ""
+
+#: editor/code_editor.cpp editor/plugins/script_editor_plugin.cpp
+msgid "Toggle Scripts Panel"
+msgstr ""
+
+#: editor/code_editor.cpp editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/texture_region_editor_plugin.cpp
+#: editor/plugins/tile_set_editor_plugin.cpp scene/gui/graph_edit.cpp
+msgid "Zoom In"
+msgstr ""
+
+#: editor/code_editor.cpp editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/texture_region_editor_plugin.cpp
+#: editor/plugins/tile_set_editor_plugin.cpp scene/gui/graph_edit.cpp
+msgid "Zoom Out"
+msgstr ""
+
+#: editor/code_editor.cpp
+msgid "Reset Zoom"
+msgstr ""
+
+#: editor/code_editor.cpp
+msgid "Warnings"
+msgstr ""
+
+#: editor/code_editor.cpp
+msgid "Line and column numbers."
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Method in target node must be specified."
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Method name must be a valid identifier."
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid ""
+"Target method not found. Specify a valid method or attach a script to the "
+"target node."
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Connect to Node:"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Connect to Script:"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "From Signal:"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Scene does not contain any script."
+msgstr ""
+
+#: editor/connections_dialog.cpp editor/editor_autoload_settings.cpp
+#: editor/groups_editor.cpp editor/plugins/item_list_editor_plugin.cpp
+#: editor/plugins/theme_editor_plugin.cpp editor/project_settings_editor.cpp
+msgid "Add"
+msgstr ""
+
+#: editor/connections_dialog.cpp editor/dependency_editor.cpp
+#: editor/editor_feature_profile.cpp editor/groups_editor.cpp
+#: editor/plugins/animation_player_editor_plugin.cpp
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+#: editor/plugins/theme_editor_plugin.cpp
+#: editor/plugins/visual_shader_editor_plugin.cpp editor/project_manager.cpp
+#: editor/project_settings_editor.cpp
+msgid "Remove"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Add Extra Call Argument:"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Extra Call Arguments:"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Receiver Method:"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Advanced"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Deferred"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid ""
+"Defers the signal, storing it in a queue and only firing it at idle time."
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Oneshot"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Disconnects the signal after its first emission."
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Cannot connect signal"
+msgstr ""
+
+#: editor/connections_dialog.cpp editor/dependency_editor.cpp
+#: editor/export_template_manager.cpp editor/groups_editor.cpp
+#: editor/plugins/animation_player_editor_plugin.cpp
+#: editor/plugins/asset_library_editor_plugin.cpp
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+#: editor/plugins/script_editor_plugin.cpp
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+#: editor/plugins/version_control_editor_plugin.cpp editor/project_export.cpp
+#: editor/project_settings_editor.cpp editor/property_editor.cpp
+#: editor/run_settings_dialog.cpp editor/settings_config_dialog.cpp
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Close"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Connect"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Signal:"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Connect '%s' to '%s'"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Disconnect '%s' from '%s'"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Disconnect all from signal: '%s'"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Connect..."
+msgstr ""
+
+#: editor/connections_dialog.cpp
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Disconnect"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Connect a Signal to a Method"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Edit Connection:"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Are you sure you want to remove all connections from the \"%s\" signal?"
+msgstr ""
+
+#: editor/connections_dialog.cpp editor/editor_help.cpp editor/node_dock.cpp
+msgid "Signals"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Filter signals"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Are you sure you want to remove all connections from this signal?"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Disconnect All"
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Edit..."
+msgstr ""
+
+#: editor/connections_dialog.cpp
+msgid "Go To Method"
+msgstr ""
+
+#: editor/create_dialog.cpp
+msgid "Change %s Type"
+msgstr ""
+
+#: editor/create_dialog.cpp editor/project_settings_editor.cpp
+msgid "Change"
+msgstr ""
+
+#: editor/create_dialog.cpp
+msgid "Create New %s"
+msgstr ""
+
+#: editor/create_dialog.cpp editor/editor_file_dialog.cpp
+#: editor/filesystem_dock.cpp
+msgid "Favorites:"
+msgstr ""
+
+#: editor/create_dialog.cpp editor/editor_file_dialog.cpp
+msgid "Recent:"
+msgstr ""
+
+#: editor/create_dialog.cpp editor/plugins/script_editor_plugin.cpp
+#: editor/property_selector.cpp editor/quick_open.cpp editor/rename_dialog.cpp
+#: modules/visual_script/visual_script_property_selector.cpp
+msgid "Search:"
+msgstr ""
+
+#: editor/create_dialog.cpp editor/plugins/script_editor_plugin.cpp
+#: editor/property_selector.cpp editor/quick_open.cpp
+#: modules/visual_script/visual_script_property_selector.cpp
+msgid "Matches:"
+msgstr ""
+
+#: editor/create_dialog.cpp editor/editor_plugin_settings.cpp
+#: editor/plugin_config_dialog.cpp
+#: editor/plugins/asset_library_editor_plugin.cpp
+#: editor/plugins/visual_shader_editor_plugin.cpp editor/property_selector.cpp
+#: modules/visual_script/visual_script_property_selector.cpp
+msgid "Description:"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Search Replacement For:"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Dependencies For:"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid ""
+"Scene '%s' is currently being edited.\n"
+"Changes will only take effect when reloaded."
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid ""
+"Resource '%s' is in use.\n"
+"Changes will only take effect when reloaded."
+msgstr ""
+
+#: editor/dependency_editor.cpp
+#: modules/gdnative/gdnative_library_editor_plugin.cpp
+msgid "Dependencies"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Resource"
+msgstr ""
+
+#: editor/dependency_editor.cpp editor/editor_autoload_settings.cpp
+#: editor/project_manager.cpp editor/project_settings_editor.cpp
+msgid "Path"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Dependencies:"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Fix Broken"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Dependency Editor"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Search Replacement Resource:"
+msgstr ""
+
+#: editor/dependency_editor.cpp editor/editor_file_dialog.cpp
+#: editor/editor_help_search.cpp editor/editor_node.cpp
+#: editor/filesystem_dock.cpp editor/plugins/script_editor_plugin.cpp
+#: editor/property_selector.cpp editor/quick_open.cpp
+#: editor/script_create_dialog.cpp
+#: modules/visual_script/visual_script_property_selector.cpp
+#: scene/gui/file_dialog.cpp
+msgid "Open"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Owners Of:"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid ""
+"Remove selected files from the project? (no undo)\n"
+"You can find the removed files in the system trash to restore them."
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid ""
+"The files being removed are required by other resources in order for them to "
+"work.\n"
+"Remove them anyway? (no undo)\n"
+"You can find the removed files in the system trash to restore them."
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Cannot remove:"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Error loading:"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Load failed due to missing dependencies:"
+msgstr ""
+
+#: editor/dependency_editor.cpp editor/editor_node.cpp
+msgid "Open Anyway"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Which action should be taken?"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Fix Dependencies"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Errors loading!"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Permanently delete %d item(s)? (No undo!)"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Show Dependencies"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Orphan Resource Explorer"
+msgstr ""
+
+#: editor/dependency_editor.cpp editor/editor_audio_buses.cpp
+#: editor/editor_file_dialog.cpp editor/editor_node.cpp
+#: editor/plugins/item_list_editor_plugin.cpp
+#: editor/plugins/sprite_frames_editor_plugin.cpp editor/project_export.cpp
+#: editor/project_settings_editor.cpp editor/scene_tree_dock.cpp
+msgid "Delete"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Owns"
+msgstr ""
+
+#: editor/dependency_editor.cpp
+msgid "Resources Without Explicit Ownership:"
+msgstr ""
+
+#: editor/dictionary_property_edit.cpp
+msgid "Change Dictionary Key"
+msgstr ""
+
+#: editor/dictionary_property_edit.cpp
+msgid "Change Dictionary Value"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Thanks from the Godot community!"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Godot Engine contributors"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Project Founders"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Lead Developer"
+msgstr ""
+
+#. TRANSLATORS: This refers to a job title.
+#. The trailing space is used to distinguish with the project list application,
+#. you do not have to keep it in your translation.
+#: editor/editor_about.cpp
+msgid "Project Manager "
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Developers"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Authors"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Platinum Sponsors"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Gold Sponsors"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Silver Sponsors"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Bronze Sponsors"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Mini Sponsors"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Gold Donors"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Silver Donors"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Bronze Donors"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Donors"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "License"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Third-party Licenses"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid ""
+"Godot Engine relies on a number of third-party free and open source "
+"libraries, all compatible with the terms of its MIT license. The following "
+"is an exhaustive list of all such third-party components with their "
+"respective copyright statements and license terms."
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "All Components"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Components"
+msgstr ""
+
+#: editor/editor_about.cpp
+msgid "Licenses"
+msgstr ""
+
+#: editor/editor_asset_installer.cpp editor/project_manager.cpp
+msgid "Error opening package file, not in ZIP format."
+msgstr ""
+
+#: editor/editor_asset_installer.cpp
+msgid "%s (Already Exists)"
+msgstr ""
+
+#: editor/editor_asset_installer.cpp
+msgid "Uncompressing Assets"
+msgstr ""
+
+#: editor/editor_asset_installer.cpp editor/project_manager.cpp
+msgid "The following files failed extraction from package:"
+msgstr ""
+
+#: editor/editor_asset_installer.cpp
+msgid "And %s more files."
+msgstr ""
+
+#: editor/editor_asset_installer.cpp editor/project_manager.cpp
+msgid "Package installed successfully!"
+msgstr ""
+
+#: editor/editor_asset_installer.cpp
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Success!"
+msgstr ""
+
+#: editor/editor_asset_installer.cpp
+msgid "Package Contents:"
+msgstr ""
+
+#: editor/editor_asset_installer.cpp editor/editor_node.cpp
+msgid "Install"
+msgstr ""
+
+#: editor/editor_asset_installer.cpp
+msgid "Package Installer"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Speakers"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Add Effect"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Rename Audio Bus"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Change Audio Bus Volume"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Toggle Audio Bus Solo"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Toggle Audio Bus Mute"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Toggle Audio Bus Bypass Effects"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Select Audio Bus Send"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Add Audio Bus Effect"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Move Bus Effect"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Delete Bus Effect"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Drag & drop to rearrange."
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Solo"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Mute"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Bypass"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Bus options"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp editor/filesystem_dock.cpp
+#: editor/plugins/animation_player_editor_plugin.cpp editor/scene_tree_dock.cpp
+msgid "Duplicate"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Reset Volume"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Delete Effect"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Audio"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Add Audio Bus"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Master bus can't be deleted!"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Delete Audio Bus"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Duplicate Audio Bus"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Reset Bus Volume"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Move Audio Bus"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Save Audio Bus Layout As..."
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Location for New Layout..."
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Open Audio Bus Layout"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "There is no '%s' file."
+msgstr ""
+
+#: editor/editor_audio_buses.cpp editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Layout"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Invalid file, not an audio bus layout."
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Error saving file: %s"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Add Bus"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Add a new Audio Bus to this layout."
+msgstr ""
+
+#: editor/editor_audio_buses.cpp editor/editor_properties.cpp
+#: editor/plugins/animation_player_editor_plugin.cpp editor/property_editor.cpp
+#: editor/script_create_dialog.cpp
+msgid "Load"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Load an existing Bus Layout."
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Save As"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Save this Bus Layout to a file."
+msgstr ""
+
+#: editor/editor_audio_buses.cpp editor/import_dock.cpp
+msgid "Load Default"
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Load the default Bus Layout."
+msgstr ""
+
+#: editor/editor_audio_buses.cpp
+msgid "Create a new Bus Layout."
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Invalid name."
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Valid characters:"
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Must not collide with an existing engine class name."
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Must not collide with an existing built-in type name."
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Must not collide with an existing global constant name."
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Keyword cannot be used as an autoload name."
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Autoload '%s' already exists!"
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Rename Autoload"
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Toggle AutoLoad Globals"
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Move Autoload"
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Remove Autoload"
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp editor/editor_plugin_settings.cpp
+msgid "Enable"
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Rearrange Autoloads"
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Can't add autoload:"
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Add AutoLoad"
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp editor/editor_file_dialog.cpp
+#: editor/editor_plugin_settings.cpp
+#: editor/plugins/animation_tree_editor_plugin.cpp
+#: editor/script_create_dialog.cpp scene/gui/file_dialog.cpp
+msgid "Path:"
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Node Name:"
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp editor/editor_help_search.cpp
+#: editor/editor_profiler.cpp editor/project_manager.cpp
+#: editor/settings_config_dialog.cpp
+msgid "Name"
+msgstr ""
+
+#: editor/editor_autoload_settings.cpp
+msgid "Singleton"
+msgstr ""
+
+#: editor/editor_data.cpp editor/inspector_dock.cpp
+msgid "Paste Params"
+msgstr ""
+
+#: editor/editor_data.cpp
+msgid "Updating Scene"
+msgstr ""
+
+#: editor/editor_data.cpp
+msgid "Storing local changes..."
+msgstr ""
+
+#: editor/editor_data.cpp
+msgid "Updating scene..."
+msgstr ""
+
+#: editor/editor_data.cpp editor/editor_properties.cpp
+msgid "[empty]"
+msgstr ""
+
+#: editor/editor_data.cpp
+msgid "[unsaved]"
+msgstr ""
+
+#: editor/editor_dir_dialog.cpp
+msgid "Please select a base directory first."
+msgstr ""
+
+#: editor/editor_dir_dialog.cpp
+msgid "Choose a Directory"
+msgstr ""
+
+#: editor/editor_dir_dialog.cpp editor/editor_file_dialog.cpp
+#: editor/filesystem_dock.cpp editor/project_manager.cpp
+#: scene/gui/file_dialog.cpp
+msgid "Create Folder"
+msgstr ""
+
+#: editor/editor_dir_dialog.cpp editor/editor_file_dialog.cpp
+#: editor/editor_plugin_settings.cpp editor/filesystem_dock.cpp
+#: editor/plugins/theme_editor_plugin.cpp editor/project_export.cpp
+#: modules/visual_script/visual_script_editor.cpp scene/gui/file_dialog.cpp
+msgid "Name:"
+msgstr ""
+
+#: editor/editor_dir_dialog.cpp editor/editor_file_dialog.cpp
+#: editor/filesystem_dock.cpp scene/gui/file_dialog.cpp
+msgid "Could not create folder."
+msgstr ""
+
+#: editor/editor_dir_dialog.cpp
+msgid "Choose"
+msgstr ""
+
+#: editor/editor_export.cpp
+msgid "Storing File:"
+msgstr ""
+
+#: editor/editor_export.cpp
+msgid "No export template found at the expected path:"
+msgstr ""
+
+#: editor/editor_export.cpp
+msgid "Packing"
+msgstr ""
+
+#: editor/editor_export.cpp
+msgid ""
+"Target platform requires 'ETC' texture compression for GLES2. Enable 'Import "
+"Etc' in Project Settings."
+msgstr ""
+
+#: editor/editor_export.cpp
+msgid ""
+"Target platform requires 'ETC2' texture compression for GLES3. Enable "
+"'Import Etc 2' in Project Settings."
+msgstr ""
+
+#: editor/editor_export.cpp
+msgid ""
+"Target platform requires 'ETC' texture compression for the driver fallback "
+"to GLES2.\n"
+"Enable 'Import Etc' in Project Settings, or disable 'Driver Fallback "
+"Enabled'."
+msgstr ""
+
+#: editor/editor_export.cpp
+msgid ""
+"Target platform requires 'PVRTC' texture compression for GLES2. Enable "
+"'Import Pvrtc' in Project Settings."
+msgstr ""
+
+#: editor/editor_export.cpp
+msgid ""
+"Target platform requires 'ETC2' or 'PVRTC' texture compression for GLES3. "
+"Enable 'Import Etc 2' or 'Import Pvrtc' in Project Settings."
+msgstr ""
+
+#: editor/editor_export.cpp
+msgid ""
+"Target platform requires 'PVRTC' texture compression for the driver fallback "
+"to GLES2.\n"
+"Enable 'Import Pvrtc' in Project Settings, or disable 'Driver Fallback "
+"Enabled'."
+msgstr ""
+
+#: editor/editor_export.cpp platform/android/export/export.cpp
+#: platform/iphone/export/export.cpp platform/javascript/export/export.cpp
+#: platform/osx/export/export.cpp platform/uwp/export/export.cpp
+msgid "Custom debug template not found."
+msgstr ""
+
+#: editor/editor_export.cpp platform/android/export/export.cpp
+#: platform/iphone/export/export.cpp platform/javascript/export/export.cpp
+#: platform/osx/export/export.cpp platform/uwp/export/export.cpp
+msgid "Custom release template not found."
+msgstr ""
+
+#: editor/editor_export.cpp platform/javascript/export/export.cpp
+msgid "Template file not found:"
+msgstr ""
+
+#: editor/editor_export.cpp
+msgid "On 32-bit exports the embedded PCK cannot be bigger than 4 GiB."
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "3D Editor"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Script Editor"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Asset Library"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Scene Tree Editing"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Node Dock"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "FileSystem Dock"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Import Dock"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Erase profile '%s'? (no undo)"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Profile must be a valid filename and must not contain '.'"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Profile with this name already exists."
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "(Editor Disabled, Properties Disabled)"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "(Properties Disabled)"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "(Editor Disabled)"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Class Options:"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Enable Contextual Editor"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Enabled Properties:"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Enabled Features:"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Enabled Classes:"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "File '%s' format is invalid, import aborted."
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid ""
+"Profile '%s' already exists. Remove it first before importing, import "
+"aborted."
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Error saving profile to path: '%s'."
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Unset"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Current Profile:"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Make Current"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+#: editor/plugins/animation_player_editor_plugin.cpp
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "New"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp editor/editor_node.cpp
+#: editor/project_manager.cpp
+msgid "Import"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp editor/project_export.cpp
+msgid "Export"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Available Profiles:"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Class Options"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "New profile name:"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Erase Profile"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Godot Feature Profile"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Import Profile(s)"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Export Profile"
+msgstr ""
+
+#: editor/editor_feature_profile.cpp
+msgid "Manage Editor Feature Profiles"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "Select Current Folder"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "File Exists, Overwrite?"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "Select This Folder"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp editor/filesystem_dock.cpp
+msgid "Copy Path"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp editor/filesystem_dock.cpp
+msgid "Open in File Manager"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp editor/editor_node.cpp
+#: editor/filesystem_dock.cpp editor/project_manager.cpp
+msgid "Show in File Manager"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp editor/filesystem_dock.cpp
+msgid "New Folder..."
+msgstr ""
+
+#: editor/editor_file_dialog.cpp editor/find_in_files.cpp
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Refresh"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "All Recognized"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "All Files (*)"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "Open a File"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "Open File(s)"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "Open a Directory"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "Open a File or Directory"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp editor/editor_node.cpp
+#: editor/editor_properties.cpp editor/import_defaults_editor.cpp
+#: editor/inspector_dock.cpp editor/plugins/animation_player_editor_plugin.cpp
+#: editor/plugins/script_editor_plugin.cpp scene/gui/file_dialog.cpp
+msgid "Save"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "Save a File"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp
+msgid "Go Back"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp
+msgid "Go Forward"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp
+msgid "Go Up"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp
+msgid "Toggle Hidden Files"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp
+msgid "Toggle Favorite"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp
+msgid "Toggle Mode"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp
+msgid "Focus Path"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp
+msgid "Move Favorite Up"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp
+msgid "Move Favorite Down"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp
+msgid "Go to previous folder."
+msgstr ""
+
+#: editor/editor_file_dialog.cpp
+msgid "Go to next folder."
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "Go to parent folder."
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "Refresh files."
+msgstr ""
+
+#: editor/editor_file_dialog.cpp
+msgid "(Un)favorite current folder."
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "Toggle the visibility of hidden files."
+msgstr ""
+
+#: editor/editor_file_dialog.cpp editor/filesystem_dock.cpp
+msgid "View items as a grid of thumbnails."
+msgstr ""
+
+#: editor/editor_file_dialog.cpp editor/filesystem_dock.cpp
+msgid "View items as a list."
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "Directories & Files:"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp editor/plugins/sprite_editor_plugin.cpp
+#: editor/plugins/style_box_editor_plugin.cpp
+#: editor/plugins/theme_editor_plugin.cpp editor/rename_dialog.cpp
+msgid "Preview:"
+msgstr ""
+
+#: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
+msgid "File:"
+msgstr ""
+
+#: editor/editor_file_system.cpp
+msgid "ScanSources"
+msgstr ""
+
+#: editor/editor_file_system.cpp
+msgid ""
+"There are multiple importers for different types pointing to file %s, import "
+"aborted"
+msgstr ""
+
+#: editor/editor_file_system.cpp
+msgid "(Re)Importing Assets"
+msgstr ""
+
+#: editor/editor_help.cpp editor/plugins/spatial_editor_plugin.cpp
+msgid "Top"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "Class:"
+msgstr ""
+
+#: editor/editor_help.cpp editor/scene_tree_editor.cpp
+#: editor/script_create_dialog.cpp
+msgid "Inherits:"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "Inherited by:"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "Description"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "Online Tutorials"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "Properties"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "override:"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "default:"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "Methods"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "Theme Properties"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "Enumerations"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "Constants"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "Property Descriptions"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "(value)"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid ""
+"There is currently no description for this property. Please help us by "
+"[color=$color][url=$url]contributing one[/url][/color]!"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid "Method Descriptions"
+msgstr ""
+
+#: editor/editor_help.cpp
+msgid ""
+"There is currently no description for this method. Please help us by [color="
+"$color][url=$url]contributing one[/url][/color]!"
+msgstr ""
+
+#: editor/editor_help_search.cpp editor/editor_node.cpp
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Search Help"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Case Sensitive"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Show Hierarchy"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Display All"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Classes Only"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Methods Only"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Signals Only"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Constants Only"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Properties Only"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Theme Properties Only"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Member Type"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Class"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Method"
+msgstr ""
+
+#: editor/editor_help_search.cpp editor/plugins/script_text_editor.cpp
+msgid "Signal"
+msgstr ""
+
+#: editor/editor_help_search.cpp editor/plugins/theme_editor_plugin.cpp
+msgid "Constant"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Property"
+msgstr ""
+
+#: editor/editor_help_search.cpp
+msgid "Theme Property"
+msgstr ""
+
+#: editor/editor_inspector.cpp editor/project_settings_editor.cpp
+msgid "Property:"
+msgstr ""
+
+#: editor/editor_inspector.cpp
+msgid "Set"
+msgstr ""
+
+#: editor/editor_inspector.cpp
+msgid "Set Multiple:"
+msgstr ""
+
+#: editor/editor_log.cpp
+msgid "Output:"
+msgstr ""
+
+#: editor/editor_log.cpp editor/plugins/tile_map_editor_plugin.cpp
+msgid "Copy Selection"
+msgstr ""
+
+#: editor/editor_log.cpp editor/editor_network_profiler.cpp
+#: editor/editor_profiler.cpp editor/editor_properties.cpp
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+#: editor/property_editor.cpp editor/scene_tree_dock.cpp
+#: editor/script_editor_debugger.cpp
+#: modules/gdnative/gdnative_library_editor_plugin.cpp scene/gui/line_edit.cpp
+#: scene/gui/text_edit.cpp
+msgid "Clear"
+msgstr ""
+
+#: editor/editor_log.cpp
+msgid "Clear Output"
+msgstr ""
+
+#: editor/editor_network_profiler.cpp editor/editor_node.cpp
+#: editor/editor_profiler.cpp
+msgid "Stop"
+msgstr ""
+
+#: editor/editor_network_profiler.cpp editor/editor_profiler.cpp
+#: editor/plugins/animation_state_machine_editor.cpp editor/rename_dialog.cpp
+msgid "Start"
+msgstr ""
+
+#: editor/editor_network_profiler.cpp
+msgid "%s/s"
+msgstr ""
+
+#: editor/editor_network_profiler.cpp
+msgid "Down"
+msgstr ""
+
+#: editor/editor_network_profiler.cpp
+msgid "Up"
+msgstr ""
+
+#: editor/editor_network_profiler.cpp editor/editor_node.cpp
+msgid "Node"
+msgstr ""
+
+#: editor/editor_network_profiler.cpp
+msgid "Incoming RPC"
+msgstr ""
+
+#: editor/editor_network_profiler.cpp
+msgid "Incoming RSET"
+msgstr ""
+
+#: editor/editor_network_profiler.cpp
+msgid "Outgoing RPC"
+msgstr ""
+
+#: editor/editor_network_profiler.cpp
+msgid "Outgoing RSET"
+msgstr ""
+
+#: editor/editor_node.cpp editor/project_manager.cpp
+msgid "New Window"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Imported resources can't be saved."
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/script_editor_plugin.cpp
+#: scene/gui/dialogs.cpp
+msgid "OK"
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/animation_player_editor_plugin.cpp
+msgid "Error saving resource!"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"This resource can't be saved because it does not belong to the edited scene. "
+"Make it unique first."
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/animation_player_editor_plugin.cpp
+msgid "Save Resource As..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Can't open file for writing:"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Requested file format unknown:"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Error while saving."
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/script_editor_plugin.cpp
+msgid "Can't open '%s'. The file could have been moved or deleted."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Error while parsing '%s'."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Unexpected end of file '%s'."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Missing '%s' or its dependencies."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Error while loading '%s'."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Saving Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Analyzing"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Creating Thumbnail"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "This operation can't be done without a tree root."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"This scene can't be saved because there is a cyclic instancing inclusion.\n"
+"Please resolve it and then attempt to save again."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"Couldn't save scene. Likely dependencies (instances or inheritance) couldn't "
+"be satisfied."
+msgstr ""
+
+#: editor/editor_node.cpp editor/scene_tree_dock.cpp
+msgid "Can't overwrite scene that is still open!"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Can't load MeshLibrary for merging!"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Error saving MeshLibrary!"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Can't load TileSet for merging!"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Error saving TileSet!"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"An error occurred while trying to save the editor layout.\n"
+"Make sure the editor's user data path is writable."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"Default editor layout overridden.\n"
+"To restore the Default layout to its base settings, use the Delete Layout "
+"option and delete the Default layout."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Layout name not found!"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Restored the Default layout to its base settings."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"This resource belongs to a scene that was imported, so it's not editable.\n"
+"Please read the documentation relevant to importing scenes to better "
+"understand this workflow."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"This resource belongs to a scene that was instanced or inherited.\n"
+"Changes to it won't be kept when saving the current scene."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"This resource was imported, so it's not editable. Change its settings in the "
+"import panel and then re-import."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"This scene was imported, so changes to it won't be kept.\n"
+"Instancing it or inheriting will allow making changes to it.\n"
+"Please read the documentation relevant to importing scenes to better "
+"understand this workflow."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"This is a remote object, so changes to it won't be kept.\n"
+"Please read the documentation relevant to debugging to better understand "
+"this workflow."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "There is no defined scene to run."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Save scene before running..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Could not start subprocess!"
+msgstr ""
+
+#: editor/editor_node.cpp editor/filesystem_dock.cpp
+msgid "Open Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Open Base Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Quick Open..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Quick Open Scene..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Quick Open Script..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Save & Close"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Save changes to '%s' before closing?"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Saved %s modified resource(s)."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "A root node is required to save the scene."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Save Scene As..."
+msgstr ""
+
+#: editor/editor_node.cpp editor/scene_tree_dock.cpp
+msgid "This operation can't be done without a scene."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Export Mesh Library"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "This operation can't be done without a root node."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Export Tile Set"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "This operation can't be done without a selected node."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Current scene not saved. Open anyway?"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Can't reload a scene that was never saved."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Reload Saved Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"The current scene has unsaved changes.\n"
+"Reload the saved scene anyway? This action cannot be undone."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Quick Run Scene..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Quit"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Yes"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Exit the editor?"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Open Project Manager?"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Save & Quit"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Save changes to the following scene(s) before quitting?"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Save changes the following scene(s) before opening Project Manager?"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"This option is deprecated. Situations where refresh must be forced are now "
+"considered a bug. Please report."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Pick a Main Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Close Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Reopen Closed Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Unable to enable addon plugin at: '%s' parsing of config failed."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Unable to find script field for addon plugin at: '%s'."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Unable to load addon script from path: '%s'."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"Unable to load addon script from path: '%s' There seems to be an error in "
+"the code, please check the syntax."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"Unable to load addon script from path: '%s' Base type is not EditorPlugin."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Unable to load addon script from path: '%s' Script is not in tool mode."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"Scene '%s' was automatically imported, so it can't be modified.\n"
+"To make changes to it, a new inherited scene can be created."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"Error loading scene, it must be inside the project path. Use 'Import' to "
+"open the scene, then save it inside the project path."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Scene '%s' has broken dependencies:"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Clear Recent Scenes"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"No main scene has ever been defined, select one?\n"
+"You can change it later in \"Project Settings\" under the 'application' "
+"category."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"Selected scene '%s' does not exist, select a valid one?\n"
+"You can change it later in \"Project Settings\" under the 'application' "
+"category."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"Selected scene '%s' is not a scene file, select a valid one?\n"
+"You can change it later in \"Project Settings\" under the 'application' "
+"category."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Save Layout"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Delete Layout"
+msgstr ""
+
+#: editor/editor_node.cpp editor/import_dock.cpp
+#: editor/script_create_dialog.cpp
+msgid "Default"
+msgstr ""
+
+#: editor/editor_node.cpp editor/editor_properties.cpp
+#: editor/plugins/script_editor_plugin.cpp editor/property_editor.cpp
+msgid "Show in FileSystem"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Play This Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Close Tab"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Undo Close Tab"
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/script_editor_plugin.cpp
+msgid "Close Other Tabs"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Close Tabs to the Right"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Close All Tabs"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Switch Scene Tab"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "%d more files or folders"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "%d more folders"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "%d more files"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Dock Position"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Distraction Free Mode"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Toggle distraction-free mode."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Add a new scene."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Go to previously opened scene."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Copy Text"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Next tab"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Previous tab"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Filter Files..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Operations with scene files."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "New Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "New Inherited Scene..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Open Scene..."
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/script_editor_plugin.cpp
+msgid "Open Recent"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Save Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Save All Scenes"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Convert To..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "MeshLibrary..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "TileSet..."
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/script_text_editor.cpp
+#: scene/gui/line_edit.cpp scene/gui/text_edit.cpp
+msgid "Undo"
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/script_text_editor.cpp
+#: scene/gui/line_edit.cpp scene/gui/text_edit.cpp
+msgid "Redo"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Miscellaneous project or scene-wide tools."
+msgstr ""
+
+#: editor/editor_node.cpp editor/project_manager.cpp
+#: editor/script_create_dialog.cpp
+msgid "Project"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Project Settings..."
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/version_control_editor_plugin.cpp
+msgid "Version Control"
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/version_control_editor_plugin.cpp
+msgid "Set Up Version Control"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Shut Down Version Control"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Export..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Install Android Build Template..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Open Project Data Folder"
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/tile_set_editor_plugin.cpp
+msgid "Tools"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Orphan Resource Explorer..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Quit to Project List"
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/script_editor_plugin.cpp
+#: editor/project_export.cpp
+msgid "Debug"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Deploy with Remote Debug"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"When this option is enabled, using one-click deploy will make the executable "
+"attempt to connect to this computer's IP so the running project can be "
+"debugged.\n"
+"This option is intended to be used for remote debugging (typically with a "
+"mobile device).\n"
+"You don't need to enable it to use the GDScript debugger locally."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Small Deploy with Network Filesystem"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"When this option is enabled, using one-click deploy for Android will only "
+"export an executable without the project data.\n"
+"The filesystem will be provided from the project by the editor over the "
+"network.\n"
+"On Android, deploying will use the USB cable for faster performance. This "
+"option speeds up testing for projects with large assets."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Visible Collision Shapes"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"When this option is enabled, collision shapes and raycast nodes (for 2D and "
+"3D) will be visible in the running project."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Visible Navigation"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"When this option is enabled, navigation meshes and polygons will be visible "
+"in the running project."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Synchronize Scene Changes"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"When this option is enabled, any changes made to the scene in the editor "
+"will be replicated in the running project.\n"
+"When used remotely on a device, this is more efficient when the network "
+"filesystem option is enabled."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Synchronize Script Changes"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"When this option is enabled, any script that is saved will be reloaded in "
+"the running project.\n"
+"When used remotely on a device, this is more efficient when the network "
+"filesystem option is enabled."
+msgstr ""
+
+#: editor/editor_node.cpp editor/script_create_dialog.cpp
+msgid "Editor"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Editor Settings..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Editor Layout"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Take Screenshot"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Screenshots are stored in the Editor Data/Settings Folder."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Toggle Fullscreen"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Toggle System Console"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Open Editor Data/Settings Folder"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Open Editor Data Folder"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Open Editor Settings Folder"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Manage Editor Features..."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Manage Export Templates..."
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/shader_editor_plugin.cpp
+msgid "Help"
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/script_editor_plugin.cpp
+#: editor/plugins/shader_editor_plugin.cpp
+msgid "Online Docs"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Q&A"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Report a Bug"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Send Docs Feedback"
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/asset_library_editor_plugin.cpp
+msgid "Community"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "About"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Play the project."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Play"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Pause the scene execution for debugging."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Pause Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Stop the scene."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Play the edited scene."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Play Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Play custom scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Play Custom Scene"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Changing the video driver requires restarting the editor."
+msgstr ""
+
+#: editor/editor_node.cpp editor/project_settings_editor.cpp
+#: editor/settings_config_dialog.cpp
+msgid "Save & Restart"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Spins when the editor window redraws."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Update Continuously"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Update When Changed"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Hide Update Spinner"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "FileSystem"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Inspector"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Expand Bottom Panel"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Output"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Don't Save"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Android build template is missing, please install relevant templates."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Manage Templates"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"This will set up your project for custom Android builds by installing the "
+"source template to \"res://android/build\".\n"
+"You can then apply modifications and build your own custom APK on export "
+"(adding modules, changing the AndroidManifest.xml, etc.).\n"
+"Note that in order to make custom builds instead of using pre-built APKs, "
+"the \"Use Custom Build\" option should be enabled in the Android export "
+"preset."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"The Android build template is already installed in this project and it won't "
+"be overwritten.\n"
+"Remove the \"res://android/build\" directory manually before attempting this "
+"operation again."
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Import Templates From ZIP File"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Template Package"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Export Library"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Merge With Existing"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Open & Run a Script"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid ""
+"The following files are newer on disk.\n"
+"What action should be taken?"
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/script_editor_plugin.cpp
+#: editor/plugins/shader_editor_plugin.cpp
+msgid "Reload"
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/script_editor_plugin.cpp
+#: editor/plugins/shader_editor_plugin.cpp
+msgid "Resave"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "New Inherited"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Load Errors"
+msgstr ""
+
+#: editor/editor_node.cpp editor/plugins/tile_map_editor_plugin.cpp
+msgid "Select"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Open 2D Editor"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Open 3D Editor"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Open Script Editor"
+msgstr ""
+
+#: editor/editor_node.cpp editor/project_manager.cpp
+msgid "Open Asset Library"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Open the next Editor"
+msgstr ""
+
+#: editor/editor_node.cpp
+msgid "Open the previous Editor"
+msgstr ""
+
+#: editor/editor_node.h
+msgid "Warning!"
+msgstr ""
+
+#: editor/editor_path.cpp
+msgid "No sub-resources found."
+msgstr ""
+
+#: editor/editor_plugin.cpp
+msgid "Creating Mesh Previews"
+msgstr ""
+
+#: editor/editor_plugin.cpp
+msgid "Thumbnail..."
+msgstr ""
+
+#: editor/editor_plugin_settings.cpp
+msgid "Main Script:"
+msgstr ""
+
+#: editor/editor_plugin_settings.cpp
+msgid "Edit Plugin"
+msgstr ""
+
+#: editor/editor_plugin_settings.cpp
+msgid "Installed Plugins:"
+msgstr ""
+
+#: editor/editor_plugin_settings.cpp editor/plugin_config_dialog.cpp
+msgid "Update"
+msgstr ""
+
+#: editor/editor_plugin_settings.cpp editor/plugin_config_dialog.cpp
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Version:"
+msgstr ""
+
+#: editor/editor_plugin_settings.cpp editor/plugin_config_dialog.cpp
+msgid "Author:"
+msgstr ""
+
+#: editor/editor_plugin_settings.cpp
+msgid "Status:"
+msgstr ""
+
+#: editor/editor_plugin_settings.cpp
+msgid "Edit:"
+msgstr ""
+
+#: editor/editor_profiler.cpp
+msgid "Measure:"
+msgstr ""
+
+#: editor/editor_profiler.cpp
+msgid "Frame Time (sec)"
+msgstr ""
+
+#: editor/editor_profiler.cpp
+msgid "Average Time (sec)"
+msgstr ""
+
+#: editor/editor_profiler.cpp
+msgid "Frame %"
+msgstr ""
+
+#: editor/editor_profiler.cpp
+msgid "Physics Frame %"
+msgstr ""
+
+#: editor/editor_profiler.cpp
+msgid "Inclusive"
+msgstr ""
+
+#: editor/editor_profiler.cpp
+msgid "Self"
+msgstr ""
+
+#: editor/editor_profiler.cpp
+msgid "Frame #:"
+msgstr ""
+
+#: editor/editor_profiler.cpp
+msgid "Time"
+msgstr ""
+
+#: editor/editor_profiler.cpp
+msgid "Calls"
+msgstr ""
+
+#: editor/editor_properties.cpp
+msgid "Edit Text:"
+msgstr ""
+
+#: editor/editor_properties.cpp editor/script_create_dialog.cpp
+msgid "On"
+msgstr ""
+
+#: editor/editor_properties.cpp
+msgid "Layer"
+msgstr ""
+
+#: editor/editor_properties.cpp
+msgid "Bit %d, value %d"
+msgstr ""
+
+#: editor/editor_properties.cpp
+msgid "[Empty]"
+msgstr ""
+
+#: editor/editor_properties.cpp editor/plugins/root_motion_editor_plugin.cpp
+msgid "Assign..."
+msgstr ""
+
+#: editor/editor_properties.cpp
+msgid "Invalid RID"
+msgstr ""
+
+#: editor/editor_properties.cpp
+msgid ""
+"The selected resource (%s) does not match any type expected for this "
+"property (%s)."
+msgstr ""
+
+#: editor/editor_properties.cpp
+msgid ""
+"Can't create a ViewportTexture on resources saved as a file.\n"
+"Resource needs to belong to a scene."
+msgstr ""
+
+#: editor/editor_properties.cpp
+msgid ""
+"Can't create a ViewportTexture on this resource because it's not set as "
+"local to scene.\n"
+"Please switch on the 'local to scene' property on it (and all resources "
+"containing it up to a node)."
+msgstr ""
+
+#: editor/editor_properties.cpp editor/property_editor.cpp
+msgid "Pick a Viewport"
+msgstr ""
+
+#: editor/editor_properties.cpp editor/property_editor.cpp
+msgid "New Script"
+msgstr ""
+
+#: editor/editor_properties.cpp editor/scene_tree_dock.cpp
+msgid "Extend Script"
+msgstr ""
+
+#: editor/editor_properties.cpp editor/property_editor.cpp
+msgid "New %s"
+msgstr ""
+
+#: editor/editor_properties.cpp editor/property_editor.cpp
+msgid "Make Unique"
+msgstr ""
+
+#: editor/editor_properties.cpp
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+#: editor/plugins/animation_player_editor_plugin.cpp
+#: editor/plugins/animation_state_machine_editor.cpp
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+#: editor/plugins/script_text_editor.cpp
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+#: editor/plugins/tile_map_editor_plugin.cpp editor/property_editor.cpp
+#: editor/scene_tree_dock.cpp scene/gui/line_edit.cpp scene/gui/text_edit.cpp
+msgid "Paste"
+msgstr ""
+
+#: editor/editor_properties.cpp editor/property_editor.cpp
+msgid "Convert To %s"
+msgstr ""
+
+#: editor/editor_properties.cpp editor/property_editor.cpp
+msgid "Selected node is not a Viewport!"
+msgstr ""
+
+#: editor/editor_properties_array_dict.cpp
+msgid "Size: "
+msgstr ""
+
+#: editor/editor_properties_array_dict.cpp
+msgid "Page: "
+msgstr ""
+
+#: editor/editor_properties_array_dict.cpp
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Remove Item"
+msgstr ""
+
+#: editor/editor_properties_array_dict.cpp
+msgid "New Key:"
+msgstr ""
+
+#: editor/editor_properties_array_dict.cpp
+msgid "New Value:"
+msgstr ""
+
+#: editor/editor_properties_array_dict.cpp
+msgid "Add Key/Value Pair"
+msgstr ""
+
+#: editor/editor_run_native.cpp
+msgid ""
+"No runnable export preset found for this platform.\n"
+"Please add a runnable preset in the Export menu or define an existing preset "
+"as runnable."
+msgstr ""
+
+#: editor/editor_run_script.cpp
+msgid "Write your logic in the _run() method."
+msgstr ""
+
+#: editor/editor_run_script.cpp
+msgid "There is an edited scene already."
+msgstr ""
+
+#: editor/editor_run_script.cpp
+msgid "Couldn't instance script:"
+msgstr ""
+
+#: editor/editor_run_script.cpp
+msgid "Did you forget the 'tool' keyword?"
+msgstr ""
+
+#: editor/editor_run_script.cpp
+msgid "Couldn't run script:"
+msgstr ""
+
+#: editor/editor_run_script.cpp
+msgid "Did you forget the '_run' method?"
+msgstr ""
+
+#: editor/editor_spin_slider.cpp
+msgid "Hold Ctrl to round to integers. Hold Shift for more precise changes."
+msgstr ""
+
+#: editor/editor_sub_scene.cpp
+msgid "Select Node(s) to Import"
+msgstr ""
+
+#: editor/editor_sub_scene.cpp editor/project_manager.cpp
+msgid "Browse"
+msgstr ""
+
+#: editor/editor_sub_scene.cpp
+msgid "Scene Path:"
+msgstr ""
+
+#: editor/editor_sub_scene.cpp
+msgid "Import From Node:"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Redownload"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Uninstall"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "(Installed)"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Download"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Official export templates aren't available for development builds."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "(Missing)"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "(Current)"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Retrieving mirrors, please wait..."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Remove template version '%s'?"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Can't open export templates zip."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Invalid version.txt format inside templates: %s."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "No version.txt found inside templates."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Error creating path for templates:"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Extracting Export Templates"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Importing:"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Error getting the list of mirrors."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Error parsing JSON of mirror list. Please report this issue!"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid ""
+"No download links found for this version. Direct download is only available "
+"for official releases."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Can't resolve."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Can't connect."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "No response."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Request Failed."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Redirect Loop."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Failed:"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Download Complete."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Cannot remove temporary file:"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid ""
+"Templates installation failed.\n"
+"The problematic templates archives can be found at '%s'."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Error requesting URL:"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Connecting to Mirror..."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Disconnected"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Resolving"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Can't Resolve"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Connecting..."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Can't Connect"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Connected"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Requesting..."
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Downloading"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Connection Error"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "SSL Handshake Error"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Uncompressing Android Build Sources"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Current Version:"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Installed Versions:"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Install From File"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Remove Template"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Select Template File"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Godot Export Templates"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Export Template Manager"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Download Templates"
+msgstr ""
+
+#: editor/export_template_manager.cpp
+msgid "Select mirror from list: (Shift+Click: Open in Browser)"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Favorites"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Status: Import of file failed. Please fix file and reimport manually."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid ""
+"Importing has been disabled for this file, so it can't be opened for editing."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Cannot move/rename resources root."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Cannot move a folder into itself."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Error moving:"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Error duplicating:"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Unable to update dependencies:"
+msgstr ""
+
+#: editor/filesystem_dock.cpp editor/scene_tree_editor.cpp
+msgid "No name provided."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Provided name contains invalid characters."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "A file or folder with this name already exists."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Name contains invalid characters."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid ""
+"The following files or folders conflict with items in the target location "
+"'%s':\n"
+"\n"
+"%s\n"
+"\n"
+"Do you wish to overwrite them?"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Renaming file:"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Renaming folder:"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Duplicating file:"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Duplicating folder:"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "New Inherited Scene"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Set As Main Scene"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Open Scenes"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Instance"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Add to Favorites"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Remove from Favorites"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Edit Dependencies..."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "View Owners..."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Move To..."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "New Scene..."
+msgstr ""
+
+#: editor/filesystem_dock.cpp editor/plugins/script_editor_plugin.cpp
+msgid "New Script..."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "New Resource..."
+msgstr ""
+
+#: editor/filesystem_dock.cpp editor/plugins/visual_shader_editor_plugin.cpp
+#: editor/script_editor_debugger.cpp
+msgid "Expand All"
+msgstr ""
+
+#: editor/filesystem_dock.cpp editor/plugins/visual_shader_editor_plugin.cpp
+#: editor/script_editor_debugger.cpp
+msgid "Collapse All"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Duplicate..."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Move to Trash"
+msgstr ""
+
+#: editor/filesystem_dock.cpp editor/plugins/animation_player_editor_plugin.cpp
+msgid "Rename..."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Previous Folder/File"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Next Folder/File"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Re-Scan Filesystem"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Toggle Split Mode"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Search files"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid ""
+"Scanning Files,\n"
+"Please Wait..."
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Move"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+#: editor/project_manager.cpp editor/rename_dialog.cpp
+#: editor/scene_tree_dock.cpp
+msgid "Rename"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Overwrite"
+msgstr ""
+
+#: editor/filesystem_dock.cpp
+msgid "Create Scene"
+msgstr ""
+
+#: editor/filesystem_dock.cpp editor/plugins/script_editor_plugin.cpp
+msgid "Create Script"
+msgstr ""
+
+#: editor/find_in_files.cpp editor/plugins/script_editor_plugin.cpp
+msgid "Find in Files"
+msgstr ""
+
+#: editor/find_in_files.cpp
+msgid "Find:"
+msgstr ""
+
+#: editor/find_in_files.cpp
+msgid "Folder:"
+msgstr ""
+
+#: editor/find_in_files.cpp
+msgid "Filters:"
+msgstr ""
+
+#: editor/find_in_files.cpp
+msgid ""
+"Include the files with the following extensions. Add or remove them in "
+"ProjectSettings."
+msgstr ""
+
+#: editor/find_in_files.cpp editor/plugins/script_editor_plugin.cpp
+#: editor/plugins/script_text_editor.cpp
+msgid "Find..."
+msgstr ""
+
+#: editor/find_in_files.cpp editor/plugins/script_text_editor.cpp
+msgid "Replace..."
+msgstr ""
+
+#: editor/find_in_files.cpp editor/progress_dialog.cpp scene/gui/dialogs.cpp
+msgid "Cancel"
+msgstr ""
+
+#: editor/find_in_files.cpp
+msgid "Find: "
+msgstr ""
+
+#: editor/find_in_files.cpp
+msgid "Replace: "
+msgstr ""
+
+#: editor/find_in_files.cpp
+msgid "Replace all (no undo)"
+msgstr ""
+
+#: editor/find_in_files.cpp
+msgid "Searching..."
+msgstr ""
+
+#: editor/find_in_files.cpp
+msgid "%d match in %d file."
+msgstr ""
+
+#: editor/find_in_files.cpp
+msgid "%d matches in %d file."
+msgstr ""
+
+#: editor/find_in_files.cpp
+msgid "%d matches in %d files."
+msgstr ""
+
+#: editor/groups_editor.cpp
+msgid "Add to Group"
+msgstr ""
+
+#: editor/groups_editor.cpp
+msgid "Remove from Group"
+msgstr ""
+
+#: editor/groups_editor.cpp
+msgid "Group name already exists."
+msgstr ""
+
+#: editor/groups_editor.cpp
+msgid "Invalid group name."
+msgstr ""
+
+#: editor/groups_editor.cpp
+msgid "Rename Group"
+msgstr ""
+
+#: editor/groups_editor.cpp
+msgid "Delete Group"
+msgstr ""
+
+#: editor/groups_editor.cpp editor/node_dock.cpp
+msgid "Groups"
+msgstr ""
+
+#: editor/groups_editor.cpp
+msgid "Nodes Not in Group"
+msgstr ""
+
+#: editor/groups_editor.cpp editor/scene_tree_dock.cpp
+#: editor/scene_tree_editor.cpp
+msgid "Filter nodes"
+msgstr ""
+
+#: editor/groups_editor.cpp
+msgid "Nodes in Group"
+msgstr ""
+
+#: editor/groups_editor.cpp
+msgid "Empty groups will be automatically removed."
+msgstr ""
+
+#: editor/groups_editor.cpp
+msgid "Group Editor"
+msgstr ""
+
+#: editor/groups_editor.cpp
+msgid "Manage Groups"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Import as Single Scene"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Import with Separate Animations"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Import with Separate Materials"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Import with Separate Objects"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Import with Separate Objects+Materials"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Import with Separate Objects+Animations"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Import with Separate Materials+Animations"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Import with Separate Objects+Materials+Animations"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Import as Multiple Scenes"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Import as Multiple Scenes+Materials"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+#: editor/plugins/mesh_library_editor_plugin.cpp
+msgid "Import Scene"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Importing Scene..."
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Generating Lightmaps"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Generating for Mesh: "
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Running Custom Script..."
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Couldn't load post-import script:"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Invalid/broken script for post-import (check console):"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Error running post-import script:"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Did you return a Node-derived object in the `post_import()` method?"
+msgstr ""
+
+#: editor/import/resource_importer_scene.cpp
+msgid "Saving..."
+msgstr ""
+
+#: editor/import_defaults_editor.cpp
+msgid "Select Importer"
+msgstr ""
+
+#: editor/import_defaults_editor.cpp
+msgid "Importer:"
+msgstr ""
+
+#: editor/import_defaults_editor.cpp
+msgid "Reset to Defaults"
+msgstr ""
+
+#: editor/import_dock.cpp
+msgid "Keep File (No Import)"
+msgstr ""
+
+#: editor/import_dock.cpp
+msgid "%d Files"
+msgstr ""
+
+#: editor/import_dock.cpp
+msgid "Set as Default for '%s'"
+msgstr ""
+
+#: editor/import_dock.cpp
+msgid "Clear Default for '%s'"
+msgstr ""
+
+#: editor/import_dock.cpp
+msgid "Import As:"
+msgstr ""
+
+#: editor/import_dock.cpp
+msgid "Preset"
+msgstr ""
+
+#: editor/import_dock.cpp
+msgid "Reimport"
+msgstr ""
+
+#: editor/import_dock.cpp
+msgid "Save Scenes, Re-Import, and Restart"
+msgstr ""
+
+#: editor/import_dock.cpp
+msgid "Changing the type of an imported file requires editor restart."
+msgstr ""
+
+#: editor/import_dock.cpp
+msgid ""
+"WARNING: Assets exist that use this resource, they may stop loading properly."
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Failed to load resource."
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Expand All Properties"
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Collapse All Properties"
+msgstr ""
+
+#: editor/inspector_dock.cpp editor/plugins/animation_player_editor_plugin.cpp
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Save As..."
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Copy Params"
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Edit Resource Clipboard"
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Copy Resource"
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Make Built-In"
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Make Sub-Resources Unique"
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Open in Help"
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Create a new resource in memory and edit it."
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Load an existing resource from disk and edit it."
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Save the currently edited resource."
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Go to the previous edited object in history."
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Go to the next edited object in history."
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "History of recently edited objects."
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Object properties."
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Filter properties"
+msgstr ""
+
+#: editor/inspector_dock.cpp
+msgid "Changes may be lost!"
+msgstr ""
+
+#: editor/multi_node_edit.cpp
+msgid "MultiNode Set"
+msgstr ""
+
+#: editor/node_dock.cpp
+msgid "Select a single node to edit its signals and groups."
+msgstr ""
+
+#: editor/plugin_config_dialog.cpp
+msgid "Edit a Plugin"
+msgstr ""
+
+#: editor/plugin_config_dialog.cpp
+msgid "Create a Plugin"
+msgstr ""
+
+#: editor/plugin_config_dialog.cpp
+msgid "Plugin Name:"
+msgstr ""
+
+#: editor/plugin_config_dialog.cpp
+msgid "Subfolder:"
+msgstr ""
+
+#: editor/plugin_config_dialog.cpp editor/script_create_dialog.cpp
+msgid "Language:"
+msgstr ""
+
+#: editor/plugin_config_dialog.cpp
+msgid "Script Name:"
+msgstr ""
+
+#: editor/plugin_config_dialog.cpp
+msgid "Activate now?"
+msgstr ""
+
+#: editor/plugins/abstract_polygon_2d_editor.cpp
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Create Polygon"
+msgstr ""
+
+#: editor/plugins/abstract_polygon_2d_editor.cpp
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Create points."
+msgstr ""
+
+#: editor/plugins/abstract_polygon_2d_editor.cpp
+msgid ""
+"Edit points.\n"
+"LMB: Move Point\n"
+"RMB: Erase Point"
+msgstr ""
+
+#: editor/plugins/abstract_polygon_2d_editor.cpp
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+msgid "Erase points."
+msgstr ""
+
+#: editor/plugins/abstract_polygon_2d_editor.cpp
+msgid "Edit Polygon"
+msgstr ""
+
+#: editor/plugins/abstract_polygon_2d_editor.cpp
+msgid "Insert Point"
+msgstr ""
+
+#: editor/plugins/abstract_polygon_2d_editor.cpp
+msgid "Edit Polygon (Remove Point)"
+msgstr ""
+
+#: editor/plugins/abstract_polygon_2d_editor.cpp
+msgid "Remove Polygon And Point"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+#: editor/plugins/animation_player_editor_plugin.cpp
+#: editor/plugins/animation_state_machine_editor.cpp
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Add Animation"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Load..."
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Move Node Point"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+msgid "Change BlendSpace1D Limits"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+msgid "Change BlendSpace1D Labels"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "This type of node can't be used. Only root nodes are allowed."
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Add Node Point"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Add Animation Point"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+msgid "Remove BlendSpace1D Point"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+msgid "Move BlendSpace1D Node Point"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid ""
+"AnimationTree is inactive.\n"
+"Activate to enable playback, check node warnings if activation fails."
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Set the blending position within the space"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Select and move points, create points with RMB."
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp scene/gui/graph_edit.cpp
+msgid "Enable snap and show grid."
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Point"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Open Editor"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_1d_editor.cpp
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Open Animation Node"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Triangle already exists."
+msgstr ""
+
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Add Triangle"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Change BlendSpace2D Limits"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Change BlendSpace2D Labels"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Remove BlendSpace2D Point"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Remove BlendSpace2D Triangle"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "BlendSpace2D does not belong to an AnimationTree node."
+msgstr ""
+
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "No triangles exist, so no blending can take place."
+msgstr ""
+
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Toggle Auto Triangles"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Create triangles by connecting points."
+msgstr ""
+
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Erase points and triangles."
+msgstr ""
+
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+msgid "Generate blend triangles automatically (instead of manually)"
+msgstr ""
+
+#: editor/plugins/animation_blend_space_2d_editor.cpp
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Blend:"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Parameter Changed"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Edit Filters"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Output node can't be added to the blend tree."
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Add Node to BlendTree"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Node Moved"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Unable to connect, port may be in use or connection may be invalid."
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Nodes Connected"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Nodes Disconnected"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Set Animation"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Delete Node"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+#: editor/scene_tree_dock.cpp
+msgid "Delete Node(s)"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Toggle Filter On/Off"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Change Filter"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "No animation player set, so unable to retrieve track names."
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Player path set is invalid, so unable to retrieve track names."
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+#: editor/plugins/root_motion_editor_plugin.cpp
+msgid ""
+"Animation player has no valid root node path, so unable to retrieve track "
+"names."
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Anim Clips"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Audio Clips"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Functions"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Node Renamed"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Add Node..."
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+#: editor/plugins/root_motion_editor_plugin.cpp
+msgid "Edit Filtered Tracks:"
+msgstr ""
+
+#: editor/plugins/animation_blend_tree_editor_plugin.cpp
+msgid "Enable Filtering"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Toggle Autoplay"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "New Animation Name:"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "New Anim"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Change Animation Name:"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Delete Animation?"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Remove Animation"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Invalid animation name!"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Animation name already exists!"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Rename Animation"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Blend Next Changed"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Change Blend Time"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Load Animation"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Duplicate Animation"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "No animation to copy!"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "No animation resource on clipboard!"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Pasted Animation"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Paste Animation"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "No animation to edit!"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Play selected animation backwards from current pos. (A)"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Play selected animation backwards from end. (Shift+A)"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Stop animation playback. (S)"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Play selected animation from start. (Shift+D)"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Play selected animation from current pos. (D)"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Animation position (in seconds)."
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Scale animation playback globally for the node."
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Animation Tools"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Animation"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Edit Transitions..."
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Open in Inspector"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Display list of animations in player."
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Autoplay on Load"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Enable Onion Skinning"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Onion Skinning Options"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Directions"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Past"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Future"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Depth"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "1 step"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "2 steps"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "3 steps"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Differences Only"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Force White Modulate"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Include Gizmos (3D)"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Pin AnimationPlayer"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Create New Animation"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Animation Name:"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+#: editor/plugins/script_editor_plugin.cpp
+#: editor/plugins/sprite_frames_editor_plugin.cpp editor/property_editor.cpp
+msgid "Error!"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Blend Times:"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Next (Auto Queue):"
+msgstr ""
+
+#: editor/plugins/animation_player_editor_plugin.cpp
+msgid "Cross-Animation Blend Times"
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Move Node"
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Transition exists!"
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Add Transition"
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Add Node"
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "End"
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Immediate"
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Sync"
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "At End"
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Travel"
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Start and end nodes are needed for a sub-transition."
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "No playback resource set at path: %s."
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Node Removed"
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Transition Removed"
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Set Start Node (Autoplay)"
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid ""
+"Select and move nodes.\n"
+"RMB to add new nodes.\n"
+"Shift+LMB to create connections."
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Create new nodes."
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Connect nodes."
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Remove selected node or transition."
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Toggle autoplay this animation on start, restart or seek to zero."
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Set the end animation. This is useful for sub-transitions."
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Transition: "
+msgstr ""
+
+#: editor/plugins/animation_state_machine_editor.cpp
+msgid "Play Mode:"
+msgstr ""
+
+#: editor/plugins/animation_tree_editor_plugin.cpp
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "AnimationTree"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "New name:"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Scale:"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Fade In (s):"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Fade Out (s):"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Blend"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Mix"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Auto Restart:"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Restart (s):"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Random Restart (s):"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Start!"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Amount:"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Blend 0:"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Blend 1:"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "X-Fade Time (s):"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Current:"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+#: editor/plugins/visual_shader_editor_plugin.cpp
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Add Input"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Clear Auto-Advance"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Set Auto-Advance"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Delete Input"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Animation tree is valid."
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Animation tree is invalid."
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Animation Node"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "OneShot Node"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Mix Node"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Blend2 Node"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Blend3 Node"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Blend4 Node"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "TimeScale Node"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "TimeSeek Node"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Transition Node"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Import Animations..."
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Edit Node Filters"
+msgstr ""
+
+#: editor/plugins/animation_tree_player_editor_plugin.cpp
+msgid "Filters..."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Contents:"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "View Files"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Connection error, please try again."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Can't connect to host:"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "No response from host:"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Can't resolve hostname:"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Request failed, return code:"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Request failed."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Cannot save response to:"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Write error."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Request failed, too many redirects"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Redirect loop."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Request failed, timeout"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Timeout."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Bad download hash, assuming file has been tampered with."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Expected:"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Got:"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Failed SHA-256 hash check"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Asset Download Error:"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Downloading (%s / %s)..."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Downloading..."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Resolving..."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Error making request"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Idle"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Install..."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Retry"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Download Error"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Download for this asset is already in progress!"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Recently Updated"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Least Recently Updated"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Name (A-Z)"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Name (Z-A)"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "License (A-Z)"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "License (Z-A)"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "First"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Previous"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Next"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Last"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "All"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "No results for \"%s\"."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Import..."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Plugins..."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp editor/project_manager.cpp
+msgid "Sort:"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Category:"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Site:"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Support"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Official"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Testing"
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Loading..."
+msgstr ""
+
+#: editor/plugins/asset_library_editor_plugin.cpp
+msgid "Assets ZIP File"
+msgstr ""
+
+#: editor/plugins/baked_lightmap_editor_plugin.cpp
+msgid ""
+"Can't determine a save path for lightmap images.\n"
+"Save your scene and try again."
+msgstr ""
+
+#: editor/plugins/baked_lightmap_editor_plugin.cpp
+msgid ""
+"No meshes to bake. Make sure they contain an UV2 channel and that the 'Bake "
+"Light' flag is on."
+msgstr ""
+
+#: editor/plugins/baked_lightmap_editor_plugin.cpp
+msgid "Failed creating lightmap images, make sure path is writable."
+msgstr ""
+
+#: editor/plugins/baked_lightmap_editor_plugin.cpp
+msgid "Failed determining lightmap size. Maximum lightmap size too small?"
+msgstr ""
+
+#: editor/plugins/baked_lightmap_editor_plugin.cpp
+msgid ""
+"Some mesh is invalid. Make sure the UV2 channel values are contained within "
+"the [0.0,1.0] square region."
+msgstr ""
+
+#: editor/plugins/baked_lightmap_editor_plugin.cpp
+msgid ""
+"Godot editor was built without ray tracing support, lightmaps can't be baked."
+msgstr ""
+
+#: editor/plugins/baked_lightmap_editor_plugin.cpp
+msgid "Bake Lightmaps"
+msgstr ""
+
+#: editor/plugins/baked_lightmap_editor_plugin.cpp
+msgid "Select lightmap bake file:"
+msgstr ""
+
+#: editor/plugins/camera_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Preview"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Configure Snap"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Grid Offset:"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Grid Step:"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Primary Line Every:"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "steps"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Rotation Offset:"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Rotation Step:"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Scale Step:"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Move Vertical Guide"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Create Vertical Guide"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Remove Vertical Guide"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Move Horizontal Guide"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Create Horizontal Guide"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Remove Horizontal Guide"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Create Horizontal and Vertical Guides"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Set CanvasItem \"%s\" Pivot Offset to (%d, %d)"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Rotate %d CanvasItems"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Rotate CanvasItem \"%s\" to %d degrees"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Move CanvasItem \"%s\" Anchor"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Scale Node2D \"%s\" to (%s, %s)"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Resize Control \"%s\" to (%d, %d)"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Scale %d CanvasItems"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Scale CanvasItem \"%s\" to (%s, %s)"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Move %d CanvasItems"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Move CanvasItem \"%s\" to (%d, %d)"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid ""
+"Children of containers have their anchors and margins values overridden by "
+"their parent."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Presets for the anchors and margins values of a Control node."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid ""
+"When active, moving Control nodes changes their anchors instead of their "
+"margins."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Top Left"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Top Right"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Bottom Right"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Bottom Left"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Center Left"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Center Top"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Center Right"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Center Bottom"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Center"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Left Wide"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Top Wide"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Right Wide"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Bottom Wide"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "VCenter Wide"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "HCenter Wide"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Full Rect"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Keep Ratio"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Anchors only"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Change Anchors and Margins"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Change Anchors"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
+"Game Camera Override\n"
+"Overrides game camera with editor viewport camera."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
+"Game Camera Override\n"
+"No game instance running."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Lock Selected"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Unlock Selected"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Group Selected"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Ungroup Selected"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Paste Pose"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Clear Guides"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Create Custom Bone(s) from Node(s)"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Clear Bones"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Make IK Chain"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Clear IK Chain"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid ""
+"Warning: Children of a container get their position and size determined only "
+"by their parent."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/texture_region_editor_plugin.cpp
+#: editor/plugins/tile_set_editor_plugin.cpp scene/gui/graph_edit.cpp
+msgid "Zoom Reset"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Select Mode"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Drag: Rotate"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Alt+Drag: Move"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Press 'v' to Change Pivot, 'Shift+v' to Drag Pivot (while moving)."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Alt+RMB: Depth list selection"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Move Mode"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Rotate Mode"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Scale Mode"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
+"Show a list of all objects at the position clicked\n"
+"(same as Alt+RMB in select mode)."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Click to change object's rotation pivot."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Pan Mode"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Ruler Mode"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Toggle smart snapping."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Use Smart Snap"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Toggle grid snapping."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Use Grid Snap"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Snapping Options"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Use Rotation Snap"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Use Scale Snap"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Snap Relative"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Use Pixel Snap"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Smart Snapping"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Configure Snap..."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Snap to Parent"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Snap to Node Anchor"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Snap to Node Sides"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Snap to Node Center"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Snap to Other Nodes"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Snap to Guides"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Lock the selected object in place (can't be moved)."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Unlock the selected object (can be moved)."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Makes sure the object's children are not selectable."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Restores the object's children's ability to be selected."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Skeleton Options"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Show Bones"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Make Custom Bone(s) from Node(s)"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Clear Custom Bones"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "View"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Always Show Grid"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Show Helpers"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Show Rulers"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Show Guides"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Show Origin"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Show Viewport"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Show Group And Lock Icons"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Center Selection"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Frame Selection"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Preview Canvas Scale"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Translation mask for inserting keys."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Rotation mask for inserting keys."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Scale mask for inserting keys."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Insert keys (based on mask)."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid ""
+"Auto insert keys when objects are translated, rotated or scaled (based on "
+"mask).\n"
+"Keys are only added to existing tracks, no new tracks will be created.\n"
+"Keys must be inserted manually for the first time."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Auto Insert Key"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Animation Key and Pose Options"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Insert Key (Existing Tracks)"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Copy Pose"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Clear Pose"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Multiply grid step by 2"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Divide grid step by 2"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Pan View"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Add %s"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Adding %s..."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Cannot instantiate multiple nodes without root."
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp editor/scene_tree_dock.cpp
+msgid "Create Node"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+#: editor/plugins/spatial_editor_plugin.cpp editor/scene_tree_dock.cpp
+msgid "Error instancing scene from %s"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid "Change Default Type"
+msgstr ""
+
+#: editor/plugins/canvas_item_editor_plugin.cpp
+msgid ""
+"Drag & drop + Shift : Add node as sibling\n"
+"Drag & drop + Alt : Change node type"
+msgstr ""
+
+#: editor/plugins/collision_polygon_editor_plugin.cpp
+msgid "Create Polygon3D"
+msgstr ""
+
+#: editor/plugins/collision_polygon_editor_plugin.cpp
+msgid "Edit Poly"
+msgstr ""
+
+#: editor/plugins/collision_polygon_editor_plugin.cpp
+msgid "Edit Poly (Remove Point)"
+msgstr ""
+
+#: editor/plugins/collision_shape_2d_editor_plugin.cpp
+msgid "Set Handle"
+msgstr ""
+
+#: editor/plugins/cpu_particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_2d_editor_plugin.cpp
+msgid "Load Emission Mask"
+msgstr ""
+
+#: editor/plugins/cpu_particles_2d_editor_plugin.cpp
+#: editor/plugins/cpu_particles_editor_plugin.cpp
+#: editor/plugins/particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Restart"
+msgstr ""
+
+#: editor/plugins/cpu_particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_2d_editor_plugin.cpp
+msgid "Clear Emission Mask"
+msgstr ""
+
+#: editor/plugins/cpu_particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Particles"
+msgstr ""
+
+#: editor/plugins/cpu_particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_2d_editor_plugin.cpp
+msgid "Generated Point Count:"
+msgstr ""
+
+#: editor/plugins/cpu_particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_2d_editor_plugin.cpp
+msgid "Emission Mask"
+msgstr ""
+
+#: editor/plugins/cpu_particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_2d_editor_plugin.cpp
+msgid "Solid Pixels"
+msgstr ""
+
+#: editor/plugins/cpu_particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_2d_editor_plugin.cpp
+msgid "Border Pixels"
+msgstr ""
+
+#: editor/plugins/cpu_particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_2d_editor_plugin.cpp
+msgid "Directed Border Pixels"
+msgstr ""
+
+#: editor/plugins/cpu_particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_2d_editor_plugin.cpp
+msgid "Capture from Pixel"
+msgstr ""
+
+#: editor/plugins/cpu_particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_2d_editor_plugin.cpp
+msgid "Emission Colors"
+msgstr ""
+
+#: editor/plugins/cpu_particles_editor_plugin.cpp
+msgid "CPUParticles"
+msgstr ""
+
+#: editor/plugins/cpu_particles_editor_plugin.cpp
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Create Emission Points From Mesh"
+msgstr ""
+
+#: editor/plugins/cpu_particles_editor_plugin.cpp
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Create Emission Points From Node"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Flat 0"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Flat 1"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp editor/property_editor.cpp
+msgid "Ease In"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp editor/property_editor.cpp
+msgid "Ease Out"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Smoothstep"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Modify Curve Point"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Modify Curve Tangent"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Load Curve Preset"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Add Point"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Remove Point"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Left Linear"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Right Linear"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Load Preset"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Remove Curve Point"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Toggle Curve Linear Tangent"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Hold Shift to edit tangents individually"
+msgstr ""
+
+#: editor/plugins/curve_editor_plugin.cpp
+msgid "Right click to add point"
+msgstr ""
+
+#: editor/plugins/gi_probe_editor_plugin.cpp
+msgid "Bake GI Probe"
+msgstr ""
+
+#: editor/plugins/gradient_editor_plugin.cpp
+msgid "Gradient Edited"
+msgstr ""
+
+#: editor/plugins/item_list_editor_plugin.cpp
+msgid "Item %d"
+msgstr ""
+
+#: editor/plugins/item_list_editor_plugin.cpp
+msgid "Items"
+msgstr ""
+
+#: editor/plugins/item_list_editor_plugin.cpp
+msgid "Item List Editor"
+msgstr ""
+
+#: editor/plugins/light_occluder_2d_editor_plugin.cpp
+msgid "Create Occluder Polygon"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Mesh is empty!"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Couldn't create a Trimesh collision shape."
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Create Static Trimesh Body"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "This doesn't work on scene root!"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Create Trimesh Static Shape"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Can't create a single convex collision shape for the scene root."
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Couldn't create a single convex collision shape."
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Create Single Convex Shape"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Can't create multiple convex collision shapes for the scene root."
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Couldn't create any collision shapes."
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Create Multiple Convex Shapes"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Create Navigation Mesh"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Contained Mesh is not of type ArrayMesh."
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "UV Unwrap failed, mesh may not be manifold?"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "No mesh to debug."
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Model has no UV in this layer"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "MeshInstance lacks a Mesh!"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Mesh has not surface to create outlines from!"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Mesh primitive type is not PRIMITIVE_TRIANGLES!"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Could not create outline!"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Create Outline"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Mesh"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Create Trimesh Static Body"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid ""
+"Creates a StaticBody and assigns a polygon-based collision shape to it "
+"automatically.\n"
+"This is the most accurate (but slowest) option for collision detection."
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Create Trimesh Collision Sibling"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid ""
+"Creates a polygon-based collision shape.\n"
+"This is the most accurate (but slowest) option for collision detection."
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Create Single Convex Collision Sibling"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid ""
+"Creates a single convex collision shape.\n"
+"This is the fastest (but least accurate) option for collision detection."
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Create Multiple Convex Collision Siblings"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid ""
+"Creates a polygon-based collision shape.\n"
+"This is a performance middle-ground between the two above options."
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Create Outline Mesh..."
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid ""
+"Creates a static outline mesh. The outline mesh will have its normals "
+"flipped automatically.\n"
+"This can be used instead of the SpatialMaterial Grow property when using "
+"that property isn't possible."
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "View UV1"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "View UV2"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Unwrap UV2 for Lightmap/AO"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Create Outline Mesh"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "Outline Size:"
+msgstr ""
+
+#: editor/plugins/mesh_instance_editor_plugin.cpp
+msgid "UV Channel Debug"
+msgstr ""
+
+#: editor/plugins/mesh_library_editor_plugin.cpp
+msgid "Remove item %d?"
+msgstr ""
+
+#: editor/plugins/mesh_library_editor_plugin.cpp
+msgid ""
+"Update from existing scene?:\n"
+"%s"
+msgstr ""
+
+#: editor/plugins/mesh_library_editor_plugin.cpp
+msgid "Mesh Library"
+msgstr ""
+
+#: editor/plugins/mesh_library_editor_plugin.cpp
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Add Item"
+msgstr ""
+
+#: editor/plugins/mesh_library_editor_plugin.cpp
+msgid "Remove Selected Item"
+msgstr ""
+
+#: editor/plugins/mesh_library_editor_plugin.cpp
+msgid "Import from Scene"
+msgstr ""
+
+#: editor/plugins/mesh_library_editor_plugin.cpp
+msgid "Update from Scene"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "No mesh source specified (and no MultiMesh set in node)."
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "No mesh source specified (and MultiMesh contains no Mesh)."
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Mesh source is invalid (invalid path)."
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Mesh source is invalid (not a MeshInstance)."
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Mesh source is invalid (contains no Mesh resource)."
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "No surface source specified."
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Surface source is invalid (invalid path)."
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Surface source is invalid (no geometry)."
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Surface source is invalid (no faces)."
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Select a Source Mesh:"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Select a Target Surface:"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Populate Surface"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Populate MultiMesh"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Target Surface:"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Source Mesh:"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "X-Axis"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Y-Axis"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Z-Axis"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Mesh Up Axis:"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Random Rotation:"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Random Tilt:"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Random Scale:"
+msgstr ""
+
+#: editor/plugins/multimesh_editor_plugin.cpp
+msgid "Populate"
+msgstr ""
+
+#: editor/plugins/navigation_polygon_editor_plugin.cpp
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Create Navigation Polygon"
+msgstr ""
+
+#: editor/plugins/particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Convert to CPUParticles"
+msgstr ""
+
+#: editor/plugins/particles_2d_editor_plugin.cpp
+msgid "Generating Visibility Rect"
+msgstr ""
+
+#: editor/plugins/particles_2d_editor_plugin.cpp
+msgid "Generate Visibility Rect"
+msgstr ""
+
+#: editor/plugins/particles_2d_editor_plugin.cpp
+msgid "Can only set point into a ParticlesMaterial process material"
+msgstr ""
+
+#: editor/plugins/particles_2d_editor_plugin.cpp
+msgid "Convert to CPUParticles2D"
+msgstr ""
+
+#: editor/plugins/particles_2d_editor_plugin.cpp
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Generation Time (sec):"
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "The geometry's faces don't contain any area."
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "The geometry doesn't contain any faces."
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "\"%s\" doesn't inherit from Spatial."
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "\"%s\" doesn't contain geometry."
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "\"%s\" doesn't contain face geometry."
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Create Emitter"
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Emission Points:"
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Surface Points"
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Surface Points+Normal (Directed)"
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Volume"
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Emission Source: "
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "A processor material of type 'ParticlesMaterial' is required."
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Generating AABB"
+msgstr ""
+
+#: editor/plugins/particles_editor_plugin.cpp
+msgid "Generate Visibility AABB"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+msgid "Remove Point from Curve"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+msgid "Remove Out-Control from Curve"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+msgid "Remove In-Control from Curve"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Add Point to Curve"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+msgid "Split Curve"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+msgid "Move Point in Curve"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+msgid "Move In-Control in Curve"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+msgid "Move Out-Control in Curve"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Select Points"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Shift+Drag: Select Control Points"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Click: Add Point"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+msgid "Left Click: Split Segment (in curve)"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Right Click: Delete Point"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+msgid "Select Control Points (Shift+Drag)"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Add Point (in empty space)"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Delete Point"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Close Curve"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+#: editor/plugins/path_editor_plugin.cpp editor/plugins/theme_editor_plugin.cpp
+#: editor/plugins/visual_shader_editor_plugin.cpp editor/project_export.cpp
+msgid "Options"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Mirror Handle Angles"
+msgstr ""
+
+#: editor/plugins/path_2d_editor_plugin.cpp
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Mirror Handle Lengths"
+msgstr ""
+
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Curve Point #"
+msgstr ""
+
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Set Curve Point Position"
+msgstr ""
+
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Set Curve In Position"
+msgstr ""
+
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Set Curve Out Position"
+msgstr ""
+
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Split Path"
+msgstr ""
+
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Remove Path Point"
+msgstr ""
+
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Remove Out-Control Point"
+msgstr ""
+
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Remove In-Control Point"
+msgstr ""
+
+#: editor/plugins/path_editor_plugin.cpp
+msgid "Split Segment (in curve)"
+msgstr ""
+
+#: editor/plugins/physical_bone_plugin.cpp
+msgid "Move Joint"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid ""
+"The skeleton property of the Polygon2D does not point to a Skeleton2D node"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Sync Bones"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid ""
+"No texture in this polygon.\n"
+"Set a texture to be able to edit UV."
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Create UV Map"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid ""
+"Polygon 2D has internal vertices, so it can no longer be edited in the "
+"viewport."
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Create Polygon & UV"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Create Internal Vertex"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Remove Internal Vertex"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Invalid Polygon (need 3 different vertices)"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Add Custom Polygon"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Remove Custom Polygon"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Transform UV Map"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Transform Polygon"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Paint Bone Weights"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Open Polygon 2D UV editor."
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Polygon 2D UV Editor"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "UV"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Points"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Polygons"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Bones"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Move Points"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Command: Rotate"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Shift: Move All"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Shift+Command: Scale"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Ctrl: Rotate"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Shift+Ctrl: Scale"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Move Polygon"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Rotate Polygon"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Scale Polygon"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Create a custom polygon. Enables custom polygon rendering."
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid ""
+"Remove a custom polygon. If none remain, custom polygon rendering is "
+"disabled."
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Paint weights with specified intensity."
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Unpaint weights with specified intensity."
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Radius:"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Copy Polygon to UV"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Copy UV to Polygon"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Clear UV"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Grid Settings"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Snap"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Enable Snap"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Grid"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Show Grid"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Configure Grid:"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Grid Offset X:"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Grid Offset Y:"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Grid Step X:"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Grid Step Y:"
+msgstr ""
+
+#: editor/plugins/polygon_2d_editor_plugin.cpp
+msgid "Sync Bones to Polygon"
+msgstr ""
+
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+msgid "ERROR: Couldn't load resource!"
+msgstr ""
+
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+msgid "Add Resource"
+msgstr ""
+
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+msgid "Rename Resource"
+msgstr ""
+
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Delete Resource"
+msgstr ""
+
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+msgid "Resource clipboard is empty!"
+msgstr ""
+
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+msgid "Paste Resource"
+msgstr ""
+
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+#: editor/scene_tree_editor.cpp
+msgid "Instance:"
+msgstr ""
+
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+#: editor/plugins/theme_editor_plugin.cpp editor/project_settings_editor.cpp
+#: editor/scene_tree_editor.cpp editor/script_editor_debugger.cpp
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Type:"
+msgstr ""
+
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+#: editor/scene_tree_dock.cpp editor/scene_tree_editor.cpp
+msgid "Open in Editor"
+msgstr ""
+
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+msgid "Load Resource"
+msgstr ""
+
+#: editor/plugins/resource_preloader_editor_plugin.cpp
+msgid "ResourcePreloader"
+msgstr ""
+
+#: editor/plugins/root_motion_editor_plugin.cpp
+msgid "AnimationTree has no path set to an AnimationPlayer"
+msgstr ""
+
+#: editor/plugins/root_motion_editor_plugin.cpp
+msgid "Path to AnimationPlayer is invalid"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Clear Recent Files"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Close and save changes?"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Error writing TextFile:"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Could not load file at:"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Error saving file!"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Error while saving theme."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Error Saving"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Error importing theme."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Error Importing"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "New Text File..."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Open File"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Save File As..."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Can't obtain the script for running."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Script failed reloading, check console for errors."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Script is not in tool mode, will not be able to run."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid ""
+"To run this script, it must inherit EditorScript and be set to tool mode."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Import Theme"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Error while saving theme"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Error saving"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Save Theme As..."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "%s Class Reference"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+#: editor/plugins/script_text_editor.cpp
+msgid "Find Next"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+#: editor/plugins/script_text_editor.cpp
+msgid "Find Previous"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Filter scripts"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Toggle alphabetical sorting of the method list."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Filter methods"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Sort"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+#: editor/plugins/script_text_editor.cpp editor/scene_tree_dock.cpp
+#: modules/gdnative/gdnative_library_editor_plugin.cpp
+msgid "Move Up"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+#: editor/plugins/script_text_editor.cpp editor/scene_tree_dock.cpp
+#: modules/gdnative/gdnative_library_editor_plugin.cpp
+msgid "Move Down"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Next script"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Previous script"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "File"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Open..."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Reopen Closed Script"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Save All"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Soft Reload Script"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Copy Script Path"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "History Previous"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "History Next"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Theme"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Import Theme..."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Reload Theme"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Save Theme"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Close All"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Close Docs"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp editor/project_manager.cpp
+msgid "Run"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+#: editor/plugins/script_text_editor.cpp
+#: editor/plugins/shader_editor_plugin.cpp editor/plugins/text_editor.cpp
+#: editor/plugins/visual_shader_editor_plugin.cpp editor/project_manager.cpp
+#: editor/project_settings_editor.cpp editor/settings_config_dialog.cpp
+msgid "Search"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp editor/script_editor_debugger.cpp
+msgid "Step Into"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp editor/script_editor_debugger.cpp
+msgid "Step Over"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp editor/script_editor_debugger.cpp
+msgid "Break"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp editor/project_manager.cpp
+#: editor/script_editor_debugger.cpp
+msgid "Continue"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Keep Debugger Open"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Debug with External Editor"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Open Godot online documentation."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Search the reference documentation."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Go to previous edited document."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Go to next edited document."
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Discard"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid ""
+"The following files are newer on disk.\n"
+"What action should be taken?:"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp editor/script_editor_debugger.cpp
+msgid "Debugger"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Search Results"
+msgstr ""
+
+#: editor/plugins/script_editor_plugin.cpp
+msgid "Clear Recent Scripts"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Connections to method:"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp editor/script_editor_debugger.cpp
+msgid "Source"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Target"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid ""
+"Missing connected method '%s' for signal '%s' from node '%s' to node '%s'."
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "[Ignore]"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Line"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Go to Function"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Only resources from filesystem can be dropped."
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Can't drop nodes because script '%s' is not used in this scene."
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Lookup Symbol"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Pick Color"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp editor/plugins/text_editor.cpp
+msgid "Convert Case"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp editor/plugins/text_editor.cpp
+msgid "Uppercase"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp editor/plugins/text_editor.cpp
+msgid "Lowercase"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp editor/plugins/text_editor.cpp
+msgid "Capitalize"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp editor/plugins/text_editor.cpp
+msgid "Syntax Highlighter"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+#: editor/plugins/shader_editor_plugin.cpp editor/plugins/text_editor.cpp
+msgid "Bookmarks"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Breakpoints"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+#: editor/plugins/shader_editor_plugin.cpp editor/plugins/text_editor.cpp
+msgid "Go To"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp editor/scene_tree_dock.cpp
+#: scene/gui/line_edit.cpp scene/gui/text_edit.cpp
+msgid "Cut"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp scene/gui/line_edit.cpp
+#: scene/gui/text_edit.cpp
+msgid "Select All"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Delete Line"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Indent Left"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Indent Right"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Toggle Comment"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Fold/Unfold Line"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Fold All Lines"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Unfold All Lines"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Clone Down"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Complete Symbol"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Evaluate Selection"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Trim Trailing Whitespace"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Convert Indent to Spaces"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Convert Indent to Tabs"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Auto Indent"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Find in Files..."
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Contextual Help"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Toggle Bookmark"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Go to Next Bookmark"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Go to Previous Bookmark"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Remove All Bookmarks"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Go to Function..."
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Go to Line..."
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Toggle Breakpoint"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Remove All Breakpoints"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Go to Next Breakpoint"
+msgstr ""
+
+#: editor/plugins/script_text_editor.cpp
+msgid "Go to Previous Breakpoint"
+msgstr ""
+
+#: editor/plugins/shader_editor_plugin.cpp
+msgid ""
+"This shader has been modified on on disk.\n"
+"What action should be taken?"
+msgstr ""
+
+#: editor/plugins/shader_editor_plugin.cpp
+msgid "Shader"
+msgstr ""
+
+#: editor/plugins/skeleton_2d_editor_plugin.cpp
+msgid "This skeleton has no bones, create some children Bone2D nodes."
+msgstr ""
+
+#: editor/plugins/skeleton_2d_editor_plugin.cpp
+msgid "Create Rest Pose from Bones"
+msgstr ""
+
+#: editor/plugins/skeleton_2d_editor_plugin.cpp
+msgid "Set Rest Pose to Bones"
+msgstr ""
+
+#: editor/plugins/skeleton_2d_editor_plugin.cpp
+msgid "Skeleton2D"
+msgstr ""
+
+#: editor/plugins/skeleton_2d_editor_plugin.cpp
+msgid "Make Rest Pose (From Bones)"
+msgstr ""
+
+#: editor/plugins/skeleton_2d_editor_plugin.cpp
+msgid "Set Bones to Rest Pose"
+msgstr ""
+
+#: editor/plugins/skeleton_editor_plugin.cpp
+msgid "Create physical bones"
+msgstr ""
+
+#: editor/plugins/skeleton_editor_plugin.cpp
+msgid "Skeleton"
+msgstr ""
+
+#: editor/plugins/skeleton_editor_plugin.cpp
+msgid "Create physical skeleton"
+msgstr ""
+
+#: editor/plugins/skeleton_ik_editor_plugin.cpp
+msgid "Play IK"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Orthogonal"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Perspective"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Transform Aborted."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "X-Axis Transform."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Y-Axis Transform."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Z-Axis Transform."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "View Plane Transform."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Scaling: "
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Translating: "
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Rotating %s degrees."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Keying is disabled (no key inserted)."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Animation Key Inserted."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Pitch"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Yaw"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Size"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Objects Drawn"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Material Changes"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Shader Changes"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Surface Changes"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Draw Calls"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Vertices"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Top View."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Bottom View."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Bottom"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Left View."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Left"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Right View."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Right"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Front View."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Front"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Rear View."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Rear"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Align Transform with View"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Align Rotation with View"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp editor/scene_tree_dock.cpp
+msgid "No parent to instance a child at."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp editor/scene_tree_dock.cpp
+msgid "This operation requires a single selected node."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Auto Orthogonal Enabled"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Lock View Rotation"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Display Normal"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Display Wireframe"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Display Overdraw"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Display Unshaded"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "View Environment"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "View Gizmos"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "View Information"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "View FPS"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Half Resolution"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Audio Listener"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Enable Doppler"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Cinematic Preview"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Not available when using the GLES2 renderer."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Freelook Left"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Freelook Right"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Freelook Forward"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Freelook Backwards"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Freelook Up"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Freelook Down"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Freelook Speed Modifier"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Freelook Slow Modifier"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "View Rotation Locked"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
+"Note: The FPS value displayed is the editor's framerate.\n"
+"It cannot be used as a reliable indication of in-game performance."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "XForm Dialog"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
+"Click to toggle between visibility states.\n"
+"\n"
+"Open eye: Gizmo is visible.\n"
+"Closed eye: Gizmo is hidden.\n"
+"Half-open eye: Gizmo is also visible through opaque surfaces (\"x-ray\")."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Snap Nodes To Floor"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Couldn't find a solid floor to snap the selection to."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
+"Drag: Rotate\n"
+"Alt+Drag: Move\n"
+"Alt+RMB: Depth list selection"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Use Local Space"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Use Snap"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Bottom View"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Top View"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Rear View"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Front View"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Left View"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Right View"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Switch Perspective/Orthogonal View"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Insert Animation Key"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Focus Origin"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Focus Selection"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Toggle Freelook"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Transform"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Snap Object to Floor"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Transform Dialog..."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "1 Viewport"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "2 Viewports"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "2 Viewports (Alt)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "3 Viewports"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "3 Viewports (Alt)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "4 Viewports"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Gizmos"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "View Origin"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "View Grid"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Settings..."
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Snap Settings"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Translate Snap:"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Rotate Snap (deg.):"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Scale Snap (%):"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Viewport Settings"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Perspective FOV (deg.):"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "View Z-Near:"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "View Z-Far:"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Transform Change"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Translate:"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Rotate (deg.):"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Scale (ratio):"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Transform Type"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Pre"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Post"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid "Nameless gizmo"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Create Mesh2D"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Mesh2D Preview"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Create Polygon2D"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Polygon2D Preview"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Create CollisionPolygon2D"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "CollisionPolygon2D Preview"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Create LightOccluder2D"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "LightOccluder2D Preview"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Sprite is empty!"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Can't convert a sprite using animation frames to mesh."
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Invalid geometry, can't replace by mesh."
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Convert to Mesh2D"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Invalid geometry, can't create polygon."
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Convert to Polygon2D"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Invalid geometry, can't create collision polygon."
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Create CollisionPolygon2D Sibling"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Invalid geometry, can't create light occluder."
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Create LightOccluder2D Sibling"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Sprite"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Simplification: "
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Shrink (Pixels): "
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Grow (Pixels): "
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Update Preview"
+msgstr ""
+
+#: editor/plugins/sprite_editor_plugin.cpp
+msgid "Settings:"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "No Frames Selected"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Add %d Frame(s)"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Add Frame"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Unable to load images"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "ERROR: Couldn't load frame resource!"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Resource clipboard is empty or not a texture!"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Paste Frame"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Add Empty"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Change Animation FPS"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "(empty)"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Move Frame"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Animations:"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "New Animation"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Speed:"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Loop"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Animation Frames:"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Add a Texture from File"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Add Frames from a Sprite Sheet"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Insert Empty (Before)"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Insert Empty (After)"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Move (Before)"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Move (After)"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Select Frames"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Horizontal:"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Vertical:"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Select/Clear All Frames"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "Create Frames from Sprite Sheet"
+msgstr ""
+
+#: editor/plugins/sprite_frames_editor_plugin.cpp
+msgid "SpriteFrames"
+msgstr ""
+
+#: editor/plugins/texture_region_editor_plugin.cpp
+msgid "Set Region Rect"
+msgstr ""
+
+#: editor/plugins/texture_region_editor_plugin.cpp
+msgid "Set Margin"
+msgstr ""
+
+#: editor/plugins/texture_region_editor_plugin.cpp
+msgid "Snap Mode:"
+msgstr ""
+
+#: editor/plugins/texture_region_editor_plugin.cpp
+#: scene/resources/visual_shader.cpp
+msgid "None"
+msgstr ""
+
+#: editor/plugins/texture_region_editor_plugin.cpp
+msgid "Pixel Snap"
+msgstr ""
+
+#: editor/plugins/texture_region_editor_plugin.cpp
+msgid "Grid Snap"
+msgstr ""
+
+#: editor/plugins/texture_region_editor_plugin.cpp
+msgid "Auto Slice"
+msgstr ""
+
+#: editor/plugins/texture_region_editor_plugin.cpp
+msgid "Offset:"
+msgstr ""
+
+#: editor/plugins/texture_region_editor_plugin.cpp
+msgid "Step:"
+msgstr ""
+
+#: editor/plugins/texture_region_editor_plugin.cpp
+msgid "Sep.:"
+msgstr ""
+
+#: editor/plugins/texture_region_editor_plugin.cpp
+msgid "TextureRegion"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Add All Items"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Add All"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Remove All Items"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp editor/project_manager.cpp
+msgid "Remove All"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Edit Theme"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Theme editing menu."
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Add Class Items"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Remove Class Items"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Create Empty Template"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Create Empty Editor Template"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Create From Current Editor Theme"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Toggle Button"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Disabled Button"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Item"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Disabled Item"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Check Item"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Checked Item"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Radio Item"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Checked Radio Item"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Named Sep."
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Submenu"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Subitem 1"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Subitem 2"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Has"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Many"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Disabled LineEdit"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Tab 1"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Tab 2"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Tab 3"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Editable Item"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Subtree"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Has,Many,Options"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Data Type:"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Icon"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp editor/rename_dialog.cpp
+msgid "Style"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Font"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Color"
+msgstr ""
+
+#: editor/plugins/theme_editor_plugin.cpp
+msgid "Theme File"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Erase Selection"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Fix Invalid Tiles"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Cut Selection"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Paint TileMap"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Line Draw"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Rectangle Paint"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Bucket Fill"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Erase TileMap"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Find Tile"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Transpose"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Disable Autotile"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Enable Priority"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Filter tiles"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Give a TileSet resource to this TileMap to use its tiles."
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Paint Tile"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid ""
+"Shift+LMB: Line Draw\n"
+"Shift+Command+LMB: Rectangle Paint"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid ""
+"Shift+LMB: Line Draw\n"
+"Shift+Ctrl+LMB: Rectangle Paint"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Pick Tile"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Rotate Left"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Rotate Right"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Flip Horizontally"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Flip Vertically"
+msgstr ""
+
+#: editor/plugins/tile_map_editor_plugin.cpp
+msgid "Clear Transform"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Add Texture(s) to TileSet."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Remove selected Texture from TileSet."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Create from Scene"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Merge from Scene"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "New Single Tile"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "New Autotile"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "New Atlas"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Next Coordinate"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Select the next shape, subtile, or Tile."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Previous Coordinate"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Select the previous shape, subtile, or Tile."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Region"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Collision"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Occlusion"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Navigation"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Bitmask"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Priority"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Z Index"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Region Mode"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Collision Mode"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Occlusion Mode"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Navigation Mode"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Bitmask Mode"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Priority Mode"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Icon Mode"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Z Index Mode"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Copy bitmask."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Paste bitmask."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Erase bitmask."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Create a new rectangle."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "New Rectangle"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Create a new polygon."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "New Polygon"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Delete Selected Shape"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Keep polygon inside region Rect."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Enable snap and show grid (configurable via the Inspector)."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Display Tile Names (Hold Alt Key)"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid ""
+"Add or select a texture on the left panel to edit the tiles bound to it."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Remove selected texture? This will remove all tiles which use it."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "You haven't selected a texture to remove."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Create from scene? This will overwrite all current tiles."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Merge from scene?"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Remove Texture"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "%s file(s) were not added because was already on the list."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid ""
+"Drag handles to edit Rect.\n"
+"Click on another Tile to edit it."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Delete selected Rect."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid ""
+"Select current edited sub-tile.\n"
+"Click on another Tile to edit it."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Delete polygon."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid ""
+"LMB: Set bit on.\n"
+"RMB: Set bit off.\n"
+"Shift+LMB: Set wildcard bit.\n"
+"Click on another Tile to edit it."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid ""
+"Select sub-tile to use as icon, this will be also used on invalid autotile "
+"bindings.\n"
+"Click on another Tile to edit it."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid ""
+"Select sub-tile to change its priority.\n"
+"Click on another Tile to edit it."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid ""
+"Select sub-tile to change its z index.\n"
+"Click on another Tile to edit it."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Set Tile Region"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Create Tile"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Set Tile Icon"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Edit Tile Bitmask"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Edit Collision Polygon"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Edit Occlusion Polygon"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Edit Navigation Polygon"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Paste Tile Bitmask"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Clear Tile Bitmask"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Make Polygon Concave"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Make Polygon Convex"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Remove Tile"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Remove Collision Polygon"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Remove Occlusion Polygon"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Remove Navigation Polygon"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Edit Tile Priority"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Edit Tile Z Index"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Make Convex"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Make Concave"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Create Collision Polygon"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "Create Occlusion Polygon"
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "This property can't be changed."
+msgstr ""
+
+#: editor/plugins/tile_set_editor_plugin.cpp
+msgid "TileSet"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "No VCS addons are available."
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Error"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "No files added to stage"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Commit"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "VCS Addon is not initialized"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Version Control System"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Initialize"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Staging area"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Detect new changes"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Changes"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Modified"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Renamed"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Deleted"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Typechange"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Stage Selected"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Stage All"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Commit Changes"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+#: modules/gdnative/gdnative_library_singleton_editor.cpp
+msgid "Status"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "View file diffs before committing them to the latest version"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "No file diff is active"
+msgstr ""
+
+#: editor/plugins/version_control_editor_plugin.cpp
+msgid "Detect changes in file diff"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "(GLES3 only)"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Add Output"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Scalar"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Vector"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Boolean"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Sampler"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Add input port"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Add output port"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Change input port type"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Change output port type"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Change input port name"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Change output port name"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Remove input port"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Remove output port"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Set expression"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Resize VisualShader node"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Set Uniform Name"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Set Input Default Port"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Add Node to Visual Shader"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Node(s) Moved"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Duplicate Nodes"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Paste Nodes"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Delete Nodes"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Visual Shader Input Type Changed"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "UniformRef Name Changed"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Vertex"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Fragment"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Light"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Show resulted shader code."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Create Shader Node"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Color function."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Color operator."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Grayscale function."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Converts HSV vector to RGB equivalent."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Converts RGB vector to HSV equivalent."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Sepia function."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Burn operator."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Darken operator."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Difference operator."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Dodge operator."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "HardLight operator."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Lighten operator."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Overlay operator."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Screen operator."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "SoftLight operator."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Color constant."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Color uniform."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the boolean result of the %s comparison between two parameters."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Equal (==)"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Greater Than (>)"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Greater Than or Equal (>=)"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Returns an associated vector if the provided scalars are equal, greater or "
+"less."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Returns the boolean result of the comparison between INF and a scalar "
+"parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Returns the boolean result of the comparison between NaN and a scalar "
+"parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Less Than (<)"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Less Than or Equal (<=)"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Not Equal (!=)"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Returns an associated vector if the provided boolean value is true or false."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Returns an associated scalar if the provided boolean value is true or false."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the boolean result of the comparison between two parameters."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Returns the boolean result of the comparison between INF (or NaN) and a "
+"scalar parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Boolean constant."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Boolean uniform."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "'%s' input parameter for all shader modes."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Input parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "'%s' input parameter for vertex and fragment shader modes."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "'%s' input parameter for fragment and light shader modes."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "'%s' input parameter for fragment shader mode."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "'%s' input parameter for light shader mode."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "'%s' input parameter for vertex shader mode."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "'%s' input parameter for vertex and fragment shader mode."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Scalar function."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Scalar operator."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "E constant (2.718282). Represents the base of the natural logarithm."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Epsilon constant (0.00001). Smallest possible scalar number."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Phi constant (1.618034). Golden ratio."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Pi/4 constant (0.785398) or 45 degrees."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Pi/2 constant (1.570796) or 90 degrees."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Pi constant (3.141593) or 180 degrees."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Tau constant (6.283185) or 360 degrees."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Sqrt2 constant (1.414214). Square root of 2."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the absolute value of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the arc-cosine of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the inverse hyperbolic cosine of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the arc-sine of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the inverse hyperbolic sine of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the arc-tangent of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the arc-tangent of the parameters."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the inverse hyperbolic tangent of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Finds the nearest integer that is greater than or equal to the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Constrains a value to lie between two further values."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the cosine of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the hyperbolic cosine of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Converts a quantity in radians to degrees."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Base-e Exponential."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Base-2 Exponential."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Finds the nearest integer less than or equal to the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Computes the fractional part of the argument."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the inverse of the square root of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Natural logarithm."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Base-2 logarithm."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the greater of two values."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the lesser of two values."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Linear interpolation between two scalars."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the opposite value of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "1.0 - scalar"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Returns the value of the first parameter raised to the power of the second."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Converts a quantity in degrees to radians."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "1.0 / scalar"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Finds the nearest integer to the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Finds the nearest even integer to the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Clamps the value between 0.0 and 1.0."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Extracts the sign of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the sine of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the hyperbolic sine of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the square root of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"SmoothStep function( scalar(edge0), scalar(edge1), scalar(x) ).\n"
+"\n"
+"Returns 0.0 if 'x' is smaller than 'edge0' and 1.0 if x is larger than "
+"'edge1'. Otherwise the return value is interpolated between 0.0 and 1.0 "
+"using Hermite polynomials."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Step function( scalar(edge), scalar(x) ).\n"
+"\n"
+"Returns 0.0 if 'x' is smaller than 'edge' and otherwise 1.0."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the tangent of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the hyperbolic tangent of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Finds the truncated value of the parameter."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Adds scalar to scalar."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Divides scalar by scalar."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Multiplies scalar by scalar."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the remainder of the two scalars."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Subtracts scalar from scalar."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Scalar constant."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Scalar uniform."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Perform the cubic texture lookup."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Perform the texture lookup."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Cubic texture uniform lookup."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "2D texture uniform lookup."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "2D texture uniform lookup with triplanar."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Transform function."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Calculate the outer product of a pair of vectors.\n"
+"\n"
+"OuterProduct treats the first parameter 'c' as a column vector (matrix with "
+"one column) and the second parameter 'r' as a row vector (matrix with one "
+"row) and does a linear algebraic matrix multiply 'c * r', yielding a matrix "
+"whose number of rows is the number of components in 'c' and whose number of "
+"columns is the number of components in 'r'."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Composes transform from four vectors."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Decomposes transform to four vectors."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Calculates the determinant of a transform."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Calculates the inverse of a transform."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Calculates the transpose of a transform."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Multiplies transform by transform."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Multiplies vector by transform."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Transform constant."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Transform uniform."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Vector function."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Vector operator."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Composes vector from three scalars."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Decomposes vector to three scalars."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Calculates the cross product of two vectors."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the distance between two points."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Calculates the dot product of two vectors."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Returns the vector that points in the same direction as a reference vector. "
+"The function has three vector parameters : N, the vector to orient, I, the "
+"incident vector, and Nref, the reference vector. If the dot product of I and "
+"Nref is smaller than zero the return value is N. Otherwise -N is returned."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Calculates the length of a vector."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Linear interpolation between two vectors."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Linear interpolation between two vectors using scalar."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Calculates the normalize product of vector."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "1.0 - vector"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "1.0 / vector"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Returns the vector that points in the direction of reflection ( a : incident "
+"vector, b : normal vector )."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the vector that points in the direction of refraction."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"SmoothStep function( vector(edge0), vector(edge1), vector(x) ).\n"
+"\n"
+"Returns 0.0 if 'x' is smaller than 'edge0' and 1.0 if 'x' is larger than "
+"'edge1'. Otherwise the return value is interpolated between 0.0 and 1.0 "
+"using Hermite polynomials."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"SmoothStep function( scalar(edge0), scalar(edge1), vector(x) ).\n"
+"\n"
+"Returns 0.0 if 'x' is smaller than 'edge0' and 1.0 if 'x' is larger than "
+"'edge1'. Otherwise the return value is interpolated between 0.0 and 1.0 "
+"using Hermite polynomials."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Step function( vector(edge), vector(x) ).\n"
+"\n"
+"Returns 0.0 if 'x' is smaller than 'edge' and otherwise 1.0."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Step function( scalar(edge), vector(x) ).\n"
+"\n"
+"Returns 0.0 if 'x' is smaller than 'edge' and otherwise 1.0."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Adds vector to vector."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Divides vector by vector."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Multiplies vector by vector."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Returns the remainder of the two vectors."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Subtracts vector from vector."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Vector constant."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Vector uniform."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Custom Godot Shader Language expression, with custom amount of input and "
+"output ports. This is a direct injection of code into the vertex/fragment/"
+"light function, do not use it to write the function declarations inside."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Returns falloff based on the dot product of surface normal and view "
+"direction of camera (pass associated inputs to it)."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"Custom Godot Shader Language expression, which is placed on top of the "
+"resulted shader. You can place various function definitions inside and call "
+"it later in the Expressions. You can also declare varyings, uniforms and "
+"constants."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "A reference to an existing uniform."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "(Fragment/Light mode only) Scalar derivative function."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "(Fragment/Light mode only) Vector derivative function."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"(Fragment/Light mode only) (Vector) Derivative in 'x' using local "
+"differencing."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"(Fragment/Light mode only) (Scalar) Derivative in 'x' using local "
+"differencing."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"(Fragment/Light mode only) (Vector) Derivative in 'y' using local "
+"differencing."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"(Fragment/Light mode only) (Scalar) Derivative in 'y' using local "
+"differencing."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"(Fragment/Light mode only) (Vector) Sum of absolute derivative in 'x' and "
+"'y'."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid ""
+"(Fragment/Light mode only) (Scalar) Sum of absolute derivative in 'x' and "
+"'y'."
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "VisualShader"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Edit Visual Property"
+msgstr ""
+
+#: editor/plugins/visual_shader_editor_plugin.cpp
+msgid "Visual Shader Mode Changed"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Runnable"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Delete preset '%s'?"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid ""
+"Failed to export the project for platform '%s'.\n"
+"Export templates seem to be missing or invalid."
+msgstr ""
+
+#: editor/project_export.cpp
+msgid ""
+"Failed to export the project for platform '%s'.\n"
+"This might be due to a configuration issue in the export preset or your "
+"export settings."
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Release"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Exporting All"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "The given export path doesn't exist:"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Export templates for this platform are missing/corrupted:"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Presets"
+msgstr ""
+
+#: editor/project_export.cpp editor/project_settings_editor.cpp
+msgid "Add..."
+msgstr ""
+
+#: editor/project_export.cpp
+msgid ""
+"If checked, the preset will be available for use in one-click deploy.\n"
+"Only one preset per platform may be marked as runnable."
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Export Path"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Resources"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Export all resources in the project"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Export selected scenes (and dependencies)"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Export selected resources (and dependencies)"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Export Mode:"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Resources to export:"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid ""
+"Filters to export non-resource files/folders\n"
+"(comma-separated, e.g: *.json, *.txt, docs/*)"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid ""
+"Filters to exclude files/folders from project\n"
+"(comma-separated, e.g: *.json, *.txt, docs/*)"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Features"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Custom (comma-separated):"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Feature List:"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Script"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Script Export Mode:"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Text"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Compiled"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Encrypted (Provide Key Below)"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Invalid Encryption Key (must be 64 characters long)"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Script Encryption Key (256-bits as hex):"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Export PCK/Zip"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Export Project"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Export mode?"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Export All"
+msgstr ""
+
+#: editor/project_export.cpp editor/project_manager.cpp
+msgid "ZIP File"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Godot Game Pack"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Export templates for this platform are missing:"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Manage Export Templates"
+msgstr ""
+
+#: editor/project_export.cpp
+msgid "Export With Debug"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "The path specified doesn't exist."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Error opening package file (it's not in ZIP format)."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"Invalid \".zip\" project file; it doesn't contain a \"project.godot\" file."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Please choose an empty folder."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Please choose a \"project.godot\" or \".zip\" file."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "This directory already contains a Godot project."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "New Game Project"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Imported Project"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Invalid Project Name."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Couldn't create folder."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "There is already a folder in this path with the specified name."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "It would be a good idea to name your project."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Invalid project path (changed anything?)."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"Couldn't load project.godot in project path (error %d). It may be missing or "
+"corrupted."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Couldn't edit project.godot in project path."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Couldn't create project.godot in project path."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Rename Project"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Import Existing Project"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Import & Edit"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Create New Project"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Create & Edit"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Install Project:"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Install & Edit"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Project Name:"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Project Path:"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Project Installation Path:"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Renderer:"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "OpenGL ES 3.0"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Not supported by your GPU drivers."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"Higher visual quality\n"
+"All features available\n"
+"Incompatible with older hardware\n"
+"Not recommended for web games"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "OpenGL ES 2.0"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"Lower visual quality\n"
+"Some features not available\n"
+"Works on most hardware\n"
+"Recommended for web games"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Renderer can be changed later, but scenes may need to be adjusted."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Unnamed Project"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Missing Project"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Error: Project is missing on the filesystem."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Can't open project at '%s'."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Are you sure to open more than one project?"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"The following project settings file does not specify the version of Godot "
+"through which it was created.\n"
+"\n"
+"%s\n"
+"\n"
+"If you proceed with opening it, it will be converted to Godot's current "
+"configuration file format.\n"
+"Warning: You won't be able to open the project with previous versions of the "
+"engine anymore."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"The following project settings file was generated by an older engine "
+"version, and needs to be converted for this version:\n"
+"\n"
+"%s\n"
+"\n"
+"Do you want to convert it?\n"
+"Warning: You won't be able to open the project with previous versions of the "
+"engine anymore."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"The project settings were created by a newer engine version, whose settings "
+"are not compatible with this version."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"Can't run project: no main scene defined.\n"
+"Please edit the project and set the main scene in the Project Settings under "
+"the \"Application\" category."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"Can't run project: Assets need to be imported.\n"
+"Please edit the project to trigger the initial import."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Are you sure to run %d projects at once?"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"Remove %d projects from the list?\n"
+"The project folders' contents won't be modified."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"Remove this project from the list?\n"
+"The project folder's contents won't be modified."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"Remove all missing projects from the list?\n"
+"The project folders' contents won't be modified."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"Language changed.\n"
+"The interface will update after restarting the editor or project manager."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"Are you sure to scan %s folders for existing Godot projects?\n"
+"This could take a while."
+msgstr ""
+
+#. TRANSLATORS: This refers to the application where users manage their Godot projects.
+#: editor/project_manager.cpp
+msgid "Project Manager"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Projects"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Loading, please wait..."
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Last Modified"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Scan"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Select a Folder to Scan"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "New Project"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Remove Missing"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Templates"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Restart Now"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid "Can't run project"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"You currently don't have any projects.\n"
+"Would you like to explore official example projects in the Asset Library?"
+msgstr ""
+
+#: editor/project_manager.cpp
+msgid ""
+"The search box filters projects by name and last path component.\n"
+"To filter projects by name and full path, the query must contain at least "
+"one `/` character."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Key "
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Joy Button"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Joy Axis"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Mouse Button"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid ""
+"Invalid action name. it cannot be empty nor contain '/', ':', '=', '\\' or "
+"'\"'"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "An action with the name '%s' already exists."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Rename Input Action Event"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Change Action deadzone"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Add Input Action Event"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "All Devices"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Device"
+msgstr ""
+
+#: editor/project_settings_editor.cpp editor/settings_config_dialog.cpp
+msgid "Press a Key..."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Mouse Button Index:"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Left Button"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Right Button"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Middle Button"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Wheel Up Button"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Wheel Down Button"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Wheel Left Button"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Wheel Right Button"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "X Button 1"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "X Button 2"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Joypad Axis Index:"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Axis"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Joypad Button Index:"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Erase Input Action"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Erase Input Action Event"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Add Event"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Button"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Left Button."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Right Button."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Middle Button."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Wheel Up."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Wheel Down."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Add Global Property"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Select a setting item first!"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "No property '%s' exists."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Setting '%s' is internal, and it can't be deleted."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Delete Item"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid ""
+"Invalid action name. It cannot be empty nor contain '/', ':', '=', '\\' or "
+"'\"'."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Add Input Action"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Error saving settings."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Settings saved OK."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Moved Input Action Event"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Override for Feature"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Add Translation"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Remove Translation"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Add Remapped Path"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Resource Remap Add Remap"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Change Resource Remap Language"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Remove Resource Remap"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Remove Resource Remap Option"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Changed Locale Filter"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Changed Locale Filter Mode"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Project Settings (project.godot)"
+msgstr ""
+
+#: editor/project_settings_editor.cpp editor/settings_config_dialog.cpp
+msgid "General"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Override For..."
+msgstr ""
+
+#: editor/project_settings_editor.cpp editor/settings_config_dialog.cpp
+msgid "The editor must be restarted for changes to take effect."
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Input Map"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Action:"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Action"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Deadzone"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Device:"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Index:"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Localization"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Translations"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Translations:"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Remaps"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Resources:"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Remaps by Locale:"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Locale"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Locales Filter"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Show All Locales"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Show Selected Locales Only"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Filter mode:"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Locales:"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "AutoLoad"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Plugins"
+msgstr ""
+
+#: editor/project_settings_editor.cpp
+msgid "Import Defaults"
+msgstr ""
+
+#: editor/property_editor.cpp
+msgid "Preset..."
+msgstr ""
+
+#: editor/property_editor.cpp
+msgid "Zero"
+msgstr ""
+
+#: editor/property_editor.cpp
+msgid "Easing In-Out"
+msgstr ""
+
+#: editor/property_editor.cpp
+msgid "Easing Out-In"
+msgstr ""
+
+#: editor/property_editor.cpp
+msgid "File..."
+msgstr ""
+
+#: editor/property_editor.cpp
+msgid "Dir..."
+msgstr ""
+
+#: editor/property_editor.cpp
+msgid "Assign"
+msgstr ""
+
+#: editor/property_editor.cpp
+msgid "Select Node"
+msgstr ""
+
+#: editor/property_editor.cpp
+msgid "Error loading file: Not a resource!"
+msgstr ""
+
+#: editor/property_editor.cpp
+msgid "Pick a Node"
+msgstr ""
+
+#: editor/property_editor.cpp
+msgid "Bit %d, val %d."
+msgstr ""
+
+#: editor/property_selector.cpp
+msgid "Select Property"
+msgstr ""
+
+#: editor/property_selector.cpp
+msgid "Select Virtual Method"
+msgstr ""
+
+#: editor/property_selector.cpp
+msgid "Select Method"
+msgstr ""
+
+#: editor/rename_dialog.cpp editor/scene_tree_dock.cpp
+msgid "Batch Rename"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Replace:"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Prefix:"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Suffix:"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Use Regular Expressions"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Advanced Options"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Substitute"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Node name"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Node's parent name, if available"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Node type"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Current scene name"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Root node name"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid ""
+"Sequential integer counter.\n"
+"Compare counter options."
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Per-level Counter"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "If set, the counter restarts for each group of child nodes."
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Initial value for the counter"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Step"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Amount by which counter is incremented for each node"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Padding"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid ""
+"Minimum number of digits for the counter.\n"
+"Missing digits are padded with leading zeros."
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Post-Process"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Keep"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "PascalCase to snake_case"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "snake_case to PascalCase"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Case"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "To Lowercase"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "To Uppercase"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Reset"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "Regular Expression Error:"
+msgstr ""
+
+#: editor/rename_dialog.cpp
+msgid "At character %s"
+msgstr ""
+
+#: editor/reparent_dialog.cpp editor/scene_tree_dock.cpp
+msgid "Reparent Node"
+msgstr ""
+
+#: editor/reparent_dialog.cpp
+msgid "Reparent Location (Select new Parent):"
+msgstr ""
+
+#: editor/reparent_dialog.cpp
+msgid "Keep Global Transform"
+msgstr ""
+
+#: editor/reparent_dialog.cpp editor/scene_tree_dock.cpp
+msgid "Reparent"
+msgstr ""
+
+#: editor/run_settings_dialog.cpp
+msgid "Run Mode:"
+msgstr ""
+
+#: editor/run_settings_dialog.cpp
+msgid "Current Scene"
+msgstr ""
+
+#: editor/run_settings_dialog.cpp
+msgid "Main Scene"
+msgstr ""
+
+#: editor/run_settings_dialog.cpp
+msgid "Main Scene Arguments:"
+msgstr ""
+
+#: editor/run_settings_dialog.cpp
+msgid "Scene Run Settings"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "No parent to instance the scenes at."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Error loading scene from %s"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid ""
+"Cannot instance the scene '%s' because the current scene exists within one "
+"of its nodes."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Instance Scene(s)"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Replace with Branch Scene"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Instance Child Scene"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Can't paste root node into the same scene."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Paste Node(s)"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Detach Script"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "This operation can't be done on the tree root."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Move Node In Parent"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Move Nodes In Parent"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Duplicate Node(s)"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Can't reparent nodes in inherited scenes, order of nodes can't change."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Node must belong to the edited scene to become root."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Instantiated scenes can't become root"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Make node as Root"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Delete %d nodes and any children?"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Delete %d nodes?"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Delete the root node \"%s\"?"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Delete node \"%s\" and its children?"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Delete node \"%s\"?"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Can not perform with the root node."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "This operation can't be done on instanced scenes."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Save New Scene As..."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid ""
+"Disabling \"editable_instance\" will cause all properties of the node to be "
+"reverted to their default."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid ""
+"Enabling \"Load As Placeholder\" will disable \"Editable Children\" and "
+"cause all properties of the node to be reverted to their default."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Make Local"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "New Scene Root"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Create Root Node:"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "2D Scene"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "3D Scene"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "User Interface"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Other Node"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Can't operate on nodes from a foreign scene!"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Can't operate on nodes the current scene inherits from!"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Attach Script"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Cut Node(s)"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Remove Node(s)"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Change type of node(s)"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid ""
+"Couldn't save new scene. Likely dependencies (instances) couldn't be "
+"satisfied."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Error saving scene."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Error duplicating scene to save it."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Sub-Resources"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Clear Inheritance"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Editable Children"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Load As Placeholder"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Open Documentation"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid ""
+"Cannot attach a script: there are no languages registered.\n"
+"This is probably because this editor was built with all language modules "
+"disabled."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Add Child Node"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Expand/Collapse All"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Change Type"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Reparent to New Node"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Make Scene Root"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Merge From Scene"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp editor/script_editor_debugger.cpp
+msgid "Save Branch as Scene"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp editor/script_editor_debugger.cpp
+msgid "Copy Node Path"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Delete (No Confirm)"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Add/Create a New Node."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid ""
+"Instance a scene file as a Node. Creates an inherited scene if no root node "
+"exists."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Attach a new or existing script to the selected node."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Detach the script from the selected node."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Remote"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Local"
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
+msgid "Clear Inheritance? (No Undo!)"
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid "Toggle Visible"
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid "Unlock Node"
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid "Button Group"
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid "(Connecting From)"
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid "Node configuration warning:"
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid ""
+"Node has %s connection(s) and %s group(s).\n"
+"Click to show signals dock."
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid ""
+"Node has %s connection(s).\n"
+"Click to show signals dock."
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid ""
+"Node is in %s group(s).\n"
+"Click to show groups dock."
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid "Open Script:"
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid ""
+"Node is locked.\n"
+"Click to unlock it."
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid ""
+"Children are not selectable.\n"
+"Click to make selectable."
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid "Toggle Visibility"
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid ""
+"AnimationPlayer is pinned.\n"
+"Click to unpin."
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid "Invalid node name, the following characters are not allowed:"
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid "Rename Node"
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid "Scene Tree (Nodes):"
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid "Node Configuration Warning!"
+msgstr ""
+
+#: editor/scene_tree_editor.cpp
+msgid "Select a Node"
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Path is empty."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Filename is empty."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Path is not local."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Invalid base path."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "A directory with the same name exists."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "File does not exist."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Invalid extension."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Wrong extension chosen."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Error loading template '%s'"
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Error - Could not create script in filesystem."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Error loading script from %s"
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Overrides"
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "N/A"
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Open Script / Choose Location"
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Open Script"
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "File exists, it will be reused."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Invalid path."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Invalid class name."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Invalid inherited parent name or path."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Script path/name is valid."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Allowed: a-z, A-Z, 0-9, _ and ."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Built-in script (into scene file)."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Will create a new script file."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Will load an existing script file."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Script file already exists."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid ""
+"Note: Built-in scripts have some limitations and can't be edited using an "
+"external editor."
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Class Name:"
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Template:"
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Built-in Script:"
+msgstr ""
+
+#: editor/script_create_dialog.cpp
+msgid "Attach Node Script"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Remote "
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Bytes:"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Warning:"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Error:"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "C++ Error"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "C++ Error:"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "C++ Source"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Source:"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "C++ Source:"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Stack Trace"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Errors"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Child process connected."
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Copy Error"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Video RAM"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Skip Breakpoints"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Inspect Previous Instance"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Inspect Next Instance"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Stack Frames"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Profiler"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Network Profiler"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Monitor"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Value"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Monitors"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Pick one or more items from the list to display the graph."
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "List of Video Memory Usage by Resource:"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Total:"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Export list to a CSV file"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Resource Path"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Type"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Format"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Usage"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Misc"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Clicked Control:"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Clicked Control Type:"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Live Edit Root:"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Set From Tree"
+msgstr ""
+
+#: editor/script_editor_debugger.cpp
+msgid "Export measures as CSV"
+msgstr ""
+
+#: editor/settings_config_dialog.cpp
+msgid "Erase Shortcut"
+msgstr ""
+
+#: editor/settings_config_dialog.cpp
+msgid "Restore Shortcut"
+msgstr ""
+
+#: editor/settings_config_dialog.cpp
+msgid "Change Shortcut"
+msgstr ""
+
+#: editor/settings_config_dialog.cpp
+msgid "Editor Settings"
+msgstr ""
+
+#: editor/settings_config_dialog.cpp
+msgid "Shortcuts"
+msgstr ""
+
+#: editor/settings_config_dialog.cpp
+msgid "Binding"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp
+msgid "Change Light Radius"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp
+msgid "Change AudioStreamPlayer3D Emission Angle"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp
+msgid "Change Camera FOV"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp
+msgid "Change Camera Size"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp
+msgid "Change Notifier AABB"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp
+msgid "Change Particles AABB"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp
+msgid "Change Probe Extents"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp modules/csg/csg_gizmos.cpp
+msgid "Change Sphere Shape Radius"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp modules/csg/csg_gizmos.cpp
+msgid "Change Box Shape Extents"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp
+msgid "Change Capsule Shape Radius"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp
+msgid "Change Capsule Shape Height"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp
+msgid "Change Cylinder Shape Radius"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp
+msgid "Change Cylinder Shape Height"
+msgstr ""
+
+#: editor/spatial_editor_gizmos.cpp
+msgid "Change Ray Shape Length"
+msgstr ""
+
+#: modules/csg/csg_gizmos.cpp
+msgid "Change Cylinder Radius"
+msgstr ""
+
+#: modules/csg/csg_gizmos.cpp
+msgid "Change Cylinder Height"
+msgstr ""
+
+#: modules/csg/csg_gizmos.cpp
+msgid "Change Torus Inner Radius"
+msgstr ""
+
+#: modules/csg/csg_gizmos.cpp
+msgid "Change Torus Outer Radius"
+msgstr ""
+
+#: modules/gdnative/gdnative_library_editor_plugin.cpp
+msgid "Select the dynamic library for this entry"
+msgstr ""
+
+#: modules/gdnative/gdnative_library_editor_plugin.cpp
+msgid "Select dependencies of the library for this entry"
+msgstr ""
+
+#: modules/gdnative/gdnative_library_editor_plugin.cpp
+msgid "Remove current entry"
+msgstr ""
+
+#: modules/gdnative/gdnative_library_editor_plugin.cpp
+msgid "Double click to create a new entry"
+msgstr ""
+
+#: modules/gdnative/gdnative_library_editor_plugin.cpp
+msgid "Platform:"
+msgstr ""
+
+#: modules/gdnative/gdnative_library_editor_plugin.cpp
+msgid "Platform"
+msgstr ""
+
+#: modules/gdnative/gdnative_library_editor_plugin.cpp
+msgid "Dynamic Library"
+msgstr ""
+
+#: modules/gdnative/gdnative_library_editor_plugin.cpp
+msgid "Add an architecture entry"
+msgstr ""
+
+#: modules/gdnative/gdnative_library_editor_plugin.cpp
+msgid "GDNativeLibrary"
+msgstr ""
+
+#: modules/gdnative/gdnative_library_singleton_editor.cpp
+msgid "Enabled GDNative Singleton"
+msgstr ""
+
+#: modules/gdnative/gdnative_library_singleton_editor.cpp
+msgid "Disabled GDNative Singleton"
+msgstr ""
+
+#: modules/gdnative/gdnative_library_singleton_editor.cpp
+msgid "Library"
+msgstr ""
+
+#: modules/gdnative/gdnative_library_singleton_editor.cpp
+msgid "Libraries: "
+msgstr ""
+
+#: modules/gdnative/register_types.cpp
+msgid "GDNative"
+msgstr ""
+
+#: modules/gdscript/gdscript_functions.cpp
+msgid "Step argument is zero!"
+msgstr ""
+
+#: modules/gdscript/gdscript_functions.cpp
+msgid "Not a script with an instance"
+msgstr ""
+
+#: modules/gdscript/gdscript_functions.cpp
+msgid "Not based on a script"
+msgstr ""
+
+#: modules/gdscript/gdscript_functions.cpp
+msgid "Not based on a resource file"
+msgstr ""
+
+#: modules/gdscript/gdscript_functions.cpp
+msgid "Invalid instance dictionary format (missing @path)"
+msgstr ""
+
+#: modules/gdscript/gdscript_functions.cpp
+msgid "Invalid instance dictionary format (can't load script at @path)"
+msgstr ""
+
+#: modules/gdscript/gdscript_functions.cpp
+msgid "Invalid instance dictionary format (invalid script at @path)"
+msgstr ""
+
+#: modules/gdscript/gdscript_functions.cpp
+msgid "Invalid instance dictionary (invalid subclasses)"
+msgstr ""
+
+#: modules/gdscript/gdscript_functions.cpp
+msgid "Object can't provide a length."
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Next Plane"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Previous Plane"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Plane:"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Next Floor"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Previous Floor"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Floor:"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "GridMap Delete Selection"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "GridMap Fill Selection"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "GridMap Paste Selection"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "GridMap Paint"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Grid Map"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Snap View"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Clip Disabled"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Clip Above"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Clip Below"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Edit X Axis"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Edit Y Axis"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Edit Z Axis"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Cursor Rotate X"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Cursor Rotate Y"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Cursor Rotate Z"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Cursor Back Rotate X"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Cursor Back Rotate Y"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Cursor Back Rotate Z"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Cursor Clear Rotation"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Paste Selects"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Clear Selection"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Fill Selection"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "GridMap Settings"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Pick Distance:"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Filter meshes"
+msgstr ""
+
+#: modules/gridmap/grid_map_editor_plugin.cpp
+msgid "Give a MeshLibrary resource to this GridMap to use its meshes."
+msgstr ""
+
+#: modules/lightmapper_cpu/lightmapper_cpu.cpp
+msgid "Begin Bake"
+msgstr ""
+
+#: modules/lightmapper_cpu/lightmapper_cpu.cpp
+msgid "Preparing data structures"
+msgstr ""
+
+#: modules/lightmapper_cpu/lightmapper_cpu.cpp
+msgid "Generate buffers"
+msgstr ""
+
+#: modules/lightmapper_cpu/lightmapper_cpu.cpp
+msgid "Direct lighting"
+msgstr ""
+
+#: modules/lightmapper_cpu/lightmapper_cpu.cpp
+msgid "Indirect lighting"
+msgstr ""
+
+#: modules/lightmapper_cpu/lightmapper_cpu.cpp
+msgid "Post processing"
+msgstr ""
+
+#: modules/lightmapper_cpu/lightmapper_cpu.cpp
+msgid "Plotting lightmaps"
+msgstr ""
+
+#: modules/mono/csharp_script.cpp
+msgid "Class name can't be a reserved keyword"
+msgstr ""
+
+#: modules/mono/mono_gd/gd_mono_utils.cpp
+msgid "End of inner exception stack trace"
+msgstr ""
+
+#: modules/recast/navigation_mesh_editor_plugin.cpp
+msgid "Bake NavMesh"
+msgstr ""
+
+#: modules/recast/navigation_mesh_editor_plugin.cpp
+msgid "Clear the navigation mesh."
+msgstr ""
+
+#: modules/recast/navigation_mesh_generator.cpp
+msgid "Setting up Configuration..."
+msgstr ""
+
+#: modules/recast/navigation_mesh_generator.cpp
+msgid "Calculating grid size..."
+msgstr ""
+
+#: modules/recast/navigation_mesh_generator.cpp
+msgid "Creating heightfield..."
+msgstr ""
+
+#: modules/recast/navigation_mesh_generator.cpp
+msgid "Marking walkable triangles..."
+msgstr ""
+
+#: modules/recast/navigation_mesh_generator.cpp
+msgid "Constructing compact heightfield..."
+msgstr ""
+
+#: modules/recast/navigation_mesh_generator.cpp
+msgid "Eroding walkable area..."
+msgstr ""
+
+#: modules/recast/navigation_mesh_generator.cpp
+msgid "Partitioning..."
+msgstr ""
+
+#: modules/recast/navigation_mesh_generator.cpp
+msgid "Creating contours..."
+msgstr ""
+
+#: modules/recast/navigation_mesh_generator.cpp
+msgid "Creating polymesh..."
+msgstr ""
+
+#: modules/recast/navigation_mesh_generator.cpp
+msgid "Converting to native navigation mesh..."
+msgstr ""
+
+#: modules/recast/navigation_mesh_generator.cpp
+msgid "Navigation Mesh Generator Setup:"
+msgstr ""
+
+#: modules/recast/navigation_mesh_generator.cpp
+msgid "Parsing Geometry..."
+msgstr ""
+
+#: modules/recast/navigation_mesh_generator.cpp
+msgid "Done!"
+msgstr ""
+
+#: modules/visual_script/visual_script.cpp
+msgid ""
+"A node yielded without working memory, please read the docs on how to yield "
+"properly!"
+msgstr ""
+
+#: modules/visual_script/visual_script.cpp
+msgid ""
+"Node yielded, but did not return a function state in the first working "
+"memory."
+msgstr ""
+
+#: modules/visual_script/visual_script.cpp
+msgid ""
+"Return value must be assigned to first element of node working memory! Fix "
+"your node please."
+msgstr ""
+
+#: modules/visual_script/visual_script.cpp
+msgid "Node returned an invalid sequence output: "
+msgstr ""
+
+#: modules/visual_script/visual_script.cpp
+msgid "Found sequence bit but not the node in the stack, report bug!"
+msgstr ""
+
+#: modules/visual_script/visual_script.cpp
+msgid "Stack overflow with stack depth: "
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Change Signal Arguments"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Change Argument Type"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Change Argument name"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Set Variable Default Value"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Set Variable Type"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Add Input Port"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Add Output Port"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Override an existing built-in function."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Create a new function."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Variables:"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Create a new variable."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Signals:"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Create a new signal."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Name is not a valid identifier:"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Name already in use by another func/var/signal:"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Rename Function"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Rename Variable"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Rename Signal"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Add Function"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Delete input port"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Add Variable"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Add Signal"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Remove Input Port"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Remove Output Port"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Change Expression"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Remove VisualScript Nodes"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Duplicate VisualScript Nodes"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Hold %s to drop a Getter. Hold Shift to drop a generic signature."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Hold Ctrl to drop a Getter. Hold Shift to drop a generic signature."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Hold %s to drop a simple reference to the node."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Hold Ctrl to drop a simple reference to the node."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Hold %s to drop a Variable Setter."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Hold Ctrl to drop a Variable Setter."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Add Preload Node"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Add Node(s) From Tree"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid ""
+"Can't drop properties because script '%s' is not used in this scene.\n"
+"Drop holding 'Shift' to just copy the signature."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Add Getter Property"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Add Setter Property"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Change Base Type"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Move Node(s)"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Remove VisualScript Node"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Connect Nodes"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Disconnect Nodes"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Connect Node Data"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Connect Node Sequence"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Script already has function '%s'"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Change Input Value"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Resize Comment"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Can't copy the function node."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Clipboard is empty!"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Paste VisualScript Nodes"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Can't create function with a function node."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Can't create function of nodes from nodes of multiple functions."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Select at least one node with sequence port."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Try to only have one sequence input in selection."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Create Function"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Remove Function"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Remove Variable"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Editing Variable:"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Remove Signal"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Editing Signal:"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Make Tool:"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Members:"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Change Base Type:"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Add Nodes..."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Add Function..."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "function_name"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Select or create a function to edit its graph."
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Delete Selected"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Find Node Type"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Copy Nodes"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Cut Nodes"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Make Function"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Refresh Graph"
+msgstr ""
+
+#: modules/visual_script/visual_script_editor.cpp
+msgid "Edit Member"
+msgstr ""
+
+#: modules/visual_script/visual_script_flow_control.cpp
+msgid "Input type not iterable: "
+msgstr ""
+
+#: modules/visual_script/visual_script_flow_control.cpp
+msgid "Iterator became invalid"
+msgstr ""
+
+#: modules/visual_script/visual_script_flow_control.cpp
+msgid "Iterator became invalid: "
+msgstr ""
+
+#: modules/visual_script/visual_script_func_nodes.cpp
+msgid "Invalid index property name."
+msgstr ""
+
+#: modules/visual_script/visual_script_func_nodes.cpp
+msgid "Base object is not a Node!"
+msgstr ""
+
+#: modules/visual_script/visual_script_func_nodes.cpp
+msgid "Path does not lead Node!"
+msgstr ""
+
+#: modules/visual_script/visual_script_func_nodes.cpp
+msgid "Invalid index property name '%s' in node %s."
+msgstr ""
+
+#: modules/visual_script/visual_script_nodes.cpp
+msgid ": Invalid argument of type: "
+msgstr ""
+
+#: modules/visual_script/visual_script_nodes.cpp
+msgid ": Invalid arguments: "
+msgstr ""
+
+#: modules/visual_script/visual_script_nodes.cpp
+msgid "VariableGet not found in script: "
+msgstr ""
+
+#: modules/visual_script/visual_script_nodes.cpp
+msgid "VariableSet not found in script: "
+msgstr ""
+
+#: modules/visual_script/visual_script_nodes.cpp
+msgid "Custom node has no _step() method, can't process graph."
+msgstr ""
+
+#: modules/visual_script/visual_script_nodes.cpp
+msgid ""
+"Invalid return value from _step(), must be integer (seq out), or string "
+"(error)."
+msgstr ""
+
+#: modules/visual_script/visual_script_property_selector.cpp
+msgid "Search VisualScript"
+msgstr ""
+
+#: modules/visual_script/visual_script_property_selector.cpp
+msgid "Get %s"
+msgstr ""
+
+#: modules/visual_script/visual_script_property_selector.cpp
+msgid "Set %s"
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Package name is missing."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Package segments must be of non-zero length."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "The character '%s' is not allowed in Android application package names."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "A digit cannot be the first character in a package segment."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "The character '%s' cannot be the first character in a package segment."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "The package must have at least one '.' separator."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Select device from the list"
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Unable to find the 'apksigner' tool."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid ""
+"Android build template not installed in the project. Install it from the "
+"Project menu."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Debug keystore not configured in the Editor Settings nor in the preset."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Release keystore incorrectly configured in the export preset."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "A valid Android SDK path is required in Editor Settings."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Invalid Android SDK path in Editor Settings."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Missing 'platform-tools' directory!"
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Unable to find Android SDK platform-tools' adb command."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Please check in the Android SDK directory specified in Editor Settings."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Missing 'build-tools' directory!"
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Unable to find Android SDK build-tools' apksigner command."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Invalid public key for APK expansion."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Invalid package name:"
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid ""
+"Invalid \"GodotPaymentV3\" module included in the \"android/modules\" "
+"project setting (changed in Godot 3.2.2).\n"
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "\"Use Custom Build\" must be enabled to use the plugins."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid ""
+"\"Degrees Of Freedom\" is only valid when \"Xr Mode\" is \"Oculus Mobile VR"
+"\"."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid ""
+"\"Hand Tracking\" is only valid when \"Xr Mode\" is \"Oculus Mobile VR\"."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid ""
+"\"Focus Awareness\" is only valid when \"Xr Mode\" is \"Oculus Mobile VR\"."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "\"Export AAB\" is only valid when \"Use Custom Build\" is enabled."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Invalid filename! Android App Bundle requires the *.aab extension."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "APK Expansion not compatible with Android App Bundle."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Invalid filename! Android APK requires the *.apk extension."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid ""
+"Trying to build from a custom built template, but no version info for it "
+"exists. Please reinstall from the 'Project' menu."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid ""
+"Android build version mismatch:\n"
+"   Template installed: %s\n"
+"   Godot Version: %s\n"
+"Please reinstall Android build template from 'Project' menu."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Building Android Project (gradle)"
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid ""
+"Building of Android project failed, check output for the error.\n"
+"Alternatively visit docs.godotengine.org for Android build documentation."
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid "Moving output"
+msgstr ""
+
+#: platform/android/export/export.cpp
+msgid ""
+"Unable to copy and rename export file, check gradle project directory for "
+"outputs."
+msgstr ""
+
+#: platform/iphone/export/export.cpp
+msgid "Identifier is missing."
+msgstr ""
+
+#: platform/iphone/export/export.cpp
+msgid "The character '%s' is not allowed in Identifier."
+msgstr ""
+
+#: platform/iphone/export/export.cpp
+msgid "App Store Team ID not specified - cannot configure the project."
+msgstr ""
+
+#: platform/iphone/export/export.cpp
+msgid "Invalid Identifier:"
+msgstr ""
+
+#: platform/iphone/export/export.cpp
+msgid "Required icon is not specified in the preset."
+msgstr ""
+
+#: platform/javascript/export/export.cpp
+msgid "Stop HTTP Server"
+msgstr ""
+
+#: platform/javascript/export/export.cpp
+msgid "Run in Browser"
+msgstr ""
+
+#: platform/javascript/export/export.cpp
+msgid "Run exported HTML in the system's default browser."
+msgstr ""
+
+#: platform/javascript/export/export.cpp
+msgid "Could not write file:"
+msgstr ""
+
+#: platform/javascript/export/export.cpp
+msgid "Could not open template for export:"
+msgstr ""
+
+#: platform/javascript/export/export.cpp
+msgid "Invalid export template:"
+msgstr ""
+
+#: platform/javascript/export/export.cpp
+msgid "Could not read custom HTML shell:"
+msgstr ""
+
+#: platform/javascript/export/export.cpp
+msgid "Could not read boot splash image file:"
+msgstr ""
+
+#: platform/javascript/export/export.cpp
+msgid "Using default boot splash image."
+msgstr ""
+
+#: platform/uwp/export/export.cpp
+msgid "Invalid package short name."
+msgstr ""
+
+#: platform/uwp/export/export.cpp
+msgid "Invalid package unique name."
+msgstr ""
+
+#: platform/uwp/export/export.cpp
+msgid "Invalid package publisher display name."
+msgstr ""
+
+#: platform/uwp/export/export.cpp
+msgid "Invalid product GUID."
+msgstr ""
+
+#: platform/uwp/export/export.cpp
+msgid "Invalid publisher GUID."
+msgstr ""
+
+#: platform/uwp/export/export.cpp
+msgid "Invalid background color."
+msgstr ""
+
+#: platform/uwp/export/export.cpp
+msgid "Invalid Store Logo image dimensions (should be 50x50)."
+msgstr ""
+
+#: platform/uwp/export/export.cpp
+msgid "Invalid square 44x44 logo image dimensions (should be 44x44)."
+msgstr ""
+
+#: platform/uwp/export/export.cpp
+msgid "Invalid square 71x71 logo image dimensions (should be 71x71)."
+msgstr ""
+
+#: platform/uwp/export/export.cpp
+msgid "Invalid square 150x150 logo image dimensions (should be 150x150)."
+msgstr ""
+
+#: platform/uwp/export/export.cpp
+msgid "Invalid square 310x310 logo image dimensions (should be 310x310)."
+msgstr ""
+
+#: platform/uwp/export/export.cpp
+msgid "Invalid wide 310x150 logo image dimensions (should be 310x150)."
+msgstr ""
+
+#: platform/uwp/export/export.cpp
+msgid "Invalid splash screen image dimensions (should be 620x300)."
+msgstr ""
+
+#: scene/2d/animated_sprite.cpp
+msgid ""
+"A SpriteFrames resource must be created or set in the \"Frames\" property in "
+"order for AnimatedSprite to display frames."
+msgstr ""
+
+#: scene/2d/canvas_modulate.cpp
+msgid ""
+"Only one visible CanvasModulate is allowed per scene (or set of instanced "
+"scenes). The first created one will work, while the rest will be ignored."
+msgstr ""
+
+#: scene/2d/collision_object_2d.cpp
+msgid ""
+"This node has no shape, so it can't collide or interact with other objects.\n"
+"Consider adding a CollisionShape2D or CollisionPolygon2D as a child to "
+"define its shape."
+msgstr ""
+
+#: scene/2d/collision_polygon_2d.cpp
+msgid ""
+"CollisionPolygon2D only serves to provide a collision shape to a "
+"CollisionObject2D derived node. Please only use it as a child of Area2D, "
+"StaticBody2D, RigidBody2D, KinematicBody2D, etc. to give them a shape."
+msgstr ""
+
+#: scene/2d/collision_polygon_2d.cpp
+msgid "An empty CollisionPolygon2D has no effect on collision."
+msgstr ""
+
+#: scene/2d/collision_polygon_2d.cpp
+msgid "Invalid polygon. At least 3 points are needed in 'Solids' build mode."
+msgstr ""
+
+#: scene/2d/collision_polygon_2d.cpp
+msgid "Invalid polygon. At least 2 points are needed in 'Segments' build mode."
+msgstr ""
+
+#: scene/2d/collision_shape_2d.cpp
+msgid ""
+"CollisionShape2D only serves to provide a collision shape to a "
+"CollisionObject2D derived node. Please only use it as a child of Area2D, "
+"StaticBody2D, RigidBody2D, KinematicBody2D, etc. to give them a shape."
+msgstr ""
+
+#: scene/2d/collision_shape_2d.cpp
+msgid ""
+"A shape must be provided for CollisionShape2D to function. Please create a "
+"shape resource for it!"
+msgstr ""
+
+#: scene/2d/collision_shape_2d.cpp
+msgid ""
+"Polygon-based shapes are not meant be used nor edited directly through the "
+"CollisionShape2D node. Please use the CollisionPolygon2D node instead."
+msgstr ""
+
+#: scene/2d/cpu_particles_2d.cpp
+msgid ""
+"CPUParticles2D animation requires the usage of a CanvasItemMaterial with "
+"\"Particles Animation\" enabled."
+msgstr ""
+
+#: scene/2d/joints_2d.cpp
+msgid "Node A and Node B must be PhysicsBody2Ds"
+msgstr ""
+
+#: scene/2d/joints_2d.cpp
+msgid "Node A must be a PhysicsBody2D"
+msgstr ""
+
+#: scene/2d/joints_2d.cpp
+msgid "Node B must be a PhysicsBody2D"
+msgstr ""
+
+#: scene/2d/joints_2d.cpp
+msgid "Joint is not connected to two PhysicsBody2Ds"
+msgstr ""
+
+#: scene/2d/joints_2d.cpp
+msgid "Node A and Node B must be different PhysicsBody2Ds"
+msgstr ""
+
+#: scene/2d/light_2d.cpp
+msgid ""
+"A texture with the shape of the light must be supplied to the \"Texture\" "
+"property."
+msgstr ""
+
+#: scene/2d/light_occluder_2d.cpp
+msgid ""
+"An occluder polygon must be set (or drawn) for this occluder to take effect."
+msgstr ""
+
+#: scene/2d/light_occluder_2d.cpp
+msgid "The occluder polygon for this occluder is empty. Please draw a polygon."
+msgstr ""
+
+#: scene/2d/navigation_polygon.cpp
+msgid ""
+"A NavigationPolygon resource must be set or created for this node to work. "
+"Please set a property or draw a polygon."
+msgstr ""
+
+#: scene/2d/navigation_polygon.cpp
+msgid ""
+"NavigationPolygonInstance must be a child or grandchild to a Navigation2D "
+"node. It only provides navigation data."
+msgstr ""
+
+#: scene/2d/parallax_layer.cpp
+msgid ""
+"ParallaxLayer node only works when set as child of a ParallaxBackground node."
+msgstr ""
+
+#: scene/2d/particles_2d.cpp
+msgid ""
+"GPU-based particles are not supported by the GLES2 video driver.\n"
+"Use the CPUParticles2D node instead. You can use the \"Convert to "
+"CPUParticles\" option for this purpose."
+msgstr ""
+
+#: scene/2d/particles_2d.cpp scene/3d/particles.cpp
+msgid ""
+"A material to process the particles is not assigned, so no behavior is "
+"imprinted."
+msgstr ""
+
+#: scene/2d/particles_2d.cpp
+msgid ""
+"Particles2D animation requires the usage of a CanvasItemMaterial with "
+"\"Particles Animation\" enabled."
+msgstr ""
+
+#: scene/2d/path_2d.cpp
+msgid "PathFollow2D only works when set as a child of a Path2D node."
+msgstr ""
+
+#: scene/2d/physics_body_2d.cpp
+msgid ""
+"Size changes to RigidBody2D (in character or rigid modes) will be overridden "
+"by the physics engine when running.\n"
+"Change the size in children collision shapes instead."
+msgstr ""
+
+#: scene/2d/remote_transform_2d.cpp
+msgid "Path property must point to a valid Node2D node to work."
+msgstr ""
+
+#: scene/2d/skeleton_2d.cpp
+msgid "This Bone2D chain should end at a Skeleton2D node."
+msgstr ""
+
+#: scene/2d/skeleton_2d.cpp
+msgid "A Bone2D only works with a Skeleton2D or another Bone2D as parent node."
+msgstr ""
+
+#: scene/2d/skeleton_2d.cpp
+msgid ""
+"This bone lacks a proper REST pose. Go to the Skeleton2D node and set one."
+msgstr ""
+
+#: scene/2d/tile_map.cpp
+msgid ""
+"TileMap with Use Parent on needs a parent CollisionObject2D to give shapes "
+"to. Please use it as a child of Area2D, StaticBody2D, RigidBody2D, "
+"KinematicBody2D, etc. to give them a shape."
+msgstr ""
+
+#: scene/2d/visibility_notifier_2d.cpp
+msgid ""
+"VisibilityEnabler2D works best when used with the edited scene root directly "
+"as parent."
+msgstr ""
+
+#: scene/3d/arvr_nodes.cpp
+msgid "ARVRCamera must have an ARVROrigin node as its parent."
+msgstr ""
+
+#: scene/3d/arvr_nodes.cpp
+msgid "ARVRController must have an ARVROrigin node as its parent."
+msgstr ""
+
+#: scene/3d/arvr_nodes.cpp
+msgid ""
+"The controller ID must not be 0 or this controller won't be bound to an "
+"actual controller."
+msgstr ""
+
+#: scene/3d/arvr_nodes.cpp
+msgid "ARVRAnchor must have an ARVROrigin node as its parent."
+msgstr ""
+
+#: scene/3d/arvr_nodes.cpp
+msgid ""
+"The anchor ID must not be 0 or this anchor won't be bound to an actual "
+"anchor."
+msgstr ""
+
+#: scene/3d/arvr_nodes.cpp
+msgid "ARVROrigin requires an ARVRCamera child node."
+msgstr ""
+
+#: scene/3d/baked_lightmap.cpp
+msgid "Finding meshes and lights"
+msgstr ""
+
+#: scene/3d/baked_lightmap.cpp
+msgid "Preparing geometry (%d/%d)"
+msgstr ""
+
+#: scene/3d/baked_lightmap.cpp
+msgid "Preparing environment"
+msgstr ""
+
+#: scene/3d/baked_lightmap.cpp
+msgid "Generating capture"
+msgstr ""
+
+#: scene/3d/baked_lightmap.cpp
+msgid "Saving lightmaps"
+msgstr ""
+
+#: scene/3d/baked_lightmap.cpp
+msgid "Done"
+msgstr ""
+
+#: scene/3d/collision_object.cpp
+msgid ""
+"This node has no shape, so it can't collide or interact with other objects.\n"
+"Consider adding a CollisionShape or CollisionPolygon as a child to define "
+"its shape."
+msgstr ""
+
+#: scene/3d/collision_polygon.cpp
+msgid ""
+"CollisionPolygon only serves to provide a collision shape to a "
+"CollisionObject derived node. Please only use it as a child of Area, "
+"StaticBody, RigidBody, KinematicBody, etc. to give them a shape."
+msgstr ""
+
+#: scene/3d/collision_polygon.cpp
+msgid "An empty CollisionPolygon has no effect on collision."
+msgstr ""
+
+#: scene/3d/collision_shape.cpp
+msgid ""
+"CollisionShape only serves to provide a collision shape to a CollisionObject "
+"derived node. Please only use it as a child of Area, StaticBody, RigidBody, "
+"KinematicBody, etc. to give them a shape."
+msgstr ""
+
+#: scene/3d/collision_shape.cpp
+msgid ""
+"A shape must be provided for CollisionShape to function. Please create a "
+"shape resource for it."
+msgstr ""
+
+#: scene/3d/collision_shape.cpp
+msgid ""
+"Plane shapes don't work well and will be removed in future versions. Please "
+"don't use them."
+msgstr ""
+
+#: scene/3d/collision_shape.cpp
+msgid ""
+"ConcavePolygonShape doesn't support RigidBody in another mode than static."
+msgstr ""
+
+#: scene/3d/cpu_particles.cpp
+msgid "Nothing is visible because no mesh has been assigned."
+msgstr ""
+
+#: scene/3d/cpu_particles.cpp
+msgid ""
+"CPUParticles animation requires the usage of a SpatialMaterial whose "
+"Billboard Mode is set to \"Particle Billboard\"."
+msgstr ""
+
+#: scene/3d/gi_probe.cpp
+msgid "Plotting Meshes"
+msgstr ""
+
+#: scene/3d/gi_probe.cpp
+msgid "Finishing Plot"
+msgstr ""
+
+#: scene/3d/gi_probe.cpp
+msgid ""
+"GIProbes are not supported by the GLES2 video driver.\n"
+"Use a BakedLightmap instead."
+msgstr ""
+
+#: scene/3d/light.cpp
+msgid "A SpotLight with an angle wider than 90 degrees cannot cast shadows."
+msgstr ""
+
+#: scene/3d/navigation_mesh.cpp
+msgid "A NavigationMesh resource must be set or created for this node to work."
+msgstr ""
+
+#: scene/3d/navigation_mesh.cpp
+msgid ""
+"NavigationMeshInstance must be a child or grandchild to a Navigation node. "
+"It only provides navigation data."
+msgstr ""
+
+#: scene/3d/particles.cpp
+msgid ""
+"GPU-based particles are not supported by the GLES2 video driver.\n"
+"Use the CPUParticles node instead. You can use the \"Convert to CPUParticles"
+"\" option for this purpose."
+msgstr ""
+
+#: scene/3d/particles.cpp
+msgid ""
+"Nothing is visible because meshes have not been assigned to draw passes."
+msgstr ""
+
+#: scene/3d/particles.cpp
+msgid ""
+"Particles animation requires the usage of a SpatialMaterial whose Billboard "
+"Mode is set to \"Particle Billboard\"."
+msgstr ""
+
+#: scene/3d/path.cpp
+msgid "PathFollow only works when set as a child of a Path node."
+msgstr ""
+
+#: scene/3d/path.cpp
+msgid ""
+"PathFollow's ROTATION_ORIENTED requires \"Up Vector\" to be enabled in its "
+"parent Path's Curve resource."
+msgstr ""
+
+#: scene/3d/physics_body.cpp
+msgid ""
+"Size changes to RigidBody (in character or rigid modes) will be overridden "
+"by the physics engine when running.\n"
+"Change the size in children collision shapes instead."
+msgstr ""
+
+#: scene/3d/physics_joint.cpp
+msgid "Node A and Node B must be PhysicsBodies"
+msgstr ""
+
+#: scene/3d/physics_joint.cpp
+msgid "Node A must be a PhysicsBody"
+msgstr ""
+
+#: scene/3d/physics_joint.cpp
+msgid "Node B must be a PhysicsBody"
+msgstr ""
+
+#: scene/3d/physics_joint.cpp
+msgid "Joint is not connected to any PhysicsBodies"
+msgstr ""
+
+#: scene/3d/physics_joint.cpp
+msgid "Node A and Node B must be different PhysicsBodies"
+msgstr ""
+
+#: scene/3d/remote_transform.cpp
+msgid ""
+"The \"Remote Path\" property must point to a valid Spatial or Spatial-"
+"derived node to work."
+msgstr ""
+
+#: scene/3d/soft_body.cpp
+msgid "This body will be ignored until you set a mesh."
+msgstr ""
+
+#: scene/3d/soft_body.cpp
+msgid ""
+"Size changes to SoftBody will be overridden by the physics engine when "
+"running.\n"
+"Change the size in children collision shapes instead."
+msgstr ""
+
+#: scene/3d/sprite_3d.cpp
+msgid ""
+"A SpriteFrames resource must be created or set in the \"Frames\" property in "
+"order for AnimatedSprite3D to display frames."
+msgstr ""
+
+#: scene/3d/vehicle_body.cpp
+msgid ""
+"VehicleWheel serves to provide a wheel system to a VehicleBody. Please use "
+"it as a child of a VehicleBody."
+msgstr ""
+
+#: scene/3d/world_environment.cpp
+msgid ""
+"WorldEnvironment requires its \"Environment\" property to contain an "
+"Environment to have a visible effect."
+msgstr ""
+
+#: scene/3d/world_environment.cpp
+msgid ""
+"Only one WorldEnvironment is allowed per scene (or set of instanced scenes)."
+msgstr ""
+
+#: scene/3d/world_environment.cpp
+msgid ""
+"This WorldEnvironment is ignored. Either add a Camera (for 3D scenes) or set "
+"this environment's Background Mode to Canvas (for 2D scenes)."
+msgstr ""
+
+#: scene/animation/animation_blend_tree.cpp
+msgid "On BlendTree node '%s', animation not found: '%s'"
+msgstr ""
+
+#: scene/animation/animation_blend_tree.cpp
+msgid "Animation not found: '%s'"
+msgstr ""
+
+#: scene/animation/animation_tree.cpp
+msgid "In node '%s', invalid animation: '%s'."
+msgstr ""
+
+#: scene/animation/animation_tree.cpp
+msgid "Invalid animation: '%s'."
+msgstr ""
+
+#: scene/animation/animation_tree.cpp
+msgid "Nothing connected to input '%s' of node '%s'."
+msgstr ""
+
+#: scene/animation/animation_tree.cpp
+msgid "No root AnimationNode for the graph is set."
+msgstr ""
+
+#: scene/animation/animation_tree.cpp
+msgid "Path to an AnimationPlayer node containing animations is not set."
+msgstr ""
+
+#: scene/animation/animation_tree.cpp
+msgid "Path set for AnimationPlayer does not lead to an AnimationPlayer node."
+msgstr ""
+
+#: scene/animation/animation_tree.cpp
+msgid "The AnimationPlayer root node is not a valid node."
+msgstr ""
+
+#: scene/animation/animation_tree_player.cpp
+msgid "This node has been deprecated. Use AnimationTree instead."
+msgstr ""
+
+#: scene/gui/color_picker.cpp
+msgid ""
+"Color: #%s\n"
+"LMB: Set color\n"
+"RMB: Remove preset"
+msgstr ""
+
+#: scene/gui/color_picker.cpp
+msgid "Pick a color from the editor window."
+msgstr ""
+
+#: scene/gui/color_picker.cpp
+msgid "HSV"
+msgstr ""
+
+#: scene/gui/color_picker.cpp
+msgid "Raw"
+msgstr ""
+
+#: scene/gui/color_picker.cpp
+msgid "Switch between hexadecimal and code values."
+msgstr ""
+
+#: scene/gui/color_picker.cpp
+msgid "Add current color as a preset."
+msgstr ""
+
+#: scene/gui/container.cpp
+msgid ""
+"Container by itself serves no purpose unless a script configures its "
+"children placement behavior.\n"
+"If you don't intend to add a script, use a plain Control node instead."
+msgstr ""
+
+#: scene/gui/control.cpp
+msgid ""
+"The Hint Tooltip won't be displayed as the control's Mouse Filter is set to "
+"\"Ignore\". To solve this, set the Mouse Filter to \"Stop\" or \"Pass\"."
+msgstr ""
+
+#: scene/gui/dialogs.cpp
+msgid "Alert!"
+msgstr ""
+
+#: scene/gui/dialogs.cpp
+msgid "Please Confirm..."
+msgstr ""
+
+#: scene/gui/file_dialog.cpp
+msgid "Must use a valid extension."
+msgstr ""
+
+#: scene/gui/graph_edit.cpp
+msgid "Enable grid minimap."
+msgstr ""
+
+#: scene/gui/popup.cpp
+msgid ""
+"Popups will hide by default unless you call popup() or any of the popup*() "
+"functions. Making them visible for editing is fine, but they will hide upon "
+"running."
+msgstr ""
+
+#: scene/gui/range.cpp
+msgid "If \"Exp Edit\" is enabled, \"Min Value\" must be greater than 0."
+msgstr ""
+
+#: scene/gui/scroll_container.cpp
+msgid ""
+"ScrollContainer is intended to work with a single child control.\n"
+"Use a container as child (VBox, HBox, etc.), or a Control and set the custom "
+"minimum size manually."
+msgstr ""
+
+#: scene/gui/tree.cpp
+msgid "(Other)"
+msgstr ""
+
+#: scene/main/scene_tree.cpp
+msgid ""
+"Default Environment as specified in Project Settings (Rendering -> "
+"Environment -> Default Environment) could not be loaded."
+msgstr ""
+
+#: scene/main/viewport.cpp
+msgid ""
+"This viewport is not set as render target. If you intend for it to display "
+"its contents directly to the screen, make it a child of a Control so it can "
+"obtain a size. Otherwise, make it a RenderTarget and assign its internal "
+"texture to some node for display."
+msgstr ""
+
+#: scene/main/viewport.cpp
+msgid "Viewport size must be greater than 0 to render anything."
+msgstr ""
+
+#: scene/resources/visual_shader_nodes.cpp
+msgid ""
+"The sampler port is connected but not used. Consider changing the source to "
+"'SamplerPort'."
+msgstr ""
+
+#: scene/resources/visual_shader_nodes.cpp
+msgid "Invalid source for preview."
+msgstr ""
+
+#: scene/resources/visual_shader_nodes.cpp
+msgid "Invalid source for shader."
+msgstr ""
+
+#: scene/resources/visual_shader_nodes.cpp
+msgid "Invalid comparison function for that type."
+msgstr ""
+
+#: servers/visual/shader_language.cpp
+msgid "Assignment to function."
+msgstr ""
+
+#: servers/visual/shader_language.cpp
+msgid "Assignment to uniform."
+msgstr ""
+
+#: servers/visual/shader_language.cpp
+msgid "Varyings can only be assigned in vertex function."
+msgstr ""
+
+#: servers/visual/shader_language.cpp
+msgid "Constants cannot be modified."
+msgstr ""
diff --git a/editor/translations/ko.po b/editor/translations/ko.po
index 0fcbd51720..9770daf14a 100644
--- a/editor/translations/ko.po
+++ b/editor/translations/ko.po
@@ -26,8 +26,8 @@ msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
 "POT-Creation-Date: \n"
-"PO-Revision-Date: 2021-04-05 14:28+0000\n"
-"Last-Translator: Henry LeRoux <henry.leroux@ocsbstudent.ca>\n"
+"PO-Revision-Date: 2021-04-11 22:02+0000\n"
+"Last-Translator: Myeongjin Lee <aranet100@gmail.com>\n"
 "Language-Team: Korean <https://hosted.weblate.org/projects/godot-engine/"
 "godot/ko/>\n"
 "Language: ko\n"
@@ -4071,7 +4071,7 @@ msgstr "기본값으로 재설정"
 
 #: editor/import_dock.cpp
 msgid "Keep File (No Import)"
-msgstr ""
+msgstr "파일 유지 (가져오기 없음)"
 
 #: editor/import_dock.cpp
 msgid "%d Files"
@@ -6555,7 +6555,7 @@ msgstr "폴리곤 변형"
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
 msgid "Paint Bone Weights"
-msgstr "본 가중치 칠"
+msgstr "본 가중치 칠하기"
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
 msgid "Open Polygon 2D UV editor."
@@ -6963,7 +6963,7 @@ msgstr "프로시저 단위 실행"
 
 #: editor/plugins/script_editor_plugin.cpp editor/script_editor_debugger.cpp
 msgid "Step Over"
-msgstr "한 단계식 코드 실행"
+msgstr "한 단계씩 코드 실행"
 
 #: editor/plugins/script_editor_plugin.cpp editor/script_editor_debugger.cpp
 msgid "Break"
@@ -6984,11 +6984,11 @@ msgstr "외부 편집기로 디버깅"
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Open Godot online documentation."
-msgstr "Godot 온라인 문서를 열."
+msgstr "Godot 온라인 문서를 엽니다."
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Search the reference documentation."
-msgstr "참조 문서 검색."
+msgstr "참조 문서를 검색합니다."
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Go to previous edited document."
@@ -7077,11 +7077,11 @@ msgstr "대소문자 변환"
 
 #: editor/plugins/script_text_editor.cpp editor/plugins/text_editor.cpp
 msgid "Uppercase"
-msgstr "대문자로 바꾸기"
+msgstr "대문자로"
 
 #: editor/plugins/script_text_editor.cpp editor/plugins/text_editor.cpp
 msgid "Lowercase"
-msgstr "소문자로 바꾸기"
+msgstr "소문자로"
 
 #: editor/plugins/script_text_editor.cpp editor/plugins/text_editor.cpp
 msgid "Capitalize"
@@ -7149,7 +7149,7 @@ msgstr "아래로 복제"
 
 #: editor/plugins/script_text_editor.cpp
 msgid "Complete Symbol"
-msgstr "자동 완성"
+msgstr "상징 자동 완성"
 
 #: editor/plugins/script_text_editor.cpp
 msgid "Evaluate Selection"
@@ -7157,7 +7157,7 @@ msgstr "선택 항목 평가"
 
 #: editor/plugins/script_text_editor.cpp
 msgid "Trim Trailing Whitespace"
-msgstr "후행 공백 문자 삭제"
+msgstr "후행 공백 문자 제거"
 
 #: editor/plugins/script_text_editor.cpp
 msgid "Convert Indent to Spaces"
@@ -7478,27 +7478,27 @@ msgstr "GLES2 렌더러에서 사용할 수 없습니다."
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Freelook Left"
-msgstr "자유 시점 왼쪽으로 가기"
+msgstr "자유 시점 왼쪽으로"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Freelook Right"
-msgstr "자유 시점 오른쪽으로 가기"
+msgstr "자유 시점 오른쪽으로"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Freelook Forward"
-msgstr "자유 시점 앞으로 가기"
+msgstr "자유 시점 앞으로"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Freelook Backwards"
-msgstr "자유 시점 뒤로 가기"
+msgstr "자유 시점 뒤로"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Freelook Up"
-msgstr "자유 시점 위로 가기"
+msgstr "자유 시점 위로"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Freelook Down"
-msgstr "자유 시점 아래로 가기"
+msgstr "자유 시점 아래로"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Freelook Speed Modifier"
@@ -7514,6 +7514,11 @@ msgstr "뷰 회전 잠김"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -7623,27 +7628,27 @@ msgstr "변형 대화 상자..."
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "1 Viewport"
-msgstr "1개 뷰포트"
+msgstr "뷰포트 1개"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "2 Viewports"
-msgstr "2개 뷰포트"
+msgstr "뷰포트 2개"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "2 Viewports (Alt)"
-msgstr "2개 뷰포트 (다른 방식)"
+msgstr "뷰포트 2개 (다른 방식)"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "3 Viewports"
-msgstr "3개 뷰포트"
+msgstr "뷰포트 3개"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "3 Viewports (Alt)"
-msgstr "3개 뷰포트 (다른 방식)"
+msgstr "뷰포트 3개 (다른 방식)"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "4 Viewports"
-msgstr "4개 뷰포트"
+msgstr "뷰포트 4개"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Gizmos"
@@ -8147,7 +8152,7 @@ msgstr "선택 항목 잘라내기"
 
 #: editor/plugins/tile_map_editor_plugin.cpp
 msgid "Paint TileMap"
-msgstr "타일맵 칠"
+msgstr "타일맵 칠하기"
 
 #: editor/plugins/tile_map_editor_plugin.cpp
 msgid "Line Draw"
@@ -8159,7 +8164,7 @@ msgstr "사각 영역 칠하기"
 
 #: editor/plugins/tile_map_editor_plugin.cpp
 msgid "Bucket Fill"
-msgstr "채우기"
+msgstr "버킷 채우기"
 
 #: editor/plugins/tile_map_editor_plugin.cpp
 msgid "Erase TileMap"
@@ -8171,7 +8176,7 @@ msgstr "타일 찾기"
 
 #: editor/plugins/tile_map_editor_plugin.cpp
 msgid "Transpose"
-msgstr "바꾸기"
+msgstr "행렬 맞바꾸기"
 
 #: editor/plugins/tile_map_editor_plugin.cpp
 msgid "Disable Autotile"
@@ -8199,15 +8204,15 @@ msgid ""
 "Shift+Command+LMB: Rectangle Paint"
 msgstr ""
 "Shift+좌클릭: 선 그리기\n"
-"Shift+Command+좌클릭: 사각 영역 페인트"
+"Shift+Command+좌클릭: 사각 영역 칠하기"
 
 #: editor/plugins/tile_map_editor_plugin.cpp
 msgid ""
 "Shift+LMB: Line Draw\n"
 "Shift+Ctrl+LMB: Rectangle Paint"
 msgstr ""
-"Shift+우클릭: 선 그리기\n"
-"Shift+Ctrl+우클릭: 사각 영역 페인트"
+"Shift+좌클릭: 선 그리기\n"
+"Shift+Ctrl+좌클릭: 사각 영역 칠하기"
 
 #: editor/plugins/tile_map_editor_plugin.cpp
 msgid "Pick Tile"
@@ -8279,7 +8284,7 @@ msgstr "이전 모양, 하위 타일, 혹은 타일을 선택하세요."
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Region"
-msgstr "지역"
+msgstr "영역"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Collision"
@@ -8307,7 +8312,7 @@ msgstr "Z 인덱스"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Region Mode"
-msgstr "지역 모드"
+msgstr "영역 모드"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Collision Mode"
@@ -8339,19 +8344,19 @@ msgstr "Z 인덱스 모드"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Copy bitmask."
-msgstr "비트 마스크 복사."
+msgstr "비트 마스크를 복사합니다."
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Paste bitmask."
-msgstr "비트 마스크 붙여넣기."
+msgstr "비트 마스크를 붙여넣습니다."
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Erase bitmask."
-msgstr "비트 마스크 지우기."
+msgstr "비트 마스크를 지웁니다."
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Create a new rectangle."
-msgstr "새로운 사각형을 만듭니다."
+msgstr "새로운 사각 영역을 만듭니다."
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "New Rectangle"
@@ -8371,7 +8376,7 @@ msgstr "선택된 모양 삭제"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Keep polygon inside region Rect."
-msgstr "사각형 내부에 폴리곤을 유지."
+msgstr "사각형 영역 내에 폴리곤을 유지합니다."
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Enable snap and show grid (configurable via the Inspector)."
@@ -9164,7 +9169,7 @@ msgstr "매개변수의 쌍곡탄젠트 값을 반환합니다."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Finds the truncated value of the parameter."
-msgstr "매개변수의 절사된 값을 찾아요."
+msgstr "매개변수의 절사된 값을 찾습니다."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Adds scalar to scalar."
@@ -10853,6 +10858,13 @@ msgid "Remote"
 msgstr "원격"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "로컬"
 
diff --git a/editor/translations/lt.po b/editor/translations/lt.po
index 0796f01fbe..e5ca1dd50c 100644
--- a/editor/translations/lt.po
+++ b/editor/translations/lt.po
@@ -7482,6 +7482,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10777,6 +10782,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/lv.po b/editor/translations/lv.po
index d8a665caa6..606c690a55 100644
--- a/editor/translations/lv.po
+++ b/editor/translations/lv.po
@@ -7341,6 +7341,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10599,6 +10604,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/mi.po b/editor/translations/mi.po
index 5198022282..1259cbeed4 100644
--- a/editor/translations/mi.po
+++ b/editor/translations/mi.po
@@ -7261,6 +7261,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10450,6 +10455,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/mk.po b/editor/translations/mk.po
index 0b4e23cccf..25f0c1bedd 100644
--- a/editor/translations/mk.po
+++ b/editor/translations/mk.po
@@ -7268,6 +7268,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10457,6 +10462,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/ml.po b/editor/translations/ml.po
index a445086dd6..2ffb3793b7 100644
--- a/editor/translations/ml.po
+++ b/editor/translations/ml.po
@@ -7277,6 +7277,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10467,6 +10472,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/mr.po b/editor/translations/mr.po
index 00e8ced169..119e1ce931 100644
--- a/editor/translations/mr.po
+++ b/editor/translations/mr.po
@@ -7268,6 +7268,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10458,6 +10463,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/ms.po b/editor/translations/ms.po
index 363f8895a3..127e06c898 100644
--- a/editor/translations/ms.po
+++ b/editor/translations/ms.po
@@ -7605,6 +7605,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10816,6 +10821,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/nb.po b/editor/translations/nb.po
index 172439dc43..398330b3e9 100644
--- a/editor/translations/nb.po
+++ b/editor/translations/nb.po
@@ -7867,6 +7867,11 @@ msgstr "Vis Informasjon"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11266,6 +11271,13 @@ msgid "Remote"
 msgstr "Fjern Funksjon"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/nl.po b/editor/translations/nl.po
index 2716664b7a..e12d8c9324 100644
--- a/editor/translations/nl.po
+++ b/editor/translations/nl.po
@@ -7602,6 +7602,11 @@ msgstr "Beeldrotatie vergrendeld"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11005,6 +11010,13 @@ msgid "Remote"
 msgstr "Remote"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Lokaal"
 
diff --git a/editor/translations/or.po b/editor/translations/or.po
index 5e396315c2..77e9075f6a 100644
--- a/editor/translations/or.po
+++ b/editor/translations/or.po
@@ -7267,6 +7267,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10456,6 +10461,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/pl.po b/editor/translations/pl.po
index 7da98bc87c..122c89f2b6 100644
--- a/editor/translations/pl.po
+++ b/editor/translations/pl.po
@@ -5,7 +5,7 @@
 # 8-bit Pixel <dawdejw@gmail.com>, 2016.
 # Adam Wolanski <adam.wolanski94@gmail.com>, 2017.
 # Adrian Węcławski <weclawskiadrian@gmail.com>, 2016.
-# aelspire <aelspire@gmail.com>, 2017, 2019, 2020.
+# aelspire <aelspire@gmail.com>, 2017, 2019, 2020, 2021.
 # Daniel Lewan <vision360.daniel@gmail.com>, 2016-2018, 2020.
 # Dariusz Król <rexioweb@gmail.com>, 2018.
 # heya10 <igor.gielzak@gmail.com>, 2017.
@@ -26,7 +26,7 @@
 # Zatherz <zatherz@linux.pl>, 2017, 2020.
 # Tomek <kobewi4e@gmail.com>, 2018, 2019, 2020, 2021.
 # Wojcieh Er Zet <wojcieh.rzepecki@gmail.com>, 2018.
-# Dariusz Siek <dariuszynski@gmail.com>, 2018, 2019, 2020.
+# Dariusz Siek <dariuszynski@gmail.com>, 2018, 2019, 2020, 2021.
 # Szymon Nowakowski <smnbdg13@gmail.com>, 2019.
 # Nie Powiem <blazek10@tlen.pl>, 2019.
 # Sebastian Hojka <sibibibi1@gmail.com>, 2019.
@@ -43,15 +43,16 @@
 # Filip Glura <mcmr.slendy@gmail.com>, 2020.
 # Roman Skiba <romanskiba0@gmail.com>, 2020.
 # Piotr Grodzki <ziemniakglados@gmail.com>, 2020.
-# Dzejkop <jakubtrad@gmail.com>, 2020.
+# Dzejkop <jakubtrad@gmail.com>, 2020, 2021.
 # Mateusz Grzonka <alpinus4@gmail.com>, 2020.
 # gnu-ewm <gnu.ewm@protonmail.com>, 2021.
 # vrid <patryksoon@live.com>, 2021.
+# Suchy  Talerz <kacperkubis06@gmail.com>, 2021.
 msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
 "POT-Creation-Date: \n"
-"PO-Revision-Date: 2021-04-01 02:04+0000\n"
+"PO-Revision-Date: 2021-04-19 22:33+0000\n"
 "Last-Translator: Tomek <kobewi4e@gmail.com>\n"
 "Language-Team: Polish <https://hosted.weblate.org/projects/godot-engine/"
 "godot/pl/>\n"
@@ -61,7 +62,7 @@ msgstr ""
 "Content-Transfer-Encoding: 8bit\n"
 "Plural-Forms: nplurals=3; plural=n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 "
 "|| n%100>=20) ? 1 : 2;\n"
-"X-Generator: Weblate 4.6-dev\n"
+"X-Generator: Weblate 4.7-dev\n"
 
 #: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
 #: modules/visual_script/visual_script_builtin_funcs.cpp
@@ -283,7 +284,7 @@ msgstr "Klipy dźwiękowe:"
 
 #: editor/animation_track_editor.cpp
 msgid "Anim Clips:"
-msgstr "Klipy animacji:"
+msgstr "Animacje:"
 
 #: editor/animation_track_editor.cpp
 msgid "Change Track Path"
@@ -291,7 +292,7 @@ msgstr "Zmień adres ścieżki"
 
 #: editor/animation_track_editor.cpp
 msgid "Toggle this track on/off."
-msgstr "Włącz/wyłącz tę ścieżkę."
+msgstr "Włącz/wyłącz ścieżkę."
 
 #: editor/animation_track_editor.cpp
 msgid "Update Mode (How this property is set)"
@@ -1497,7 +1498,7 @@ msgstr "Zmień nazwę Autoload"
 
 #: editor/editor_autoload_settings.cpp
 msgid "Toggle AutoLoad Globals"
-msgstr "Przełącz automatycznie ładowane zmienne globalne"
+msgstr "Globalnie przełącz Autoload"
 
 #: editor/editor_autoload_settings.cpp
 msgid "Move Autoload"
@@ -1794,7 +1795,7 @@ msgstr "Nowy"
 #: editor/editor_feature_profile.cpp editor/editor_node.cpp
 #: editor/project_manager.cpp
 msgid "Import"
-msgstr "Importuj"
+msgstr "Zaimportuj"
 
 #: editor/editor_feature_profile.cpp editor/project_export.cpp
 msgid "Export"
@@ -3008,7 +3009,7 @@ msgstr "Zgłoś błąd"
 
 #: editor/editor_node.cpp
 msgid "Send Docs Feedback"
-msgstr "Wyślij opinię o dokumentacji"
+msgstr "Oceń dokumentację"
 
 #: editor/editor_node.cpp editor/plugins/asset_library_editor_plugin.cpp
 msgid "Community"
@@ -7577,6 +7578,11 @@ msgstr "Obroty widoku zablokowane"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -7865,7 +7871,7 @@ msgstr "Utwórz równorzędny węzeł LightOccluder2D"
 
 #: editor/plugins/sprite_editor_plugin.cpp
 msgid "Sprite"
-msgstr "Sprite"
+msgstr "Postać"
 
 #: editor/plugins/sprite_editor_plugin.cpp
 msgid "Simplification: "
@@ -8074,7 +8080,7 @@ msgstr "Dodaj klasę elementów"
 
 #: editor/plugins/theme_editor_plugin.cpp
 msgid "Remove Class Items"
-msgstr "Usuń klasę elementów"
+msgstr "Usuń elementy klasy"
 
 #: editor/plugins/theme_editor_plugin.cpp
 msgid "Create Empty Template"
@@ -10960,6 +10966,13 @@ msgid "Remote"
 msgstr "Zdalny"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Lokalny"
 
@@ -11566,7 +11579,7 @@ msgstr "Malowanie GridMap"
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "Grid Map"
-msgstr "Grid Map"
+msgstr "Siatka"
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "Snap View"
diff --git a/editor/translations/pr.po b/editor/translations/pr.po
index 6f67b1c1be..24e2c7146a 100644
--- a/editor/translations/pr.po
+++ b/editor/translations/pr.po
@@ -7507,6 +7507,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10815,6 +10820,13 @@ msgid "Remote"
 msgstr "Discharge ye' Signal"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/pt.po b/editor/translations/pt.po
index 6020f0557f..26c28d5a19 100644
--- a/editor/translations/pt.po
+++ b/editor/translations/pt.po
@@ -22,7 +22,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
 "POT-Creation-Date: \n"
-"PO-Revision-Date: 2021-03-10 22:14+0000\n"
+"PO-Revision-Date: 2021-04-20 22:25+0000\n"
 "Last-Translator: João Lopes <linux-man@hotmail.com>\n"
 "Language-Team: Portuguese <https://hosted.weblate.org/projects/godot-engine/"
 "godot/pt/>\n"
@@ -31,7 +31,7 @@ msgstr ""
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
 "Plural-Forms: nplurals=2; plural=n != 1;\n"
-"X-Generator: Weblate 4.5.2-dev\n"
+"X-Generator: Weblate 4.7-dev\n"
 
 #: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
 #: modules/visual_script/visual_script_builtin_funcs.cpp
@@ -3690,6 +3690,8 @@ msgstr ""
 msgid ""
 "Importing has been disabled for this file, so it can't be opened for editing."
 msgstr ""
+"A importação foi desativada para este ficheiro, não podendo ser aberto para "
+"edição."
 
 #: editor/filesystem_dock.cpp
 msgid "Cannot move/rename resources root."
@@ -4093,7 +4095,7 @@ msgstr "Restaurar Predefinições"
 
 #: editor/import_dock.cpp
 msgid "Keep File (No Import)"
-msgstr ""
+msgstr "Manter Ficheiro (Sem Importação)"
 
 #: editor/import_dock.cpp
 msgid "%d Files"
@@ -7547,6 +7549,11 @@ msgstr "Rotação da Vista Bloqueada"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10924,6 +10931,13 @@ msgid "Remote"
 msgstr "Remoto"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Local"
 
diff --git a/editor/translations/pt_BR.po b/editor/translations/pt_BR.po
index 45e2050732..3509d790d8 100644
--- a/editor/translations/pt_BR.po
+++ b/editor/translations/pt_BR.po
@@ -114,12 +114,13 @@
 # Diego dos Reis Macedo <diego_dragon97@hotmail.com>, 2021.
 # Lucas E. <lukas.ed45@gmail.com>, 2021.
 # Gabriel Silveira <gabomfim99@gmail.com>, 2021.
+# Arthur Phillip D. Silva <artphil.dev@gmail.com>, 2021.
 msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
 "POT-Creation-Date: 2016-05-30\n"
-"PO-Revision-Date: 2021-04-05 14:28+0000\n"
-"Last-Translator: Gabriel Silveira <gabomfim99@gmail.com>\n"
+"PO-Revision-Date: 2021-04-11 22:02+0000\n"
+"Last-Translator: Arthur Phillip D. Silva <artphil.dev@gmail.com>\n"
 "Language-Team: Portuguese (Brazil) <https://hosted.weblate.org/projects/"
 "godot-engine/godot/pt_BR/>\n"
 "Language: pt_BR\n"
@@ -7664,6 +7665,11 @@ msgstr "Ver Rotação Bloqueada"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11045,6 +11051,13 @@ msgid "Remote"
 msgstr "Remoto"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Local"
 
diff --git a/editor/translations/ro.po b/editor/translations/ro.po
index c1ee0a6492..5761aadd1d 100644
--- a/editor/translations/ro.po
+++ b/editor/translations/ro.po
@@ -7645,6 +7645,11 @@ msgstr "Curăță Rotația Cursorului"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10980,6 +10985,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/ru.po b/editor/translations/ru.po
index 193b47de8c..b12e95793a 100644
--- a/editor/translations/ru.po
+++ b/editor/translations/ru.po
@@ -92,12 +92,13 @@
 # Igor Grachev <igorecha.9999@gmail.com>, 2020.
 # Dmytro Meleshko <dmytro.meleshko@gmail.com>, 2021.
 # narrnika <narr13niki@gmail.com>, 2021.
+# nec-trou <darya.bilyalova@gmail.com>, 2021.
 msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
 "POT-Creation-Date: \n"
-"PO-Revision-Date: 2021-04-05 14:28+0000\n"
-"Last-Translator: narrnika <narr13niki@gmail.com>\n"
+"PO-Revision-Date: 2021-04-19 22:33+0000\n"
+"Last-Translator: Danil Alexeev <danil@alexeev.xyz>\n"
 "Language-Team: Russian <https://hosted.weblate.org/projects/godot-engine/"
 "godot/ru/>\n"
 "Language: ru\n"
@@ -106,7 +107,7 @@ msgstr ""
 "Content-Transfer-Encoding: 8bit\n"
 "Plural-Forms: nplurals=3; plural=n%10==1 && n%100!=11 ? 0 : n%10>=2 && n"
 "%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2;\n"
-"X-Generator: Weblate 4.6-dev\n"
+"X-Generator: Weblate 4.7-dev\n"
 
 #: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
 #: modules/visual_script/visual_script_builtin_funcs.cpp
@@ -2078,7 +2079,7 @@ msgstr "Свойства"
 
 #: editor/editor_help.cpp
 msgid "override:"
-msgstr "Переопределить:"
+msgstr "переопределено:"
 
 #: editor/editor_help.cpp
 msgid "default:"
@@ -7624,6 +7625,11 @@ msgstr "Блокировать вращение камеры"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11007,6 +11013,13 @@ msgid "Remote"
 msgstr "Удаленный"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Локальный"
 
diff --git a/editor/translations/si.po b/editor/translations/si.po
index 67903c8677..20b9001362 100644
--- a/editor/translations/si.po
+++ b/editor/translations/si.po
@@ -7321,6 +7321,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10541,6 +10546,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/sk.po b/editor/translations/sk.po
index 3bed9c2661..95b0fc7136 100644
--- a/editor/translations/sk.po
+++ b/editor/translations/sk.po
@@ -10,12 +10,13 @@
 # Richard <rgarlik@gmail.com>, 2019.
 # Richard Urban <redasuio1@gmail.com>, 2020.
 # Anonymous <noreply@weblate.org>, 2020.
+# Mario-projects-dev <m.vitek.mv@gmail.com>, 2021.
 msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
 "POT-Creation-Date: \n"
-"PO-Revision-Date: 2020-10-03 15:29+0000\n"
-"Last-Translator: Richard Urban <redasuio1@gmail.com>\n"
+"PO-Revision-Date: 2021-04-11 22:02+0000\n"
+"Last-Translator: Mario-projects-dev <m.vitek.mv@gmail.com>\n"
 "Language-Team: Slovak <https://hosted.weblate.org/projects/godot-engine/"
 "godot/sk/>\n"
 "Language: sk\n"
@@ -23,7 +24,7 @@ msgstr ""
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
 "Plural-Forms: nplurals=3; plural=(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2;\n"
-"X-Generator: Weblate 4.3-dev\n"
+"X-Generator: Weblate 4.6-dev\n"
 
 #: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
 #: modules/visual_script/visual_script_builtin_funcs.cpp
@@ -1027,14 +1028,14 @@ msgid "Owners Of:"
 msgstr "Majitelia:"
 
 #: editor/dependency_editor.cpp
-#, fuzzy
 msgid ""
 "Remove selected files from the project? (no undo)\n"
 "You can find the removed files in the system trash to restore them."
-msgstr "Odstrániť vybraté súbory z projektu? (nedá sa vrátiť späť)"
+msgstr ""
+"Odstrániť vybraté súbory z projektu? (nedá sa vrátiť späť)\n"
+"Odstránené súbory nájdete v systémovom koši, aby ste ich mohli obnoviť."
 
 #: editor/dependency_editor.cpp
-#, fuzzy
 msgid ""
 "The files being removed are required by other resources in order for them to "
 "work.\n"
@@ -1042,7 +1043,8 @@ msgid ""
 "You can find the removed files in the system trash to restore them."
 msgstr ""
 "Súbory ktoré budú odstránené vyžadujú ďalšie zdroje, aby mohli pracovať.\n"
-"Odstrániť aj napriek tomu? (nedá sa vrátiť späť)"
+"Odstrániť aj napriek tomu? (nedá sa vrátiť späť)\n"
+"Odstránené súbory nájdete v systémovom koši, aby ste ich mohli obnoviť."
 
 #: editor/dependency_editor.cpp
 msgid "Cannot remove:"
@@ -1978,7 +1980,7 @@ msgstr "Zdedené používateľom:"
 
 #: editor/editor_help.cpp
 msgid "Description"
-msgstr "Popisok"
+msgstr "Popis"
 
 #: editor/editor_help.cpp
 msgid "Online Tutorials"
@@ -7543,6 +7545,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10868,6 +10875,13 @@ msgid "Remote"
 msgstr "Všetky vybrané"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
@@ -12942,14 +12956,12 @@ msgid "Invalid source for preview."
 msgstr "Neplatný zdroj pre predzobrazenie."
 
 #: scene/resources/visual_shader_nodes.cpp
-#, fuzzy
 msgid "Invalid source for shader."
-msgstr "Nesprávna veľkosť písma."
+msgstr "Neplatný zdroj pre shader."
 
 #: scene/resources/visual_shader_nodes.cpp
-#, fuzzy
 msgid "Invalid comparison function for that type."
-msgstr "Nesprávna veľkosť písma."
+msgstr "Neplatná funkcia porovnania pre tento typ."
 
 #: servers/visual/shader_language.cpp
 msgid "Assignment to function."
diff --git a/editor/translations/sl.po b/editor/translations/sl.po
index 55c60530b7..500b8b1e54 100644
--- a/editor/translations/sl.po
+++ b/editor/translations/sl.po
@@ -7845,6 +7845,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11228,6 +11233,13 @@ msgid "Remote"
 msgstr "Upravljalnik"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/sq.po b/editor/translations/sq.po
index 4ed115ecfb..7b2fee263a 100644
--- a/editor/translations/sq.po
+++ b/editor/translations/sq.po
@@ -7597,6 +7597,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10862,6 +10867,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/sr_Cyrl.po b/editor/translations/sr_Cyrl.po
index b8edfd5d95..cb28a6b876 100644
--- a/editor/translations/sr_Cyrl.po
+++ b/editor/translations/sr_Cyrl.po
@@ -8269,6 +8269,11 @@ msgid "View Rotation Locked"
 msgstr "Прикажи информације"
 
 #: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
 #, fuzzy
 msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
@@ -12253,6 +12258,13 @@ msgid "Remote"
 msgstr "Удаљени уређај"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 #, fuzzy
 msgid "Local"
 msgstr "Локално"
diff --git a/editor/translations/sr_Latn.po b/editor/translations/sr_Latn.po
index 8f79f445d8..86ce05a7f2 100644
--- a/editor/translations/sr_Latn.po
+++ b/editor/translations/sr_Latn.po
@@ -7365,6 +7365,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10626,6 +10631,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/sv.po b/editor/translations/sv.po
index 125d4c733e..2b3c17e07e 100644
--- a/editor/translations/sv.po
+++ b/editor/translations/sv.po
@@ -7670,6 +7670,11 @@ msgstr "Visa Information"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11013,6 +11018,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/ta.po b/editor/translations/ta.po
index 0fbcb5c3eb..9b57af9595 100644
--- a/editor/translations/ta.po
+++ b/editor/translations/ta.po
@@ -7326,6 +7326,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10543,6 +10548,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/te.po b/editor/translations/te.po
index de9f84e3a4..a3c48112a6 100644
--- a/editor/translations/te.po
+++ b/editor/translations/te.po
@@ -7270,6 +7270,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10460,6 +10465,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/th.po b/editor/translations/th.po
index 4ac8875aa6..d865b04b16 100644
--- a/editor/translations/th.po
+++ b/editor/translations/th.po
@@ -7425,6 +7425,11 @@ msgstr "ล็อคการหมุนวิวแล้ว"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10742,6 +10747,13 @@ msgid "Remote"
 msgstr "ระยะไกล"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "ระยะใกล้"
 
diff --git a/editor/translations/tr.po b/editor/translations/tr.po
index 619bd94bb1..47ac3ea764 100644
--- a/editor/translations/tr.po
+++ b/editor/translations/tr.po
@@ -7587,6 +7587,11 @@ msgstr "Dönme Kilitli Görünüm"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10963,6 +10968,13 @@ msgid "Remote"
 msgstr "Uzak"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Yerel"
 
diff --git a/editor/translations/tzm.po b/editor/translations/tzm.po
index 893d4134db..d13e2e5705 100644
--- a/editor/translations/tzm.po
+++ b/editor/translations/tzm.po
@@ -7268,6 +7268,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10457,6 +10462,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/uk.po b/editor/translations/uk.po
index 3dd58a87f4..1eed824645 100644
--- a/editor/translations/uk.po
+++ b/editor/translations/uk.po
@@ -7574,6 +7574,11 @@ msgstr "Обертання перегляду заблоковано"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10964,6 +10969,13 @@ msgid "Remote"
 msgstr "Віддалений"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Локальний"
 
diff --git a/editor/translations/ur_PK.po b/editor/translations/ur_PK.po
index 78698e90ba..697cc3e5a4 100644
--- a/editor/translations/ur_PK.po
+++ b/editor/translations/ur_PK.po
@@ -7434,6 +7434,11 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10721,6 +10726,13 @@ msgid "Remote"
 msgstr ".تمام کا انتخاب"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/vi.po b/editor/translations/vi.po
index fa600ca176..74d8666e35 100644
--- a/editor/translations/vi.po
+++ b/editor/translations/vi.po
@@ -22,7 +22,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: Godot Engine editor\n"
 "POT-Creation-Date: \n"
-"PO-Revision-Date: 2021-04-05 14:28+0000\n"
+"PO-Revision-Date: 2021-04-19 22:33+0000\n"
 "Last-Translator: Rev <revolnoom7801@gmail.com>\n"
 "Language-Team: Vietnamese <https://hosted.weblate.org/projects/godot-engine/"
 "godot/vi/>\n"
@@ -31,7 +31,7 @@ msgstr ""
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
 "Plural-Forms: nplurals=1; plural=0;\n"
-"X-Generator: Weblate 4.6-dev\n"
+"X-Generator: Weblate 4.7-dev\n"
 
 #: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
 #: modules/visual_script/visual_script_builtin_funcs.cpp
@@ -766,7 +766,7 @@ msgstr "Phương thức trong nút đích phải được chỉ định."
 
 #: editor/connections_dialog.cpp
 msgid "Method name must be a valid identifier."
-msgstr "Tên phương thức phải được chỉ định."
+msgstr "Tên phương thức phải là một định danh hợp lệ."
 
 #: editor/connections_dialog.cpp
 msgid ""
@@ -1124,7 +1124,7 @@ msgstr "Cảm ơn từ cộng đồng Godot!"
 
 #: editor/editor_about.cpp
 msgid "Godot Engine contributors"
-msgstr "Đóng góp vào Godot Engine"
+msgstr "Cá nhân đóng góp của Godot Engine"
 
 #: editor/editor_about.cpp
 msgid "Project Founders"
@@ -1508,7 +1508,7 @@ msgstr "Tên"
 
 #: editor/editor_autoload_settings.cpp
 msgid "Singleton"
-msgstr "Singleton"
+msgstr "Đơn nhất"
 
 #: editor/editor_data.cpp editor/inspector_dock.cpp
 msgid "Paste Params"
@@ -1640,7 +1640,6 @@ msgstr "Không tìm thấy mẫu gỡ lỗi tuỳ chỉnh."
 #: editor/editor_export.cpp platform/android/export/export.cpp
 #: platform/iphone/export/export.cpp platform/javascript/export/export.cpp
 #: platform/osx/export/export.cpp platform/uwp/export/export.cpp
-#, fuzzy
 msgid "Custom release template not found."
 msgstr "Không tìm thấy mẫu phát hành tùy chỉnh."
 
@@ -1670,16 +1669,15 @@ msgstr "Chỉnh sửa cảnh"
 
 #: editor/editor_feature_profile.cpp
 msgid "Node Dock"
-msgstr "Nút"
+msgstr "Khung nút"
 
 #: editor/editor_feature_profile.cpp
-#, fuzzy
 msgid "FileSystem Dock"
-msgstr "Hệ thống tập tin"
+msgstr "Khung Hệ thống tập tin"
 
 #: editor/editor_feature_profile.cpp
 msgid "Import Dock"
-msgstr "Nhập vào"
+msgstr "Khung Nhập"
 
 #: editor/editor_feature_profile.cpp
 msgid "Erase profile '%s'? (no undo)"
@@ -1926,7 +1924,7 @@ msgstr "Bỏ yêu thích thư mục hiện tại."
 
 #: editor/editor_file_dialog.cpp scene/gui/file_dialog.cpp
 msgid "Toggle the visibility of hidden files."
-msgstr "Hiện/ẩn các tệp ẩn."
+msgstr "Hiện/ẩn tệp ẩn."
 
 #: editor/editor_file_dialog.cpp editor/filesystem_dock.cpp
 msgid "View items as a grid of thumbnails."
@@ -2085,9 +2083,8 @@ msgid "Properties Only"
 msgstr "Chỉ tìm Thuộc tính"
 
 #: editor/editor_help_search.cpp
-#, fuzzy
 msgid "Theme Properties Only"
-msgstr "Chỉ tìm thuộc tính Tông màu"
+msgstr "Chỉ tìm cài đặt Tông màu"
 
 #: editor/editor_help_search.cpp
 msgid "Member Type"
@@ -2114,9 +2111,8 @@ msgid "Property"
 msgstr "Thuộc tính"
 
 #: editor/editor_help_search.cpp
-#, fuzzy
 msgid "Theme Property"
-msgstr "Thuộc tính Tông màu"
+msgstr "Cài đặt Tông màu"
 
 #: editor/editor_inspector.cpp editor/project_settings_editor.cpp
 msgid "Property:"
@@ -2628,28 +2624,27 @@ msgstr "Chạy cảnh này"
 
 #: editor/editor_node.cpp
 msgid "Close Tab"
-msgstr "Đóng Tab"
+msgstr "Đóng Cửa sổ"
 
 #: editor/editor_node.cpp
-#, fuzzy
 msgid "Undo Close Tab"
-msgstr "Đóng Tab"
+msgstr "Hoàn tác đóng cửa sổ"
 
 #: editor/editor_node.cpp editor/plugins/script_editor_plugin.cpp
 msgid "Close Other Tabs"
-msgstr "Đóng tất cả Tab khác"
+msgstr "Đóng các cửa sổ khác"
 
 #: editor/editor_node.cpp
 msgid "Close Tabs to the Right"
-msgstr "Đóng các Tab bên phải"
+msgstr "Đóng các cửa sổ bên phải"
 
 #: editor/editor_node.cpp
 msgid "Close All Tabs"
-msgstr "Đóng tất cả"
+msgstr "Đóng hết cửa sổ"
 
 #: editor/editor_node.cpp
 msgid "Switch Scene Tab"
-msgstr "Chuyển Tab cảnh"
+msgstr "Chuyển Cửa sổ cảnh"
 
 #: editor/editor_node.cpp
 msgid "%d more files or folders"
@@ -2657,15 +2652,15 @@ msgstr "%d tệp hoặc thư mục nữa"
 
 #: editor/editor_node.cpp
 msgid "%d more folders"
-msgstr "%d thêm các thư mục"
+msgstr "%d thư mục nữa"
 
 #: editor/editor_node.cpp
 msgid "%d more files"
-msgstr "%d thêm các tệp tin"
+msgstr "%d tệp tin nữa"
 
 #: editor/editor_node.cpp
 msgid "Dock Position"
-msgstr "Vị trí Dock"
+msgstr "Vị trí Khung"
 
 #: editor/editor_node.cpp
 msgid "Distraction Free Mode"
@@ -2677,28 +2672,27 @@ msgstr "Bật tắt chế độ tập trung."
 
 #: editor/editor_node.cpp
 msgid "Add a new scene."
-msgstr "Thêm một cảnh mới."
+msgstr "Thêm cảnh mới."
 
 #: editor/editor_node.cpp
 msgid "Scene"
-msgstr "Phân cảnh"
+msgstr "Cảnh"
 
 #: editor/editor_node.cpp
 msgid "Go to previously opened scene."
 msgstr "Trở về cảnh đã mở trước đó."
 
 #: editor/editor_node.cpp
-#, fuzzy
 msgid "Copy Text"
-msgstr "Sao chép đường dẫn"
+msgstr "Sao chép văn bản"
 
 #: editor/editor_node.cpp
 msgid "Next tab"
-msgstr "Tab tiếp theo"
+msgstr "Cửa sổ tiếp theo"
 
 #: editor/editor_node.cpp
 msgid "Previous tab"
-msgstr "Tab trước"
+msgstr "Cửa sổ trước"
 
 #: editor/editor_node.cpp
 msgid "Filter Files..."
@@ -2734,7 +2728,7 @@ msgstr "Lưu hết các Cảnh"
 
 #: editor/editor_node.cpp
 msgid "Convert To..."
-msgstr "Chuyển đổi ..."
+msgstr "Chuyển thành..."
 
 #: editor/editor_node.cpp
 msgid "MeshLibrary..."
@@ -2756,7 +2750,7 @@ msgstr "Làm lại"
 
 #: editor/editor_node.cpp
 msgid "Miscellaneous project or scene-wide tools."
-msgstr "Linh tinh dự án hoặc công cụ toàn phân cảnh."
+msgstr "Các công cụ khác dành cho dự án hoặc toàn bộ cảnh."
 
 #: editor/editor_node.cpp editor/project_manager.cpp
 #: editor/script_create_dialog.cpp
@@ -3184,11 +3178,11 @@ msgstr "Mã lệnh chính:"
 
 #: editor/editor_plugin_settings.cpp
 msgid "Edit Plugin"
-msgstr "Chỉnh phần mềm bổ trợ"
+msgstr "Chỉnh sửa Tiện ích"
 
 #: editor/editor_plugin_settings.cpp
 msgid "Installed Plugins:"
-msgstr "Các phần mềm bổ trợ đã cài:"
+msgstr "Các Tiện ích đã cài:"
 
 #: editor/editor_plugin_settings.cpp editor/plugin_config_dialog.cpp
 msgid "Update"
@@ -4065,7 +4059,7 @@ msgstr "Nạp mặc định"
 
 #: editor/import_dock.cpp
 msgid "Keep File (No Import)"
-msgstr ""
+msgstr "Giữ tệp (Không Nhập)"
 
 #: editor/import_dock.cpp
 msgid "%d Files"
@@ -4092,9 +4086,8 @@ msgid "Reimport"
 msgstr "Nhập vào lại"
 
 #: editor/import_dock.cpp
-#, fuzzy
 msgid "Save Scenes, Re-Import, and Restart"
-msgstr "Lưu các cảnh, nhập vào lại và khởi động lại"
+msgstr "Lưu các cảnh, nhập lại, rồi tái khởi động"
 
 #: editor/import_dock.cpp
 msgid "Changing the type of an imported file requires editor restart."
@@ -4194,16 +4187,15 @@ msgstr "Chọn nút duy nhất để chỉnh sửa tính hiệu và nhóm của
 
 #: editor/plugin_config_dialog.cpp
 msgid "Edit a Plugin"
-msgstr "Chỉnh phần mềm bổ trợ"
+msgstr "Chỉnh Tiện ích"
 
 #: editor/plugin_config_dialog.cpp
-#, fuzzy
 msgid "Create a Plugin"
-msgstr "Tạo & Sửa"
+msgstr "Tạo Tiện ích"
 
 #: editor/plugin_config_dialog.cpp
 msgid "Plugin Name:"
-msgstr "Tên phần mềm bổ trợ:"
+msgstr "Tên Tiện ích:"
 
 #: editor/plugin_config_dialog.cpp
 msgid "Subfolder:"
@@ -4284,14 +4276,12 @@ msgid "Move Node Point"
 msgstr "Di chuyển điểm Nút"
 
 #: editor/plugins/animation_blend_space_1d_editor.cpp
-#, fuzzy
 msgid "Change BlendSpace1D Limits"
-msgstr "Đổi Thời gian Chuyển Animation"
+msgstr "Thay đổi giới hạn BlendSpace1D"
 
 #: editor/plugins/animation_blend_space_1d_editor.cpp
-#, fuzzy
 msgid "Change BlendSpace1D Labels"
-msgstr "Đổi Thời gian Chuyển Animation"
+msgstr "Thay đổi nhãn BlendSpace1D"
 
 #: editor/plugins/animation_blend_space_1d_editor.cpp
 #: editor/plugins/animation_blend_space_2d_editor.cpp
@@ -5133,7 +5123,7 @@ msgstr "Nhập..."
 
 #: editor/plugins/asset_library_editor_plugin.cpp
 msgid "Plugins..."
-msgstr "Các phần mềm bổ trợ..."
+msgstr "Tiện ích..."
 
 #: editor/plugins/asset_library_editor_plugin.cpp editor/project_manager.cpp
 msgid "Sort:"
@@ -5943,9 +5933,8 @@ msgid "Hold Shift to edit tangents individually"
 msgstr "Giữ Shift để sửa từng tiếp tuyến một"
 
 #: editor/plugins/curve_editor_plugin.cpp
-#, fuzzy
 msgid "Right click to add point"
-msgstr "Nhấp chuột phải: Xóa Point"
+msgstr "Nhấp chuột phải để thêm điểm"
 
 #: editor/plugins/gi_probe_editor_plugin.cpp
 msgid "Bake GI Probe"
@@ -5953,7 +5942,7 @@ msgstr ""
 
 #: editor/plugins/gradient_editor_plugin.cpp
 msgid "Gradient Edited"
-msgstr ""
+msgstr "Dải màu đã được chỉnh sửa"
 
 #: editor/plugins/item_list_editor_plugin.cpp
 msgid "Item %d"
@@ -6009,9 +5998,8 @@ msgid "Can't create multiple convex collision shapes for the scene root."
 msgstr ""
 
 #: editor/plugins/mesh_instance_editor_plugin.cpp
-#, fuzzy
 msgid "Couldn't create any collision shapes."
-msgstr "Không thể tạo folder."
+msgstr "Không thể tạo bất kì khối va chạm nào."
 
 #: editor/plugins/mesh_instance_editor_plugin.cpp
 #, fuzzy
@@ -6162,9 +6150,8 @@ msgstr ""
 "%s"
 
 #: editor/plugins/mesh_library_editor_plugin.cpp
-#, fuzzy
 msgid "Mesh Library"
-msgstr "Xuất Mesh Library"
+msgstr "Thư viện Lưới"
 
 #: editor/plugins/mesh_library_editor_plugin.cpp
 #: editor/plugins/theme_editor_plugin.cpp
@@ -6222,11 +6209,11 @@ msgstr ""
 
 #: editor/plugins/multimesh_editor_plugin.cpp
 msgid "Select a Source Mesh:"
-msgstr ""
+msgstr "Chọn một lưới nguồn:"
 
 #: editor/plugins/multimesh_editor_plugin.cpp
 msgid "Select a Target Surface:"
-msgstr ""
+msgstr "Chọn Bề mặt tác động:"
 
 #: editor/plugins/multimesh_editor_plugin.cpp
 msgid "Populate Surface"
@@ -6242,7 +6229,7 @@ msgstr ""
 
 #: editor/plugins/multimesh_editor_plugin.cpp
 msgid "Source Mesh:"
-msgstr ""
+msgstr "Lưới nguồn:"
 
 #: editor/plugins/multimesh_editor_plugin.cpp
 msgid "X-Axis"
@@ -6262,15 +6249,15 @@ msgstr ""
 
 #: editor/plugins/multimesh_editor_plugin.cpp
 msgid "Random Rotation:"
-msgstr ""
+msgstr "Xoay ngẫu nhiên:"
 
 #: editor/plugins/multimesh_editor_plugin.cpp
 msgid "Random Tilt:"
-msgstr ""
+msgstr "Nghiêng ngẫu nhiên:"
 
 #: editor/plugins/multimesh_editor_plugin.cpp
 msgid "Random Scale:"
-msgstr ""
+msgstr "Thu phóng ngẫu nhiên:"
 
 #: editor/plugins/multimesh_editor_plugin.cpp
 msgid "Populate"
@@ -6284,7 +6271,7 @@ msgstr ""
 #: editor/plugins/particles_2d_editor_plugin.cpp
 #: editor/plugins/particles_editor_plugin.cpp
 msgid "Convert to CPUParticles"
-msgstr ""
+msgstr "Chuyển thành CPUParticles"
 
 #: editor/plugins/particles_2d_editor_plugin.cpp
 msgid "Generating Visibility Rect"
@@ -6299,9 +6286,8 @@ msgid "Can only set point into a ParticlesMaterial process material"
 msgstr ""
 
 #: editor/plugins/particles_2d_editor_plugin.cpp
-#, fuzzy
 msgid "Convert to CPUParticles2D"
-msgstr "Xóa Animation"
+msgstr "Chuyển thành CPUParticles2D"
 
 #: editor/plugins/particles_2d_editor_plugin.cpp
 #: editor/plugins/particles_editor_plugin.cpp
@@ -6519,10 +6505,12 @@ msgid "Create UV Map"
 msgstr ""
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
+#, fuzzy
 msgid ""
 "Polygon 2D has internal vertices, so it can no longer be edited in the "
 "viewport."
 msgstr ""
+"Đa giác 2D có đỉnh nằm trong, vì vậy không thể chỉnh sửa trong cổng xem."
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
 msgid "Create Polygon & UV"
@@ -6541,23 +6529,20 @@ msgid "Invalid Polygon (need 3 different vertices)"
 msgstr "Đa giác không hợp lệ (cần 3 đỉnh khác nhau)"
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
-#, fuzzy
 msgid "Add Custom Polygon"
-msgstr "Tạo"
+msgstr "Thêm Đa giác Tùy chỉnh"
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
-#, fuzzy
 msgid "Remove Custom Polygon"
-msgstr "Xóa Animation"
+msgstr "Xóa Đa giác Tùy chỉnh"
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
 msgid "Transform UV Map"
 msgstr ""
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
-#, fuzzy
 msgid "Transform Polygon"
-msgstr "Tạo"
+msgstr "Biến đổi đa giác"
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
 msgid "Paint Bone Weights"
@@ -6580,9 +6565,8 @@ msgid "Points"
 msgstr "Các Điểm"
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
-#, fuzzy
 msgid "Polygons"
-msgstr "Tạo"
+msgstr "Đa giác"
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
 msgid "Bones"
@@ -6593,9 +6577,8 @@ msgid "Move Points"
 msgstr "Di chuyển các điểm"
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
-#, fuzzy
 msgid "Command: Rotate"
-msgstr "Kéo: Xoay"
+msgstr "Nút Command: Xoay"
 
 #: editor/plugins/polygon_2d_editor_plugin.cpp
 msgid "Shift: Move All"
@@ -6723,7 +6706,7 @@ msgstr "Xóa tài nguyên"
 
 #: editor/plugins/resource_preloader_editor_plugin.cpp
 msgid "Resource clipboard is empty!"
-msgstr ""
+msgstr "Khay nhớ tạm Tài nguyên trống!"
 
 #: editor/plugins/resource_preloader_editor_plugin.cpp
 msgid "Paste Resource"
@@ -6764,7 +6747,7 @@ msgstr "Đường dẫn tới AnimationPlayer không hợp lệ"
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Clear Recent Files"
-msgstr ""
+msgstr "Xóa lịch sử Tệp gần đây"
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Close and save changes?"
@@ -6775,9 +6758,8 @@ msgid "Error writing TextFile:"
 msgstr "Lỗi viết TextFile:"
 
 #: editor/plugins/script_editor_plugin.cpp
-#, fuzzy
 msgid "Could not load file at:"
-msgstr "Không viết được file:"
+msgstr "Không tải được tệp tại:"
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Error saving file!"
@@ -6800,19 +6782,16 @@ msgid "Error Importing"
 msgstr "Lỗi Khi Nhập"
 
 #: editor/plugins/script_editor_plugin.cpp
-#, fuzzy
 msgid "New Text File..."
-msgstr "Thư mục mới ..."
+msgstr "Tệp văn bản mới..."
 
 #: editor/plugins/script_editor_plugin.cpp
-#, fuzzy
 msgid "Open File"
-msgstr "Mở"
+msgstr "Mở tệp"
 
 #: editor/plugins/script_editor_plugin.cpp
-#, fuzzy
 msgid "Save File As..."
-msgstr "Lưu Scene với tên..."
+msgstr "Lưu tệp thành..."
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Can't obtain the script for running."
@@ -6862,18 +6841,16 @@ msgid "Find Previous"
 msgstr "Tìm trước đó"
 
 #: editor/plugins/script_editor_plugin.cpp
-#, fuzzy
 msgid "Filter scripts"
-msgstr "Lọc các thuộc tính"
+msgstr "Lọc tệp lệnh"
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Toggle alphabetical sorting of the method list."
 msgstr "Bật/tắt sắp xếp danh sách phương thức theo bảng chữ cái."
 
 #: editor/plugins/script_editor_plugin.cpp
-#, fuzzy
 msgid "Filter methods"
-msgstr "Lọc các nút"
+msgstr "Lọc phương thức"
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Sort"
@@ -6908,9 +6885,8 @@ msgid "Open..."
 msgstr "Mở..."
 
 #: editor/plugins/script_editor_plugin.cpp
-#, fuzzy
 msgid "Reopen Closed Script"
-msgstr "Tạo Script"
+msgstr "Mở lại tệp lệnh đã đóng"
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Save All"
@@ -6989,11 +6965,11 @@ msgstr "Tiếp tục"
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Keep Debugger Open"
-msgstr ""
+msgstr "Giữ Trình gỡ lỗi mở"
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Debug with External Editor"
-msgstr ""
+msgstr "Gỡ lỗi bằng Trình chỉnh sửa bên ngoài"
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Open Godot online documentation."
@@ -7005,11 +6981,11 @@ msgstr "Tìm tài liệu tham khảo."
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Go to previous edited document."
-msgstr ""
+msgstr "Tới tài liệu được chỉnh sửa trước đó."
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Go to next edited document."
-msgstr ""
+msgstr "Tới tài liệu được chỉnh sửa tiếp theo."
 
 #: editor/plugins/script_editor_plugin.cpp
 msgid "Discard"
@@ -7020,20 +6996,20 @@ msgid ""
 "The following files are newer on disk.\n"
 "What action should be taken?:"
 msgstr ""
+"Các tệp sau đây mới hơn trên ổ cứng.\n"
+"Hãy chọn hành động của bạn:"
 
 #: editor/plugins/script_editor_plugin.cpp editor/script_editor_debugger.cpp
 msgid "Debugger"
-msgstr ""
+msgstr "Trình gỡ lỗi"
 
 #: editor/plugins/script_editor_plugin.cpp
-#, fuzzy
 msgid "Search Results"
-msgstr "Tìm sự giúp đỡ"
+msgstr "Kết quả tìm kiếm"
 
 #: editor/plugins/script_editor_plugin.cpp
-#, fuzzy
 msgid "Clear Recent Scripts"
-msgstr "Dọn các cảnh gần đây"
+msgstr "Dọn các tệp lệnh gần đây"
 
 #: editor/plugins/script_text_editor.cpp
 msgid "Connections to method:"
@@ -7041,7 +7017,7 @@ msgstr "Kết nối đến phương thức:"
 
 #: editor/plugins/script_text_editor.cpp editor/script_editor_debugger.cpp
 msgid "Source"
-msgstr ""
+msgstr "Nguồn"
 
 #: editor/plugins/script_text_editor.cpp
 msgid "Target"
@@ -7062,9 +7038,8 @@ msgid "Line"
 msgstr "Dòng"
 
 #: editor/plugins/script_text_editor.cpp
-#, fuzzy
 msgid "Go to Function"
-msgstr "Thêm Hàm"
+msgstr "Đi tới Hàm"
 
 #: editor/plugins/script_text_editor.cpp
 msgid "Only resources from filesystem can be dropped."
@@ -7160,8 +7135,9 @@ msgid "Clone Down"
 msgstr ""
 
 #: editor/plugins/script_text_editor.cpp
+#, fuzzy
 msgid "Complete Symbol"
-msgstr ""
+msgstr "Hoàn thiện kí hiệu"
 
 #: editor/plugins/script_text_editor.cpp
 #, fuzzy
@@ -7185,28 +7161,24 @@ msgid "Auto Indent"
 msgstr "Thụt lề Tự động"
 
 #: editor/plugins/script_text_editor.cpp
-#, fuzzy
 msgid "Find in Files..."
-msgstr "Tìm..."
+msgstr "Tìm trong Tệp..."
 
 #: editor/plugins/script_text_editor.cpp
 msgid "Contextual Help"
 msgstr ""
 
 #: editor/plugins/script_text_editor.cpp
-#, fuzzy
 msgid "Toggle Bookmark"
-msgstr "Bật tắt Chức năng"
+msgstr "Bật tắt Dấu trang"
 
 #: editor/plugins/script_text_editor.cpp
-#, fuzzy
 msgid "Go to Next Bookmark"
-msgstr "Đến Step tiếp theo"
+msgstr "Đến Dấu trang tiếp theo"
 
 #: editor/plugins/script_text_editor.cpp
-#, fuzzy
 msgid "Go to Previous Bookmark"
-msgstr "Đến Step trước đó"
+msgstr "Đến Dấu trang trước đó"
 
 #: editor/plugins/script_text_editor.cpp
 msgid "Remove All Bookmarks"
@@ -7222,7 +7194,6 @@ msgstr "Đến Dòng..."
 
 #: editor/plugins/script_text_editor.cpp
 #: modules/visual_script/visual_script_editor.cpp
-#, fuzzy
 msgid "Toggle Breakpoint"
 msgstr "Tạo điểm dừng"
 
@@ -7231,37 +7202,36 @@ msgid "Remove All Breakpoints"
 msgstr "Xóa hết mọi điểm dừng"
 
 #: editor/plugins/script_text_editor.cpp
-#, fuzzy
 msgid "Go to Next Breakpoint"
-msgstr "Đến Step tiếp theo"
+msgstr "Đến điểm dừng tiếp theo"
 
 #: editor/plugins/script_text_editor.cpp
-#, fuzzy
 msgid "Go to Previous Breakpoint"
-msgstr "Đến Step trước đó"
+msgstr "Đến điểm dừng trước đó"
 
 #: editor/plugins/shader_editor_plugin.cpp
 msgid ""
 "This shader has been modified on on disk.\n"
 "What action should be taken?"
 msgstr ""
+"Shader này đã bị chỉnh sửa trên bộ nhớ.\n"
+"Hành động nào nên được thực hiện?"
 
 #: editor/plugins/shader_editor_plugin.cpp
 msgid "Shader"
-msgstr ""
+msgstr "Shader"
 
 #: editor/plugins/skeleton_2d_editor_plugin.cpp
 msgid "This skeleton has no bones, create some children Bone2D nodes."
 msgstr "Bộ xương không có xương, tạo một số nút Bone2D."
 
 #: editor/plugins/skeleton_2d_editor_plugin.cpp
-#, fuzzy
 msgid "Create Rest Pose from Bones"
-msgstr "Tạo từ Scene"
+msgstr "Tạo tư thế nghỉ từ Xương"
 
 #: editor/plugins/skeleton_2d_editor_plugin.cpp
 msgid "Set Rest Pose to Bones"
-msgstr ""
+msgstr "Đặt tư thế nghỉ cho Xương"
 
 #: editor/plugins/skeleton_2d_editor_plugin.cpp
 msgid "Skeleton2D"
@@ -7280,18 +7250,16 @@ msgid "Create physical bones"
 msgstr "Tạo xương vật lý"
 
 #: editor/plugins/skeleton_editor_plugin.cpp
-#, fuzzy
 msgid "Skeleton"
-msgstr "Xóa Point"
+msgstr "Khung xương"
 
 #: editor/plugins/skeleton_editor_plugin.cpp
-#, fuzzy
 msgid "Create physical skeleton"
-msgstr "Tạo bộ xương vật lý"
+msgstr "Tạo khung xương vật lý"
 
 #: editor/plugins/skeleton_ik_editor_plugin.cpp
 msgid "Play IK"
-msgstr ""
+msgstr "Chạy IK"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Orthogonal"
@@ -7303,19 +7271,19 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Transform Aborted."
-msgstr ""
+msgstr "Hủy Biến đổi."
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "X-Axis Transform."
-msgstr ""
+msgstr "Biến đổi theo trục X."
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Y-Axis Transform."
-msgstr ""
+msgstr "Biến đổi theo trục Y."
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Z-Axis Transform."
-msgstr ""
+msgstr "Biến đổi theo trục Z."
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "View Plane Transform."
@@ -7331,7 +7299,7 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Rotating %s degrees."
-msgstr ""
+msgstr "Xoay %s độ."
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Keying is disabled (no key inserted)."
@@ -7339,7 +7307,7 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Animation Key Inserted."
-msgstr ""
+msgstr "Đã chèn khóa hoạt ảnh."
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Pitch"
@@ -7375,7 +7343,7 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Vertices"
-msgstr ""
+msgstr "Đỉnh"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Top View."
@@ -7537,6 +7505,11 @@ msgstr "Đã khóa xoay ở chế độ xem"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -7629,7 +7602,6 @@ msgstr ""
 
 #: editor/plugins/spatial_editor_plugin.cpp
 #: editor/plugins/visual_shader_editor_plugin.cpp
-#, fuzzy
 msgid "Transform"
 msgstr "Biến đổi"
 
@@ -7638,7 +7610,6 @@ msgid "Snap Object to Floor"
 msgstr "Dính Vật lên Sàn"
 
 #: editor/plugins/spatial_editor_plugin.cpp
-#, fuzzy
 msgid "Transform Dialog..."
 msgstr "Hộp thoại Biến đổi ..."
 
@@ -7680,9 +7651,8 @@ msgstr "Xem Lưới"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 #: modules/gridmap/grid_map_editor_plugin.cpp
-#, fuzzy
 msgid "Settings..."
-msgstr "Đang kết nối..."
+msgstr "Cài đặt..."
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Snap Settings"
@@ -7734,7 +7704,7 @@ msgstr "Thu phóng (theo tỉ lệ):"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Transform Type"
-msgstr ""
+msgstr "Kiểu biến đổi"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Pre"
@@ -7749,46 +7719,38 @@ msgid "Nameless gizmo"
 msgstr ""
 
 #: editor/plugins/sprite_editor_plugin.cpp
-#, fuzzy
 msgid "Create Mesh2D"
-msgstr "Tạo %s Mới"
+msgstr "Tạo Mesh2D"
 
 #: editor/plugins/sprite_editor_plugin.cpp
-#, fuzzy
 msgid "Mesh2D Preview"
-msgstr "Xem thử"
+msgstr "Xem trước Mesh2D"
 
 #: editor/plugins/sprite_editor_plugin.cpp
-#, fuzzy
 msgid "Create Polygon2D"
-msgstr "Tạo"
+msgstr "Tạo Polygon2D"
 
 #: editor/plugins/sprite_editor_plugin.cpp
 msgid "Polygon2D Preview"
 msgstr "Xem trước Polygon2D"
 
 #: editor/plugins/sprite_editor_plugin.cpp
-#, fuzzy
 msgid "Create CollisionPolygon2D"
-msgstr "Tạo"
+msgstr "Tạo CollisionPolygon2D"
 
 #: editor/plugins/sprite_editor_plugin.cpp
-#, fuzzy
 msgid "CollisionPolygon2D Preview"
-msgstr "Tạo"
+msgstr "Xem trước CollisionPolygon2D"
 
 #: editor/plugins/sprite_editor_plugin.cpp
-#, fuzzy
 msgid "Create LightOccluder2D"
-msgstr "Tạo Folder"
+msgstr "Tạo LightOccluder2D"
 
 #: editor/plugins/sprite_editor_plugin.cpp
-#, fuzzy
 msgid "LightOccluder2D Preview"
-msgstr "Tạo Folder"
+msgstr "Xem trước LightOccluder2D"
 
 #: editor/plugins/sprite_editor_plugin.cpp
-#, fuzzy
 msgid "Sprite is empty!"
 msgstr "Sprite trống!"
 
@@ -7809,9 +7771,8 @@ msgid "Invalid geometry, can't create polygon."
 msgstr ""
 
 #: editor/plugins/sprite_editor_plugin.cpp
-#, fuzzy
 msgid "Convert to Polygon2D"
-msgstr "Xóa Animation"
+msgstr "Chuyển thành Polygon2D"
 
 #: editor/plugins/sprite_editor_plugin.cpp
 msgid "Invalid geometry, can't create collision polygon."
@@ -7832,7 +7793,7 @@ msgstr ""
 
 #: editor/plugins/sprite_editor_plugin.cpp
 msgid "Sprite"
-msgstr ""
+msgstr "Sprite"
 
 #: editor/plugins/sprite_editor_plugin.cpp
 msgid "Simplification: "
@@ -7848,16 +7809,15 @@ msgstr ""
 
 #: editor/plugins/sprite_editor_plugin.cpp
 msgid "Update Preview"
-msgstr ""
+msgstr "Cập nhật bản xem trước"
 
 #: editor/plugins/sprite_editor_plugin.cpp
 msgid "Settings:"
 msgstr "Cài đặt:"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
-#, fuzzy
 msgid "No Frames Selected"
-msgstr "Xoá lựa chọn"
+msgstr "Chưa chọn khung hình nào"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
 msgid "Add %d Frame(s)"
@@ -7873,15 +7833,15 @@ msgstr "Không tải được hình ảnh"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
 msgid "ERROR: Couldn't load frame resource!"
-msgstr ""
+msgstr "LỖI: Không thể nạp khung hình!"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
 msgid "Resource clipboard is empty or not a texture!"
-msgstr ""
+msgstr "Khay nhớ tạm tài nguyên bị trống hoặc không chứa họa tiết!"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
 msgid "Paste Frame"
-msgstr ""
+msgstr "Dán Khung hình"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
 msgid "Add Empty"
@@ -7896,18 +7856,16 @@ msgid "(empty)"
 msgstr "(trống)"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
-#, fuzzy
 msgid "Move Frame"
-msgstr "Di chuyển Nút"
+msgstr "Di chuyển Khung hình"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
 msgid "Animations:"
 msgstr "Các hoạt ảnh:"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
-#, fuzzy
 msgid "New Animation"
-msgstr "Tạo Animation mới"
+msgstr "Tạo Hoạt ảnh mới"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
 msgid "Speed:"
@@ -7918,18 +7876,16 @@ msgid "Loop"
 msgstr "Lặp"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
-#, fuzzy
 msgid "Animation Frames:"
-msgstr "Tên Animation:"
+msgstr "Khung hình Hoạt ảnh:"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
-#, fuzzy
 msgid "Add a Texture from File"
-msgstr "Chèn Texture(s) vào TileSet"
+msgstr "Thêm Họa tiết từ tệp"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
 msgid "Add Frames from a Sprite Sheet"
-msgstr ""
+msgstr "Thêm Khung hình từ Sprite Sheet"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
 msgid "Insert Empty (Before)"
@@ -7948,9 +7904,8 @@ msgid "Move (After)"
 msgstr "Di chuyển (Sau)"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
-#, fuzzy
 msgid "Select Frames"
-msgstr "Chọn Points"
+msgstr "Chọn Khung hình"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
 msgid "Horizontal:"
@@ -7965,9 +7920,8 @@ msgid "Select/Clear All Frames"
 msgstr "Chọn/Xóa Tất cả Khung hình"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
-#, fuzzy
 msgid "Create Frames from Sprite Sheet"
-msgstr "Tạo từ Scene"
+msgstr "Tạo Khung hình từ Sprite Sheet"
 
 #: editor/plugins/sprite_frames_editor_plugin.cpp
 msgid "SpriteFrames"
@@ -8004,7 +7958,7 @@ msgstr ""
 
 #: editor/plugins/texture_region_editor_plugin.cpp
 msgid "Offset:"
-msgstr ""
+msgstr "Độ dời:"
 
 #: editor/plugins/texture_region_editor_plugin.cpp
 msgid "Step:"
@@ -8020,7 +7974,7 @@ msgstr "TextureRegion"
 
 #: editor/plugins/theme_editor_plugin.cpp
 msgid "Add All Items"
-msgstr ""
+msgstr "Thêm tất cả các mục"
 
 #: editor/plugins/theme_editor_plugin.cpp
 msgid "Add All"
@@ -8044,11 +7998,11 @@ msgstr "Menu chỉnh Tông màu."
 
 #: editor/plugins/theme_editor_plugin.cpp
 msgid "Add Class Items"
-msgstr ""
+msgstr "Thêm mục Lớp"
 
 #: editor/plugins/theme_editor_plugin.cpp
 msgid "Remove Class Items"
-msgstr ""
+msgstr "Xóa mục Lớp"
 
 #: editor/plugins/theme_editor_plugin.cpp
 msgid "Create Empty Template"
@@ -8056,7 +8010,7 @@ msgstr "Tạo Mẫu Trống"
 
 #: editor/plugins/theme_editor_plugin.cpp
 msgid "Create Empty Editor Template"
-msgstr ""
+msgstr "Tạo mẫu Trình biên tập trống"
 
 #: editor/plugins/theme_editor_plugin.cpp
 msgid "Create From Current Editor Theme"
@@ -8077,9 +8031,8 @@ msgid "Item"
 msgstr "Mục"
 
 #: editor/plugins/theme_editor_plugin.cpp
-#, fuzzy
 msgid "Disabled Item"
-msgstr "Tắt"
+msgstr "Mục bị tắt"
 
 #: editor/plugins/theme_editor_plugin.cpp
 msgid "Check Item"
@@ -8139,9 +8092,8 @@ msgid "Tab 3"
 msgstr ""
 
 #: editor/plugins/theme_editor_plugin.cpp
-#, fuzzy
 msgid "Editable Item"
-msgstr "Chỉnh Thời gian Chuyển Animation"
+msgstr "Mục có thể chỉnh sửa"
 
 #: editor/plugins/theme_editor_plugin.cpp
 msgid "Subtree"
@@ -8166,7 +8118,7 @@ msgstr ""
 
 #: editor/plugins/theme_editor_plugin.cpp
 msgid "Font"
-msgstr ""
+msgstr "Phông chữ"
 
 #: editor/plugins/theme_editor_plugin.cpp
 msgid "Color"
@@ -8186,9 +8138,8 @@ msgstr "Sửa các ô không hợp lệ"
 
 #: editor/plugins/tile_map_editor_plugin.cpp
 #: modules/gridmap/grid_map_editor_plugin.cpp
-#, fuzzy
 msgid "Cut Selection"
-msgstr "Nhân đôi lựa chọn"
+msgstr "Cắt lựa chọn"
 
 #: editor/plugins/tile_map_editor_plugin.cpp
 msgid "Paint TileMap"
@@ -8211,9 +8162,8 @@ msgid "Erase TileMap"
 msgstr "Xóa TileMap"
 
 #: editor/plugins/tile_map_editor_plugin.cpp
-#, fuzzy
 msgid "Find Tile"
-msgstr "Tìm tiếp theo"
+msgstr "Tìm Ô"
 
 #: editor/plugins/tile_map_editor_plugin.cpp
 msgid "Transpose"
@@ -8273,9 +8223,8 @@ msgid "Flip Vertically"
 msgstr "Lật Dọc"
 
 #: editor/plugins/tile_map_editor_plugin.cpp
-#, fuzzy
 msgid "Clear Transform"
-msgstr "Đổi Transform Animation"
+msgstr "Xóa biến đổi"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Add Texture(s) to TileSet."
@@ -8295,7 +8244,7 @@ msgstr "Gộp từ Scene"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "New Single Tile"
-msgstr ""
+msgstr "Tạo Ô mới"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 #, fuzzy
@@ -8303,9 +8252,8 @@ msgid "New Autotile"
 msgstr "Hoạt ảnh mới"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "New Atlas"
-msgstr "Mới %s"
+msgstr "Tập bản đồ mới"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Next Coordinate"
@@ -8316,9 +8264,8 @@ msgid "Select the next shape, subtile, or Tile."
 msgstr ""
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Previous Coordinate"
-msgstr "Thư mục trước"
+msgstr "Tọa độ trước"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Select the previous shape, subtile, or Tile."
@@ -8329,9 +8276,8 @@ msgid "Region"
 msgstr ""
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Collision"
-msgstr "Tạo"
+msgstr "Va chạm"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 #, fuzzy
@@ -8344,7 +8290,7 @@ msgstr "Điều hướng"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Bitmask"
-msgstr ""
+msgstr "Bitmask"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Priority"
@@ -8359,9 +8305,8 @@ msgid "Region Mode"
 msgstr ""
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Collision Mode"
-msgstr "Tạo"
+msgstr "Chế độ va chạm"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 #, fuzzy
@@ -8374,7 +8319,7 @@ msgstr "Chế độ di chuyển"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Bitmask Mode"
-msgstr ""
+msgstr "Chế độ Bitmask"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Priority Mode"
@@ -8391,7 +8336,7 @@ msgstr ""
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Copy bitmask."
-msgstr ""
+msgstr "Sao chép bitmask."
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Paste bitmask."
@@ -8399,30 +8344,27 @@ msgstr "Dán bitmask."
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Erase bitmask."
-msgstr ""
+msgstr "Xóa bitmask."
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Create a new rectangle."
 msgstr "Tạo hình chữ nhật mới."
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "New Rectangle"
-msgstr "Tạo Cảnh Mới"
+msgstr "Hình chữ nhật mới"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Create a new polygon."
 msgstr "Tạo đa giác mới."
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "New Polygon"
-msgstr "Tạo"
+msgstr "Đa giác mới"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Delete Selected Shape"
-msgstr "Xoá lựa chọn"
+msgstr "Xoá Hình được chọn"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Keep polygon inside region Rect."
@@ -8458,9 +8400,8 @@ msgid "Merge from scene?"
 msgstr "Hợp nhất từ cảnh?"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Remove Texture"
-msgstr "Xóa Template"
+msgstr "Xóa Họa tiết"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "%s file(s) were not added because was already on the list."
@@ -8519,23 +8460,20 @@ msgid "Set Tile Region"
 msgstr ""
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Create Tile"
-msgstr "Tạo Folder"
+msgstr "Tạo Ô"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Set Tile Icon"
 msgstr ""
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Edit Tile Bitmask"
-msgstr "Chỉnh Thời gian Chuyển Animation"
+msgstr "Sửa bitmask của ô"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Edit Collision Polygon"
-msgstr "Tạo"
+msgstr "Chỉnh đa giác va chạm"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 #, fuzzy
@@ -8543,32 +8481,28 @@ msgid "Edit Occlusion Polygon"
 msgstr "Tạo"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Edit Navigation Polygon"
-msgstr "Tạo"
+msgstr "Chỉnh đa giác điều hướng"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Paste Tile Bitmask"
-msgstr "Dán Animation"
+msgstr "Dán bitmask các ô"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Clear Tile Bitmask"
-msgstr ""
+msgstr "Xóa bitmask của ô"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Make Polygon Concave"
 msgstr "Biến thành đa giác lõm"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Make Polygon Convex"
-msgstr "Tạo"
+msgstr "Làm Lồi Đa Giác"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Remove Tile"
-msgstr "Xóa Template"
+msgstr "Xóa Ô"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Remove Collision Polygon"
@@ -8579,9 +8513,8 @@ msgid "Remove Occlusion Polygon"
 msgstr ""
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Remove Navigation Polygon"
-msgstr "Xóa Animation"
+msgstr "Xóa Đa Giác Điều Hướng"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 msgid "Edit Tile Priority"
@@ -8592,19 +8525,16 @@ msgid "Edit Tile Z Index"
 msgstr "Sửa chiều sâu (Z) của ô"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Make Convex"
-msgstr "Tạo"
+msgstr "Làm Lồi"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Make Concave"
-msgstr "Tạo"
+msgstr "Làm Lõm"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "Create Collision Polygon"
-msgstr "Tạo"
+msgstr "Tạo đa giác va chạm"
 
 #: editor/plugins/tile_set_editor_plugin.cpp
 #, fuzzy
@@ -8616,9 +8546,8 @@ msgid "This property can't be changed."
 msgstr "Không thể thay đổi thuộc tính này."
 
 #: editor/plugins/tile_set_editor_plugin.cpp
-#, fuzzy
 msgid "TileSet"
-msgstr "Xuất Tile Set"
+msgstr "TileSet"
 
 #: editor/plugins/version_control_editor_plugin.cpp
 msgid "No VCS addons are available."
@@ -8658,23 +8587,20 @@ msgid "Detect new changes"
 msgstr "Phát hiện thay đổi mới"
 
 #: editor/plugins/version_control_editor_plugin.cpp
-#, fuzzy
 msgid "Changes"
-msgstr "Đổi"
+msgstr "Những thay đổi"
 
 #: editor/plugins/version_control_editor_plugin.cpp
 msgid "Modified"
-msgstr ""
+msgstr "Đã sửa đổi"
 
 #: editor/plugins/version_control_editor_plugin.cpp
-#, fuzzy
 msgid "Renamed"
-msgstr "Đổi tên"
+msgstr "Đã đổi tên"
 
 #: editor/plugins/version_control_editor_plugin.cpp
-#, fuzzy
 msgid "Deleted"
-msgstr "Xóa"
+msgstr "Đã Xóa"
 
 #: editor/plugins/version_control_editor_plugin.cpp
 #, fuzzy
@@ -8718,9 +8644,8 @@ msgid "(GLES3 only)"
 msgstr "(Chỉ dành cho GLES3)"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
-#, fuzzy
 msgid "Add Output"
-msgstr "Thêm Input"
+msgstr "Thêm Đầu Ra"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Scalar"
@@ -8739,23 +8664,20 @@ msgid "Sampler"
 msgstr ""
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
-#, fuzzy
 msgid "Add input port"
-msgstr "Thêm Input"
+msgstr "Thêm Cổng vào"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Add output port"
 msgstr "Thêm cổng đầu ra"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
-#, fuzzy
 msgid "Change input port type"
-msgstr "Đổi dạng mặc định"
+msgstr "Đổi kiểu cổng đầu vào"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
-#, fuzzy
 msgid "Change output port type"
-msgstr "Đổi dạng mặc định"
+msgstr "Đổi kiểu cổng ra"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Change input port name"
@@ -8766,14 +8688,12 @@ msgid "Change output port name"
 msgstr "Đổi tên cổng đầu ra"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
-#, fuzzy
 msgid "Remove input port"
-msgstr "Xoá Function"
+msgstr "Xoá Cổng vào"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
-#, fuzzy
 msgid "Remove output port"
-msgstr "Xóa Template"
+msgstr "Xóa Cổng ra"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 #, fuzzy
@@ -8797,9 +8717,8 @@ msgid "Add Node to Visual Shader"
 msgstr "Thêm nút vào Visual Shader"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
-#, fuzzy
 msgid "Node(s) Moved"
-msgstr "Đã di chuyển Nút"
+msgstr "Nút đã di chuyển"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Duplicate Nodes"
@@ -8833,7 +8752,7 @@ msgstr "Mảnh"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Light"
-msgstr ""
+msgstr "Ánh sáng"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Show resulted shader code."
@@ -9168,7 +9087,7 @@ msgstr "Trả về giá trị đối của tham số."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "1.0 - scalar"
-msgstr ""
+msgstr "1.0 - vô hướng"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid ""
@@ -9181,7 +9100,7 @@ msgstr "Đổi từ độ về radian."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "1.0 / scalar"
-msgstr ""
+msgstr "1.0 / vô hướng"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Finds the nearest integer to the parameter."
@@ -9205,7 +9124,7 @@ msgstr "Trả về sin của tham số."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Returns the hyperbolic sine of the parameter."
-msgstr ""
+msgstr "Trả về sin hyperbolic của tham số."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Returns the square root of the parameter."
@@ -9233,7 +9152,7 @@ msgstr "Trả về tan của tham số."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Returns the hyperbolic tangent of the parameter."
-msgstr ""
+msgstr "Trả về tan hyperbolic của tham số."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Finds the truncated value of the parameter."
@@ -9241,23 +9160,23 @@ msgstr ""
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Adds scalar to scalar."
-msgstr "Cộng hai số."
+msgstr "Cộng hai giá trị vô hướng."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Divides scalar by scalar."
-msgstr "Chia hai số."
+msgstr "Chia hai giá trị vô hướng."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Multiplies scalar by scalar."
-msgstr "Nhân hai số."
+msgstr "Nhân hai giá trị vô hướng."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Returns the remainder of the two scalars."
-msgstr ""
+msgstr "Trả về phần dư của hai giá trị vô hướng."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Subtracts scalar from scalar."
-msgstr "Trừ hai số."
+msgstr "Trừ hai giá trị vô hướng."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Scalar constant."
@@ -9303,39 +9222,44 @@ msgid ""
 "whose number of rows is the number of components in 'c' and whose number of "
 "columns is the number of components in 'r'."
 msgstr ""
+"Tính tích ngoài của một cặp vector.\n"
+"\n"
+"Tích ngoài đặt tham số 'c' đầu tiên làm vector dọc (ma trận 1 cột) và tham "
+"số 'r' thứ hai là vector ngang (ma trận 1 hàng) rồi thực hiện phép nhân ma "
+"trận tuyến tính 'c * r', tạo ra ma trận có số hàng bằng số phần tử trong 'c' "
+"và số cột bằng số phần tử trong 'r'."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Composes transform from four vectors."
-msgstr ""
+msgstr "Tạo phép biến đổi từ 4 vector."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Decomposes transform to four vectors."
-msgstr ""
+msgstr "Tách phép biến đổi thành 4 vector."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Calculates the determinant of a transform."
-msgstr ""
+msgstr "Tính định thức của phép biến đổi."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Calculates the inverse of a transform."
-msgstr ""
+msgstr "Tính nghịch đảo của phép biến đổi."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Calculates the transpose of a transform."
-msgstr ""
+msgstr "Tính chuyển vị của phép biến đổi."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Multiplies transform by transform."
-msgstr ""
+msgstr "Nhân hai phép biến đổi."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Multiplies vector by transform."
-msgstr ""
+msgstr "Nhân vector với phép biến đổi."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
-#, fuzzy
 msgid "Transform constant."
-msgstr "Tạo"
+msgstr "Hằng (số) Phép biến đổi."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 #, fuzzy
@@ -9352,23 +9276,23 @@ msgstr ""
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Composes vector from three scalars."
-msgstr ""
+msgstr "Tạo vector từ ba giá trị vô hướng."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Decomposes vector to three scalars."
-msgstr ""
+msgstr "Tách vector thành ba giá trị vô hướng."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Calculates the cross product of two vectors."
-msgstr ""
+msgstr "Tính tích chéo của hai vector."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Returns the distance between two points."
-msgstr ""
+msgstr "Trả về khoảng cách giữa hai điểm."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Calculates the dot product of two vectors."
-msgstr ""
+msgstr "Tính tích vô hướng của hai vector."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid ""
@@ -9380,7 +9304,7 @@ msgstr ""
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Calculates the length of a vector."
-msgstr ""
+msgstr "Tính chiều dài vector."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Linear interpolation between two vectors."
@@ -9392,25 +9316,27 @@ msgstr ""
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Calculates the normalize product of vector."
-msgstr ""
+msgstr "Tính tích chuẩn hóa của vector."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "1.0 - vector"
-msgstr ""
+msgstr "1.0 - vector"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "1.0 / vector"
-msgstr ""
+msgstr "1.0 / vector"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid ""
 "Returns the vector that points in the direction of reflection ( a : incident "
 "vector, b : normal vector )."
 msgstr ""
+"Trả về vector chỉ hướng phản xạ ( a : vector tia tới, b : vector pháp "
+"tuyến )."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Returns the vector that points in the direction of refraction."
-msgstr ""
+msgstr "Trả về vector chỉ hướng khúc xạ."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid ""
@@ -9446,27 +9372,27 @@ msgstr ""
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Adds vector to vector."
-msgstr ""
+msgstr "Cộng vector với vector."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Divides vector by vector."
-msgstr ""
+msgstr "Chia vector cho vector."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Multiplies vector by vector."
-msgstr ""
+msgstr "Nhân vector với vector."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Returns the remainder of the two vectors."
-msgstr ""
+msgstr "Trả về phần dư của hai vector."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Subtracts vector from vector."
-msgstr ""
+msgstr "Trừ vector cho vector."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Vector constant."
-msgstr ""
+msgstr "Hằng (số) vector."
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Vector uniform."
@@ -9581,15 +9507,15 @@ msgstr ""
 
 #: editor/project_export.cpp
 msgid "Release"
-msgstr ""
+msgstr "Phát hành"
 
 #: editor/project_export.cpp
 msgid "Exporting All"
-msgstr ""
+msgstr "Xuất tất cả"
 
 #: editor/project_export.cpp
 msgid "The given export path doesn't exist:"
-msgstr ""
+msgstr "Đường dẫn xuất không tồn tại:"
 
 #: editor/project_export.cpp
 msgid "Export templates for this platform are missing/corrupted:"
@@ -9601,7 +9527,7 @@ msgstr ""
 
 #: editor/project_export.cpp editor/project_settings_editor.cpp
 msgid "Add..."
-msgstr ""
+msgstr "Thêm..."
 
 #: editor/project_export.cpp
 msgid ""
@@ -9610,13 +9536,12 @@ msgid ""
 msgstr ""
 
 #: editor/project_export.cpp
-#, fuzzy
 msgid "Export Path"
-msgstr "Xuất Tile Set"
+msgstr "Đường dẫn xuất"
 
 #: editor/project_export.cpp
 msgid "Resources"
-msgstr ""
+msgstr "Tài nguyên"
 
 #: editor/project_export.cpp
 msgid "Export all resources in the project"
@@ -9624,25 +9549,27 @@ msgstr "Xuất ra tất cả tài nguyên dùng trong dự án"
 
 #: editor/project_export.cpp
 msgid "Export selected scenes (and dependencies)"
-msgstr ""
+msgstr "Xuất các cảnh đã chọn (cùng các phần phụ thuộc)"
 
 #: editor/project_export.cpp
 msgid "Export selected resources (and dependencies)"
-msgstr ""
+msgstr "Xuất tài nguyên đã chọn (cùng các phần phụ thuộc)"
 
 #: editor/project_export.cpp
 msgid "Export Mode:"
-msgstr ""
+msgstr "Chế độ xuất:"
 
 #: editor/project_export.cpp
 msgid "Resources to export:"
-msgstr ""
+msgstr "Tài nguyên để xuất:"
 
 #: editor/project_export.cpp
 msgid ""
 "Filters to export non-resource files/folders\n"
 "(comma-separated, e.g: *.json, *.txt, docs/*)"
 msgstr ""
+"Lọc để xuất các tệp/thư mục không phải tài nguyên\n"
+"(phẩy-phân-cách, ví dụ: *.json, *.txt, docs/*)"
 
 #: editor/project_export.cpp
 msgid ""
@@ -9654,48 +9581,47 @@ msgstr ""
 
 #: editor/project_export.cpp
 msgid "Features"
-msgstr ""
+msgstr "Tính năng"
 
 #: editor/project_export.cpp
 msgid "Custom (comma-separated):"
-msgstr ""
+msgstr "Tùy chỉnh (dấu-phẩy-phân-cách):"
 
 #: editor/project_export.cpp
 msgid "Feature List:"
-msgstr ""
+msgstr "Danh sách tính năng:"
 
 #: editor/project_export.cpp
-#, fuzzy
 msgid "Script"
-msgstr "Tạo Script"
+msgstr "Tệp lệnh"
 
 #: editor/project_export.cpp
 msgid "Script Export Mode:"
-msgstr "Chế độ xuất Script:"
+msgstr "Chế độ xuất tệp lệnh:"
 
 #: editor/project_export.cpp
 msgid "Text"
-msgstr ""
+msgstr "Văn bản"
 
 #: editor/project_export.cpp
 msgid "Compiled"
-msgstr ""
+msgstr "Đã biên dịch"
 
 #: editor/project_export.cpp
 msgid "Encrypted (Provide Key Below)"
-msgstr ""
+msgstr "Đã mã hóa (cung cấp mã mở bên dưới)"
 
 #: editor/project_export.cpp
 msgid "Invalid Encryption Key (must be 64 characters long)"
-msgstr ""
+msgstr "Mã mở không hợp lệ (phải dài 64 kí tự)"
 
 #: editor/project_export.cpp
 msgid "Script Encryption Key (256-bits as hex):"
-msgstr ""
+msgstr "Mã khóa tệp lệnh (256-bit theo hex):"
 
 #: editor/project_export.cpp
 msgid "Export PCK/Zip"
-msgstr ""
+msgstr "Xuất PCK/Zip"
 
 #: editor/project_export.cpp
 msgid "Export Project"
@@ -9706,9 +9632,8 @@ msgid "Export mode?"
 msgstr "Chế độ xuất?"
 
 #: editor/project_export.cpp
-#, fuzzy
 msgid "Export All"
-msgstr "Xuất Tile Set"
+msgstr "Xuất tất cả"
 
 #: editor/project_export.cpp editor/project_manager.cpp
 msgid "ZIP File"
@@ -9716,7 +9641,7 @@ msgstr "Tệp ZIP"
 
 #: editor/project_export.cpp
 msgid "Godot Game Pack"
-msgstr ""
+msgstr "Gói trò chơi Godot"
 
 #: editor/project_export.cpp
 msgid "Export templates for this platform are missing:"
@@ -9728,17 +9653,15 @@ msgstr "Quản Lý Các Mẫu Xuất Bản"
 
 #: editor/project_export.cpp
 msgid "Export With Debug"
-msgstr ""
+msgstr "Xuất cùng gỡ lỗi"
 
 #: editor/project_manager.cpp
-#, fuzzy
 msgid "The path specified doesn't exist."
-msgstr "Tệp không tồn tại."
+msgstr "Đường dẫn đã cho không tồn tại."
 
 #: editor/project_manager.cpp
-#, fuzzy
 msgid "Error opening package file (it's not in ZIP format)."
-msgstr "Lỗi không thể mở gói, không phải dạng nén."
+msgstr "Lỗi mở gói (không phải dạng ZIP)."
 
 #: editor/project_manager.cpp
 msgid ""
@@ -9748,7 +9671,7 @@ msgstr ""
 
 #: editor/project_manager.cpp
 msgid "Please choose an empty folder."
-msgstr ""
+msgstr "Hãy chọn một thư mục trống."
 
 #: editor/project_manager.cpp
 msgid "Please choose a \"project.godot\" or \".zip\" file."
@@ -9772,11 +9695,11 @@ msgstr "Tên dự án không hợp lệ."
 
 #: editor/project_manager.cpp
 msgid "Couldn't create folder."
-msgstr ""
+msgstr "Không thể tạo thư mục."
 
 #: editor/project_manager.cpp
 msgid "There is already a folder in this path with the specified name."
-msgstr ""
+msgstr "Đã tồn tại một thư mục cùng tên trên đường dẫn này."
 
 #: editor/project_manager.cpp
 msgid "It would be a good idea to name your project."
@@ -9812,7 +9735,7 @@ msgstr "Nạp Dự án có sẵn"
 
 #: editor/project_manager.cpp
 msgid "Import & Edit"
-msgstr ""
+msgstr "Nhập & Chỉnh sửa"
 
 #: editor/project_manager.cpp
 msgid "Create New Project"
@@ -9828,7 +9751,7 @@ msgstr "Cài đặt Dự án:"
 
 #: editor/project_manager.cpp
 msgid "Install & Edit"
-msgstr ""
+msgstr "Cài đặt & Chỉnh sửa"
 
 #: editor/project_manager.cpp
 msgid "Project Name:"
@@ -9844,15 +9767,15 @@ msgstr "Đường dẫn cài đặt Dự án:"
 
 #: editor/project_manager.cpp
 msgid "Renderer:"
-msgstr ""
+msgstr "Trình kết xuất hình ảnh:"
 
 #: editor/project_manager.cpp
 msgid "OpenGL ES 3.0"
-msgstr ""
+msgstr "OpenGL ES 3.0"
 
 #: editor/project_manager.cpp
 msgid "Not supported by your GPU drivers."
-msgstr ""
+msgstr "GPU driver của bạn không hỗ trợ."
 
 #: editor/project_manager.cpp
 msgid ""
@@ -9861,10 +9784,14 @@ msgid ""
 "Incompatible with older hardware\n"
 "Not recommended for web games"
 msgstr ""
+"Chất lượng hình ảnh cao hơn\n"
+"Đầy đủ mọi tính năng\n"
+"Không tương thích với các dòng máy cũ\n"
+"Không khuyên dùng cho trò chơi trên web"
 
 #: editor/project_manager.cpp
 msgid "OpenGL ES 2.0"
-msgstr ""
+msgstr "OpenGL ES 2.0"
 
 #: editor/project_manager.cpp
 msgid ""
@@ -9873,10 +9800,15 @@ msgid ""
 "Works on most hardware\n"
 "Recommended for web games"
 msgstr ""
+"Giảm chất lượng hình ảnh\n"
+"Không hỗ trợ một số tính năng\n"
+"Chạy trên đa số dòng máy\n"
+"Khuyên dùng cho trò chơi trên web"
 
 #: editor/project_manager.cpp
 msgid "Renderer can be changed later, but scenes may need to be adjusted."
 msgstr ""
+"Trình kết xuất có thể đổi sau, nhưng có thể sẽ phải chỉnh lại các Cảnh."
 
 #: editor/project_manager.cpp
 msgid "Unnamed Project"
@@ -9942,8 +9874,8 @@ msgid ""
 "The project settings were created by a newer engine version, whose settings "
 "are not compatible with this version."
 msgstr ""
-"Các cài đặt dự án đã được tạo bởi phiên bản Godot mới, có các cài đặt không "
-"tương thích với phiên bản này."
+"Các cài đặt dự án đã được tạo bởi phiên bản Godot mới và không tương thích "
+"với phiên bản này."
 
 #: editor/project_manager.cpp
 msgid ""
@@ -10022,7 +9954,7 @@ msgstr "Đang tải, đợi xíu..."
 
 #: editor/project_manager.cpp
 msgid "Last Modified"
-msgstr ""
+msgstr "Sửa đổi lần cuối"
 
 #: editor/project_manager.cpp
 msgid "Scan"
@@ -10085,13 +10017,13 @@ msgstr ""
 
 #: editor/project_settings_editor.cpp
 msgid "Mouse Button"
-msgstr ""
+msgstr "Nút chuột"
 
 #: editor/project_settings_editor.cpp
 msgid ""
 "Invalid action name. it cannot be empty nor contain '/', ':', '=', '\\' or "
 "'\"'"
-msgstr ""
+msgstr "Tên hành động không được trống hoặc chứa '/', ':', '=', '\\' hoặc '\"'"
 
 #: editor/project_settings_editor.cpp
 msgid "An action with the name '%s' already exists."
@@ -10112,15 +10044,15 @@ msgstr ""
 
 #: editor/project_settings_editor.cpp
 msgid "All Devices"
-msgstr ""
+msgstr "Tất cả thiết bị"
 
 #: editor/project_settings_editor.cpp
 msgid "Device"
-msgstr ""
+msgstr "Thiết bị"
 
 #: editor/project_settings_editor.cpp editor/settings_config_dialog.cpp
 msgid "Press a Key..."
-msgstr ""
+msgstr "Nhấn một phím..."
 
 #: editor/project_settings_editor.cpp
 msgid "Mouse Button Index:"
@@ -10128,31 +10060,31 @@ msgstr ""
 
 #: editor/project_settings_editor.cpp
 msgid "Left Button"
-msgstr ""
+msgstr "Phím Trái"
 
 #: editor/project_settings_editor.cpp
 msgid "Right Button"
-msgstr ""
+msgstr "Phím Phải"
 
 #: editor/project_settings_editor.cpp
 msgid "Middle Button"
-msgstr ""
+msgstr "Phím Giữa"
 
 #: editor/project_settings_editor.cpp
 msgid "Wheel Up Button"
-msgstr ""
+msgstr "Phím Lăn Lên"
 
 #: editor/project_settings_editor.cpp
 msgid "Wheel Down Button"
-msgstr ""
+msgstr "Phím Lăn Xuống"
 
 #: editor/project_settings_editor.cpp
 msgid "Wheel Left Button"
-msgstr ""
+msgstr "Phím Lăn Trái"
 
 #: editor/project_settings_editor.cpp
 msgid "Wheel Right Button"
-msgstr ""
+msgstr "Phím Lăn Phải"
 
 #: editor/project_settings_editor.cpp
 msgid "X Button 1"
@@ -10168,7 +10100,7 @@ msgstr ""
 
 #: editor/project_settings_editor.cpp
 msgid "Axis"
-msgstr ""
+msgstr "Trục"
 
 #: editor/project_settings_editor.cpp
 msgid "Joypad Button Index:"
@@ -10184,7 +10116,7 @@ msgstr ""
 
 #: editor/project_settings_editor.cpp
 msgid "Add Event"
-msgstr ""
+msgstr "Thêm Sự kiện"
 
 #: editor/project_settings_editor.cpp
 msgid "Button"
@@ -10192,49 +10124,49 @@ msgstr "Button (nút, phím)"
 
 #: editor/project_settings_editor.cpp
 msgid "Left Button."
-msgstr ""
+msgstr "Phím Trái."
 
 #: editor/project_settings_editor.cpp
 msgid "Right Button."
-msgstr ""
+msgstr "Phím Phải."
 
 #: editor/project_settings_editor.cpp
 msgid "Middle Button."
-msgstr ""
+msgstr "Phím Giữa."
 
 #: editor/project_settings_editor.cpp
 msgid "Wheel Up."
-msgstr ""
+msgstr "Lăn Lên."
 
 #: editor/project_settings_editor.cpp
 msgid "Wheel Down."
-msgstr ""
+msgstr "Lăn Xuống."
 
 #: editor/project_settings_editor.cpp
 msgid "Add Global Property"
-msgstr ""
+msgstr "Thêm Thuộc tính Toàn cục"
 
 #: editor/project_settings_editor.cpp
 msgid "Select a setting item first!"
-msgstr ""
+msgstr "Chọn một mục cài đặt đã!"
 
 #: editor/project_settings_editor.cpp
 msgid "No property '%s' exists."
-msgstr ""
+msgstr "Thuộc tính '%s' không tồn tại."
 
 #: editor/project_settings_editor.cpp
 msgid "Setting '%s' is internal, and it can't be deleted."
-msgstr ""
+msgstr "Cài đặt '%s' thuộc nội tại, không thể xóa."
 
 #: editor/project_settings_editor.cpp
 msgid "Delete Item"
-msgstr ""
+msgstr "Xóa Mục"
 
 #: editor/project_settings_editor.cpp
 msgid ""
 "Invalid action name. It cannot be empty nor contain '/', ':', '=', '\\' or "
 "'\"'."
-msgstr ""
+msgstr "Tên hành động không thể trống hoặc chứa '/', ':', '=', '\\' hoặc '\"'."
 
 #: editor/project_settings_editor.cpp
 msgid "Add Input Action"
@@ -10242,11 +10174,11 @@ msgstr ""
 
 #: editor/project_settings_editor.cpp
 msgid "Error saving settings."
-msgstr ""
+msgstr "Lỗi khi lưu cài đặt."
 
 #: editor/project_settings_editor.cpp
 msgid "Settings saved OK."
-msgstr ""
+msgstr "Lưu cài đặt thành công."
 
 #: editor/project_settings_editor.cpp
 msgid "Moved Input Action Event"
@@ -10258,11 +10190,11 @@ msgstr ""
 
 #: editor/project_settings_editor.cpp
 msgid "Add Translation"
-msgstr ""
+msgstr "Thêm Bản dịch"
 
 #: editor/project_settings_editor.cpp
 msgid "Remove Translation"
-msgstr ""
+msgstr "Xóa bản dịch"
 
 #: editor/project_settings_editor.cpp
 msgid "Add Remapped Path"
@@ -10306,7 +10238,7 @@ msgstr ""
 
 #: editor/project_settings_editor.cpp editor/settings_config_dialog.cpp
 msgid "The editor must be restarted for changes to take effect."
-msgstr ""
+msgstr "Thay đổi sẽ được áp dụng sau khi Trình biên tập khởi động lại."
 
 #: editor/project_settings_editor.cpp
 msgid "Input Map"
@@ -10314,7 +10246,7 @@ msgstr ""
 
 #: editor/project_settings_editor.cpp
 msgid "Action:"
-msgstr ""
+msgstr "Hành động:"
 
 #: editor/project_settings_editor.cpp
 #, fuzzy
@@ -10327,7 +10259,7 @@ msgstr ""
 
 #: editor/project_settings_editor.cpp
 msgid "Device:"
-msgstr ""
+msgstr "Thiết bị:"
 
 #: editor/project_settings_editor.cpp
 msgid "Index:"
@@ -10351,7 +10283,7 @@ msgstr ""
 
 #: editor/project_settings_editor.cpp
 msgid "Resources:"
-msgstr ""
+msgstr "Tài nguyên:"
 
 #: editor/project_settings_editor.cpp
 msgid "Remaps by Locale:"
@@ -10376,7 +10308,7 @@ msgstr "Chỉ lựa chọn"
 
 #: editor/project_settings_editor.cpp
 msgid "Filter mode:"
-msgstr ""
+msgstr "Chế độ lọc:"
 
 #: editor/project_settings_editor.cpp
 msgid "Locales:"
@@ -10388,7 +10320,7 @@ msgstr ""
 
 #: editor/project_settings_editor.cpp
 msgid "Plugins"
-msgstr ""
+msgstr "Tiện ích"
 
 #: editor/project_settings_editor.cpp
 #, fuzzy
@@ -10505,6 +10437,8 @@ msgid ""
 "Sequential integer counter.\n"
 "Compare counter options."
 msgstr ""
+"Bộ đếm số nguyên tuần tự.\n"
+"So sánh giữa các cách đếm."
 
 #: editor/rename_dialog.cpp
 msgid "Per-level Counter"
@@ -10548,15 +10482,15 @@ msgstr "Giữ"
 
 #: editor/rename_dialog.cpp
 msgid "PascalCase to snake_case"
-msgstr ""
+msgstr "KiểuPascal thành kiểu_rắn"
 
 #: editor/rename_dialog.cpp
 msgid "snake_case to PascalCase"
-msgstr ""
+msgstr "kiểu_rắn thành KiểuPascal"
 
 #: editor/rename_dialog.cpp
 msgid "Case"
-msgstr ""
+msgstr "Kiểu"
 
 #: editor/rename_dialog.cpp
 msgid "To Lowercase"
@@ -10598,7 +10532,7 @@ msgstr ""
 
 #: editor/run_settings_dialog.cpp
 msgid "Run Mode:"
-msgstr ""
+msgstr "Chế độ chạy:"
 
 #: editor/run_settings_dialog.cpp
 msgid "Current Scene"
@@ -10835,6 +10769,9 @@ msgid ""
 "This is probably because this editor was built with all language modules "
 "disabled."
 msgstr ""
+"Không thể đính kèm tệp lệnh: Không ghi nhận thấy ngôn ngữ nào.\n"
+"Vấn đề có thể là do các module ngôn ngữ bị vô hiệu hóa khi trình biên tập "
+"này được xây dựng."
 
 #: editor/scene_tree_dock.cpp
 msgid "Add Child Node"
@@ -10899,6 +10836,13 @@ msgid "Remote"
 msgstr ""
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "Cục bộ"
 
@@ -11044,7 +10988,7 @@ msgstr "Lỗi nạp mẫu '%s'"
 
 #: editor/script_create_dialog.cpp
 msgid "Error - Could not create script in filesystem."
-msgstr ""
+msgstr "Lỗi - Không thể tạo tệp lệnh trong hệ thống tệp tin."
 
 #: editor/script_create_dialog.cpp
 msgid "Error loading script from %s"
@@ -11240,7 +11184,7 @@ msgstr "Chọn một hoặc nhiều mục từ danh sách để hiển thị bi�
 
 #: editor/script_editor_debugger.cpp
 msgid "List of Video Memory Usage by Resource:"
-msgstr ""
+msgstr "Danh sách sử dụng bộ nhớ video theo tài nguyên:"
 
 #: editor/script_editor_debugger.cpp
 #, fuzzy
@@ -11266,7 +11210,7 @@ msgstr "Định dạng"
 
 #: editor/script_editor_debugger.cpp
 msgid "Usage"
-msgstr ""
+msgstr "Sử dụng"
 
 #: editor/script_editor_debugger.cpp
 #, fuzzy
@@ -11346,107 +11290,105 @@ msgid "Change Probe Extents"
 msgstr ""
 
 #: editor/spatial_editor_gizmos.cpp modules/csg/csg_gizmos.cpp
-#, fuzzy
 msgid "Change Sphere Shape Radius"
-msgstr "Thay đổi bán kính hình cầu"
+msgstr "Thay Đổi Bán Kính Hình Cầu"
 
 #: editor/spatial_editor_gizmos.cpp modules/csg/csg_gizmos.cpp
 msgid "Change Box Shape Extents"
-msgstr ""
+msgstr "Chỉnh chiều dài hình hộp"
 
 #: editor/spatial_editor_gizmos.cpp
 msgid "Change Capsule Shape Radius"
-msgstr ""
+msgstr "Chỉnh bán kính hình nhộng"
 
 #: editor/spatial_editor_gizmos.cpp
 msgid "Change Capsule Shape Height"
-msgstr ""
+msgstr "Chỉnh chiều cao hình nhộng"
 
 #: editor/spatial_editor_gizmos.cpp
 msgid "Change Cylinder Shape Radius"
-msgstr ""
+msgstr "Chỉnh bán kính hình trụ"
 
 #: editor/spatial_editor_gizmos.cpp
 msgid "Change Cylinder Shape Height"
-msgstr ""
+msgstr "Chỉnh chiều cao hình trụ"
 
 #: editor/spatial_editor_gizmos.cpp
 msgid "Change Ray Shape Length"
 msgstr ""
 
 #: modules/csg/csg_gizmos.cpp
-#, fuzzy
 msgid "Change Cylinder Radius"
-msgstr "Đổi Thời gian Chuyển Animation"
+msgstr "Thay Đổi Bán Kính Hình Trụ"
 
 #: modules/csg/csg_gizmos.cpp
-#, fuzzy
 msgid "Change Cylinder Height"
-msgstr "Đổi Thời gian Chuyển Animation"
+msgstr "Thay Đổi Chiều Cao Hình Trụ"
 
 #: modules/csg/csg_gizmos.cpp
 msgid "Change Torus Inner Radius"
-msgstr ""
+msgstr "Thay Đổi Bán Kính Trong Của Hình Xuyến"
 
 #: modules/csg/csg_gizmos.cpp
 msgid "Change Torus Outer Radius"
-msgstr ""
+msgstr "Thay Đổi Bán Kính Ngoài Của Hình Xuyến"
 
 #: modules/gdnative/gdnative_library_editor_plugin.cpp
 msgid "Select the dynamic library for this entry"
-msgstr ""
+msgstr "Chọn thư viện động cho mục này"
 
 #: modules/gdnative/gdnative_library_editor_plugin.cpp
 msgid "Select dependencies of the library for this entry"
-msgstr ""
+msgstr "Chọn phần phụ thuộc của thư viện cho mục này"
 
 #: modules/gdnative/gdnative_library_editor_plugin.cpp
 msgid "Remove current entry"
-msgstr ""
+msgstr "Xóa mục hiện tại"
 
 #: modules/gdnative/gdnative_library_editor_plugin.cpp
 msgid "Double click to create a new entry"
-msgstr ""
+msgstr "Nhấp đúp để tạo mục mới"
 
 #: modules/gdnative/gdnative_library_editor_plugin.cpp
 msgid "Platform:"
-msgstr ""
+msgstr "Nền tảng:"
 
 #: modules/gdnative/gdnative_library_editor_plugin.cpp
 msgid "Platform"
-msgstr ""
+msgstr "Nền tảng"
 
 #: modules/gdnative/gdnative_library_editor_plugin.cpp
 msgid "Dynamic Library"
-msgstr ""
+msgstr "Thư viện động"
 
 #: modules/gdnative/gdnative_library_editor_plugin.cpp
 msgid "Add an architecture entry"
-msgstr ""
+msgstr "Thêm mục kiến trúc máy"
 
 #: modules/gdnative/gdnative_library_editor_plugin.cpp
 msgid "GDNativeLibrary"
-msgstr ""
+msgstr "GDNativeLibrary"
 
 #: modules/gdnative/gdnative_library_singleton_editor.cpp
+#, fuzzy
 msgid "Enabled GDNative Singleton"
-msgstr ""
+msgstr "Bật đơn nhất GDNative"
 
 #: modules/gdnative/gdnative_library_singleton_editor.cpp
 msgid "Disabled GDNative Singleton"
-msgstr ""
+msgstr "Tắt đơn nhất GDNative"
 
 #: modules/gdnative/gdnative_library_singleton_editor.cpp
 msgid "Library"
-msgstr ""
+msgstr "Thư viện"
 
 #: modules/gdnative/gdnative_library_singleton_editor.cpp
 msgid "Libraries: "
-msgstr ""
+msgstr "Thư viện: "
 
 #: modules/gdnative/register_types.cpp
 msgid "GDNative"
-msgstr ""
+msgstr "GDNative"
 
 #: modules/gdscript/gdscript_functions.cpp
 msgid "Step argument is zero!"
@@ -11458,23 +11400,24 @@ msgstr ""
 
 #: modules/gdscript/gdscript_functions.cpp
 msgid "Not based on a script"
-msgstr ""
+msgstr "Không dựa trên tệp lệnh"
 
 #: modules/gdscript/gdscript_functions.cpp
 msgid "Not based on a resource file"
-msgstr ""
+msgstr "Không dựa trên tệp tài nguyên"
 
 #: modules/gdscript/gdscript_functions.cpp
+#, fuzzy
 msgid "Invalid instance dictionary format (missing @path)"
-msgstr ""
+msgstr "Định dạng từ điển không hợp lệ (thiếu @path)"
 
 #: modules/gdscript/gdscript_functions.cpp
 msgid "Invalid instance dictionary format (can't load script at @path)"
-msgstr ""
+msgstr "Định dạng từ điển không hợp lệ (Không tải được tệp lệnh ở @path)"
 
 #: modules/gdscript/gdscript_functions.cpp
 msgid "Invalid instance dictionary format (invalid script at @path)"
-msgstr ""
+msgstr "Định dạng từ điển không hợp lệ (tệp lệnh không hợp lệ ở @path)"
 
 #: modules/gdscript/gdscript_functions.cpp
 msgid "Invalid instance dictionary (invalid subclasses)"
@@ -11482,20 +11425,19 @@ msgstr "Từ điển không hợp lệ (Lớp con không hợp lệ)"
 
 #: modules/gdscript/gdscript_functions.cpp
 msgid "Object can't provide a length."
-msgstr ""
+msgstr "Đối tượng không thể cung cấp chiều dài."
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "Next Plane"
-msgstr ""
+msgstr "Mặt phẳng tiếp theo"
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
-#, fuzzy
 msgid "Previous Plane"
-msgstr "Thư mục trước"
+msgstr "Mặt phẳng trước đó"
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "Plane:"
-msgstr ""
+msgstr "Mặt phẳng:"
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "Next Floor"
@@ -11529,7 +11471,7 @@ msgstr ""
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "Grid Map"
-msgstr ""
+msgstr "Bản đồ Lưới"
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "Snap View"
@@ -11549,15 +11491,15 @@ msgstr ""
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "Edit X Axis"
-msgstr ""
+msgstr "Chỉnh trục X"
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "Edit Y Axis"
-msgstr ""
+msgstr "Chỉnh trục Y"
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "Edit Z Axis"
-msgstr ""
+msgstr "Chỉnh trục Z"
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "Cursor Rotate X"
@@ -11594,7 +11536,7 @@ msgstr "Chọn tất cả"
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "Clear Selection"
-msgstr ""
+msgstr "Xóa Lựa chọn"
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 #, fuzzy
@@ -11603,7 +11545,7 @@ msgstr "Chọn tất cả"
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "GridMap Settings"
-msgstr ""
+msgstr "Cài đặt Bản đồ Lưới"
 
 #: modules/gridmap/grid_map_editor_plugin.cpp
 msgid "Pick Distance:"
@@ -11624,11 +11566,11 @@ msgstr ""
 
 #: modules/lightmapper_cpu/lightmapper_cpu.cpp
 msgid "Preparing data structures"
-msgstr ""
+msgstr "Chuẩn bị cấu trúc dữ liệu"
 
 #: modules/lightmapper_cpu/lightmapper_cpu.cpp
 msgid "Generate buffers"
-msgstr ""
+msgstr "Tạo bộ đệm"
 
 #: modules/lightmapper_cpu/lightmapper_cpu.cpp
 #, fuzzy
@@ -11637,7 +11579,7 @@ msgstr "Hướng đi"
 
 #: modules/lightmapper_cpu/lightmapper_cpu.cpp
 msgid "Indirect lighting"
-msgstr ""
+msgstr "Chiếu sáng gián tiếp"
 
 #: modules/lightmapper_cpu/lightmapper_cpu.cpp
 msgid "Post processing"
@@ -11677,7 +11619,7 @@ msgstr ""
 
 #: modules/recast/navigation_mesh_generator.cpp
 msgid "Marking walkable triangles..."
-msgstr ""
+msgstr "Đánh dấu tam giác có thể đi được..."
 
 #: modules/recast/navigation_mesh_generator.cpp
 msgid "Constructing compact heightfield..."
@@ -11685,7 +11627,7 @@ msgstr ""
 
 #: modules/recast/navigation_mesh_generator.cpp
 msgid "Eroding walkable area..."
-msgstr ""
+msgstr "Làm xói mòn vùng đi được..."
 
 #: modules/recast/navigation_mesh_generator.cpp
 msgid "Partitioning..."
@@ -11781,7 +11723,7 @@ msgstr "Thêm cổng ra"
 
 #: modules/visual_script/visual_script_editor.cpp
 msgid "Override an existing built-in function."
-msgstr ""
+msgstr "Ghi đè một hàm cài sẵn."
 
 #: modules/visual_script/visual_script_editor.cpp
 msgid "Create a new function."
@@ -11805,7 +11747,7 @@ msgstr "Tạo tín hiệu mới."
 
 #: modules/visual_script/visual_script_editor.cpp
 msgid "Name is not a valid identifier:"
-msgstr ""
+msgstr "Tên không phải định danh hợp lệ:"
 
 #: modules/visual_script/visual_script_editor.cpp
 msgid "Name already in use by another func/var/signal:"
@@ -11861,7 +11803,7 @@ msgstr "Nhân bản các nút VisualScript"
 
 #: modules/visual_script/visual_script_editor.cpp
 msgid "Hold %s to drop a Getter. Hold Shift to drop a generic signature."
-msgstr ""
+msgstr "Giữ %s để thả Getter. Giữ phím Shift để thả chữ ký chung."
 
 #: modules/visual_script/visual_script_editor.cpp
 msgid "Hold Ctrl to drop a Getter. Hold Shift to drop a generic signature."
@@ -11877,11 +11819,11 @@ msgstr "Giữ Ctrl để thả một tài liệu tham khảo đơn giản đến
 
 #: modules/visual_script/visual_script_editor.cpp
 msgid "Hold %s to drop a Variable Setter."
-msgstr ""
+msgstr "Giữ %s để thả Hàm Đặt Biến."
 
 #: modules/visual_script/visual_script_editor.cpp
 msgid "Hold Ctrl to drop a Variable Setter."
-msgstr ""
+msgstr "Giữ Ctrl để thả Hàm Đặt Biến."
 
 #: modules/visual_script/visual_script_editor.cpp
 msgid "Add Preload Node"
@@ -12129,7 +12071,7 @@ msgstr "Thiếu tên gói."
 
 #: platform/android/export/export.cpp
 msgid "Package segments must be of non-zero length."
-msgstr ""
+msgstr "Các phân đoạn của gói phải có độ dài khác không."
 
 #: platform/android/export/export.cpp
 msgid "The character '%s' is not allowed in Android application package names."
@@ -12141,7 +12083,7 @@ msgstr "Không thể có chữ số làm kí tự đầu tiên trong một phầ
 
 #: platform/android/export/export.cpp
 msgid "The character '%s' cannot be the first character in a package segment."
-msgstr ""
+msgstr "Kí tự '%s' không thể ở đầu trong một phân đoạn của gói."
 
 #: platform/android/export/export.cpp
 msgid "The package must have at least one '.' separator."
@@ -12173,11 +12115,11 @@ msgstr ""
 
 #: platform/android/export/export.cpp
 msgid "A valid Android SDK path is required in Editor Settings."
-msgstr ""
+msgstr "Cài đặt Trình biên tập yêu cầu một đường dẫn Android SDK hợp lệ."
 
 #: platform/android/export/export.cpp
 msgid "Invalid Android SDK path in Editor Settings."
-msgstr ""
+msgstr "Đường dẫn Android SDK không hợp lệ trong Cài đặt Trình biên tập."
 
 #: platform/android/export/export.cpp
 msgid "Missing 'platform-tools' directory!"
@@ -12190,6 +12132,7 @@ msgstr "Không tìm thấy lệnh adb trong bộ Android SDK platform-tools."
 #: platform/android/export/export.cpp
 msgid "Please check in the Android SDK directory specified in Editor Settings."
 msgstr ""
+"Hãy kiểm tra thư mục Android SDK được cung cấp ở Cài đặt Trình biên tập."
 
 #: platform/android/export/export.cpp
 msgid "Missing 'build-tools' directory!"
@@ -12217,7 +12160,7 @@ msgstr ""
 
 #: platform/android/export/export.cpp
 msgid "\"Use Custom Build\" must be enabled to use the plugins."
-msgstr ""
+msgstr "\"Sử dụng Bản dựng tùy chỉnh\" phải được bật để sử dụng các tiện ích."
 
 #: platform/android/export/export.cpp
 msgid ""
@@ -12247,8 +12190,9 @@ msgid "Invalid filename! Android App Bundle requires the *.aab extension."
 msgstr "Tên tệp không hợp lệ! Android App Bundle cần đuôi *.aab ở cuối."
 
 #: platform/android/export/export.cpp
+#, fuzzy
 msgid "APK Expansion not compatible with Android App Bundle."
-msgstr ""
+msgstr "Đuôi APK không tương thích với Android App Bundle."
 
 #: platform/android/export/export.cpp
 msgid "Invalid filename! Android APK requires the *.apk extension."
@@ -12295,23 +12239,24 @@ msgid ""
 "Unable to copy and rename export file, check gradle project directory for "
 "outputs."
 msgstr ""
+"Không thể sao chép và đổi tên tệp xuất, hãy kiểm tra thư mục Gradle của dự "
+"án để xem kết quả."
 
 #: platform/iphone/export/export.cpp
 msgid "Identifier is missing."
-msgstr ""
+msgstr "Thiếu định danh."
 
 #: platform/iphone/export/export.cpp
 msgid "The character '%s' is not allowed in Identifier."
-msgstr ""
+msgstr "Không được phép có kí tự '%s' trong Định danh."
 
 #: platform/iphone/export/export.cpp
 msgid "App Store Team ID not specified - cannot configure the project."
 msgstr "App Store Team ID không được chỉ định - không thể cấu hình dự án."
 
 #: platform/iphone/export/export.cpp
-#, fuzzy
 msgid "Invalid Identifier:"
-msgstr "Kích thước font không hợp lệ."
+msgstr "Định danh không hợp lệ:"
 
 #: platform/iphone/export/export.cpp
 msgid "Required icon is not specified in the preset."
@@ -12327,7 +12272,7 @@ msgstr "Chạy trong Trình duyệt web"
 
 #: platform/javascript/export/export.cpp
 msgid "Run exported HTML in the system's default browser."
-msgstr ""
+msgstr "Chạy HTML được xuất với trình duyệt mặc định của máy."
 
 #: platform/javascript/export/export.cpp
 msgid "Could not write file:"
@@ -12354,29 +12299,24 @@ msgid "Using default boot splash image."
 msgstr "Sử dụng hình khởi động mặc định."
 
 #: platform/uwp/export/export.cpp
-#, fuzzy
 msgid "Invalid package short name."
-msgstr "Kích thước font không hợp lệ."
+msgstr "Gói có tên ngắn không hợp lệ."
 
 #: platform/uwp/export/export.cpp
-#, fuzzy
 msgid "Invalid package unique name."
-msgstr "Kích thước font không hợp lệ."
+msgstr "Gói có tên độc nhất không hợp lệ."
 
 #: platform/uwp/export/export.cpp
-#, fuzzy
 msgid "Invalid package publisher display name."
-msgstr "Kích thước font không hợp lệ."
+msgstr "Gói có tên hiển thị của nhà phát hành không hợp lệ."
 
 #: platform/uwp/export/export.cpp
-#, fuzzy
 msgid "Invalid product GUID."
-msgstr "Kích thước font không hợp lệ."
+msgstr "GUID sản phẩm không hợp lệ."
 
 #: platform/uwp/export/export.cpp
-#, fuzzy
 msgid "Invalid publisher GUID."
-msgstr "Kích thước font không hợp lệ."
+msgstr "GUID của nhà phát hành không hợp lệ."
 
 #: platform/uwp/export/export.cpp
 msgid "Invalid background color."
@@ -12532,12 +12472,16 @@ msgid ""
 "A NavigationPolygon resource must be set or created for this node to work. "
 "Please set a property or draw a polygon."
 msgstr ""
+"Một tài nguyên Navigation Polygon phải được đặt hoặc tạo thì nút này mới "
+"chạy được. Vui lòng đặt thuộc tính hoặc vẽ một đa giác."
 
 #: scene/2d/navigation_polygon.cpp
 msgid ""
 "NavigationPolygonInstance must be a child or grandchild to a Navigation2D "
 "node. It only provides navigation data."
 msgstr ""
+"NavigationPolygonInstance phải là nút con hoặc cháu của nút Navigation2D. Nó "
+"chỉ cung cấp dữ liệu điều hướng."
 
 #: scene/2d/parallax_layer.cpp
 msgid ""
@@ -12551,6 +12495,9 @@ msgid ""
 "Use the CPUParticles2D node instead. You can use the \"Convert to "
 "CPUParticles\" option for this purpose."
 msgstr ""
+"Video driver GLES2 không hỗ trợ hạt dựa trên bộ xử lí GPU.\n"
+"Thay vào đó hãy dùng nút CPUParticles2D. Bạn có thể dùng tùy chọn \"Chuyển "
+"thành CPUParticles\" cho mục đích này."
 
 #: scene/2d/particles_2d.cpp scene/3d/particles.cpp
 msgid ""
@@ -12575,6 +12522,9 @@ msgid ""
 "by the physics engine when running.\n"
 "Change the size in children collision shapes instead."
 msgstr ""
+"Thay đổi về kích cỡ của RigidBody2D (trong chế độ Nhân vật hoặc Rắn) sẽ bị "
+"ghi đè khi tính toán vật lí.\n"
+"Hãy sửa kích cỡ khối va chạm của nút con ý."
 
 #: scene/2d/remote_transform_2d.cpp
 msgid "Path property must point to a valid Node2D node to work."
@@ -12584,7 +12534,7 @@ msgstr ""
 
 #: scene/2d/skeleton_2d.cpp
 msgid "This Bone2D chain should end at a Skeleton2D node."
-msgstr ""
+msgstr "Chuỗi Bone2D này phải kết thúc với một nút Skeleton2D."
 
 #: scene/2d/skeleton_2d.cpp
 msgid "A Bone2D only works with a Skeleton2D or another Bone2D as parent node."
@@ -12611,11 +12561,11 @@ msgstr ""
 
 #: scene/3d/arvr_nodes.cpp
 msgid "ARVRCamera must have an ARVROrigin node as its parent."
-msgstr ""
+msgstr "ARVRCamera phải là con nút ARVROrigin."
 
 #: scene/3d/arvr_nodes.cpp
 msgid "ARVRController must have an ARVROrigin node as its parent."
-msgstr ""
+msgstr "ARVRController phải là con nút ARVROrigin."
 
 #: scene/3d/arvr_nodes.cpp
 msgid ""
@@ -12637,19 +12587,20 @@ msgstr ""
 
 #: scene/3d/arvr_nodes.cpp
 msgid "ARVROrigin requires an ARVRCamera child node."
-msgstr ""
+msgstr "ARVROrigin cần một nút con ARVRCamera."
 
 #: scene/3d/baked_lightmap.cpp
 msgid "Finding meshes and lights"
 msgstr ""
 
 #: scene/3d/baked_lightmap.cpp
+#, fuzzy
 msgid "Preparing geometry (%d/%d)"
-msgstr ""
+msgstr "Đang xử lí hình học (%d/%d)"
 
 #: scene/3d/baked_lightmap.cpp
 msgid "Preparing environment"
-msgstr ""
+msgstr "Xử lí môi trường"
 
 #: scene/3d/baked_lightmap.cpp
 msgid "Generating capture"
@@ -12661,7 +12612,7 @@ msgstr ""
 
 #: scene/3d/baked_lightmap.cpp
 msgid "Done"
-msgstr ""
+msgstr "Xong"
 
 #: scene/3d/collision_object.cpp
 msgid ""
@@ -12669,6 +12620,10 @@ msgid ""
 "Consider adding a CollisionShape or CollisionPolygon as a child to define "
 "its shape."
 msgstr ""
+"Nút này không có hình dạng, vì vậy nó không thể va chạm hoặc tương tác với "
+"các đối tượng khác.\n"
+"Hãy thêm một nút con CollisionShape hoặc CollisionPolygon để xác định hình "
+"dạng của nó."
 
 #: scene/3d/collision_polygon.cpp
 msgid ""
@@ -12676,10 +12631,13 @@ msgid ""
 "CollisionObject derived node. Please only use it as a child of Area, "
 "StaticBody, RigidBody, KinematicBody, etc. to give them a shape."
 msgstr ""
+"CollisionPolygon chỉ nhằm mục đích cung cấp khối va chạm cho một nút kế thừa "
+"CollisionObject. Hãy dùng nó làm nút con của Area, StaticBody, RigidBody, "
+"KinematicBody,... để tạo hình cho chúng."
 
 #: scene/3d/collision_polygon.cpp
 msgid "An empty CollisionPolygon has no effect on collision."
-msgstr ""
+msgstr "CollisionPolygon rỗng sẽ chẳng ích gì trong va chạm."
 
 #: scene/3d/collision_shape.cpp
 msgid ""
@@ -12687,27 +12645,35 @@ msgid ""
 "derived node. Please only use it as a child of Area, StaticBody, RigidBody, "
 "KinematicBody, etc. to give them a shape."
 msgstr ""
+"CollisionShape chỉ nhằm mục đích cung cấp khối va chạm cho nút kế thừa từ "
+"CollisionObject. Hãy dùng nó làm nút con cho Area, StaticBody, RigidBody, "
+"KinematicBody, v.v. để tạo hình cho chúng."
 
 #: scene/3d/collision_shape.cpp
 msgid ""
 "A shape must be provided for CollisionShape to function. Please create a "
 "shape resource for it."
 msgstr ""
+"Một hình phải được cung cấp cho CollisionShape thì mới hoạt động. Hãy tạo cho "
+"nó một cái."
 
 #: scene/3d/collision_shape.cpp
 msgid ""
 "Plane shapes don't work well and will be removed in future versions. Please "
 "don't use them."
 msgstr ""
+"Hình phẳng không chạy tốt và sẽ bị bãi bỏ trong tương lai. Đừng dùng chúng."
 
 #: scene/3d/collision_shape.cpp
 msgid ""
 "ConcavePolygonShape doesn't support RigidBody in another mode than static."
 msgstr ""
+"ConcavePolygonShape không hỗ trợ RigidBody ở bất kì chế độ nào khác ngoài "
+"chế độ Tĩnh."
 
 #: scene/3d/cpu_particles.cpp
 msgid "Nothing is visible because no mesh has been assigned."
-msgstr ""
+msgstr "Không có gì hiển thị vì không có lưới nào được chỉ định."
 
 #: scene/3d/cpu_particles.cpp
 msgid ""
@@ -12735,13 +12701,15 @@ msgstr ""
 
 #: scene/3d/navigation_mesh.cpp
 msgid "A NavigationMesh resource must be set or created for this node to work."
-msgstr ""
+msgstr "Phải tạo hoặc đặt một NavigationMesh cho nút này thì nó mới hoạt động."
 
 #: scene/3d/navigation_mesh.cpp
 msgid ""
 "NavigationMeshInstance must be a child or grandchild to a Navigation node. "
 "It only provides navigation data."
 msgstr ""
+"NavigationMeshInstance phải là nút con hoặc cháu một nút Navigation. Nó chỉ "
+"cung cấp dữ liệu điều hướng."
 
 #: scene/3d/particles.cpp
 msgid ""
@@ -12777,6 +12745,9 @@ msgid ""
 "by the physics engine when running.\n"
 "Change the size in children collision shapes instead."
 msgstr ""
+"Thay đổi về kích cỡ của RigidBody (trong chế độ Nhân vật hoặc Rắn) sẽ bị "
+"ghi đè khi tính toán vật lí.\n"
+"Hãy sửa kích cỡ khối va chạm của nút con ý."
 
 #: scene/3d/physics_joint.cpp
 msgid "Node A and Node B must be PhysicsBodies"
@@ -12814,6 +12785,8 @@ msgid ""
 "running.\n"
 "Change the size in children collision shapes instead."
 msgstr ""
+"Thay đổi về kích cỡ của SoftBody sẽ bị ghi đè khi tính toán vật lí.\n"
+"Hãy sửa kích cỡ khối va chạm của nút con ý."
 
 #: scene/3d/sprite_3d.cpp
 msgid ""
diff --git a/editor/translations/zh_CN.po b/editor/translations/zh_CN.po
index e043d0f05a..e5826da638 100644
--- a/editor/translations/zh_CN.po
+++ b/editor/translations/zh_CN.po
@@ -77,11 +77,12 @@
 # Magian <magian1127@gmail.com>, 2021.
 # Weiduo Xie <xwditfr@gmail.com>, 2021.
 # suplife <2634557184@qq.com>, 2021.
+# luoji <564144019@qq.com>, 2021.
 msgid ""
 msgstr ""
 "Project-Id-Version: Chinese (Simplified) (Godot Engine)\n"
 "POT-Creation-Date: 2018-01-20 12:15+0200\n"
-"PO-Revision-Date: 2021-04-05 14:28+0000\n"
+"PO-Revision-Date: 2021-04-19 22:33+0000\n"
 "Last-Translator: Haoyu Qiu <timothyqiu32@gmail.com>\n"
 "Language-Team: Chinese (Simplified) <https://hosted.weblate.org/projects/"
 "godot-engine/godot/zh_Hans/>\n"
@@ -90,7 +91,7 @@ msgstr ""
 "Content-Type: text/plain; charset=UTF-8\n"
 "Content-Transfer-Encoding: 8bit\n"
 "Plural-Forms: nplurals=1; plural=0;\n"
-"X-Generator: Weblate 4.6-dev\n"
+"X-Generator: Weblate 4.7-dev\n"
 
 #: core/math/expression.cpp modules/gdscript/gdscript_functions.cpp
 #: modules/visual_script/visual_script_builtin_funcs.cpp
@@ -1376,7 +1377,7 @@ msgstr "总线选项"
 #: editor/editor_audio_buses.cpp editor/filesystem_dock.cpp
 #: editor/plugins/animation_player_editor_plugin.cpp editor/scene_tree_dock.cpp
 msgid "Duplicate"
-msgstr "复制"
+msgstr "制作副本"
 
 #: editor/editor_audio_buses.cpp
 msgid "Reset Volume"
@@ -5208,7 +5209,7 @@ msgstr "网格偏移："
 
 #: editor/plugins/canvas_item_editor_plugin.cpp
 msgid "Grid Step:"
-msgstr "网格大小:"
+msgstr "网格步长："
 
 #: editor/plugins/canvas_item_editor_plugin.cpp
 msgid "Primary Line Every:"
@@ -5224,7 +5225,7 @@ msgstr "旋转偏移："
 
 #: editor/plugins/canvas_item_editor_plugin.cpp
 msgid "Rotation Step:"
-msgstr "旋转步长:"
+msgstr "旋转步长："
 
 #: editor/plugins/canvas_item_editor_plugin.cpp
 msgid "Scale Step:"
@@ -7480,6 +7481,11 @@ msgstr "锁定视角旋转"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -7666,7 +7672,7 @@ msgstr "修改变换"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Translate:"
-msgstr "移动:"
+msgstr "移动："
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid "Rotate (deg.):"
@@ -8628,7 +8634,7 @@ msgstr "标量"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Vector"
-msgstr "矢量"
+msgstr "向量"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Boolean"
@@ -9188,11 +9194,11 @@ msgid ""
 "whose number of rows is the number of components in 'c' and whose number of "
 "columns is the number of components in 'r'."
 msgstr ""
-"计算一对矢量的外积。\n"
+"计算一对向量的外积。\n"
 "\n"
-"OuterProduct 将第一个参数 “c” 视为列矢量（包含一列的矩阵），将第二个参数 “r” "
-"视为行矢量（具有一行的矩阵），并执行线性代数矩阵乘以 “c * r”，生成行数为 “c” "
-"中的组件，其列数是 “r” 中的组件数。"
+"OuterProduct 将第一个参数 “c” 视为列向量（只有一列的矩阵），将第二个参数 “r” "
+"视为行向量（只有一行的矩阵），并执行线性代数矩阵乘法 “c * r”。所生成的矩阵"
+"中，行数为 “c” 中元素的数量，列数为 “r” 中元素的数量。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Composes transform from four vectors."
@@ -9232,7 +9238,7 @@ msgstr "变换统一。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Vector function."
-msgstr "向量功能。"
+msgstr "向量函数。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Vector operator."
@@ -9248,7 +9254,7 @@ msgstr "将向量分解为三个标量。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Calculates the cross product of two vectors."
-msgstr "计算两个向量的叉乘。"
+msgstr "计算两个向量的叉积。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Returns the distance between two points."
@@ -9256,7 +9262,7 @@ msgstr "返回两点之间的距离。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Calculates the dot product of two vectors."
-msgstr "计算两个向量的点乘。"
+msgstr "计算两个向量的点积。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid ""
@@ -9266,7 +9272,7 @@ msgid ""
 "Nref is smaller than zero the return value is N. Otherwise -N is returned."
 msgstr ""
 "返回指向与参考向量相同方向的向量。该函数有三个向量参数：N，方向向量；I，入射"
-"向量；Nref，参考向量。如果 I 和 Nref 的点乘小于零，返回值为 N，否则返回 -N。"
+"向量；Nref，参考向量。如果 I 和 Nref 的点积小于零，返回值为 N，否则返回 -N。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Calculates the length of a vector."
@@ -9278,7 +9284,7 @@ msgstr "两个向量之间的线性插值。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Linear interpolation between two vectors using scalar."
-msgstr "使用标量的两个矢量之间的线性插值。"
+msgstr "使用标量在两个向量之间进行线性插值。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Calculates the normalize product of vector."
@@ -9300,7 +9306,7 @@ msgstr "返回指向反射方向的向量（a：入射向量，b：法向量）�
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "Returns the vector that points in the direction of refraction."
-msgstr "返回指向折射方向的矢量。"
+msgstr "返回指向折射方向的向量。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid ""
@@ -9411,13 +9417,13 @@ msgstr "（仅限片段/光照模式）标量导数函数。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid "(Fragment/Light mode only) Vector derivative function."
-msgstr "（仅限片段/灯光模式）矢量导数功能。"
+msgstr "（仅限片段/光照模式）向量导数函数。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid ""
 "(Fragment/Light mode only) (Vector) Derivative in 'x' using local "
 "differencing."
-msgstr "（仅限片段/光照模式）（矢量）使用局部差分的 “x” 中的导数。"
+msgstr "（仅限片段/光照模式）（向量）使用局部差分的 “x” 中的导数。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid ""
@@ -9429,7 +9435,7 @@ msgstr "（仅限片段/光照模式）（标量）使用本地差分的“ x”
 msgid ""
 "(Fragment/Light mode only) (Vector) Derivative in 'y' using local "
 "differencing."
-msgstr "（仅适用于片段/光照模式）（矢量）使用局部差分的'y'导数。"
+msgstr "（仅适用于片段/光照模式）（向量）使用局部差分的'y'导数。"
 
 #: editor/plugins/visual_shader_editor_plugin.cpp
 msgid ""
@@ -10787,6 +10793,13 @@ msgid "Remote"
 msgstr "远程"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "本地"
 
diff --git a/editor/translations/zh_HK.po b/editor/translations/zh_HK.po
index 030f678592..37e8f6ab61 100644
--- a/editor/translations/zh_HK.po
+++ b/editor/translations/zh_HK.po
@@ -7794,6 +7794,11 @@ msgstr "本地化"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -11193,6 +11198,13 @@ msgid "Remote"
 msgstr "移除"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr ""
 
diff --git a/editor/translations/zh_TW.po b/editor/translations/zh_TW.po
index 2c60984b36..50b323130e 100644
--- a/editor/translations/zh_TW.po
+++ b/editor/translations/zh_TW.po
@@ -7431,6 +7431,11 @@ msgstr "視圖旋轉已鎖定"
 
 #: editor/plugins/spatial_editor_plugin.cpp
 msgid ""
+"To zoom further, change the camera's clipping planes (View -> Settings...)"
+msgstr ""
+
+#: editor/plugins/spatial_editor_plugin.cpp
+msgid ""
 "Note: The FPS value displayed is the editor's framerate.\n"
 "It cannot be used as a reliable indication of in-game performance."
 msgstr ""
@@ -10738,6 +10743,13 @@ msgid "Remote"
 msgstr "遠端"
 
 #: editor/scene_tree_dock.cpp
+msgid ""
+"If selected, the Remote scene tree dock will cause the project to stutter "
+"every time it updates.\n"
+"Switch back to the Local scene tree dock to improve performance."
+msgstr ""
+
+#: editor/scene_tree_dock.cpp
 msgid "Local"
 msgstr "本機"
 
diff --git a/main/main.cpp b/main/main.cpp
index 4635d377c4..bb16c49983 100644
--- a/main/main.cpp
+++ b/main/main.cpp
@@ -375,8 +375,8 @@ void Main::print_help(const char *p_binary) {
 #ifdef TESTS_ENABLED
 	OS::get_singleton()->print("  --test [--help]                              Run unit tests. Use --test --help for more information.\n");
 #endif
-	OS::get_singleton()->print("\n");
 #endif
+	OS::get_singleton()->print("\n");
 }
 
 #ifdef TESTS_ENABLED
@@ -390,6 +390,8 @@ Error Main::test_setup() {
 	register_core_types();
 	register_core_driver_types();
 
+	packed_data = memnew(PackedData);
+
 	globals = memnew(ProjectSettings);
 
 	GLOBAL_DEF("debug/settings/crash_handler/message",
@@ -459,6 +461,9 @@ void Main::test_cleanup() {
 	if (globals) {
 		memdelete(globals);
 	}
+	if (packed_data) {
+		memdelete(packed_data);
+	}
 	if (engine) {
 		memdelete(engine);
 	}
@@ -1248,6 +1253,7 @@ Error Main::setup(const char *execpath, int argc, char *argv[], bool p_second_ph
 	}
 
 	GLOBAL_DEF("internationalization/rendering/force_right_to_left_layout_direction", false);
+	GLOBAL_DEF("internationalization/locale/include_text_server_data", false);
 
 	if (!force_lowdpi) {
 		OS::get_singleton()->_allow_hidpi = GLOBAL_DEF("display/window/dpi/allow_hidpi", false);
@@ -2537,10 +2543,10 @@ bool Main::iteration() {
 	if (frame > 1000000) {
 		if (editor || project_manager) {
 			if (print_fps) {
-				print_line("Editor FPS: " + itos(frames));
+				print_line(vformat("Editor FPS: %d (%s mspf)", frames, rtos(1000.0 / frames).pad_decimals(1)));
 			}
 		} else if (GLOBAL_GET("debug/settings/stdout/print_fps") || print_fps) {
-			print_line("Game FPS: " + itos(frames));
+			print_line(vformat("Project FPS: %d (%s mspf)", frames, rtos(1000.0 / frames).pad_decimals(1)));
 		}
 
 		Engine::get_singleton()->_fps = frames;
diff --git a/misc/dist/html/editor.html b/misc/dist/html/editor.html
index 4785f54973..347c22adf8 100644
--- a/misc/dist/html/editor.html
+++ b/misc/dist/html/editor.html
@@ -58,6 +58,29 @@
 			filter: brightness(82.5%);
 		}
 
+		.welcome-modal {
+			display: none;
+			position: fixed;
+			z-index: 1;
+			left: 0;
+			top: 0;
+			width: 100%;
+			height: 100%;
+			overflow: auto;
+			background-color: hsla(0, 0%, 0%, 0.5);
+		}
+
+		.welcome-modal-content {
+			background-color: #333b4f;
+			box-shadow: 0 0.25rem 0.25rem hsla(0, 0%, 0%, 0.5);
+			line-height: 1.5;
+			max-width: 38rem;
+			margin: 4rem auto 0 auto;
+			color: white;
+			border-radius: 0.5rem;
+			padding: 1rem 1rem 2rem 1rem;
+		}
+
 		#tabs-buttons {
 			/* Match the default background color of the editor window for a seamless appearance. */
 			background-color: #202531;
@@ -206,6 +229,36 @@
 	</style>
 </head>
 <body>
+	<div
+		id="welcome-modal"
+		class="welcome-modal"
+		role="dialog"
+		aria-labelledby="welcome-modal-title"
+		aria-describedby="welcome-modal-description"
+		onclick="if (event.target === this) closeWelcomeModal(false)"
+	>
+		<div class="welcome-modal-content">
+			<h2 id="welcome-modal-title">Important - Please read before continuing</h2>
+			<div id="welcome-modal-description">
+				<p>
+					The Godot Web Editor has some limitations compared to the native version.
+					Its main focus is education and experimentation;
+					<strong>it is not recommended for production</strong>.
+				</p>
+				<p>
+					Refer to the
+					<a
+						href="https://docs.godotengine.org/en/latest/tutorials/editor/using_the_web_editor.html"
+						target="_blank"
+						rel="noopener"
+					>Web editor documentation</a> for usage instructions and limitations.
+				</p>
+			</div>
+			<button id="welcome-modal-dismiss" class="btn" type="button" onclick="closeWelcomeModal(true)" style="margin-top: 1rem">
+				OK, don't show again
+			</button>
+		</div>
+	</div>
 	<div id="tabs-buttons">
 		<button id="btn-tab-loader" class="btn tab-btn" onclick="showTab('loader')">Loader</button>
 		<button id="btn-tab-editor" class="btn tab-btn" disabled="disabled" onclick="showTab('editor')">Editor</button>
@@ -274,7 +327,19 @@
 			if ("serviceWorker" in navigator) {
 				navigator.serviceWorker.register("service.worker.js");
 			}
+
+			if (localStorage.getItem("welcomeModalDismissed") !== 'true') {
+				document.getElementById("welcome-modal").style.display = "block";
+				document.getElementById("welcome-modal-dismiss").focus();
+			}
 		});
+
+		function closeWelcomeModal(dontShowAgain) {
+			document.getElementById("welcome-modal").style.display = "none";
+			if (dontShowAgain) {
+				localStorage.setItem("welcomeModalDismissed", 'true');
+			}
+		}
 	</script>
 	<script src="godot.tools.js"></script>
 	<script>//<![CDATA[
diff --git a/misc/dist/html/full-size.html b/misc/dist/html/full-size.html
index abc0479739..7afb6fdb6b 100644
--- a/misc/dist/html/full-size.html
+++ b/misc/dist/html/full-size.html
@@ -3,7 +3,6 @@
 <head>
 	<meta charset='utf-8' />
 	<meta name='viewport' content='width=device-width, user-scalable=no' />
-	<link id='-gd-engine-icon' rel='icon' type='image/png' href='favicon.png' />
 	<title>$GODOT_PROJECT_NAME</title>
 	<style type='text/css'>
 
diff --git a/misc/dist/html/offline-export.html b/misc/dist/html/offline-export.html
new file mode 100644
index 0000000000..41ab42b04b
--- /dev/null
+++ b/misc/dist/html/offline-export.html
@@ -0,0 +1,42 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+	<meta charset="utf-8" />
+	<meta http-equiv="X-UA-Compatible" content="IE=edge" />
+	<meta name="viewport" content="width=device-width, initial-scale=1" />
+	<title>You are offline</title>
+	<style>
+		html {
+			background-color: #000000;
+			color: #ffffff;
+		}
+
+		body {
+			font-family: system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
+			margin: 2rem;
+		}
+
+		p {
+			margin-block: 1rem;
+		}
+
+		button {
+			display: block;
+			padding: 1rem 2rem;
+			margin: 3rem auto 0;
+		}
+	</style>
+</head>
+<body>
+	<h1>You are offline</h1>
+	<p>This application requires an Internet connection to run for the first time.</p>
+	<p>Press the button below to try reloading:</p>
+	<button type="button">Reload</button>
+
+	<script>
+		document.querySelector("button").addEventListener("click", () => {
+			window.location.reload();
+		});
+	</script>
+</body>
+</html>
diff --git a/misc/dist/html/service-worker.js b/misc/dist/html/service-worker.js
index d4eaed2b17..f8dee8cd5b 100644
--- a/misc/dist/html/service-worker.js
+++ b/misc/dist/html/service-worker.js
@@ -5,22 +5,11 @@
 // previously cached resources to be updated from the network.
 const CACHE_VERSION = "@GODOT_VERSION@";
 const CACHE_NAME = "@GODOT_NAME@-cache";
-const OFFLINE_URL = "offline.html";
+const OFFLINE_URL = "@GODOT_OFFLINE_PAGE@";
 // Files that will be cached on load.
-const CACHED_FILES = [
-	"godot.tools.html",
-	"offline.html",
-	"godot.tools.js",
-	"godot.tools.worker.js",
-	"godot.tools.audio.worklet.js",
-	"logo.svg",
-	"favicon.png",
-];
-
+const CACHED_FILES = @GODOT_CACHE@;
 // Files that we might not want the user to preload, and will only be cached on first load.
-const CACHABLE_FILES = [
-	"godot.tools.wasm",
-];
+const CACHABLE_FILES = @GODOT_OPT_CACHE@;
 const FULL_CACHE = CACHED_FILES.concat(CACHABLE_FILES);
 
 self.addEventListener("install", (event) => {
diff --git a/misc/dist/osx_template.app/Contents/Resources/vulkan/icd.d/MoltenVK_icd.json b/misc/dist/osx_template.app/Contents/Resources/vulkan/icd.d/MoltenVK_icd.json
index 6bf2edb02d..c4f8f71d0e 100644
--- a/misc/dist/osx_template.app/Contents/Resources/vulkan/icd.d/MoltenVK_icd.json
+++ b/misc/dist/osx_template.app/Contents/Resources/vulkan/icd.d/MoltenVK_icd.json
@@ -2,6 +2,6 @@
     "file_format_version" : "1.0.0",
     "ICD": {
         "library_path": "../../../Frameworks/libMoltenVK.dylib",
-        "api_version" : "1.0.0"
+        "api_version" : "1.1.0"
     }
 }
diff --git a/misc/dist/osx_tools.app/Contents/Resources/vulkan/icd.d/MoltenVK_icd.json b/misc/dist/osx_tools.app/Contents/Resources/vulkan/icd.d/MoltenVK_icd.json
index 6bf2edb02d..c4f8f71d0e 100644
--- a/misc/dist/osx_tools.app/Contents/Resources/vulkan/icd.d/MoltenVK_icd.json
+++ b/misc/dist/osx_tools.app/Contents/Resources/vulkan/icd.d/MoltenVK_icd.json
@@ -2,6 +2,6 @@
     "file_format_version" : "1.0.0",
     "ICD": {
         "library_path": "../../../Frameworks/libMoltenVK.dylib",
-        "api_version" : "1.0.0"
+        "api_version" : "1.1.0"
     }
 }
diff --git a/misc/scripts/check_ci_log.py b/misc/scripts/check_ci_log.py
new file mode 100755
index 0000000000..f2cdf95c7b
--- /dev/null
+++ b/misc/scripts/check_ci_log.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+
+if len(sys.argv) < 2:
+    print("ERROR: You must run program with file name as argument.")
+    sys.exit(1)
+
+fname = sys.argv[1]
+
+with open(fname.strip(), "r") as fileread:
+    file_contents = fileread.read()
+
+# An "ERROR: AddressSanitizer:" line means that an invalid read or write happened.
+# This is a critical bug, so it needs to be fixed as fast as possible.
+
+if "ERROR: AddressSanitizer:" in file_contents:
+    print("FATAL ERROR: An incorrectly used memory was found.")
+    sys.exit(1)
+
+# It is also possible that the program crashed, with or without a backtrace.
+
+if (
+    "Program crashed with signal" in file_contents
+    or "Dumping the backtrace" in file_contents
+    or "Segmentation fault (core dumped)" in file_contents
+):
+    print("FATAL ERROR: Godot has been crashed.")
+    sys.exit(1)
+
+# Finding memory leaks in Godot is quite difficult, because we need to take into
+# account leaks in external libraries as well. They are usually provided without
+# debugging symbols, so a leak report from them usually has only 2/3 lines;
+# searching for the 5th frame - "#4 0x" - should correctly detect the vast
+# majority of memory leaks.
+
+if "ERROR: LeakSanitizer:" in file_contents:
+    if "#4 0x" in file_contents:
+        print("ERROR: Memory leak was found")
+        sys.exit(1)
+
+# It may happen that Godot detects leaking nodes/resources and removes them, so
+# this possibility should also be handled as a potential error, even if
+# LeakSanitizer doesn't report anything.
+
+if "ObjectDB instances leaked at exit" in file_contents:
+    print("ERROR: Memory leak was found")
+    sys.exit(1)
+
+# The test project may contain several assert calls that check whether the
+# project is executed with the right parameters etc.; a failed assert normally
+# does not stop execution of the project, so it is detected from the log here.
+
+if "Assertion failed" in file_contents:
+    print("ERROR: Assertion failed in project, check execution log for more info")
+    sys.exit(1)
+
+# For now Godot leaks a lot of rendering stuff, so we only print a warning about
+# it; this should become an error again once those memory leaks are fixed.
+
+if "were leaked" in file_contents or "were never freed" in file_contents:
+    print("WARNING: Memory leak was found")
+
+sys.exit(0)
diff --git a/modules/bullet/bullet_physics_server.cpp b/modules/bullet/bullet_physics_server.cpp
index 93642f2d5c..e601884486 100644
--- a/modules/bullet/bullet_physics_server.cpp
+++ b/modules/bullet/bullet_physics_server.cpp
@@ -824,10 +824,10 @@ bool BulletPhysicsServer3D::body_is_omitting_force_integration(RID p_body) const
 	return body->get_omit_forces_integration();
 }
 
-void BulletPhysicsServer3D::body_set_force_integration_callback(RID p_body, Object *p_receiver, const StringName &p_method, const Variant &p_udata) {
+void BulletPhysicsServer3D::body_set_force_integration_callback(RID p_body, const Callable &p_callable, const Variant &p_udata) {
 	RigidBodyBullet *body = rigid_body_owner.getornull(p_body);
 	ERR_FAIL_COND(!body);
-	body->set_force_integration_callback(p_receiver ? p_receiver->get_instance_id() : ObjectID(), p_method, p_udata);
+	body->set_force_integration_callback(p_callable, p_udata);
 }
 
 void BulletPhysicsServer3D::body_set_ray_pickable(RID p_body, bool p_enable) {
diff --git a/modules/bullet/bullet_physics_server.h b/modules/bullet/bullet_physics_server.h
index 856ff74963..de0379c873 100644
--- a/modules/bullet/bullet_physics_server.h
+++ b/modules/bullet/bullet_physics_server.h
@@ -246,7 +246,7 @@ public:
 	virtual void body_set_omit_force_integration(RID p_body, bool p_omit) override;
 	virtual bool body_is_omitting_force_integration(RID p_body) const override;
 
-	virtual void body_set_force_integration_callback(RID p_body, Object *p_receiver, const StringName &p_method, const Variant &p_udata = Variant()) override;
+	virtual void body_set_force_integration_callback(RID p_body, const Callable &p_callable, const Variant &p_udata = Variant()) override;
 
 	virtual void body_set_ray_pickable(RID p_body, bool p_enable) override;
 
diff --git a/modules/bullet/rigid_body_bullet.cpp b/modules/bullet/rigid_body_bullet.cpp
index 433bff8c38..675da1a597 100644
--- a/modules/bullet/rigid_body_bullet.cpp
+++ b/modules/bullet/rigid_body_bullet.cpp
@@ -346,16 +346,17 @@ void RigidBodyBullet::dispatch_callbacks() {
 
 		Variant variantBodyDirect = bodyDirect;
 
-		Object *obj = ObjectDB::get_instance(force_integration_callback->id);
+		Object *obj = force_integration_callback->callable.get_object();
 		if (!obj) {
 			// Remove integration callback
-			set_force_integration_callback(ObjectID(), StringName());
+			set_force_integration_callback(Callable());
 		} else {
 			const Variant *vp[2] = { &variantBodyDirect, &force_integration_callback->udata };
 
 			Callable::CallError responseCallError;
 			int argc = (force_integration_callback->udata.get_type() == Variant::NIL) ? 1 : 2;
-			obj->call(force_integration_callback->method, vp, argc, responseCallError);
+			Variant rv;
+			force_integration_callback->callable.call(vp, argc, rv, responseCallError);
 		}
 	}
 
@@ -371,16 +372,15 @@ void RigidBodyBullet::dispatch_callbacks() {
 	previousActiveState = btBody->isActive();
 }
 
-void RigidBodyBullet::set_force_integration_callback(ObjectID p_id, const StringName &p_method, const Variant &p_udata) {
+void RigidBodyBullet::set_force_integration_callback(const Callable &p_callable, const Variant &p_udata) { // Replaces the stored callback; any previous one is freed first.
 	if (force_integration_callback) {
 		memdelete(force_integration_callback);
 		force_integration_callback = nullptr;
 	}
 
-	if (p_id.is_valid()) {
+	if (p_callable.get_object()) { // A callable without an object (e.g. Callable()) leaves the callback cleared.
 		force_integration_callback = memnew(ForceIntegrationCallback);
-		force_integration_callback->id = p_id;
-		force_integration_callback->method = p_method;
+		force_integration_callback->callable = p_callable;
 		force_integration_callback->udata = p_udata;
 	}
 }
diff --git a/modules/bullet/rigid_body_bullet.h b/modules/bullet/rigid_body_bullet.h
index a4be7f9e07..843ff4a7af 100644
--- a/modules/bullet/rigid_body_bullet.h
+++ b/modules/bullet/rigid_body_bullet.h
@@ -154,8 +154,7 @@ public:
 	};
 
 	struct ForceIntegrationCallback {
-		ObjectID id;
-		StringName method;
+		Callable callable; // Called by dispatch_callbacks() with the direct body state, plus udata when it is not NIL.
 		Variant udata;
 	};
 
@@ -240,7 +239,7 @@ public:
 	virtual void set_space(SpaceBullet *p_space);
 
 	virtual void dispatch_callbacks();
-	void set_force_integration_callback(ObjectID p_id, const StringName &p_method, const Variant &p_udata = Variant());
+	void set_force_integration_callback(const Callable &p_callable, const Variant &p_udata = Variant());
 	void scratch_space_override_modificator();
 
 	virtual void on_collision_filters_change();
diff --git a/modules/bullet/shape_bullet.cpp b/modules/bullet/shape_bullet.cpp
index 471b154813..40e785d699 100644
--- a/modules/bullet/shape_bullet.cpp
+++ b/modules/bullet/shape_bullet.cpp
@@ -142,7 +142,7 @@ btScaledBvhTriangleMeshShape *ShapeBullet::create_shape_concave(btBvhTriangleMes
 	}
 }
 
-btHeightfieldTerrainShape *ShapeBullet::create_shape_height_field(Vector<real_t> &p_heights, int p_width, int p_depth, real_t p_min_height, real_t p_max_height) {
+btHeightfieldTerrainShape *ShapeBullet::create_shape_height_field(Vector<float> &p_heights, int p_width, int p_depth, real_t p_min_height, real_t p_max_height) {
 	const btScalar ignoredHeightScale(1);
 	const int YAxis = 1; // 0=X, 1=Y, 2=Z
 	const bool flipQuadEdges = false;
@@ -480,17 +480,10 @@ void HeightMapShapeBullet::set_data(const Variant &p_data) {
 	ERR_FAIL_COND_MSG(l_width < 2, "Map width must be at least 2.");
 	ERR_FAIL_COND_MSG(l_depth < 2, "Map depth must be at least 2.");
 
-	// TODO This code will need adjustments if real_t is set to `double`,
-	// because that precision is unnecessary for a heightmap and Bullet doesn't support it...
-
-	Vector<real_t> l_heights;
+	Vector<float> l_heights;
 	Variant l_heights_v = d["heights"];
 
-#ifdef REAL_T_IS_DOUBLE
-	if (l_heights_v.get_type() == Variant::PACKED_FLOAT64_ARRAY) {
-#else
 	if (l_heights_v.get_type() == Variant::PACKED_FLOAT32_ARRAY) {
-#endif
 		// Ready-to-use heights can be passed
 
 		l_heights = l_heights_v;
@@ -511,9 +504,9 @@ void HeightMapShapeBullet::set_data(const Variant &p_data) {
 
 		l_heights.resize(l_image->get_width() * l_image->get_height());
 
-		real_t *w = l_heights.ptrw();
+		float *w = l_heights.ptrw();
 		const uint8_t *r = im_data.ptr();
-		real_t *rp = (real_t *)r;
+		float *rp = (float *)r;
 		// At this point, `rp` could be used directly for Bullet, but I don't know how safe it would be.
 
 		for (int i = 0; i < l_heights.size(); ++i) {
@@ -521,11 +514,7 @@ void HeightMapShapeBullet::set_data(const Variant &p_data) {
 		}
 
 	} else {
-#ifdef REAL_T_IS_DOUBLE
-		ERR_FAIL_MSG("Expected PackedFloat64Array or float Image.");
-#else
 		ERR_FAIL_MSG("Expected PackedFloat32Array or float Image.");
-#endif
 	}
 
 	ERR_FAIL_COND(l_width <= 0);
@@ -534,11 +523,11 @@ void HeightMapShapeBullet::set_data(const Variant &p_data) {
 
 	// Compute min and max heights if not specified.
 	if (!d.has("min_height") && !d.has("max_height")) {
-		const real_t *r = l_heights.ptr();
+		const float *r = l_heights.ptr();
 		int heights_size = l_heights.size();
 
 		for (int i = 0; i < heights_size; ++i) {
-			real_t h = r[i];
+			float h = r[i];
 
 			if (h < l_min_height) {
 				l_min_height = h;
@@ -559,7 +548,7 @@ PhysicsServer3D::ShapeType HeightMapShapeBullet::get_type() const {
 	return PhysicsServer3D::SHAPE_HEIGHTMAP;
 }
 
-void HeightMapShapeBullet::setup(Vector<real_t> &p_heights, int p_width, int p_depth, real_t p_min_height, real_t p_max_height) {
+void HeightMapShapeBullet::setup(Vector<float> &p_heights, int p_width, int p_depth, real_t p_min_height, real_t p_max_height) {
 	// TODO cell size must be tweaked using localScaling, which is a shared property for all Bullet shapes
 
 	// If this array is resized outside of here, it should be preserved due to CoW
diff --git a/modules/bullet/shape_bullet.h b/modules/bullet/shape_bullet.h
index bfd95747eb..5080d13d99 100644
--- a/modules/bullet/shape_bullet.h
+++ b/modules/bullet/shape_bullet.h
@@ -89,7 +89,7 @@ public:
 	/// IMPORTANT: Remember to delete the shape interface by calling: delete my_shape->getMeshInterface();
 	static class btConvexPointCloudShape *create_shape_convex(btAlignedObjectArray<btVector3> &p_vertices, const btVector3 &p_local_scaling = btVector3(1, 1, 1));
 	static class btScaledBvhTriangleMeshShape *create_shape_concave(btBvhTriangleMeshShape *p_mesh_shape, const btVector3 &p_local_scaling = btVector3(1, 1, 1));
-	static class btHeightfieldTerrainShape *create_shape_height_field(Vector<real_t> &p_heights, int p_width, int p_depth, real_t p_min_height, real_t p_max_height);
+	static class btHeightfieldTerrainShape *create_shape_height_field(Vector<float> &p_heights, int p_width, int p_depth, real_t p_min_height, real_t p_max_height);
 	static class btRayShape *create_shape_ray(real_t p_length, bool p_slips_on_slope);
 };
 
@@ -212,7 +212,7 @@ private:
 
 class HeightMapShapeBullet : public ShapeBullet {
 public:
-	Vector<real_t> heights;
+	Vector<float> heights;
 	int width = 0;
 	int depth = 0;
 	real_t min_height = 0.0;
@@ -226,7 +226,7 @@ public:
 	virtual btCollisionShape *create_bt_shape(const btVector3 &p_implicit_scale, real_t p_extra_edge = 0);
 
 private:
-	void setup(Vector<real_t> &p_heights, int p_width, int p_depth, real_t p_min_height, real_t p_max_height);
+	void setup(Vector<float> &p_heights, int p_width, int p_depth, real_t p_min_height, real_t p_max_height);
 };
 
 class RayShapeBullet : public ShapeBullet {
diff --git a/modules/camera/camera_osx.mm b/modules/camera/camera_osx.mm
index 3d2053ad23..9b59b68075 100644
--- a/modules/camera/camera_osx.mm
+++ b/modules/camera/camera_osx.mm
@@ -106,15 +106,15 @@
 	if (input) {
 		[self removeInput:input];
 		// don't release this
-		input = NULL;
+		input = nullptr;
 	}
 
 	// free up our output
 	if (output) {
 		[self removeOutput:output];
-		[output setSampleBufferDelegate:nil queue:NULL];
+		[output setSampleBufferDelegate:nil queue:nullptr];
 		[output release];
-		output = NULL;
+		output = nullptr;
 	}
 
 	[self commitConfiguration];
@@ -141,9 +141,9 @@
 	// get our buffers
 	unsigned char *dataY = (unsigned char *)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 0);
 	unsigned char *dataCbCr = (unsigned char *)CVPixelBufferGetBaseAddressOfPlane(pixelBuffer, 1);
-	if (dataY == NULL) {
+	if (dataY == nullptr) {
 		print_line("Couldn't access Y pixel buffer data");
-	} else if (dataCbCr == NULL) {
+	} else if (dataCbCr == nullptr) {
 		print_line("Couldn't access CbCr pixel buffer data");
 	} else {
 		Ref<Image> img[2];
@@ -220,8 +220,8 @@ AVCaptureDevice *CameraFeedOSX::get_device() const {
 };
 
 CameraFeedOSX::CameraFeedOSX() {
-	device = NULL;
-	capture_session = NULL;
+	device = nullptr; // No capture device assigned yet.
+	capture_session = nullptr; // No capture session exists yet.
 };
 
 void CameraFeedOSX::set_device(AVCaptureDevice *p_device) {
@@ -240,14 +240,14 @@ void CameraFeedOSX::set_device(AVCaptureDevice *p_device) {
 };
 
 CameraFeedOSX::~CameraFeedOSX() {
-	if (capture_session != NULL) {
+	if (capture_session != nullptr) { // Drop our reference to the session if one is still held.
 		[capture_session release];
-		capture_session = NULL;
+		capture_session = nullptr;
 	};
 
-	if (device != NULL) {
+	if (device != nullptr) { // Likewise release the capture device reference.
 		[device release];
-		device = NULL;
+		device = nullptr;
 	};
 };
 
@@ -267,7 +267,7 @@ void CameraFeedOSX::deactivate_feed() {
 	if (capture_session) {
 		[capture_session cleanup];
 		[capture_session release];
-		capture_session = NULL;
+		capture_session = nullptr;
 	};
 };
 
diff --git a/modules/csg/csg_gizmos.cpp b/modules/csg/csg_gizmos.cpp
index e23442ef99..8a46dcca65 100644
--- a/modules/csg/csg_gizmos.cpp
+++ b/modules/csg/csg_gizmos.cpp
@@ -292,27 +292,16 @@ bool CSGShape3DGizmoPlugin::is_selectable_when_hidden() const {
 }
 
 void CSGShape3DGizmoPlugin::redraw(EditorNode3DGizmo *p_gizmo) {
-	CSGShape3D *cs = Object::cast_to<CSGShape3D>(p_gizmo->get_spatial_node());
-
 	p_gizmo->clear();
 
-	Ref<Material> material;
-	switch (cs->get_operation()) {
-		case CSGShape3D::OPERATION_UNION:
-			material = get_material("shape_union_material", p_gizmo);
-			break;
-		case CSGShape3D::OPERATION_INTERSECTION:
-			material = get_material("shape_intersection_material", p_gizmo);
-			break;
-		case CSGShape3D::OPERATION_SUBTRACTION:
-			material = get_material("shape_subtraction_material", p_gizmo);
-			break;
-	}
-
-	Ref<Material> handles_material = get_material("handles");
+	CSGShape3D *cs = Object::cast_to<CSGShape3D>(p_gizmo->get_spatial_node());
 
 	Vector<Vector3> faces = cs->get_brush_faces();
 
+	if (faces.size() == 0) {
+		return;
+	}
+
 	Vector<Vector3> lines;
 	lines.resize(faces.size() * 2);
 	{
@@ -328,6 +317,21 @@ void CSGShape3DGizmoPlugin::redraw(EditorNode3DGizmo *p_gizmo) {
 		}
 	}
 
+	Ref<Material> material;
+	switch (cs->get_operation()) {
+		case CSGShape3D::OPERATION_UNION:
+			material = get_material("shape_union_material", p_gizmo);
+			break;
+		case CSGShape3D::OPERATION_INTERSECTION:
+			material = get_material("shape_intersection_material", p_gizmo);
+			break;
+		case CSGShape3D::OPERATION_SUBTRACTION:
+			material = get_material("shape_subtraction_material", p_gizmo);
+			break;
+	}
+
+	Ref<Material> handles_material = get_material("handles");
+
 	p_gizmo->add_lines(lines, material);
 	p_gizmo->add_collision_segments(lines);
 
diff --git a/modules/csg/csg_shape.cpp b/modules/csg/csg_shape.cpp
index 77be493be9..541b7036ac 100644
--- a/modules/csg/csg_shape.cpp
+++ b/modules/csg/csg_shape.cpp
@@ -89,6 +89,7 @@ uint32_t CSGShape3D::get_collision_mask() const {
 }
 
 void CSGShape3D::set_collision_mask_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision mask bit must be between 0 and 31 inclusive.");
 	uint32_t mask = get_collision_mask();
 	if (p_value) {
 		mask |= 1 << p_bit;
@@ -99,20 +100,23 @@ void CSGShape3D::set_collision_mask_bit(int p_bit, bool p_value) {
 }
 
 bool CSGShape3D::get_collision_mask_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision mask bit must be between 0 and 31 inclusive."); // Out-of-range bits report an error and read as false.
 	return get_collision_mask() & (1 << p_bit);
 }
 
 void CSGShape3D::set_collision_layer_bit(int p_bit, bool p_value) {
-	uint32_t mask = get_collision_layer();
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision layer bit must be between 0 and 31 inclusive."); // Out-of-range bits report an error and leave the layer untouched.
+	uint32_t layer = get_collision_layer(); // Read-modify-write of the whole 32-bit layer value.
 	if (p_value) {
-		mask |= 1 << p_bit;
+		layer |= 1 << p_bit; // Set the requested bit.
 	} else {
-		mask &= ~(1 << p_bit);
+		layer &= ~(1 << p_bit); // Clear the requested bit.
 	}
-	set_collision_layer(mask);
+	set_collision_layer(layer);
 }
 
 bool CSGShape3D::get_collision_layer_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision layer bit must be between 0 and 31 inclusive."); // Out-of-range bits report an error and read as false.
 	return get_collision_layer() & (1 << p_bit);
 }
 
@@ -880,7 +884,7 @@ void CSGMesh3D::set_mesh(const Ref<Mesh> &p_mesh) {
 		mesh->connect("changed", callable_mp(this, &CSGMesh3D::_mesh_changed));
 	}
 
-	_make_dirty();
+	_mesh_changed();
 }
 
 Ref<Mesh> CSGMesh3D::get_mesh() {
@@ -1741,7 +1745,6 @@ CSGBrush *CSGPolygon3D::_build_brush() {
 
 			path_cache->connect("tree_exited", callable_mp(this, &CSGPolygon3D::_path_exited));
 			path_cache->connect("curve_changed", callable_mp(this, &CSGPolygon3D::_path_changed));
-			path_cache = nullptr;
 		}
 		curve = path->get_curve();
 		if (curve.is_null()) {
@@ -2226,7 +2229,7 @@ void CSGPolygon3D::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "depth", PROPERTY_HINT_EXP_RANGE, "0.001,1000.0,0.001,or_greater"), "set_depth", "get_depth");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "spin_degrees", PROPERTY_HINT_RANGE, "1,360,0.1"), "set_spin_degrees", "get_spin_degrees");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "spin_sides", PROPERTY_HINT_RANGE, "3,64,1"), "set_spin_sides", "get_spin_sides");
-	ADD_PROPERTY(PropertyInfo(Variant::NODE_PATH, "path_node", PROPERTY_HINT_NODE_PATH_VALID_TYPES, "Path"), "set_path_node", "get_path_node");
+	ADD_PROPERTY(PropertyInfo(Variant::NODE_PATH, "path_node", PROPERTY_HINT_NODE_PATH_VALID_TYPES, "Path3D"), "set_path_node", "get_path_node");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "path_interval", PROPERTY_HINT_EXP_RANGE, "0.001,1000.0,0.001,or_greater"), "set_path_interval", "get_path_interval");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "path_rotation", PROPERTY_HINT_ENUM, "Polygon,Path,PathFollow"), "set_path_rotation", "get_path_rotation");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "path_local"), "set_path_local", "is_path_local");
diff --git a/modules/csg/doc_classes/CSGMesh3D.xml b/modules/csg/doc_classes/CSGMesh3D.xml
index 1bab8f4ee9..5fa8427843 100644
--- a/modules/csg/doc_classes/CSGMesh3D.xml
+++ b/modules/csg/doc_classes/CSGMesh3D.xml
@@ -4,7 +4,7 @@
 		A CSG Mesh shape that uses a mesh resource.
 	</brief_description>
 	<description>
-		This CSG node allows you to use any mesh resource as a CSG shape, provided it is closed, does not self-intersect, does not contain internal faces and has no edges that connect to more then two faces.
+		This CSG node allows you to use any mesh resource as a CSG shape, provided it is closed, does not self-intersect, does not contain internal faces and has no edges that connect to more than two faces.
 	</description>
 	<tutorials>
 	</tutorials>
@@ -16,6 +16,7 @@
 		</member>
 		<member name="mesh" type="Mesh" setter="set_mesh" getter="get_mesh">
 			The [Mesh] resource to use as a CSG shape.
+			[b]Note:[/b] When using an [ArrayMesh], avoid meshes with vertex normals unless a flat shader is required. By default, CSGMesh will ignore the mesh's vertex normals and use a smooth shader calculated using the faces' normals. If a flat shader is required, ensure that all faces' vertex normals are parallel.
 		</member>
 	</members>
 	<constants>
diff --git a/modules/enet/doc_classes/NetworkedMultiplayerENet.xml b/modules/enet/doc_classes/NetworkedMultiplayerENet.xml
index c8f32ffde6..f22ff29349 100644
--- a/modules/enet/doc_classes/NetworkedMultiplayerENet.xml
+++ b/modules/enet/doc_classes/NetworkedMultiplayerENet.xml
@@ -33,10 +33,10 @@
 			</argument>
 			<argument index="3" name="out_bandwidth" type="int" default="0">
 			</argument>
-			<argument index="4" name="client_port" type="int" default="0">
+			<argument index="4" name="local_port" type="int" default="0">
 			</argument>
 			<description>
-				Create client that connects to a server at [code]address[/code] using specified [code]port[/code]. The given address needs to be either a fully qualified domain name (e.g. [code]"www.example.com"[/code]) or an IP address in IPv4 or IPv6 format (e.g. [code]"192.168.1.1"[/code]). The [code]port[/code] is the port the server is listening on. The [code]in_bandwidth[/code] and [code]out_bandwidth[/code] parameters can be used to limit the incoming and outgoing bandwidth to the given number of bytes per second. The default of 0 means unlimited bandwidth. Note that ENet will strategically drop packets on specific sides of a connection between peers to ensure the peer's bandwidth is not overwhelmed. The bandwidth parameters also determine the window size of a connection which limits the amount of reliable packets that may be in transit at any given time. Returns [constant OK] if a client was created, [constant ERR_ALREADY_IN_USE] if this NetworkedMultiplayerENet instance already has an open connection (in which case you need to call [method close_connection] first) or [constant ERR_CANT_CREATE] if the client could not be created. If [code]client_port[/code] is specified, the client will also listen to the given port; this is useful for some NAT traversal techniques.
+				Creates a client that connects to a server at [code]address[/code] using the specified [code]port[/code]. The given address needs to be either a fully qualified domain name (e.g. [code]"www.example.com"[/code]) or an IP address in IPv4 or IPv6 format (e.g. [code]"192.168.1.1"[/code]). The [code]port[/code] is the port the server is listening on. The [code]in_bandwidth[/code] and [code]out_bandwidth[/code] parameters can be used to limit the incoming and outgoing bandwidth to the given number of bytes per second. The default of 0 means unlimited bandwidth. Note that ENet will strategically drop packets on specific sides of a connection between peers to ensure the peer's bandwidth is not overwhelmed. The bandwidth parameters also determine the window size of a connection which limits the number of reliable packets that may be in transit at any given time. Returns [constant OK] if a client was created, [constant ERR_ALREADY_IN_USE] if this NetworkedMultiplayerENet instance already has an open connection (in which case you need to call [method close_connection] first) or [constant ERR_CANT_CREATE] if the client could not be created. If [code]local_port[/code] is specified, the client will also listen on the given port; this is useful for some NAT traversal techniques.
 			</description>
 		</method>
 		<method name="create_server">
@@ -72,6 +72,13 @@
 				Returns the channel of the last packet fetched via [method PacketPeer.get_packet].
 			</description>
 		</method>
+		<method name="get_local_port" qualifiers="const">
+			<return type="int">
+			</return>
+			<description>
+				Returns the local port to which this peer is bound.
+			</description>
+		</method>
 		<method name="get_packet_channel" qualifiers="const">
 			<return type="int">
 			</return>
diff --git a/modules/enet/networked_multiplayer_enet.cpp b/modules/enet/networked_multiplayer_enet.cpp
index 25b87145b6..1cf77b307d 100644
--- a/modules/enet/networked_multiplayer_enet.cpp
+++ b/modules/enet/networked_multiplayer_enet.cpp
@@ -68,7 +68,7 @@ int NetworkedMultiplayerENet::get_last_packet_channel() const {
 
 Error NetworkedMultiplayerENet::create_server(int p_port, int p_max_clients, int p_in_bandwidth, int p_out_bandwidth) {
 	ERR_FAIL_COND_V_MSG(active, ERR_ALREADY_IN_USE, "The multiplayer instance is already active.");
-	ERR_FAIL_COND_V_MSG(p_port < 0 || p_port > 65535, ERR_INVALID_PARAMETER, "The port number must be set between 0 and 65535 (inclusive).");
+	ERR_FAIL_COND_V_MSG(p_port < 0 || p_port > 65535, ERR_INVALID_PARAMETER, "The local port number must be between 0 and 65535 (inclusive).");
 	ERR_FAIL_COND_V_MSG(p_max_clients < 1 || p_max_clients > 4095, ERR_INVALID_PARAMETER, "The number of clients must be set between 1 and 4095 (inclusive).");
 	ERR_FAIL_COND_V_MSG(p_in_bandwidth < 0, ERR_INVALID_PARAMETER, "The incoming bandwidth limit must be greater than or equal to 0 (0 disables the limit).");
 	ERR_FAIL_COND_V_MSG(p_out_bandwidth < 0, ERR_INVALID_PARAMETER, "The outgoing bandwidth limit must be greater than or equal to 0 (0 disables the limit).");
@@ -115,46 +115,37 @@ Error NetworkedMultiplayerENet::create_server(int p_port, int p_max_clients, int
 	connection_status = CONNECTION_CONNECTED;
 	return OK;
 }
-
-Error NetworkedMultiplayerENet::create_client(const String &p_address, int p_port, int p_in_bandwidth, int p_out_bandwidth, int p_client_port) {
+Error NetworkedMultiplayerENet::create_client(const String &p_address, int p_port, int p_in_bandwidth, int p_out_bandwidth, int p_local_port) {
 	ERR_FAIL_COND_V_MSG(active, ERR_ALREADY_IN_USE, "The multiplayer instance is already active.");
-	ERR_FAIL_COND_V_MSG(p_port < 0 || p_port > 65535, ERR_INVALID_PARAMETER, "The server port number must be set between 0 and 65535 (inclusive).");
-	ERR_FAIL_COND_V_MSG(p_client_port < 0 || p_client_port > 65535, ERR_INVALID_PARAMETER, "The client port number must be set between 0 and 65535 (inclusive).");
+	ERR_FAIL_COND_V_MSG(p_port < 1 || p_port > 65535, ERR_INVALID_PARAMETER, "The remote port number must be between 1 and 65535 (inclusive).");
+	ERR_FAIL_COND_V_MSG(p_local_port < 0 || p_local_port > 65535, ERR_INVALID_PARAMETER, "The local port number must be between 0 and 65535 (inclusive).");
 	ERR_FAIL_COND_V_MSG(p_in_bandwidth < 0, ERR_INVALID_PARAMETER, "The incoming bandwidth limit must be greater than or equal to 0 (0 disables the limit).");
 	ERR_FAIL_COND_V_MSG(p_out_bandwidth < 0, ERR_INVALID_PARAMETER, "The outgoing bandwidth limit must be greater than or equal to 0 (0 disables the limit).");
 
-	if (p_client_port != 0) {
-		ENetAddress c_client;
+	ENetAddress c_client;
 
 #ifdef GODOT_ENET
-		if (bind_ip.is_wildcard()) {
-			c_client.wildcard = 1;
-		} else {
-			enet_address_set_ip(&c_client, bind_ip.get_ipv6(), 16);
-		}
+	if (bind_ip.is_wildcard()) {
+		c_client.wildcard = 1;
+	} else {
+		enet_address_set_ip(&c_client, bind_ip.get_ipv6(), 16);
+	}
 #else
-		if (bind_ip.is_wildcard()) {
-			c_client.host = 0;
-		} else {
-			ERR_FAIL_COND_V_MSG(!bind_ip.is_ipv4(), ERR_INVALID_PARAMETER, "Wildcard IP addresses are only permitted in IPv4, not IPv6.");
-			c_client.host = *(uint32_t *)bind_ip.get_ipv4();
-		}
+	if (bind_ip.is_wildcard()) {
+		c_client.host = 0;
+	} else {
+		ERR_FAIL_COND_V_MSG(!bind_ip.is_ipv4(), ERR_INVALID_PARAMETER, "Wildcard IP addresses are only permitted in IPv4, not IPv6.");
+		c_client.host = *(uint32_t *)bind_ip.get_ipv4();
+	}
 #endif
 
-		c_client.port = p_client_port;
+	c_client.port = p_local_port;
 
-		host = enet_host_create(&c_client /* create a client host */,
-				1 /* only allow 1 outgoing connection */,
-				channel_count /* allow up to channel_count to be used */,
-				p_in_bandwidth /* limit incoming bandwidth if > 0 */,
-				p_out_bandwidth /* limit outgoing bandwidth if > 0 */);
-	} else {
-		host = enet_host_create(nullptr /* create a client host */,
-				1 /* only allow 1 outgoing connection */,
-				channel_count /* allow up to channel_count to be used */,
-				p_in_bandwidth /* limit incoming bandwidth if > 0 */,
-				p_out_bandwidth /* limit outgoing bandwidth if > 0 */);
-	}
+	host = enet_host_create(&c_client /* create a client host */,
+			1 /* only allow 1 outgoing connection */,
+			channel_count /* allow up to channel_count to be used */,
+			p_in_bandwidth /* limit incoming bandwidth if > 0 */,
+			p_out_bandwidth /* limit outgoing bandwidth if > 0 */);
 
 	ERR_FAIL_COND_V_MSG(!host, ERR_CANT_CREATE, "Couldn't create the ENet client host.");
 #ifdef GODOT_ENET
@@ -562,7 +553,7 @@ Error NetworkedMultiplayerENet::put_packet(const uint8_t *p_buffer, int p_buffer
 	ENetPacket *packet = enet_packet_create(nullptr, p_buffer_size + 8, packet_flags);
 	encode_uint32(unique_id, &packet->data[0]); // Source ID
 	encode_uint32(target_peer, &packet->data[4]); // Dest ID
-	copymem(&packet->data[8], p_buffer, p_buffer_size);
+	memcpy(&packet->data[8], p_buffer, p_buffer_size);
 
 	if (server) {
 		if (target_peer == 0) {
@@ -673,7 +664,7 @@ size_t NetworkedMultiplayerENet::enet_compress(void *context, const ENetBuffer *
 	while (total) {
 		for (size_t i = 0; i < inBufferCount; i++) {
 			int to_copy = MIN(total, int(inBuffers[i].dataLength));
-			copymem(&enet->src_compressor_mem.write[ofs], inBuffers[i].data, to_copy);
+			memcpy(&enet->src_compressor_mem.write[ofs], inBuffers[i].data, to_copy);
 			ofs += to_copy;
 			total -= to_copy;
 		}
@@ -710,7 +701,7 @@ size_t NetworkedMultiplayerENet::enet_compress(void *context, const ENetBuffer *
 		return 0; // Do not bother
 	}
 
-	copymem(outData, enet->dst_compressor_mem.ptr(), ret);
+	memcpy(outData, enet->dst_compressor_mem.ptr(), ret);
 
 	return ret;
 }
@@ -784,6 +775,11 @@ int NetworkedMultiplayerENet::get_peer_port(int p_peer_id) const {
 #endif
 }
 
+int NetworkedMultiplayerENet::get_local_port() const { // Reports the port stored in the local ENet host's address.
+	ERR_FAIL_COND_V_MSG(!active || !host, 0, "The multiplayer instance isn't currently active."); // Without an active host there is no port to read; 0 is returned.
+	return host->address.port;
+}
+
 void NetworkedMultiplayerENet::set_peer_timeout(int p_peer_id, int p_timeout_limit, int p_timeout_min, int p_timeout_max) {
 	ERR_FAIL_COND_MSG(!peer_map.has(p_peer_id), vformat("Peer ID %d not found in the list of peers.", p_peer_id));
 	ERR_FAIL_COND_MSG(!is_server() && p_peer_id != 1, "Can't change the timeout of peers other then the server when acting as a client.");
@@ -832,7 +828,7 @@ bool NetworkedMultiplayerENet::is_server_relay_enabled() const {
 
 void NetworkedMultiplayerENet::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("create_server", "port", "max_clients", "in_bandwidth", "out_bandwidth"), &NetworkedMultiplayerENet::create_server, DEFVAL(32), DEFVAL(0), DEFVAL(0));
-	ClassDB::bind_method(D_METHOD("create_client", "address", "port", "in_bandwidth", "out_bandwidth", "client_port"), &NetworkedMultiplayerENet::create_client, DEFVAL(0), DEFVAL(0), DEFVAL(0));
+	ClassDB::bind_method(D_METHOD("create_client", "address", "port", "in_bandwidth", "out_bandwidth", "local_port"), &NetworkedMultiplayerENet::create_client, DEFVAL(0), DEFVAL(0), DEFVAL(0));
 	ClassDB::bind_method(D_METHOD("close_connection", "wait_usec"), &NetworkedMultiplayerENet::close_connection, DEFVAL(100));
 	ClassDB::bind_method(D_METHOD("disconnect_peer", "id", "now"), &NetworkedMultiplayerENet::disconnect_peer, DEFVAL(false));
 	ClassDB::bind_method(D_METHOD("set_compression_mode", "mode"), &NetworkedMultiplayerENet::set_compression_mode);
@@ -846,6 +842,7 @@ void NetworkedMultiplayerENet::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("is_dtls_verify_enabled"), &NetworkedMultiplayerENet::is_dtls_verify_enabled);
 	ClassDB::bind_method(D_METHOD("get_peer_address", "id"), &NetworkedMultiplayerENet::get_peer_address);
 	ClassDB::bind_method(D_METHOD("get_peer_port", "id"), &NetworkedMultiplayerENet::get_peer_port);
+	ClassDB::bind_method(D_METHOD("get_local_port"), &NetworkedMultiplayerENet::get_local_port);
 	ClassDB::bind_method(D_METHOD("set_peer_timeout", "id", "timeout_limit", "timeout_min", "timeout_max"), &NetworkedMultiplayerENet::set_peer_timeout);
 
 	ClassDB::bind_method(D_METHOD("get_packet_channel"), &NetworkedMultiplayerENet::get_packet_channel);
diff --git a/modules/enet/networked_multiplayer_enet.h b/modules/enet/networked_multiplayer_enet.h
index b99b14d218..c589cd9fbf 100644
--- a/modules/enet/networked_multiplayer_enet.h
+++ b/modules/enet/networked_multiplayer_enet.h
@@ -127,10 +127,11 @@ public:
 
 	virtual IP_Address get_peer_address(int p_peer_id) const;
 	virtual int get_peer_port(int p_peer_id) const;
+	virtual int get_local_port() const;
 	void set_peer_timeout(int p_peer_id, int p_timeout_limit, int p_timeout_min, int p_timeout_max);
 
 	Error create_server(int p_port, int p_max_clients = 32, int p_in_bandwidth = 0, int p_out_bandwidth = 0);
-	Error create_client(const String &p_address, int p_port, int p_in_bandwidth = 0, int p_out_bandwidth = 0, int p_client_port = 0);
+	Error create_client(const String &p_address, int p_port, int p_in_bandwidth = 0, int p_out_bandwidth = 0, int p_local_port = 0);
 
 	void close_connection(uint32_t wait_usec = 100);
 
diff --git a/modules/etc/SCsub b/modules/etc/SCsub
deleted file mode 100644
index 9b46f17916..0000000000
--- a/modules/etc/SCsub
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env python
-
-Import("env")
-Import("env_modules")
-
-env_etc = env_modules.Clone()
-
-# Thirdparty source files
-
-thirdparty_obj = []
-
-# Not unbundled so far since not widespread as shared library
-thirdparty_dir = "#thirdparty/etc2comp/"
-thirdparty_sources = [
-    "EtcBlock4x4.cpp",
-    "EtcBlock4x4Encoding.cpp",
-    "EtcBlock4x4Encoding_ETC1.cpp",
-    "EtcBlock4x4Encoding_R11.cpp",
-    "EtcBlock4x4Encoding_RG11.cpp",
-    "EtcBlock4x4Encoding_RGB8A1.cpp",
-    "EtcBlock4x4Encoding_RGB8.cpp",
-    "EtcBlock4x4Encoding_RGBA8.cpp",
-    "Etc.cpp",
-    "EtcDifferentialTrys.cpp",
-    "EtcFilter.cpp",
-    "EtcImage.cpp",
-    "EtcIndividualTrys.cpp",
-    "EtcMath.cpp",
-    "EtcSortedBlockList.cpp",
-]
-thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources]
-
-env_etc.Prepend(CPPPATH=[thirdparty_dir])
-
-env_thirdparty = env_etc.Clone()
-env_thirdparty.disable_warnings()
-env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources)
-env.modules_sources += thirdparty_obj
-
-# Godot source files
-
-module_obj = []
-
-env_etc.add_source_files(module_obj, "*.cpp")
-env.modules_sources += module_obj
-
-# Needed to force rebuilding the module files when the thirdparty library is updated.
-env.Depends(module_obj, thirdparty_obj)
diff --git a/modules/etc/image_compress_etc.cpp b/modules/etc/image_compress_etc.cpp
deleted file mode 100644
index 41cbbe3f54..0000000000
--- a/modules/etc/image_compress_etc.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-/*************************************************************************/
-/*  image_compress_etc.cpp                                               */
-/*************************************************************************/
-/*                       This file is part of:                           */
-/*                           GODOT ENGINE                                */
-/*                      https://godotengine.org                          */
-/*************************************************************************/
-/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
-/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
-/*                                                                       */
-/* Permission is hereby granted, free of charge, to any person obtaining */
-/* a copy of this software and associated documentation files (the       */
-/* "Software"), to deal in the Software without restriction, including   */
-/* without limitation the rights to use, copy, modify, merge, publish,   */
-/* distribute, sublicense, and/or sell copies of the Software, and to    */
-/* permit persons to whom the Software is furnished to do so, subject to */
-/* the following conditions:                                             */
-/*                                                                       */
-/* The above copyright notice and this permission notice shall be        */
-/* included in all copies or substantial portions of the Software.       */
-/*                                                                       */
-/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
-/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
-/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
-/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
-/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
-/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
-/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
-/*************************************************************************/
-
-#include "image_compress_etc.h"
-
-#include "core/io/image.h"
-#include "core/os/copymem.h"
-#include "core/os/os.h"
-#include "core/string/print_string.h"
-
-#include <Etc.h>
-#include <EtcFilter.h>
-
-static Image::Format _get_etc2_mode(Image::UsedChannels format) {
-	switch (format) {
-		case Image::USED_CHANNELS_R:
-			return Image::FORMAT_ETC2_R11;
-
-		case Image::USED_CHANNELS_RG:
-			return Image::FORMAT_ETC2_RG11;
-
-		case Image::USED_CHANNELS_RGB:
-			return Image::FORMAT_ETC2_RGB8;
-
-		case Image::USED_CHANNELS_RGBA:
-			return Image::FORMAT_ETC2_RGBA8;
-
-		// TODO: would be nice if we could use FORMAT_ETC2_RGB8A1 for FORMAT_RGBA5551
-		default:
-			// TODO: Kept for compatibility, but should be investigated whether it's correct or if it should error out
-			return Image::FORMAT_ETC2_RGBA8;
-	}
-}
-
-static Etc::Image::Format _image_format_to_etc2comp_format(Image::Format format) {
-	switch (format) {
-		case Image::FORMAT_ETC:
-			return Etc::Image::Format::ETC1;
-
-		case Image::FORMAT_ETC2_R11:
-			return Etc::Image::Format::R11;
-
-		case Image::FORMAT_ETC2_R11S:
-			return Etc::Image::Format::SIGNED_R11;
-
-		case Image::FORMAT_ETC2_RG11:
-			return Etc::Image::Format::RG11;
-
-		case Image::FORMAT_ETC2_RG11S:
-			return Etc::Image::Format::SIGNED_RG11;
-
-		case Image::FORMAT_ETC2_RGB8:
-			return Etc::Image::Format::RGB8;
-
-		case Image::FORMAT_ETC2_RGBA8:
-			return Etc::Image::Format::RGBA8;
-
-		case Image::FORMAT_ETC2_RGB8A1:
-			return Etc::Image::Format::RGB8A1;
-
-		default:
-			ERR_FAIL_V(Etc::Image::Format::UNKNOWN);
-	}
-}
-
-static void _compress_etc(Image *p_img, float p_lossy_quality, bool force_etc1_format, Image::UsedChannels p_channels) {
-	Image::Format img_format = p_img->get_format();
-
-	if (img_format >= Image::FORMAT_DXT1) {
-		return; //do not compress, already compressed
-	}
-
-	if (img_format > Image::FORMAT_RGBA8) {
-		// TODO: we should be able to handle FORMAT_RGBA4444 and FORMAT_RGBA5551 eventually
-		return;
-	}
-
-	// FIXME: Commented out during Vulkan rebase.
-	/*
-	if (force_etc1_format) {
-		// If VRAM compression is using ETC, but image has alpha, convert to RGBA4444 or LA8
-		// This saves space while maintaining the alpha channel
-		if (detected_channels == Image::USED_CHANNELS_RGBA) {
-			if (p_img->has_mipmaps()) {
-				// Image doesn't support mipmaps with RGBA4444 textures
-				p_img->clear_mipmaps();
-			}
-			p_img->convert(Image::FORMAT_RGBA4444);
-			return;
-		} else if (detected_channels == Image::USE_CHANNELS_LA) {
-			p_img->convert(Image::FORMAT_LA8);
-			return;
-		}
-	}
-	*/
-
-	uint32_t imgw = p_img->get_width(), imgh = p_img->get_height();
-
-	Image::Format etc_format = force_etc1_format ? Image::FORMAT_ETC : _get_etc2_mode(p_channels);
-
-	Ref<Image> img = p_img->duplicate();
-
-	if (img->get_format() != Image::FORMAT_RGBA8) {
-		img->convert(Image::FORMAT_RGBA8); //still uses RGBA to convert
-	}
-
-	if (img->has_mipmaps()) {
-		if (next_power_of_2(imgw) != imgw || next_power_of_2(imgh) != imgh) {
-			img->resize_to_po2();
-			imgw = img->get_width();
-			imgh = img->get_height();
-		}
-	} else {
-		if (imgw % 4 != 0 || imgh % 4 != 0) {
-			if (imgw % 4) {
-				imgw += 4 - imgw % 4;
-			}
-			if (imgh % 4) {
-				imgh += 4 - imgh % 4;
-			}
-
-			img->resize(imgw, imgh);
-		}
-	}
-
-	const uint8_t *r = img->get_data().ptr();
-	ERR_FAIL_COND(!r);
-
-	unsigned int target_size = Image::get_image_data_size(imgw, imgh, etc_format, p_img->has_mipmaps());
-	int mmc = 1 + (p_img->has_mipmaps() ? Image::get_image_required_mipmaps(imgw, imgh, etc_format) : 0);
-
-	Vector<uint8_t> dst_data;
-	dst_data.resize(target_size);
-
-	uint8_t *w = dst_data.ptrw();
-
-	// prepare parameters to be passed to etc2comp
-	int num_cpus = OS::get_singleton()->get_processor_count();
-	int encoding_time = 0;
-	float effort = 0.0; //default, reasonable time
-
-	if (p_lossy_quality > 0.95) {
-		effort = 80;
-	} else if (p_lossy_quality > 0.85) {
-		effort = 60;
-	} else if (p_lossy_quality > 0.75) {
-		effort = 40;
-	}
-
-	Etc::ErrorMetric error_metric = Etc::ErrorMetric::RGBX; // NOTE: we can experiment with other error metrics
-	Etc::Image::Format etc2comp_etc_format = _image_format_to_etc2comp_format(etc_format);
-
-	int wofs = 0;
-
-	print_verbose("ETC: Begin encoding, format: " + Image::get_format_name(etc_format));
-	uint64_t t = OS::get_singleton()->get_ticks_msec();
-	for (int i = 0; i < mmc; i++) {
-		// convert source image to internal etc2comp format (which is equivalent to Image::FORMAT_RGBAF)
-		// NOTE: We can alternatively add a case to Image::convert to handle Image::FORMAT_RGBAF conversion.
-		int mipmap_ofs = 0, mipmap_size = 0, mipmap_w = 0, mipmap_h = 0;
-		img->get_mipmap_offset_size_and_dimensions(i, mipmap_ofs, mipmap_size, mipmap_w, mipmap_h);
-		const uint8_t *src = &r[mipmap_ofs];
-
-		Etc::ColorFloatRGBA *src_rgba_f = new Etc::ColorFloatRGBA[mipmap_w * mipmap_h];
-		for (int j = 0; j < mipmap_w * mipmap_h; j++) {
-			int si = j * 4; // RGBA8
-			src_rgba_f[j] = Etc::ColorFloatRGBA::ConvertFromRGBA8(src[si], src[si + 1], src[si + 2], src[si + 3]);
-		}
-
-		unsigned char *etc_data = nullptr;
-		unsigned int etc_data_len = 0;
-		unsigned int extended_width = 0, extended_height = 0;
-		Etc::Encode((float *)src_rgba_f, mipmap_w, mipmap_h, etc2comp_etc_format, error_metric, effort, num_cpus, num_cpus, &etc_data, &etc_data_len, &extended_width, &extended_height, &encoding_time);
-
-		CRASH_COND(wofs + etc_data_len > target_size);
-		memcpy(&w[wofs], etc_data, etc_data_len);
-		wofs += etc_data_len;
-
-		delete[] etc_data;
-		delete[] src_rgba_f;
-	}
-
-	print_verbose("ETC: Time encoding: " + rtos(OS::get_singleton()->get_ticks_msec() - t));
-
-	p_img->create(imgw, imgh, p_img->has_mipmaps(), etc_format, dst_data);
-}
-
-static void _compress_etc1(Image *p_img, float p_lossy_quality) {
-	_compress_etc(p_img, p_lossy_quality, true, Image::USED_CHANNELS_RGB);
-}
-
-static void _compress_etc2(Image *p_img, float p_lossy_quality, Image::UsedChannels p_channels) {
-	_compress_etc(p_img, p_lossy_quality, false, p_channels);
-}
-
-void _register_etc_compress_func() {
-	Image::_image_compress_etc1_func = _compress_etc1;
-	Image::_image_compress_etc2_func = _compress_etc2;
-}
diff --git a/modules/etc/texture_loader_pkm.cpp b/modules/etc/texture_loader_pkm.cpp
deleted file mode 100644
index 95db9315d5..0000000000
--- a/modules/etc/texture_loader_pkm.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/*************************************************************************/
-/*  texture_loader_pkm.cpp                                               */
-/*************************************************************************/
-/*                       This file is part of:                           */
-/*                           GODOT ENGINE                                */
-/*                      https://godotengine.org                          */
-/*************************************************************************/
-/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
-/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
-/*                                                                       */
-/* Permission is hereby granted, free of charge, to any person obtaining */
-/* a copy of this software and associated documentation files (the       */
-/* "Software"), to deal in the Software without restriction, including   */
-/* without limitation the rights to use, copy, modify, merge, publish,   */
-/* distribute, sublicense, and/or sell copies of the Software, and to    */
-/* permit persons to whom the Software is furnished to do so, subject to */
-/* the following conditions:                                             */
-/*                                                                       */
-/* The above copyright notice and this permission notice shall be        */
-/* included in all copies or substantial portions of the Software.       */
-/*                                                                       */
-/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
-/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
-/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
-/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
-/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
-/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
-/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
-/*************************************************************************/
-
-#include "texture_loader_pkm.h"
-
-#include "core/os/file_access.h"
-#include <string.h>
-
-struct ETC1Header {
-	char tag[6]; // "PKM 10"
-	uint16_t format = 0; // Format == number of mips (== zero)
-	uint16_t texWidth = 0; // Texture dimensions, multiple of 4 (big-endian)
-	uint16_t texHeight = 0;
-	uint16_t origWidth = 0; // Original dimensions (big-endian)
-	uint16_t origHeight = 0;
-};
-
-RES ResourceFormatPKM::load(const String &p_path, const String &p_original_path, Error *r_error, bool p_use_sub_threads, float *r_progress, CacheMode p_cache_mode) {
-	if (r_error) {
-		*r_error = ERR_CANT_OPEN;
-	}
-
-	Error err;
-	FileAccess *f = FileAccess::open(p_path, FileAccess::READ, &err);
-	if (!f) {
-		return RES();
-	}
-
-	FileAccessRef fref(f);
-	if (r_error) {
-		*r_error = ERR_FILE_CORRUPT;
-	}
-
-	ERR_FAIL_COND_V_MSG(err != OK, RES(), "Unable to open PKM texture file '" + p_path + "'.");
-
-	// big endian
-	f->set_endian_swap(true);
-
-	ETC1Header h;
-	f->get_buffer((uint8_t *)&h.tag, sizeof(h.tag));
-	ERR_FAIL_COND_V_MSG(strncmp(h.tag, "PKM 10", sizeof(h.tag)), RES(), "Invalid or unsupported PKM texture file '" + p_path + "'.");
-
-	h.format = f->get_16();
-	h.texWidth = f->get_16();
-	h.texHeight = f->get_16();
-	h.origWidth = f->get_16();
-	h.origHeight = f->get_16();
-
-	Vector<uint8_t> src_data;
-
-	uint32_t size = h.texWidth * h.texHeight / 2;
-	src_data.resize(size);
-	uint8_t *wb = src_data.ptrw();
-	f->get_buffer(wb, size);
-
-	int mipmaps = h.format;
-	int width = h.origWidth;
-	int height = h.origHeight;
-
-	Ref<Image> img = memnew(Image(width, height, mipmaps, Image::FORMAT_ETC, src_data));
-
-	Ref<ImageTexture> texture = memnew(ImageTexture);
-	texture->create_from_image(img);
-
-	if (r_error) {
-		*r_error = OK;
-	}
-
-	f->close();
-	memdelete(f);
-	return texture;
-}
-
-void ResourceFormatPKM::get_recognized_extensions(List<String> *p_extensions) const {
-	p_extensions->push_back("pkm");
-}
-
-bool ResourceFormatPKM::handles_type(const String &p_type) const {
-	return ClassDB::is_parent_class(p_type, "Texture2D");
-}
-
-String ResourceFormatPKM::get_resource_type(const String &p_path) const {
-	if (p_path.get_extension().to_lower() == "pkm") {
-		return "ImageTexture";
-	}
-	return "";
-}
diff --git a/modules/etcpak/SCsub b/modules/etcpak/SCsub
new file mode 100644
index 0000000000..2d3b69be75
--- /dev/null
+++ b/modules/etcpak/SCsub
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+
+Import("env")
+Import("env_modules")
+
+env_etcpak = env_modules.Clone()
+
+# Bundled etcpak encoder sources, compiled with warnings disabled.
+
+thirdparty_dir = "#thirdparty/etcpak/"
+thirdparty_names = [
+    "Dither.cpp",
+    "ProcessDxtc.cpp",
+    "ProcessRGB.cpp",
+    "Tables.cpp",
+]
+thirdparty_sources = [thirdparty_dir + name for name in thirdparty_names]
+
+env_etcpak.Prepend(CPPPATH=[thirdparty_dir])
+
+thirdparty_obj = []
+
+env_thirdparty = env_etcpak.Clone()
+env_thirdparty.disable_warnings()
+env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources)
+env.modules_sources += thirdparty_obj
+
+# The module's own sources.
+
+module_obj = []
+
+env_etcpak.add_source_files(module_obj, "*.cpp")
+env.modules_sources += module_obj
+
+# Rebuild the module objects whenever the thirdparty library objects change.
+env.Depends(module_obj, thirdparty_obj)
diff --git a/modules/etc/config.py b/modules/etcpak/config.py
index 53b8f2f2e3..53b8f2f2e3 100644
--- a/modules/etc/config.py
+++ b/modules/etcpak/config.py
diff --git a/modules/etcpak/image_compress_etcpak.cpp b/modules/etcpak/image_compress_etcpak.cpp
new file mode 100644
index 0000000000..abc3c26188
--- /dev/null
+++ b/modules/etcpak/image_compress_etcpak.cpp
@@ -0,0 +1,184 @@
+/*************************************************************************/
+/*  image_compress_etcpak.cpp                                            */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "image_compress_etcpak.h"
+
+#include "core/os/os.h"
+#include "core/string/print_string.h"
+
+#include "thirdparty/etcpak/ProcessDxtc.hpp"
+#include "thirdparty/etcpak/ProcessRGB.hpp"
+
+// Selects the etcpak ETC encoder variant for the channels an image uses.
+EtcpakType _determine_etc_type(Image::UsedChannels p_channels) {
+	switch (p_channels) {
+		case Image::USED_CHANNELS_L:
+			return EtcpakType::ETCPAK_TYPE_ETC1;
+		// Red-only and RGB images share the plain ETC2 variant.
+		case Image::USED_CHANNELS_R:
+		case Image::USED_CHANNELS_RGB:
+			return EtcpakType::ETCPAK_TYPE_ETC2;
+		case Image::USED_CHANNELS_RG:
+			return EtcpakType::ETCPAK_TYPE_ETC2_RA_AS_RG;
+		// LA, RGBA and any unlisted value use the alpha variant.
+		case Image::USED_CHANNELS_LA:
+		case Image::USED_CHANNELS_RGBA:
+		default:
+			return EtcpakType::ETCPAK_TYPE_ETC2_ALPHA;
+	}
+}
+
+// Selects the etcpak DXT encoder variant for the channels an image uses.
+EtcpakType _determine_dxt_type(Image::UsedChannels p_channels) {
+	switch (p_channels) {
+		case Image::USED_CHANNELS_L:
+			return EtcpakType::ETCPAK_TYPE_DXT1;
+		case Image::USED_CHANNELS_RG:
+			return EtcpakType::ETCPAK_TYPE_DXT5_RA_AS_RG;
+		// Everything else (LA, R, RGB, RGBA and any unlisted value) is
+		// encoded as DXT5.
+		case Image::USED_CHANNELS_LA:
+		case Image::USED_CHANNELS_R:
+		case Image::USED_CHANNELS_RGB:
+		case Image::USED_CHANNELS_RGBA:
+		default:
+			return EtcpakType::ETCPAK_TYPE_DXT5;
+	}
+}
+
+// Entry points installed on Image; each picks an encoder variant and delegates.
+void _compress_etc1(Image *r_img, float p_lossy_quality) {
+	_compress_etcpak(EtcpakType::ETCPAK_TYPE_ETC1, r_img, p_lossy_quality);
+}
+
+void _compress_etc2(Image *r_img, float p_lossy_quality, Image::UsedChannels p_channels) {
+	_compress_etcpak(_determine_etc_type(p_channels), r_img, p_lossy_quality);
+}
+
+// Same as _compress_etc2, but the variant comes from _determine_dxt_type().
+void _compress_bc(Image *r_img, float p_lossy_quality, Image::UsedChannels p_channels) {
+	_compress_etcpak(_determine_dxt_type(p_channels), r_img, p_lossy_quality);
+}
+
+// Compresses r_img in place to the format selected by p_compresstype, including
+// its mipmaps. Images that are already compressed, or in a format past RGBA8,
+// are left untouched. p_lossy_quality is currently unused.
+void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_quality) {
+	uint64_t start_time = OS::get_singleton()->get_ticks_msec();
+
+	// TODO: See how to handle lossy quality.
+	Image::Format img_format = r_img->get_format();
+	if (img_format >= Image::FORMAT_DXT1) {
+		return; // Do not compress, already compressed.
+	}
+	if (img_format > Image::FORMAT_RGBA8) {
+		// TODO: we should be able to handle FORMAT_RGBA4444 and FORMAT_RGBA5551 eventually
+		return;
+	}
+
+	// Use RGBA8 to convert.
+	if (img_format != Image::FORMAT_RGBA8) {
+		r_img->convert(Image::FORMAT_RGBA8);
+	}
+
+	// Determine output format based on Etcpak type.
+	Image::Format target_format = Image::FORMAT_RGBA8;
+	if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC1) {
+		target_format = Image::FORMAT_ETC;
+	} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC2) {
+		target_format = Image::FORMAT_ETC2_RGB8;
+	} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC2_RA_AS_RG) {
+		target_format = Image::FORMAT_ETC2_RA_AS_RG;
+		r_img->convert_rg_to_ra_rgba8();
+	} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC2_ALPHA) {
+		target_format = Image::FORMAT_ETC2_RGBA8;
+	} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT1) {
+		target_format = Image::FORMAT_DXT1;
+	} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5_RA_AS_RG) {
+		target_format = Image::FORMAT_DXT5_RA_AS_RG;
+		r_img->convert_rg_to_ra_rgba8();
+	} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5) {
+		target_format = Image::FORMAT_DXT5;
+	} else {
+		ERR_FAIL_MSG("Invalid or unsupported Etcpak compression format.");
+	}
+
+	// Compress image data and (if required) mipmaps.
+	const bool mipmaps = r_img->has_mipmaps();
+	const int width = r_img->get_width();
+	const int height = r_img->get_height();
+	const uint8_t *src_read = r_img->get_data().ptr();
+	print_verbose(vformat("ETCPAK: Encoding image size %dx%d to format %s.", width, height, Image::get_format_name(target_format)));
+
+	int dest_size = Image::get_image_data_size(width, height, target_format, mipmaps);
+	Vector<uint8_t> dest_data;
+	dest_data.resize(dest_size);
+	uint8_t *dest_write = dest_data.ptrw();
+	int mip_count = mipmaps ? Image::get_image_required_mipmaps(width, height, target_format) : 0;
+
+	for (int i = 0; i < mip_count + 1; i++) {
+		// Get write mip metrics for target image.
+		int mip_w, mip_h;
+		int mip_ofs = Image::get_image_mipmap_offset_and_dimensions(width, height, target_format, i, mip_w, mip_h);
+		// Ensure that mip offset is a multiple of 8 (etcpak expects uint64_t pointer).
+		ERR_FAIL_COND(mip_ofs % 8 != 0);
+		uint64_t *dest_mip_write = (uint64_t *)&dest_write[mip_ofs];
+
+		// Block size. Align stride to multiple of 4 (RGBA8).
+		mip_w = (mip_w + 3) & ~3;
+		mip_h = (mip_h + 3) & ~3;
+		const uint32_t blocks = mip_w * mip_h / 16;
+
+		// Source mip to read. NOTE(review): it is read with the padded mip_w; confirm non-multiple-of-4 sizes are handled.
+		int src_mip_ofs = r_img->get_mipmap_offset(i);
+		const uint32_t *src_mip_read = (const uint32_t *)&src_read[src_mip_ofs];
+
+		if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC1) {
+			CompressEtc1RgbDither(src_mip_read, dest_mip_write, blocks, mip_w);
+		} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC2) {
+			CompressEtc2Rgb(src_mip_read, dest_mip_write, blocks, mip_w);
+		} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_ETC2_ALPHA || p_compresstype == EtcpakType::ETCPAK_TYPE_ETC2_RA_AS_RG) {
+			// RA_AS_RG was repacked by convert_rg_to_ra_rgba8() above, so alpha must be encoded too (as for DXT5_RA_AS_RG).
+			CompressEtc2Rgba(src_mip_read, dest_mip_write, blocks, mip_w);
+		} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT1) {
+			CompressDxt1Dither(src_mip_read, dest_mip_write, blocks, mip_w);
+		} else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5 || p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5_RA_AS_RG) {
+			CompressDxt5(src_mip_read, dest_mip_write, blocks, mip_w);
+		} else {
+			ERR_FAIL_MSG("Invalid or unsupported Etcpak compression format.");
+		}
+	}
+
+	// Replace original image with compressed one.
+	r_img->create(width, height, mipmaps, target_format, dest_data);
+
+	print_verbose(vformat("ETCPAK encode took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time)));
+}
diff --git a/core/os/copymem.h b/modules/etcpak/image_compress_etcpak.h
index 6fd559356c..ccf157fada 100644
--- a/core/os/copymem.h
+++ b/modules/etcpak/image_compress_etcpak.h
@@ -1,5 +1,5 @@
 /*************************************************************************/
-/*  copymem.h                                                            */
+/*  image_compress_etcpak.h                                              */
 /*************************************************************************/
 /*                       This file is part of:                           */
 /*                           GODOT ENGINE                                */
@@ -28,23 +28,25 @@
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
 
-#ifndef COPYMEM_H
-#define COPYMEM_H
+#ifndef IMAGE_COMPRESS_ETCPAK_H
+#define IMAGE_COMPRESS_ETCPAK_H
 
-#include "core/typedefs.h"
+#include "core/io/image.h"
 
-#ifdef PLATFORM_COPYMEM
+enum class EtcpakType {
+	ETCPAK_TYPE_ETC1, // Encoded to Image::FORMAT_ETC.
+	ETCPAK_TYPE_ETC2, // Encoded to Image::FORMAT_ETC2_RGB8.
+	ETCPAK_TYPE_ETC2_ALPHA, // Encoded to Image::FORMAT_ETC2_RGBA8.
+	ETCPAK_TYPE_ETC2_RA_AS_RG, // Encoded to Image::FORMAT_ETC2_RA_AS_RG.
+	ETCPAK_TYPE_DXT1, // Encoded to Image::FORMAT_DXT1.
+	ETCPAK_TYPE_DXT5, // Encoded to Image::FORMAT_DXT5.
+	ETCPAK_TYPE_DXT5_RA_AS_RG, // Encoded to Image::FORMAT_DXT5_RA_AS_RG.
+};
 
-#include "platform_copymem.h" // included from platform/<current_platform>/platform_copymem.h"
+void _compress_etc1(Image *r_img, float p_lossy_quality);
+void _compress_etc2(Image *r_img, float p_lossy_quality, Image::UsedChannels p_channels);
+void _compress_bc(Image *r_img, float p_lossy_quality, Image::UsedChannels p_channels);
 
-#else
+void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_quality);
 
-#include <string.h>
-
-#define copymem(to, from, count) memcpy(to, from, count)
-#define zeromem(to, count) memset(to, 0, count)
-#define movemem(to, from, count) memmove(to, from, count)
-
-#endif
-
-#endif // COPYMEM_H
+#endif // IMAGE_COMPRESS_ETCPAK_H
diff --git a/modules/etcpak/register_types.cpp b/modules/etcpak/register_types.cpp
new file mode 100644
index 0000000000..d57d2f747a
--- /dev/null
+++ b/modules/etcpak/register_types.cpp
@@ -0,0 +1,44 @@
+/*************************************************************************/
+/*  register_types.cpp                                                   */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "register_types.h"
+
+#include "image_compress_etcpak.h"
+
+// Points Image's ETC1, ETC2 and BC compression hooks at the etcpak encoders.
+void register_etcpak_types() {
+	Image::_image_compress_bc_func = _compress_bc;
+	Image::_image_compress_etc2_func = _compress_etc2;
+	Image::_image_compress_etc1_func = _compress_etc1;
+}
+
+// Does nothing; the hooks set by register_etcpak_types() are not reset here.
+void unregister_etcpak_types() {
+}
diff --git a/modules/etc/image_compress_etc.h b/modules/etcpak/register_types.h
index 44a06194e9..a9e10a4aae 100644
--- a/modules/etc/image_compress_etc.h
+++ b/modules/etcpak/register_types.h
@@ -1,5 +1,5 @@
 /*************************************************************************/
-/*  image_compress_etc.h                                                 */
+/*  register_types.h                                                     */
 /*************************************************************************/
 /*                       This file is part of:                           */
 /*                           GODOT ENGINE                                */
@@ -28,9 +28,10 @@
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
 
-#ifndef IMAGE_COMPRESS_ETC_H
-#define IMAGE_COMPRESS_ETC_H
+#ifndef ETCPAK_REGISTER_TYPES_H
+#define ETCPAK_REGISTER_TYPES_H
 
-void _register_etc_compress_func();
+void register_etcpak_types();
+void unregister_etcpak_types();
 
-#endif // IMAGE_COMPRESS_ETC_H
+#endif // ETCPAK_REGISTER_TYPES_H
diff --git a/modules/fbx/SCsub b/modules/fbx/SCsub
index 84220a66fa..0311fddfee 100644
--- a/modules/fbx/SCsub
+++ b/modules/fbx/SCsub
@@ -8,6 +8,9 @@ env_fbx = env_modules.Clone()
 # Make includes relative to the folder path specified here so our includes are clean
 env_fbx.Prepend(CPPPATH=["#modules/fbx/"])
 
+if env["builtin_zlib"]:
+    env_fbx.Prepend(CPPPATH=["#thirdparty/zlib/"])
+
 # Godot's own source files
 env_fbx.add_source_files(env.modules_sources, "tools/*.cpp")
 env_fbx.add_source_files(env.modules_sources, "data/*.cpp")
diff --git a/modules/fbx/data/fbx_material.cpp b/modules/fbx/data/fbx_material.cpp
index 5995097b2f..d54ac86e9f 100644
--- a/modules/fbx/data/fbx_material.cpp
+++ b/modules/fbx/data/fbx_material.cpp
@@ -277,7 +277,7 @@ Ref<StandardMaterial3D> FBXMaterial::import_material(ImportState &state) {
 	}
 
 	/// ALL below is related to properties
-	for (FBXDocParser::LazyPropertyMap::value_type iter : material->Props()->GetLazyProperties()) {
+	for (FBXDocParser::LazyPropertyMap::value_type iter : material->GetLazyProperties()) {
 		const std::string name = iter.first;
 
 		if (name.empty()) {
@@ -317,7 +317,7 @@ Ref<StandardMaterial3D> FBXMaterial::import_material(ImportState &state) {
 
 		ERR_CONTINUE_MSG(desc == PROPERTY_DESC_NOT_FOUND, "The FBX material parameter: `" + String(name.c_str()) + "` was not recognized. Please open an issue so we can add the support to it.");
 
-		const FBXDocParser::PropertyTable *tbl = material->Props();
+		const FBXDocParser::PropertyTable *tbl = material;
 		FBXDocParser::PropertyPtr prop = tbl->Get(name);
 
 		ERR_CONTINUE_MSG(prop == nullptr, "This file may be corrupted because is not possible to extract the material parameter: " + String(name.c_str()));
diff --git a/modules/fbx/data/fbx_mesh_data.cpp b/modules/fbx/data/fbx_mesh_data.cpp
index b088dd8640..304d1598f6 100644
--- a/modules/fbx/data/fbx_mesh_data.cpp
+++ b/modules/fbx/data/fbx_mesh_data.cpp
@@ -101,20 +101,6 @@ HashMap<int, Vector2> collect_uv(const Vector<VertexData<Vector2>> *p_data, Hash
 	return collection;
 }
 
-typedef int Vertex;
-typedef int SurfaceId;
-typedef int PolygonId;
-typedef int DataIndex;
-
-struct SurfaceData {
-	Ref<SurfaceTool> surface_tool;
-	OrderedHashMap<Vertex, int> lookup_table; // proposed fix is to replace lookup_table[vertex_id] to give the position of the vertices_map[int] index.
-	LocalVector<Vertex> vertices_map; // this must be ordered the same as insertion <-- slow to do find() operation.
-	Ref<Material> material;
-	HashMap<PolygonId, Vector<DataIndex>> surface_polygon_vertex;
-	Array morphs;
-};
-
 EditorSceneImporterMeshNode3D *FBXMeshData::create_fbx_mesh(const ImportState &state, const FBXDocParser::MeshGeometry *p_mesh_geometry, const FBXDocParser::Model *model, bool use_compression) {
 	mesh_geometry = p_mesh_geometry;
 	// todo: make this just use a uint64_t FBX ID this is a copy of our original materials unfortunately.
@@ -307,11 +293,9 @@ EditorSceneImporterMeshNode3D *FBXMeshData::create_fbx_mesh(const ImportState &s
 		// Triangulate the various polygons and add the indices.
 		for (const PolygonId *polygon_id = surface->surface_polygon_vertex.next(nullptr); polygon_id != nullptr; polygon_id = surface->surface_polygon_vertex.next(polygon_id)) {
 			const Vector<DataIndex> *indices = surface->surface_polygon_vertex.getptr(*polygon_id);
-
 			triangulate_polygon(
-					surface->surface_tool,
+					surface,
 					*indices,
-					surface->vertices_map,
 					vertices);
 		}
 	}
@@ -336,7 +320,7 @@ EditorSceneImporterMeshNode3D *FBXMeshData::create_fbx_mesh(const ImportState &s
 			morph_st->begin(Mesh::PRIMITIVE_TRIANGLES);
 
 			for (unsigned int vi = 0; vi < surface->vertices_map.size(); vi += 1) {
-				const Vertex vertex = surface->vertices_map[vi];
+				const Vertex &vertex = surface->vertices_map[vi];
 				add_vertex(
 						state,
 						morph_st,
@@ -398,6 +382,9 @@ EditorSceneImporterMeshNode3D *FBXMeshData::create_fbx_mesh(const ImportState &s
 
 	EditorSceneImporterMeshNode3D *godot_mesh = memnew(EditorSceneImporterMeshNode3D);
 	godot_mesh->set_mesh(mesh);
+	const String name = ImportUtils::FBXNodeToName(model->Name());
+	godot_mesh->set_name(name); // Node takes the FBX model's name; the mesh resource gets the "mesh3d-" prefixed variant below.
+	mesh->set_name("mesh3d-" + name);
 	return godot_mesh;
 }
 
@@ -816,8 +803,10 @@ void FBXMeshData::add_vertex(
 	p_surface_tool->add_vertex((p_vertices_position[p_vertex] + p_morph_value) * p_scale);
 }
 
-void FBXMeshData::triangulate_polygon(Ref<SurfaceTool> st, Vector<int> p_polygon_vertex, const Vector<Vertex> p_surface_vertex_map, const std::vector<Vector3> &p_vertices) const {
+void FBXMeshData::triangulate_polygon(SurfaceData *surface, const Vector<int> &p_polygon_vertex, const std::vector<Vector3> &p_vertices) const {
+	Ref<SurfaceTool> st(surface->surface_tool);
 	const int polygon_vertex_count = p_polygon_vertex.size();
+	// The vertex map formerly passed as p_surface_vertex_map is now read from surface->vertices_map.
 	if (polygon_vertex_count == 1) {
 		// point to triangle
 		st->add_index(p_polygon_vertex[0]);
@@ -856,9 +845,9 @@ void FBXMeshData::triangulate_polygon(Ref<SurfaceTool> st, Vector<int> p_polygon
 			is_simple_convex = true;
 			Vector3 first_vec;
 			for (int i = 0; i < polygon_vertex_count; i += 1) {
-				const Vector3 p1 = p_vertices[p_surface_vertex_map[p_polygon_vertex[i]]];
-				const Vector3 p2 = p_vertices[p_surface_vertex_map[p_polygon_vertex[(i + 1) % polygon_vertex_count]]];
-				const Vector3 p3 = p_vertices[p_surface_vertex_map[p_polygon_vertex[(i + 2) % polygon_vertex_count]]];
+				const Vector3 p1 = p_vertices[surface->vertices_map[p_polygon_vertex[i]]];
+				const Vector3 p2 = p_vertices[surface->vertices_map[p_polygon_vertex[(i + 1) % polygon_vertex_count]]];
+				const Vector3 p3 = p_vertices[surface->vertices_map[p_polygon_vertex[(i + 2) % polygon_vertex_count]]];
 
 				const Vector3 edge1 = p1 - p2;
 				const Vector3 edge2 = p3 - p2;
@@ -893,7 +882,7 @@ void FBXMeshData::triangulate_polygon(Ref<SurfaceTool> st, Vector<int> p_polygon
 
 		std::vector<Vector3> poly_vertices(polygon_vertex_count);
 		for (int i = 0; i < polygon_vertex_count; i += 1) {
-			poly_vertices[i] = p_vertices[p_surface_vertex_map[p_polygon_vertex[i]]];
+			poly_vertices[i] = p_vertices[surface->vertices_map[p_polygon_vertex[i]]];
 		}
 
 		const Vector3 poly_norm = get_poly_normal(poly_vertices);
diff --git a/modules/fbx/data/fbx_mesh_data.h b/modules/fbx/data/fbx_mesh_data.h
index 77510ff2ec..575f833584 100644
--- a/modules/fbx/data/fbx_mesh_data.h
+++ b/modules/fbx/data/fbx_mesh_data.h
@@ -32,6 +32,8 @@
 #define FBX_MESH_DATA_H
 
 #include "core/templates/hash_map.h"
+#include "core/templates/local_vector.h"
+#include "core/templates/ordered_hash_map.h"
 #include "editor/import/resource_importer_scene.h"
 #include "editor/import/scene_importer_mesh_node_3d.h"
 #include "scene/3d/mesh_instance_3d.h"
@@ -47,6 +49,20 @@ struct FBXMeshData;
 struct FBXBone;
 struct ImportState;
 
+typedef int Vertex;
+typedef int SurfaceId;
+typedef int PolygonId;
+typedef int DataIndex;
+
+struct SurfaceData { // Per-surface working data used by FBXMeshData while building a mesh.
+	Ref<SurfaceTool> surface_tool; // Receives the indices emitted by triangulate_polygon().
+	OrderedHashMap<Vertex, int> lookup_table; // Proposed fix (from the original note): make lookup_table[vertex_id] give that vertex's position in vertices_map.
+	LocalVector<Vertex> vertices_map; // Surface-local index -> Vertex id used to index the vertex positions; must keep insertion order, and find() on it is slow.
+	Ref<Material> material;
+	HashMap<PolygonId, Vector<DataIndex>> surface_polygon_vertex; // Per polygon, the indices handed to triangulate_polygon().
+	Array morphs;
+};
+
 struct VertexWeightMapping {
 	Vector<real_t> weights;
 	Vector<int> bones;
@@ -127,7 +143,7 @@ private:
 			const Vector3 &p_morph_value = Vector3(),
 			const Vector3 &p_morph_normal = Vector3());
 
-	void triangulate_polygon(Ref<SurfaceTool> st, Vector<int> p_polygon_vertex, Vector<int> p_surface_vertex_map, const std::vector<Vector3> &p_vertices) const;
+	void triangulate_polygon(SurfaceData *surface, const Vector<int> &p_polygon_vertex, const std::vector<Vector3> &p_vertices) const;
 
 	/// This function is responsible to convert the FBX polygon vertex to
 	/// vertex index.
diff --git a/modules/fbx/data/pivot_transform.cpp b/modules/fbx/data/pivot_transform.cpp
index 1895af6f9f..f4055c830f 100644
--- a/modules/fbx/data/pivot_transform.cpp
+++ b/modules/fbx/data/pivot_transform.cpp
@@ -33,7 +33,7 @@
 #include "tools/import_utils.h"
 
 void PivotTransform::ReadTransformChain() {
-	const FBXDocParser::PropertyTable *props = fbx_model->Props();
+	const FBXDocParser::PropertyTable *props = fbx_model;
 	const FBXDocParser::Model::RotOrder &rot = fbx_model->RotationOrder();
 	const FBXDocParser::TransformInheritance &inheritType = fbx_model->InheritType();
 	inherit_type = inheritType; // copy the inherit type we need it in the second step.
diff --git a/modules/fbx/editor_scene_importer_fbx.cpp b/modules/fbx/editor_scene_importer_fbx.cpp
index 55d524883f..ccbea21541 100644
--- a/modules/fbx/editor_scene_importer_fbx.cpp
+++ b/modules/fbx/editor_scene_importer_fbx.cpp
@@ -44,7 +44,6 @@
 #include "scene/3d/bone_attachment_3d.h"
 #include "scene/3d/camera_3d.h"
 #include "scene/3d/light_3d.h"
-#include "scene/3d/mesh_instance_3d.h"
 #include "scene/main/node.h"
 #include "scene/resources/material.h"
 
@@ -105,7 +104,7 @@ Node3D *EditorSceneImporterFBX::import_scene(const String &p_path, uint32_t p_fl
 		bool is_binary = false;
 		data.resize(f->get_len());
 
-		ERR_FAIL_COND_V(data.size() < 64, NULL);
+		ERR_FAIL_COND_V(data.size() < 64, nullptr);
 
 		f->get_buffer(data.ptrw(), data.size());
 		PackedByteArray fbx_header;
@@ -121,15 +120,27 @@ Node3D *EditorSceneImporterFBX::import_scene(const String &p_path, uint32_t p_fl
 
 		print_verbose("[doc] opening fbx file: " + p_path);
 		print_verbose("[doc] fbx header: " + fbx_header_string);
+		bool corrupt = false;
 
 		// safer to check this way as there can be different formatted headers
 		if (fbx_header_string.find("Kaydara FBX Binary", 0) != -1) {
 			is_binary = true;
 			print_verbose("[doc] is binary");
-			FBXDocParser::TokenizeBinary(tokens, (const char *)data.ptrw(), (size_t)data.size());
+
+			FBXDocParser::TokenizeBinary(tokens, (const char *)data.ptrw(), (size_t)data.size(), corrupt);
+
 		} else {
 			print_verbose("[doc] is ascii");
-			FBXDocParser::Tokenize(tokens, (const char *)data.ptrw(), (size_t)data.size());
+			FBXDocParser::Tokenize(tokens, (const char *)data.ptrw(), (size_t)data.size(), corrupt);
+		}
+
+		if (corrupt) {
+			for (FBXDocParser::TokenPtr token : tokens) {
+				delete token;
+			}
+			tokens.clear();
+			ERR_PRINT(vformat("Cannot import FBX file: %s the file is corrupt so we safely exited parsing the file.", p_path));
+			return memnew(Node3D);
 		}
 
 		// The import process explained:
@@ -141,6 +152,16 @@ Node3D *EditorSceneImporterFBX::import_scene(const String &p_path, uint32_t p_fl
 		// use this information to construct a very rudimentary
 		// parse-tree representing the FBX scope structure
 		FBXDocParser::Parser parser(tokens, is_binary);
+
+		if (parser.IsCorrupt()) {
+			for (FBXDocParser::TokenPtr token : tokens) {
+				delete token;
+			}
+			tokens.clear();
+			ERR_PRINT(vformat("Cannot import FBX file: %s the file is corrupt so we safely exited parsing the file.", p_path));
+			return memnew(Node3D);
+		}
+
 		FBXDocParser::ImportSettings settings;
 		settings.strictMode = false;
 
@@ -153,12 +174,10 @@ Node3D *EditorSceneImporterFBX::import_scene(const String &p_path, uint32_t p_fl
 		// safety for version handling
 		if (doc.IsSafeToImport()) {
 			bool is_blender_fbx = false;
-			//const FBXDocParser::PropertyPtr app_vendor = p_document->GlobalSettingsPtr()->Props()
-			//	p_document->Creator()
-			const FBXDocParser::PropertyTable *import_props = doc.GetMetadataProperties();
-			const FBXDocParser::PropertyPtr app_name = import_props->Get("Original|ApplicationName");
-			const FBXDocParser::PropertyPtr app_vendor = import_props->Get("Original|ApplicationVendor");
-			const FBXDocParser::PropertyPtr app_version = import_props->Get("Original|ApplicationVersion");
+			const FBXDocParser::PropertyTable &import_props = doc.GetMetadataProperties();
+			const FBXDocParser::PropertyPtr app_name = import_props.Get("Original|ApplicationName");
+			const FBXDocParser::PropertyPtr app_vendor = import_props.Get("Original|ApplicationVendor");
+			const FBXDocParser::PropertyPtr app_version = import_props.Get("Original|ApplicationVersion");
 			//
 			if (app_name) {
 				const FBXDocParser::TypedProperty<std::string> *app_name_string = dynamic_cast<const FBXDocParser::TypedProperty<std::string> *>(app_name);
@@ -200,6 +219,11 @@ Node3D *EditorSceneImporterFBX::import_scene(const String &p_path, uint32_t p_fl
 			return spatial;
 
 		} else {
+			for (FBXDocParser::TokenPtr token : tokens) {
+				delete token;
+			}
+			tokens.clear();
+
 			ERR_PRINT(vformat("Cannot import FBX file: %s. It uses file format %d which is unsupported by Godot. Please re-export it or convert it to a newer format.", p_path, doc.FBXVersion()));
 		}
 	}
@@ -892,7 +916,7 @@ Node3D *EditorSceneImporterFBX::_generate_scene(
 						uint64_t target_id = target->ID();
 						String target_name = ImportUtils::FBXNodeToName(target->Name());
 
-						const FBXDocParser::PropertyTable *properties = curve_node->Props();
+						const FBXDocParser::PropertyTable *properties = curve_node;
 						bool got_x = false, got_y = false, got_z = false;
 						float offset_x = FBXDocParser::PropertyGet<float>(properties, "d|X", got_x);
 						float offset_y = FBXDocParser::PropertyGet<float>(properties, "d|Y", got_y);
@@ -990,7 +1014,6 @@ Node3D *EditorSceneImporterFBX::_generate_scene(
 						int track_idx = animation->add_track(Animation::TYPE_TRANSFORM);
 
 						// animation->track_set_path(track_idx, node_path);
-						// animation->track_set_path(track_idx, node_path);
 						Ref<FBXBone> bone;
 
 						// note we must not run the below code if the entry doesn't exist, it will create dummy entries which is very bad.
@@ -1047,7 +1070,7 @@ Node3D *EditorSceneImporterFBX::_generate_scene(
 
 						Ref<FBXNode> target_node = state.fbx_target_map[target_id];
 						const FBXDocParser::Model *model = target_node->fbx_model;
-						const FBXDocParser::PropertyTable *props = model->Props();
+						const FBXDocParser::PropertyTable *props = model; // Implicit derived-to-base conversion: Model is itself a PropertyTable, so no cast is needed.
 
 						Map<StringName, FBXTrack> &track_data = track->value();
 						FBXTrack &translation_keys = track_data[StringName("T")];
diff --git a/modules/fbx/fbx_parser/FBXAnimation.cpp b/modules/fbx/fbx_parser/FBXAnimation.cpp
index 4ab5edebb1..0fbff035fd 100644
--- a/modules/fbx/fbx_parser/FBXAnimation.cpp
+++ b/modules/fbx/fbx_parser/FBXAnimation.cpp
@@ -128,11 +128,9 @@ AnimationCurve::~AnimationCurve() {
 
 // ------------------------------------------------------------------------------------------------
 AnimationCurveNode::AnimationCurveNode(uint64_t id, const ElementPtr element, const std::string &name,
-		const Document &doc, const char *const *target_prop_whitelist /*= NULL*/,
+		const Document &doc, const char *const *target_prop_whitelist /*= nullptr*/,
 		size_t whitelist_size /*= 0*/) :
-		Object(id, element, name), doc(doc) {
-	const ScopePtr sc = GetRequiredScope(element);
-
+		Object(id, element, name), target(), doc(doc) {
 	// find target node
 	const char *whitelist[] = { "Model", "NodeAttribute", "Deformer" };
 	const std::vector<const Connection *> &conns = doc.GetConnectionsBySourceSequenced(ID(), whitelist, 3);
@@ -154,8 +152,6 @@ AnimationCurveNode::AnimationCurveNode(uint64_t id, const ElementPtr element, co
 		prop = con->PropertyName();
 		break;
 	}
-
-	props = GetPropertyTable(doc, "AnimationCurveNode.FbxAnimCurveNode", element, sc, false);
 }
 
 // ------------------------------------------------------------------------------------------------
@@ -187,10 +183,6 @@ const AnimationMap &AnimationCurveNode::Curves() const {
 // ------------------------------------------------------------------------------------------------
 AnimationLayer::AnimationLayer(uint64_t id, const ElementPtr element, const std::string &name, const Document &doc) :
 		Object(id, element, name), doc(doc) {
-	const ScopePtr sc = GetRequiredScope(element);
-
-	// note: the props table here bears little importance and is usually absent
-	props = GetPropertyTable(doc, "AnimationLayer.FbxAnimLayer", element, sc, true);
 }
 
 // ------------------------------------------------------------------------------------------------
@@ -248,11 +240,6 @@ const AnimationCurveNodeList AnimationLayer::Nodes(const char *const *target_pro
 // ------------------------------------------------------------------------------------------------
 AnimationStack::AnimationStack(uint64_t id, const ElementPtr element, const std::string &name, const Document &doc) :
 		Object(id, element, name) {
-	const ScopePtr sc = GetRequiredScope(element);
-
-	// note: we don't currently use any of these properties so we shouldn't bother if it is missing
-	props = GetPropertyTable(doc, "AnimationStack.FbxAnimStack", element, sc, true);
-
 	// resolve attached animation layers
 	const std::vector<const Connection *> &conns = doc.GetConnectionsByDestinationSequenced(ID(), "AnimationLayer");
 	layers.reserve(conns.size());
@@ -282,9 +269,5 @@ AnimationStack::AnimationStack(uint64_t id, const ElementPtr element, const std:
 
 // ------------------------------------------------------------------------------------------------
 AnimationStack::~AnimationStack() {
-	if (props != nullptr) {
-		delete props;
-		props = nullptr;
-	}
 }
 } // namespace FBXDocParser
diff --git a/modules/fbx/fbx_parser/FBXBinaryTokenizer.cpp b/modules/fbx/fbx_parser/FBXBinaryTokenizer.cpp
index 1d2b7765c5..1eee10b251 100644
--- a/modules/fbx/fbx_parser/FBXBinaryTokenizer.cpp
+++ b/modules/fbx/fbx_parser/FBXBinaryTokenizer.cpp
@@ -130,6 +130,7 @@ Token::Token(const char *sbegin, const char *send, TokenType type, size_t offset
 		line(offset),
 		column(BINARY_MARKER) {
 #ifdef DEBUG_ENABLED
+	// NOTE(review): debug-only copy of the raw token bytes [sbegin, send); the original author flagged it as problematic without saying why - confirm.
 	contents = std::string(sbegin, static_cast<size_t>(send - sbegin));
 #endif
 	// calc length
@@ -232,9 +233,11 @@ unsigned int ReadString(const char *&sbegin_out, const char *&send_out, const ch
 }
 
 // ------------------------------------------------------------------------------------------------
-void ReadData(const char *&sbegin_out, const char *&send_out, const char *input, const char *&cursor, const char *end) {
+void ReadData(const char *&sbegin_out, const char *&send_out, const char *input, const char *&cursor, const char *end, bool &corrupt) {
 	if (Offset(cursor, end) < 1) {
 		TokenizeError("cannot ReadData, out of bounds reading length", input, cursor);
+		corrupt = true;
+		return;
 	}
 
 	const char type = *cursor;
@@ -328,9 +331,7 @@ void ReadData(const char *&sbegin_out, const char *&send_out, const char *input,
 			}
 			cursor += comp_len;
 			break;
-		}
-
-			// string
+		} // next case: string
 		case 'S': {
 			const char *sb, *se;
 			// 0 characters can legally happen in such strings
@@ -338,11 +339,15 @@ void ReadData(const char *&sbegin_out, const char *&send_out, const char *input,
 			break;
 		}
 		default:
+			corrupt = true; // must exit
 			TokenizeError("cannot ReadData, unexpected type code: " + std::string(&type, 1), input, cursor);
+			return;
 	}
 
 	if (cursor > end) {
+		corrupt = true; // must exit
 		TokenizeError("cannot ReadData, the remaining size is too small for the data type: " + std::string(&type, 1), input, cursor);
+		return;
 	}
 
 	// the type code is contained in the returned range
@@ -350,7 +355,7 @@ void ReadData(const char *&sbegin_out, const char *&send_out, const char *input,
 }
 
 // ------------------------------------------------------------------------------------------------
-bool ReadScope(TokenList &output_tokens, const char *input, const char *&cursor, const char *end, bool const is64bits) {
+bool ReadScope(TokenList &output_tokens, const char *input, const char *&cursor, const char *end, bool const is64bits, bool &corrupt) {
 	// the first word contains the offset at which this block ends
 	const uint64_t end_offset = is64bits ? ReadDoubleWord(input, cursor, end) : ReadWord(input, cursor, end);
 
@@ -364,8 +369,12 @@ bool ReadScope(TokenList &output_tokens, const char *input, const char *&cursor,
 
 	if (end_offset > Offset(input, end)) {
 		TokenizeError("block offset is out of range", input, cursor);
+		corrupt = true;
+		return false;
 	} else if (end_offset < Offset(input, cursor)) {
 		TokenizeError("block offset is negative out of range", input, cursor);
+		corrupt = true;
+		return false;
 	}
 
 	// the second data word contains the number of properties in the scope
@@ -375,7 +384,7 @@ bool ReadScope(TokenList &output_tokens, const char *input, const char *&cursor,
 	const uint64_t prop_length = is64bits ? ReadDoubleWord(input, cursor, end) : ReadWord(input, cursor, end);
 
 	// now comes the name of the scope/key
-	const char *sbeg, *send;
+	const char *sbeg = nullptr, *send = nullptr;
 	ReadString(sbeg, send, input, cursor, end);
 
 	output_tokens.push_back(new_Token(sbeg, send, TokenType_KEY, Offset(input, cursor)));
@@ -383,7 +392,10 @@ bool ReadScope(TokenList &output_tokens, const char *input, const char *&cursor,
 	// now come the individual properties
 	const char *begin_cursor = cursor;
 	for (unsigned int i = 0; i < prop_count; ++i) {
-		ReadData(sbeg, send, input, cursor, begin_cursor + prop_length);
+		ReadData(sbeg, send, input, cursor, begin_cursor + prop_length, corrupt);
+		if (corrupt) {
+			return false;
+		}
 
 		output_tokens.push_back(new_Token(sbeg, send, TokenType_DATA, Offset(input, cursor)));
 
@@ -394,6 +406,8 @@ bool ReadScope(TokenList &output_tokens, const char *input, const char *&cursor,
 
 	if (Offset(begin_cursor, cursor) != prop_length) {
 		TokenizeError("property length not reached, something is wrong", input, cursor);
+		corrupt = true;
+		return false;
 	}
 
 	// at the end of each nested block, there is a NUL record to indicate
@@ -410,13 +424,18 @@ bool ReadScope(TokenList &output_tokens, const char *input, const char *&cursor,
 
 		// XXX this is vulnerable to stack overflowing ..
 		while (Offset(input, cursor) < end_offset - sentinel_block_length) {
-			ReadScope(output_tokens, input, cursor, input + end_offset - sentinel_block_length, is64bits);
+			ReadScope(output_tokens, input, cursor, input + end_offset - sentinel_block_length, is64bits, corrupt);
+			if (corrupt) {
+				return false;
+			}
 		}
 		output_tokens.push_back(new_Token(cursor, cursor + 1, TokenType_CLOSE_BRACKET, Offset(input, cursor)));
 
 		for (unsigned int i = 0; i < sentinel_block_length; ++i) {
 			if (cursor[i] != '\0') {
 				TokenizeError("failed to read nested block sentinel, expected all bytes to be 0", input, cursor);
+				corrupt = true;
+				return false;
 			}
 		}
 		cursor += sentinel_block_length;
@@ -424,6 +443,8 @@ bool ReadScope(TokenList &output_tokens, const char *input, const char *&cursor,
 
 	if (Offset(input, cursor) != end_offset) {
 		TokenizeError("scope length not reached, something is wrong", input, cursor);
+		corrupt = true;
+		return false;
 	}
 
 	return true;
@@ -432,7 +453,7 @@ bool ReadScope(TokenList &output_tokens, const char *input, const char *&cursor,
 
 // ------------------------------------------------------------------------------------------------
 // TODO: Test FBX Binary files newer than the 7500 version to check if the 64 bits address behaviour is consistent
-void TokenizeBinary(TokenList &output_tokens, const char *input, size_t length) {
+void TokenizeBinary(TokenList &output_tokens, const char *input, size_t length, bool &corrupt) {
 	if (length < 0x1b) {
 		//TokenizeError("file is too short",0);
 	}
@@ -459,7 +480,7 @@ void TokenizeBinary(TokenList &output_tokens, const char *input, size_t length)
 	const bool is64bits = version >= 7500;
 	const char *end = input + length;
 	while (cursor < end) {
-		if (!ReadScope(output_tokens, input, cursor, input + length, is64bits)) {
+		if (!ReadScope(output_tokens, input, cursor, input + length, is64bits, corrupt)) {
 			break;
 		}
 	}
diff --git a/modules/fbx/fbx_parser/FBXDeformer.cpp b/modules/fbx/fbx_parser/FBXDeformer.cpp
index 4b774e6b2a..039718ae15 100644
--- a/modules/fbx/fbx_parser/FBXDeformer.cpp
+++ b/modules/fbx/fbx_parser/FBXDeformer.cpp
@@ -89,10 +89,6 @@ using namespace Util;
 // ------------------------------------------------------------------------------------------------
 Deformer::Deformer(uint64_t id, const ElementPtr element, const Document &doc, const std::string &name) :
 		Object(id, element, name) {
-	const ScopePtr sc = GetRequiredScope(element);
-
-	const std::string &classname = ParseTokenAsString(GetRequiredToken(element, 2));
-	props = GetPropertyTable(doc, "Deformer.Fbx" + classname, element, sc, true);
 }
 
 // ------------------------------------------------------------------------------------------------
@@ -101,10 +97,6 @@ Deformer::~Deformer() {
 
 Constraint::Constraint(uint64_t id, const ElementPtr element, const Document &doc, const std::string &name) :
 		Object(id, element, name) {
-	const ScopePtr sc = GetRequiredScope(element);
-	const std::string &classname = ParseTokenAsString(GetRequiredToken(element, 2));
-	// used something.fbx as this is a cache name.
-	props = GetPropertyTable(doc, "Something.Fbx" + classname, element, sc, true);
 }
 
 Constraint::~Constraint() {
diff --git a/modules/fbx/fbx_parser/FBXDocument.cpp b/modules/fbx/fbx_parser/FBXDocument.cpp
index d156db201b..bb85d6ff7c 100644
--- a/modules/fbx/fbx_parser/FBXDocument.cpp
+++ b/modules/fbx/fbx_parser/FBXDocument.cpp
@@ -228,7 +228,7 @@ ObjectPtr LazyObject::LoadObject() {
 
 // ------------------------------------------------------------------------------------------------
 Object::Object(uint64_t id, const ElementPtr element, const std::string &name) :
-		element(element), name(name), id(id) {
+		PropertyTable(element), element(element), name(name), id(id) {
 }
 
 // ------------------------------------------------------------------------------------------------
@@ -237,17 +237,13 @@ Object::~Object() {
 }
 
 // ------------------------------------------------------------------------------------------------
-FileGlobalSettings::FileGlobalSettings(const Document &doc, const PropertyTable *props) :
-		props(props), doc(doc) {
+FileGlobalSettings::FileGlobalSettings(const Document &doc) :
+		PropertyTable(), doc(doc) {
 	// empty
 }
 
 // ------------------------------------------------------------------------------------------------
 FileGlobalSettings::~FileGlobalSettings() {
-	if (props != nullptr) {
-		delete props;
-		props = nullptr;
-	}
 }
 
 // ------------------------------------------------------------------------------------------------
@@ -287,15 +283,12 @@ Document::~Document() {
 		delete v.second;
 	}
 
-	if (metadata_properties != nullptr) {
-		delete metadata_properties;
-	}
 	// clear globals import pointer
 	globals.reset();
 }
 
 // ------------------------------------------------------------------------------------------------
-static const unsigned int LowerSupportedVersion = 7300;
+static const unsigned int LowerSupportedVersion = 7100;
 static const unsigned int UpperSupportedVersion = 7700;
 
 bool Document::ReadHeader() {
@@ -306,6 +299,11 @@ bool Document::ReadHeader() {
 		DOMError("no FBXHeaderExtension dictionary found");
 	}
 
+	if (parser.IsCorrupt()) {
+		DOMError("File is corrupt");
+		return false;
+	}
+
 	const ScopePtr shead = ehead->Compound();
 	fbxVersion = ParseTokenAsInt(GetRequiredToken(GetRequiredElement(shead, "FBXVersion", ehead), 0));
 
@@ -325,18 +323,11 @@ bool Document::ReadHeader() {
 		creator = ParseTokenAsString(GetRequiredToken(ecreator, 0));
 	}
 
-	//
 	// Scene Info
-	//
-
 	const ElementPtr scene_info = shead->GetElement("SceneInfo");
 
 	if (scene_info) {
-		PropertyTable *fileExportProps = const_cast<PropertyTable *>(GetPropertyTable(*this, "", scene_info, scene_info->Compound(), true));
-
-		if (fileExportProps) {
-			metadata_properties = fileExportProps;
-		}
+		metadata_properties.Setup(scene_info);
 	}
 
 	const ElementPtr etimestamp = shead->GetElement("CreationTimeStamp");
@@ -358,23 +349,7 @@ bool Document::ReadHeader() {
 void Document::ReadGlobalSettings() {
 	ERR_FAIL_COND_MSG(globals != nullptr, "Global settings is already setup this is a serious error and should be reported");
 
-	const ScopePtr sc = parser.GetRootScope();
-	const ElementPtr ehead = sc->GetElement("GlobalSettings");
-	if (nullptr == ehead || !ehead->Compound()) {
-		DOMWarning("no GlobalSettings dictionary found");
-		globals = std::make_shared<FileGlobalSettings>(*this, new PropertyTable());
-		return;
-	}
-
-	const PropertyTable *props = GetPropertyTable(*this, "", ehead, ehead->Compound(), true);
-
-	//double v = PropertyGet<float>( *props, std::string("UnitScaleFactor"), 1.0 );
-
-	if (!props) {
-		DOMError("GlobalSettings dictionary contains no property table");
-	}
-
-	globals = std::make_shared<FileGlobalSettings>(*this, props);
+	globals = std::make_shared<FileGlobalSettings>(*this); // NOTE(review): base is a default-constructed PropertyTable and the "GlobalSettings" element is no longer read in this function - confirm it is populated elsewhere.
 }
 
 // ------------------------------------------------------------------------------------------------
@@ -445,58 +420,6 @@ void Document::ReadObjects() {
 
 // ------------------------------------------------------------------------------------------------
 void Document::ReadPropertyTemplates() {
-	const ScopePtr sc = parser.GetRootScope();
-	// read property templates from "Definitions" section
-	const ElementPtr edefs = sc->GetElement("Definitions");
-	if (!edefs || !edefs->Compound()) {
-		DOMWarning("no Definitions dictionary found");
-		return;
-	}
-
-	const ScopePtr sdefs = edefs->Compound();
-	const ElementCollection otypes = sdefs->GetCollection("ObjectType");
-	for (ElementMap::const_iterator it = otypes.first; it != otypes.second; ++it) {
-		const ElementPtr el = (*it).second;
-		const ScopePtr sc_2 = el->Compound();
-		if (!sc_2) {
-			DOMWarning("expected nested scope in ObjectType, ignoring", el);
-			continue;
-		}
-
-		const TokenList &tok = el->Tokens();
-		if (tok.empty()) {
-			DOMWarning("expected name for ObjectType element, ignoring", el);
-			continue;
-		}
-
-		const std::string &oname = ParseTokenAsString(tok[0]);
-
-		const ElementCollection templs = sc_2->GetCollection("PropertyTemplate");
-		for (ElementMap::const_iterator iter = templs.first; iter != templs.second; ++iter) {
-			const ElementPtr el_2 = (*iter).second;
-			const ScopePtr sc_3 = el_2->Compound();
-			if (!sc_3) {
-				DOMWarning("expected nested scope in PropertyTemplate, ignoring", el);
-				continue;
-			}
-
-			const TokenList &tok_2 = el_2->Tokens();
-			if (tok_2.empty()) {
-				DOMWarning("expected name for PropertyTemplate element, ignoring", el);
-				continue;
-			}
-
-			const std::string &pname = ParseTokenAsString(tok_2[0]);
-
-			const ElementPtr Properties70 = sc_3->GetElement("Properties70");
-			if (Properties70) {
-				// PropertyTable(const ElementPtr element, const PropertyTable* templateProps);
-				const PropertyTable *props = new PropertyTable(Properties70, nullptr);
-
-				templates[oname + "." + pname] = props;
-			}
-		}
-	}
 }
 
 // ------------------------------------------------------------------------------------------------
diff --git a/modules/fbx/fbx_parser/FBXDocument.h b/modules/fbx/fbx_parser/FBXDocument.h
index 20e635a6a4..9664cd763a 100644
--- a/modules/fbx/fbx_parser/FBXDocument.h
+++ b/modules/fbx/fbx_parser/FBXDocument.h
@@ -130,7 +130,7 @@ private:
 };
 
 /** Base class for in-memory (DOM) representations of FBX objects */
-class Object {
+class Object : public PropertyTable {
 public:
 	Object(uint64_t id, const ElementPtr element, const std::string &name);
 
@@ -149,9 +149,9 @@ public:
 	}
 
 protected:
-	const ElementPtr element;
+	const ElementPtr element = nullptr;
 	const std::string name;
-	const uint64_t id = 0;
+	const uint64_t id;
 };
 
 /** DOM class for generic FBX NoteAttribute blocks. NoteAttribute's just hold a property table,
@@ -159,22 +159,13 @@ protected:
 class NodeAttribute : public Object {
 public:
 	NodeAttribute(uint64_t id, const ElementPtr element, const Document &doc, const std::string &name);
-
 	virtual ~NodeAttribute();
-
-	const PropertyTable *Props() const {
-		return props;
-	}
-
-private:
-	const PropertyTable *props;
 };
 
 /** DOM base class for FBX camera settings attached to a node */
 class CameraSwitcher : public NodeAttribute {
 public:
 	CameraSwitcher(uint64_t id, const ElementPtr element, const Document &doc, const std::string &name);
-
 	virtual ~CameraSwitcher();
 
 	int CameraID() const {
@@ -190,26 +181,26 @@ public:
 	}
 
 private:
-	int cameraId;
+	int cameraId = 0;
 	std::string cameraName;
 	std::string cameraIndexName;
 };
 
 #define fbx_stringize(a) #a
 
-#define fbx_simple_property(name, type, default_value)                           \
-	type name() const {                                                          \
-		return PropertyGet<type>(Props(), fbx_stringize(name), (default_value)); \
+#define fbx_simple_property(name, type, default_value)                        \
+	type name() const {                                                       \
+		return PropertyGet<type>(this, fbx_stringize(name), (default_value)); \
 	}
 
 // XXX improve logging
-#define fbx_simple_enum_property(name, type, default_value)                                               \
-	type name() const {                                                                                   \
-		const int ival = PropertyGet<int>(Props(), fbx_stringize(name), static_cast<int>(default_value)); \
-		if (ival < 0 || ival >= AI_CONCAT(type, _MAX)) {                                                  \
-			return static_cast<type>(default_value);                                                      \
-		}                                                                                                 \
-		return static_cast<type>(ival);                                                                   \
+#define fbx_simple_enum_property(name, type, default_value)                                            \
+	type name() const {                                                                                \
+		const int ival = PropertyGet<int>(this, fbx_stringize(name), static_cast<int>(default_value)); \
+		if (ival < 0 || ival >= AI_CONCAT(type, _MAX)) {                                               \
+			return static_cast<type>(default_value);                                                   \
+		}                                                                                              \
+		return static_cast<type>(ival);                                                                \
 	}
 
 class FbxPoseNode;
@@ -256,7 +247,7 @@ public:
 	}
 
 private:
-	uint64_t target_id;
+	uint64_t target_id = 0;
 	Transform transform;
 };
 
@@ -264,7 +255,6 @@ private:
 class Camera : public NodeAttribute {
 public:
 	Camera(uint64_t id, const ElementPtr element, const Document &doc, const std::string &name);
-
 	virtual ~Camera();
 
 	fbx_simple_property(Position, Vector3, Vector3(0, 0, 0));
@@ -380,7 +370,6 @@ public:
 	};
 
 	Model(uint64_t id, const ElementPtr element, const Document &doc, const std::string &name);
-
 	virtual ~Model();
 
 	fbx_simple_property(QuaternionInterpolate, int, 0);
@@ -466,10 +455,6 @@ public:
 		return culling;
 	}
 
-	const PropertyTable *Props() const {
-		return props;
-	}
-
 	/** Get material links */
 	const std::vector<const Material *> &GetMaterials() const {
 		return materials;
@@ -498,13 +483,11 @@ private:
 
 	std::string shading;
 	std::string culling;
-	const PropertyTable *props = nullptr;
 };
 
 class ModelLimbNode : public Model {
 public:
 	ModelLimbNode(uint64_t id, const ElementPtr element, const Document &doc, const std::string &name);
-
 	virtual ~ModelLimbNode();
 };
 
@@ -512,7 +495,6 @@ public:
 class Texture : public Object {
 public:
 	Texture(uint64_t id, const ElementPtr element, const Document &doc, const std::string &name);
-
 	virtual ~Texture();
 
 	const std::string &Type() const {
@@ -539,10 +521,6 @@ public:
 		return uvScaling;
 	}
 
-	const PropertyTable *Props() const {
-		return props;
-	}
-
 	// return a 4-tuple
 	const unsigned int *Crop() const {
 		return crop;
@@ -560,10 +538,8 @@ private:
 	std::string relativeFileName;
 	std::string fileName;
 	std::string alphaSource;
-	const PropertyTable *props = nullptr;
 
 	unsigned int crop[4] = { 0 };
-
 	const Video *media = nullptr;
 };
 
@@ -626,8 +602,8 @@ public:
 
 private:
 	std::vector<const Texture *> textures;
-	BlendMode blendMode;
-	float alpha;
+	BlendMode blendMode = BlendMode::BlendMode_Additive;
+	float alpha = 0;
 };
 
 typedef std::map<std::string, const Texture *> TextureMap;
@@ -656,10 +632,6 @@ public:
 		return relativeFileName;
 	}
 
-	const PropertyTable *Props() const {
-		return props;
-	}
-
 	const uint8_t *Content() const {
 		return content;
 	}
@@ -687,7 +659,6 @@ private:
 	std::string type;
 	std::string relativeFileName;
 	std::string fileName;
-	const PropertyTable *props = nullptr;
 
 	uint64_t contentLength = 0;
 	uint8_t *content = nullptr;
@@ -708,10 +679,6 @@ public:
 		return multilayer;
 	}
 
-	const PropertyTable *Props() const {
-		return props;
-	}
-
 	const TextureMap &Textures() const {
 		return textures;
 	}
@@ -722,8 +689,7 @@ public:
 
 private:
 	std::string shading;
-	bool multilayer;
-	const PropertyTable *props;
+	bool multilayer = false;
 
 	TextureMap textures;
 	LayeredTextureMap layeredTextures;
@@ -791,13 +757,9 @@ public:
 
 	virtual ~AnimationCurveNode();
 
-	const PropertyTable *Props() const {
-		return props;
-	}
-
 	const AnimationMap &Curves() const;
 
-	/** Object the curve is assigned to, this can be NULL if the
+	/** Object the curve is assigned to, this can be nullptr if the
      *  target object has no DOM representation or could not
      *  be read for other reasons.*/
 	Object *Target() const {
@@ -819,7 +781,6 @@ public:
 
 private:
 	Object *target = nullptr;
-	const PropertyTable *props;
 	mutable AnimationMap curves;
 	std::string prop;
 	const Document &doc;
@@ -837,18 +798,12 @@ public:
 	AnimationLayer(uint64_t id, const ElementPtr element, const std::string &name, const Document &doc);
 	virtual ~AnimationLayer();
 
-	const PropertyTable *Props() const {
-		//ai_assert(props.get());
-		return props;
-	}
-
 	/* the optional white list specifies a list of property names for which the caller
     wants animations for. Curves not matching this list will not be added to the
     animation layer. */
 	const AnimationCurveNodeList Nodes(const char *const *target_prop_whitelist = nullptr, size_t whitelist_size = 0) const;
 
 private:
-	const PropertyTable *props;
 	const Document &doc;
 };
 
@@ -863,16 +818,11 @@ public:
 	fbx_simple_property(ReferenceStart, int64_t, 0L);
 	fbx_simple_property(ReferenceStop, int64_t, 0L);
 
-	const PropertyTable *Props() const {
-		return props;
-	}
-
 	const AnimationLayerList &Layers() const {
 		return layers;
 	}
 
 private:
-	const PropertyTable *props = nullptr;
 	AnimationLayerList layers;
 };
 
@@ -881,14 +831,6 @@ class Deformer : public Object {
 public:
 	Deformer(uint64_t id, const ElementPtr element, const Document &doc, const std::string &name);
 	virtual ~Deformer();
-
-	const PropertyTable *Props() const {
-		//ai_assert(props.get());
-		return props;
-	}
-
-private:
-	const PropertyTable *props;
 };
 
 /** Constraints are from Maya they can help us with BoneAttachments :) **/
@@ -896,9 +838,6 @@ class Constraint : public Object {
 public:
 	Constraint(uint64_t id, const ElementPtr element, const Document &doc, const std::string &name);
 	virtual ~Constraint();
-
-private:
-	const PropertyTable *props;
 };
 
 typedef std::vector<float> WeightArray;
@@ -924,7 +863,7 @@ public:
 	}
 
 private:
-	float percent;
+	float percent = 0;
 	WeightArray fullWeights;
 	std::vector<const ShapeGeometry *> shapeGeometries;
 };
@@ -1006,7 +945,7 @@ private:
 	Transform transformLink;
 	Transform transformAssociateModel;
 	SkinLinkMode link_mode;
-	bool valid_transformAssociateModel;
+	bool valid_transformAssociateModel = false;
 	const Model *node = nullptr;
 };
 
@@ -1037,8 +976,8 @@ public:
 	}
 
 private:
-	float accuracy;
-	SkinType skinType;
+	float accuracy = 0;
+	SkinType skinType = SkinType::Skin_Linear;
 	std::vector<const Cluster *> clusters;
 };
 
@@ -1050,7 +989,7 @@ public:
 
 	// note: a connection ensures that the source and dest objects exist, but
 	// not that they have DOM representations, so the return value of one of
-	// these functions can still be NULL.
+	// these functions can still be nullptr.
 	Object *SourceObject() const;
 	Object *DestinationObject() const;
 
@@ -1087,10 +1026,10 @@ public:
 	}
 
 public:
-	uint64_t insertionOrder;
+	uint64_t insertionOrder = 0;
 	const std::string prop;
 
-	uint64_t src, dest;
+	uint64_t src = 0, dest = 0;
 	const Document &doc;
 };
 
@@ -1105,15 +1044,10 @@ typedef std::multimap<uint64_t, const Connection *> ConnectionMap;
 
 /** DOM class for global document settings, a single instance per document can
  *  be accessed via Document.Globals(). */
-class FileGlobalSettings {
+class FileGlobalSettings : public PropertyTable {
 public:
-	FileGlobalSettings(const Document &doc, const PropertyTable *props);
-
-	~FileGlobalSettings();
-
-	const PropertyTable *Props() const {
-		return props;
-	}
+	FileGlobalSettings(const Document &doc);
+	virtual ~FileGlobalSettings();
 
 	const Document &GetDocument() const {
 		return doc;
@@ -1158,7 +1092,6 @@ public:
 	fbx_simple_property(CustomFrameRate, float, -1.0f);
 
 private:
-	const PropertyTable *props = nullptr;
 	const Document &doc;
 };
 
@@ -1196,7 +1129,7 @@ public:
 		return globals.get();
 	}
 
-	const PropertyTable *GetMetadataProperties() const {
+	const PropertyTable &GetMetadataProperties() const {
 		return metadata_properties;
 	}
 
@@ -1293,7 +1226,7 @@ private:
 	std::vector<uint64_t> materials;
 	std::vector<uint64_t> skins;
 	mutable std::vector<const AnimationStack *> animationStacksResolved;
-	PropertyTable *metadata_properties = nullptr;
+	PropertyTable metadata_properties;
 	std::shared_ptr<FileGlobalSettings> globals = nullptr;
 };
 } // namespace FBXDocParser
diff --git a/modules/fbx/fbx_parser/FBXDocumentUtil.cpp b/modules/fbx/fbx_parser/FBXDocumentUtil.cpp
index df50a32c39..4a33024969 100644
--- a/modules/fbx/fbx_parser/FBXDocumentUtil.cpp
+++ b/modules/fbx/fbx_parser/FBXDocumentUtil.cpp
@@ -95,14 +95,14 @@ void DOMError(const std::string &message, const std::shared_ptr<Token> token) {
 	print_error("[FBX-DOM]" + String(message.c_str()) + ";" + String(token->StringContents().c_str()));
 }
 
-void DOMError(const std::string &message, const Element *element /*= NULL*/) {
+void DOMError(const std::string &message, const Element *element /*= nullptr*/) {
 	if (element) {
 		DOMError(message, element->KeyToken());
 	}
 	print_error("[FBX-DOM] " + String(message.c_str()));
 }
 
-void DOMError(const std::string &message, const std::shared_ptr<Element> element /*= NULL*/) {
+void DOMError(const std::string &message, const std::shared_ptr<Element> element /*= nullptr*/) {
 	if (element) {
 		DOMError(message, element->KeyToken());
 	}
@@ -117,7 +117,7 @@ void DOMWarning(const std::string &message, const Token *token) {
 	print_verbose("[FBX-DOM] warning:" + String(message.c_str()) + ";" + String(token->StringContents().c_str()));
 }
 
-void DOMWarning(const std::string &message, const Element *element /*= NULL*/) {
+void DOMWarning(const std::string &message, const Element *element /*= nullptr*/) {
 	if (element) {
 		DOMWarning(message, element->KeyToken());
 		return;
@@ -129,7 +129,7 @@ void DOMWarning(const std::string &message, const std::shared_ptr<Token> token)
 	print_verbose("[FBX-DOM] warning:" + String(message.c_str()) + ";" + String(token->StringContents().c_str()));
 }
 
-void DOMWarning(const std::string &message, const std::shared_ptr<Element> element /*= NULL*/) {
+void DOMWarning(const std::string &message, const std::shared_ptr<Element> element /*= nullptr*/) {
 	if (element) {
 		DOMWarning(message, element->KeyToken());
 		return;
@@ -137,36 +137,5 @@ void DOMWarning(const std::string &message, const std::shared_ptr<Element> eleme
 	print_verbose("[FBX-DOM] warning:" + String(message.c_str()));
 }
 
-// ------------------------------------------------------------------------------------------------
-// fetch a property table and the corresponding property template
-const PropertyTable *GetPropertyTable(const Document &doc,
-		const std::string &templateName,
-		const ElementPtr element,
-		const ScopePtr sc,
-		bool no_warn /*= false*/) {
-	// todo: make this an abstraction
-	const ElementPtr Properties70 = sc->GetElement("Properties70");
-	const PropertyTable *templateProps = static_cast<const PropertyTable *>(nullptr);
-
-	if (templateName.length()) {
-		PropertyTemplateMap::const_iterator it = doc.Templates().find(templateName);
-		if (it != doc.Templates().end()) {
-			templateProps = (*it).second;
-		}
-	}
-
-	if (!Properties70 || !Properties70->Compound()) {
-		if (!no_warn) {
-			DOMWarning("property table (Properties70) not found", element);
-		}
-		if (templateProps) {
-			return new const PropertyTable(templateProps);
-		} else {
-			return new const PropertyTable();
-		}
-	}
-
-	return new PropertyTable(Properties70, templateProps);
-}
 } // namespace Util
 } // namespace FBXDocParser
diff --git a/modules/fbx/fbx_parser/FBXDocumentUtil.h b/modules/fbx/fbx_parser/FBXDocumentUtil.h
index daa9de4a33..ba86191c4a 100644
--- a/modules/fbx/fbx_parser/FBXDocumentUtil.h
+++ b/modules/fbx/fbx_parser/FBXDocumentUtil.h
@@ -98,13 +98,6 @@ void DOMWarning(const std::string &message, const Element *element);
 void DOMWarning(const std::string &message, const std::shared_ptr<Token> token);
 void DOMWarning(const std::string &message, const std::shared_ptr<Element> element);
 
-// fetch a property table and the corresponding property template
-const PropertyTable *GetPropertyTable(const Document &doc,
-		const std::string &templateName,
-		const ElementPtr element,
-		const ScopePtr sc,
-		bool no_warn = false);
-
 // ------------------------------------------------------------------------------------------------
 template <typename T>
 const T *ProcessSimpleConnection(const Connection &con,
diff --git a/modules/fbx/fbx_parser/FBXMaterial.cpp b/modules/fbx/fbx_parser/FBXMaterial.cpp
index 219da1b2f4..bf8922267e 100644
--- a/modules/fbx/fbx_parser/FBXMaterial.cpp
+++ b/modules/fbx/fbx_parser/FBXMaterial.cpp
@@ -118,8 +118,6 @@ Material::Material(uint64_t id, const ElementPtr element, const Document &doc, c
 		DOMWarning("shading mode not recognized: " + shading, element);
 	}
 
-	props = GetPropertyTable(doc, templateName, element, sc);
-
 	// resolve texture links
 	const std::vector<const Connection *> &conns = doc.GetConnectionsByDestinationSequenced(ID());
 	for (const Connection *con : conns) {
@@ -163,10 +161,6 @@ Material::Material(uint64_t id, const ElementPtr element, const Document &doc, c
 
 // ------------------------------------------------------------------------------------------------
 Material::~Material() {
-	if (props != nullptr) {
-		delete props;
-		props = nullptr;
-	}
 }
 
 // ------------------------------------------------------------------------------------------------
@@ -219,17 +213,15 @@ Texture::Texture(uint64_t id, const ElementPtr element, const Document &doc, con
 		alphaSource = ParseTokenAsString(GetRequiredToken(Texture_Alpha_Source, 0));
 	}
 
-	props = GetPropertyTable(doc, "Texture.FbxFileTexture", element, sc);
-
 	// 3DS Max and FBX SDK use "Scaling" and "Translation" instead of "ModelUVScaling" and "ModelUVTranslation". Use these properties if available.
-	bool ok;
-	const Vector3 &scaling = PropertyGet<Vector3>(props, "Scaling", ok);
+	bool ok = true;
+	const Vector3 &scaling = PropertyGet<Vector3>(this, "Scaling", ok);
 	if (ok) {
 		uvScaling.x = scaling.x;
 		uvScaling.y = scaling.y;
 	}
 
-	const Vector3 &trans = PropertyGet<Vector3>(props, "Translation", ok);
+	const Vector3 &trans = PropertyGet<Vector3>(this, "Translation", ok);
 	if (ok) {
 		uvTrans.x = trans.x;
 		uvTrans.y = trans.y;
@@ -254,10 +246,6 @@ Texture::Texture(uint64_t id, const ElementPtr element, const Document &doc, con
 }
 
 Texture::~Texture() {
-	if (props != nullptr) {
-		delete props;
-		props = nullptr;
-	}
 }
 
 LayeredTexture::LayeredTexture(uint64_t id, const ElementPtr element, const Document & /*doc*/, const std::string &name) :
@@ -337,7 +325,7 @@ Video::Video(uint64_t id, const ElementPtr element, const Document &doc, const s
 					DOMError("embedded content is not surrounded by quotation marks", element);
 				} else {
 					size_t targetLength = 0;
-					auto numTokens = Content->Tokens().size();
+					const size_t numTokens = Content->Tokens().size();
 					// First time compute size (it could be large like 64Gb and it is good to allocate it once)
 					for (uint32_t tokenIdx = 0; tokenIdx < numTokens; ++tokenIdx) {
 						const Token *dataToken = GetRequiredToken(Content, tokenIdx);
@@ -390,18 +378,11 @@ Video::Video(uint64_t id, const ElementPtr element, const Document &doc, const s
 			//									   runtimeError.what());
 		}
 	}
-
-	props = GetPropertyTable(doc, "Video.FbxVideo", element, sc);
 }
 
 Video::~Video() {
 	if (content) {
 		delete[] content;
 	}
-
-	if (props != nullptr) {
-		delete props;
-		props = nullptr;
-	}
 }
 } // namespace FBXDocParser
diff --git a/modules/fbx/fbx_parser/FBXMeshGeometry.h b/modules/fbx/fbx_parser/FBXMeshGeometry.h
index 710e644c68..05493c4aec 100644
--- a/modules/fbx/fbx_parser/FBXMeshGeometry.h
+++ b/modules/fbx/fbx_parser/FBXMeshGeometry.h
@@ -96,7 +96,7 @@ public:
 	Geometry(uint64_t id, const ElementPtr element, const std::string &name, const Document &doc);
 	virtual ~Geometry();
 
-	/** Get the Skin attached to this geometry or NULL */
+	/** Get the Skin attached to this geometry or nullptr */
 	const Skin *DeformerSkin() const;
 
 	const std::vector<const BlendShape *> &get_blend_shapes() const;
diff --git a/modules/fbx/fbx_parser/FBXModel.cpp b/modules/fbx/fbx_parser/FBXModel.cpp
index 767994441f..03c9de0c35 100644
--- a/modules/fbx/fbx_parser/FBXModel.cpp
+++ b/modules/fbx/fbx_parser/FBXModel.cpp
@@ -98,16 +98,11 @@ Model::Model(uint64_t id, const ElementPtr element, const Document &doc, const s
 		culling = ParseTokenAsString(GetRequiredToken(Culling, 0));
 	}
 
-	props = GetPropertyTable(doc, "Model.FbxNode", element, sc);
 	ResolveLinks(element, doc);
 }
 
 // ------------------------------------------------------------------------------------------------
 Model::~Model() {
-	if (props != nullptr) {
-		delete props;
-		props = nullptr;
-	}
 }
 
 ModelLimbNode::ModelLimbNode(uint64_t id, const ElementPtr element, const Document &doc, const std::string &name) :
diff --git a/modules/fbx/fbx_parser/FBXNodeAttribute.cpp b/modules/fbx/fbx_parser/FBXNodeAttribute.cpp
index 2749fc9f4d..15184a0f5d 100644
--- a/modules/fbx/fbx_parser/FBXNodeAttribute.cpp
+++ b/modules/fbx/fbx_parser/FBXNodeAttribute.cpp
@@ -84,16 +84,7 @@ using namespace Util;
 
 // ------------------------------------------------------------------------------------------------
 NodeAttribute::NodeAttribute(uint64_t id, const ElementPtr element, const Document &doc, const std::string &name) :
-		Object(id, element, name), props() {
-	const ScopePtr sc = GetRequiredScope(element);
-
-	const std::string &classname = ParseTokenAsString(GetRequiredToken(element, 2));
-
-	// hack on the deriving type but Null/LimbNode attributes are the only case in which
-	// the property table is by design absent and no warning should be generated
-	// for it.
-	const bool is_null_or_limb = !strcmp(classname.c_str(), "Null") || !strcmp(classname.c_str(), "LimbNode");
-	props = GetPropertyTable(doc, "NodeAttribute.Fbx" + classname, element, sc, is_null_or_limb);
+		Object(id, element, name) {
 }
 
 // ------------------------------------------------------------------------------------------------
diff --git a/modules/fbx/fbx_parser/FBXParser.cpp b/modules/fbx/fbx_parser/FBXParser.cpp
index 166d98bb8c..98435b5c0f 100644
--- a/modules/fbx/fbx_parser/FBXParser.cpp
+++ b/modules/fbx/fbx_parser/FBXParser.cpp
@@ -74,8 +74,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *  @brief Implementation of the FBX parser and the rudimentary DOM that we use
  */
 
-#include "thirdparty/zlib/zlib.h"
 #include <stdlib.h> /* strtol */
+#include <zlib.h>
 
 #include "ByteSwapper.h"
 #include "FBXParseTools.h"
@@ -131,6 +131,8 @@ Element::Element(const TokenPtr key_token, Parser &parser) :
 
 			if (!n) {
 				print_error("unexpected end of file, expected bracket, comma or key" + String(parser.LastToken()->StringContents().c_str()));
+				parser.corrupt = true;
+				return;
 			}
 
 			const TokenType ty = n->Type();
@@ -143,6 +145,8 @@ Element::Element(const TokenPtr key_token, Parser &parser) :
 
 			if (ty != TokenType_OPEN_BRACKET && ty != TokenType_CLOSE_BRACKET && ty != TokenType_COMMA && ty != TokenType_KEY) {
 				print_error("unexpected token; expected bracket, comma or key" + String(n->StringContents().c_str()));
+				parser.corrupt = true;
+				return;
 			}
 		}
 
@@ -150,11 +154,17 @@ Element::Element(const TokenPtr key_token, Parser &parser) :
 			compound = new_Scope(parser);
 			parser.scopes.push_back(compound);
 
+			if (parser.corrupt) {
+				return;
+			}
+
 			// current token should be a TOK_CLOSE_BRACKET
 			n = parser.CurrentToken();
 
 			if (n && n->Type() != TokenType_CLOSE_BRACKET) {
 				print_error("expected closing bracket" + String(n->StringContents().c_str()));
+				parser.corrupt = true;
+				return;
 			}
 
 			parser.AdvanceToNextToken();
@@ -173,22 +183,31 @@ Scope::Scope(Parser &parser, bool topLevel) {
 		TokenPtr t = parser.CurrentToken();
 		if (t->Type() != TokenType_OPEN_BRACKET) {
 			print_error("expected open bracket" + String(t->StringContents().c_str()));
+			parser.corrupt = true;
+			return;
 		}
 	}
 
 	TokenPtr n = parser.AdvanceToNextToken();
 	if (n == nullptr) {
 		print_error("unexpected end of file");
+		parser.corrupt = true;
+		return;
 	}
 
 	// note: empty scopes are allowed
 	while (n && n->Type() != TokenType_CLOSE_BRACKET) {
 		if (n->Type() != TokenType_KEY) {
 			print_error("unexpected token, expected TOK_KEY" + String(n->StringContents().c_str()));
+			parser.corrupt = true;
+			return;
 		}
 
 		const std::string str = n->StringContents();
 
+		if (parser.corrupt) {
+			return;
+		}
 		// std::multimap<std::string, ElementPtr> (key and value)
 		elements.insert(ElementMap::value_type(str, new_Element(n, parser)));
 
@@ -216,7 +235,7 @@ Scope::~Scope() {
 
 // ------------------------------------------------------------------------------------------------
 Parser::Parser(const TokenList &tokens, bool is_binary) :
-		tokens(tokens), cursor(tokens.begin()), is_binary(is_binary) {
+		corrupt(false), tokens(tokens), cursor(tokens.begin()), is_binary(is_binary) {
 	root = new_Scope(*this, true);
 	scopes.push_back(root);
 }
@@ -1187,7 +1206,7 @@ std::string ParseTokenAsString(const TokenPtr t) {
 
 // ------------------------------------------------------------------------------------------------
 // extract a required element from a scope, abort if the element cannot be found
-ElementPtr GetRequiredElement(const ScopePtr sc, const std::string &index, const ElementPtr element /*= NULL*/) {
+ElementPtr GetRequiredElement(const ScopePtr sc, const std::string &index, const ElementPtr element /*= nullptr*/) {
 	const ElementPtr el = sc->GetElement(index);
 	TokenPtr token = el->KeyToken();
 	ERR_FAIL_COND_V(!token, nullptr);
@@ -1208,7 +1227,7 @@ bool HasElement(const ScopePtr sc, const std::string &index) {
 
 // ------------------------------------------------------------------------------------------------
 // extract a required element from a scope, abort if the element cannot be found
-ElementPtr GetOptionalElement(const ScopePtr sc, const std::string &index, const ElementPtr element /*= NULL*/) {
+ElementPtr GetOptionalElement(const ScopePtr sc, const std::string &index, const ElementPtr element /*= nullptr*/) {
 	const ElementPtr el = sc->GetElement(index);
 	return el;
 }
@@ -1231,6 +1250,21 @@ ScopePtr GetRequiredScope(const ElementPtr el) {
 }
 
 // ------------------------------------------------------------------------------------------------
+// extract an optional compound scope: returns nullptr, without reporting an error, when it is absent
+ScopePtr GetOptionalScope(const ElementPtr el) {
+	// a null element has no scope to offer
+	if (!el) {
+		return nullptr;
+	}
+	ScopePtr compound = el->Compound();
+	// both the key token and the compound scope must be present
+	if (!el->KeyToken() || !compound) {
+		return nullptr;
+	}
+	return compound;
+}
+
+// ------------------------------------------------------------------------------------------------
 // get token at a particular index
 TokenPtr GetRequiredToken(const ElementPtr el, unsigned int index) {
 	if (el) {
diff --git a/modules/fbx/fbx_parser/FBXParser.h b/modules/fbx/fbx_parser/FBXParser.h
index 37d27d3dca..8b248e8791 100644
--- a/modules/fbx/fbx_parser/FBXParser.h
+++ b/modules/fbx/fbx_parser/FBXParser.h
@@ -160,7 +160,7 @@ public:
 	}
 
 	ElementPtr FindElementCaseInsensitive(const std::string &elementName) const {
-		for (auto element = elements.begin(); element != elements.end(); ++element) {
+		for (FBXDocParser::ElementMap::const_iterator element = elements.begin(); element != elements.end(); ++element) {
 			if (element->first.compare(elementName)) {
 				return element->second;
 			}
@@ -199,6 +199,10 @@ public:
 		return is_binary;
 	}
 
+	bool IsCorrupt() const {
+		return corrupt;
+	}
+
 private:
 	friend class Scope;
 	friend class Element;
@@ -208,6 +212,7 @@ private:
 	TokenPtr CurrentToken() const;
 
 private:
+	bool corrupt = false;
 	ScopeList scopes;
 	const TokenList &tokens;
 
@@ -249,6 +254,8 @@ bool HasElement(const ScopePtr sc, const std::string &index);
 // extract a required element from a scope, abort if the element cannot be found
 ElementPtr GetRequiredElement(const ScopePtr sc, const std::string &index, const ElementPtr element = nullptr);
 ScopePtr GetRequiredScope(const ElementPtr el); // New in 2020. (less likely to destroy application)
+ScopePtr GetOptionalScope(const ElementPtr el); // New in 2021. (even LESS likely to destroy application now)
+
 ElementPtr GetOptionalElement(const ScopePtr sc, const std::string &index, const ElementPtr element = nullptr);
 // extract required compound scope
 ScopePtr GetRequiredScope(const ElementPtr el);
diff --git a/modules/fbx/fbx_parser/FBXProperties.cpp b/modules/fbx/fbx_parser/FBXProperties.cpp
index 84e71512d6..37717e9109 100644
--- a/modules/fbx/fbx_parser/FBXProperties.cpp
+++ b/modules/fbx/fbx_parser/FBXProperties.cpp
@@ -94,7 +94,7 @@ Property::~Property() {
 namespace {
 
 // ------------------------------------------------------------------------------------------------
-// read a typed property out of a FBX element. The return value is NULL if the property cannot be read.
+// read a typed property out of a FBX element. The return value is nullptr if the property cannot be read.
 PropertyPtr ReadTypedProperty(const ElementPtr element) {
 	//ai_assert(element.KeyToken().StringContents() == "P");
 
@@ -145,19 +145,33 @@ std::string PeekPropertyName(const Element &element) {
 } // namespace
 
 // ------------------------------------------------------------------------------------------------
-PropertyTable::PropertyTable() {
+PropertyTable::PropertyTable() :
+		element(nullptr) {
 }
 
-// ------------------------------------------------------------------------------------------------
-PropertyTable::PropertyTable(const PropertyTable *templateProps) :
-		templateProps(templateProps), element() {
+// Used for FBX objects (as opposed to document metadata): reads the element's Properties70 block.
+PropertyTable::PropertyTable(const ElementPtr element) :
+		element(element) {
+	Setup(element);
 }
 
 // ------------------------------------------------------------------------------------------------
-PropertyTable::PropertyTable(const ElementPtr element, const PropertyTable *templateProps) :
-		templateProps(templateProps), element(element) {
-	const ScopePtr scope = GetRequiredScope(element);
-	ERR_FAIL_COND(!scope);
+PropertyTable::~PropertyTable() {
+	for (PropertyMap::value_type &v : props) {
+		delete v.second;
+	}
+}
+
+void PropertyTable::Setup(ElementPtr ptr) {
+	const ScopePtr sc = GetRequiredScope(ptr);
+	const ElementPtr Properties70 = sc ? sc->GetElement("Properties70") : nullptr;
+	const ScopePtr scope = GetOptionalScope(Properties70);
+
+	// A missing object scope or a missing Properties70 scope means there is nothing to read.
+	if (!scope) {
+		return; // Not an error: the object simply has no properties, so none are added.
+	}
+
 	for (const ElementMap::value_type &v : scope->Elements()) {
 		if (v.first != "P") {
 			DOMWarning("expected only P elements in property table", v.second);
@@ -182,13 +196,6 @@ PropertyTable::PropertyTable(const ElementPtr element, const PropertyTable *temp
 }
 
 // ------------------------------------------------------------------------------------------------
-PropertyTable::~PropertyTable() {
-	for (PropertyMap::value_type &v : props) {
-		delete v.second;
-	}
-}
-
-// ------------------------------------------------------------------------------------------------
 PropertyPtr PropertyTable::Get(const std::string &name) const {
 	PropertyMap::const_iterator it = props.find(name);
 	if (it == props.end()) {
@@ -203,10 +210,6 @@ PropertyPtr PropertyTable::Get(const std::string &name) const {
 
 		if (it == props.end()) {
 			// check property template
-			if (templateProps) {
-				return templateProps->Get(name);
-			}
-
 			return nullptr;
 		}
 	}
diff --git a/modules/fbx/fbx_parser/FBXProperties.h b/modules/fbx/fbx_parser/FBXProperties.h
index 0595b25fa7..bfd27ac94e 100644
--- a/modules/fbx/fbx_parser/FBXProperties.h
+++ b/modules/fbx/fbx_parser/FBXProperties.h
@@ -137,36 +137,31 @@ class PropertyTable {
 public:
 	// in-memory property table with no source element
 	PropertyTable();
-	PropertyTable(const PropertyTable *templateProps);
-	PropertyTable(const ElementPtr element, const PropertyTable *templateProps);
-	~PropertyTable();
+	PropertyTable(const ElementPtr element);
+	virtual ~PropertyTable();
 
 	PropertyPtr Get(const std::string &name) const;
+	void Setup(ElementPtr ptr);
 
 	// PropertyTable's need not be coupled with FBX elements so this can be NULL
-	ElementPtr GetElement() const {
+	ElementPtr GetElement() {
 		return element;
 	}
 
-	PropertyMap &GetProperties() const {
+	PropertyMap &GetProperties() {
 		return props;
 	}
 
-	const LazyPropertyMap &GetLazyProperties() const {
+	const LazyPropertyMap &GetLazyProperties() {
 		return lazyProps;
 	}
 
-	const PropertyTable *TemplateProps() const {
-		return templateProps;
-	}
-
 	DirectPropertyMap GetUnparsedProperties() const;
 
 private:
 	LazyPropertyMap lazyProps;
 	mutable PropertyMap props;
-	const PropertyTable *templateProps = nullptr;
-	const ElementPtr element = nullptr;
+	ElementPtr element = nullptr;
 };
 
 // ------------------------------------------------------------------------------------------------
@@ -191,16 +186,12 @@ template <typename T>
 inline T PropertyGet(const PropertyTable *in, const std::string &name, bool &result, bool useTemplate = false) {
-	PropertyPtr prop = in->Get(name);
+	// Guarded lookup: the null check on `in` below must be reachable before `in` is ever dereferenced.
+	PropertyPtr prop = in ? in->Get(name) : nullptr;
 	if (nullptr == prop) {
-		if (!useTemplate) {
-			result = false;
-			return T();
-		}
-		const PropertyTable *templ = in->TemplateProps();
-		if (nullptr == templ) {
+		if (nullptr == in) {
 			result = false;
 			return T();
 		}
-		prop = templ->Get(name);
+		prop = in->Get(name);
 		if (nullptr == prop) {
 			result = false;
 			return T();
diff --git a/modules/fbx/fbx_parser/FBXTokenizer.cpp b/modules/fbx/fbx_parser/FBXTokenizer.cpp
index ea4568fe32..81c5b128e8 100644
--- a/modules/fbx/fbx_parser/FBXTokenizer.cpp
+++ b/modules/fbx/fbx_parser/FBXTokenizer.cpp
@@ -141,7 +141,7 @@ void ProcessDataToken(TokenList &output_tokens, const char *&start, const char *
 } // namespace
 
 // ------------------------------------------------------------------------------------------------
-void Tokenize(TokenList &output_tokens, const char *input, size_t length) {
+void Tokenize(TokenList &output_tokens, const char *input, size_t length, bool &corrupt) {
 	// line and column numbers numbers are one-based
 	unsigned int line = 1;
 	unsigned int column = 1;
@@ -185,6 +185,8 @@ void Tokenize(TokenList &output_tokens, const char *input, size_t length) {
 			case '\"':
 				if (token_begin) {
 					TokenizeError("unexpected double-quote", line, column);
+					corrupt = true;
+					return;
 				}
 				token_begin = cur;
 				in_double_quotes = true;
diff --git a/modules/fbx/fbx_parser/FBXTokenizer.h b/modules/fbx/fbx_parser/FBXTokenizer.h
index 1e7e5e6535..184d0fd894 100644
--- a/modules/fbx/fbx_parser/FBXTokenizer.h
+++ b/modules/fbx/fbx_parser/FBXTokenizer.h
@@ -187,7 +187,7 @@ typedef std::vector<TokenPtr> TokenList;
  * @param output_tokens Receives a list of all tokens in the input data.
  * @param input_buffer Textual input buffer to be processed, 0-terminated.
  * @print_error if something goes wrong */
-void Tokenize(TokenList &output_tokens, const char *input, size_t length);
+void Tokenize(TokenList &output_tokens, const char *input, size_t length, bool &corrupt);
 
 /** Tokenizer function for binary FBX files.
  *
@@ -197,7 +197,7 @@ void Tokenize(TokenList &output_tokens, const char *input, size_t length);
  * @param input_buffer Binary input buffer to be processed.
  * @param length Length of input buffer, in bytes. There is no 0-terminal.
  * @print_error if something goes wrong */
-void TokenizeBinary(TokenList &output_tokens, const char *input, size_t length);
+void TokenizeBinary(TokenList &output_tokens, const char *input, size_t length, bool &corrupt);
 } // namespace FBXDocParser
 
 #endif // FBX_TOKENIZER_H
diff --git a/modules/fbx/fbx_parser/FBXUtil.cpp b/modules/fbx/fbx_parser/FBXUtil.cpp
index 4295cb6f5e..1f14a69099 100644
--- a/modules/fbx/fbx_parser/FBXUtil.cpp
+++ b/modules/fbx/fbx_parser/FBXUtil.cpp
@@ -121,7 +121,7 @@ static const uint8_t base64DecodeTable[128] = {
 };
 
 uint8_t DecodeBase64(char ch) {
-	const auto idx = static_cast<uint8_t>(ch);
+	const uint8_t idx = static_cast<uint8_t>(ch);
 	if (idx > 127) {
 		return 255;
 	}
diff --git a/modules/fbx/tools/import_utils.h b/modules/fbx/tools/import_utils.h
index bea28ffeda..cf0f811e35 100644
--- a/modules/fbx/tools/import_utils.h
+++ b/modules/fbx/tools/import_utils.h
@@ -267,7 +267,7 @@ public:
 	  */
 	// static void set_texture_mapping_mode(aiTextureMapMode *map_mode, Ref<ImageTexture> texture) {
 	// 	ERR_FAIL_COND(texture.is_null());
-	// 	ERR_FAIL_COND(map_mode == NULL);
+	// 	ERR_FAIL_COND(map_mode == nullptr);
 	// 	aiTextureMapMode tex_mode = map_mode[0];
 
 	// 	int32_t flags = Texture::FLAGS_DEFAULT;
@@ -382,7 +382,7 @@ public:
 	// 		String &path,
 	// 		AssimpImageData &image_state) {
 	// 	aiString ai_filename = aiString();
-	// 	if (AI_SUCCESS == ai_material->GetTexture(texture_type, 0, &ai_filename, NULL, NULL, NULL, NULL, image_state.map_mode)) {
+	// 	if (AI_SUCCESS == ai_material->GetTexture(texture_type, 0, &ai_filename, nullptr, nullptr, nullptr, nullptr, image_state.map_mode)) {
 	// 		return CreateAssimpTexture(state, ai_filename, filename, path, image_state);
 	// 	}
 
diff --git a/modules/gdnative/nativescript/nativescript.cpp b/modules/gdnative/nativescript/nativescript.cpp
index f795bef59f..3283f28de5 100644
--- a/modules/gdnative/nativescript/nativescript.cpp
+++ b/modules/gdnative/nativescript/nativescript.cpp
@@ -41,6 +41,8 @@
 #include "core/os/file_access.h"
 #include "core/os/os.h"
 
+#include "main/main.h"
+
 #include "scene/main/scene_tree.h"
 #include "scene/resources/resource_format_text.h"
 
@@ -1248,6 +1250,7 @@ void NativeScriptLanguage::init() {
 		if (generate_c_api(E->next()->get()) != OK) {
 			ERR_PRINT("Failed to generate C API\n");
 		}
+		Main::cleanup(true);
 		exit(0);
 	}
 
@@ -1257,6 +1260,7 @@ void NativeScriptLanguage::init() {
 		if (generate_c_builtin_api(E->next()->get()) != OK) {
 			ERR_PRINT("Failed to generate C builtin API\n");
 		}
+		Main::cleanup(true);
 		exit(0);
 	}
 #endif
diff --git a/modules/gdnative/nativescript/nativescript.h b/modules/gdnative/nativescript/nativescript.h
index d6ba2bbec1..4bd54f9c46 100644
--- a/modules/gdnative/nativescript/nativescript.h
+++ b/modules/gdnative/nativescript/nativescript.h
@@ -90,8 +90,8 @@ struct NativeScriptDesc {
 	bool is_tool = false;
 
 	inline NativeScriptDesc() {
-		zeromem(&create_func, sizeof(godot_nativescript_instance_create_func));
-		zeromem(&destroy_func, sizeof(godot_nativescript_instance_destroy_func));
+		memset(&create_func, 0, sizeof(godot_nativescript_instance_create_func)); // Zero-fill create_func.
+		memset(&destroy_func, 0, sizeof(godot_nativescript_instance_destroy_func)); // Zero-fill destroy_func.
 	}
 };
 
diff --git a/modules/gdnative/tests/test_variant.h b/modules/gdnative/tests/test_variant.h
index aeceb6e68f..2850036604 100644
--- a/modules/gdnative/tests/test_variant.h
+++ b/modules/gdnative/tests/test_variant.h
@@ -107,7 +107,7 @@ TEST_CASE("[GDNative Variant] Variant call") {
 	godot_string_name_new_with_latin1_chars(&method, "is_valid_identifier");
 
 	godot_variant_call_error error;
-	godot_variant_call(&self, &method, NULL, 0, &ret, &error);
+	godot_variant_call(&self, &method, nullptr, 0, &ret, &error);
 
 	CHECK(godot_variant_get_type(&ret) == GODOT_VARIANT_TYPE_BOOL);
 	CHECK(godot_variant_as_bool(&ret));
diff --git a/modules/gdnavigation/gd_navigation_server.cpp b/modules/gdnavigation/gd_navigation_server.cpp
index 39f208c7a4..88ef434e0f 100644
--- a/modules/gdnavigation/gd_navigation_server.cpp
+++ b/modules/gdnavigation/gd_navigation_server.cpp
@@ -122,7 +122,7 @@ GdNavigationServer::~GdNavigationServer() {
 }
 
 void GdNavigationServer::add_command(SetCommand *command) const {
-	auto mut_this = const_cast<GdNavigationServer *>(this);
+	GdNavigationServer *mut_this = const_cast<GdNavigationServer *>(this);
 	{
 		MutexLock lock(commands_mutex);
 		mut_this->commands.push_back(command);
@@ -130,7 +130,7 @@ void GdNavigationServer::add_command(SetCommand *command) const {
 }
 
 RID GdNavigationServer::map_create() const {
-	auto mut_this = const_cast<GdNavigationServer *>(this);
+	GdNavigationServer *mut_this = const_cast<GdNavigationServer *>(this);
 	MutexLock lock(mut_this->operations_mutex);
 	NavMap *space = memnew(NavMap);
 	RID rid = map_owner.make_rid(space);
@@ -240,7 +240,7 @@ RID GdNavigationServer::map_get_closest_point_owner(RID p_map, const Vector3 &p_
 }
 
 RID GdNavigationServer::region_create() const {
-	auto mut_this = const_cast<GdNavigationServer *>(this);
+	GdNavigationServer *mut_this = const_cast<GdNavigationServer *>(this);
 	MutexLock lock(mut_this->operations_mutex);
 	NavRegion *reg = memnew(NavRegion);
 	RID rid = region_owner.make_rid(reg);
@@ -330,7 +330,7 @@ Vector3 GdNavigationServer::region_get_connection_pathway_end(RID p_region, int
 }
 
 RID GdNavigationServer::agent_create() const {
-	auto mut_this = const_cast<GdNavigationServer *>(this);
+	GdNavigationServer *mut_this = const_cast<GdNavigationServer *>(this);
 	MutexLock lock(mut_this->operations_mutex);
 	RvoAgent *agent = memnew(RvoAgent());
 	RID rid = agent_owner.make_rid(agent);
@@ -504,7 +504,7 @@ COMMAND_1(free, RID, p_object) {
 }
 
 void GdNavigationServer::set_active(bool p_active) const {
-	auto mut_this = const_cast<GdNavigationServer *>(this);
+	GdNavigationServer *mut_this = const_cast<GdNavigationServer *>(this); // Non-const alias of this, used below to lock and to write the flag from a const method.
 	MutexLock lock(mut_this->operations_mutex);
 	mut_this->active = p_active;
 }
diff --git a/modules/gdnavigation/nav_map.cpp b/modules/gdnavigation/nav_map.cpp
index 464082221f..2513c62b6a 100644
--- a/modules/gdnavigation/nav_map.cpp
+++ b/modules/gdnavigation/nav_map.cpp
@@ -168,7 +168,7 @@ Vector<Vector3> NavMap::get_path(Vector3 p_origin, Vector3 p_destination, bool p
 				const Vector3 new_entry = Geometry3D::get_closest_point_to_segment(least_cost_poly->entry, pathway);
 				const float new_distance = least_cost_poly->entry.distance_to(new_entry) + least_cost_poly->traveled_distance;
 
-				auto it = std::find(
+				const std::vector<gd::NavigationPoly>::iterator it = std::find(
 						navigation_polys.begin(),
 						navigation_polys.end(),
 						gd::NavigationPoly(connection.polygon));
@@ -504,7 +504,7 @@ void NavMap::add_region(NavRegion *p_region) {
 }
 
 void NavMap::remove_region(NavRegion *p_region) {
-	std::vector<NavRegion *>::iterator it = std::find(regions.begin(), regions.end(), p_region);
+	const std::vector<NavRegion *>::iterator it = std::find(regions.begin(), regions.end(), p_region);
 	if (it != regions.end()) {
 		regions.erase(it);
 		regenerate_links = true;
@@ -524,7 +524,7 @@ void NavMap::add_agent(RvoAgent *agent) {
 
 void NavMap::remove_agent(RvoAgent *agent) {
 	remove_agent_as_controlled(agent);
-	auto it = std::find(agents.begin(), agents.end(), agent);
+	const std::vector<RvoAgent *>::iterator it = std::find(agents.begin(), agents.end(), agent);
 	if (it != agents.end()) {
 		agents.erase(it);
 		agents_dirty = true;
@@ -540,7 +540,7 @@ void NavMap::set_agent_as_controlled(RvoAgent *agent) {
 }
 
 void NavMap::remove_agent_as_controlled(RvoAgent *agent) {
-	auto it = std::find(controlled_agents.begin(), controlled_agents.end(), agent);
+	const std::vector<RvoAgent *>::iterator it = std::find(controlled_agents.begin(), controlled_agents.end(), agent);
 	if (it != controlled_agents.end()) {
 		controlled_agents.erase(it);
 	}
diff --git a/modules/gdscript/gdscript.cpp b/modules/gdscript/gdscript.cpp
index c9c5d00aa5..5f590383d0 100644
--- a/modules/gdscript/gdscript.cpp
+++ b/modules/gdscript/gdscript.cpp
@@ -45,6 +45,10 @@
 #include "gdscript_parser.h"
 #include "gdscript_warning.h"
 
+#ifdef TESTS_ENABLED
+#include "tests/gdscript_test_runner.h"
+#endif
+
 ///////////////////////////
 
 GDScriptNativeClass::GDScriptNativeClass(const StringName &p_name) {
@@ -1766,6 +1770,10 @@ void GDScriptLanguage::init() {
 	for (List<Engine::Singleton>::Element *E = singletons.front(); E; E = E->next()) {
 		_add_global(E->get().name, E->get().ptr);
 	}
+
+#ifdef TESTS_ENABLED
+	GDScriptTests::GDScriptTestRunner::handle_cmdline();
+#endif
 }
 
 String GDScriptLanguage::get_type() const {
diff --git a/modules/gdscript/gdscript.h b/modules/gdscript/gdscript.h
index 12c909fd4f..98da5ad4cb 100644
--- a/modules/gdscript/gdscript.h
+++ b/modules/gdscript/gdscript.h
@@ -270,6 +270,7 @@ public:
 class GDScriptInstance : public ScriptInstance {
 	friend class GDScript;
 	friend class GDScriptFunction;
+	friend class GDScriptLambdaCallable;
 	friend class GDScriptCompiler;
 	friend struct GDScriptUtilityFunctionsDefinitions;
 
diff --git a/modules/gdscript/gdscript_analyzer.cpp b/modules/gdscript/gdscript_analyzer.cpp
index bdca64c146..17ae52f3ab 100644
--- a/modules/gdscript/gdscript_analyzer.cpp
+++ b/modules/gdscript/gdscript_analyzer.cpp
@@ -856,6 +856,7 @@ void GDScriptAnalyzer::resolve_node(GDScriptParser::Node *p_node) {
 		case GDScriptParser::Node::DICTIONARY:
 		case GDScriptParser::Node::GET_NODE:
 		case GDScriptParser::Node::IDENTIFIER:
+		case GDScriptParser::Node::LAMBDA:
 		case GDScriptParser::Node::LITERAL:
 		case GDScriptParser::Node::PRELOAD:
 		case GDScriptParser::Node::SELF:
@@ -1458,6 +1459,9 @@ void GDScriptAnalyzer::reduce_expression(GDScriptParser::ExpressionNode *p_expre
 		case GDScriptParser::Node::IDENTIFIER:
 			reduce_identifier(static_cast<GDScriptParser::IdentifierNode *>(p_expression));
 			break;
+		case GDScriptParser::Node::LAMBDA:
+			reduce_lambda(static_cast<GDScriptParser::LambdaNode *>(p_expression));
+			break;
 		case GDScriptParser::Node::LITERAL:
 			reduce_literal(static_cast<GDScriptParser::LiteralNode *>(p_expression));
 			break;
@@ -2061,6 +2065,12 @@ void GDScriptAnalyzer::reduce_call(GDScriptParser::CallNode *p_call, bool is_awa
 		is_self = true;
 	} else if (callee_type == GDScriptParser::Node::SUBSCRIPT) {
 		GDScriptParser::SubscriptNode *subscript = static_cast<GDScriptParser::SubscriptNode *>(p_call->callee);
+		if (subscript->base == nullptr) {
+			// Invalid syntax, error already set on parser.
+			p_call->set_datatype(call_type);
+			mark_node_unsafe(p_call);
+			return;
+		}
 		if (!subscript->is_attribute) {
 			// Invalid call. Error already sent in parser.
 			// TODO: Could check if Callable here.
@@ -2097,6 +2107,8 @@ void GDScriptAnalyzer::reduce_call(GDScriptParser::CallNode *p_call, bool is_awa
 
 		if (is_self && parser->current_function != nullptr && parser->current_function->is_static && !is_static) {
 			push_error(vformat(R"*(Cannot call non-static function "%s()" from static function "%s()".)*", p_call->function_name, parser->current_function->identifier->name), p_call->callee);
+		} else if (is_self && !is_static && !lambda_stack.is_empty()) {
+			push_error(vformat(R"*(Cannot call non-static function "%s()" from a lambda function.)*", p_call->function_name), p_call->callee);
 		}
 
 		call_type = return_type;
@@ -2219,6 +2231,8 @@ void GDScriptAnalyzer::reduce_get_node(GDScriptParser::GetNodeNode *p_get_node)
 
 	if (!ClassDB::is_parent_class(GDScriptParser::get_real_class_name(parser->current_class->base_type.native_type), result.native_type)) {
 		push_error(R"*(Cannot use shorthand "get_node()" notation ("$") on a class that isn't a node.)*", p_get_node);
+	} else if (!lambda_stack.is_empty()) {
+		push_error(R"*(Cannot use shorthand "get_node()" notation ("$") inside a lambda. Use a captured variable instead.)*", p_get_node);
 	}
 
 	p_get_node->set_datatype(result);
@@ -2346,6 +2360,7 @@ void GDScriptAnalyzer::reduce_identifier_from_base(GDScriptParser::IdentifierNod
 				case GDScriptParser::ClassNode::Member::ENUM_VALUE:
 					p_identifier->is_constant = true;
 					p_identifier->reduced_value = member.enum_value.value;
+					p_identifier->source = GDScriptParser::IdentifierNode::MEMBER_CONSTANT;
 					break;
 				case GDScriptParser::ClassNode::Member::VARIABLE:
 					p_identifier->source = GDScriptParser::IdentifierNode::MEMBER_VARIABLE;
@@ -2446,42 +2461,65 @@ void GDScriptAnalyzer::reduce_identifier(GDScriptParser::IdentifierNode *p_ident
 		}
 	}
 
+	bool found_source = false;
 	// Check if identifier is local.
 	// If that's the case, the declaration already was solved before.
 	switch (p_identifier->source) {
 		case GDScriptParser::IdentifierNode::FUNCTION_PARAMETER:
 			p_identifier->set_datatype(p_identifier->parameter_source->get_datatype());
-			return;
+			found_source = true;
+			break;
 		case GDScriptParser::IdentifierNode::LOCAL_CONSTANT:
 		case GDScriptParser::IdentifierNode::MEMBER_CONSTANT:
 			p_identifier->set_datatype(p_identifier->constant_source->get_datatype());
 			p_identifier->is_constant = true;
 			// TODO: Constant should have a value on the node itself.
 			p_identifier->reduced_value = p_identifier->constant_source->initializer->reduced_value;
-			return;
+			found_source = true;
+			break;
 		case GDScriptParser::IdentifierNode::MEMBER_VARIABLE:
 			p_identifier->variable_source->usages++;
 			[[fallthrough]];
 		case GDScriptParser::IdentifierNode::LOCAL_VARIABLE:
 			p_identifier->set_datatype(p_identifier->variable_source->get_datatype());
-			return;
+			found_source = true;
+			break;
 		case GDScriptParser::IdentifierNode::LOCAL_ITERATOR:
 			p_identifier->set_datatype(p_identifier->bind_source->get_datatype());
-			return;
+			found_source = true;
+			break;
 		case GDScriptParser::IdentifierNode::LOCAL_BIND: {
 			GDScriptParser::DataType result = p_identifier->bind_source->get_datatype();
 			result.is_constant = true;
 			p_identifier->set_datatype(result);
-			return;
-		}
+			found_source = true;
+		} break;
 		case GDScriptParser::IdentifierNode::UNDEFINED_SOURCE:
 			break;
 	}
 
 	// Not a local, so check members.
-	reduce_identifier_from_base(p_identifier);
-	if (p_identifier->get_datatype().is_set()) {
-		// Found.
+	if (!found_source) {
+		reduce_identifier_from_base(p_identifier);
+		if (p_identifier->source != GDScriptParser::IdentifierNode::UNDEFINED_SOURCE || p_identifier->get_datatype().is_set()) {
+			// Found.
+			found_source = true;
+		}
+	}
+
+	if (found_source) {
+		// If the identifier is local, check if it's any kind of capture by comparing its source function with the enclosing lambda functions.
+		// Only locals and member variables are captured. Member constants (enum values included) are still accessible from the lambda using the script reference.
+		if (p_identifier->source == GDScriptParser::IdentifierNode::UNDEFINED_SOURCE || p_identifier->source == GDScriptParser::IdentifierNode::MEMBER_CONSTANT || lambda_stack.is_empty()) {
+			return;
+		}
+
+		GDScriptParser::FunctionNode *function_test = lambda_stack.back()->get()->function;
+		while (function_test != nullptr && function_test != p_identifier->source_function && function_test->source_lambda != nullptr && !function_test->source_lambda->captures_indices.has(p_identifier->name)) {
+			function_test->source_lambda->captures_indices[p_identifier->name] = function_test->source_lambda->captures.size();
+			function_test->source_lambda->captures.push_back(p_identifier);
+			function_test = function_test->source_lambda->parent_function;
+		}
 		return;
 	}
 
@@ -2563,6 +2601,57 @@ void GDScriptAnalyzer::reduce_identifier(GDScriptParser::IdentifierNode *p_ident
 	p_identifier->set_datatype(dummy); // Just so type is set to something.
 }
 
+void GDScriptAnalyzer::reduce_lambda(GDScriptParser::LambdaNode *p_lambda) { // Types the lambda as a Callable, resolves its parameters and body, then turns its captures into leading parameters.
+	// Lambda is always a Callable.
+	GDScriptParser::DataType lambda_type;
+	lambda_type.type_source = GDScriptParser::DataType::ANNOTATED_INFERRED;
+	lambda_type.kind = GDScriptParser::DataType::BUILTIN;
+	lambda_type.builtin_type = Variant::CALLABLE;
+	p_lambda->set_datatype(lambda_type);
+
+	if (p_lambda->function == nullptr) { // The datatype is already set; without a function node there is nothing else to resolve.
+		return;
+	}
+
+	GDScriptParser::FunctionNode *previous_function = parser->current_function; // Saved so it can be restored at the end.
+	parser->current_function = p_lambda->function;
+
+	lambda_stack.push_back(p_lambda); // A non-empty stack tells the other reduce_* steps that they are inside a lambda.
+
+	for (int i = 0; i < p_lambda->function->parameters.size(); i++) {
+		resolve_parameter(p_lambda->function->parameters[i]);
+	}
+
+	resolve_suite(p_lambda->function->body); // Identifiers resolved while the lambda is on the stack may get recorded as captures (see reduce_identifier).
+
+	int captures_amount = p_lambda->captures.size();
+	if (captures_amount > 0) {
+		// Make room for the captures at the start of the parameter list.
+		// They go at the beginning to not mess with optional parameters.
+		int param_count = p_lambda->function->parameters.size();
+		p_lambda->function->parameters.resize(param_count + captures_amount);
+		for (int i = param_count - 1; i >= 0; i--) { // Back to front, so no parameter is overwritten before it has been moved.
+			p_lambda->function->parameters.write[i + captures_amount] = p_lambda->function->parameters[i];
+			p_lambda->function->parameters_indices[p_lambda->function->parameters[i]->identifier->name] = i + captures_amount; // Keep the name-to-index map in sync with the shift.
+		}
+
+		// Add captures as extra parameters at the beginning.
+		for (int i = 0; i < p_lambda->captures.size(); i++) {
+			GDScriptParser::IdentifierNode *capture = p_lambda->captures[i];
+			GDScriptParser::ParameterNode *capture_param = parser->alloc_node<GDScriptParser::ParameterNode>();
+			capture_param->identifier = capture; // The captured identifier node doubles as the parameter's identifier.
+			capture_param->usages = capture->usages;
+			capture_param->set_datatype(capture->get_datatype());
+
+			p_lambda->function->parameters.write[i] = capture_param;
+			p_lambda->function->parameters_indices[capture->name] = i;
+		}
+	}
+
+	lambda_stack.pop_back(); // Leave the lambda scope entered above.
+	parser->current_function = previous_function;
+}
+
 void GDScriptAnalyzer::reduce_literal(GDScriptParser::LiteralNode *p_literal) {
 	p_literal->reduced_value = p_literal->value;
 	p_literal->is_constant = true;
@@ -2621,25 +2710,6 @@ void GDScriptAnalyzer::reduce_subscript(GDScriptParser::SubscriptNode *p_subscri
 
 	GDScriptParser::DataType result_type;
 
-	// Reduce index first. If it's a constant StringName, use attribute instead.
-	if (!p_subscript->is_attribute) {
-		if (p_subscript->index == nullptr) {
-			return;
-		}
-		reduce_expression(p_subscript->index);
-
-		if (p_subscript->index->is_constant && p_subscript->index->reduced_value.get_type() == Variant::STRING_NAME) {
-			GDScriptParser::IdentifierNode *attribute = parser->alloc_node<GDScriptParser::IdentifierNode>();
-			// Copy location for better error message.
-			attribute->start_line = p_subscript->index->start_line;
-			attribute->end_line = p_subscript->index->end_line;
-			attribute->leftmost_column = p_subscript->index->leftmost_column;
-			attribute->rightmost_column = p_subscript->index->rightmost_column;
-			p_subscript->is_attribute = true;
-			p_subscript->attribute = attribute;
-		}
-	}
-
 	if (p_subscript->is_attribute) {
 		if (p_subscript->attribute == nullptr) {
 			return;
@@ -2682,7 +2752,10 @@ void GDScriptAnalyzer::reduce_subscript(GDScriptParser::SubscriptNode *p_subscri
 			}
 		}
 	} else {
-		// Index was already reduced before.
+		if (p_subscript->index == nullptr) {
+			return;
+		}
+		reduce_expression(p_subscript->index);
 
 		if (p_subscript->base->is_constant && p_subscript->index->is_constant) {
 			// Just try to get it.
diff --git a/modules/gdscript/gdscript_analyzer.h b/modules/gdscript/gdscript_analyzer.h
index 8430d3f4a5..aabf407c76 100644
--- a/modules/gdscript/gdscript_analyzer.h
+++ b/modules/gdscript/gdscript_analyzer.h
@@ -42,6 +42,7 @@ class GDScriptAnalyzer {
 	HashMap<String, Ref<GDScriptParserRef>> depended_parsers;
 
 	const GDScriptParser::EnumNode *current_enum = nullptr;
+	List<const GDScriptParser::LambdaNode *> lambda_stack;
 
 	Error resolve_inheritance(GDScriptParser::ClassNode *p_class, bool p_recursive = true);
 	GDScriptParser::DataType resolve_datatype(GDScriptParser::TypeNode *p_type);
@@ -82,6 +83,7 @@ class GDScriptAnalyzer {
 	void reduce_get_node(GDScriptParser::GetNodeNode *p_get_node);
 	void reduce_identifier(GDScriptParser::IdentifierNode *p_identifier, bool can_be_builtin = false);
 	void reduce_identifier_from_base(GDScriptParser::IdentifierNode *p_identifier, GDScriptParser::DataType *p_base = nullptr);
+	void reduce_lambda(GDScriptParser::LambdaNode *p_lambda);
 	void reduce_literal(GDScriptParser::LiteralNode *p_literal);
 	void reduce_preload(GDScriptParser::PreloadNode *p_preload);
 	void reduce_self(GDScriptParser::SelfNode *p_self);
diff --git a/modules/gdscript/gdscript_byte_codegen.cpp b/modules/gdscript/gdscript_byte_codegen.cpp
index ec1116197e..0da99ccee3 100644
--- a/modules/gdscript/gdscript_byte_codegen.cpp
+++ b/modules/gdscript/gdscript_byte_codegen.cpp
@@ -47,7 +47,8 @@ uint32_t GDScriptByteCodeGenerator::add_parameter(const StringName &p_name, bool
 }
 
 uint32_t GDScriptByteCodeGenerator::add_local(const StringName &p_name, const GDScriptDataType &p_type) {
-	int stack_pos = increase_stack();
+	int stack_pos = locals.size() + RESERVED_STACK; // The new local goes right after the existing locals, offset by RESERVED_STACK.
+	locals.push_back(StackSlot(p_type.builtin_type)); // Record a slot carrying the local's builtin type.
 	add_stack_identifier(p_name, stack_pos);
 	return stack_pos;
 }
@@ -59,37 +60,94 @@ uint32_t GDScriptByteCodeGenerator::add_local_constant(const StringName &p_name,
 }
 
 uint32_t GDScriptByteCodeGenerator::add_or_get_constant(const Variant &p_constant) {
-	if (constant_map.has(p_constant)) {
-		return constant_map[p_constant];
-	}
-	int index = constant_map.size();
-	constant_map[p_constant] = index;
-	return index;
+	return get_constant_pos(p_constant); // Index lookup/registration is delegated to get_constant_pos() (defined outside this hunk).
 }
 
 uint32_t GDScriptByteCodeGenerator::add_or_get_name(const StringName &p_name) {
 	return get_name_map_pos(p_name);
 }
 
-uint32_t GDScriptByteCodeGenerator::add_temporary() {
-	current_temporaries++;
-	int idx = increase_stack();
-#ifdef DEBUG_ENABLED
-	temp_stack.push_back(idx);
-#endif
-	return idx;
+uint32_t GDScriptByteCodeGenerator::add_temporary(const GDScriptDataType &p_type) { // Hands out a temporary slot for p_type, reusing a free pooled slot of the same type when one exists.
+	Variant::Type temp_type = Variant::NIL; // Pool key; stays NIL when p_type carries no type.
+	if (p_type.has_type) {
+		if (p_type.kind == GDScriptDataType::BUILTIN) {
+			switch (p_type.builtin_type) {
+				case Variant::NIL:
+				case Variant::BOOL:
+				case Variant::INT:
+				case Variant::FLOAT:
+				case Variant::STRING:
+				case Variant::VECTOR2:
+				case Variant::VECTOR2I:
+				case Variant::RECT2:
+				case Variant::RECT2I:
+				case Variant::VECTOR3:
+				case Variant::VECTOR3I:
+				case Variant::TRANSFORM2D:
+				case Variant::PLANE:
+				case Variant::QUAT:
+				case Variant::AABB:
+				case Variant::BASIS:
+				case Variant::TRANSFORM:
+				case Variant::COLOR:
+				case Variant::STRING_NAME:
+				case Variant::NODE_PATH:
+				case Variant::RID:
+				case Variant::OBJECT:
+				case Variant::CALLABLE:
+				case Variant::SIGNAL:
+				case Variant::DICTIONARY:
+				case Variant::ARRAY:
+					temp_type = p_type.builtin_type; // These types each get their own typed pool.
+					break;
+				case Variant::PACKED_BYTE_ARRAY:
+				case Variant::PACKED_INT32_ARRAY:
+				case Variant::PACKED_INT64_ARRAY:
+				case Variant::PACKED_FLOAT32_ARRAY:
+				case Variant::PACKED_FLOAT64_ARRAY:
+				case Variant::PACKED_STRING_ARRAY:
+				case Variant::PACKED_VECTOR2_ARRAY:
+				case Variant::PACKED_VECTOR3_ARRAY:
+				case Variant::PACKED_COLOR_ARRAY:
+				case Variant::VARIANT_MAX:
+					// Packed arrays are reference counted, so we don't use the pool for them.
+					temp_type = Variant::NIL; // They share the untyped (NIL-keyed) slots instead of a typed pool.
+					break;
+			}
+		} else {
+			temp_type = Variant::OBJECT; // Every typed non-builtin kind is pooled as OBJECT.
+		}
+	}
+
+	if (!temporaries_pool.has(temp_type)) { // Lazily create the free list for this type.
+		temporaries_pool[temp_type] = List<int>();
+	}
+
+	List<int> &pool = temporaries_pool[temp_type];
+	if (pool.is_empty()) { // No free slot of this type: allocate a new one.
+		StackSlot new_temp(temp_type);
+		int idx = temporaries.size();
+		pool.push_back(idx); // Queued here so the common path below hands it out.
+		temporaries.push_back(new_temp);
+
+		// First time using this, so adjust to the proper type.
+		if (temp_type != Variant::NIL) {
+			Address addr(Address::TEMPORARY, idx, p_type); // Address of the new temporary, used only to emit the adjust instruction.
+			write_type_adjust(addr, temp_type);
+		}
+	}
+	int slot = pool.front()->get(); // Take the first free slot of this type.
+	pool.pop_front();
+	used_temporaries.push_back(slot); // Tracked so pop_temporary() can release it later.
+	return slot;
 }
 
 void GDScriptByteCodeGenerator::pop_temporary() {
-	ERR_FAIL_COND(current_temporaries == 0);
-	current_stack_size--;
-#ifdef DEBUG_ENABLED
-	if (temp_stack.back()->get() != current_stack_size) {
-		ERR_PRINT("Mismatched popping of temporary value");
-	}
-	temp_stack.pop_back();
-#endif
-	current_temporaries--;
+	ERR_FAIL_COND(used_temporaries.is_empty()); // Nothing is in use, so there is nothing to release.
+	int slot_idx = used_temporaries.back()->get(); // The most recently handed-out temporary.
+	const StackSlot &slot = temporaries[slot_idx];
+	temporaries_pool[slot.type].push_back(slot_idx); // Return the slot to the free list of its type so add_temporary() can reuse it.
+	used_temporaries.pop_back();
 }
 
 void GDScriptByteCodeGenerator::start_parameters() {
@@ -124,12 +182,18 @@ void GDScriptByteCodeGenerator::write_start(GDScript *p_script, const StringName
 
 GDScriptFunction *GDScriptByteCodeGenerator::write_end() {
 #ifdef DEBUG_ENABLED
-	if (current_temporaries != 0) {
-		ERR_PRINT("Non-zero temporary variables at end of function: " + itos(current_temporaries));
+	if (!used_temporaries.is_empty()) {
+		ERR_PRINT("Non-zero temporary variables at end of function: " + itos(used_temporaries.size()));
 	}
 #endif
 	append(GDScriptFunction::OPCODE_END, 0);
 
+	for (int i = 0; i < temporaries.size(); i++) {
+		for (int j = 0; j < temporaries[i].bytecode_indices.size(); j++) {
+			opcodes.write[temporaries[i].bytecode_indices[j]] = (i + max_locals + RESERVED_STACK) | (GDScriptFunction::ADDR_TYPE_STACK << GDScriptFunction::ADDR_BITS);
+		}
+	}
+
 	if (constant_map.size()) {
 		function->_constant_count = constant_map.size();
 		function->constants.resize(constant_map.size());
@@ -319,10 +383,22 @@ GDScriptFunction *GDScriptByteCodeGenerator::write_end() {
 		function->_methods_count = 0;
 	}
 
+	if (lambdas_map.size()) {
+		function->lambdas.resize(lambdas_map.size());
+		function->_lambdas_ptr = function->lambdas.ptrw();
+		function->_lambdas_count = lambdas_map.size();
+		for (const Map<GDScriptFunction *, int>::Element *E = lambdas_map.front(); E; E = E->next()) {
+			function->lambdas.write[E->get()] = E->key();
+		}
+	} else {
+		function->_lambdas_ptr = nullptr;
+		function->_lambdas_count = 0;
+	}
+
 	if (debug_stack) {
 		function->stack_debug = stack_debug;
 	}
-	function->_stack_size = stack_max;
+	function->_stack_size = RESERVED_STACK + max_locals + temporaries.size();
 	function->_instruction_args_size = instr_args_max;
 	function->_ptrcall_args_size = ptrcall_max;
 
@@ -346,6 +422,117 @@ void GDScriptByteCodeGenerator::set_initial_line(int p_line) {
 #define IS_BUILTIN_TYPE(m_var, m_type) \
 	(m_var.type.has_type && m_var.type.kind == GDScriptDataType::BUILTIN && m_var.type.builtin_type == m_type)
 
+void GDScriptByteCodeGenerator::write_type_adjust(const Address &p_target, Variant::Type p_new_type) { // Emits the OPCODE_TYPE_ADJUST_* instruction matching p_new_type, followed by the p_target address.
+	switch (p_new_type) { // One case per Variant type; each appends the matching adjust opcode.
+		case Variant::BOOL:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_BOOL, 1);
+			break;
+		case Variant::INT:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_INT, 1);
+			break;
+		case Variant::FLOAT:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_FLOAT, 1);
+			break;
+		case Variant::STRING:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_STRING, 1);
+			break;
+		case Variant::VECTOR2:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_VECTOR2, 1);
+			break;
+		case Variant::VECTOR2I:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_VECTOR2I, 1);
+			break;
+		case Variant::RECT2:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_RECT2, 1);
+			break;
+		case Variant::RECT2I:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_RECT2I, 1);
+			break;
+		case Variant::VECTOR3:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_VECTOR3, 1);
+			break;
+		case Variant::VECTOR3I:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_VECTOR3I, 1);
+			break;
+		case Variant::TRANSFORM2D:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_TRANSFORM2D, 1);
+			break;
+		case Variant::PLANE:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_PLANE, 1);
+			break;
+		case Variant::QUAT:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_QUAT, 1);
+			break;
+		case Variant::AABB:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_AABB, 1);
+			break;
+		case Variant::BASIS:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_BASIS, 1);
+			break;
+		case Variant::TRANSFORM:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_TRANSFORM, 1);
+			break;
+		case Variant::COLOR:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_COLOR, 1);
+			break;
+		case Variant::STRING_NAME:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_STRING_NAME, 1);
+			break;
+		case Variant::NODE_PATH:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_NODE_PATH, 1);
+			break;
+		case Variant::RID:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_RID, 1);
+			break;
+		case Variant::OBJECT:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_OBJECT, 1);
+			break;
+		case Variant::CALLABLE:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_CALLABLE, 1);
+			break;
+		case Variant::SIGNAL:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_SIGNAL, 1);
+			break;
+		case Variant::DICTIONARY:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_DICTIONARY, 1);
+			break;
+		case Variant::ARRAY:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_ARRAY, 1);
+			break;
+		case Variant::PACKED_BYTE_ARRAY:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_PACKED_BYTE_ARRAY, 1);
+			break;
+		case Variant::PACKED_INT32_ARRAY:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_PACKED_INT32_ARRAY, 1);
+			break;
+		case Variant::PACKED_INT64_ARRAY:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_PACKED_INT64_ARRAY, 1);
+			break;
+		case Variant::PACKED_FLOAT32_ARRAY:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_PACKED_FLOAT32_ARRAY, 1);
+			break;
+		case Variant::PACKED_FLOAT64_ARRAY:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_PACKED_FLOAT64_ARRAY, 1);
+			break;
+		case Variant::PACKED_STRING_ARRAY:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_PACKED_STRING_ARRAY, 1);
+			break;
+		case Variant::PACKED_VECTOR2_ARRAY:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_PACKED_VECTOR2_ARRAY, 1);
+			break;
+		case Variant::PACKED_VECTOR3_ARRAY:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_PACKED_VECTOR3_ARRAY, 1);
+			break;
+		case Variant::PACKED_COLOR_ARRAY:
+			append(GDScriptFunction::OPCODE_TYPE_ADJUST_PACKED_COLOR_ARRAY, 1);
+			break;
+		case Variant::NIL:
+		case Variant::VARIANT_MAX:
+			return; // No adjust opcode exists for these, so nothing at all is emitted.
+	}
+	append(p_target); // The target address follows the opcode.
+}
+
 void GDScriptByteCodeGenerator::write_unary_operator(const Address &p_target, Variant::Operator p_operator, const Address &p_left_operand) {
 	if (HAS_BUILTIN_TYPE(p_left_operand)) {
 		// Gather specific operator.
@@ -396,7 +583,7 @@ void GDScriptByteCodeGenerator::write_type_test(const Address &p_target, const A
 }
 
 void GDScriptByteCodeGenerator::write_type_test_builtin(const Address &p_target, const Address &p_source, Variant::Type p_type) {
-	append(GDScriptFunction::OPCODE_IS_BUILTIN, 3);
+	append(GDScriptFunction::OPCODE_IS_BUILTIN, 2);
 	append(p_source);
 	append(p_target);
 	append(p_type);
@@ -612,7 +799,8 @@ void GDScriptByteCodeGenerator::write_assign(const Address &p_target, const Addr
 			} break;
 			case GDScriptDataType::NATIVE: {
 				int class_idx = GDScriptLanguage::get_singleton()->get_global_map()[p_target.type.native_type];
-				class_idx |= (GDScriptFunction::ADDR_TYPE_GLOBAL << GDScriptFunction::ADDR_BITS);
+				Variant nc = GDScriptLanguage::get_singleton()->get_global_array()[class_idx];
+				class_idx = get_constant_pos(nc) | (GDScriptFunction::ADDR_TYPE_CONSTANT << GDScriptFunction::ADDR_BITS);
 				append(GDScriptFunction::OPCODE_ASSIGN_TYPED_NATIVE, 3);
 				append(p_target);
 				append(p_source);
@@ -621,8 +809,7 @@ void GDScriptByteCodeGenerator::write_assign(const Address &p_target, const Addr
 			case GDScriptDataType::SCRIPT:
 			case GDScriptDataType::GDSCRIPT: {
 				Variant script = p_target.type.script_type;
-				int idx = get_constant_pos(script);
-				idx |= (GDScriptFunction::ADDR_TYPE_LOCAL_CONSTANT << GDScriptFunction::ADDR_BITS);
+				int idx = get_constant_pos(script) | (GDScriptFunction::ADDR_TYPE_CONSTANT << GDScriptFunction::ADDR_BITS);
 
 				append(GDScriptFunction::OPCODE_ASSIGN_TYPED_SCRIPT, 3);
 				append(p_target);
@@ -673,6 +860,12 @@ void GDScriptByteCodeGenerator::write_assign_default_parameter(const Address &p_
 	function->default_arguments.push_back(opcodes.size());
 }
 
+void GDScriptByteCodeGenerator::write_store_named_global(const Address &p_dst, const StringName &p_global) { // Emits OPCODE_STORE_NAMED_GLOBAL followed by the p_dst address and the p_global name.
+	append(GDScriptFunction::OPCODE_STORE_NAMED_GLOBAL, 1);
+	append(p_dst); // Destination address operand.
+	append(p_global); // Global name operand, appended after the address.
+}
+
 void GDScriptByteCodeGenerator::write_cast(const Address &p_target, const Address &p_source, const GDScriptDataType &p_type) {
 	int index = 0;
 
@@ -683,16 +876,14 @@ void GDScriptByteCodeGenerator::write_cast(const Address &p_target, const Addres
 		} break;
 		case GDScriptDataType::NATIVE: {
 			int class_idx = GDScriptLanguage::get_singleton()->get_global_map()[p_type.native_type];
-			class_idx |= (GDScriptFunction::ADDR_TYPE_GLOBAL << GDScriptFunction::ADDR_BITS);
+			Variant nc = GDScriptLanguage::get_singleton()->get_global_array()[class_idx];
 			append(GDScriptFunction::OPCODE_CAST_TO_NATIVE, 3);
-			index = class_idx;
+			index = get_constant_pos(nc) | (GDScriptFunction::ADDR_TYPE_CONSTANT << GDScriptFunction::ADDR_BITS);
 		} break;
 		case GDScriptDataType::SCRIPT:
 		case GDScriptDataType::GDSCRIPT: {
 			Variant script = p_type.script_type;
-			int idx = get_constant_pos(script);
-			idx |= (GDScriptFunction::ADDR_TYPE_LOCAL_CONSTANT << GDScriptFunction::ADDR_BITS);
-
+			int idx = get_constant_pos(script) | (GDScriptFunction::ADDR_TYPE_CONSTANT << GDScriptFunction::ADDR_BITS);
 			append(GDScriptFunction::OPCODE_CAST_TO_SCRIPT, 3);
 			index = idx;
 		} break;
@@ -807,6 +998,14 @@ void GDScriptByteCodeGenerator::write_call_builtin_type(const Address &p_target,
 		return;
 	}
 
+	if (p_target.mode == Address::TEMPORARY) {
+		Variant::Type result_type = Variant::get_builtin_method_return_type(p_type, p_method);
+		Variant::Type temp_type = temporaries[p_target.address].type;
+		if (result_type != temp_type) {
+			write_type_adjust(p_target, result_type);
+		}
+	}
+
 	append(GDScriptFunction::OPCODE_CALL_BUILTIN_TYPE_VALIDATED, 2 + p_arguments.size());
 
 	for (int i = 0; i < p_arguments.size(); i++) {
@@ -903,7 +1102,7 @@ void GDScriptByteCodeGenerator::write_call_self(const Address &p_target, const S
 	for (int i = 0; i < p_arguments.size(); i++) {
 		append(p_arguments[i]);
 	}
-	append(GDScriptFunction::ADDR_TYPE_SELF << GDScriptFunction::ADDR_BITS);
+	append(GDScriptFunction::ADDR_TYPE_STACK << GDScriptFunction::ADDR_BITS);
 	append(p_target);
 	append(p_arguments.size());
 	append(p_function_name);
@@ -914,7 +1113,7 @@ void GDScriptByteCodeGenerator::write_call_self_async(const Address &p_target, c
 	for (int i = 0; i < p_arguments.size(); i++) {
 		append(p_arguments[i]);
 	}
-	append(GDScriptFunction::ADDR_TYPE_SELF << GDScriptFunction::ADDR_BITS);
+	append(GDScriptFunction::ADDR_SELF);
 	append(p_target);
 	append(p_arguments.size());
 	append(p_function_name);
@@ -931,6 +1130,17 @@ void GDScriptByteCodeGenerator::write_call_script_function(const Address &p_targ
 	append(p_function_name);
 }
 
+void GDScriptByteCodeGenerator::write_lambda(const Address &p_target, GDScriptFunction *p_function, const Vector<Address> &p_captures) { // Emits OPCODE_CREATE_LAMBDA followed by: capture addresses, target address, capture count, function index.
+	append(GDScriptFunction::OPCODE_CREATE_LAMBDA, 1 + p_captures.size()); // One entry per capture plus one more (presumably the target address - confirm against append(opcode, count)).
+	for (int i = 0; i < p_captures.size(); i++) {
+		append(p_captures[i]); // Capture addresses are written first, in the order given.
+	}
+
+	append(p_target); // Address operand for the result of the instruction.
+	append(p_captures.size()); // Explicit capture count operand.
+	append(p_function); // Stored as the index returned by get_lambda_function_pos(), not as a raw pointer.
+}
+
 void GDScriptByteCodeGenerator::write_construct(const Address &p_target, Variant::Type p_type, const Vector<Address> &p_arguments) {
 	// Try to find an appropriate constructor.
 	bool all_have_type = true;
@@ -999,7 +1209,7 @@ void GDScriptByteCodeGenerator::write_construct_typed_array(const Address &p_tar
 	if (p_element_type.script_type) {
 		Variant script_type = Ref<Script>(p_element_type.script_type);
 		int addr = get_constant_pos(script_type);
-		addr |= GDScriptFunction::ADDR_TYPE_LOCAL_CONSTANT << GDScriptFunction::ADDR_BITS;
+		addr |= GDScriptFunction::ADDR_TYPE_CONSTANT << GDScriptFunction::ADDR_BITS;
 		append(addr);
 	} else {
 		append(Address()); // null.
@@ -1296,8 +1506,7 @@ void GDScriptByteCodeGenerator::write_return(const Address &p_return_value) {
 				const GDScriptDataType &element_type = function->return_type.get_container_element_type();
 
 				Variant script = function->return_type.script_type;
-				int script_idx = get_constant_pos(script);
-				script_idx |= (GDScriptFunction::ADDR_TYPE_LOCAL_CONSTANT << GDScriptFunction::ADDR_BITS);
+				int script_idx = get_constant_pos(script) | (GDScriptFunction::ADDR_TYPE_CONSTANT << GDScriptFunction::ADDR_BITS);
 
 				append(GDScriptFunction::OPCODE_RETURN_TYPED_ARRAY, 2);
 				append(p_return_value);
@@ -1326,7 +1535,7 @@ void GDScriptByteCodeGenerator::write_return(const Address &p_return_value) {
 
 					Variant script = function->return_type.script_type;
 					int script_idx = get_constant_pos(script);
-					script_idx |= (GDScriptFunction::ADDR_TYPE_LOCAL_CONSTANT << GDScriptFunction::ADDR_BITS);
+					script_idx |= (GDScriptFunction::ADDR_TYPE_CONSTANT << GDScriptFunction::ADDR_BITS);
 
 					append(GDScriptFunction::OPCODE_RETURN_TYPED_ARRAY, 2);
 					append(p_return_value);
@@ -1343,14 +1552,14 @@ void GDScriptByteCodeGenerator::write_return(const Address &p_return_value) {
 				append(GDScriptFunction::OPCODE_RETURN_TYPED_NATIVE, 2);
 				append(p_return_value);
 				int class_idx = GDScriptLanguage::get_singleton()->get_global_map()[function->return_type.native_type];
-				class_idx |= (GDScriptFunction::ADDR_TYPE_GLOBAL << GDScriptFunction::ADDR_BITS);
+				Variant nc = GDScriptLanguage::get_singleton()->get_global_array()[class_idx];
+				class_idx = get_constant_pos(nc) | (GDScriptFunction::ADDR_TYPE_CONSTANT << GDScriptFunction::ADDR_BITS);
 				append(class_idx);
 			} break;
 			case GDScriptDataType::GDSCRIPT:
 			case GDScriptDataType::SCRIPT: {
 				Variant script = function->return_type.script_type;
-				int script_idx = get_constant_pos(script);
-				script_idx |= (GDScriptFunction::ADDR_TYPE_LOCAL_CONSTANT << GDScriptFunction::ADDR_BITS);
+				int script_idx = get_constant_pos(script) | (GDScriptFunction::ADDR_TYPE_CONSTANT << GDScriptFunction::ADDR_BITS);
 
 				append(GDScriptFunction::OPCODE_RETURN_TYPED_SCRIPT, 2);
 				append(p_return_value);
diff --git a/modules/gdscript/gdscript_byte_codegen.h b/modules/gdscript/gdscript_byte_codegen.h
index 6eaec91504..c060476f39 100644
--- a/modules/gdscript/gdscript_byte_codegen.h
+++ b/modules/gdscript/gdscript_byte_codegen.h
@@ -37,6 +37,17 @@
 #include "gdscript_utility_functions.h"
 
 class GDScriptByteCodeGenerator : public GDScriptCodeGenerator {
+	struct StackSlot { // Per-slot bookkeeping; element type of the `locals` and `temporaries` vectors of this generator.
+		Variant::Type type = Variant::NIL; // Builtin type recorded for the slot; defaults to NIL.
+		Vector<int> bytecode_indices; // Opcode positions pushed by address_of() each time a TEMPORARY address is emitted (as a -1 placeholder).
+
+		StackSlot() = default;
+		StackSlot(Variant::Type p_type) :
+				type(p_type) {}
+	};
+
+	const static int RESERVED_STACK = 3; // For self, class, and nil.
+
 	bool ended = false;
 	GDScriptFunction *function = nullptr;
 	bool debug_stack = false;
@@ -47,15 +58,17 @@ class GDScriptByteCodeGenerator : public GDScriptCodeGenerator {
 	List<int> stack_identifiers_counts;
 	Map<StringName, int> local_constants;
 
+	Vector<StackSlot> locals;
+	Vector<StackSlot> temporaries;
+	List<int> used_temporaries;
+	Map<Variant::Type, List<int>> temporaries_pool;
+
 	List<GDScriptFunction::StackDebug> stack_debug;
 	List<Map<StringName, int>> block_identifier_stack;
 	Map<StringName, int> block_identifiers;
 
-	int current_stack_size = 0;
-	int current_temporaries = 0;
-	int current_locals = 0;
+	int max_locals = 0;
 	int current_line = 0;
-	int stack_max = 0;
 	int instr_args_max = 0;
 	int ptrcall_max = 0;
 
@@ -80,6 +93,7 @@ class GDScriptByteCodeGenerator : public GDScriptCodeGenerator {
 	Map<Variant::ValidatedUtilityFunction, int> utilities_map;
 	Map<GDScriptUtilityFunctions::FunctionPtr, int> gds_utilities_map;
 	Map<MethodBind *, int> method_bind_map;
+	Map<GDScriptFunction *, int> lambdas_map;
 
 	// Lists since these can be nested.
 	List<int> if_jmp_addrs;
@@ -102,7 +116,9 @@ class GDScriptByteCodeGenerator : public GDScriptCodeGenerator {
 	List<List<int>> match_continues_to_patch;
 
 	void add_stack_identifier(const StringName &p_id, int p_stackpos) {
-		current_locals++;
+		if (locals.size() > max_locals) {
+			max_locals = locals.size();
+		}
 		stack_identifiers[p_id] = p_stackpos;
 		if (debug_stack) {
 			block_identifiers[p_id] = p_stackpos;
@@ -116,7 +132,7 @@ class GDScriptByteCodeGenerator : public GDScriptCodeGenerator {
 	}
 
 	void push_stack_identifiers() {
-		stack_identifiers_counts.push_back(current_locals);
+		stack_identifiers_counts.push_back(locals.size());
 		stack_id_stack.push_back(stack_identifiers);
 		if (debug_stack) {
 			Map<StringName, int> block_ids(block_identifiers);
@@ -126,17 +142,16 @@ class GDScriptByteCodeGenerator : public GDScriptCodeGenerator {
 	}
 
 	void pop_stack_identifiers() {
-		current_locals = stack_identifiers_counts.back()->get();
+		int current_locals = stack_identifiers_counts.back()->get();
 		stack_identifiers_counts.pop_back();
 		stack_identifiers = stack_id_stack.back()->get();
 		stack_id_stack.pop_back();
 #ifdef DEBUG_ENABLED
-		if (current_temporaries != 0) {
-			ERR_PRINT("Leaving block with non-zero temporary variables: " + itos(current_temporaries));
+		if (!used_temporaries.is_empty()) {
+			ERR_PRINT("Leaving block with non-zero temporary variables: " + itos(used_temporaries.size()));
 		}
 #endif
-		current_stack_size = current_locals;
-
+		locals.resize(current_locals);
 		if (debug_stack) {
 			for (Map<StringName, int>::Element *E = block_identifiers.front(); E; E = E->next()) {
 				GDScriptFunction::StackDebug sd;
@@ -279,16 +294,13 @@ class GDScriptByteCodeGenerator : public GDScriptCodeGenerator {
 		return pos;
 	}
 
-	void alloc_stack(int p_level) {
-		if (p_level >= stack_max) {
-			stack_max = p_level + 1;
+	int get_lambda_function_pos(GDScriptFunction *p_lambda_function) { // Returns the index registered for p_lambda_function in lambdas_map, assigning the next free index on first use.
+		if (lambdas_map.has(p_lambda_function)) {
+			return lambdas_map[p_lambda_function]; // Already registered: reuse the same index.
 		}
-	}
-
-	int increase_stack() {
-		int top = current_stack_size++;
-		alloc_stack(current_stack_size);
-		return top;
+		int pos = lambdas_map.size(); // New entries get consecutive indices starting at 0.
+		lambdas_map[p_lambda_function] = pos;
+		return pos;
 	}
 
 	void alloc_ptrcall(int p_params) {
@@ -300,26 +312,21 @@ class GDScriptByteCodeGenerator : public GDScriptCodeGenerator {
 	int address_of(const Address &p_address) {
 		switch (p_address.mode) {
 			case Address::SELF:
-				return GDScriptFunction::ADDR_TYPE_SELF << GDScriptFunction::ADDR_BITS;
+				return GDScriptFunction::ADDR_SELF; // Self is now a single predefined address constant rather than its own address type.
 			case Address::CLASS:
-				return GDScriptFunction::ADDR_TYPE_CLASS << GDScriptFunction::ADDR_BITS;
+				return GDScriptFunction::ADDR_CLASS; // Same for the class reference.
 			case Address::MEMBER:
 				return p_address.address | (GDScriptFunction::ADDR_TYPE_MEMBER << GDScriptFunction::ADDR_BITS);
-			case Address::CLASS_CONSTANT:
-				return p_address.address | (GDScriptFunction::ADDR_TYPE_CLASS_CONSTANT << GDScriptFunction::ADDR_BITS);
-			case Address::LOCAL_CONSTANT:
 			case Address::CONSTANT:
-				return p_address.address | (GDScriptFunction::ADDR_TYPE_LOCAL_CONSTANT << GDScriptFunction::ADDR_BITS);
+				return p_address.address | (GDScriptFunction::ADDR_TYPE_CONSTANT << GDScriptFunction::ADDR_BITS); // Constant index in the low bits, address type in the bits above ADDR_BITS.
 			case Address::LOCAL_VARIABLE:
-			case Address::TEMPORARY:
 			case Address::FUNCTION_PARAMETER:
 				return p_address.address | (GDScriptFunction::ADDR_TYPE_STACK << GDScriptFunction::ADDR_BITS);
-			case Address::GLOBAL:
-				return p_address.address | (GDScriptFunction::ADDR_TYPE_GLOBAL << GDScriptFunction::ADDR_BITS);
-			case Address::NAMED_GLOBAL:
-				return p_address.address | (GDScriptFunction::ADDR_TYPE_NAMED_GLOBAL << GDScriptFunction::ADDR_BITS);
+			case Address::TEMPORARY: // Temporaries no longer resolve to a stack address at this point.
+				temporaries.write[p_address.address].bytecode_indices.push_back(opcodes.size()); // Record the opcode position at which this temporary is being referenced.
+				return -1; // Placeholder value. NOTE(review): presumably patched later using bytecode_indices - confirm in write_end().
 			case Address::NIL:
-				return GDScriptFunction::ADDR_TYPE_NIL << GDScriptFunction::ADDR_BITS;
+				return GDScriptFunction::ADDR_NIL; // Nil is likewise a single predefined address constant.
 		}
 		return -1; // Unreachable.
 	}
@@ -389,6 +396,10 @@ class GDScriptByteCodeGenerator : public GDScriptCodeGenerator {
 		opcodes.push_back(get_method_bind_pos(p_method));
 	}
 
+	void append(GDScriptFunction *p_lambda_function) { // Appends the lambda's index from get_lambda_function_pos() to the opcode stream, registering it on first use.
+		opcodes.push_back(get_lambda_function_pos(p_lambda_function));
+	}
+
 	void patch_jump(int p_address) {
 		opcodes.write[p_address] = opcodes.size();
 	}
@@ -399,7 +410,7 @@ public:
 	virtual uint32_t add_local_constant(const StringName &p_name, const Variant &p_constant) override;
 	virtual uint32_t add_or_get_constant(const Variant &p_constant) override;
 	virtual uint32_t add_or_get_name(const StringName &p_name) override;
-	virtual uint32_t add_temporary() override;
+	virtual uint32_t add_temporary(const GDScriptDataType &p_type) override;
 	virtual void pop_temporary() override;
 
 	virtual void start_parameters() override;
@@ -416,6 +427,7 @@ public:
 #endif
 	virtual void set_initial_line(int p_line) override;
 
+	virtual void write_type_adjust(const Address &p_target, Variant::Type p_new_type) override;
 	virtual void write_unary_operator(const Address &p_target, Variant::Operator p_operator, const Address &p_left_operand) override;
 	virtual void write_binary_operator(const Address &p_target, Variant::Operator p_operator, const Address &p_left_operand, const Address &p_right_operand) override;
 	virtual void write_type_test(const Address &p_target, const Address &p_source, const Address &p_type) override;
@@ -441,6 +453,7 @@ public:
 	virtual void write_assign_true(const Address &p_target) override;
 	virtual void write_assign_false(const Address &p_target) override;
 	virtual void write_assign_default_parameter(const Address &p_dst, const Address &p_src) override;
+	virtual void write_store_named_global(const Address &p_dst, const StringName &p_global) override;
 	virtual void write_cast(const Address &p_target, const Address &p_source, const GDScriptDataType &p_type) override;
 	virtual void write_call(const Address &p_target, const Address &p_base, const StringName &p_function_name, const Vector<Address> &p_arguments) override;
 	virtual void write_super_call(const Address &p_target, const StringName &p_function_name, const Vector<Address> &p_arguments) override;
@@ -453,6 +466,7 @@ public:
 	virtual void write_call_self(const Address &p_target, const StringName &p_function_name, const Vector<Address> &p_arguments) override;
 	virtual void write_call_self_async(const Address &p_target, const StringName &p_function_name, const Vector<Address> &p_arguments) override;
 	virtual void write_call_script_function(const Address &p_target, const Address &p_base, const StringName &p_function_name, const Vector<Address> &p_arguments) override;
+	virtual void write_lambda(const Address &p_target, GDScriptFunction *p_function, const Vector<Address> &p_captures) override;
 	virtual void write_construct(const Address &p_target, Variant::Type p_type, const Vector<Address> &p_arguments) override;
 	virtual void write_construct_array(const Address &p_target, const Vector<Address> &p_arguments) override;
 	virtual void write_construct_typed_array(const Address &p_target, const GDScriptDataType &p_element_type, const Vector<Address> &p_arguments) override;
diff --git a/modules/gdscript/gdscript_codegen.h b/modules/gdscript/gdscript_codegen.h
index 3c05f14cf7..ae9a8ede5e 100644
--- a/modules/gdscript/gdscript_codegen.h
+++ b/modules/gdscript/gdscript_codegen.h
@@ -45,13 +45,9 @@ public:
 			CLASS,
 			MEMBER,
 			CONSTANT,
-			CLASS_CONSTANT,
-			LOCAL_CONSTANT,
 			LOCAL_VARIABLE,
 			FUNCTION_PARAMETER,
 			TEMPORARY,
-			GLOBAL,
-			NAMED_GLOBAL,
 			NIL,
 		};
 		AddressMode mode = NIL;
@@ -75,7 +71,7 @@ public:
 	virtual uint32_t add_local_constant(const StringName &p_name, const Variant &p_constant) = 0;
 	virtual uint32_t add_or_get_constant(const Variant &p_constant) = 0;
 	virtual uint32_t add_or_get_name(const StringName &p_name) = 0;
-	virtual uint32_t add_temporary() = 0;
+	virtual uint32_t add_temporary(const GDScriptDataType &p_type) = 0;
 	virtual void pop_temporary() = 0;
 
 	virtual void start_parameters() = 0;
@@ -84,9 +80,6 @@ public:
 	virtual void start_block() = 0;
 	virtual void end_block() = 0;
 
-	// virtual int get_max_stack_level() = 0;
-	// virtual int get_max_function_arguments() = 0;
-
 	virtual void write_start(GDScript *p_script, const StringName &p_function_name, bool p_static, MultiplayerAPI::RPCMode p_rpc_mode, const GDScriptDataType &p_return_type) = 0;
 	virtual GDScriptFunction *write_end() = 0;
 
@@ -95,9 +88,7 @@ public:
 #endif
 	virtual void set_initial_line(int p_line) = 0;
 
-	// virtual void alloc_stack(int p_level) = 0; // Is this needed?
-	// virtual void alloc_call(int p_arg_count) = 0; // This might be automatic from other functions.
-
+	virtual void write_type_adjust(const Address &p_target, Variant::Type p_new_type) = 0;
 	virtual void write_unary_operator(const Address &p_target, Variant::Operator p_operator, const Address &p_left_operand) = 0;
 	virtual void write_binary_operator(const Address &p_target, Variant::Operator p_operator, const Address &p_left_operand, const Address &p_right_operand) = 0;
 	virtual void write_type_test(const Address &p_target, const Address &p_source, const Address &p_type) = 0;
@@ -123,6 +114,7 @@ public:
 	virtual void write_assign_true(const Address &p_target) = 0;
 	virtual void write_assign_false(const Address &p_target) = 0;
 	virtual void write_assign_default_parameter(const Address &dst, const Address &src) = 0;
+	virtual void write_store_named_global(const Address &p_dst, const StringName &p_global) = 0;
 	virtual void write_cast(const Address &p_target, const Address &p_source, const GDScriptDataType &p_type) = 0;
 	virtual void write_call(const Address &p_target, const Address &p_base, const StringName &p_function_name, const Vector<Address> &p_arguments) = 0;
 	virtual void write_super_call(const Address &p_target, const StringName &p_function_name, const Vector<Address> &p_arguments) = 0;
@@ -135,13 +127,13 @@ public:
 	virtual void write_call_self(const Address &p_target, const StringName &p_function_name, const Vector<Address> &p_arguments) = 0;
 	virtual void write_call_self_async(const Address &p_target, const StringName &p_function_name, const Vector<Address> &p_arguments) = 0;
 	virtual void write_call_script_function(const Address &p_target, const Address &p_base, const StringName &p_function_name, const Vector<Address> &p_arguments) = 0;
+	virtual void write_lambda(const Address &p_target, GDScriptFunction *p_function, const Vector<Address> &p_captures) = 0;
 	virtual void write_construct(const Address &p_target, Variant::Type p_type, const Vector<Address> &p_arguments) = 0;
 	virtual void write_construct_array(const Address &p_target, const Vector<Address> &p_arguments) = 0;
 	virtual void write_construct_typed_array(const Address &p_target, const GDScriptDataType &p_element_type, const Vector<Address> &p_arguments) = 0;
 	virtual void write_construct_dictionary(const Address &p_target, const Vector<Address> &p_arguments) = 0;
 	virtual void write_await(const Address &p_target, const Address &p_operand) = 0;
 	virtual void write_if(const Address &p_condition) = 0;
-	// virtual void write_elseif(const Address &p_condition) = 0; This kind of makes things more difficult for no real benefit.
 	virtual void write_else() = 0;
 	virtual void write_endif() = 0;
 	virtual void start_for(const GDScriptDataType &p_iterator_type, const GDScriptDataType &p_list_type) = 0;
diff --git a/modules/gdscript/gdscript_compiler.cpp b/modules/gdscript/gdscript_compiler.cpp
index 6a91148575..37ce8ae2cb 100644
--- a/modules/gdscript/gdscript_compiler.cpp
+++ b/modules/gdscript/gdscript_compiler.cpp
@@ -262,7 +262,7 @@ GDScriptCodeGenerator::Address GDScriptCompiler::_parse_expression(CodeGen &code
 					GDScriptNativeClass *nc = nullptr;
 					while (scr) {
 						if (scr->constants.has(identifier)) {
-							return GDScriptCodeGenerator::Address(GDScriptCodeGenerator::Address::CLASS_CONSTANT, gen->add_or_get_name(identifier)); // TODO: Get type here.
+							return codegen.add_constant(scr->constants[identifier]); // TODO: Get type here.
 						}
 						if (scr->native.is_valid()) {
 							nc = scr->native.ptr();
@@ -319,7 +319,8 @@ GDScriptCodeGenerator::Address GDScriptCompiler::_parse_expression(CodeGen &code
 
 			if (GDScriptLanguage::get_singleton()->get_global_map().has(identifier)) {
 				int idx = GDScriptLanguage::get_singleton()->get_global_map()[identifier];
-				return GDScriptCodeGenerator::Address(GDScriptCodeGenerator::Address::GLOBAL, idx); // TODO: Get type.
+				Variant global = GDScriptLanguage::get_singleton()->get_global_array()[idx];
+				return codegen.add_constant(global); // TODO: Get type.
 			}
 
 			// Try global classes.
@@ -347,7 +348,9 @@ GDScriptCodeGenerator::Address GDScriptCompiler::_parse_expression(CodeGen &code
 
 #ifdef TOOLS_ENABLED
 			if (GDScriptLanguage::get_singleton()->get_named_globals_map().has(identifier)) {
-				return GDScriptCodeGenerator::Address(GDScriptCodeGenerator::Address::NAMED_GLOBAL, gen->add_or_get_name(identifier)); // TODO: Get type.
+				GDScriptCodeGenerator::Address global = codegen.add_temporary(); // TODO: Get type.
+				gen->write_store_named_global(global, identifier);
+				return global;
 			}
 #endif
 
@@ -424,8 +427,8 @@ GDScriptCodeGenerator::Address GDScriptCompiler::_parse_expression(CodeGen &code
 						}
 						break;
 					case GDScriptParser::DictionaryNode::LUA_TABLE:
-						// Lua-style: key is an identifier interpreted as string.
-						String key = static_cast<const GDScriptParser::IdentifierNode *>(dn->elements[i].key)->name;
+						// Lua-style: key is an identifier interpreted as StringName.
+						StringName key = static_cast<const GDScriptParser::IdentifierNode *>(dn->elements[i].key)->name;
 						element = codegen.add_constant(key);
 						break;
 				}
@@ -677,9 +680,9 @@ GDScriptCodeGenerator::Address GDScriptCompiler::_parse_expression(CodeGen &code
 				name = subscript->attribute->name;
 				named = true;
 			} else {
-				if (subscript->index->type == GDScriptParser::Node::LITERAL && static_cast<const GDScriptParser::LiteralNode *>(subscript->index)->value.get_type() == Variant::STRING) {
+				if (subscript->index->is_constant && subscript->index->reduced_value.get_type() == Variant::STRING_NAME) {
 					// Also, somehow, named (speed up anyway).
-					name = static_cast<const GDScriptParser::LiteralNode *>(subscript->index)->value;
+					name = subscript->index->reduced_value;
 					named = true;
 				} else {
 					// Regular indexing.
@@ -708,7 +711,7 @@ GDScriptCodeGenerator::Address GDScriptCompiler::_parse_expression(CodeGen &code
 		case GDScriptParser::Node::UNARY_OPERATOR: {
 			const GDScriptParser::UnaryOpNode *unary = static_cast<const GDScriptParser::UnaryOpNode *>(p_expression);
 
-			GDScriptCodeGenerator::Address result = codegen.add_temporary();
+			GDScriptCodeGenerator::Address result = codegen.add_temporary(_gdtype_from_datatype(unary->get_datatype()));
 
 			GDScriptCodeGenerator::Address operand = _parse_expression(codegen, r_error, unary->operand);
 			if (r_error) {
@@ -726,7 +729,7 @@ GDScriptCodeGenerator::Address GDScriptCompiler::_parse_expression(CodeGen &code
 		case GDScriptParser::Node::BINARY_OPERATOR: {
 			const GDScriptParser::BinaryOpNode *binary = static_cast<const GDScriptParser::BinaryOpNode *>(p_expression);
 
-			GDScriptCodeGenerator::Address result = codegen.add_temporary();
+			GDScriptCodeGenerator::Address result = codegen.add_temporary(_gdtype_from_datatype(binary->get_datatype()));
 
 			switch (binary->operation) {
 				case GDScriptParser::BinaryOpNode::OP_LOGIC_AND: {
@@ -1088,6 +1091,34 @@ GDScriptCodeGenerator::Address GDScriptCompiler::_parse_expression(CodeGen &code
 			}
 			return GDScriptCodeGenerator::Address(); // Assignment does not return a value.
 		} break;
+		case GDScriptParser::Node::LAMBDA: { // Lambda expression: evaluate captures, compile the body as its own function, then emit the creation instruction.
+			const GDScriptParser::LambdaNode *lambda = static_cast<const GDScriptParser::LambdaNode *>(p_expression);
+			GDScriptCodeGenerator::Address result = codegen.add_temporary(_gdtype_from_datatype(lambda->get_datatype())); // Result temporary is allocated before any capture temporaries.
+
+			Vector<GDScriptCodeGenerator::Address> captures;
+			captures.resize(lambda->captures.size());
+			for (int i = 0; i < lambda->captures.size(); i++) {
+				captures.write[i] = _parse_expression(codegen, r_error, lambda->captures[i]); // Captures are parsed with the enclosing function's codegen.
+				if (r_error) {
+					return GDScriptCodeGenerator::Address(); // Bail out with a default address; r_error carries the failure.
+				}
+			}
+
+			GDScriptFunction *function = _parse_function(r_error, codegen.script, codegen.class_node, lambda->function, false, true); // p_for_ready = false, p_for_lambda = true: the function is returned here and not stored in member_functions.
+			if (r_error) {
+				return GDScriptCodeGenerator::Address();
+			}
+
+			gen->write_lambda(result, function, captures);
+
+			for (int i = 0; i < captures.size(); i++) {
+				if (captures[i].mode == GDScriptCodeGenerator::Address::TEMPORARY) {
+					gen->pop_temporary(); // Release capture temporaries only after write_lambda has emitted their addresses.
+				}
+			}
+
+			return result; // The result temporary is not popped here.
+		} break;
 		default: {
 			ERR_FAIL_V_MSG(GDScriptCodeGenerator::Address(), "Bug in bytecode compiler, unexpected node in parse tree while parsing expression."); // Unreachable code.
 		} break;
@@ -1801,8 +1832,8 @@ Error GDScriptCompiler::_parse_block(CodeGen &codegen, const GDScriptParser::Sui
 	return OK;
 }
 
-Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser::ClassNode *p_class, const GDScriptParser::FunctionNode *p_func, bool p_for_ready) {
-	Error error = OK;
+GDScriptFunction *GDScriptCompiler::_parse_function(Error &r_error, GDScript *p_script, const GDScriptParser::ClassNode *p_class, const GDScriptParser::FunctionNode *p_func, bool p_for_ready, bool p_for_lambda) {
+	r_error = OK;
 	CodeGen codegen;
 	codegen.generator = memnew(GDScriptByteCodeGenerator);
 
@@ -1819,7 +1850,11 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 	return_type.builtin_type = Variant::NIL;
 
 	if (p_func) {
-		func_name = p_func->identifier->name;
+		if (p_func->identifier) {
+			func_name = p_func->identifier->name;
+		} else {
+			func_name = "<anonymous lambda>";
+		}
 		is_static = p_func->is_static;
 		rpc_mode = p_func->rpc_mode;
 		return_type = _gdtype_from_datatype(p_func->get_datatype(), p_script);
@@ -1850,11 +1885,11 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 	}
 
 	// Parse initializer if applies.
-	bool is_implicit_initializer = !p_for_ready && !p_func;
-	bool is_initializer = p_func && String(p_func->identifier->name) == GDScriptLanguage::get_singleton()->strings._init;
-	bool is_for_ready = p_for_ready || (p_func && String(p_func->identifier->name) == "_ready");
+	bool is_implicit_initializer = !p_for_ready && !p_func && !p_for_lambda;
+	bool is_initializer = p_func && !p_for_lambda && String(p_func->identifier->name) == GDScriptLanguage::get_singleton()->strings._init;
+	bool is_for_ready = p_for_ready || (p_func && !p_for_lambda && String(p_func->identifier->name) == "_ready");
 
-	if (is_implicit_initializer || is_for_ready) {
+	if (!p_for_lambda && (is_implicit_initializer || is_for_ready)) {
 		// Initialize class fields.
 		for (int i = 0; i < p_class->members.size(); i++) {
 			if (p_class->members[i].type != GDScriptParser::ClassNode::Member::VARIABLE) {
@@ -1881,10 +1916,10 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 						codegen.generator->write_construct_array(dst_address, Vector<GDScriptCodeGenerator::Address>());
 					}
 				}
-				GDScriptCodeGenerator::Address src_address = _parse_expression(codegen, error, field->initializer, false, true);
-				if (error) {
+				GDScriptCodeGenerator::Address src_address = _parse_expression(codegen, r_error, field->initializer, false, true);
+				if (r_error) {
 					memdelete(codegen.generator);
-					return error;
+					return nullptr;
 				}
 
 				codegen.generator->write_assign(dst_address, src_address);
@@ -1911,10 +1946,10 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 			codegen.generator->start_parameters();
 			for (int i = p_func->parameters.size() - optional_parameters; i < p_func->parameters.size(); i++) {
 				const GDScriptParser::ParameterNode *parameter = p_func->parameters[i];
-				GDScriptCodeGenerator::Address src_addr = _parse_expression(codegen, error, parameter->default_value, true);
-				if (error) {
+				GDScriptCodeGenerator::Address src_addr = _parse_expression(codegen, r_error, parameter->default_value, true);
+				if (r_error) {
 					memdelete(codegen.generator);
-					return error;
+					return nullptr;
 				}
 				GDScriptCodeGenerator::Address dst_addr = codegen.parameters[parameter->identifier->name];
 				codegen.generator->write_assign_default_parameter(dst_addr, src_addr);
@@ -1925,10 +1960,10 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 			codegen.generator->end_parameters();
 		}
 
-		Error err = _parse_block(codegen, p_func->body);
-		if (err) {
+		r_error = _parse_block(codegen, p_func->body);
+		if (r_error) {
 			memdelete(codegen.generator);
-			return err;
+			return nullptr;
 		}
 	}
 
@@ -1954,6 +1989,10 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 			signature += "::" + String(func_name);
 		}
 
+		if (p_for_lambda) {
+			signature += "(lambda)";
+		}
+
 		codegen.generator->set_signature(signature);
 	}
 #endif
@@ -1961,8 +2000,10 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 	if (p_func) {
 		codegen.generator->set_initial_line(p_func->start_line);
 #ifdef TOOLS_ENABLED
-		p_script->member_lines[func_name] = p_func->start_line;
-		p_script->doc_functions[func_name] = p_func->doc_description;
+		if (!p_for_lambda) {
+			p_script->member_lines[func_name] = p_func->start_line;
+			p_script->doc_functions[func_name] = p_func->doc_description;
+		}
 #endif
 	} else {
 		codegen.generator->set_initial_line(0);
@@ -1991,11 +2032,13 @@ Error GDScriptCompiler::_parse_function(GDScript *p_script, const GDScriptParser
 #endif
 	}
 
-	p_script->member_functions[func_name] = gd_function;
+	if (!p_for_lambda) {
+		p_script->member_functions[func_name] = gd_function;
+	}
 
 	memdelete(codegen.generator);
 
-	return OK;
+	return gd_function;
 }
 
 Error GDScriptCompiler::_parse_setter_getter(GDScript *p_script, const GDScriptParser::ClassNode *p_class, const GDScriptParser::VariableNode *p_variable, bool p_is_setter) {
@@ -2388,7 +2431,8 @@ Error GDScriptCompiler::_parse_class_blocks(GDScript *p_script, const GDScriptPa
 			if (!has_ready && function->identifier->name == "_ready") {
 				has_ready = true;
 			}
-			Error err = _parse_function(p_script, p_class, function);
+			Error err = OK;
+			_parse_function(err, p_script, p_class, function);
 			if (err) {
 				return err;
 			}
@@ -2413,7 +2457,8 @@ Error GDScriptCompiler::_parse_class_blocks(GDScript *p_script, const GDScriptPa
 
 	{
 		// Create an implicit constructor in any case.
-		Error err = _parse_function(p_script, p_class, nullptr);
+		Error err = OK;
+		_parse_function(err, p_script, p_class, nullptr);
 		if (err) {
 			return err;
 		}
@@ -2421,7 +2466,8 @@ Error GDScriptCompiler::_parse_class_blocks(GDScript *p_script, const GDScriptPa
 
 	if (!has_ready && p_class->onready_used) {
 		//create a _ready constructor
-		Error err = _parse_function(p_script, p_class, nullptr, true);
+		Error err = OK;
+		_parse_function(err, p_script, p_class, nullptr, true);
 		if (err) {
 			return err;
 		}
diff --git a/modules/gdscript/gdscript_compiler.h b/modules/gdscript/gdscript_compiler.h
index 651391f972..7d5bee93ac 100644
--- a/modules/gdscript/gdscript_compiler.h
+++ b/modules/gdscript/gdscript_compiler.h
@@ -61,12 +61,12 @@ class GDScriptCompiler {
 
 		GDScriptCodeGenerator::Address add_local_constant(const StringName &p_name, const Variant &p_value) {
 			uint32_t addr = generator->add_local_constant(p_name, p_value);
-			locals[p_name] = GDScriptCodeGenerator::Address(GDScriptCodeGenerator::Address::LOCAL_CONSTANT, addr);
+			locals[p_name] = GDScriptCodeGenerator::Address(GDScriptCodeGenerator::Address::CONSTANT, addr); // Local constants now use the general CONSTANT address mode; the LOCAL_CONSTANT mode was removed.
 			return locals[p_name];
 		}
 
 		GDScriptCodeGenerator::Address add_temporary(const GDScriptDataType &p_type = GDScriptDataType()) {
-			uint32_t addr = generator->add_temporary();
+			uint32_t addr = generator->add_temporary(p_type); // The type is forwarded to the generator as well. NOTE(review): presumably so it can track or pool temporaries by type - confirm in GDScriptByteCodeGenerator::add_temporary.
 			return GDScriptCodeGenerator::Address(GDScriptCodeGenerator::Address::TEMPORARY, addr, p_type);
 		}
 
@@ -128,7 +128,7 @@ class GDScriptCompiler {
 	GDScriptCodeGenerator::Address _parse_match_pattern(CodeGen &codegen, Error &r_error, const GDScriptParser::PatternNode *p_pattern, const GDScriptCodeGenerator::Address &p_value_addr, const GDScriptCodeGenerator::Address &p_type_addr, const GDScriptCodeGenerator::Address &p_previous_test, bool p_is_first, bool p_is_nested);
 	void _add_locals_in_block(CodeGen &codegen, const GDScriptParser::SuiteNode *p_block);
 	Error _parse_block(CodeGen &codegen, const GDScriptParser::SuiteNode *p_block, bool p_add_locals = true);
-	Error _parse_function(GDScript *p_script, const GDScriptParser::ClassNode *p_class, const GDScriptParser::FunctionNode *p_func, bool p_for_ready = false);
+	GDScriptFunction *_parse_function(Error &r_error, GDScript *p_script, const GDScriptParser::ClassNode *p_class, const GDScriptParser::FunctionNode *p_func, bool p_for_ready = false, bool p_for_lambda = false);
 	Error _parse_setter_getter(GDScript *p_script, const GDScriptParser::ClassNode *p_class, const GDScriptParser::VariableNode *p_variable, bool p_is_setter);
 	Error _parse_class_level(GDScript *p_script, const GDScriptParser::ClassNode *p_class, bool p_keep_state);
 	Error _parse_class_blocks(GDScript *p_script, const GDScriptParser::ClassNode *p_class, bool p_keep_state);
diff --git a/modules/gdscript/gdscript_disassembler.cpp b/modules/gdscript/gdscript_disassembler.cpp
index 33acbb2a35..789af57b4c 100644
--- a/modules/gdscript/gdscript_disassembler.cpp
+++ b/modules/gdscript/gdscript_disassembler.cpp
@@ -69,35 +69,23 @@ static String _disassemble_address(const GDScript *p_script, const GDScriptFunct
 	int addr = p_address & GDScriptFunction::ADDR_MASK;
 
 	switch (p_address >> GDScriptFunction::ADDR_BITS) {
-		case GDScriptFunction::ADDR_TYPE_SELF: {
-			return "self";
-		} break;
-		case GDScriptFunction::ADDR_TYPE_CLASS: {
-			return "class";
-		} break;
 		case GDScriptFunction::ADDR_TYPE_MEMBER: {
 			return "member(" + p_script->debug_get_member_by_index(addr) + ")";
 		} break;
-		case GDScriptFunction::ADDR_TYPE_CLASS_CONSTANT: {
-			return "class_const(" + p_function.get_global_name(addr) + ")";
-		} break;
-		case GDScriptFunction::ADDR_TYPE_LOCAL_CONSTANT: {
+		case GDScriptFunction::ADDR_TYPE_CONSTANT: {
 			return "const(" + _get_variant_string(p_function.get_constant(addr)) + ")";
 		} break;
 		case GDScriptFunction::ADDR_TYPE_STACK: {
-			return "stack(" + itos(addr) + ")";
-		} break;
-		case GDScriptFunction::ADDR_TYPE_STACK_VARIABLE: {
-			return "var_stack(" + itos(addr) + ")";
-		} break;
-		case GDScriptFunction::ADDR_TYPE_GLOBAL: {
-			return "global(" + _get_variant_string(GDScriptLanguage::get_singleton()->get_global_array()[addr]) + ")";
-		} break;
-		case GDScriptFunction::ADDR_TYPE_NAMED_GLOBAL: {
-			return "named_global(" + p_function.get_global_name(addr) + ")";
-		} break;
-		case GDScriptFunction::ADDR_TYPE_NIL: {
-			return "nil";
+			switch (addr) {
+				case GDScriptFunction::ADDR_STACK_SELF:
+					return "self";
+				case GDScriptFunction::ADDR_STACK_CLASS:
+					return "class";
+				case GDScriptFunction::ADDR_STACK_NIL:
+					return "nil";
+				default:
+					return "stack(" + itos(addr) + ")";
+			}
 		} break;
 	}
 
@@ -733,7 +721,7 @@ void GDScriptFunction::disassemble(const Vector<String> &p_code_lines) const {
 				text += "await ";
 				text += DADDR(1);
 
-				incr += 2;
+				incr = 2;
 			} break;
 			case OPCODE_AWAIT_RESUME: {
 				text += "await resume ";
@@ -741,6 +729,25 @@ void GDScriptFunction::disassemble(const Vector<String> &p_code_lines) const {
 
 				incr = 2;
 			} break;
+			case OPCODE_CREATE_LAMBDA: { // Disassembles lambda creation: target address, lambda name, captured addresses.
+				int captures_count = _code_ptr[ip + 1 + instr_var_args]; // Number of capture addresses in this instruction.
+				GDScriptFunction *lambda = _lambdas_ptr[_code_ptr[ip + 2 + instr_var_args]]; // Looked up by index in the lambda table.
+
+				text += DADDR(1 + captures_count); // Slot right after the captures; presumably the result target - TODO confirm.
+				text += " = create lambda from "; // Separator keeps the address from fusing with the description.
+				text += lambda->name.operator String();
+				text += " function, captures ("; // Leading space keeps the lambda name from fusing with "function".
+
+				for (int i = 0; i < captures_count; i++) {
+					if (i > 0) {
+						text += ", ";
+					}
+					text += DADDR(1 + i);
+				}
+				text += ")";
+
+				incr = 3 + captures_count;
+			} break;
 			case OPCODE_JUMP: {
 				text += "jump ";
 				text += itos(_code_ptr[ip + 1]);
@@ -885,6 +892,14 @@ void GDScriptFunction::disassemble(const Vector<String> &p_code_lines) const {
 				incr += 5;
 			} break;
 				DISASSEMBLE_ITERATE_TYPES(DISASSEMBLE_ITERATE);
+			case OPCODE_STORE_NAMED_GLOBAL: {
+				text += "store named global ";
+				text += DADDR(1);
+				text += " = ";
+				text += String(_global_names_ptr[_code_ptr[ip + 2]]);
+
+				incr += 3;
+			} break;
 			case OPCODE_LINE: {
 				int line = _code_ptr[ip + 1] - 1;
 				if (line >= 0 && line < p_code_lines.size()) {
@@ -898,6 +913,51 @@ void GDScriptFunction::disassemble(const Vector<String> &p_code_lines) const {
 
 				incr += 2;
 			} break;
+
+#define DISASSEMBLE_TYPE_ADJUST(m_v_type) \
+	case OPCODE_TYPE_ADJUST_##m_v_type: { \
+		text += "type adjust (";          \
+		text += #m_v_type;                \
+		text += ") ";                     \
+		text += DADDR(1);                 \
+		incr += 2;                        \
+	} break
+
+				DISASSEMBLE_TYPE_ADJUST(BOOL);
+				DISASSEMBLE_TYPE_ADJUST(INT);
+				DISASSEMBLE_TYPE_ADJUST(FLOAT);
+				DISASSEMBLE_TYPE_ADJUST(STRING);
+				DISASSEMBLE_TYPE_ADJUST(VECTOR2);
+				DISASSEMBLE_TYPE_ADJUST(VECTOR2I);
+				DISASSEMBLE_TYPE_ADJUST(RECT2);
+				DISASSEMBLE_TYPE_ADJUST(RECT2I);
+				DISASSEMBLE_TYPE_ADJUST(VECTOR3);
+				DISASSEMBLE_TYPE_ADJUST(VECTOR3I);
+				DISASSEMBLE_TYPE_ADJUST(TRANSFORM2D);
+				DISASSEMBLE_TYPE_ADJUST(PLANE);
+				DISASSEMBLE_TYPE_ADJUST(QUAT);
+				DISASSEMBLE_TYPE_ADJUST(AABB);
+				DISASSEMBLE_TYPE_ADJUST(BASIS);
+				DISASSEMBLE_TYPE_ADJUST(TRANSFORM);
+				DISASSEMBLE_TYPE_ADJUST(COLOR);
+				DISASSEMBLE_TYPE_ADJUST(STRING_NAME);
+				DISASSEMBLE_TYPE_ADJUST(NODE_PATH);
+				DISASSEMBLE_TYPE_ADJUST(RID);
+				DISASSEMBLE_TYPE_ADJUST(OBJECT);
+				DISASSEMBLE_TYPE_ADJUST(CALLABLE);
+				DISASSEMBLE_TYPE_ADJUST(SIGNAL);
+				DISASSEMBLE_TYPE_ADJUST(DICTIONARY);
+				DISASSEMBLE_TYPE_ADJUST(ARRAY);
+				DISASSEMBLE_TYPE_ADJUST(PACKED_BYTE_ARRAY);
+				DISASSEMBLE_TYPE_ADJUST(PACKED_INT32_ARRAY);
+				DISASSEMBLE_TYPE_ADJUST(PACKED_INT64_ARRAY);
+				DISASSEMBLE_TYPE_ADJUST(PACKED_FLOAT32_ARRAY);
+				DISASSEMBLE_TYPE_ADJUST(PACKED_FLOAT64_ARRAY);
+				DISASSEMBLE_TYPE_ADJUST(PACKED_STRING_ARRAY);
+				DISASSEMBLE_TYPE_ADJUST(PACKED_VECTOR2_ARRAY);
+				DISASSEMBLE_TYPE_ADJUST(PACKED_VECTOR3_ARRAY);
+				DISASSEMBLE_TYPE_ADJUST(PACKED_COLOR_ARRAY);
+
 			case OPCODE_ASSERT: {
 				text += "assert (";
 				text += DADDR(1);
diff --git a/modules/gdscript/gdscript_editor.cpp b/modules/gdscript/gdscript_editor.cpp
index 136c9f2afb..099abd35a7 100644
--- a/modules/gdscript/gdscript_editor.cpp
+++ b/modules/gdscript/gdscript_editor.cpp
@@ -3075,7 +3075,7 @@ Error GDScriptLanguage::lookup_code(const String &p_code, const String &p_symbol
 						// We cannot determine the exact nature of the identifier here
 						// Otherwise these codes would work
 						StringName enumName = ClassDB::get_integer_constant_enum("@GlobalScope", p_symbol, true);
-						if (enumName != NULL) {
+						if (enumName != nullptr) {
 							r_result.type = ScriptLanguage::LookupResult::RESULT_CLASS_ENUM;
 							r_result.class_name = "@GlobalScope";
 							r_result.class_member = enumName;
diff --git a/modules/gdscript/gdscript_function.cpp b/modules/gdscript/gdscript_function.cpp
index 7b37aa40a2..78399114a5 100644
--- a/modules/gdscript/gdscript_function.cpp
+++ b/modules/gdscript/gdscript_function.cpp
@@ -150,6 +150,10 @@ GDScriptFunction::GDScriptFunction() {
 }
 
 GDScriptFunction::~GDScriptFunction() {
+	for (int i = 0; i < lambdas.size(); i++) {
+		memdelete(lambdas[i]);
+	}
+
 #ifdef DEBUG_ENABLED
 
 	MutexLock lock(GDScriptLanguage::get_singleton()->lock);
diff --git a/modules/gdscript/gdscript_function.h b/modules/gdscript/gdscript_function.h
index 9fc75b66ce..70b62ced6d 100644
--- a/modules/gdscript/gdscript_function.h
+++ b/modules/gdscript/gdscript_function.h
@@ -301,6 +301,7 @@ public:
 		OPCODE_CALL_PTRCALL_PACKED_COLOR_ARRAY,
 		OPCODE_AWAIT,
 		OPCODE_AWAIT_RESUME,
+		OPCODE_CREATE_LAMBDA,
 		OPCODE_JUMP,
 		OPCODE_JUMP_IF,
 		OPCODE_JUMP_IF_NOT,
@@ -350,6 +351,41 @@ public:
 		OPCODE_ITERATE_PACKED_VECTOR3_ARRAY,
 		OPCODE_ITERATE_PACKED_COLOR_ARRAY,
 		OPCODE_ITERATE_OBJECT,
+		OPCODE_STORE_NAMED_GLOBAL,
+		OPCODE_TYPE_ADJUST_BOOL,
+		OPCODE_TYPE_ADJUST_INT,
+		OPCODE_TYPE_ADJUST_FLOAT,
+		OPCODE_TYPE_ADJUST_STRING,
+		OPCODE_TYPE_ADJUST_VECTOR2,
+		OPCODE_TYPE_ADJUST_VECTOR2I,
+		OPCODE_TYPE_ADJUST_RECT2,
+		OPCODE_TYPE_ADJUST_RECT2I,
+		OPCODE_TYPE_ADJUST_VECTOR3,
+		OPCODE_TYPE_ADJUST_VECTOR3I,
+		OPCODE_TYPE_ADJUST_TRANSFORM2D,
+		OPCODE_TYPE_ADJUST_PLANE,
+		OPCODE_TYPE_ADJUST_QUAT,
+		OPCODE_TYPE_ADJUST_AABB,
+		OPCODE_TYPE_ADJUST_BASIS,
+		OPCODE_TYPE_ADJUST_TRANSFORM,
+		OPCODE_TYPE_ADJUST_COLOR,
+		OPCODE_TYPE_ADJUST_STRING_NAME,
+		OPCODE_TYPE_ADJUST_NODE_PATH,
+		OPCODE_TYPE_ADJUST_RID,
+		OPCODE_TYPE_ADJUST_OBJECT,
+		OPCODE_TYPE_ADJUST_CALLABLE,
+		OPCODE_TYPE_ADJUST_SIGNAL,
+		OPCODE_TYPE_ADJUST_DICTIONARY,
+		OPCODE_TYPE_ADJUST_ARRAY,
+		OPCODE_TYPE_ADJUST_PACKED_BYTE_ARRAY,
+		OPCODE_TYPE_ADJUST_PACKED_INT32_ARRAY,
+		OPCODE_TYPE_ADJUST_PACKED_INT64_ARRAY,
+		OPCODE_TYPE_ADJUST_PACKED_FLOAT32_ARRAY,
+		OPCODE_TYPE_ADJUST_PACKED_FLOAT64_ARRAY,
+		OPCODE_TYPE_ADJUST_PACKED_STRING_ARRAY,
+		OPCODE_TYPE_ADJUST_PACKED_VECTOR2_ARRAY,
+		OPCODE_TYPE_ADJUST_PACKED_VECTOR3_ARRAY,
+		OPCODE_TYPE_ADJUST_PACKED_COLOR_ARRAY,
 		OPCODE_ASSERT,
 		OPCODE_BREAKPOINT,
 		OPCODE_LINE,
@@ -360,16 +396,18 @@ public:
 		ADDR_BITS = 24,
 		ADDR_MASK = ((1 << ADDR_BITS) - 1),
 		ADDR_TYPE_MASK = ~ADDR_MASK,
-		ADDR_TYPE_SELF = 0,
-		ADDR_TYPE_CLASS = 1,
+		ADDR_TYPE_STACK = 0,
+		ADDR_TYPE_CONSTANT = 1,
 		ADDR_TYPE_MEMBER = 2,
-		ADDR_TYPE_CLASS_CONSTANT = 3,
-		ADDR_TYPE_LOCAL_CONSTANT = 4,
-		ADDR_TYPE_STACK = 5,
-		ADDR_TYPE_STACK_VARIABLE = 6,
-		ADDR_TYPE_GLOBAL = 7,
-		ADDR_TYPE_NAMED_GLOBAL = 8,
-		ADDR_TYPE_NIL = 9
+	};
+
+	enum FixedAddresses { // Reserved stack slots and their fully encoded addresses.
+		ADDR_STACK_SELF = 0, // Stack slot the disassembler prints as "self".
+		ADDR_STACK_CLASS = 1, // Stack slot the disassembler prints as "class".
+		ADDR_STACK_NIL = 2, // Stack slot the disassembler prints as "nil".
+		ADDR_SELF = ADDR_STACK_SELF | (ADDR_TYPE_STACK << ADDR_BITS), // Slot index combined with the stack type bits.
+		ADDR_CLASS = ADDR_STACK_CLASS | (ADDR_TYPE_STACK << ADDR_BITS), // Slot index combined with the stack type bits.
+		ADDR_NIL = ADDR_STACK_NIL | (ADDR_TYPE_STACK << ADDR_BITS), // Slot index combined with the stack type bits.
+	};
 
 	enum Instruction {
@@ -422,6 +460,8 @@ private:
 	const GDScriptUtilityFunctions::FunctionPtr *_gds_utilities_ptr = nullptr;
 	int _methods_count = 0;
 	MethodBind **_methods_ptr = nullptr;
+	int _lambdas_count = 0;
+	GDScriptFunction **_lambdas_ptr = nullptr;
 	const int *_code_ptr = nullptr;
 	int _code_size = 0;
 	int _argument_count = 0;
@@ -451,6 +491,7 @@ private:
 	Vector<Variant::ValidatedUtilityFunction> utilities;
 	Vector<GDScriptUtilityFunctions::FunctionPtr> gds_utilities;
 	Vector<MethodBind *> methods;
+	Vector<GDScriptFunction *> lambdas;
 	Vector<int> code;
 	Vector<GDScriptDataType> argument_types;
 	GDScriptDataType return_type;
@@ -462,7 +503,7 @@ private:
 
 	List<StackDebug> stack_debug;
 
-	_FORCE_INLINE_ Variant *_get_variant(int p_address, GDScriptInstance *p_instance, GDScript *p_script, Variant &self, Variant &static_ref, Variant *p_stack, String &r_error) const;
+	_FORCE_INLINE_ Variant *_get_variant(int p_address, GDScriptInstance *p_instance, Variant *p_stack, String &r_error) const;
 	_FORCE_INLINE_ String _get_call_error(const Callable::CallError &p_err, const String &p_where, const Variant **argptrs) const;
 
 	friend class GDScriptLanguage;
@@ -497,7 +538,6 @@ public:
 #endif
 		Vector<uint8_t> stack;
 		int stack_size = 0;
-		Variant self;
 		uint32_t alloca_size = 0;
 		int ip = 0;
 		int line = 0;
diff --git a/modules/gdscript/gdscript_lambda_callable.cpp b/modules/gdscript/gdscript_lambda_callable.cpp
new file mode 100644
index 0000000000..0bc109b6e1
--- /dev/null
+++ b/modules/gdscript/gdscript_lambda_callable.cpp
@@ -0,0 +1,95 @@
+/*************************************************************************/
+/*  gdscript_lambda_callable.cpp                                         */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "gdscript_lambda_callable.h"
+
+#include "core/templates/hashfuncs.h"
+#include "gdscript.h"
+
+bool GDScriptLambdaCallable::compare_equal(const CallableCustom *p_a, const CallableCustom *p_b) {
+	// Lambda callables are only compared by reference: equal only when both pointers are the same.
+	return p_a == p_b;
+}
+
+bool GDScriptLambdaCallable::compare_less(const CallableCustom *p_a, const CallableCustom *p_b) {
+	// Lambda callables are only compared by reference: ordering follows the pointer values.
+	return p_a < p_b;
+}
+
+uint32_t GDScriptLambdaCallable::hash() const {
+	return h; // Value stored by the constructor.
+}
+
+String GDScriptLambdaCallable::get_as_text() const {
+	if (function->get_name() != StringName()) { // Named lambda: show its name.
+		return function->get_name().operator String() + "(lambda)";
+	}
+	return "(anonymous lambda)"; // Empty name: fixed placeholder text.
+}
+
+CallableCustom::CompareEqualFunc GDScriptLambdaCallable::get_compare_equal_func() const {
+	return compare_equal; // Static pointer-identity comparison of this class.
+}
+
+CallableCustom::CompareLessFunc GDScriptLambdaCallable::get_compare_less_func() const {
+	return compare_less; // Static pointer-order comparison of this class.
+}
+
+ObjectID GDScriptLambdaCallable::get_object() const {
+	return script->get_instance_id(); // Reports the script's instance ID. NOTE(review): script is dereferenced unchecked - confirm it is never null.
+}
+
+void GDScriptLambdaCallable::call(const Variant **p_arguments, int p_argcount, Variant &r_return_value, Callable::CallError &r_call_error) const {
+	int captures_amount = captures.size(); // Captured values are passed ahead of the caller's arguments.
+
+	if (captures_amount > 0) {
+		Vector<const Variant *> args; // Combined list: captures first, then the caller's arguments.
+		args.resize(p_argcount + captures_amount);
+		for (int i = 0; i < captures_amount; i++) {
+			args.write[i] = &captures[i];
+		}
+		for (int i = 0; i < p_argcount; i++) {
+			args.write[i + captures_amount] = p_arguments[i];
+		}
+
+		r_return_value = function->call(nullptr, args.ptrw(), args.size(), r_call_error);
+		r_call_error.argument -= captures_amount; // Shift the reported argument so it does not count the prepended captures.
+	} else {
+		r_return_value = function->call(nullptr, p_arguments, p_argcount, r_call_error); // No captures: forward the arguments unchanged.
+	}
+}
+
+GDScriptLambdaCallable::GDScriptLambdaCallable(Ref<GDScript> p_script, GDScriptFunction *p_function, const Vector<Variant> &p_captures) {
+	script = p_script; // Used by get_object().
+	function = p_function; // Invoked by call(); stored as a raw pointer.
+	captures = p_captures; // Prepended to the arguments on every call.
+
+	h = (uint32_t)hash_djb2_one_64((uint64_t)this); // Hash derived from this object's address, in line with the by-reference comparison.
+}
diff --git a/modules/gdscript/gdscript_lambda_callable.h b/modules/gdscript/gdscript_lambda_callable.h
new file mode 100644
index 0000000000..357c845250
--- /dev/null
+++ b/modules/gdscript/gdscript_lambda_callable.h
@@ -0,0 +1,65 @@
+/*************************************************************************/
+/*  gdscript_lambda_callable.h                                           */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef GDSCRIPT_LAMBDA_CALLABLE
+#define GDSCRIPT_LAMBDA_CALLABLE
+
+#include "core/object/reference.h"
+#include "core/templates/vector.h"
+#include "core/variant/callable.h"
+#include "core/variant/variant.h"
+
+class GDScript;
+class GDScriptFunction;
+class GDScriptInstance;
+
+class GDScriptLambdaCallable : public CallableCustom { // Custom callable pairing a GDScript function with captured values.
+	GDScriptFunction *function = nullptr; // Function invoked by call().
+	Ref<GDScript> script; // Script whose instance ID get_object() reports.
+	uint32_t h; // Hash returned by hash(); assigned in the constructor.
+
+	Vector<Variant> captures; // Values passed ahead of the caller's arguments on every call.
+
+	static bool compare_equal(const CallableCustom *p_a, const CallableCustom *p_b); // Pointer equality.
+	static bool compare_less(const CallableCustom *p_a, const CallableCustom *p_b); // Pointer ordering.
+
+public:
+	uint32_t hash() const override;
+	String get_as_text() const override;
+	CompareEqualFunc get_compare_equal_func() const override;
+	CompareLessFunc get_compare_less_func() const override;
+	ObjectID get_object() const override;
+	void call(const Variant **p_arguments, int p_argcount, Variant &r_return_value, Callable::CallError &r_call_error) const override;
+
+	GDScriptLambdaCallable(Ref<GDScript> p_script, GDScriptFunction *p_function, const Vector<Variant> &p_captures);
+	virtual ~GDScriptLambdaCallable() = default;
+};
+
+#endif // GDSCRIPT_LAMBDA_CALLABLE
diff --git a/modules/gdscript/gdscript_parser.cpp b/modules/gdscript/gdscript_parser.cpp
index 695154e9a9..f9027c3a87 100644
--- a/modules/gdscript/gdscript_parser.cpp
+++ b/modules/gdscript/gdscript_parser.cpp
@@ -402,6 +402,8 @@ Error GDScriptParser::parse(const String &p_source_code, const String &p_script_
 }
 
 GDScriptTokenizer::Token GDScriptParser::advance() {
+	lambda_ended = false; // Empty marker since we're past the end in any case.
+
 	if (current.type == GDScriptTokenizer::Token::TK_EOF) {
 		ERR_FAIL_COND_V_MSG(current.type == GDScriptTokenizer::Token::TK_EOF, current, "GDScript parser bug: Trying to advance past the end of stream.");
 	}
@@ -428,7 +430,7 @@ bool GDScriptParser::match(GDScriptTokenizer::Token::Type p_token_type) {
 	return true;
 }
 
-bool GDScriptParser::check(GDScriptTokenizer::Token::Type p_token_type) {
+bool GDScriptParser::check(GDScriptTokenizer::Token::Type p_token_type) const {
 	if (p_token_type == GDScriptTokenizer::Token::IDENTIFIER) {
 		return current.is_identifier();
 	}
@@ -443,7 +445,7 @@ bool GDScriptParser::consume(GDScriptTokenizer::Token::Type p_token_type, const
 	return false;
 }
 
-bool GDScriptParser::is_at_end() {
+bool GDScriptParser::is_at_end() const {
 	return check(GDScriptTokenizer::Token::TK_EOF);
 }
 
@@ -494,16 +496,34 @@ void GDScriptParser::pop_multiline() {
 	tokenizer.set_multiline_mode(multiline_stack.size() > 0 ? multiline_stack.back()->get() : false);
 }
 
-bool GDScriptParser::is_statement_end() {
+bool GDScriptParser::is_statement_end_token() const {
 	return check(GDScriptTokenizer::Token::NEWLINE) || check(GDScriptTokenizer::Token::SEMICOLON) || check(GDScriptTokenizer::Token::TK_EOF);
 }
 
+bool GDScriptParser::is_statement_end() const {
+	return lambda_ended || in_lambda || is_statement_end_token(); // True on a real end token, or whenever either lambda flag is set.
+}
+
 void GDScriptParser::end_statement(const String &p_context) {
 	bool found = false;
 	while (is_statement_end() && !is_at_end()) {
 		// Remove sequential newlines/semicolons.
+		if (is_statement_end_token()) {
+			// Only consume if this is an actual token.
+			advance();
+		} else if (lambda_ended) {
+			lambda_ended = false; // Consume this "token".
+			found = true;
+			break;
+		} else {
+			if (!found) {
+				lambda_ended = true; // Mark the lambda as done since we found something else to end the statement.
+				found = true;
+			}
+			break;
+		}
+
 		found = true;
-		advance();
 	}
 	if (!found && !is_at_end()) {
 		push_error(vformat(R"(Expected end of statement after %s, found "%s" instead.)", p_context, current.get_name()));
@@ -811,6 +831,7 @@ GDScriptParser::VariableNode *GDScriptParser::parse_variable(bool p_allow_proper
 
 	VariableNode *variable = alloc_node<VariableNode>();
 	variable->identifier = parse_identifier();
+	variable->export_info.name = variable->identifier->name;
 
 	if (match(GDScriptTokenizer::Token::COLON)) {
 		if (check(GDScriptTokenizer::Token::NEWLINE)) {
@@ -860,8 +881,6 @@ GDScriptParser::VariableNode *GDScriptParser::parse_variable(bool p_allow_proper
 
 	end_statement("variable declaration");
 
-	variable->export_info.name = variable->identifier->name;
-
 	return variable;
 }
 
@@ -1183,36 +1202,7 @@ GDScriptParser::EnumNode *GDScriptParser::parse_enum() {
 	return enum_node;
 }
 
-GDScriptParser::FunctionNode *GDScriptParser::parse_function() {
-	bool _static = false;
-	if (previous.type == GDScriptTokenizer::Token::STATIC) {
-		// TODO: Improve message if user uses "static" with "var" or "const"
-		if (!consume(GDScriptTokenizer::Token::FUNC, R"(Expected "func" after "static".)")) {
-			return nullptr;
-		}
-		_static = true;
-	}
-
-	FunctionNode *function = alloc_node<FunctionNode>();
-	make_completion_context(COMPLETION_OVERRIDE_METHOD, function);
-
-	if (!consume(GDScriptTokenizer::Token::IDENTIFIER, R"(Expected function name after "func".)")) {
-		return nullptr;
-	}
-
-	FunctionNode *previous_function = current_function;
-	current_function = function;
-
-	function->identifier = parse_identifier();
-	function->is_static = _static;
-
-	push_multiline(true);
-	consume(GDScriptTokenizer::Token::PARENTHESIS_OPEN, R"(Expected opening "(" after function name.)");
-
-	SuiteNode *body = alloc_node<SuiteNode>();
-	SuiteNode *previous_suite = current_suite;
-	current_suite = body;
-
+void GDScriptParser::parse_function_signature(FunctionNode *p_function, SuiteNode *p_body, const String &p_type) {
 	if (!check(GDScriptTokenizer::Token::PARENTHESIS_CLOSE) && !is_at_end()) {
 		bool default_used = false;
 		do {
@@ -1232,29 +1222,61 @@ GDScriptParser::FunctionNode *GDScriptParser::parse_function() {
 					continue;
 				}
 			}
-			if (function->parameters_indices.has(parameter->identifier->name)) {
-				push_error(vformat(R"(Parameter with name "%s" was already declared for this function.)", parameter->identifier->name));
+			if (p_function->parameters_indices.has(parameter->identifier->name)) {
+				push_error(vformat(R"(Parameter with name "%s" was already declared for this %s.)", parameter->identifier->name, p_type));
 			} else {
-				function->parameters_indices[parameter->identifier->name] = function->parameters.size();
-				function->parameters.push_back(parameter);
-				body->add_local(parameter);
+				p_function->parameters_indices[parameter->identifier->name] = p_function->parameters.size();
+				p_function->parameters.push_back(parameter);
+				p_body->add_local(parameter, current_function);
 			}
 		} while (match(GDScriptTokenizer::Token::COMMA));
 	}
 
 	pop_multiline();
-	consume(GDScriptTokenizer::Token::PARENTHESIS_CLOSE, R"*(Expected closing ")" after function parameters.)*");
+	consume(GDScriptTokenizer::Token::PARENTHESIS_CLOSE, vformat(R"*(Expected closing ")" after %s parameters.)*", p_type));
 
 	if (match(GDScriptTokenizer::Token::FORWARD_ARROW)) {
-		make_completion_context(COMPLETION_TYPE_NAME_OR_VOID, function);
-		function->return_type = parse_type(true);
-		if (function->return_type == nullptr) {
+		make_completion_context(COMPLETION_TYPE_NAME_OR_VOID, p_function);
+		p_function->return_type = parse_type(true);
+		if (p_function->return_type == nullptr) {
 			push_error(R"(Expected return type or "void" after "->".)");
 		}
 	}
 
 	// TODO: Improve token consumption so it synchronizes to a statement boundary. This way we can get into the function body with unrecognized tokens.
-	consume(GDScriptTokenizer::Token::COLON, R"(Expected ":" after function declaration.)");
+	consume(GDScriptTokenizer::Token::COLON, vformat(R"(Expected ":" after %s declaration.)", p_type));
+}
+
+GDScriptParser::FunctionNode *GDScriptParser::parse_function() {
+	bool _static = false;
+	if (previous.type == GDScriptTokenizer::Token::STATIC) {
+		// TODO: Improve message if user uses "static" with "var" or "const"
+		if (!consume(GDScriptTokenizer::Token::FUNC, R"(Expected "func" after "static".)")) {
+			return nullptr;
+		}
+		_static = true;
+	}
+
+	FunctionNode *function = alloc_node<FunctionNode>();
+	make_completion_context(COMPLETION_OVERRIDE_METHOD, function);
+
+	if (!consume(GDScriptTokenizer::Token::IDENTIFIER, R"(Expected function name after "func".)")) {
+		return nullptr;
+	}
+
+	FunctionNode *previous_function = current_function;
+	current_function = function;
+
+	function->identifier = parse_identifier();
+	function->is_static = _static;
+
+	SuiteNode *body = alloc_node<SuiteNode>();
+	SuiteNode *previous_suite = current_suite;
+	current_suite = body;
+
+	push_multiline(true);
+	consume(GDScriptTokenizer::Token::PARENTHESIS_OPEN, R"(Expected opening "(" after function name.)");
+	parse_function_signature(function, body, "function");
 
 	current_suite = previous_suite;
 	function->body = parse_suite("function declaration", body);
@@ -1340,29 +1362,34 @@ bool GDScriptParser::register_annotation(const MethodInfo &p_info, uint32_t p_ta
 	return true;
 }
 
-GDScriptParser::SuiteNode *GDScriptParser::parse_suite(const String &p_context, SuiteNode *p_suite) {
+GDScriptParser::SuiteNode *GDScriptParser::parse_suite(const String &p_context, SuiteNode *p_suite, bool p_for_lambda) {
 	SuiteNode *suite = p_suite != nullptr ? p_suite : alloc_node<SuiteNode>();
 	suite->parent_block = current_suite;
+	suite->parent_function = current_function;
 	current_suite = suite;
 
 	bool multiline = false;
 
-	if (check(GDScriptTokenizer::Token::NEWLINE)) {
+	if (match(GDScriptTokenizer::Token::NEWLINE)) {
 		multiline = true;
 	}
 
 	if (multiline) {
-		consume(GDScriptTokenizer::Token::NEWLINE, vformat(R"(Expected newline after %s.)", p_context));
-
 		if (!consume(GDScriptTokenizer::Token::INDENT, vformat(R"(Expected indented block after %s.)", p_context))) {
 			current_suite = suite->parent_block;
 			return suite;
 		}
 	}
 
+	int error_count = 0;
+
 	do {
 		Node *statement = parse_statement();
 		if (statement == nullptr) {
+			if (error_count++ > 100) {
+				push_error("Too many statement errors.", suite);
+				break;
+			}
 			continue;
 		}
 		suite->statements.push_back(statement);
@@ -1375,7 +1402,7 @@ GDScriptParser::SuiteNode *GDScriptParser::parse_suite(const String &p_context,
 				if (local.type != SuiteNode::Local::UNDEFINED) {
 					push_error(vformat(R"(There is already a %s named "%s" declared in this scope.)", local.get_name(), variable->identifier->name));
 				}
-				current_suite->add_local(variable);
+				current_suite->add_local(variable, current_function);
 				break;
 			}
 			case Node::CONSTANT: {
@@ -1390,19 +1417,29 @@ GDScriptParser::SuiteNode *GDScriptParser::parse_suite(const String &p_context,
 					}
 					push_error(vformat(R"(There is already a %s named "%s" declared in this scope.)", name, constant->identifier->name));
 				}
-				current_suite->add_local(constant);
+				current_suite->add_local(constant, current_function);
 				break;
 			}
 			default:
 				break;
 		}
 
-	} while (multiline && !check(GDScriptTokenizer::Token::DEDENT) && !is_at_end());
+	} while (multiline && !check(GDScriptTokenizer::Token::DEDENT) && !lambda_ended && !is_at_end());
 
 	if (multiline) {
-		consume(GDScriptTokenizer::Token::DEDENT, vformat(R"(Missing unindent at the end of %s.)", p_context));
+		if (!lambda_ended) {
+			consume(GDScriptTokenizer::Token::DEDENT, vformat(R"(Missing unindent at the end of %s.)", p_context));
+
+		} else {
+			match(GDScriptTokenizer::Token::DEDENT);
+		}
+	} else if (previous.type == GDScriptTokenizer::Token::SEMICOLON) {
+		consume(GDScriptTokenizer::Token::NEWLINE, vformat(R"(Expected newline after ";" at the end of %s.)", p_context));
 	}
 
+	if (p_for_lambda) {
+		lambda_ended = true;
+	}
 	current_suite = suite->parent_block;
 	return suite;
 }
@@ -1459,6 +1496,10 @@ GDScriptParser::Node *GDScriptParser::parse_statement() {
 					push_error(R"(Constructor cannot return a value.)");
 				}
 				n_return->return_value = parse_expression(false);
+			} else if (in_lambda && !is_statement_end_token()) {
+				// Try to parse it anyway as this might not be the statement end in a lambda.
+				// If this fails the expression will be nullptr, but that's the same as no return, so it's fine.
+				n_return->return_value = parse_expression(false);
 			}
 			result = n_return;
 
@@ -1487,10 +1528,18 @@ GDScriptParser::Node *GDScriptParser::parse_statement() {
 		default: {
 			// Expression statement.
 			ExpressionNode *expression = parse_expression(true); // Allow assignment here.
+			bool has_ended_lambda = false;
 			if (expression == nullptr) {
-				push_error(vformat(R"(Expected statement, found "%s" instead.)", previous.get_name()));
+				if (in_lambda) {
+					// If it's not a valid expression beginning, it might be the continuation of the outer expression where this lambda is.
+					lambda_ended = true;
+					has_ended_lambda = true;
+				} else {
+					push_error(vformat(R"(Expected statement, found "%s" instead.)", previous.get_name()));
+				}
 			}
 			end_statement("expression");
+			lambda_ended = lambda_ended || has_ended_lambda;
 			result = expression;
 
 #ifdef DEBUG_ENABLED
@@ -1514,7 +1563,7 @@ GDScriptParser::Node *GDScriptParser::parse_statement() {
 	if (unreachable && result != nullptr) {
 		current_suite->has_unreachable_code = true;
 		if (current_function) {
-			push_warning(result, GDScriptWarning::UNREACHABLE_CODE, current_function->identifier->name);
+			push_warning(result, GDScriptWarning::UNREACHABLE_CODE, current_function->identifier ? current_function->identifier->name : "<anonymous lambda>");
 		} else {
 			// TODO: Properties setters and getters with unreachable code are not being warned
 		}
@@ -1599,7 +1648,7 @@ GDScriptParser::ForNode *GDScriptParser::parse_for() {
 
 	SuiteNode *suite = alloc_node<SuiteNode>();
 	if (n_for->variable) {
-		suite->add_local(SuiteNode::Local(n_for->variable));
+		suite->add_local(SuiteNode::Local(n_for->variable, current_function));
 	}
 	suite->parent_for = n_for;
 
@@ -1754,7 +1803,7 @@ GDScriptParser::MatchBranchNode *GDScriptParser::parse_match_branch() {
 		branch->patterns[0]->binds.get_key_list(&binds);
 
 		for (List<StringName>::Element *E = binds.front(); E != nullptr; E = E->next()) {
-			SuiteNode::Local local(branch->patterns[0]->binds[E->get()]);
+			SuiteNode::Local local(branch->patterns[0]->binds[E->get()], current_function);
 			suite->add_local(local);
 		}
 	}
@@ -1954,7 +2003,7 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_precedence(Precedence p_pr
 	// Completion can appear whenever an expression is expected.
 	make_completion_context(COMPLETION_IDENTIFIER, nullptr);
 
-	GDScriptTokenizer::Token token = advance();
+	GDScriptTokenizer::Token token = current;
 	ParseFunction prefix_rule = get_rule(token.type)->prefix;
 
 	if (prefix_rule == nullptr) {
@@ -1962,6 +2011,8 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_precedence(Precedence p_pr
 		return nullptr;
 	}
 
+	advance(); // Only consume the token if there's a valid rule.
+
 	ExpressionNode *previous_operand = (this->*prefix_rule)(nullptr, p_can_assign);
 
 	while (p_precedence <= get_rule(current.type)->precedence) {
@@ -2003,6 +2054,8 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_identifier(ExpressionNode
 
 	if (current_suite != nullptr && current_suite->has_local(identifier->name)) {
 		const SuiteNode::Local &declaration = current_suite->get_local(identifier->name);
+
+		identifier->source_function = declaration.source_function;
 		switch (declaration.type) {
 			case SuiteNode::Local::CONSTANT:
 				identifier->source = IdentifierNode::LOCAL_CONSTANT;
@@ -2056,6 +2109,9 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_self(ExpressionNode *p_pre
 	if (current_function && current_function->is_static) {
 		push_error(R"(Cannot use "self" inside a static function.)");
 	}
+	if (in_lambda) {
+		push_error(R"(Cannot use "self" inside a lambda.)");
+	}
 	SelfNode *self = alloc_node<SelfNode>();
 	self->current_class = current_class;
 	return self;
@@ -2441,6 +2497,8 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_dictionary(ExpressionNode
 							push_error(R"(Expected "=" after dictionary key.)");
 						}
 					}
+					key->is_constant = true;
+					key->reduced_value = static_cast<IdentifierNode *>(key)->name;
 					break;
 				case DictionaryNode::PYTHON_DICT:
 					if (!match(GDScriptTokenizer::Token::COLON)) {
@@ -2487,7 +2545,7 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_attribute(ExpressionNode *
 
 	if (for_completion) {
 		bool is_builtin = false;
-		if (p_previous_operand->type == Node::IDENTIFIER) {
+		if (p_previous_operand && p_previous_operand->type == Node::IDENTIFIER) {
 			const IdentifierNode *id = static_cast<const IdentifierNode *>(p_previous_operand);
 			Variant::Type builtin_type = get_builtin_type(id->name);
 			if (builtin_type < Variant::VARIANT_MAX) {
@@ -2674,6 +2732,65 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_preload(ExpressionNode *p_
 	return preload;
 }
 
+GDScriptParser::ExpressionNode *GDScriptParser::parse_lambda(ExpressionNode *p_previous_operand, bool p_can_assign) { // Parses a lambda expression into a LambdaNode owning a new FunctionNode; neither parameter is used here.
+	LambdaNode *lambda = alloc_node<LambdaNode>();
+	lambda->parent_function = current_function;
+	FunctionNode *function = alloc_node<FunctionNode>();
+	function->source_lambda = lambda;
+	// The lambda's function inherits static-ness from the enclosing function, when there is one.
+	function->is_static = current_function != nullptr ? current_function->is_static : false;
+	// An identifier found at this point names the lambda; otherwise function->identifier is left unset.
+	if (match(GDScriptTokenizer::Token::IDENTIFIER)) {
+		function->identifier = parse_identifier();
+	}
+
+	bool multiline_context = multiline_stack.back()->get(); // Multiline state on top of the stack when the lambda starts.
+
+	// Reset the multiline stack since we don't want the enclosing multiline mode in the lambda body.
+	push_multiline(false);
+	if (multiline_context) {
+		tokenizer.push_expression_indented_block(); // Undone by pop_expression_indented_block() near the end of this function.
+	}
+	// NOTE(review): two pushes but only one pop_multiline() in this function; presumably parse_function_signature() pops this one - confirm.
+	push_multiline(true); // For the parameters.
+	if (function->identifier) {
+		consume(GDScriptTokenizer::Token::PARENTHESIS_OPEN, R"(Expected opening "(" after lambda name.)");
+	} else {
+		consume(GDScriptTokenizer::Token::PARENTHESIS_OPEN, R"(Expected opening "(" after "func".)");
+	}
+	// The lambda's function is the current function while its signature and body are parsed.
+	FunctionNode *previous_function = current_function;
+	current_function = function;
+	// The signature is parsed with the lambda's body suite set as current_suite; it is reset right after.
+	SuiteNode *body = alloc_node<SuiteNode>();
+	SuiteNode *previous_suite = current_suite;
+	current_suite = body;
+
+	parse_function_signature(function, body, "lambda");
+
+	current_suite = previous_suite;
+
+	bool previous_in_lambda = in_lambda;
+	in_lambda = true;
+
+	function->body = parse_suite("lambda declaration", body, true); // The last argument is p_for_lambda.
+
+	pop_multiline();
+
+	if (multiline_context) {
+		// If we're in multiline mode, we want to skip the spurious DEDENT and NEWLINE tokens.
+		while (check(GDScriptTokenizer::Token::DEDENT) || check(GDScriptTokenizer::Token::INDENT) || check(GDScriptTokenizer::Token::NEWLINE)) {
+			current = tokenizer.scan(); // Not advance() since we don't want to change the previous token.
+		}
+		tokenizer.pop_expression_indented_block();
+	}
+	// Restore the outer parser state and attach the parsed function to the lambda node.
+	current_function = previous_function;
+	in_lambda = previous_in_lambda;
+	lambda->function = function;
+	return lambda;
+}
+
 GDScriptParser::ExpressionNode *GDScriptParser::parse_invalid_token(ExpressionNode *p_previous_operand, bool p_can_assign) {
 	// Just for better error messages.
 	GDScriptTokenizer::Token::Type invalid = previous.type;
@@ -3018,7 +3135,7 @@ GDScriptParser::ParseRule *GDScriptParser::get_rule(GDScriptTokenizer::Token::Ty
 		{ nullptr,                                          nullptr,                                        PREC_NONE }, // CONST,
 		{ nullptr,                                          nullptr,                                        PREC_NONE }, // ENUM,
 		{ nullptr,                                          nullptr,                                        PREC_NONE }, // EXTENDS,
-		{ nullptr,                                          nullptr,                                        PREC_NONE }, // FUNC,
+		{ &GDScriptParser::parse_lambda,                    nullptr,                                        PREC_NONE }, // FUNC,
 		{ nullptr,                                          &GDScriptParser::parse_binary_operator,      	PREC_CONTENT_TEST }, // IN,
 		{ nullptr,                                          &GDScriptParser::parse_binary_operator,      	PREC_TYPE_TEST }, // IS,
 		{ nullptr,                                          nullptr,                                        PREC_NONE }, // NAMESPACE,
@@ -3754,6 +3871,10 @@ void GDScriptParser::TreePrinter::print_dictionary(DictionaryNode *p_dictionary)
 }
 
 void GDScriptParser::TreePrinter::print_expression(ExpressionNode *p_expression) {
+	if (p_expression == nullptr) {
+		push_text("<invalid expression>");
+		return;
+	}
 	switch (p_expression->type) {
 		case Node::ARRAY:
 			print_array(static_cast<ArrayNode *>(p_expression));
@@ -3782,6 +3903,9 @@ void GDScriptParser::TreePrinter::print_expression(ExpressionNode *p_expression)
 		case Node::IDENTIFIER:
 			print_identifier(static_cast<IdentifierNode *>(p_expression));
 			break;
+		case Node::LAMBDA:
+			print_lambda(static_cast<LambdaNode *>(p_expression));
+			break;
 		case Node::LITERAL:
 			print_literal(static_cast<LiteralNode *>(p_expression));
 			break;
@@ -3841,12 +3965,17 @@ void GDScriptParser::TreePrinter::print_for(ForNode *p_for) {
 	decrease_indent();
 }
 
-void GDScriptParser::TreePrinter::print_function(FunctionNode *p_function) {
+void GDScriptParser::TreePrinter::print_function(FunctionNode *p_function, const String &p_context) {
 	for (const List<AnnotationNode *>::Element *E = p_function->annotations.front(); E != nullptr; E = E->next()) {
 		print_annotation(E->get());
 	}
-	push_text("Function ");
-	print_identifier(p_function->identifier);
+	push_text(p_context);
+	push_text(" ");
+	if (p_function->identifier) {
+		print_identifier(p_function->identifier);
+	} else {
+		push_text("<anonymous>");
+	}
 	push_text("( ");
 	for (int i = 0; i < p_function->parameters.size(); i++) {
 		if (i > 0) {
@@ -3900,6 +4029,18 @@ void GDScriptParser::TreePrinter::print_if(IfNode *p_if, bool p_is_elif) {
 	}
 }
 
+void GDScriptParser::TreePrinter::print_lambda(LambdaNode *p_lambda) { // Prints the lambda's function, then the names of its captures.
+	print_function(p_lambda->function, "Lambda"); // "Lambda" is passed as the p_context label.
+	push_text("| captures [ ");
+	for (int i = 0; i < p_lambda->captures.size(); i++) {
+		if (i > 0) { // Separator before every capture name except the first.
+			push_text(" , ");
+		}
+		push_text(p_lambda->captures[i]->name.operator String());
+	}
+	push_line(" ]"); // Closes the capture list and ends the line.
+}
+
 void GDScriptParser::TreePrinter::print_literal(LiteralNode *p_literal) {
 	// Prefix for string types.
 	switch (p_literal->value.get_type()) {
diff --git a/modules/gdscript/gdscript_parser.h b/modules/gdscript/gdscript_parser.h
index 272d21ffce..b1b29a7bd1 100644
--- a/modules/gdscript/gdscript_parser.h
+++ b/modules/gdscript/gdscript_parser.h
@@ -76,6 +76,7 @@ public:
 	struct GetNodeNode;
 	struct IdentifierNode;
 	struct IfNode;
+	struct LambdaNode;
 	struct LiteralNode;
 	struct MatchNode;
 	struct MatchBranchNode;
@@ -267,6 +268,7 @@ public:
 			GET_NODE,
 			IDENTIFIER,
 			IF,
+			LAMBDA,
 			LITERAL,
 			MATCH,
 			MATCH_BRANCH,
@@ -728,6 +730,7 @@ public:
 		bool is_coroutine = false;
 		MultiplayerAPI::RPCMode rpc_mode = MultiplayerAPI::RPC_MODE_DISABLED;
 		MethodInfo info;
+		LambdaNode *source_lambda = nullptr;
 #ifdef TOOLS_ENABLED
 		Vector<Variant> default_arg_values;
 		String doc_description;
@@ -771,6 +774,7 @@ public:
 			VariableNode *variable_source;
 			IdentifierNode *bind_source;
 		};
+		FunctionNode *source_function = nullptr;
 
 		int usages = 0; // Useful for binds/iterator variable.
 
@@ -789,6 +793,21 @@ public:
 		}
 	};
 
+	struct LambdaNode : public ExpressionNode { // Expression node for a lambda; its node type is LAMBDA.
+		FunctionNode *function = nullptr; // The function parsed for this lambda (assigned by parse_lambda()).
+		FunctionNode *parent_function = nullptr; // current_function at the time the lambda was parsed; parse_lambda() null-checks it.
+		Vector<IdentifierNode *> captures; // Captured identifiers. NOTE(review): not filled by the parser code visible here - presumably a later pass; confirm.
+		Map<StringName, int> captures_indices; // NOTE(review): presumably maps a captured name to its index in `captures` - confirm.
+
+		bool has_name() const { // True when a function is attached and it has an identifier.
+			return function && function->identifier;
+		}
+
+		LambdaNode() {
+			type = LAMBDA;
+		}
+	};
+
 	struct LiteralNode : public ExpressionNode {
 		Variant value;
 
@@ -942,6 +961,7 @@ public:
 				IdentifierNode *bind;
 			};
 			StringName name;
+			FunctionNode *source_function = nullptr;
 
 			int start_line = 0, end_line = 0;
 			int start_column = 0, end_column = 0;
@@ -951,10 +971,11 @@ public:
 			String get_name() const;
 
 			Local() {}
-			Local(ConstantNode *p_constant) {
+			Local(ConstantNode *p_constant, FunctionNode *p_source_function) {
 				type = CONSTANT;
 				constant = p_constant;
 				name = p_constant->identifier->name;
+				source_function = p_source_function;
 
 				start_line = p_constant->start_line;
 				end_line = p_constant->end_line;
@@ -963,10 +984,11 @@ public:
 				leftmost_column = p_constant->leftmost_column;
 				rightmost_column = p_constant->rightmost_column;
 			}
-			Local(VariableNode *p_variable) {
+			Local(VariableNode *p_variable, FunctionNode *p_source_function) {
 				type = VARIABLE;
 				variable = p_variable;
 				name = p_variable->identifier->name;
+				source_function = p_source_function;
 
 				start_line = p_variable->start_line;
 				end_line = p_variable->end_line;
@@ -975,10 +997,11 @@ public:
 				leftmost_column = p_variable->leftmost_column;
 				rightmost_column = p_variable->rightmost_column;
 			}
-			Local(ParameterNode *p_parameter) {
+			Local(ParameterNode *p_parameter, FunctionNode *p_source_function) {
 				type = PARAMETER;
 				parameter = p_parameter;
 				name = p_parameter->identifier->name;
+				source_function = p_source_function;
 
 				start_line = p_parameter->start_line;
 				end_line = p_parameter->end_line;
@@ -987,10 +1010,11 @@ public:
 				leftmost_column = p_parameter->leftmost_column;
 				rightmost_column = p_parameter->rightmost_column;
 			}
-			Local(IdentifierNode *p_identifier) {
+			Local(IdentifierNode *p_identifier, FunctionNode *p_source_function) {
 				type = FOR_VARIABLE;
 				bind = p_identifier;
 				name = p_identifier->name;
+				source_function = p_source_function;
 
 				start_line = p_identifier->start_line;
 				end_line = p_identifier->end_line;
@@ -1015,9 +1039,9 @@ public:
 		bool has_local(const StringName &p_name) const;
 		const Local &get_local(const StringName &p_name) const;
 		template <class T>
-		void add_local(T *p_local) {
+		void add_local(T *p_local, FunctionNode *p_source_function) {
 			locals_indices[p_local->identifier->name] = locals.size();
-			locals.push_back(Local(p_local));
+			locals.push_back(Local(p_local, p_source_function));
 		}
 		void add_local(const Local &p_local) {
 			locals_indices[p_local.name] = locals.size();
@@ -1191,6 +1215,8 @@ private:
 	CompletionCall completion_call;
 	List<CompletionCall> completion_call_stack;
 	bool passed_cursor = false;
+	bool in_lambda = false;
+	bool lambda_ended = false; // Marker for when a lambda ends, to apply an end of statement if needed.
 
 	typedef bool (GDScriptParser::*AnnotationAction)(const AnnotationNode *p_annotation, Node *p_target);
 	struct AnnotationInfo {
@@ -1278,10 +1304,11 @@ private:
 
 	GDScriptTokenizer::Token advance();
 	bool match(GDScriptTokenizer::Token::Type p_token_type);
-	bool check(GDScriptTokenizer::Token::Type p_token_type);
+	bool check(GDScriptTokenizer::Token::Type p_token_type) const;
 	bool consume(GDScriptTokenizer::Token::Type p_token_type, const String &p_error_message);
-	bool is_at_end();
-	bool is_statement_end();
+	bool is_at_end() const;
+	bool is_statement_end_token() const;
+	bool is_statement_end() const;
 	void end_statement(const String &p_context);
 	void synchronize();
 	void push_multiline(bool p_state);
@@ -1299,7 +1326,8 @@ private:
 	EnumNode *parse_enum();
 	ParameterNode *parse_parameter();
 	FunctionNode *parse_function();
-	SuiteNode *parse_suite(const String &p_context, SuiteNode *p_suite = nullptr);
+	void parse_function_signature(FunctionNode *p_function, SuiteNode *p_body, const String &p_type);
+	SuiteNode *parse_suite(const String &p_context, SuiteNode *p_suite = nullptr, bool p_for_lambda = false);
 	// Annotations
 	AnnotationNode *parse_annotation(uint32_t p_valid_targets);
 	bool register_annotation(const MethodInfo &p_info, uint32_t p_target_kinds, AnnotationAction p_apply, int p_optional_arguments = 0, bool p_is_vararg = false);
@@ -1354,6 +1382,7 @@ private:
 	ExpressionNode *parse_await(ExpressionNode *p_previous_operand, bool p_can_assign);
 	ExpressionNode *parse_attribute(ExpressionNode *p_previous_operand, bool p_can_assign);
 	ExpressionNode *parse_subscript(ExpressionNode *p_previous_operand, bool p_can_assign);
+	ExpressionNode *parse_lambda(ExpressionNode *p_previous_operand, bool p_can_assign);
 	ExpressionNode *parse_invalid_token(ExpressionNode *p_previous_operand, bool p_can_assign);
 	TypeNode *parse_type(bool p_allow_void = false);
 #ifdef TOOLS_ENABLED
@@ -1415,10 +1444,11 @@ public:
 		void print_expression(ExpressionNode *p_expression);
 		void print_enum(EnumNode *p_enum);
 		void print_for(ForNode *p_for);
-		void print_function(FunctionNode *p_function);
+		void print_function(FunctionNode *p_function, const String &p_context = "Function");
 		void print_get_node(GetNodeNode *p_get_node);
 		void print_if(IfNode *p_if, bool p_is_elif = false);
 		void print_identifier(IdentifierNode *p_identifier);
+		void print_lambda(LambdaNode *p_lambda);
 		void print_literal(LiteralNode *p_literal);
 		void print_match(MatchNode *p_match);
 		void print_match_branch(MatchBranchNode *p_match_branch);
diff --git a/modules/gdscript/gdscript_tokenizer.cpp b/modules/gdscript/gdscript_tokenizer.cpp
index e432dfc891..2e6388d92f 100644
--- a/modules/gdscript/gdscript_tokenizer.cpp
+++ b/modules/gdscript/gdscript_tokenizer.cpp
@@ -242,6 +242,16 @@ void GDScriptTokenizer::set_multiline_mode(bool p_state) {
 	multiline_mode = p_state;
 }
 
+void GDScriptTokenizer::push_expression_indented_block() { // Saves the current indentation stack so it can be restored later.
+	indent_stack_stack.push_back(indent_stack); // Stores a copy; indent_stack itself is left unchanged.
+}
+
+void GDScriptTokenizer::pop_expression_indented_block() { // Restores the indentation stack saved by the most recent push.
+	ERR_FAIL_COND(indent_stack_stack.size() == 0); // Nothing to restore without a prior push.
+	indent_stack = indent_stack_stack.back()->get(); // Overwrites the current indentation stack with the saved one.
+	indent_stack_stack.pop_back();
+}
+
 int GDScriptTokenizer::get_cursor_line() const {
 	return cursor_line;
 }
diff --git a/modules/gdscript/gdscript_tokenizer.h b/modules/gdscript/gdscript_tokenizer.h
index bea4b14019..84b82c07f0 100644
--- a/modules/gdscript/gdscript_tokenizer.h
+++ b/modules/gdscript/gdscript_tokenizer.h
@@ -217,6 +217,7 @@ private:
 	Token last_newline;
 	int pending_indents = 0;
 	List<int> indent_stack;
+	List<List<int>> indent_stack_stack; // For lambdas, which require manipulating the indentation point.
 	List<char32_t> paren_stack;
 	char32_t indent_char = '\0';
 	int position = 0;
@@ -263,6 +264,8 @@ public:
 	void set_multiline_mode(bool p_state);
 	bool is_past_cursor() const;
 	static String get_token_name(Token::Type p_token_type);
+	void push_expression_indented_block(); // For lambdas, or blocks inside expressions.
+	void pop_expression_indented_block(); // For lambdas, or blocks inside expressions.
 
 	GDScriptTokenizer();
 };
diff --git a/modules/gdscript/gdscript_vm.cpp b/modules/gdscript/gdscript_vm.cpp
index 6b7da4a467..4757ec6ca9 100644
--- a/modules/gdscript/gdscript_vm.cpp
+++ b/modules/gdscript/gdscript_vm.cpp
@@ -33,23 +33,24 @@
 #include "core/core_string_names.h"
 #include "core/os/os.h"
 #include "gdscript.h"
+#include "gdscript_lambda_callable.h"
 
-Variant *GDScriptFunction::_get_variant(int p_address, GDScriptInstance *p_instance, GDScript *p_script, Variant &self, Variant &static_ref, Variant *p_stack, String &r_error) const {
+Variant *GDScriptFunction::_get_variant(int p_address, GDScriptInstance *p_instance, Variant *p_stack, String &r_error) const {
 	int address = p_address & ADDR_MASK;
 
 	//sequential table (jump table generated by compiler)
 	switch ((p_address & ADDR_TYPE_MASK) >> ADDR_BITS) {
-		case ADDR_TYPE_SELF: {
+		case ADDR_TYPE_STACK: {
 #ifdef DEBUG_ENABLED
-			if (unlikely(!p_instance)) {
-				r_error = "Cannot access self without instance.";
-				return nullptr;
-			}
+			ERR_FAIL_INDEX_V(address, _stack_size, nullptr);
 #endif
-			return &self;
+			return &p_stack[address];
 		} break;
-		case ADDR_TYPE_CLASS: {
-			return &static_ref;
+		case ADDR_TYPE_CONSTANT: {
+#ifdef DEBUG_ENABLED
+			ERR_FAIL_INDEX_V(address, _constant_count, nullptr);
+#endif
+			return &_constants_ptr[address];
 		} break;
 		case ADDR_TYPE_MEMBER: {
 #ifdef DEBUG_ENABLED
@@ -61,65 +62,6 @@ Variant *GDScriptFunction::_get_variant(int p_address, GDScriptInstance *p_insta
 			//member indexing is O(1)
 			return &p_instance->members.write[address];
 		} break;
-		case ADDR_TYPE_CLASS_CONSTANT: {
-			//todo change to index!
-			GDScript *s = p_script;
-#ifdef DEBUG_ENABLED
-			ERR_FAIL_INDEX_V(address, _global_names_count, nullptr);
-#endif
-			const StringName *sn = &_global_names_ptr[address];
-
-			while (s) {
-				GDScript *o = s;
-				while (o) {
-					Map<StringName, Variant>::Element *E = o->constants.find(*sn);
-					if (E) {
-						return &E->get();
-					}
-					o = o->_owner;
-				}
-				s = s->_base;
-			}
-
-			ERR_FAIL_V_MSG(nullptr, "GDScriptCompiler bug.");
-		} break;
-		case ADDR_TYPE_LOCAL_CONSTANT: {
-#ifdef DEBUG_ENABLED
-			ERR_FAIL_INDEX_V(address, _constant_count, nullptr);
-#endif
-			return &_constants_ptr[address];
-		} break;
-		case ADDR_TYPE_STACK:
-		case ADDR_TYPE_STACK_VARIABLE: {
-#ifdef DEBUG_ENABLED
-			ERR_FAIL_INDEX_V(address, _stack_size, nullptr);
-#endif
-			return &p_stack[address];
-		} break;
-		case ADDR_TYPE_GLOBAL: {
-#ifdef DEBUG_ENABLED
-			ERR_FAIL_INDEX_V(address, GDScriptLanguage::get_singleton()->get_global_array_size(), nullptr);
-#endif
-			return &GDScriptLanguage::get_singleton()->get_global_array()[address];
-		} break;
-#ifdef TOOLS_ENABLED
-		case ADDR_TYPE_NAMED_GLOBAL: {
-#ifdef DEBUG_ENABLED
-			ERR_FAIL_INDEX_V(address, _global_names_count, nullptr);
-#endif
-			StringName id = _global_names_ptr[address];
-
-			if (GDScriptLanguage::get_singleton()->get_named_globals_map().has(id)) {
-				return (Variant *)&GDScriptLanguage::get_singleton()->get_named_globals_map()[id];
-			} else {
-				r_error = "Autoload singleton '" + String(id) + "' has been removed.";
-				return nullptr;
-			}
-		} break;
-#endif
-		case ADDR_TYPE_NIL: {
-			return &nil;
-		} break;
 	}
 
 	ERR_FAIL_V_MSG(nullptr, "Bad code! (unknown addressing mode).");
@@ -291,6 +233,7 @@ String GDScriptFunction::_get_call_error(const Callable::CallError &p_err, const
 		&&OPCODE_CALL_PTRCALL_PACKED_COLOR_ARRAY,    \
 		&&OPCODE_AWAIT,                              \
 		&&OPCODE_AWAIT_RESUME,                       \
+		&&OPCODE_CREATE_LAMBDA,                      \
 		&&OPCODE_JUMP,                               \
 		&&OPCODE_JUMP_IF,                            \
 		&&OPCODE_JUMP_IF_NOT,                        \
@@ -340,6 +283,41 @@ String GDScriptFunction::_get_call_error(const Callable::CallError &p_err, const
 		&&OPCODE_ITERATE_PACKED_VECTOR3_ARRAY,       \
 		&&OPCODE_ITERATE_PACKED_COLOR_ARRAY,         \
 		&&OPCODE_ITERATE_OBJECT,                     \
+		&&OPCODE_STORE_NAMED_GLOBAL,                 \
+		&&OPCODE_TYPE_ADJUST_BOOL,                   \
+		&&OPCODE_TYPE_ADJUST_INT,                    \
+		&&OPCODE_TYPE_ADJUST_FLOAT,                  \
+		&&OPCODE_TYPE_ADJUST_STRING,                 \
+		&&OPCODE_TYPE_ADJUST_VECTOR2,                \
+		&&OPCODE_TYPE_ADJUST_VECTOR2I,               \
+		&&OPCODE_TYPE_ADJUST_RECT2,                  \
+		&&OPCODE_TYPE_ADJUST_RECT2I,                 \
+		&&OPCODE_TYPE_ADJUST_VECTOR3,                \
+		&&OPCODE_TYPE_ADJUST_VECTOR3I,               \
+		&&OPCODE_TYPE_ADJUST_TRANSFORM2D,            \
+		&&OPCODE_TYPE_ADJUST_PLANE,                  \
+		&&OPCODE_TYPE_ADJUST_QUAT,                   \
+		&&OPCODE_TYPE_ADJUST_AABB,                   \
+		&&OPCODE_TYPE_ADJUST_BASIS,                  \
+		&&OPCODE_TYPE_ADJUST_TRANSFORM,              \
+		&&OPCODE_TYPE_ADJUST_COLOR,                  \
+		&&OPCODE_TYPE_ADJUST_STRING_NAME,            \
+		&&OPCODE_TYPE_ADJUST_NODE_PATH,              \
+		&&OPCODE_TYPE_ADJUST_RID,                    \
+		&&OPCODE_TYPE_ADJUST_OBJECT,                 \
+		&&OPCODE_TYPE_ADJUST_CALLABLE,               \
+		&&OPCODE_TYPE_ADJUST_SIGNAL,                 \
+		&&OPCODE_TYPE_ADJUST_DICTIONARY,             \
+		&&OPCODE_TYPE_ADJUST_ARRAY,                  \
+		&&OPCODE_TYPE_ADJUST_PACKED_BYTE_ARRAY,      \
+		&&OPCODE_TYPE_ADJUST_PACKED_INT32_ARRAY,     \
+		&&OPCODE_TYPE_ADJUST_PACKED_INT64_ARRAY,     \
+		&&OPCODE_TYPE_ADJUST_PACKED_FLOAT32_ARRAY,   \
+		&&OPCODE_TYPE_ADJUST_PACKED_FLOAT64_ARRAY,   \
+		&&OPCODE_TYPE_ADJUST_PACKED_STRING_ARRAY,    \
+		&&OPCODE_TYPE_ADJUST_PACKED_VECTOR2_ARRAY,   \
+		&&OPCODE_TYPE_ADJUST_PACKED_VECTOR3_ARRAY,   \
+		&&OPCODE_TYPE_ADJUST_PACKED_COLOR_ARRAY,     \
 		&&OPCODE_ASSERT,                             \
 		&&OPCODE_BREAKPOINT,                         \
 		&&OPCODE_LINE,                               \
@@ -415,11 +393,9 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 
 	r_err.error = Callable::CallError::CALL_OK;
 
-	Variant self;
-	Variant static_ref;
 	Variant retvalue;
 	Variant *stack = nullptr;
-	Variant **instruction_args;
+	Variant **instruction_args = nullptr;
 	const void **call_args_ptr = nullptr;
 	int defarg = 0;
 
@@ -444,7 +420,6 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 		script = p_state->script;
 		p_instance = p_state->instance;
 		defarg = p_state->defarg;
-		self = p_state->self;
 
 	} else {
 		if (p_argcount != _argument_count) {
@@ -462,55 +437,49 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 			}
 		}
 
-		alloca_size = sizeof(Variant *) * _instruction_args_size + sizeof(Variant) * _stack_size;
+		// Add 3 here for self, class, and nil.
+		alloca_size = sizeof(Variant *) * 3 + sizeof(Variant *) * _instruction_args_size + sizeof(Variant) * _stack_size;
 
-		if (alloca_size) {
-			uint8_t *aptr = (uint8_t *)alloca(alloca_size);
+		uint8_t *aptr = (uint8_t *)alloca(alloca_size);
+		stack = (Variant *)aptr;
 
-			if (_stack_size) {
-				stack = (Variant *)aptr;
-				for (int i = 0; i < p_argcount; i++) {
-					if (!argument_types[i].has_type) {
-						memnew_placement(&stack[i], Variant(*p_args[i]));
-						continue;
-					}
-
-					if (!argument_types[i].is_type(*p_args[i], true)) {
-						r_err.error = Callable::CallError::CALL_ERROR_INVALID_ARGUMENT;
-						r_err.argument = i;
-						r_err.expected = argument_types[i].kind == GDScriptDataType::BUILTIN ? argument_types[i].builtin_type : Variant::OBJECT;
-						return Variant();
-					}
-					if (argument_types[i].kind == GDScriptDataType::BUILTIN) {
-						Variant arg;
-						Variant::construct(argument_types[i].builtin_type, arg, &p_args[i], 1, r_err);
-						memnew_placement(&stack[i], Variant(arg));
-					} else {
-						memnew_placement(&stack[i], Variant(*p_args[i]));
-					}
-				}
-				for (int i = p_argcount; i < _stack_size; i++) {
-					memnew_placement(&stack[i], Variant);
-				}
-			} else {
-				stack = nullptr;
+		for (int i = 0; i < p_argcount; i++) {
+			if (!argument_types[i].has_type) {
+				memnew_placement(&stack[i + 3], Variant(*p_args[i]));
+				continue;
 			}
 
-			if (_instruction_args_size) {
-				instruction_args = (Variant **)&aptr[sizeof(Variant) * _stack_size];
+			if (!argument_types[i].is_type(*p_args[i], true)) {
+				r_err.error = Callable::CallError::CALL_ERROR_INVALID_ARGUMENT;
+				r_err.argument = i;
+				r_err.expected = argument_types[i].kind == GDScriptDataType::BUILTIN ? argument_types[i].builtin_type : Variant::OBJECT;
+				return Variant();
+			}
+			if (argument_types[i].kind == GDScriptDataType::BUILTIN) {
+				Variant arg;
+				Variant::construct(argument_types[i].builtin_type, arg, &p_args[i], 1, r_err);
+				memnew_placement(&stack[i + 3], Variant(arg));
 			} else {
-				instruction_args = nullptr;
+				memnew_placement(&stack[i + 3], Variant(*p_args[i]));
 			}
+		}
+		for (int i = p_argcount + 3; i < _stack_size; i++) {
+			memnew_placement(&stack[i], Variant);
+		}
+
+		memnew_placement(&stack[ADDR_STACK_NIL], Variant);
 
+		if (_instruction_args_size) {
+			instruction_args = (Variant **)&aptr[sizeof(Variant) * _stack_size];
 		} else {
-			stack = nullptr;
 			instruction_args = nullptr;
 		}
 
 		if (p_instance) {
-			self = p_instance->owner;
+			memnew_placement(&stack[ADDR_STACK_SELF], Variant(p_instance->owner));
 			script = p_instance->script.ptr();
 		} else {
+			memnew_placement(&stack[ADDR_STACK_SELF], Variant);
 			script = _script;
 		}
 	}
@@ -520,7 +489,7 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 		call_args_ptr = nullptr;
 	}
 
-	static_ref = script;
+	memnew_placement(&stack[ADDR_STACK_CLASS], Variant(script));
 
 	String err_text;
 
@@ -541,10 +510,10 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 #define CHECK_SPACE(m_space) \
 	GD_ERR_BREAK((ip + m_space) > _code_size)
 
-#define GET_VARIANT_PTR(m_v, m_code_ofs)                                                                   \
-	Variant *m_v;                                                                                          \
-	m_v = _get_variant(_code_ptr[ip + m_code_ofs], p_instance, script, self, static_ref, stack, err_text); \
-	if (unlikely(!m_v))                                                                                    \
+#define GET_VARIANT_PTR(m_v, m_code_ofs)                                         \
+	Variant *m_v;                                                                \
+	m_v = _get_variant(_code_ptr[ip + m_code_ofs], p_instance, stack, err_text); \
+	if (unlikely(!m_v))                                                          \
 		OPCODE_BREAK;
 
 #else
@@ -552,7 +521,7 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 #define CHECK_SPACE(m_space)
 #define GET_VARIANT_PTR(m_v, m_code_ofs) \
 	Variant *m_v;                        \
-	m_v = _get_variant(_code_ptr[ip + m_code_ofs], p_instance, script, self, static_ref, stack, err_text);
+	m_v = _get_variant(_code_ptr[ip + m_code_ofs], p_instance, stack, err_text);
 
 #endif
 
@@ -1485,13 +1454,17 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 				if (err.error != Callable::CallError::CALL_OK) {
 					String methodstr = *methodname;
 					String basestr = _get_var_type(base);
+					bool is_callable = false;
 
 					if (methodstr == "call") {
-						if (argc >= 1) {
+						if (argc >= 1 && base->get_type() != Variant::CALLABLE) {
 							methodstr = String(*argptrs[0]) + " (via call)";
 							if (err.error == Callable::CallError::CALL_ERROR_INVALID_ARGUMENT) {
 								err.argument += 1;
 							}
+						} else {
+							methodstr = base->operator String() + " (Callable)";
+							is_callable = true;
 						}
 					} else if (methodstr == "free") {
 						if (err.error == Callable::CallError::CALL_ERROR_INVALID_METHOD) {
@@ -1511,7 +1484,7 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 							}
 						}
 					}
-					err_text = _get_call_error(err, "function '" + methodstr + "' in base '" + basestr + "'", (const Variant **)argptrs);
+					err_text = _get_call_error(err, "function '" + methodstr + (is_callable ? "" : "' in base '" + basestr) + "'", (const Variant **)argptrs);
 					OPCODE_BREAK;
 				}
 #endif
@@ -2038,7 +2011,6 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 						memnew_placement(&gdfs->state.stack.write[sizeof(Variant) * i], Variant(stack[i]));
 					}
 					gdfs->state.stack_size = _stack_size;
-					gdfs->state.self = self;
 					gdfs->state.alloca_size = alloca_size;
 					gdfs->state.ip = ip + 2;
 					gdfs->state.line = line;
@@ -2091,6 +2063,34 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 			}
 			DISPATCH_OPCODE;
 
+			OPCODE(OPCODE_CREATE_LAMBDA) {
+				// Operands: instr_arg_count instruction args (the captures, then the result), the capture count, the lambda index.
+				CHECK_SPACE(3 + instr_arg_count); // Full instruction size: opcode word + args + the two trailing operands read below.
+				ip += instr_arg_count;
+
+				int captures_count = _code_ptr[ip + 1];
+				GD_ERR_BREAK(captures_count < 0);
+
+				int lambda_index = _code_ptr[ip + 2];
+				GD_ERR_BREAK(lambda_index < 0 || lambda_index >= _lambdas_count);
+				GDScriptFunction *lambda = _lambdas_ptr[lambda_index];
+
+				Vector<Variant> captures;
+				captures.resize(captures_count);
+				for (int i = 0; i < captures_count; i++) {
+					GET_INSTRUCTION_ARG(arg, i);
+					captures.write[i] = *arg; // Take the current value of each captured argument.
+				}
+
+				GDScriptLambdaCallable *callable = memnew(GDScriptLambdaCallable(Ref<GDScript>(script), lambda, captures));
+
+				GET_INSTRUCTION_ARG(result, captures_count); // The result slot follows the captures in the instruction args.
+				*result = Callable(callable);
+
+				ip += 3;
+			}
+			DISPATCH_OPCODE;
+
 			OPCODE(OPCODE_JUMP) {
 				CHECK_SPACE(2);
 				int to = _code_ptr[ip + 1];
@@ -3028,6 +3028,63 @@ Variant GDScriptFunction::call(GDScriptInstance *p_instance, const Variant **p_a
 			}
 			DISPATCH_OPCODE;
 
+			OPCODE(OPCODE_STORE_NAMED_GLOBAL) {
+				CHECK_SPACE(3);
+				int globalname_idx = _code_ptr[ip + 2];
+				GD_ERR_BREAK(globalname_idx < 0 || globalname_idx >= _global_names_count);
+				const StringName *globalname = &_global_names_ptr[globalname_idx];
+
+				GET_INSTRUCTION_ARG(dst, 0);
+				*dst = GDScriptLanguage::get_singleton()->get_named_globals_map()[*globalname];
+
+				ip += 3;
+			}
+			DISPATCH_OPCODE;
+
+#define OPCODE_TYPE_ADJUST(m_v_type, m_c_type)    \
+	OPCODE(OPCODE_TYPE_ADJUST_##m_v_type) {       \
+		CHECK_SPACE(2);                           \
+		GET_INSTRUCTION_ARG(arg, 0);              \
+		VariantTypeAdjust<m_c_type>::adjust(arg); \
+		ip += 2;                                  \
+	}                                             \
+	DISPATCH_OPCODE
+
+			OPCODE_TYPE_ADJUST(BOOL, bool);
+			OPCODE_TYPE_ADJUST(INT, int64_t);
+			OPCODE_TYPE_ADJUST(FLOAT, double);
+			OPCODE_TYPE_ADJUST(STRING, String);
+			OPCODE_TYPE_ADJUST(VECTOR2, Vector2);
+			OPCODE_TYPE_ADJUST(VECTOR2I, Vector2i);
+			OPCODE_TYPE_ADJUST(RECT2, Rect2);
+			OPCODE_TYPE_ADJUST(RECT2I, Rect2i);
+			OPCODE_TYPE_ADJUST(VECTOR3, Vector3);
+			OPCODE_TYPE_ADJUST(VECTOR3I, Vector3i);
+			OPCODE_TYPE_ADJUST(TRANSFORM2D, Transform2D);
+			OPCODE_TYPE_ADJUST(PLANE, Plane);
+			OPCODE_TYPE_ADJUST(QUAT, Quat);
+			OPCODE_TYPE_ADJUST(AABB, AABB);
+			OPCODE_TYPE_ADJUST(BASIS, Basis);
+			OPCODE_TYPE_ADJUST(TRANSFORM, Transform);
+			OPCODE_TYPE_ADJUST(COLOR, Color);
+			OPCODE_TYPE_ADJUST(STRING_NAME, StringName);
+			OPCODE_TYPE_ADJUST(NODE_PATH, NodePath);
+			OPCODE_TYPE_ADJUST(RID, RID);
+			OPCODE_TYPE_ADJUST(OBJECT, Object *);
+			OPCODE_TYPE_ADJUST(CALLABLE, Callable);
+			OPCODE_TYPE_ADJUST(SIGNAL, Signal);
+			OPCODE_TYPE_ADJUST(DICTIONARY, Dictionary);
+			OPCODE_TYPE_ADJUST(ARRAY, Array);
+			OPCODE_TYPE_ADJUST(PACKED_BYTE_ARRAY, PackedByteArray);
+			OPCODE_TYPE_ADJUST(PACKED_INT32_ARRAY, PackedInt32Array);
+			OPCODE_TYPE_ADJUST(PACKED_INT64_ARRAY, PackedInt64Array);
+			OPCODE_TYPE_ADJUST(PACKED_FLOAT32_ARRAY, PackedFloat32Array);
+			OPCODE_TYPE_ADJUST(PACKED_FLOAT64_ARRAY, PackedFloat64Array);
+			OPCODE_TYPE_ADJUST(PACKED_STRING_ARRAY, PackedStringArray);
+			OPCODE_TYPE_ADJUST(PACKED_VECTOR2_ARRAY, PackedVector2Array);
+			OPCODE_TYPE_ADJUST(PACKED_VECTOR3_ARRAY, PackedVector3Array);
+			OPCODE_TYPE_ADJUST(PACKED_COLOR_ARRAY, PackedColorArray);
+
 			OPCODE(OPCODE_ASSERT) {
 				CHECK_SPACE(3);
 
diff --git a/modules/gdscript/language_server/gdscript_language_protocol.cpp b/modules/gdscript/language_server/gdscript_language_protocol.cpp
index 912c9a174e..0432e7caea 100644
--- a/modules/gdscript/language_server/gdscript_language_protocol.cpp
+++ b/modules/gdscript/language_server/gdscript_language_protocol.cpp
@@ -32,7 +32,6 @@
 
 #include "core/config/project_settings.h"
 #include "core/io/json.h"
-#include "core/os/copymem.h"
 #include "editor/doc_tools.h"
 #include "editor/editor_log.h"
 #include "editor/editor_node.h"
diff --git a/modules/gdscript/language_server/gdscript_text_document.cpp b/modules/gdscript/language_server/gdscript_text_document.cpp
index 9f2373bf56..030633274c 100644
--- a/modules/gdscript/language_server/gdscript_text_document.cpp
+++ b/modules/gdscript/language_server/gdscript_text_document.cpp
@@ -400,6 +400,7 @@ GDScriptTextDocument::~GDScriptTextDocument() {
 void GDScriptTextDocument::sync_script_content(const String &p_path, const String &p_content) {
 	String path = GDScriptLanguageProtocol::get_singleton()->get_workspace()->get_file_path(p_path);
 	GDScriptLanguageProtocol::get_singleton()->get_workspace()->parse_script(path, p_content);
+	EditorFileSystem::get_singleton()->update_file(path);
 }
 
 void GDScriptTextDocument::show_native_symbol_in_editor(const String &p_symbol_id) {
diff --git a/modules/gdscript/register_types.cpp b/modules/gdscript/register_types.cpp
index 19fd3daf20..2d2f94f5e0 100644
--- a/modules/gdscript/register_types.cpp
+++ b/modules/gdscript/register_types.cpp
@@ -163,19 +163,19 @@ void unregister_gdscript_types() {
 
 #ifdef TESTS_ENABLED
 void test_tokenizer() {
-	TestGDScript::test(TestGDScript::TestType::TEST_TOKENIZER);
+	GDScriptTests::test(GDScriptTests::TestType::TEST_TOKENIZER);
 }
 
 void test_parser() {
-	TestGDScript::test(TestGDScript::TestType::TEST_PARSER);
+	GDScriptTests::test(GDScriptTests::TestType::TEST_PARSER);
 }
 
 void test_compiler() {
-	TestGDScript::test(TestGDScript::TestType::TEST_COMPILER);
+	GDScriptTests::test(GDScriptTests::TestType::TEST_COMPILER);
 }
 
 void test_bytecode() {
-	TestGDScript::test(TestGDScript::TestType::TEST_BYTECODE);
+	GDScriptTests::test(GDScriptTests::TestType::TEST_BYTECODE);
 }
 
 REGISTER_TEST_COMMAND("gdscript-tokenizer", &test_tokenizer);
diff --git a/modules/gdscript/tests/gdscript_test_runner.cpp b/modules/gdscript/tests/gdscript_test_runner.cpp
new file mode 100644
index 0000000000..76ae43e792
--- /dev/null
+++ b/modules/gdscript/tests/gdscript_test_runner.cpp
@@ -0,0 +1,584 @@
+/*************************************************************************/
+/*  gdscript_test_runner.cpp                                             */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "gdscript_test_runner.h"
+
+#include "../gdscript.h"
+#include "../gdscript_analyzer.h"
+#include "../gdscript_compiler.h"
+#include "../gdscript_parser.h"
+
+#include "core/config/project_settings.h"
+#include "core/core_string_names.h"
+#include "core/io/file_access_pack.h"
+#include "core/os/dir_access.h"
+#include "core/os/os.h"
+#include "core/string/string_builder.h"
+#include "scene/resources/packed_scene.h"
+
+#include "tests/test_macros.h"
+
+namespace GDScriptTests {
+
+void init_autoloads() {
+	// Exposes every singleton autoload of the project to all registered script languages as a global constant.
+	// Non-singleton autoloads are ignored in both passes.
+	Map<StringName, ProjectSettings::AutoloadInfo> autoloads = ProjectSettings::get_singleton()->get_autoload_list();
+
+	// First pass, add the constants so they exist before any script is loaded.
+	for (Map<StringName, ProjectSettings::AutoloadInfo>::Element *E = autoloads.front(); E; E = E->next()) {
+		const ProjectSettings::AutoloadInfo &autoload = E->get();
+		if (!autoload.is_singleton) {
+			continue;
+		}
+		for (int lang = 0; lang < ScriptServer::get_language_count(); lang++) {
+			ScriptServer::get_language(lang)->add_global_constant(autoload.name, Variant());
+		}
+	}
+
+	// Second pass, load into global constants.
+	for (Map<StringName, ProjectSettings::AutoloadInfo>::Element *E = autoloads.front(); E; E = E->next()) {
+		const ProjectSettings::AutoloadInfo &autoload = E->get();
+		if (!autoload.is_singleton) {
+			// Skip non-singletons since we don't have a scene tree here anyway.
+			continue;
+		}
+
+		RES res = ResourceLoader::load(autoload.path);
+		ERR_CONTINUE_MSG(res.is_null(), "Can't autoload: " + autoload.path);
+
+		Node *node = nullptr;
+		if (res->is_class("PackedScene")) {
+			Ref<PackedScene> scene = res;
+			node = scene->instance();
+		} else if (res->is_class("Script")) {
+			Ref<Script> script_res = res;
+			const StringName base_type = script_res->get_instance_base_type();
+			ERR_CONTINUE_MSG(!ClassDB::is_parent_class(base_type, "Node"), "Script does not inherit a Node: " + autoload.path);
+
+			Object *obj = ClassDB::instance(base_type);
+			ERR_CONTINUE_MSG(obj == nullptr,
+					"Cannot instance script for autoload, expected 'Node' inheritance, got: " +
+							String(base_type));
+
+			node = Object::cast_to<Node>(obj);
+			node->set_script(script_res);
+		}
+
+		ERR_CONTINUE_MSG(!node, "Path in autoload not a node or script: " + autoload.path);
+
+		node->set_name(autoload.name);
+		for (int lang = 0; lang < ScriptServer::get_language_count(); lang++) {
+			ScriptServer::get_language(lang)->add_global_constant(autoload.name, node);
+		}
+	}
+}
+
+void init_language(const String &p_base_path) {
+	// Setup project settings since it's needed by the languages to get the global scripts.
+	// This also sets up the base resource path.
+	Error err = ProjectSettings::get_singleton()->setup(p_base_path, String(), true);
+	if (err) {
+		print_line("Could not load project settings.");
+		// Keep going since some scripts still work without this.
+	}
+
+	// Initialize the language for the test routine.
+	GDScriptLanguage::get_singleton()->init();
+	init_autoloads();
+}
+
+void finish_language() {
+	GDScriptLanguage::get_singleton()->finish();
+	ScriptServer::global_classes_clear();
+}
+
+StringName GDScriptTestRunner::test_function_name;
+
+GDScriptTestRunner::GDScriptTestRunner(const String &p_source_dir, bool p_init_language) {
+	test_function_name = StaticCString::create("test");
+	do_init_languages = p_init_language;
+
+	source_dir = p_source_dir;
+	if (!source_dir.ends_with("/")) {
+		source_dir += "/";
+	}
+
+	if (do_init_languages) {
+		init_language(p_source_dir);
+
+		// Enable all warnings for GDScript, so we can test them.
+		ProjectSettings::get_singleton()->set_setting("debug/gdscript/warnings/enable", true);
+		for (int i = 0; i < (int)GDScriptWarning::WARNING_MAX; i++) {
+			String warning = GDScriptWarning::get_name_from_code((GDScriptWarning::Code)i).to_lower();
+			ProjectSettings::get_singleton()->set_setting("debug/gdscript/warnings/" + warning, true);
+		}
+	}
+
+	// Enable printing to show results
+	_print_line_enabled = true;
+	_print_error_enabled = true;
+}
+
+GDScriptTestRunner::~GDScriptTestRunner() {
+	test_function_name = StringName();
+	if (do_init_languages) {
+		finish_language();
+	}
+}
+
+int GDScriptTestRunner::run_tests() {
+	// Runs every collected test and returns the number of failures, or -1 if the tests could not be set up.
+	if (!make_tests()) {
+		FAIL("An error occurred while making the tests.");
+		return -1;
+	}
+	if (!generate_class_index()) {
+		FAIL("An error occurred while generating class index.");
+		return -1;
+	}
+
+	int failed = 0;
+	for (int i = 0; i < tests.size(); i++) {
+		GDScriptTest test = tests[i];
+		GDScriptTest::TestResult result = test.run_test();
+		String expected = FileAccess::get_file_as_string(test.get_output_file());
+		// doctest only reports INFO() captures that are still in scope when an assertion fails,
+		// so both captures must live in the loop body rather than in a nested block.
+		INFO(test.get_source_file());
+		INFO(expected);
+		if (!result.passed) {
+			failed++;
+		}
+		CHECK_MESSAGE(result.passed, (result.passed ? String() : result.output));
+	}
+
+	return failed;
+}
+
+bool GDScriptTestRunner::generate_outputs() {
+	// Regenerates the `.out` file of every collected test, printing one dot per test as progress.
+	// Stops at the first failure and returns false; returns true once every test generated its output.
+	is_generating = true;
+
+	if (!make_tests()) {
+		print_line("Failed to generate a test output.");
+		return false;
+	}
+	if (!generate_class_index()) {
+		return false;
+	}
+
+	const int test_count = tests.size();
+	for (int idx = 0; idx < test_count; idx++) {
+		OS::get_singleton()->print(".");
+		GDScriptTest current = tests[idx];
+		if (!current.generate_output()) {
+			print_line("\nCould not generate output for " + current.get_source_file());
+			return false;
+		}
+	}
+
+	print_line("\nGenerated output files for " + itos(test_count) + " tests successfully.");
+	return true;
+}
+
+bool GDScriptTestRunner::make_tests_for_dir(const String &p_dir) {
+	// Recursively collects every `*.gd` script under `p_dir` as a test, paired with its sibling `*.out` file.
+	// Returns false if a directory cannot be opened or, when not generating, an expected `.out` file is missing.
+	Error open_err = OK;
+	DirAccessRef da(DirAccess::open(p_dir, &open_err));
+	if (open_err != OK) {
+		return false;
+	}
+
+	const String base = da->get_current_dir();
+
+	da->list_dir_begin();
+	for (String entry = da->get_next(); !entry.is_empty(); entry = da->get_next()) {
+		if (da->current_is_dir()) {
+			// Skip the self and parent entries, recurse into everything else.
+			if (entry == "." || entry == "..") {
+				continue;
+			}
+			if (!make_tests_for_dir(base.plus_file(entry))) {
+				return false;
+			}
+			continue;
+		}
+
+		if (entry.get_extension().to_lower() != "gd") {
+			continue;
+		}
+
+		const String out_name = entry.get_basename() + ".out";
+		// The output file is only required to exist when not generating.
+		if (!is_generating && !da->file_exists(out_name)) {
+			ERR_FAIL_V_MSG(false, "Could not find output file for " + entry);
+		}
+		tests.push_back(GDScriptTest(base.plus_file(entry), base.plus_file(out_name), source_dir));
+	}
+
+	da->list_dir_end();
+
+	return true;
+}
+
+bool GDScriptTestRunner::make_tests() {
+	Error err = OK;
+	DirAccessRef dir(DirAccess::open(source_dir, &err));
+
+	ERR_FAIL_COND_V_MSG(err != OK, false, "Could not open specified test directory.");
+
+	return make_tests_for_dir(dir->get_current_dir());
+}
+
+bool GDScriptTestRunner::generate_class_index() {
+	// Registers the global class name reported for each test script; fails if a name is already registered.
+	GDScriptLanguage *gdscript = GDScriptLanguage::get_singleton();
+	const StringName language_name = gdscript->get_name();
+	for (int idx = 0; idx < tests.size(); idx++) {
+		const String &script_path = tests[idx].get_source_file();
+		String base_type;
+		const String class_name = gdscript->get_global_class_name(script_path, &base_type);
+		if (class_name.is_empty()) {
+			continue; // Nothing to register when no class name is reported.
+		}
+		ERR_FAIL_COND_V_MSG(ScriptServer::is_global_class(class_name), false,
+				"Class name '" + class_name + "' from " + script_path + " is already used in " + ScriptServer::get_global_class_path(class_name));
+		ScriptServer::add_global_class(class_name, base_type, language_name, script_path);
+	}
+	return true;
+}
+
+GDScriptTest::GDScriptTest(const String &p_source_path, const String &p_output_path, const String &p_base_dir) {
+	source_file = p_source_path;
+	output_file = p_output_path;
+	base_dir = p_base_dir;
+	_print_handler.printfunc = print_handler;
+	_error_handler.errfunc = error_handler;
+}
+
+void GDScriptTestRunner::handle_cmdline() {
+	// Handles `--gdscript-test <path>` and `--gdscript-generate-tests <path>`: runs the requested
+	// action on the first occurrence and exits the process; returns normally if neither is present.
+	// TODO: this could likely be ported to use test commands:
+	// https://github.com/godotengine/godot/pull/41355
+	// Currently requires to startup the whole engine, which is slow.
+	List<String> cmdline_args = OS::get_singleton()->get_cmdline_args();
+	String test_cmd = "--gdscript-test";
+	String gen_cmd = "--gdscript-generate-tests";
+
+	for (List<String>::Element *E = cmdline_args.front(); E != nullptr; E = E->next()) {
+		String &cmd = E->get();
+		if (cmd != test_cmd && cmd != gen_cmd) {
+			continue;
+		}
+		if (E->next() == nullptr) {
+			ERR_PRINT("Needed a path for the test files.");
+			exit(-1);
+		}
+
+		GDScriptTestRunner runner(E->next()->get(), false);
+		int failed = (cmd == test_cmd) ? runner.run_tests() : (runner.generate_outputs() ? 0 : -1);
+		// Only the low 8 bits of an exit status reach the parent process on POSIX systems, so a
+		// failure count that is a multiple of 256 would otherwise be reported as success.
+		if (failed > 255) {
+			failed = 255;
+		}
+		exit(failed);
+	}
+}
+
+void GDScriptTest::enable_stdout() {
+	// TODO: this could likely be handled by doctest or `tests/test_macros.h`.
+	OS::get_singleton()->set_stdout_enabled(true);
+	OS::get_singleton()->set_stderr_enabled(true);
+}
+
+void GDScriptTest::disable_stdout() {
+	// TODO: this could likely be handled by doctest or `tests/test_macros.h`.
+	OS::get_singleton()->set_stdout_enabled(false);
+	OS::get_singleton()->set_stderr_enabled(false);
+}
+
+void GDScriptTest::print_handler(void *p_this, const String &p_message, bool p_error) {
+	TestResult *result = (TestResult *)p_this;
+	result->output += p_message + "\n";
+}
+
+void GDScriptTest::error_handler(void *p_this, const char *p_function, const char *p_file, int p_line, const char *p_error, const char *p_explanation, ErrorHandlerType p_type) {
+	// Error callback: marks the result as a runtime error and replaces its output with a `>> `-prefixed report.
+	ErrorHandlerData *handler_data = (ErrorHandlerData *)p_this;
+	TestResult *test_result = handler_data->result;
+	test_result->status = GDTEST_RUNTIME_ERROR;
+
+	const char *type_label = nullptr;
+	switch (p_type) {
+		case ERR_HANDLER_ERROR:
+			type_label = "ERROR";
+			break;
+		case ERR_HANDLER_WARNING:
+			type_label = "WARNING";
+			break;
+		case ERR_HANDLER_SCRIPT:
+			type_label = "SCRIPT ERROR";
+			break;
+		case ERR_HANDLER_SHADER:
+			type_label = "SHADER ERROR";
+			break;
+		default:
+			type_label = "Unknown error type";
+			break;
+	}
+	StringBuilder report;
+	report.append(">> ");
+	report.append(type_label);
+	// NOTE(review): the function name is emitted twice in a row; kept since `.out` files may depend on it - confirm.
+	report.append("\n>> ");
+	report.append(p_function);
+	report.append("\n>> ");
+	report.append(p_function);
+	report.append("\n>> ");
+	report.append(String(p_file).trim_prefix(handler_data->self->base_dir));
+	report.append("\n>> ");
+	report.append(itos(p_line));
+	report.append("\n>> ");
+	report.append(p_error);
+	if (strlen(p_explanation) > 0) {
+		report.append("\n>> ");
+		report.append(p_explanation);
+	}
+	report.append("\n");
+	test_result->output = report.as_string();
+}
+
+bool GDScriptTest::check_output(const String &p_output) const {
+	// Compares `p_output`, stripped at both edges and terminated by a newline, with the expected output file.
+	Error read_err = OK;
+	const String expected_text = FileAccess::get_file_as_string(output_file, &read_err);
+	ERR_FAIL_COND_V_MSG(read_err != OK, false, "Error when opening the output file.");
+
+	// TODO: stripping the edges may be hacky.
+	// Make sure to end with a newline for CI static checks.
+	const String normalized = p_output.strip_edges() + "\n";
+	return normalized == expected_text;
+}
+
+String GDScriptTest::get_text_for_status(GDScriptTest::TestStatus p_status) const {
+	switch (p_status) {
+		case GDTEST_OK:
+			return "GDTEST_OK";
+		case GDTEST_LOAD_ERROR:
+			return "GDTEST_LOAD_ERROR";
+		case GDTEST_PARSER_ERROR:
+			return "GDTEST_PARSER_ERROR";
+		case GDTEST_ANALYZER_ERROR:
+			return "GDTEST_ANALYZER_ERROR";
+		case GDTEST_COMPILER_ERROR:
+			return "GDTEST_COMPILER_ERROR";
+		case GDTEST_RUNTIME_ERROR:
+			return "GDTEST_RUNTIME_ERROR";
+	}
+	return "";
+}
+
+GDScriptTest::TestResult GDScriptTest::execute_test_code(bool p_is_generating) {
+	// Takes the script through loading, parsing, analysis, compilation and a call to its test function,
+	// stopping at the first failing stage. The output is only checked against the `.out` file when not generating.
+	disable_stdout();
+	TestResult result;
+	result.status = GDTEST_OK;
+	result.passed = false; // Explicit default: the generating path never assigns it otherwise.
+	Error err = OK;
+
+	// Create script.
+	Ref<GDScript> script;
+	script.instance();
+	script->set_path(source_file);
+	script->set_script_path(source_file);
+	err = script->load_source_code(source_file);
+	if (err != OK) {
+		enable_stdout();
+		result.status = GDTEST_LOAD_ERROR;
+		result.passed = false;
+		ERR_FAIL_V_MSG(result, "\nCould not load source code for: '" + source_file + "'");
+	}
+
+	// Test parsing.
+	GDScriptParser parser;
+	err = parser.parse(script->get_source_code(), source_file, false);
+	if (err != OK) {
+		enable_stdout();
+		result.status = GDTEST_PARSER_ERROR;
+		result.output = get_text_for_status(result.status) + "\n";
+
+		const List<GDScriptParser::ParserError> &errors = parser.get_errors();
+		for (const List<GDScriptParser::ParserError>::Element *E = errors.front(); E; E = E->next()) {
+			result.output += E->get().message + "\n"; // TODO: line, column?
+			break; // Only the first error since the following might be cascading.
+		}
+		if (!p_is_generating) {
+			result.passed = check_output(result.output);
+		}
+		return result;
+	}
+
+	// Test type-checking.
+	GDScriptAnalyzer analyzer(&parser);
+	err = analyzer.analyze();
+	if (err != OK) {
+		enable_stdout();
+		result.status = GDTEST_ANALYZER_ERROR;
+		result.output = get_text_for_status(result.status) + "\n";
+
+		const List<GDScriptParser::ParserError> &errors = parser.get_errors();
+		for (const List<GDScriptParser::ParserError>::Element *E = errors.front(); E; E = E->next()) {
+			result.output += E->get().message + "\n"; // TODO: line, column?
+			break; // Only the first error since the following might be cascading.
+		}
+		if (!p_is_generating) {
+			result.passed = check_output(result.output);
+		}
+		return result;
+	}
+
+	StringBuilder warning_string;
+	for (const List<GDScriptWarning>::Element *E = parser.get_warnings().front(); E != nullptr; E = E->next()) {
+		const GDScriptWarning warning = E->get();
+		warning_string.append(">> WARNING");
+		warning_string.append("\n>> Line: ");
+		warning_string.append(itos(warning.start_line));
+		warning_string.append("\n>> ");
+		warning_string.append(warning.get_name());
+		warning_string.append("\n>> ");
+		warning_string.append(warning.get_message());
+		warning_string.append("\n");
+	}
+	result.output += warning_string.as_string();
+
+	// Test compiling.
+	GDScriptCompiler compiler;
+	err = compiler.compile(&parser, script.ptr(), false);
+	if (err != OK) {
+		enable_stdout();
+		result.status = GDTEST_COMPILER_ERROR;
+		result.output = get_text_for_status(result.status) + "\n";
+		result.output += compiler.get_error() + "\n"; // Append so the status header above is kept.
+		if (!p_is_generating) {
+			result.passed = check_output(result.output);
+		}
+		return result;
+	}
+
+	// Test running.
+	const Map<StringName, GDScriptFunction *>::Element *test_function_element = script->get_member_functions().find(GDScriptTestRunner::test_function_name);
+	if (test_function_element == nullptr) {
+		enable_stdout();
+		result.status = GDTEST_LOAD_ERROR;
+		result.output = "";
+		result.passed = false;
+		ERR_FAIL_V_MSG(result, "\nCould not find test function on: '" + source_file + "'");
+	}
+
+	script->reload();
+
+	// Create object instance for test.
+	Object *obj = ClassDB::instance(script->get_native()->get_name());
+	Ref<Reference> obj_ref;
+	if (obj->is_reference()) {
+		obj_ref = Ref<Reference>(Object::cast_to<Reference>(obj));
+	}
+	obj->set_script(script);
+	GDScriptInstance *instance = static_cast<GDScriptInstance *>(obj->get_script_instance());
+
+	// Setup output handlers.
+	ErrorHandlerData error_data(&result, this);
+
+	_print_handler.userdata = &result;
+	_error_handler.userdata = &error_data;
+	add_print_handler(&_print_handler);
+	add_error_handler(&_error_handler);
+
+	// Call test function.
+	Callable::CallError call_err;
+	instance->call(GDScriptTestRunner::test_function_name, nullptr, 0, call_err);
+
+	// Tear down output handlers.
+	remove_print_handler(&_print_handler);
+	remove_error_handler(&_error_handler);
+	// Free the test object now so the early return below cannot leak it; References are held by `obj_ref`.
+	if (obj_ref.is_null()) {
+		memdelete(obj);
+	}
+
+	// Check results.
+	if (call_err.error != Callable::CallError::CALL_OK) {
+		enable_stdout();
+		result.status = GDTEST_LOAD_ERROR;
+		result.passed = false;
+		ERR_FAIL_V_MSG(result, "\nCould not call test function on: '" + source_file + "'");
+	}
+
+	result.output = get_text_for_status(result.status) + "\n" + result.output;
+	if (!p_is_generating) {
+		result.passed = check_output(result.output);
+	}
+
+	enable_stdout();
+	return result;
+}
+
+GDScriptTest::TestResult GDScriptTest::run_test() {
+	return execute_test_code(false);
+}
+
+bool GDScriptTest::generate_output() {
+	// Runs the test in generation mode and writes the resulting output to the output file.
+	// Returns false on a GDTEST_LOAD_ERROR result or when the output file cannot be opened.
+	const TestResult generated = execute_test_code(true);
+	if (generated.status == GDTEST_LOAD_ERROR) {
+		return false;
+	}
+
+	Error open_err = OK;
+	FileAccessRef writer = FileAccess::open(output_file, FileAccess::WRITE, &open_err);
+	if (open_err != OK) {
+		return false;
+	}
+
+	// TODO: stripping the edges may be hacky.
+	// Make sure to end with a newline for CI static checks.
+	writer->store_string(generated.output.strip_edges() + "\n");
+	writer->close();
+	return true;
+}
+
+} // namespace GDScriptTests
diff --git a/modules/gdscript/tests/gdscript_test_runner.h b/modules/gdscript/tests/gdscript_test_runner.h
new file mode 100644
index 0000000000..9b2d14a371
--- /dev/null
+++ b/modules/gdscript/tests/gdscript_test_runner.h
@@ -0,0 +1,126 @@
+/*************************************************************************/
+/*  gdscript_test_runner.h                                               */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef GDSCRIPT_TEST_H
+#define GDSCRIPT_TEST_H
+
+#include "../gdscript.h"
+#include "core/error/error_macros.h"
+#include "core/string/print_string.h"
+#include "core/string/ustring.h"
+#include "core/templates/vector.h"
+
+namespace GDScriptTests {
+
+void init_autoloads();
+void init_language(const String &p_base_path);
+void finish_language();
+
+// Single test instance in a suite.
+class GDScriptTest {
+public:
+	enum TestStatus {
+		GDTEST_OK,
+		GDTEST_LOAD_ERROR, // Source could not be loaded, or the test function is missing or could not be called.
+		GDTEST_PARSER_ERROR,
+		GDTEST_ANALYZER_ERROR,
+		GDTEST_COMPILER_ERROR,
+		GDTEST_RUNTIME_ERROR, // Set by error_handler() when an error is reported while the test runs.
+	};
+
+	struct TestResult {
+		TestStatus status = GDTEST_OK; // Defaulted so a result is never copied with an indeterminate value.
+		String output;
+		bool passed = false; // Only meaningful when the output was compared (i.e. not generating).
+	};
+
+private:
+	struct ErrorHandlerData { // Userdata handed to error_handler().
+		TestResult *result;
+		GDScriptTest *self;
+		ErrorHandlerData(TestResult *p_result, GDScriptTest *p_this) {
+			result = p_result;
+			self = p_this;
+		}
+	};
+
+	String source_file;
+	String output_file; // Read by check_output() and written by generate_output().
+	String base_dir; // Prefix trimmed from file paths in error reports.
+
+	PrintHandlerList _print_handler;
+	ErrorHandlerList _error_handler;
+
+	void enable_stdout();
+	void disable_stdout();
+	bool check_output(const String &p_output) const;
+	String get_text_for_status(TestStatus p_status) const;
+
+	TestResult execute_test_code(bool p_is_generating);
+
+public:
+	static void print_handler(void *p_this, const String &p_message, bool p_error);
+	static void error_handler(void *p_this, const char *p_function, const char *p_file, int p_line, const char *p_error, const char *p_explanation, ErrorHandlerType p_type);
+	TestResult run_test();
+	bool generate_output();
+
+	const String &get_source_file() const { return source_file; }
+	const String &get_output_file() const { return output_file; }
+
+	GDScriptTest(const String &p_source_path, const String &p_output_path, const String &p_base_dir);
+	GDScriptTest() :
+			GDScriptTest(String(), String(), String()) {} // Needed to use in Vector.
+};
+
+class GDScriptTestRunner {
+	String source_dir;
+	Vector<GDScriptTest> tests;
+
+	bool is_generating = false;
+	bool do_init_languages = false;
+
+	bool make_tests();
+	bool make_tests_for_dir(const String &p_dir);
+	bool generate_class_index();
+
+public:
+	static StringName test_function_name;
+
+	static void handle_cmdline();
+	int run_tests();
+	bool generate_outputs();
+
+	GDScriptTestRunner(const String &p_source_dir, bool p_init_language);
+	~GDScriptTestRunner();
+};
+
+} // namespace GDScriptTests
+
+#endif // GDSCRIPT_TEST_H
diff --git a/modules/etc/texture_loader_pkm.h b/modules/gdscript/tests/gdscript_test_runner_suite.h
index 2ed5e75807..136907b316 100644
--- a/modules/etc/texture_loader_pkm.h
+++ b/modules/gdscript/tests/gdscript_test_runner_suite.h
@@ -1,5 +1,5 @@
 /*************************************************************************/
-/*  texture_loader_pkm.h                                                 */
+/*  gdscript_test_runner_suite.h                                         */
 /*************************************************************************/
 /*                       This file is part of:                           */
 /*                           GODOT ENGINE                                */
@@ -28,20 +28,26 @@
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
 
-#ifndef TEXTURE_LOADER_PKM_H
-#define TEXTURE_LOADER_PKM_H
+#ifndef GDSCRIPT_TEST_RUNNER_SUITE_H
+#define GDSCRIPT_TEST_RUNNER_SUITE_H
 
-#include "core/io/resource_loader.h"
-#include "scene/resources/texture.h"
+#include "gdscript_test_runner.h"
+#include "tests/test_macros.h"
 
-class ResourceFormatPKM : public ResourceFormatLoader {
-public:
-	virtual RES load(const String &p_path, const String &p_original_path = "", Error *r_error = nullptr, bool p_use_sub_threads = false, float *r_progress = nullptr, CacheMode p_cache_mode = CACHE_MODE_REUSE);
-	virtual void get_recognized_extensions(List<String> *p_extensions) const;
-	virtual bool handles_type(const String &p_type) const;
-	virtual String get_resource_type(const String &p_path) const;
+namespace GDScriptTests {
 
-	virtual ~ResourceFormatPKM() {}
-};
+TEST_SUITE("[Modules][GDScript]") {
+	// GDScript 2.0 is still under heavy construction.
+	// Allow the tests to fail, but do not ignore errors during development.
+	// Update the scripts and expected output as needed.
+	TEST_CASE("Script compilation and runtime") {
+		GDScriptTestRunner runner("modules/gdscript/tests/scripts", true);
+		int fail_count = runner.run_tests();
+		INFO("Make sure `*.out` files have expected results.");
+		REQUIRE_MESSAGE(fail_count == 0, "All GDScript tests should pass.");
+	}
+}
 
-#endif // TEXTURE_LOADER_PKM_H
+} // namespace GDScriptTests
+
+#endif // GDSCRIPT_TEST_RUNNER_SUITE_H
diff --git a/modules/gdscript/tests/scripts/.gitignore b/modules/gdscript/tests/scripts/.gitignore
new file mode 100644
index 0000000000..94c5b1bf6b
--- /dev/null
+++ b/modules/gdscript/tests/scripts/.gitignore
@@ -0,0 +1,2 @@
+# Ignore metadata if someone opens this in Godot.
+/.godot
diff --git a/modules/gdscript/tests/scripts/parser/errors/missing_argument.gd b/modules/gdscript/tests/scripts/parser/errors/missing_argument.gd
new file mode 100644
index 0000000000..c56ad94095
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/missing_argument.gd
@@ -0,0 +1,6 @@
+func args(a, b):
+    print(a)
+    print(b)
+
+func test():
+    args(1,)
diff --git a/modules/gdscript/tests/scripts/parser/errors/missing_argument.out b/modules/gdscript/tests/scripts/parser/errors/missing_argument.out
new file mode 100644
index 0000000000..fc2a891109
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/missing_argument.out
@@ -0,0 +1,2 @@
+GDTEST_ANALYZER_ERROR
+Too few arguments for "args()" call. Expected at least 2 but received 1.
diff --git a/modules/gdscript/tests/scripts/parser/errors/missing_closing_expr_paren.gd b/modules/gdscript/tests/scripts/parser/errors/missing_closing_expr_paren.gd
new file mode 100644
index 0000000000..a1077e1985
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/missing_closing_expr_paren.gd
@@ -0,0 +1,2 @@
+func test():
+    var a = ("missing paren ->"
diff --git a/modules/gdscript/tests/scripts/parser/errors/missing_closing_expr_paren.out b/modules/gdscript/tests/scripts/parser/errors/missing_closing_expr_paren.out
new file mode 100644
index 0000000000..7326afa33d
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/missing_closing_expr_paren.out
@@ -0,0 +1,2 @@
+GDTEST_PARSER_ERROR
+Expected closing ")" after grouping expression.
diff --git a/modules/gdscript/tests/scripts/parser/errors/missing_colon.gd b/modules/gdscript/tests/scripts/parser/errors/missing_colon.gd
new file mode 100644
index 0000000000..62cb633e9e
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/missing_colon.gd
@@ -0,0 +1,3 @@
+func test():
+    if true # Missing colon here.
+        print("true")
diff --git a/modules/gdscript/tests/scripts/parser/errors/missing_colon.out b/modules/gdscript/tests/scripts/parser/errors/missing_colon.out
new file mode 100644
index 0000000000..687b963bc8
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/missing_colon.out
@@ -0,0 +1,2 @@
+GDTEST_PARSER_ERROR
+Expected ":" after "if" condition.
diff --git a/modules/gdscript/tests/scripts/parser/errors/missing_paren_after_args.gd b/modules/gdscript/tests/scripts/parser/errors/missing_paren_after_args.gd
new file mode 100644
index 0000000000..116b0151da
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/missing_paren_after_args.gd
@@ -0,0 +1,6 @@
+func args(a, b):
+    print(a)
+    print(b)
+
+func test():
+    args(1,2
diff --git a/modules/gdscript/tests/scripts/parser/errors/missing_paren_after_args.out b/modules/gdscript/tests/scripts/parser/errors/missing_paren_after_args.out
new file mode 100644
index 0000000000..34ea7ac323
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/missing_paren_after_args.out
@@ -0,0 +1,2 @@
+GDTEST_PARSER_ERROR
+Expected closing ")" after call arguments.
diff --git a/modules/gdscript/tests/scripts/parser/errors/mixing_tabs_spaces.gd b/modules/gdscript/tests/scripts/parser/errors/mixing_tabs_spaces.gd
new file mode 100644
index 0000000000..9ad77f1432
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/mixing_tabs_spaces.gd
@@ -0,0 +1,3 @@
+func test():
+    print("Using spaces")
+	print("Using tabs")
diff --git a/modules/gdscript/tests/scripts/parser/errors/mixing_tabs_spaces.out b/modules/gdscript/tests/scripts/parser/errors/mixing_tabs_spaces.out
new file mode 100644
index 0000000000..6390de9788
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/mixing_tabs_spaces.out
@@ -0,0 +1,2 @@
+GDTEST_PARSER_ERROR
+Used "\t" for indentation instead " " as used before in the file.
diff --git a/modules/gdscript/tests/scripts/parser/errors/nothing_after_dollar.gd b/modules/gdscript/tests/scripts/parser/errors/nothing_after_dollar.gd
new file mode 100644
index 0000000000..3875ce3936
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/nothing_after_dollar.gd
@@ -0,0 +1,3 @@
+extends Node
+func test():
+    var a = $ # Expected some node path.
diff --git a/modules/gdscript/tests/scripts/parser/errors/nothing_after_dollar.out b/modules/gdscript/tests/scripts/parser/errors/nothing_after_dollar.out
new file mode 100644
index 0000000000..b3dc181a22
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/nothing_after_dollar.out
@@ -0,0 +1,2 @@
+GDTEST_PARSER_ERROR
+Expect node path as string or identifier after "$".
diff --git a/modules/gdscript/tests/scripts/parser/errors/wrong_value_after_dollar.gd b/modules/gdscript/tests/scripts/parser/errors/wrong_value_after_dollar.gd
new file mode 100644
index 0000000000..6fd2692d47
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/wrong_value_after_dollar.gd
@@ -0,0 +1,3 @@
+extends Node
+func test():
+    $23 # Can't use number here.
diff --git a/modules/gdscript/tests/scripts/parser/errors/wrong_value_after_dollar.out b/modules/gdscript/tests/scripts/parser/errors/wrong_value_after_dollar.out
new file mode 100644
index 0000000000..b3dc181a22
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/wrong_value_after_dollar.out
@@ -0,0 +1,2 @@
+GDTEST_PARSER_ERROR
+Expect node path as string or identifier after "$".
diff --git a/modules/gdscript/tests/scripts/parser/errors/wrong_value_after_dollar_slash.gd b/modules/gdscript/tests/scripts/parser/errors/wrong_value_after_dollar_slash.gd
new file mode 100644
index 0000000000..1836d42226
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/wrong_value_after_dollar_slash.gd
@@ -0,0 +1,3 @@
+extends Node
+func test():
+    $MyNode/23 # Can't use number here.
diff --git a/modules/gdscript/tests/scripts/parser/errors/wrong_value_after_dollar_slash.out b/modules/gdscript/tests/scripts/parser/errors/wrong_value_after_dollar_slash.out
new file mode 100644
index 0000000000..dcb4ccecb0
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/wrong_value_after_dollar_slash.out
@@ -0,0 +1,2 @@
+GDTEST_PARSER_ERROR
+Expect node path after "/".
diff --git a/modules/gdscript/tests/scripts/parser/features/semicolon_as_end_statement.gd b/modules/gdscript/tests/scripts/parser/features/semicolon_as_end_statement.gd
new file mode 100644
index 0000000000..08f2eedb2d
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/features/semicolon_as_end_statement.gd
@@ -0,0 +1,2 @@
+func test():
+    print("A"); print("B")
diff --git a/modules/gdscript/tests/scripts/parser/features/semicolon_as_end_statement.out b/modules/gdscript/tests/scripts/parser/features/semicolon_as_end_statement.out
new file mode 100644
index 0000000000..fc03f3efe8
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/features/semicolon_as_end_statement.out
@@ -0,0 +1,3 @@
+GDTEST_OK
+A
+B
diff --git a/modules/gdscript/tests/scripts/parser/features/trailing_comma_in_function_args.gd b/modules/gdscript/tests/scripts/parser/features/trailing_comma_in_function_args.gd
new file mode 100644
index 0000000000..6097b11b10
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/features/trailing_comma_in_function_args.gd
@@ -0,0 +1,7 @@
+# See https://github.com/godotengine/godot/issues/41066.
+
+func f(p, ): ## <-- no errors
+	print(p)
+
+func test():
+	f(0, ) ## <-- no error
diff --git a/modules/gdscript/tests/scripts/parser/features/trailing_comma_in_function_args.out b/modules/gdscript/tests/scripts/parser/features/trailing_comma_in_function_args.out
new file mode 100644
index 0000000000..94e2ec2af8
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/features/trailing_comma_in_function_args.out
@@ -0,0 +1,2 @@
+GDTEST_OK
+0
diff --git a/modules/gdscript/tests/scripts/parser/features/variable_declaration.gd b/modules/gdscript/tests/scripts/parser/features/variable_declaration.gd
new file mode 100644
index 0000000000..3b48f10ca7
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/features/variable_declaration.gd
@@ -0,0 +1,12 @@
+var a # No init.
+var b = 42 # Init.
+
+func test():
+	var c # No init, local.
+	var d = 23 # Init, local.
+
+	a = 1
+	c = 2
+
+	prints(a, b, c, d)
+	print("OK")
diff --git a/modules/gdscript/tests/scripts/parser/features/variable_declaration.out b/modules/gdscript/tests/scripts/parser/features/variable_declaration.out
new file mode 100644
index 0000000000..2e0a63c024
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/features/variable_declaration.out
@@ -0,0 +1,7 @@
+GDTEST_OK
+>> WARNING
+>> Line: 5
+>> UNASSIGNED_VARIABLE
+>> The variable 'c' was used but never assigned a value.
+1 42 2 23
+OK
diff --git a/modules/gdscript/tests/scripts/parser/warnings/unused_variable.gd b/modules/gdscript/tests/scripts/parser/warnings/unused_variable.gd
new file mode 100644
index 0000000000..68e3bd424f
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/warnings/unused_variable.gd
@@ -0,0 +1,2 @@
+func test():
+    var unused = "not used"
diff --git a/modules/gdscript/tests/scripts/parser/warnings/unused_variable.out b/modules/gdscript/tests/scripts/parser/warnings/unused_variable.out
new file mode 100644
index 0000000000..270e0e69c0
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/warnings/unused_variable.out
@@ -0,0 +1,5 @@
+GDTEST_OK
+>> WARNING
+>> Line: 2
+>> UNUSED_VARIABLE
+>> The local variable 'unused' is declared but never used in the block. If this is intended, prefix it with an underscore: '_unused'
diff --git a/modules/gdscript/tests/scripts/project.godot b/modules/gdscript/tests/scripts/project.godot
new file mode 100644
index 0000000000..25b49c0abd
--- /dev/null
+++ b/modules/gdscript/tests/scripts/project.godot
@@ -0,0 +1,10 @@
+; This is not an actual project.
+; This config only exists to properly set up the test environment.
+; It also helps for opening Godot to edit the scripts, but please don't
+; let the editor changes be saved.
+
+config_version=4
+
+[application]
+
+config/name="GDScript Integration Test Suite"
diff --git a/modules/gdscript/tests/test_gdscript.cpp b/modules/gdscript/tests/test_gdscript.cpp
index 3cc0eee672..36da64bbaa 100644
--- a/modules/gdscript/tests/test_gdscript.cpp
+++ b/modules/gdscript/tests/test_gdscript.cpp
@@ -47,7 +47,7 @@
 #include "editor/editor_settings.h"
 #endif
 
-namespace TestGDScript {
+namespace GDScriptTests {
 
 static void test_tokenizer(const String &p_code, const Vector<String> &p_lines) {
 	GDScriptTokenizer tokenizer;
@@ -66,7 +66,7 @@ static void test_tokenizer(const String &p_code, const Vector<String> &p_lines)
 		StringBuilder token;
 		token += " --> "; // Padding for line number.
 
-		for (int l = current.start_line; l <= current.end_line; l++) {
+		for (int l = current.start_line; l <= current.end_line && l <= p_lines.size(); l++) {
 			print_line(vformat("%04d %s", l, p_lines[l - 1]).replace("\t", tab));
 		}
 
@@ -118,6 +118,18 @@ static void test_parser(const String &p_code, const String &p_script_path, const
 			print_line(vformat("%02d:%02d: %s", error.line, error.column, error.message));
 		}
 	}
+
+	GDScriptAnalyzer analyzer(&parser);
+	err = analyzer.analyze(); // Keep the analyzer's result; otherwise `err` still holds the parser's.
+
+	if (err != OK) { // Analysis failed: print the errors collected on the parser.
+		const List<GDScriptParser::ParserError> &errors = parser.get_errors();
+		for (const List<GDScriptParser::ParserError>::Element *E = errors.front(); E != nullptr; E = E->next()) {
+			const GDScriptParser::ParserError &error = E->get();
+			print_line(vformat("%02d:%02d: %s", error.line, error.column, error.message));
+		}
+	}
+
 #ifdef TOOLS_ENABLED
 	GDScriptParser::TreePrinter printer;
 	printer.print_tree(parser);
@@ -183,60 +195,6 @@ static void test_compiler(const String &p_code, const String &p_script_path, con
 	}
 }
 
-void init_autoloads() {
-	Map<StringName, ProjectSettings::AutoloadInfo> autoloads = ProjectSettings::get_singleton()->get_autoload_list();
-
-	// First pass, add the constants so they exist before any script is loaded.
-	for (Map<StringName, ProjectSettings::AutoloadInfo>::Element *E = autoloads.front(); E; E = E->next()) {
-		const ProjectSettings::AutoloadInfo &info = E->get();
-
-		if (info.is_singleton) {
-			for (int i = 0; i < ScriptServer::get_language_count(); i++) {
-				ScriptServer::get_language(i)->add_global_constant(info.name, Variant());
-			}
-		}
-	}
-
-	// Second pass, load into global constants.
-	for (Map<StringName, ProjectSettings::AutoloadInfo>::Element *E = autoloads.front(); E; E = E->next()) {
-		const ProjectSettings::AutoloadInfo &info = E->get();
-
-		if (!info.is_singleton) {
-			// Skip non-singletons since we don't have a scene tree here anyway.
-			continue;
-		}
-
-		RES res = ResourceLoader::load(info.path);
-		ERR_CONTINUE_MSG(res.is_null(), "Can't autoload: " + info.path);
-		Node *n = nullptr;
-		if (res->is_class("PackedScene")) {
-			Ref<PackedScene> ps = res;
-			n = ps->instance();
-		} else if (res->is_class("Script")) {
-			Ref<Script> script_res = res;
-			StringName ibt = script_res->get_instance_base_type();
-			bool valid_type = ClassDB::is_parent_class(ibt, "Node");
-			ERR_CONTINUE_MSG(!valid_type, "Script does not inherit a Node: " + info.path);
-
-			Object *obj = ClassDB::instance(ibt);
-
-			ERR_CONTINUE_MSG(obj == nullptr,
-					"Cannot instance script for autoload, expected 'Node' inheritance, got: " +
-							String(ibt));
-
-			n = Object::cast_to<Node>(obj);
-			n->set_script(script_res);
-		}
-
-		ERR_CONTINUE_MSG(!n, "Path in autoload not a node or script: " + info.path);
-		n->set_name(info.name);
-
-		for (int i = 0; i < ScriptServer::get_language_count(); i++) {
-			ScriptServer::get_language(i)->add_global_constant(info.name, n);
-		}
-	}
-}
-
 void test(TestType p_type) {
 	List<String> cmdlargs = OS::get_singleton()->get_cmdline_args();
 
@@ -253,20 +211,8 @@ void test(TestType p_type) {
 	FileAccessRef fa = FileAccess::open(test, FileAccess::READ);
 	ERR_FAIL_COND_MSG(!fa, "Could not open file: " + test);
 
-	// Init PackedData since it's used by ProjectSettings.
-	PackedData *packed_data = memnew(PackedData);
-
-	// Setup project settings since it's needed by the languages to get the global scripts.
-	// This also sets up the base resource path.
-	Error err = ProjectSettings::get_singleton()->setup(fa->get_path_absolute().get_base_dir(), String(), true);
-	if (err) {
-		print_line("Could not load project settings.");
-		// Keep going since some scripts still work without this.
-	}
-
 	// Initialize the language for the test routine.
-	ScriptServer::init_languages();
-	init_autoloads();
+	init_language(fa->get_path_absolute().get_base_dir());
 
 	Vector<uint8_t> buf;
 	int flen = fa->get_len();
@@ -300,8 +246,6 @@ void test(TestType p_type) {
 			print_line("Not implemented.");
 	}
 
-	// Destroy stuff we set up earlier.
-	ScriptServer::finish_languages();
-	memdelete(packed_data);
+	finish_language();
 }
-} // namespace TestGDScript
+} // namespace GDScriptTests
diff --git a/modules/gdscript/tests/test_gdscript.h b/modules/gdscript/tests/test_gdscript.h
index bbda46cdad..c7ee5a2208 100644
--- a/modules/gdscript/tests/test_gdscript.h
+++ b/modules/gdscript/tests/test_gdscript.h
@@ -31,7 +31,10 @@
 #ifndef TEST_GDSCRIPT_H
 #define TEST_GDSCRIPT_H
 
-namespace TestGDScript {
+#include "gdscript_test_runner.h"
+#include "tests/test_macros.h"
+
+namespace GDScriptTests {
 
 enum TestType {
 	TEST_TOKENIZER,
@@ -41,6 +44,7 @@ enum TestType {
 };
 
 void test(TestType p_type);
-} // namespace TestGDScript
+
+} // namespace GDScriptTests
 
 #endif // TEST_GDSCRIPT_H
diff --git a/modules/glslang/register_types.cpp b/modules/glslang/register_types.cpp
index 14135265b9..4331daadfc 100644
--- a/modules/glslang/register_types.cpp
+++ b/modules/glslang/register_types.cpp
@@ -173,7 +173,7 @@ static Vector<uint8_t> _compile_shader_glsl(RenderingDevice::ShaderStage p_stage
 	ret.resize(SpirV.size() * sizeof(uint32_t));
 	{
 		uint8_t *w = ret.ptrw();
-		copymem(w, &SpirV[0], SpirV.size() * sizeof(uint32_t));
+		memcpy(w, &SpirV[0], SpirV.size() * sizeof(uint32_t));
 	}
 
 	return ret;
diff --git a/modules/gltf/gltf_document.cpp b/modules/gltf/gltf_document.cpp
index 0b70175a24..e67e29f7b4 100644
--- a/modules/gltf/gltf_document.cpp
+++ b/modules/gltf/gltf_document.cpp
@@ -1157,7 +1157,7 @@ Error GLTFDocument::_encode_buffer_view(Ref<GLTFState> state, const double *src,
 			}
 			int64_t old_size = gltf_buffer.size();
 			gltf_buffer.resize(old_size + (buffer.size() * sizeof(int8_t)));
-			copymem(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(int8_t));
+			memcpy(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(int8_t));
 			bv->byte_length = buffer.size() * sizeof(int8_t);
 		} break;
 		case COMPONENT_TYPE_UNSIGNED_BYTE: {
@@ -1203,7 +1203,7 @@ Error GLTFDocument::_encode_buffer_view(Ref<GLTFState> state, const double *src,
 			}
 			int64_t old_size = gltf_buffer.size();
 			gltf_buffer.resize(old_size + (buffer.size() * sizeof(int16_t)));
-			copymem(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(int16_t));
+			memcpy(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(int16_t));
 			bv->byte_length = buffer.size() * sizeof(int16_t);
 		} break;
 		case COMPONENT_TYPE_UNSIGNED_SHORT: {
@@ -1227,7 +1227,7 @@ Error GLTFDocument::_encode_buffer_view(Ref<GLTFState> state, const double *src,
 			}
 			int64_t old_size = gltf_buffer.size();
 			gltf_buffer.resize(old_size + (buffer.size() * sizeof(uint16_t)));
-			copymem(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(uint16_t));
+			memcpy(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(uint16_t));
 			bv->byte_length = buffer.size() * sizeof(uint16_t);
 		} break;
 		case COMPONENT_TYPE_INT: {
@@ -1247,7 +1247,7 @@ Error GLTFDocument::_encode_buffer_view(Ref<GLTFState> state, const double *src,
 			}
 			int64_t old_size = gltf_buffer.size();
 			gltf_buffer.resize(old_size + (buffer.size() * sizeof(int32_t)));
-			copymem(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(int32_t));
+			memcpy(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(int32_t));
 			bv->byte_length = buffer.size() * sizeof(int32_t);
 		} break;
 		case COMPONENT_TYPE_FLOAT: {
@@ -1267,7 +1267,7 @@ Error GLTFDocument::_encode_buffer_view(Ref<GLTFState> state, const double *src,
 			}
 			int64_t old_size = gltf_buffer.size();
 			gltf_buffer.resize(old_size + (buffer.size() * sizeof(float)));
-			copymem(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(float));
+			memcpy(gltf_buffer.ptrw() + old_size, buffer.ptrw(), buffer.size() * sizeof(float));
 			bv->byte_length = buffer.size() * sizeof(float);
 		} break;
 	}
@@ -2389,9 +2389,9 @@ Error GLTFDocument::_serialize_meshes(Ref<GLTFState> state) {
 						for (int i = 0; i < ret_size; i++) {
 							Color tangent;
 							tangent.r = tarr[(i * 4) + 0];
-							tangent.r = tarr[(i * 4) + 1];
-							tangent.r = tarr[(i * 4) + 2];
-							tangent.r = tarr[(i * 4) + 3];
+							tangent.g = tarr[(i * 4) + 1];
+							tangent.b = tarr[(i * 4) + 2];
+							tangent.a = tarr[(i * 4) + 3];
 						}
 						t["TANGENT"] = _encode_accessor_as_color(state, attribs, true);
 					}
@@ -2864,7 +2864,7 @@ Error GLTFDocument::_serialize_images(Ref<GLTFState> state, const String &p_path
 
 			bv->byte_length = buffer.size();
 			state->buffers.write[bi].resize(state->buffers[bi].size() + bv->byte_length);
-			copymem(&state->buffers.write[bi].write[bv->byte_offset], buffer.ptr(), buffer.size());
+			memcpy(&state->buffers.write[bi].write[bv->byte_offset], buffer.ptr(), buffer.size());
 			ERR_FAIL_COND_V(bv->byte_offset + bv->byte_length > state->buffers[bi].size(), ERR_FILE_CORRUPT);
 
 			state->buffer_views.push_back(bv);
@@ -3293,6 +3293,7 @@ Error GLTFDocument::_serialize_materials(Ref<GLTFState> state) {
 				}
 				img->decompress();
 				img->convert(Image::FORMAT_RGBA8);
+				img->convert_ra_rgba8_to_rg();
 				for (int32_t y = 0; y < img->get_height(); y++) {
 					for (int32_t x = 0; x < img->get_width(); x++) {
 						Color c = img->get_pixel(x, y);
@@ -4958,8 +4959,8 @@ GLTFMeshIndex GLTFDocument::_convert_mesh_instance(Ref<GLTFState> state, MeshIns
 		if (godot_array_mesh.is_valid()) {
 			surface_name = godot_array_mesh->surface_get_name(surface_i);
 		}
-		if (p_mesh_instance->get_surface_material(surface_i).is_valid()) {
-			mat = p_mesh_instance->get_surface_material(surface_i);
+		if (p_mesh_instance->get_surface_override_material(surface_i).is_valid()) {
+			mat = p_mesh_instance->get_surface_override_material(surface_i);
 		}
 		if (p_mesh_instance->get_material_override().is_valid()) {
 			mat = p_mesh_instance->get_material_override();
diff --git a/modules/gridmap/grid_map.cpp b/modules/gridmap/grid_map.cpp
index 4e4f88ed6a..eaceaac33c 100644
--- a/modules/gridmap/grid_map.cpp
+++ b/modules/gridmap/grid_map.cpp
@@ -152,6 +152,7 @@ uint32_t GridMap::get_collision_mask() const {
 }
 
 void GridMap::set_collision_mask_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision mask bit must be between 0 and 31 inclusive.");
 	uint32_t mask = get_collision_mask();
 	if (p_value) {
 		mask |= 1 << p_bit;
@@ -162,20 +163,23 @@ void GridMap::set_collision_mask_bit(int p_bit, bool p_value) {
 }
 
 bool GridMap::get_collision_mask_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision mask bit must be between 0 and 31 inclusive.");
 	return get_collision_mask() & (1 << p_bit);
 }
 
 void GridMap::set_collision_layer_bit(int p_bit, bool p_value) {
-	uint32_t mask = get_collision_layer();
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision layer bit must be between 0 and 31 inclusive.");
+	uint32_t layer = get_collision_layer();
 	if (p_value) {
-		mask |= 1 << p_bit;
+		layer |= 1 << p_bit;
 	} else {
-		mask &= ~(1 << p_bit);
+		layer &= ~(1 << p_bit);
 	}
-	set_collision_layer(mask);
+	set_collision_layer(layer);
 }
 
 bool GridMap::get_collision_layer_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision layer bit must be between 0 and 31 inclusive.");
 	return get_collision_layer() & (1 << p_bit);
 }
 
diff --git a/modules/lightmapper_rd/lightmapper_rd.cpp b/modules/lightmapper_rd/lightmapper_rd.cpp
index 61ebabdfb6..3b0fbb1c47 100644
--- a/modules/lightmapper_rd/lightmapper_rd.cpp
+++ b/modules/lightmapper_rd/lightmapper_rd.cpp
@@ -162,8 +162,8 @@ Lightmapper::BakeError LightmapperRD::_blit_meshes_into_atlas(int p_max_texture_
 		MeshInstance &mi = mesh_instances.write[m_i];
 		Size2i s = Size2i(mi.data.albedo_on_uv2->get_width(), mi.data.albedo_on_uv2->get_height());
 		sizes.push_back(s);
-		atlas_size.width = MAX(atlas_size.width, s.width);
-		atlas_size.height = MAX(atlas_size.height, s.height);
+		atlas_size.width = MAX(atlas_size.width, s.width + 2);
+		atlas_size.height = MAX(atlas_size.height, s.height + 2);
 	}
 
 	int max = nearest_power_of_2_templated(atlas_size.width);
@@ -186,10 +186,12 @@ Lightmapper::BakeError LightmapperRD::_blit_meshes_into_atlas(int p_max_texture_
 
 	//determine best texture array atlas size by bruteforce fitting
 	while (atlas_size.x <= p_max_texture_size && atlas_size.y <= p_max_texture_size) {
-		Vector<Vector2i> source_sizes = sizes;
+		Vector<Vector2i> source_sizes;
 		Vector<int> source_indices;
-		source_indices.resize(source_sizes.size());
+		source_sizes.resize(sizes.size());
+		source_indices.resize(sizes.size());
 		for (int i = 0; i < source_indices.size(); i++) {
+			source_sizes.write[i] = sizes[i] + Vector2i(2, 2); // Add padding between lightmaps
 			source_indices.write[i] = i;
 		}
 		Vector<Vector3i> atlas_offsets;
@@ -207,7 +209,7 @@ Lightmapper::BakeError LightmapperRD::_blit_meshes_into_atlas(int p_max_texture_
 				if (ofs.z > 0) {
 					//valid
 					ofs.z = slices;
-					atlas_offsets.write[sidx] = ofs;
+					atlas_offsets.write[sidx] = ofs + Vector3i(1, 1, 0); // Center lightmap in the reserved oversized region
 				} else {
 					new_indices.push_back(sidx);
 					new_sources.push_back(source_sizes[i]);
@@ -272,7 +274,7 @@ Lightmapper::BakeError LightmapperRD::_blit_meshes_into_atlas(int p_max_texture_
 	return BAKE_OK;
 }
 
-void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, AABB &bounds, int grid_size, Vector<Probe> &probe_positions, GenerateProbes p_generate_probes, Vector<int> &slice_triangle_count, Vector<int> &slice_seam_count, RID &vertex_buffer, RID &triangle_buffer, RID &box_buffer, RID &lights_buffer, RID &triangle_cell_indices_buffer, RID &probe_positions_buffer, RID &grid_texture, RID &grid_texture_sdf, RID &seams_buffer, BakeStepFunc p_step_function, void *p_bake_userdata) {
+void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, AABB &bounds, int grid_size, Vector<Probe> &probe_positions, GenerateProbes p_generate_probes, Vector<int> &slice_triangle_count, Vector<int> &slice_seam_count, RID &vertex_buffer, RID &triangle_buffer, RID &box_buffer, RID &lights_buffer, RID &triangle_cell_indices_buffer, RID &probe_positions_buffer, RID &grid_texture, RID &seams_buffer, BakeStepFunc p_step_function, void *p_bake_userdata) {
 	HashMap<Vertex, uint32_t, VertexHash> vertex_map;
 
 	//fill triangles array and vertex array
@@ -432,10 +434,10 @@ void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i
 	triangle_indices.resize(triangle_sort.size());
 	Vector<uint32_t> grid_indices;
 	grid_indices.resize(grid_size * grid_size * grid_size * 2);
-	zeromem(grid_indices.ptrw(), grid_indices.size() * sizeof(uint32_t));
+	memset(grid_indices.ptrw(), 0, grid_indices.size() * sizeof(uint32_t));
 	Vector<bool> solid;
 	solid.resize(grid_size * grid_size * grid_size);
-	zeromem(solid.ptrw(), solid.size() * sizeof(bool));
+	memset(solid.ptrw(), 0, solid.size() * sizeof(bool));
 
 	{
 		uint32_t *tiw = triangle_indices.ptrw();
@@ -482,14 +484,6 @@ void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i
 		img->save_png("res://grid_layer_" + itos(1000 + i).substr(1, 3) + ".png");
 	}
 #endif
-	if (p_step_function) {
-		p_step_function(0.45, TTR("Generating Signed Distance Field"), p_bake_userdata, true);
-	}
-
-	//generate SDF for raytracing
-	Vector<uint32_t> euclidean_pos = Geometry3D::generate_edf(solid, Vector3i(grid_size, grid_size, grid_size), false);
-	Vector<uint32_t> euclidean_neg = Geometry3D::generate_edf(solid, Vector3i(grid_size, grid_size, grid_size), true);
-	Vector<int8_t> sdf8 = Geometry3D::generate_sdf8(euclidean_pos, euclidean_neg);
 
 	/*****************************/
 	/*** CREATE GPU STRUCTURES ***/
@@ -551,10 +545,6 @@ void LightmapperRD::_create_acceleration_structures(RenderingDevice *rd, Size2i
 		tf.format = RD::DATA_FORMAT_R32G32_UINT;
 		texdata.write[0] = grid_indices.to_byte_array();
 		grid_texture = rd->texture_create(tf, RD::TextureView(), texdata);
-		//sdf
-		tf.format = RD::DATA_FORMAT_R8_SNORM;
-		texdata.write[0] = sdf8.to_byte_array();
-		grid_texture_sdf = rd->texture_create(tf, RD::TextureView(), texdata);
 	}
 }
 
@@ -755,8 +745,7 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 			light_environment_tex = rd->texture_create(tfp, RD::TextureView(), tdata);
 
 #ifdef DEBUG_TEXTURES
-			panorama_tex->convert(Image::FORMAT_RGB8);
-			panorama_tex->save_png("res://0_panorama.png");
+			panorama_tex->save_exr("res://0_panorama.exr", false);
 #endif
 		}
 	}
@@ -770,7 +759,6 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 	RID lights_buffer;
 	RID triangle_cell_indices_buffer;
 	RID grid_texture;
-	RID grid_texture_sdf;
 	RID seams_buffer;
 	RID probe_positions_buffer;
 
@@ -783,11 +771,10 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 	rd->free(lights_buffer);                \
 	rd->free(triangle_cell_indices_buffer); \
 	rd->free(grid_texture);                 \
-	rd->free(grid_texture_sdf);             \
 	rd->free(seams_buffer);                 \
 	rd->free(probe_positions_buffer);
 
-	_create_acceleration_structures(rd, atlas_size, atlas_slices, bounds, grid_size, probe_positions, p_generate_probes, slice_triangle_count, slice_seam_count, vertex_buffer, triangle_buffer, box_buffer, lights_buffer, triangle_cell_indices_buffer, probe_positions_buffer, grid_texture, grid_texture_sdf, seams_buffer, p_step_function, p_bake_userdata);
+	_create_acceleration_structures(rd, atlas_size, atlas_slices, bounds, grid_size, probe_positions, p_generate_probes, slice_triangle_count, slice_seam_count, vertex_buffer, triangle_buffer, box_buffer, lights_buffer, triangle_cell_indices_buffer, probe_positions_buffer, grid_texture, seams_buffer, p_step_function, p_bake_userdata);
 
 	if (p_step_function) {
 		p_step_function(0.47, TTR("Preparing shaders"), p_bake_userdata, true);
@@ -883,27 +870,20 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 			RD::Uniform u;
 			u.uniform_type = RD::UNIFORM_TYPE_TEXTURE;
 			u.binding = 9;
-			u.ids.push_back(grid_texture_sdf);
-			base_uniforms.push_back(u);
-		}
-		{
-			RD::Uniform u;
-			u.uniform_type = RD::UNIFORM_TYPE_TEXTURE;
-			u.binding = 10;
 			u.ids.push_back(albedo_array_tex);
 			base_uniforms.push_back(u);
 		}
 		{
 			RD::Uniform u;
 			u.uniform_type = RD::UNIFORM_TYPE_TEXTURE;
-			u.binding = 11;
+			u.binding = 10;
 			u.ids.push_back(emission_array_tex);
 			base_uniforms.push_back(u);
 		}
 		{
 			RD::Uniform u;
 			u.uniform_type = RD::UNIFORM_TYPE_SAMPLER;
-			u.binding = 12;
+			u.binding = 11;
 			u.ids.push_back(sampler);
 			base_uniforms.push_back(u);
 		}
@@ -937,13 +917,11 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 		Ref<Image> img;
 		img.instance();
 		img->create(atlas_size.width, atlas_size.height, false, Image::FORMAT_RGBAF, s);
-		img->convert(Image::FORMAT_RGBA8);
-		img->save_png("res://1_position_" + itos(i) + ".png");
+		img->save_exr("res://1_position_" + itos(i) + ".exr", false);
 
 		s = rd->texture_get_data(normal_tex, i);
 		img->create(atlas_size.width, atlas_size.height, false, Image::FORMAT_RGBAH, s);
-		img->convert(Image::FORMAT_RGBA8);
-		img->save_png("res://1_normal_" + itos(i) + ".png");
+		img->save_exr("res://1_normal_" + itos(i) + ".exr", false);
 	}
 #endif
 
@@ -966,27 +944,27 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 	}
 	ERR_FAIL_COND_V(err != OK, BAKE_ERROR_LIGHTMAP_CANT_PRE_BAKE_MESHES);
 
-	//unoccluder
+	// Unoccluder
 	RID compute_shader_unocclude = rd->shader_create_from_bytecode(compute_shader->get_bytecode("unocclude"));
 	ERR_FAIL_COND_V(compute_shader_unocclude.is_null(), BAKE_ERROR_LIGHTMAP_CANT_PRE_BAKE_MESHES); // internal check, should not happen
 	RID compute_shader_unocclude_pipeline = rd->compute_pipeline_create(compute_shader_unocclude);
 
-	//direct light
+	// Direct light
 	RID compute_shader_primary = rd->shader_create_from_bytecode(compute_shader->get_bytecode("primary"));
 	ERR_FAIL_COND_V(compute_shader_primary.is_null(), BAKE_ERROR_LIGHTMAP_CANT_PRE_BAKE_MESHES); // internal check, should not happen
 	RID compute_shader_primary_pipeline = rd->compute_pipeline_create(compute_shader_primary);
 
-	//indirect light
+	// Indirect light
 	RID compute_shader_secondary = rd->shader_create_from_bytecode(compute_shader->get_bytecode("secondary"));
 	ERR_FAIL_COND_V(compute_shader_secondary.is_null(), BAKE_ERROR_LIGHTMAP_CANT_PRE_BAKE_MESHES); //internal check, should not happen
 	RID compute_shader_secondary_pipeline = rd->compute_pipeline_create(compute_shader_secondary);
 
-	//dilate
+	// Dilate
 	RID compute_shader_dilate = rd->shader_create_from_bytecode(compute_shader->get_bytecode("dilate"));
 	ERR_FAIL_COND_V(compute_shader_dilate.is_null(), BAKE_ERROR_LIGHTMAP_CANT_PRE_BAKE_MESHES); //internal check, should not happen
 	RID compute_shader_dilate_pipeline = rd->compute_pipeline_create(compute_shader_dilate);
 
-	//dilate
+	// Light probes
 	RID compute_shader_light_probes = rd->shader_create_from_bytecode(compute_shader->get_bytecode("light_probes"));
 	ERR_FAIL_COND_V(compute_shader_light_probes.is_null(), BAKE_ERROR_LIGHTMAP_CANT_PRE_BAKE_MESHES); //internal check, should not happen
 	RID compute_shader_light_probes_pipeline = rd->compute_pipeline_create(compute_shader_light_probes);
@@ -1153,8 +1131,7 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 		Ref<Image> img;
 		img.instance();
 		img->create(atlas_size.width, atlas_size.height, false, Image::FORMAT_RGBAH, s);
-		img->convert(Image::FORMAT_RGBA8);
-		img->save_png("res://2_light_primary_" + itos(i) + ".png");
+		img->save_exr("res://2_light_primary_" + itos(i) + ".exr", false);
 	}
 #endif
 
@@ -1212,7 +1189,7 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 				RD::Uniform u;
 				u.uniform_type = RD::UNIFORM_TYPE_TEXTURE;
 				u.binding = 6;
-				u.ids.push_back(light_environment_tex); //reuse unocclude tex
+				u.ids.push_back(light_environment_tex);
 				uniforms.push_back(u);
 			}
 		}
@@ -1298,7 +1275,15 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 					}
 				}
 			}
+
+			if (b == 0) {
+				// This disables the environment for subsequent bounces
+				push_constant.environment_xform[3] = -99.0f;
+			}
 		}
+
+		// Restore the correct environment transform
+		push_constant.environment_xform[3] = 0.0f;
 	}
 
 	/* LIGHPROBES */
@@ -1449,8 +1434,7 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 		Ref<Image> img;
 		img.instance();
 		img->create(atlas_size.width, atlas_size.height, false, Image::FORMAT_RGBAH, s);
-		img->convert(Image::FORMAT_RGBA8);
-		img->save_png("res://4_light_secondary_" + itos(i) + ".png");
+		img->save_exr("res://4_light_secondary_" + itos(i) + ".exr", false);
 	}
 #endif
 
@@ -1582,6 +1566,11 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 		clear_colors.push_back(Color(0, 0, 0, 1));
 		for (int i = 0; i < atlas_slices; i++) {
 			int subslices = (p_bake_sh ? 4 : 1);
+
+			if (slice_seam_count[i] == 0) {
+				continue;
+			}
+
 			for (int k = 0; k < subslices; k++) {
 				RasterSeamsPushConstant seams_push_constant;
 				seams_push_constant.slice = uint32_t(i * subslices + k);
@@ -1654,8 +1643,7 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 		Ref<Image> img;
 		img.instance();
 		img->create(atlas_size.width, atlas_size.height, false, Image::FORMAT_RGBAH, s);
-		img->convert(Image::FORMAT_RGBA8);
-		img->save_png("res://5_blendseams" + itos(i) + ".png");
+		img->save_exr("res://5_blendseams" + itos(i) + ".exr", false);
 	}
 #endif
 	if (p_step_function) {
@@ -1674,7 +1662,7 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 	if (probe_positions.size() > 0) {
 		probe_values.resize(probe_positions.size() * 9);
 		Vector<uint8_t> probe_data = rd->buffer_get_data(light_probe_buffer);
-		copymem(probe_values.ptrw(), probe_data.ptr(), probe_data.size());
+		memcpy(probe_values.ptrw(), probe_data.ptr(), probe_data.size());
 		rd->free(light_probe_buffer);
 
 #ifdef DEBUG_TEXTURES
@@ -1682,7 +1670,7 @@ LightmapperRD::BakeError LightmapperRD::bake(BakeQuality p_quality, bool p_use_d
 			Ref<Image> img2;
 			img2.instance();
 			img2->create(probe_values.size(), 1, false, Image::FORMAT_RGBAF, probe_data);
-			img2->save_png("res://6_lightprobes.png");
+			img2->save_exr("res://6_lightprobes.exr", false);
 		}
 #endif
 	}
@@ -1743,7 +1731,7 @@ Vector<Color> LightmapperRD::get_bake_probe_sh(int p_probe) const {
 	ERR_FAIL_INDEX_V(p_probe, probe_positions.size(), Vector<Color>());
 	Vector<Color> ret;
 	ret.resize(9);
-	copymem(ret.ptrw(), &probe_values[p_probe * 9], sizeof(Color) * 9);
+	memcpy(ret.ptrw(), &probe_values[p_probe * 9], sizeof(Color) * 9);
 	return ret;
 }
 
diff --git a/modules/lightmapper_rd/lightmapper_rd.h b/modules/lightmapper_rd/lightmapper_rd.h
index f2a826a447..7ab7f34464 100644
--- a/modules/lightmapper_rd/lightmapper_rd.h
+++ b/modules/lightmapper_rd/lightmapper_rd.h
@@ -231,7 +231,7 @@ class LightmapperRD : public Lightmapper {
 	Vector<Color> probe_values;
 
 	BakeError _blit_meshes_into_atlas(int p_max_texture_size, Vector<Ref<Image>> &albedo_images, Vector<Ref<Image>> &emission_images, AABB &bounds, Size2i &atlas_size, int &atlas_slices, BakeStepFunc p_step_function, void *p_bake_userdata);
-	void _create_acceleration_structures(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, AABB &bounds, int grid_size, Vector<Probe> &probe_positions, GenerateProbes p_generate_probes, Vector<int> &slice_triangle_count, Vector<int> &slice_seam_count, RID &vertex_buffer, RID &triangle_buffer, RID &box_buffer, RID &lights_buffer, RID &triangle_cell_indices_buffer, RID &probe_positions_buffer, RID &grid_texture, RID &grid_texture_sdf, RID &seams_buffer, BakeStepFunc p_step_function, void *p_bake_userdata);
+	void _create_acceleration_structures(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, AABB &bounds, int grid_size, Vector<Probe> &probe_positions, GenerateProbes p_generate_probes, Vector<int> &slice_triangle_count, Vector<int> &slice_seam_count, RID &vertex_buffer, RID &triangle_buffer, RID &box_buffer, RID &lights_buffer, RID &triangle_cell_indices_buffer, RID &probe_positions_buffer, RID &grid_texture, RID &seams_buffer, BakeStepFunc p_step_function, void *p_bake_userdata);
 	void _raster_geometry(RenderingDevice *rd, Size2i atlas_size, int atlas_slices, int grid_size, AABB bounds, float p_bias, Vector<int> slice_triangle_count, RID position_tex, RID unocclude_tex, RID normal_tex, RID raster_depth_buffer, RID rasterize_shader, RID raster_base_uniform);
 
 public:
diff --git a/modules/lightmapper_rd/lm_blendseams.glsl b/modules/lightmapper_rd/lm_blendseams.glsl
index e47e5fcc51..374c48082e 100644
--- a/modules/lightmapper_rd/lm_blendseams.glsl
+++ b/modules/lightmapper_rd/lm_blendseams.glsl
@@ -7,7 +7,7 @@ triangles = "#define MODE_TRIANGLES";
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #include "lm_common_inc.glsl"
 
@@ -74,7 +74,7 @@ void main() {
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #include "lm_common_inc.glsl"
 
diff --git a/modules/lightmapper_rd/lm_common_inc.glsl b/modules/lightmapper_rd/lm_common_inc.glsl
index f8a0cd16de..1581639036 100644
--- a/modules/lightmapper_rd/lm_common_inc.glsl
+++ b/modules/lightmapper_rd/lm_common_inc.glsl
@@ -84,9 +84,8 @@ layout(set = 0, binding = 7, std430) restrict readonly buffer Probes {
 probe_positions;
 
 layout(set = 0, binding = 8) uniform utexture3D grid;
-layout(set = 0, binding = 9) uniform texture3D grid_sdf;
 
-layout(set = 0, binding = 10) uniform texture2DArray albedo_tex;
-layout(set = 0, binding = 11) uniform texture2DArray emission_tex;
+layout(set = 0, binding = 9) uniform texture2DArray albedo_tex;
+layout(set = 0, binding = 10) uniform texture2DArray emission_tex;
 
-layout(set = 0, binding = 12) uniform sampler linear_sampler;
+layout(set = 0, binding = 11) uniform sampler linear_sampler;
diff --git a/modules/lightmapper_rd/lm_compute.glsl b/modules/lightmapper_rd/lm_compute.glsl
index eb9d817f99..9ca40535f9 100644
--- a/modules/lightmapper_rd/lm_compute.glsl
+++ b/modules/lightmapper_rd/lm_compute.glsl
@@ -10,7 +10,7 @@ light_probes = "#define MODE_LIGHT_PROBES";
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 // One 2D local group focusing in one layer at a time, though all
 // in parallel (no barriers) makes more sense than a 3D local group
@@ -96,15 +96,22 @@ params;
 bool ray_hits_triangle(vec3 from, vec3 dir, float max_dist, vec3 p0, vec3 p1, vec3 p2, out float r_distance, out vec3 r_barycentric) {
 	const vec3 e0 = p1 - p0;
 	const vec3 e1 = p0 - p2;
-	vec3 triangleNormal = cross(e1, e0);
+	vec3 triangle_normal = cross(e1, e0);
 
-	const vec3 e2 = (1.0 / dot(triangleNormal, dir)) * (p0 - from);
+	float n_dot_dir = dot(triangle_normal, dir);
+	// The normal is unnormalized (length = 2 * triangle area): keep the cutoff tiny so small triangles are still hit.
+	if (abs(n_dot_dir) < 0.00001) {
+		return false;
+	}
+
+	const vec3 e2 = (p0 - from) / n_dot_dir;
 	const vec3 i = cross(dir, e2);
 
 	r_barycentric.y = dot(i, e1);
 	r_barycentric.z = dot(i, e0);
 	r_barycentric.x = 1.0 - (r_barycentric.z + r_barycentric.y);
-	r_distance = dot(triangleNormal, e2);
+	r_distance = dot(triangle_normal, e2);
+	// A hit must lie past the bias, before max_dist and inside the triangle (all barycentrics >= 0).
 	return (r_distance > params.bias) && (r_distance < max_dist) && all(greaterThanEqual(r_barycentric, vec3(0.0)));
 }
 
@@ -307,8 +314,6 @@ void main() {
 				continue;
 			}
 
-			d /= lights.data[i].range;
-
 			attenuation = get_omni_attenuation(d, 1.0 / lights.data[i].range, lights.data[i].attenuation);
 
 			if (lights.data[i].type == LIGHT_TYPE_SPOT) {
@@ -410,7 +415,7 @@ void main() {
 		uint tidx;
 		vec3 barycentric;
 
-		vec3 light;
+		vec3 light = vec3(0.0);
 		if (trace_ray(position + ray_dir * params.bias, position + ray_dir * length(params.world_size), tidx, barycentric)) {
 			//hit a triangle
 			vec2 uv0 = vertices.data[triangles.data[tidx].indices.x].uv;
@@ -419,8 +424,8 @@ void main() {
 			vec3 uvw = vec3(barycentric.x * uv0 + barycentric.y * uv1 + barycentric.z * uv2, float(triangles.data[tidx].slice));
 
 			light = textureLod(sampler2DArray(source_light, linear_sampler), uvw, 0.0).rgb;
-		} else {
-			//did not hit a triangle, reach out for the sky
+		} else if (params.env_transform[0][3] == 0.0) { // Use env_transform[0][3] to indicate when we are computing the first bounce
+			// Did not hit a triangle, reach out for the sky
 			vec3 sky_dir = normalize(mat3(params.env_transform) * ray_dir);
 
 			vec2 st = vec2(
diff --git a/modules/lightmapper_rd/lm_raster.glsl b/modules/lightmapper_rd/lm_raster.glsl
index 6c2904192b..55ca193cc1 100644
--- a/modules/lightmapper_rd/lm_raster.glsl
+++ b/modules/lightmapper_rd/lm_raster.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #include "lm_common_inc.glsl"
 
@@ -56,7 +56,7 @@ void main() {
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #include "lm_common_inc.glsl"
 
diff --git a/modules/mbedtls/crypto_mbedtls.cpp b/modules/mbedtls/crypto_mbedtls.cpp
index 73931b0365..987306af2a 100644
--- a/modules/mbedtls/crypto_mbedtls.cpp
+++ b/modules/mbedtls/crypto_mbedtls.cpp
@@ -409,7 +409,7 @@ Vector<uint8_t> CryptoMbedTLS::sign(HashingContext::HashType p_hash_type, Vector
 	int ret = mbedtls_pk_sign(&(key->pkey), type, p_hash.ptr(), size, buf, &sig_size, mbedtls_ctr_drbg_random, &ctr_drbg);
 	ERR_FAIL_COND_V_MSG(ret, out, "Error while signing: " + itos(ret));
 	out.resize(sig_size);
-	copymem(out.ptrw(), buf, sig_size);
+	memcpy(out.ptrw(), buf, sig_size);
 	return out;
 }
 
@@ -432,7 +432,7 @@ Vector<uint8_t> CryptoMbedTLS::encrypt(Ref<CryptoKey> p_key, Vector<uint8_t> p_p
 	int ret = mbedtls_pk_encrypt(&(key->pkey), p_plaintext.ptr(), p_plaintext.size(), buf, &size, sizeof(buf), mbedtls_ctr_drbg_random, &ctr_drbg);
 	ERR_FAIL_COND_V_MSG(ret, out, "Error while encrypting: " + itos(ret));
 	out.resize(size);
-	copymem(out.ptrw(), buf, size);
+	memcpy(out.ptrw(), buf, size);
 	return out;
 }
 
@@ -446,6 +446,6 @@ Vector<uint8_t> CryptoMbedTLS::decrypt(Ref<CryptoKey> p_key, Vector<uint8_t> p_c
 	int ret = mbedtls_pk_decrypt(&(key->pkey), p_ciphertext.ptr(), p_ciphertext.size(), buf, &size, sizeof(buf), mbedtls_ctr_drbg_random, &ctr_drbg);
 	ERR_FAIL_COND_V_MSG(ret, out, "Error while decrypting: " + itos(ret));
 	out.resize(size);
-	copymem(out.ptrw(), buf, size);
+	memcpy(out.ptrw(), buf, size);
 	return out;
 }
diff --git a/modules/mbedtls/packet_peer_mbed_dtls.cpp b/modules/mbedtls/packet_peer_mbed_dtls.cpp
index 8a6cdfb131..342ded6ea1 100644
--- a/modules/mbedtls/packet_peer_mbed_dtls.cpp
+++ b/modules/mbedtls/packet_peer_mbed_dtls.cpp
@@ -74,7 +74,7 @@ int PacketPeerMbedDTLS::bio_recv(void *ctx, unsigned char *buf, size_t len) {
 	if (err != OK) {
 		return MBEDTLS_ERR_SSL_INTERNAL_ERROR;
 	}
-	copymem(buf, buffer, buffer_size);
+	memcpy(buf, buffer, buffer_size);
 	return buffer_size;
 }
 
@@ -89,8 +89,8 @@ int PacketPeerMbedDTLS::_set_cookie() {
 	uint8_t client_id[18];
 	IP_Address addr = base->get_packet_address();
 	uint16_t port = base->get_packet_port();
-	copymem(client_id, addr.get_ipv6(), 16);
-	copymem(&client_id[16], (uint8_t *)&port, 2);
+	memcpy(client_id, addr.get_ipv6(), 16);
+	memcpy(&client_id[16], (uint8_t *)&port, 2);
 	return mbedtls_ssl_set_client_transport_id(ssl_ctx->get_context(), client_id, 18);
 }
 
diff --git a/modules/mbedtls/stream_peer_mbedtls.cpp b/modules/mbedtls/stream_peer_mbedtls.cpp
index b39a6ecc2f..8e40451806 100644
--- a/modules/mbedtls/stream_peer_mbedtls.cpp
+++ b/modules/mbedtls/stream_peer_mbedtls.cpp
@@ -242,7 +242,7 @@ void StreamPeerMbedTLS::poll() {
 		return;
 	}
 
-	// We could pass NULL as second parameter, but some behaviour sanitizers don't seem to like that.
+	// We could pass nullptr as second parameter, but some behaviour sanitizers don't seem to like that.
 	// Passing a 1 byte buffer to workaround it.
 	uint8_t byte;
 	int ret = mbedtls_ssl_read(ssl_ctx->get_context(), &byte, 0);
diff --git a/modules/minimp3/audio_stream_mp3.cpp b/modules/minimp3/audio_stream_mp3.cpp
index aaa05a910c..24ec206191 100644
--- a/modules/minimp3/audio_stream_mp3.cpp
+++ b/modules/minimp3/audio_stream_mp3.cpp
@@ -172,7 +172,7 @@ void AudioStreamMP3::set_data(const Vector<uint8_t> &p_data) {
 	clear_data();
 
 	data = memalloc(src_data_len);
-	copymem(data, src_datar, src_data_len);
+	memcpy(data, src_datar, src_data_len);
 	data_len = src_data_len;
 }
 
@@ -183,7 +183,7 @@ Vector<uint8_t> AudioStreamMP3::get_data() const {
 		vdata.resize(data_len);
 		{
 			uint8_t *w = vdata.ptrw();
-			copymem(w, data, data_len);
+			memcpy(w, data, data_len);
 		}
 	}
 
diff --git a/modules/mono/csharp_script.cpp b/modules/mono/csharp_script.cpp
index 43f57a7caa..09f3ea1f50 100644
--- a/modules/mono/csharp_script.cpp
+++ b/modules/mono/csharp_script.cpp
@@ -2016,7 +2016,7 @@ void CSharpInstance::connect_event_signals() {
 		StringName signal_name = event_signal.field->get_name();
 
 		// TODO: Use pooling for ManagedCallable instances.
-		auto event_signal_callable = memnew(EventSignalCallable(owner, &event_signal));
+		EventSignalCallable *event_signal_callable = memnew(EventSignalCallable(owner, &event_signal));
 
 		Callable callable(event_signal_callable);
 		connected_event_signals.push_back(callable);
@@ -2027,7 +2027,7 @@ void CSharpInstance::connect_event_signals() {
 void CSharpInstance::disconnect_event_signals() {
 	for (const List<Callable>::Element *E = connected_event_signals.front(); E; E = E->next()) {
 		const Callable &callable = E->get();
-		auto event_signal_callable = static_cast<const EventSignalCallable *>(callable.get_custom());
+		const EventSignalCallable *event_signal_callable = static_cast<const EventSignalCallable *>(callable.get_custom());
 		owner->disconnect(event_signal_callable->get_signal(), callable);
 	}
 
diff --git a/modules/mono/editor/godotsharp_export.cpp b/modules/mono/editor/godotsharp_export.cpp
index 4b858c0e82..54dbaebf38 100644
--- a/modules/mono/editor/godotsharp_export.cpp
+++ b/modules/mono/editor/godotsharp_export.cpp
@@ -91,7 +91,7 @@ Error get_assembly_dependencies(GDMonoAssembly *p_assembly, MonoAssemblyName *re
 
 		mono_assembly_get_assemblyref(image, i, reusable_aname);
 
-		GDMonoAssembly *ref_assembly = NULL;
+		GDMonoAssembly *ref_assembly = nullptr;
 		if (!GDMono::get_singleton()->load_assembly(ref_name, reusable_aname, &ref_assembly, /* refonly: */ true, p_search_dirs)) {
 			ERR_FAIL_V_MSG(ERR_CANT_RESOLVE, "Cannot load assembly (refonly): '" + ref_name + "'.");
 		}
diff --git a/modules/mono/mono_gd/gd_mono.cpp b/modules/mono/mono_gd/gd_mono.cpp
index 560788fb3a..c523d381f6 100644
--- a/modules/mono/mono_gd/gd_mono.cpp
+++ b/modules/mono/mono_gd/gd_mono.cpp
@@ -119,11 +119,11 @@ void gd_mono_profiler_init() {
 
 	const String env_var_name = "MONO_ENV_OPTIONS";
 	if (OS::get_singleton()->has_environment(env_var_name)) {
-		const auto mono_env_ops = OS::get_singleton()->get_environment(env_var_name);
+		const String mono_env_ops = OS::get_singleton()->get_environment(env_var_name);
 		// Usually MONO_ENV_OPTIONS looks like:   --profile=jb:prof=timeline,ctl=remote,host=127.0.0.1:55467
 		const String prefix = "--profile=";
 		if (mono_env_ops.begins_with(prefix)) {
-			const auto ops = mono_env_ops.substr(prefix.length(), mono_env_ops.length());
+			const String ops = mono_env_ops.substr(prefix.length(), mono_env_ops.length());
 			mono_profiler_load(ops.utf8());
 		}
 	}
diff --git a/modules/mono/mono_gd/gd_mono_wasm_m2n.h b/modules/mono/mono_gd/gd_mono_wasm_m2n.h
index 159a2ed7b6..366662ff81 100644
--- a/modules/mono/mono_gd/gd_mono_wasm_m2n.h
+++ b/modules/mono/mono_gd/gd_mono_wasm_m2n.h
@@ -176,7 +176,7 @@ T m2n_arg_cast(Mono_InterpMethodArguments *p_margs, size_t p_idx) {
 	} else if constexpr (cookie == 'F') {
 		return *reinterpret_cast<float *>(&p_margs->fargs[fidx(p_idx)]);
 	} else if constexpr (cookie == 'D') {
-		return (T)(size_t)p_margs->fargs[p_idx];
+		return (T)p_margs->fargs[p_idx];
 	}
 }
 
diff --git a/modules/mono/mono_gd/support/ios_support.mm b/modules/mono/mono_gd/support/ios_support.mm
index cdee04edcf..23424fbaf9 100644
--- a/modules/mono/mono_gd/support/ios_support.mm
+++ b/modules/mono/mono_gd/support/ios_support.mm
@@ -57,9 +57,9 @@ void ios_mono_log_callback(const char *log_domain, const char *log_level, const
 }
 
 void initialize() {
-	mono_dllmap_insert(NULL, "System.Native", NULL, "__Internal", NULL);
-	mono_dllmap_insert(NULL, "System.IO.Compression.Native", NULL, "__Internal", NULL);
-	mono_dllmap_insert(NULL, "System.Security.Cryptography.Native.Apple", NULL, "__Internal", NULL);
+	mono_dllmap_insert(nullptr, "System.Native", nullptr, "__Internal", nullptr);
+	mono_dllmap_insert(nullptr, "System.IO.Compression.Native", nullptr, "__Internal", nullptr);
+	mono_dllmap_insert(nullptr, "System.Security.Cryptography.Native.Apple", nullptr, "__Internal", nullptr);
 
 #ifdef IOS_DEVICE
 	// This function is defined in an auto-generated source file
@@ -85,7 +85,7 @@ void cleanup() {
 GD_PINVOKE_EXPORT const char *xamarin_get_locale_country_code() {
 	NSLocale *locale = [NSLocale currentLocale];
 	NSString *countryCode = [locale objectForKey:NSLocaleCountryCode];
-	if (countryCode == NULL) {
+	if (countryCode == nullptr) {
 		return strdup("US");
 	}
 	return strdup([countryCode UTF8String]);
diff --git a/modules/pvr/image_compress_pvrtc.cpp b/modules/pvr/image_compress_pvrtc.cpp
index d2d8976694..6cb9837f49 100644
--- a/modules/pvr/image_compress_pvrtc.cpp
+++ b/modules/pvr/image_compress_pvrtc.cpp
@@ -65,7 +65,7 @@ static void _compress_pvrtc1_4bpp(Image *p_img) {
 			img->get_mipmap_offset_size_and_dimensions(i, ofs, size, w, h);
 			Javelin::RgbaBitmap bm(w, h);
 			void *dst = (void *)bm.GetData();
-			copymem(dst, &r[ofs], size);
+			memcpy(dst, &r[ofs], size);
 			Javelin::ColorRgba<unsigned char> *dp = bm.GetData();
 			for (int j = 0; j < size / 4; j++) {
 				// Red and blue colors are swapped.
diff --git a/modules/raycast/SCsub b/modules/raycast/SCsub
new file mode 100644
index 0000000000..68e9df5263
--- /dev/null
+++ b/modules/raycast/SCsub
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+
+Import("env")
+Import("env_modules")
+
+embree_src = [
+    "common/sys/sysinfo.cpp",
+    "common/sys/alloc.cpp",
+    "common/sys/filename.cpp",
+    "common/sys/library.cpp",
+    "common/sys/thread.cpp",
+    "common/sys/string.cpp",
+    "common/sys/regression.cpp",
+    "common/sys/mutex.cpp",
+    "common/sys/condition.cpp",
+    "common/sys/barrier.cpp",
+    "common/math/constants.cpp",
+    "common/simd/sse.cpp",
+    "common/lexers/stringstream.cpp",
+    "common/lexers/tokenstream.cpp",
+    "common/tasking/taskschedulerinternal.cpp",
+    "common/algorithms/parallel_for.cpp",
+    "common/algorithms/parallel_reduce.cpp",
+    "common/algorithms/parallel_prefix_sum.cpp",
+    "common/algorithms/parallel_for_for.cpp",
+    "common/algorithms/parallel_for_for_prefix_sum.cpp",
+    "common/algorithms/parallel_partition.cpp",
+    "common/algorithms/parallel_sort.cpp",
+    "common/algorithms/parallel_set.cpp",
+    "common/algorithms/parallel_map.cpp",
+    "common/algorithms/parallel_filter.cpp",
+    "kernels/common/device.cpp",
+    "kernels/common/stat.cpp",
+    "kernels/common/acceln.cpp",
+    "kernels/common/accelset.cpp",
+    "kernels/common/state.cpp",
+    "kernels/common/rtcore.cpp",
+    "kernels/common/rtcore_builder.cpp",
+    "kernels/common/scene.cpp",
+    "kernels/common/alloc.cpp",
+    "kernels/common/geometry.cpp",
+    "kernels/common/scene_triangle_mesh.cpp",
+    "kernels/geometry/primitive4.cpp",
+    "kernels/builders/primrefgen.cpp",
+    "kernels/bvh/bvh.cpp",
+    "kernels/bvh/bvh_statistics.cpp",
+    "kernels/bvh/bvh4_factory.cpp",
+    "kernels/bvh/bvh8_factory.cpp",
+    "kernels/bvh/bvh_collider.cpp",
+    "kernels/bvh/bvh_rotate.cpp",
+    "kernels/bvh/bvh_refit.cpp",
+    "kernels/bvh/bvh_builder.cpp",
+    "kernels/bvh/bvh_builder_morton.cpp",
+    "kernels/bvh/bvh_builder_sah.cpp",
+    "kernels/bvh/bvh_builder_sah_spatial.cpp",
+    "kernels/bvh/bvh_builder_sah_mb.cpp",
+    "kernels/bvh/bvh_builder_twolevel.cpp",
+    "kernels/bvh/bvh_intersector1_bvh4.cpp",
+]
+
+embree_dir = "#thirdparty/embree-aarch64/"
+# Compile the vendored embree sources in a dedicated clone of the modules environment.
+env_embree = env_modules.Clone()
+embree_sources = [embree_dir + file for file in embree_src]
+env_embree.Prepend(CPPPATH=[embree_dir, embree_dir + "include"])
+env_embree.Append(CPPFLAGS=["-DEMBREE_TARGET_SSE2", "-DEMBREE_LOWEST_ISA", "-DTASKING_INTERNAL", "-DNDEBUG"])
+# Non-MSVC toolchains get explicit SSE2/XSAVE switches, plus stack realignment on Windows.
+if not env_embree.msvc:
+    env_embree.Append(CPPFLAGS=["-msse2", "-mxsave"])
+    if env["platform"] == "windows":
+        env_embree.Append(CPPFLAGS=["-mstackrealign"])
+# On Windows, psapi is linked through the shared env; MSVC builds also define the SSE macros by hand.
+if env["platform"] == "windows":
+    if env.msvc:
+        env.Append(LINKFLAGS=["psapi.lib"])
+        env_embree.Append(CPPFLAGS=["-D__SSE2__", "-D__SSE__"])
+    else:
+        env.Append(LIBS=["psapi"])
+# Warnings are disabled for the embree environment before its sources are registered.
+env_embree.disable_warnings()
+env_embree.add_source_files(env.modules_sources, embree_sources)
+# The module's own *.cpp files use a second clone that only adds the embree include paths.
+env_raycast = env_modules.Clone()
+env_raycast.Prepend(CPPPATH=[embree_dir, embree_dir + "include", embree_dir + "common"])
+
+env_raycast.add_source_files(env.modules_sources, "*.cpp")
diff --git a/modules/raycast/config.py b/modules/raycast/config.py
new file mode 100644
index 0000000000..26493da41b
--- /dev/null
+++ b/modules/raycast/config.py
@@ -0,0 +1,12 @@
+def can_build(env, platform):
+    # The web target is always excluded; Android is limited to a fixed set of ABIs.
+    if platform == "javascript":
+        return False  # No SIMD support yet
+    if platform != "android":
+        return True
+    supported_android_archs = ("arm64v8", "x86", "x86_64")
+    return env["android_arch"] in supported_android_archs
+
+
+def configure(env):
+    pass  # Intentionally empty: nothing is done with env here.
diff --git a/modules/raycast/godot_update_embree.py b/modules/raycast/godot_update_embree.py
new file mode 100644
index 0000000000..db4fa95c21
--- /dev/null
+++ b/modules/raycast/godot_update_embree.py
@@ -0,0 +1,260 @@
+import glob, os, shutil, subprocess, re
+
+include_dirs = [
+    "common/tasking",
+    "kernels/bvh",
+    "kernels/builders",
+    "common/sys",
+    "kernels",
+    "kernels/common",
+    "common/math",
+    "common/algorithms",
+    "common/lexers",
+    "common/simd",
+    "include/embree3",
+    "kernels/subdiv",
+    "kernels/geometry",
+]
+
+cpp_files = [
+    "common/sys/sysinfo.cpp",
+    "common/sys/alloc.cpp",
+    "common/sys/filename.cpp",
+    "common/sys/library.cpp",
+    "common/sys/thread.cpp",
+    "common/sys/string.cpp",
+    "common/sys/regression.cpp",
+    "common/sys/mutex.cpp",
+    "common/sys/condition.cpp",
+    "common/sys/barrier.cpp",
+    "common/math/constants.cpp",
+    "common/simd/sse.cpp",
+    "common/lexers/stringstream.cpp",
+    "common/lexers/tokenstream.cpp",
+    "common/tasking/taskschedulerinternal.cpp",
+    "common/algorithms/parallel_for.cpp",
+    "common/algorithms/parallel_reduce.cpp",
+    "common/algorithms/parallel_prefix_sum.cpp",
+    "common/algorithms/parallel_for_for.cpp",
+    "common/algorithms/parallel_for_for_prefix_sum.cpp",
+    "common/algorithms/parallel_partition.cpp",
+    "common/algorithms/parallel_sort.cpp",
+    "common/algorithms/parallel_set.cpp",
+    "common/algorithms/parallel_map.cpp",
+    "common/algorithms/parallel_filter.cpp",
+    "kernels/common/device.cpp",
+    "kernels/common/stat.cpp",
+    "kernels/common/acceln.cpp",
+    "kernels/common/accelset.cpp",
+    "kernels/common/state.cpp",
+    "kernels/common/rtcore.cpp",
+    "kernels/common/rtcore_builder.cpp",
+    "kernels/common/scene.cpp",
+    "kernels/common/alloc.cpp",
+    "kernels/common/geometry.cpp",
+    "kernels/common/scene_triangle_mesh.cpp",
+    "kernels/geometry/primitive4.cpp",
+    "kernels/builders/primrefgen.cpp",
+    "kernels/bvh/bvh.cpp",
+    "kernels/bvh/bvh_statistics.cpp",
+    "kernels/bvh/bvh4_factory.cpp",
+    "kernels/bvh/bvh8_factory.cpp",
+    "kernels/bvh/bvh_collider.cpp",
+    "kernels/bvh/bvh_rotate.cpp",
+    "kernels/bvh/bvh_refit.cpp",
+    "kernels/bvh/bvh_builder.cpp",
+    "kernels/bvh/bvh_builder_morton.cpp",
+    "kernels/bvh/bvh_builder_sah.cpp",
+    "kernels/bvh/bvh_builder_sah_spatial.cpp",
+    "kernels/bvh/bvh_builder_sah_mb.cpp",
+    "kernels/bvh/bvh_builder_twolevel.cpp",
+    "kernels/bvh/bvh_intersector1.cpp",
+    "kernels/bvh/bvh_intersector1_bvh4.cpp",
+]
+
+os.chdir("../../thirdparty")
+
+# Clone before touching the vendored copy; check=True aborts here if the download fails.
+subprocess.run(["git", "clone", "https://github.com/lighttransport/embree-aarch64.git", "embree-tmp"], check=True)
+
+dir_name = "embree-aarch64"
+if os.path.exists(dir_name):
+    shutil.rmtree(dir_name)
+os.chdir("embree-tmp")
+commit_hash = str(subprocess.check_output(["git", "rev-parse", "HEAD"], universal_newlines=True)).strip()
+
+all_files = set(cpp_files)
+
+dest_dir = os.path.join("..", dir_name)
+# Every header found in the listed directories is copied alongside the selected .cpp files.
+for include_dir in include_dirs:
+    all_files.update(glob.iglob(os.path.join(include_dir, "*.h")))
+
+for src_path in all_files:
+    target_dir = os.path.join(dest_dir, os.path.dirname(src_path))
+    if not os.path.exists(target_dir):
+        os.makedirs(target_dir)
+    shutil.copy2(src_path, target_dir)
+
+with open(os.path.join(dest_dir, "kernels/hash.h"), "w") as hash_file:
+    hash_file.write(
+        f"""
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_HASH "{commit_hash}"
+"""
+    )
+
+with open(os.path.join(dest_dir, "kernels/config.h"), "w") as config_file:
+    config_file.write(
+        """
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+/* #undef EMBREE_RAY_MASK */
+/* #undef EMBREE_STAT_COUNTERS */
+/* #undef EMBREE_BACKFACE_CULLING */
+/* #undef EMBREE_BACKFACE_CULLING_CURVES */
+#define EMBREE_FILTER_FUNCTION
+/* #undef EMBREE_IGNORE_INVALID_RAYS */
+#define EMBREE_GEOMETRY_TRIANGLE
+/* #undef EMBREE_GEOMETRY_QUAD */
+/* #undef EMBREE_GEOMETRY_CURVE */
+/* #undef EMBREE_GEOMETRY_SUBDIVISION */
+/* #undef EMBREE_GEOMETRY_USER */
+/* #undef EMBREE_GEOMETRY_INSTANCE */
+/* #undef EMBREE_GEOMETRY_GRID */
+/* #undef EMBREE_GEOMETRY_POINT */
+/* #undef EMBREE_RAY_PACKETS */
+/* #undef EMBREE_COMPACT_POLYS */
+
+#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+  #define IF_ENABLED_TRIS(x) x
+#else
+  #define IF_ENABLED_TRIS(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+  #define IF_ENABLED_QUADS(x) x
+#else
+  #define IF_ENABLED_QUADS(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+  #define IF_ENABLED_CURVES_OR_POINTS(x) x
+#else
+  #define IF_ENABLED_CURVES_OR_POINTS(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_CURVE)
+  #define IF_ENABLED_CURVES(x) x
+#else
+  #define IF_ENABLED_CURVES(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_POINT)
+  #define IF_ENABLED_POINTS(x) x
+#else
+  #define IF_ENABLED_POINTS(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+  #define IF_ENABLED_SUBDIV(x) x
+#else
+  #define IF_ENABLED_SUBDIV(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+  #define IF_ENABLED_USER(x) x
+#else
+  #define IF_ENABLED_USER(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+  #define IF_ENABLED_INSTANCE(x) x
+#else
+  #define IF_ENABLED_INSTANCE(x)
+#endif
+
+#if defined(EMBREE_GEOMETRY_GRID)
+  #define IF_ENABLED_GRIDS(x) x
+#else
+  #define IF_ENABLED_GRIDS(x)
+#endif
+"""
+    )
+
+
+with open("CMakeLists.txt", "r") as cmake_file:
+    cmake_content = cmake_file.read()
+    major_version = int(re.compile(r"EMBREE_VERSION_MAJOR\s(\d+)").findall(cmake_content)[0])
+    minor_version = int(re.compile(r"EMBREE_VERSION_MINOR\s(\d+)").findall(cmake_content)[0])
+    patch_version = int(re.compile(r"EMBREE_VERSION_PATCH\s(\d+)").findall(cmake_content)[0])
+
+with open(os.path.join(dest_dir, "include/embree3/rtcore_config.h"), "w") as config_file:
+    config_file.write(
+        f"""
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define RTC_VERSION_MAJOR {major_version}
+#define RTC_VERSION_MINOR {minor_version}
+#define RTC_VERSION_PATCH {patch_version}
+#define RTC_VERSION {major_version}{minor_version:02d}{patch_version:02d}
+#define RTC_VERSION_STRING "{major_version}.{minor_version}.{patch_version}"
+
+#define RTC_MAX_INSTANCE_LEVEL_COUNT 1
+
+#define EMBREE_MIN_WIDTH 0
+#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH
+
+#define EMBREE_STATIC_LIB
+/* #undef EMBREE_API_NAMESPACE */
+
+#if defined(EMBREE_API_NAMESPACE)
+#  define RTC_NAMESPACE
+#  define RTC_NAMESPACE_BEGIN namespace  {{
+#  define RTC_NAMESPACE_END }}
+#  define RTC_NAMESPACE_USE using namespace ;
+#  define RTC_API_EXTERN_C
+#  undef EMBREE_API_NAMESPACE
+#else
+#  define RTC_NAMESPACE_BEGIN
+#  define RTC_NAMESPACE_END
+#  define RTC_NAMESPACE_USE
+#  if defined(__cplusplus)
+#    define RTC_API_EXTERN_C extern "C"
+#  else
+#    define RTC_API_EXTERN_C
+#  endif
+#endif
+
+#if defined(ISPC)
+#  define RTC_API_IMPORT extern "C" unmasked
+#  define RTC_API_EXPORT extern "C" unmasked
+#elif defined(EMBREE_STATIC_LIB)
+#  define RTC_API_IMPORT RTC_API_EXTERN_C
+#  define RTC_API_EXPORT RTC_API_EXTERN_C
+#elif defined(_WIN32)
+#  define RTC_API_IMPORT RTC_API_EXTERN_C __declspec(dllimport)
+#  define RTC_API_EXPORT RTC_API_EXTERN_C __declspec(dllexport)
+#else
+#  define RTC_API_IMPORT RTC_API_EXTERN_C
+#  define RTC_API_EXPORT RTC_API_EXTERN_C __attribute__ ((visibility ("default")))
+#endif
+
+#if defined(RTC_EXPORT_API)
+#  define RTC_API RTC_API_EXPORT
+#else
+#  define RTC_API RTC_API_IMPORT
+#endif
+"""
+    )
+
+os.chdir("..")
+shutil.rmtree("embree-tmp")
diff --git a/modules/raycast/lightmap_raycaster.cpp b/modules/raycast/lightmap_raycaster.cpp
new file mode 100644
index 0000000000..9039622d3d
--- /dev/null
+++ b/modules/raycast/lightmap_raycaster.cpp
@@ -0,0 +1,202 @@
+/*************************************************************************/
+/*  lightmap_raycaster.cpp                                               */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifdef TOOLS_ENABLED
+
+#include "lightmap_raycaster.h"
+
+// From Embree.
+#include <math/vec2.h>
+#include <math/vec3.h>
+
+#include <pmmintrin.h>
+
+using namespace embree;
+
+// Factory function: allocates a new Embree-backed raycaster with memnew and
+// returns it through the LightmapRaycaster base pointer.
+LightmapRaycaster *LightmapRaycasterEmbree::create_embree_raycaster() {
+	LightmapRaycasterEmbree *raycaster = memnew(LightmapRaycasterEmbree);
+	return raycaster;
+}
+
+// Installs create_embree_raycaster() as the create_function hook, so whoever
+// consults that hook gets Embree-backed raycasters.
+void LightmapRaycasterEmbree::make_default_raycaster() {
+	create_function = create_embree_raycaster;
+}
+
+// Intersection filter callback installed on every mesh by add_mesh().
+// Rewrites the hit record with interpolated vertex attributes and rejects hits
+// whose alpha texture sample is below 128.
+// p_args->geometryUserPtr is the LightmapRaycasterEmbree that add_mesh()
+// registered through rtcSetGeometryUserData().
+void LightmapRaycasterEmbree::filter_function(const struct RTCFilterFunctionNArguments *p_args) {
+	RTCHit *hit = (RTCHit *)p_args->hit;
+
+	unsigned int geomID = hit->geomID;
+	// Keep the original hit coordinates: the first interpolation below
+	// overwrites hit->u / hit->v, and the second one still needs them.
+	float u = hit->u;
+	float v = hit->v;
+
+	LightmapRaycasterEmbree *scene = (LightmapRaycasterEmbree *)p_args->geometryUserPtr;
+	RTCGeometry geom = rtcGetGeometry(scene->embree_scene, geomID);
+
+	// Attribute slot 0 holds the per-vertex UVs uploaded by add_mesh(). Two
+	// floats are written starting at hit->u (presumably landing in u and v;
+	// this relies on RTCHit storing them contiguously - confirm against Embree).
+	rtcInterpolate0(geom, hit->primID, hit->u, hit->v, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 0, &hit->u, 2);
+
+	if (scene->alpha_textures.has(geomID)) {
+		const AlphaTextureData &alpha_texture = scene->alpha_textures[geomID];
+
+		// Below the 128 threshold the hit is invalidated and nothing else is
+		// written to the hit record.
+		if (alpha_texture.sample(hit->u, hit->v) < 128) {
+			p_args->valid[0] = 0;
+			return;
+		}
+	}
+
+	// Attribute slot 1 holds the per-vertex normals; three floats are written
+	// starting at hit->Ng_x, using the coordinates saved above.
+	// NOTE(review): add_mesh() only creates slot 1 when normals were supplied;
+	// confirm what rtcInterpolate0 does for a mesh without that buffer.
+	rtcInterpolate0(geom, hit->primID, u, v, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 1, &hit->Ng_x, 3);
+}
+
+// Casts a single ray against embree_scene. r_ray is reinterpreted as an
+// RTCRayHit and updated in place by rtcIntersect1.
+// Returns true when the ray's geomID is no longer RTC_INVALID_GEOMETRY_ID,
+// i.e. some geometry was hit.
+bool LightmapRaycasterEmbree::intersect(Ray &r_ray) {
+	RTCIntersectContext intersect_context;
+	rtcInitIntersectContext(&intersect_context);
+
+	RTCRayHit *ray_hit = (RTCRayHit *)&r_ray;
+	rtcIntersect1(embree_scene, &intersect_context, ray_hit);
+
+	const bool has_hit = r_ray.geomID != RTC_INVALID_GEOMETRY_ID;
+	return has_hit;
+}
+
+// Batch variant: casts every ray of r_rays one after another through the
+// single-ray intersect(), updating the rays in place.
+void LightmapRaycasterEmbree::intersect(Vector<Ray> &r_rays) {
+	Ray *current_ray = r_rays.ptrw();
+	for (int remaining = r_rays.size(); remaining > 0; --remaining, ++current_ray) {
+		intersect(*current_ray);
+	}
+}
+
+// Registers an alpha texture for the mesh with geometry ID p_id.
+// Invalid references and zero-sized images are ignored. Otherwise the image's
+// size and raw data are copied into alpha_textures, where filter_function()
+// looks them up.
+void LightmapRaycasterEmbree::set_mesh_alpha_texture(Ref<Image> p_alpha_texture, unsigned int p_id) {
+	const bool has_pixels = p_alpha_texture.is_valid() && p_alpha_texture->get_size() != Vector2i();
+	if (!has_pixels) {
+		return;
+	}
+
+	AlphaTextureData alpha_data;
+	alpha_data.size = p_alpha_texture->get_size();
+	alpha_data.data = p_alpha_texture->get_data();
+	alpha_textures.insert(p_id, alpha_data);
+}
+
+// Bilinear interpolation of four corner values: c00..c10 and c01..c11 are
+// blended along x by tx, and the two results are blended along y by ty.
+float blerp(float c00, float c10, float c01, float c11, float tx, float ty) {
+	const float first_row = Math::lerp(c00, c10, tx);
+	const float second_row = Math::lerp(c01, c11, tx);
+	return Math::lerp(first_row, second_row, ty);
+}
+
+// Returns the bilinearly filtered alpha value at texture coordinates (u, v).
+// The coordinates are scaled by the texture size; the 2x2 texel neighbourhood
+// is clamped to the texture bounds, so out-of-range coordinates read edge
+// texels. data is indexed as one byte per texel, row by row.
+uint8_t LightmapRaycasterEmbree::AlphaTextureData::sample(float u, float v) const {
+	const float px = u * size.x;
+	const float py = v * size.y;
+	const int base_x = (int)px;
+	const int base_y = (int)py;
+
+	// Gather order: (0,0), (1,0), (0,1), (1,1) relative to the base texel.
+	uint8_t taps[4];
+	int tap = 0;
+	for (int dy = 0; dy < 2; ++dy) {
+		const int row = CLAMP(base_y + dy, 0, size.y - 1);
+		for (int dx = 0; dx < 2; ++dx) {
+			const int col = CLAMP(base_x + dx, 0, size.x - 1);
+			taps[tap++] = data[row * size.x + col];
+		}
+	}
+
+	const float frac_x = px - base_x;
+	const float frac_y = py - base_y;
+	return Math::round(blerp(taps[0], taps[1], taps[2], taps[3], frac_x, frac_y));
+}
+
+// Uploads one non-indexed triangle mesh into the Embree scene under geometry
+// ID p_id.
+//   p_vertices: triangle list, three consecutive vertices per triangle.
+//   p_normals:  optional; either empty or one normal per vertex.
+//   p_uv2s:     one UV per vertex, stored in vertex attribute slot 0.
+// The mesh is rejected (with an error) when the vertex count is not a multiple
+// of three or when the attribute arrays do not match the vertex count.
+void LightmapRaycasterEmbree::add_mesh(const Vector<Vector3> &p_vertices, const Vector<Vector3> &p_normals, const Vector<Vector2> &p_uv2s, unsigned int p_id) {
+	const int vertex_count = p_vertices.size();
+
+	// Validate everything before creating the geometry, so that a rejected
+	// mesh does not leak an RTCGeometry handle.
+	ERR_FAIL_COND(vertex_count % 3 != 0);
+	ERR_FAIL_COND(vertex_count != p_uv2s.size());
+	// Normals are optional, but when present there must be one per vertex;
+	// otherwise the copy loop below would index past the end of p_normals.
+	ERR_FAIL_COND(!p_normals.is_empty() && vertex_count != p_normals.size());
+
+	RTCGeometry embree_mesh = rtcNewGeometry(embree_device, RTC_GEOMETRY_TYPE_TRIANGLE);
+
+	// Slot 0: UVs, slot 1: normals (only allocated when normals were supplied).
+	rtcSetGeometryVertexAttributeCount(embree_mesh, 2);
+
+	Vec3fa *embree_vertices = (Vec3fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, sizeof(Vec3fa), vertex_count);
+	Vec2fa *embree_light_uvs = (Vec2fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 0, RTC_FORMAT_FLOAT2, sizeof(Vec2fa), vertex_count);
+	uint32_t *embree_triangles = (uint32_t *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, sizeof(uint32_t) * 3, vertex_count / 3);
+
+	Vec3fa *embree_normals = nullptr;
+	if (!p_normals.is_empty()) {
+		embree_normals = (Vec3fa *)rtcSetNewGeometryBuffer(embree_mesh, RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE, 1, RTC_FORMAT_FLOAT3, sizeof(Vec3fa), vertex_count);
+	}
+
+	for (int i = 0; i < vertex_count; i++) {
+		embree_vertices[i] = Vec3fa(p_vertices[i].x, p_vertices[i].y, p_vertices[i].z);
+		embree_light_uvs[i] = Vec2fa(p_uv2s[i].x, p_uv2s[i].y);
+		if (embree_normals != nullptr) {
+			embree_normals[i] = Vec3fa(p_normals[i].x, p_normals[i].y, p_normals[i].z);
+		}
+		// The index buffer holds vertex_count / 3 triples, i.e. vertex_count
+		// entries in total; the input is a plain triangle list, so the
+		// indices are simply 0..vertex_count-1.
+		embree_triangles[i] = i;
+	}
+
+	rtcCommitGeometry(embree_mesh);
+	rtcSetGeometryIntersectFilterFunction(embree_mesh, filter_function);
+	// filter_function() reads this pointer back through geometryUserPtr.
+	rtcSetGeometryUserData(embree_mesh, this);
+	rtcAttachGeometryByID(embree_scene, embree_mesh, p_id);
+	// Drop the local handle now that the geometry is attached to the scene.
+	rtcReleaseGeometry(embree_mesh);
+}
+
+// Commits embree_scene via rtcCommitScene.
+void LightmapRaycasterEmbree::commit() {
+	rtcCommitScene(embree_scene);
+}
+
+// Disables every geometry whose ID is listed in p_mesh_ids, re-commits the
+// scene and remembers the set so clear_mesh_filter() can re-enable them.
+void LightmapRaycasterEmbree::set_mesh_filter(const Set<int> &p_mesh_ids) {
+	Set<int>::Element *id_elem = p_mesh_ids.front();
+	while (id_elem) {
+		RTCGeometry filtered_geom = rtcGetGeometry(embree_scene, id_elem->get());
+		rtcDisableGeometry(filtered_geom);
+		id_elem = id_elem->next();
+	}
+
+	rtcCommitScene(embree_scene);
+	filter_meshes = p_mesh_ids;
+}
+
+// Re-enables every geometry recorded in filter_meshes, re-commits the scene
+// and forgets the recorded IDs.
+void LightmapRaycasterEmbree::clear_mesh_filter() {
+	Set<int>::Element *id_elem = filter_meshes.front();
+	while (id_elem) {
+		RTCGeometry filtered_geom = rtcGetGeometry(embree_scene, id_elem->get());
+		rtcEnableGeometry(filtered_geom);
+		id_elem = id_elem->next();
+	}
+
+	rtcCommitScene(embree_scene);
+	filter_meshes.clear();
+}
+
+void embree_error_handler(void *p_user_data, RTCError p_code, const char *p_str) {
+	print_error("Embree error: " + String(p_str));
+}
+
+// Creates the Embree device and an empty scene for it, and routes device
+// errors to embree_error_handler.
+LightmapRaycasterEmbree::LightmapRaycasterEmbree() {
+	// Switch the flush-to-zero and denormals-are-zero modes on before using
+	// Embree; the destructor switches them off again.
+	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+	_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+
+	embree_device = rtcNewDevice(nullptr);
+	rtcSetDeviceErrorFunction(embree_device, &embree_error_handler, nullptr);
+	embree_scene = rtcNewScene(embree_device);
+}
+
+// Releases the Embree scene and then the device.
+LightmapRaycasterEmbree::~LightmapRaycasterEmbree() {
+	// NOTE(review): the modes are switched off unconditionally instead of
+	// being restored to whatever they were before the constructor ran.
+	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
+	_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
+
+	// The scene is released before the device it was created from.
+	if (embree_scene != nullptr) {
+		rtcReleaseScene(embree_scene);
+	}
+
+	if (embree_device != nullptr) {
+		rtcReleaseDevice(embree_device);
+	}
+}
+
+#endif
diff --git a/modules/raycast/lightmap_raycaster.h b/modules/raycast/lightmap_raycaster.h
new file mode 100644
index 0000000000..4c3de27837
--- /dev/null
+++ b/modules/raycast/lightmap_raycaster.h
@@ -0,0 +1,77 @@
+/*************************************************************************/
+/*  lightmap_raycaster.h                                                 */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifdef TOOLS_ENABLED
+
+#include "core/object/object.h"
+#include "scene/3d/lightmapper.h"
+#include "scene/resources/mesh.h"
+
+#include <embree3/rtcore.h>
+
+// LightmapRaycaster implementation backed by an Embree device and scene.
+class LightmapRaycasterEmbree : public LightmapRaycaster {
+	GDCLASS(LightmapRaycasterEmbree, LightmapRaycaster);
+
+private:
+	// Per-mesh alpha texture used to reject hits on transparent texels.
+	struct AlphaTextureData {
+		Vector<uint8_t> data; // Raw image bytes, indexed as one byte per texel.
+		Vector2i size; // Texture dimensions in texels.
+
+		// Bilinearly filtered alpha at texture coordinates (u, v).
+		uint8_t sample(float u, float v) const;
+	};
+
+	// Both handles are created in the constructor and released in the destructor.
+	RTCDevice embree_device;
+	RTCScene embree_scene;
+
+	// Embree intersection filter callback registered on every added mesh.
+	static void filter_function(const struct RTCFilterFunctionNArguments *p_args);
+
+	// Alpha textures keyed by Embree geometry ID.
+	Map<unsigned int, AlphaTextureData> alpha_textures;
+	// Geometry IDs currently disabled through set_mesh_filter().
+	Set<int> filter_meshes;
+
+public:
+	// Single-ray query; returns true when the ray hit something.
+	virtual bool intersect(Ray &p_ray) override;
+
+	// Batch query; every ray is updated in place.
+	virtual void intersect(Vector<Ray> &r_rays) override;
+
+	virtual void add_mesh(const Vector<Vector3> &p_vertices, const Vector<Vector3> &p_normals, const Vector<Vector2> &p_uv2s, unsigned int p_id) override;
+	virtual void set_mesh_alpha_texture(Ref<Image> p_alpha_texture, unsigned int p_id) override;
+	virtual void commit() override;
+
+	// Temporarily disable / re-enable a set of meshes by geometry ID.
+	virtual void set_mesh_filter(const Set<int> &p_mesh_ids) override;
+	virtual void clear_mesh_filter() override;
+
+	static LightmapRaycaster *create_embree_raycaster();
+	static void make_default_raycaster();
+
+	LightmapRaycasterEmbree();
+	~LightmapRaycasterEmbree();
+};
+
+#endif
diff --git a/modules/raycast/raycast_occlusion_cull.cpp b/modules/raycast/raycast_occlusion_cull.cpp
new file mode 100644
index 0000000000..66558efa8c
--- /dev/null
+++ b/modules/raycast/raycast_occlusion_cull.cpp
@@ -0,0 +1,583 @@
+/*************************************************************************/
+/*  raycast_occlusion_cull.cpp                                           */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "raycast_occlusion_cull.h"
+#include "core/config/project_settings.h"
+#include "core/templates/local_vector.h"
+
+#ifdef __SSE2__
+#include <pmmintrin.h>
+#endif
+
+RaycastOcclusionCull *RaycastOcclusionCull::raycast_singleton = nullptr;
+
+// Resets the buffer: clears the base HZBuffer state, drops the camera ray
+// packets and their masks, and resets the packet grid size to zero.
+void RaycastOcclusionCull::RaycastHZBuffer::clear() {
+	HZBuffer::clear();
+
+	camera_rays.clear();
+	camera_ray_masks.clear();
+	packs_size = Size2i();
+}
+
+// Resizes the buffer to p_size and re-allocates the ray packet storage.
+// A zero size clears the buffer; an unchanged size is a no-op.
+void RaycastOcclusionCull::RaycastHZBuffer::resize(const Size2i &p_size) {
+	if (p_size == Size2i()) {
+		clear();
+		return;
+	}
+
+	if (!sizes.is_empty() && p_size == sizes[0]) {
+		return; // Size didn't change
+	}
+
+	HZBuffer::resize(p_size);
+
+	// One ray packet per TILE_SIZE x TILE_SIZE tile, rounding up so that
+	// partially covered tiles at the edges still get a packet.
+	packs_size = Size2i(Math::ceil(p_size.x / (float)TILE_SIZE), Math::ceil(p_size.y / (float)TILE_SIZE));
+	int ray_packets_count = packs_size.x * packs_size.y;
+	camera_rays.resize(ray_packets_count);
+	// One mask entry per ray slot of every packet.
+	camera_ray_masks.resize(ray_packets_count * TILE_SIZE * TILE_SIZE);
+}
+
+// Regenerates all camera ray packets for the given camera, splitting the work
+// into one work item per pool thread.
+void RaycastOcclusionCull::RaycastHZBuffer::update_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool) {
+	CameraRayThreadData td;
+	td.camera_matrix = p_cam_projection;
+	td.camera_transform = p_cam_transform;
+	td.camera_orthogonal = p_cam_orthogonal;
+	td.thread_count = p_thread_work_pool.get_thread_count();
+
+	// td lives on this stack frame, so this relies on do_work() not returning
+	// before the workers are done (presumably it blocks - confirm against
+	// ThreadWorkPool).
+	p_thread_work_pool.do_work(td.thread_count, this, &RaycastHZBuffer::_camera_rays_threaded, &td);
+}
+
+// Work item p_thread of update_camera_rays(): generates the rays for this
+// worker's contiguous slice of the packet array. The last worker takes
+// whatever is left, so every packet is covered.
+void RaycastOcclusionCull::RaycastHZBuffer::_camera_rays_threaded(uint32_t p_thread, RaycastOcclusionCull::RaycastHZBuffer::CameraRayThreadData *p_data) {
+	const uint32_t packet_count = camera_rays.size();
+	const uint32_t worker_count = p_data->thread_count;
+
+	const uint32_t range_begin = p_thread * packet_count / worker_count;
+	uint32_t range_end = packet_count;
+	if (p_thread + 1 != worker_count) {
+		range_end = (p_thread + 1) * packet_count / worker_count;
+	}
+
+	_generate_camera_rays(p_data->camera_transform, p_data->camera_matrix, p_data->camera_orthogonal, range_begin, range_end);
+}
+
+// Fills the ray packets [p_from, p_to) with one ray per buffer pixel.
+// Each packet covers one TILE_SIZE x TILE_SIZE tile. Ray slots that fall
+// outside the buffer get a zero mask and only have their tfar written.
+void RaycastOcclusionCull::RaycastHZBuffer::_generate_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, int p_from, int p_to) {
+	Size2i buffer_size = sizes[0];
+
+	CameraMatrix inv_camera_matrix = p_cam_projection.inverse();
+	// Rays are limited to slightly beyond the far plane (5% margin).
+	float z_far = p_cam_projection.get_z_far() * 1.05f;
+	// Written by every worker that runs this function, always with the same value.
+	debug_tex_range = z_far;
+
+	RayPacket *ray_packets = camera_rays.ptr();
+	uint32_t *ray_masks = camera_ray_masks.ptr();
+
+	for (int i = p_from; i < p_to; i++) {
+		RayPacket &packet = ray_packets[i];
+		// Top-left pixel of the tile this packet covers.
+		int tile_x = (i % packs_size.x) * TILE_SIZE;
+		int tile_y = (i / packs_size.x) * TILE_SIZE;
+
+		for (int j = 0; j < TILE_RAYS; j++) {
+			float x = tile_x + j % TILE_SIZE;
+			float y = tile_y + j / TILE_SIZE;
+
+			ray_masks[i * TILE_RAYS + j] = ~0U;
+
+			if (x >= buffer_size.x || y >= buffer_size.y) {
+				// Pixel lies outside the buffer: mask the ray slot out.
+				ray_masks[i * TILE_RAYS + j] = 0U;
+			} else {
+				// Map the pixel to the [-1, 1] range on both axes.
+				// NOTE(review): a buffer dimension of 1 makes this divide by zero.
+				float u = x / (buffer_size.x - 1);
+				float v = y / (buffer_size.y - 1);
+				u = u * 2.0f - 1.0f;
+				v = v * 2.0f - 1.0f;
+
+				// Plane is used here as a plain 4-component vector: unproject
+				// the point through the inverse projection, then move it into
+				// world space with the camera transform.
+				Plane pixel_proj = Plane(u, v, -1.0, 1.0);
+				Plane pixel_view = inv_camera_matrix.xform4(pixel_proj);
+				Vector3 pixel_world = p_cam_transform.xform(pixel_view.normal);
+
+				Vector3 dir;
+				if (p_cam_orthogonal) {
+					// Orthogonal cameras: all rays are parallel, along the camera's -Z axis.
+					dir = -p_cam_transform.basis.get_axis(2);
+				} else {
+					// Perspective cameras: rays fan out from the camera origin.
+					dir = (pixel_world - p_cam_transform.origin).normalized();
+				}
+
+				packet.ray.org_x[j] = pixel_world.x;
+				packet.ray.org_y[j] = pixel_world.y;
+				packet.ray.org_z[j] = pixel_world.z;
+
+				packet.ray.dir_x[j] = dir.x;
+				packet.ray.dir_y[j] = dir.y;
+				packet.ray.dir_z[j] = dir.z;
+
+				packet.ray.tnear[j] = 0.0f;
+
+				packet.ray.time[j] = 0.0f;
+
+				packet.ray.flags[j] = 0;
+				packet.ray.mask[j] = -1;
+				packet.hit.geomID[j] = RTC_INVALID_GEOMETRY_ID;
+			}
+
+			// tfar is reset for every slot, including masked-out ones.
+			packet.ray.tfar[j] = z_far;
+		}
+	}
+}
+
+// Copies the per-ray tfar values out of the tiled packet layout into the
+// row-major mip level 0, skipping ray slots that fall outside the buffer.
+void RaycastOcclusionCull::RaycastHZBuffer::sort_rays() {
+	if (is_empty()) {
+		return;
+	}
+
+	const Size2i buffer_size = sizes[0];
+	for (int pack_row = 0; pack_row < packs_size.y; pack_row++) {
+		for (int pack_col = 0; pack_col < packs_size.x; pack_col++) {
+			const int packet_index = pack_row * packs_size.x + pack_col;
+
+			// Walk the tile's ray slots in storage order (row by row).
+			for (int ray_idx = 0; ray_idx < TILE_SIZE * TILE_SIZE; ray_idx++) {
+				const int x = pack_col * TILE_SIZE + ray_idx % TILE_SIZE;
+				const int y = pack_row * TILE_SIZE + ray_idx / TILE_SIZE;
+				if (x >= buffer_size.x || y >= buffer_size.y) {
+					continue;
+				}
+				mips[0][y * buffer_size.x + x] = camera_rays[packet_index].ray.tfar[ray_idx];
+			}
+		}
+	}
+}
+
+////////////////////////////////////////////////////////
+
+// Returns true when p_rid is owned by occluder_owner.
+bool RaycastOcclusionCull::is_occluder(RID p_rid) {
+	return occluder_owner.owns(p_rid);
+}
+
+// Reserves a new occluder RID; the Occluder object itself is created later by
+// occluder_initialize().
+RID RaycastOcclusionCull::occluder_allocate() {
+	return occluder_owner.allocate_rid();
+}
+
+// Allocates the Occluder object for a RID obtained from occluder_allocate().
+// The object is deleted again in free_occluder().
+void RaycastOcclusionCull::occluder_initialize(RID p_occluder) {
+	Occluder *occluder = memnew(Occluder);
+	occluder_owner.initialize_rid(p_occluder, occluder);
+}
+
+// Replaces the occluder's geometry and flags every instance that uses it as
+// dirty, so its transformed vertices get rebuilt by the next Scenario::update().
+void RaycastOcclusionCull::occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) {
+	Occluder *occluder = occluder_owner.getornull(p_occluder);
+	ERR_FAIL_COND(!occluder);
+
+	occluder->vertices = p_vertices;
+	occluder->indices = p_indices;
+
+	for (Set<InstanceID>::Element *E = occluder->users.front(); E; E = E->next()) {
+		RID scenario_rid = E->get().scenario;
+		RID instance_rid = E->get().instance;
+		ERR_CONTINUE(!scenarios.has(scenario_rid));
+		Scenario &scenario = scenarios[scenario_rid];
+		ERR_CONTINUE(!scenario.instances.has(instance_rid));
+
+		// dirty_instances is the duplicate guard for dirty_instances_array.
+		// scenario.dirty is not set here; update() still proceeds because it
+		// also checks whether dirty_instances_array is non-empty.
+		if (!scenario.dirty_instances.has(instance_rid)) {
+			scenario.dirty_instances.insert(instance_rid);
+			scenario.dirty_instances_array.push_back(instance_rid);
+		}
+	}
+}
+
+// Deletes the Occluder object and releases its RID.
+// NOTE(review): instances listed in occluder->users are not touched here.
+// They keep the stale RID, which later getornull() lookups presumably resolve
+// to null - confirm.
+void RaycastOcclusionCull::free_occluder(RID p_occluder) {
+	Occluder *occluder = occluder_owner.getornull(p_occluder);
+	ERR_FAIL_COND(!occluder);
+	memdelete(occluder);
+	occluder_owner.free(p_occluder);
+}
+
+////////////////////////////////////////////////////////
+
+// Registers a scenario. When the RID is already known (for example it was
+// flagged as removed but not cleaned up yet), the entry is kept and only its
+// removed flag is cleared.
+void RaycastOcclusionCull::add_scenario(RID p_scenario) {
+	if (!scenarios.has(p_scenario)) {
+		scenarios[p_scenario] = Scenario();
+		return;
+	}
+
+	scenarios[p_scenario].removed = false;
+}
+
+// Flags the scenario as removed. The actual cleanup is deferred to
+// Scenario::update(), which buffer_update() calls before erasing the entry.
+void RaycastOcclusionCull::remove_scenario(RID p_scenario) {
+	ERR_FAIL_COND(!scenarios.has(p_scenario));
+	scenarios[p_scenario].removed = true;
+}
+
+// Creates or updates an occluder instance inside a scenario.
+// Re-registers the instance with its occluder when the occluder changed, and
+// marks the instance and/or the scenario dirty depending on what changed.
+void RaycastOcclusionCull::scenario_set_instance(RID p_scenario, RID p_instance, RID p_occluder, const Transform &p_xform, bool p_enabled) {
+	ERR_FAIL_COND(!scenarios.has(p_scenario));
+	Scenario &scenario = scenarios[p_scenario];
+
+	if (!scenario.instances.has(p_instance)) {
+		scenario.instances[p_instance] = OccluderInstance();
+	}
+
+	OccluderInstance &instance = scenario.instances[p_instance];
+
+	// The instance was queued for removal but is being set again: revive it.
+	if (instance.removed) {
+		instance.removed = false;
+		scenario.removed_instances.erase(p_instance);
+	}
+
+	bool changed = false;
+
+	if (instance.occluder != p_occluder) {
+		// Unregister from the previous occluder's user list, if it still exists.
+		Occluder *old_occluder = occluder_owner.getornull(instance.occluder);
+		if (old_occluder) {
+			old_occluder->users.erase(InstanceID(p_scenario, p_instance));
+		}
+
+		instance.occluder = p_occluder;
+
+		if (p_occluder.is_valid()) {
+			Occluder *occluder = occluder_owner.getornull(p_occluder);
+			// NOTE(review): on failure this returns after instance.occluder
+			// was already overwritten and before the instance is marked dirty.
+			ERR_FAIL_COND(!occluder);
+			occluder->users.insert(InstanceID(p_scenario, p_instance));
+		}
+		changed = true;
+	}
+
+	if (instance.xform != p_xform) {
+		scenario.instances[p_instance].xform = p_xform;
+		changed = true;
+	}
+
+	if (instance.enabled != p_enabled) {
+		instance.enabled = p_enabled;
+		scenario.dirty = true; // The scenario needs a scene re-build, but the instance doesn't need update
+	}
+
+	// dirty_instances is the duplicate guard for dirty_instances_array.
+	if (changed && !scenario.dirty_instances.has(p_instance)) {
+		scenario.dirty_instances.insert(p_instance);
+		scenario.dirty_instances_array.push_back(p_instance);
+		scenario.dirty = true;
+	}
+}
+
+// Queues an instance for removal from its scenario. The instance is detached
+// from its occluder's user list right away; the entry itself is erased by the
+// next Scenario::update(). Unknown or already-removed instances are ignored.
+void RaycastOcclusionCull::scenario_remove_instance(RID p_scenario, RID p_instance) {
+	ERR_FAIL_COND(!scenarios.has(p_scenario));
+	Scenario &scenario = scenarios[p_scenario];
+
+	if (!scenario.instances.has(p_instance)) {
+		return;
+	}
+
+	OccluderInstance &occ_instance = scenario.instances[p_instance];
+	if (occ_instance.removed) {
+		return;
+	}
+
+	Occluder *used_occluder = occluder_owner.getornull(occ_instance.occluder);
+	if (used_occluder) {
+		used_occluder->users.erase(InstanceID(p_scenario, p_instance));
+	}
+
+	scenario.removed_instances.push_back(p_instance);
+	occ_instance.removed = true;
+}
+
+// Thread pool entry point used when instances are processed in parallel: no
+// pool is passed on, so the vertex transform of each instance runs on the
+// calling worker only.
+void RaycastOcclusionCull::Scenario::_update_dirty_instance_thread(int p_idx, RID *p_instances) {
+	_update_dirty_instance(p_idx, p_instances, nullptr);
+}
+
+// Rebuilds the cached geometry of instance p_instances[p_idx]: transforms the
+// occluder's vertices by the instance transform and copies its index list.
+// When p_thread_pool is given and the mesh has more than 1024 vertices, the
+// transform is split across the pool. Instances or occluders that no longer
+// exist are skipped.
+void RaycastOcclusionCull::Scenario::_update_dirty_instance(int p_idx, RID *p_instances, ThreadWorkPool *p_thread_pool) {
+	OccluderInstance *occ_inst = instances.getptr(p_instances[p_idx]);
+
+	if (!occ_inst) {
+		return;
+	}
+
+	Occluder *occ = raycast_singleton->occluder_owner.getornull(occ_inst->occluder);
+
+	if (!occ) {
+		return;
+	}
+
+	int vertices_size = occ->vertices.size();
+
+	// Embree requires the last element to be readable by a 16-byte SSE load instruction, so we add padding to be safe.
+	occ_inst->xformed_vertices.resize(vertices_size + 1);
+
+	const Vector3 *read_ptr = occ->vertices.ptr();
+	Vector3 *write_ptr = occ_inst->xformed_vertices.ptr();
+
+	if (p_thread_pool && vertices_size > 1024) {
+		// td lives on this stack frame; this relies on do_work() not returning
+		// before the workers are done (presumably it blocks - confirm).
+		TransformThreadData td;
+		td.xform = occ_inst->xform;
+		td.read = read_ptr;
+		td.write = write_ptr;
+		td.vertex_count = vertices_size;
+		td.thread_count = p_thread_pool->get_thread_count();
+		p_thread_pool->do_work(td.thread_count, this, &Scenario::_transform_vertices_thread, &td);
+	} else {
+		_transform_vertices_range(read_ptr, write_ptr, occ_inst->xform, 0, vertices_size);
+	}
+
+	// Indices are copied verbatim; only the vertex positions depend on the transform.
+	occ_inst->indices.resize(occ->indices.size());
+	memcpy(occ_inst->indices.ptr(), occ->indices.ptr(), occ->indices.size() * sizeof(int32_t));
+}
+
+// Work item p_thread of a threaded vertex transform: handles this worker's
+// contiguous slice of the vertex array. The last worker takes whatever is
+// left, so every vertex is covered.
+void RaycastOcclusionCull::Scenario::_transform_vertices_thread(uint32_t p_thread, TransformThreadData *p_data) {
+	const uint32_t vertex_count = p_data->vertex_count;
+	const uint32_t worker_count = p_data->thread_count;
+
+	const uint32_t range_begin = p_thread * vertex_count / worker_count;
+	uint32_t range_end = vertex_count;
+	if (p_thread + 1 != worker_count) {
+		range_end = (p_thread + 1) * vertex_count / worker_count;
+	}
+
+	_transform_vertices_range(p_data->read, p_data->write, p_data->xform, range_begin, range_end);
+}
+
+// Applies p_xform to the vertices p_read[p_from..p_to) and stores the results
+// at the same indices of p_write.
+void RaycastOcclusionCull::Scenario::_transform_vertices_range(const Vector3 *p_read, Vector3 *p_write, const Transform &p_xform, int p_from, int p_to) {
+	int vertex_idx = p_from;
+	while (vertex_idx < p_to) {
+		const Vector3 &local_vertex = p_read[vertex_idx];
+		p_write[vertex_idx] = p_xform.xform(local_vertex);
+		vertex_idx++;
+	}
+}
+
+// Entry point of the commit thread started by update(). p_ud is the Scenario.
+// Commits the scene that is NOT currently in use (the one update() just
+// rebuilt) and then raises commit_done so update() can swap it in.
+// NOTE(review): commit_done is written here and polled from update() on
+// another thread; whether it is an atomic type is not visible in this file.
+void RaycastOcclusionCull::Scenario::_commit_scene(void *p_ud) {
+	Scenario *scenario = (Scenario *)p_ud;
+	int commit_idx = 1 - (scenario->current_scene_idx);
+	rtcCommitScene(scenario->ebr_scene[commit_idx]);
+	scenario->commit_done = true;
+}
+
+// Advances the double-buffered Embree scene of this scenario.
+//
+// Two scenes are kept: ebr_scene[current_scene_idx] is the one used for
+// raycasts, while the other one is rebuilt here and committed on
+// commit_thread. A finished commit is swapped in at the start of the next call.
+//
+// Returns true only when the scenario was flagged as removed and all of its
+// resources (both scenes and the commit thread) have been released, so the
+// caller can erase it. Returns false in every other case.
+bool RaycastOcclusionCull::Scenario::update(ThreadWorkPool &p_thread_pool) {
+	ERR_FAIL_COND_V(singleton == nullptr, false);
+
+	if (commit_thread == nullptr) {
+		commit_thread = memnew(Thread);
+	}
+
+	if (commit_thread->is_started()) {
+		if (commit_done) {
+			// The background commit finished: join it and swap the new scene in.
+			commit_thread->wait_to_finish();
+			current_scene_idx = 1 - current_scene_idx;
+		} else {
+			// Still building; keep using the current scene and retry later.
+			return false;
+		}
+	}
+
+	if (removed) {
+		// No commit is in flight at this point, so everything owned by the
+		// scenario can be released safely.
+		for (int i = 0; i < 2; i++) {
+			if (ebr_scene[i]) {
+				rtcReleaseScene(ebr_scene[i]);
+				ebr_scene[i] = nullptr;
+			}
+		}
+		// The caller erases the scenario after a true return, so the thread
+		// object has to be freed here or it would leak.
+		memdelete(commit_thread);
+		commit_thread = nullptr;
+		return true;
+	}
+
+	if (!dirty && removed_instances.is_empty() && dirty_instances_array.is_empty()) {
+		return false;
+	}
+
+	for (unsigned int i = 0; i < removed_instances.size(); i++) {
+		instances.erase(removed_instances[i]);
+	}
+
+	if (dirty_instances_array.size() / p_thread_pool.get_thread_count() > 128) {
+		// Lots of instances, use per-instance threading
+		p_thread_pool.do_work(dirty_instances_array.size(), this, &Scenario::_update_dirty_instance_thread, dirty_instances_array.ptr());
+	} else {
+		// Few instances, use threading on the vertex transforms
+		for (unsigned int i = 0; i < dirty_instances_array.size(); i++) {
+			_update_dirty_instance(i, dirty_instances_array.ptr(), &p_thread_pool);
+		}
+	}
+
+	dirty_instances.clear();
+	dirty_instances_array.clear();
+	removed_instances.clear();
+
+	// Embree is initialized on demand, the first time a scene has to be built.
+	if (raycast_singleton->ebr_device == nullptr) {
+		raycast_singleton->_init_embree();
+	}
+
+	// Rebuild the scene that is not in use from scratch.
+	int next_scene_idx = 1 - current_scene_idx;
+	RTCScene &next_scene = ebr_scene[next_scene_idx];
+
+	if (next_scene) {
+		rtcReleaseScene(next_scene);
+	}
+
+	next_scene = rtcNewScene(raycast_singleton->ebr_device);
+	rtcSetSceneBuildQuality(next_scene, RTCBuildQuality(raycast_singleton->build_quality));
+
+	const RID *inst_rid = nullptr;
+	while ((inst_rid = instances.next(inst_rid))) {
+		OccluderInstance *occ_inst = instances.getptr(*inst_rid);
+		Occluder *occ = raycast_singleton->occluder_owner.getornull(occ_inst->occluder);
+
+		if (!occ || !occ_inst->enabled) {
+			continue;
+		}
+
+		// The buffers are shared, not copied: Embree reads straight from the
+		// instance's cached vertex and index arrays.
+		RTCGeometry geom = rtcNewGeometry(raycast_singleton->ebr_device, RTC_GEOMETRY_TYPE_TRIANGLE);
+		rtcSetSharedGeometryBuffer(geom, RTC_BUFFER_TYPE_VERTEX, 0, RTC_FORMAT_FLOAT3, occ_inst->xformed_vertices.ptr(), 0, sizeof(Vector3), occ_inst->xformed_vertices.size());
+		rtcSetSharedGeometryBuffer(geom, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, occ_inst->indices.ptr(), 0, sizeof(uint32_t) * 3, occ_inst->indices.size() / 3);
+		rtcCommitGeometry(geom);
+		rtcAttachGeometry(next_scene, geom);
+		rtcReleaseGeometry(geom);
+	}
+
+	dirty = false;
+	// Kick off the background commit; the result is picked up by a later call.
+	commit_done = false;
+	commit_thread->start(&Scenario::_commit_scene, this);
+	return false;
+}
+
+// Thread pool work item: traces ray packet p_idx against the scene currently
+// in use. The packet's validity mask starts at masks[p_idx * TILE_RAYS].
+void RaycastOcclusionCull::Scenario::_raycast(uint32_t p_idx, const RaycastThreadData *p_raycast_data) const {
+	RTCIntersectContext ctx;
+	rtcInitIntersectContext(&ctx);
+	// Tell Embree that the rays of a packet are coherent.
+	ctx.flags = RTC_INTERSECT_CONTEXT_FLAG_COHERENT;
+
+	rtcIntersect16((const int *)&p_raycast_data->masks[p_idx * TILE_RAYS], ebr_scene[current_scene_idx], &ctx, &p_raycast_data->rays[p_idx]);
+}
+
+// Traces all ray packets in r_rays against the scene currently in use, one
+// thread pool work item per packet. Does nothing while Embree or the current
+// scene has not been created yet.
+// NOTE(review): p_valid_masks is taken by value, which copies the mask vector
+// on every call; changing that requires touching the declaration as well.
+void RaycastOcclusionCull::Scenario::raycast(LocalVector<RayPacket> &r_rays, const LocalVector<uint32_t> p_valid_masks, ThreadWorkPool &p_thread_pool) const {
+	ERR_FAIL_COND(singleton == nullptr);
+	if (raycast_singleton->ebr_device == nullptr) {
+		return; // Embree is initialized on demand when there is some scenario with occluders in it.
+	}
+
+	if (ebr_scene[current_scene_idx] == nullptr) {
+		return;
+	}
+
+	// td lives on this stack frame; this relies on do_work() not returning
+	// before the workers are done (presumably it blocks - confirm).
+	RaycastThreadData td;
+	td.rays = r_rays.ptr();
+	td.masks = p_valid_masks.ptr();
+
+	p_thread_pool.do_work(r_rays.size(), this, &Scenario::_raycast, &td);
+}
+
+////////////////////////////////////////////////////////
+
+// Registers a new, empty buffer under p_buffer. Fails when the RID is already
+// registered.
+void RaycastOcclusionCull::add_buffer(RID p_buffer) {
+	ERR_FAIL_COND(buffers.has(p_buffer));
+	buffers[p_buffer] = RaycastHZBuffer();
+}
+
+// Unregisters the buffer p_buffer. Fails when the RID is unknown.
+void RaycastOcclusionCull::remove_buffer(RID p_buffer) {
+	ERR_FAIL_COND(!buffers.has(p_buffer));
+	buffers.erase(p_buffer);
+}
+
+// Binds a buffer to the scenario it is rendered from. An invalid p_scenario is
+// accepted and detaches the buffer; a valid one must be registered.
+void RaycastOcclusionCull::buffer_set_scenario(RID p_buffer, RID p_scenario) {
+	ERR_FAIL_COND(!buffers.has(p_buffer));
+	ERR_FAIL_COND(p_scenario.is_valid() && !scenarios.has(p_scenario));
+	buffers[p_buffer].scenario_rid = p_scenario;
+}
+
+// Resizes the buffer p_buffer; see RaycastHZBuffer::resize() for the handling
+// of zero and unchanged sizes.
+void RaycastOcclusionCull::buffer_set_size(RID p_buffer, const Vector2i &p_size) {
+	ERR_FAIL_COND(!buffers.has(p_buffer));
+	buffers[p_buffer].resize(p_size);
+}
+
+// Refreshes one buffer for the given camera: updates the bound scenario,
+// regenerates the camera rays, traces them and rebuilds the mip chain.
+// Unknown buffers, empty buffers and buffers without a registered scenario
+// are silently skipped.
+void RaycastOcclusionCull::buffer_update(RID p_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) {
+	if (!buffers.has(p_buffer)) {
+		return;
+	}
+
+	RaycastHZBuffer &buffer = buffers[p_buffer];
+
+	if (buffer.is_empty() || !scenarios.has(buffer.scenario_rid)) {
+		return;
+	}
+
+	Scenario &scenario = scenarios[buffer.scenario_rid];
+
+	bool removed = scenario.update(p_thread_pool);
+
+	if (removed) {
+		// update() reported the scenario as removed, so drop the entry. Other
+		// buffers still bound to this RID are skipped by the has() check above.
+		scenarios.erase(buffer.scenario_rid);
+		return;
+	}
+
+	buffer.update_camera_rays(p_cam_transform, p_cam_projection, p_cam_orthogonal, p_thread_pool);
+
+	scenario.raycast(buffer.camera_rays, buffer.camera_ray_masks, p_thread_pool);
+	// Move the traced distances from the tiled packet layout into mip level 0,
+	// then rebuild the remaining mip levels from it.
+	buffer.sort_rays();
+	buffer.update_mips();
+}
+
+// Returns the buffer registered under p_buffer, or nullptr when the RID is
+// unknown.
+RaycastOcclusionCull::HZBuffer *RaycastOcclusionCull::buffer_get_ptr(RID p_buffer) {
+	return buffers.has(p_buffer) ? &buffers[p_buffer] : nullptr;
+}
+
+// Returns the debug texture of the buffer p_buffer, or an empty RID (with an
+// error) when the buffer is unknown.
+RID RaycastOcclusionCull::buffer_get_debug_texture(RID p_buffer) {
+	ERR_FAIL_COND_V(!buffers.has(p_buffer), RID());
+	return buffers[p_buffer].get_debug_texture();
+}
+
+////////////////////////////////////////////////////////
+
+// Changes the build quality used for new Embree scenes. When the value
+// actually changes, every scenario is flagged dirty so its scene gets rebuilt
+// with the new quality.
+void RaycastOcclusionCull::set_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) {
+	if (build_quality != p_quality) {
+		build_quality = p_quality;
+
+		for (const RID *rid = scenarios.next(nullptr); rid; rid = scenarios.next(rid)) {
+			scenarios[*rid].dirty = true;
+		}
+	}
+}
+
+// Creates the Embree device. Called on demand from Scenario::update() the
+// first time a scene has to be built.
+void RaycastOcclusionCull::_init_embree() {
+#ifdef __SSE2__
+	// Switch the flush-to-zero and denormals-are-zero modes on before using
+	// Embree; the destructor switches them off again.
+	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+	_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#endif
+
+	// Ask Embree for two threads fewer than the processor count, but at
+	// least one.
+	String settings = vformat("threads=%d", MAX(1, OS::get_singleton()->get_processor_count() - 2));
+	ebr_device = rtcNewDevice(settings.utf8().ptr());
+}
+
+// Registers this object as raycast_singleton and reads the initial build
+// quality from the project settings. The Embree device is not created here;
+// see _init_embree().
+RaycastOcclusionCull::RaycastOcclusionCull() {
+	raycast_singleton = this;
+	int default_quality = GLOBAL_GET("rendering/occlusion_culling/bvh_build_quality");
+	build_quality = RS::ViewportOcclusionCullingBuildQuality(default_quality);
+}
+
+// Tears down every scenario that is still registered (joining and freeing its
+// commit thread, releasing both of its Embree scenes) and then releases the
+// Embree device.
+RaycastOcclusionCull::~RaycastOcclusionCull() {
+	const RID *scenario_rid = nullptr;
+	while ((scenario_rid = scenarios.next(scenario_rid))) {
+		Scenario &scenario = scenarios[*scenario_rid];
+		if (scenario.commit_thread) {
+			// Make sure no commit is running before anything is released.
+			scenario.commit_thread->wait_to_finish();
+			memdelete(scenario.commit_thread);
+			scenario.commit_thread = nullptr;
+		}
+
+		// Release the scenes before the device they were created from;
+		// otherwise they (and the device they reference) are never freed.
+		for (int i = 0; i < 2; i++) {
+			if (scenario.ebr_scene[i]) {
+				rtcReleaseScene(scenario.ebr_scene[i]);
+				scenario.ebr_scene[i] = nullptr;
+			}
+		}
+	}
+
+	if (ebr_device != nullptr) {
+#ifdef __SSE2__
+		// Undo the modes switched on by _init_embree().
+		_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
+		_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
+#endif
+		rtcReleaseDevice(ebr_device);
+	}
+
+	raycast_singleton = nullptr;
+}
diff --git a/modules/raycast/raycast_occlusion_cull.h b/modules/raycast/raycast_occlusion_cull.h
new file mode 100644
index 0000000000..acaceb9459
--- /dev/null
+++ b/modules/raycast/raycast_occlusion_cull.h
@@ -0,0 +1,184 @@
+/*************************************************************************/
+/*  raycast_occlusion_cull.h                                             */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef OCCLUSION_CULL_RAYCASTER_H
+#define OCCLUSION_CULL_RAYCASTER_H
+
+#include "core/io/image.h"
+#include "core/math/camera_matrix.h"
+#include "core/object/object.h"
+#include "core/object/reference.h"
+#include "core/templates/local_vector.h"
+#include "core/templates/rid_owner.h"
+#include "scene/resources/mesh.h"
+#include "servers/rendering/renderer_scene_occlusion_cull.h"
+
+#include <embree3/rtcore.h>
+
+class RaycastOcclusionCull : public RendererSceneOcclusionCull { // RendererSceneOcclusionCull implementation built on Embree handles (RTCDevice / RTCScene).
+	typedef RTCRayHit16 RayPacket; // Rays are stored as Embree RTCRayHit16 packets.
+
+public:
+	class RaycastHZBuffer : public HZBuffer { // HZBuffer that additionally holds camera ray packets, their masks and a scenario RID.
+	private:
+		Size2i packs_size; // NOTE(review): presumably the buffer size counted in ray packets; confirm in the .cpp.
+
+		struct CameraRayThreadData { // Argument bundle passed to _camera_rays_threaded().
+			CameraMatrix camera_matrix;
+			Transform camera_transform;
+			bool camera_orthogonal;
+			int thread_count;
+			Size2i buffer_size;
+		};
+
+		void _camera_rays_threaded(uint32_t p_thread, CameraRayThreadData *p_data);
+		void _generate_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, int p_from, int p_to);
+
+	public:
+		LocalVector<RayPacket> camera_rays;
+		LocalVector<uint32_t> camera_ray_masks; // NOTE(review): looks like one validity mask per packet (cf. p_valid_masks in Scenario::raycast); confirm.
+		RID scenario_rid;
+
+		virtual void clear() override;
+		virtual void resize(const Size2i &p_size) override;
+		void sort_rays();
+		void update_camera_rays(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_work_pool);
+	};
+
+private:
+	struct InstanceID { // (scenario, instance) RID pair; operator< makes it usable as a Set element (see Occluder::users).
+		RID scenario;
+		RID instance;
+
+		bool operator<(const InstanceID &rhs) const { // Orders by instance; ties are broken by scenario, compared in reverse (rhs first).
+			if (instance == rhs.instance) {
+				return rhs.scenario < scenario;
+			}
+			return instance < rhs.instance;
+		}
+
+		InstanceID() {}
+		InstanceID(RID s, RID i) :
+				scenario(s), instance(i) {}
+	};
+
+	struct Occluder { // Occluder geometry as passed to occluder_set_mesh() (vertices + indices).
+		PackedVector3Array vertices;
+		PackedInt32Array indices;
+		Set<InstanceID> users; // NOTE(review): presumably the instances that reference this occluder; confirm in the .cpp.
+	};
+
+	struct OccluderInstance { // Per-scenario instance record; fields mirror the scenario_set_instance() arguments.
+		RID occluder;
+		LocalVector<uint32_t> indices;
+		LocalVector<Vector3> xformed_vertices; // NOTE(review): presumably the occluder vertices after applying xform; confirm.
+		Transform xform;
+		bool enabled = true;
+		bool removed = false;
+	};
+
+	struct Scenario {
+		struct RaycastThreadData { // Argument bundle passed to _raycast().
+			RayPacket *rays;
+			const uint32_t *masks;
+		};
+
+		struct TransformThreadData { // Argument bundle passed to _transform_vertices_thread().
+			uint32_t thread_count;
+			uint32_t vertex_count;
+			Transform xform;
+			const Vector3 *read;
+			Vector3 *write;
+		};
+
+		Thread *commit_thread = nullptr; // Joined and freed in ~RaycastOcclusionCull() when non-null.
+		bool commit_done = true;
+		bool dirty = false; // Set on every scenario by set_build_quality() when the quality changes.
+		bool removed = false;
+
+		RTCScene ebr_scene[2] = { nullptr, nullptr }; // NOTE(review): two scenes plus current_scene_idx looks like double buffering; confirm.
+		int current_scene_idx = 0;
+
+		HashMap<RID, OccluderInstance> instances;
+		Set<RID> dirty_instances; // To avoid duplicates
+		LocalVector<RID> dirty_instances_array; // To iterate and split into threads
+		LocalVector<RID> removed_instances;
+
+		void _update_dirty_instance_thread(int p_idx, RID *p_instances);
+		void _update_dirty_instance(int p_idx, RID *p_instances, ThreadWorkPool *p_thread_pool);
+		void _transform_vertices_thread(uint32_t p_thread, TransformThreadData *p_data);
+		void _transform_vertices_range(const Vector3 *p_read, Vector3 *p_write, const Transform &p_xform, int p_from, int p_to);
+		static void _commit_scene(void *p_ud);
+		bool update(ThreadWorkPool &p_thread_pool);
+
+		void _raycast(uint32_t p_thread, const RaycastThreadData *p_raycast_data) const;
+		void raycast(LocalVector<RayPacket> &r_rays, const LocalVector<uint32_t> p_valid_masks, ThreadWorkPool &p_thread_pool) const; // NOTE(review): p_valid_masks is taken by value, so the vector is copied per call; a const reference may have been intended.
+	};
+
+	static RaycastOcclusionCull *raycast_singleton; // Set to this in the constructor, reset to nullptr in the destructor.
+
+	static const int TILE_SIZE = 4;
+	static const int TILE_RAYS = TILE_SIZE * TILE_SIZE; // 16 rays per tile.
+
+	RTCDevice ebr_device = nullptr; // Stays null until _init_embree() creates the device.
+	RID_PtrOwner<Occluder> occluder_owner;
+	HashMap<RID, Scenario> scenarios;
+	HashMap<RID, RaycastHZBuffer> buffers;
+	RS::ViewportOcclusionCullingBuildQuality build_quality; // No in-class default; assigned in the constructor.
+
+	void _init_embree();
+
+public:
+	virtual bool is_occluder(RID p_rid) override;
+	virtual RID occluder_allocate() override;
+	virtual void occluder_initialize(RID p_occluder) override;
+	virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) override;
+	virtual void free_occluder(RID p_occluder) override;
+
+	virtual void add_scenario(RID p_scenario) override;
+	virtual void remove_scenario(RID p_scenario) override;
+	virtual void scenario_set_instance(RID p_scenario, RID p_instance, RID p_occluder, const Transform &p_xform, bool p_enabled) override;
+	virtual void scenario_remove_instance(RID p_scenario, RID p_instance) override;
+
+	virtual void add_buffer(RID p_buffer) override;
+	virtual void remove_buffer(RID p_buffer) override;
+	virtual HZBuffer *buffer_get_ptr(RID p_buffer) override; // Returns nullptr for an unknown RID.
+	virtual void buffer_set_scenario(RID p_buffer, RID p_scenario) override;
+	virtual void buffer_set_size(RID p_buffer, const Vector2i &p_size) override;
+	virtual void buffer_update(RID p_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) override;
+	virtual RID buffer_get_debug_texture(RID p_buffer) override;
+
+	virtual void set_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) override; // Marks every scenario dirty when the value changes.
+
+	RaycastOcclusionCull();
+	~RaycastOcclusionCull();
+};
+
+#endif // OCCLUSION_CULL_RAYCASTER_H
diff --git a/modules/etc/register_types.cpp b/modules/raycast/register_types.cpp
index b165bccb3e..78ca91309f 100644
--- a/modules/etc/register_types.cpp
+++ b/modules/raycast/register_types.cpp
@@ -30,19 +30,25 @@
 
 #include "register_types.h"
 
-#include "image_compress_etc.h"
-#include "texture_loader_pkm.h"
+#include "lightmap_raycaster.h"
+#include "raycast_occlusion_cull.h"
 
-static Ref<ResourceFormatPKM> resource_loader_pkm;
+RaycastOcclusionCull *raycast_occlusion_cull = nullptr;
 
-void register_etc_types() {
-	resource_loader_pkm.instance();
-	ResourceLoader::add_resource_format_loader(resource_loader_pkm);
-
-	_register_etc_compress_func();
+// Creates the module-wide RaycastOcclusionCull instance. When TOOLS_ENABLED is
+// defined, the Embree lightmap raycaster is also made the default raycaster.
+void register_raycast_types() {
+#ifdef TOOLS_ENABLED
+	LightmapRaycasterEmbree::make_default_raycaster();
+#endif
+	raycast_occlusion_cull = memnew(RaycastOcclusionCull);
 }
 
-void unregister_etc_types() {
-	ResourceLoader::remove_resource_format_loader(resource_loader_pkm);
-	resource_loader_pkm.unref();
+// Destroys the instance created by register_raycast_types(), if any.
+void unregister_raycast_types() {
+	if (raycast_occlusion_cull) {
+		memdelete(raycast_occlusion_cull);
+		// Clear the global so it never dangles and a repeated call is a no-op.
+		raycast_occlusion_cull = nullptr;
+	}
 }
diff --git a/modules/etc/register_types.h b/modules/raycast/register_types.h
index e8cbb635ae..789604a491 100644
--- a/modules/etc/register_types.h
+++ b/modules/raycast/register_types.h
@@ -28,10 +28,10 @@
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
 
-#ifndef ETC_REGISTER_TYPES_H
-#define ETC_REGISTER_TYPES_H
-
-void register_etc_types();
-void unregister_etc_types();
-
-#endif // ETC_REGISTER_TYPES_H
+#ifndef RAYCAST_REGISTER_TYPES_H
+#define RAYCAST_REGISTER_TYPES_H
+
+void register_raycast_types();
+void unregister_raycast_types();
+
+#endif // RAYCAST_REGISTER_TYPES_H
diff --git a/modules/squish/image_compress_squish.cpp b/modules/squish/image_decompress_squish.cpp
index cce08034df..1450b0fe88 100644
--- a/modules/squish/image_compress_squish.cpp
+++ b/modules/squish/image_decompress_squish.cpp
@@ -1,5 +1,5 @@
 /*************************************************************************/
-/*  image_compress_squish.cpp                                            */
+/*  image_decompress_squish.cpp                                          */
 /*************************************************************************/
 /*                       This file is part of:                           */
 /*                           GODOT ENGINE                                */
@@ -28,7 +28,7 @@
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
 
-#include "image_compress_squish.h"
+#include "image_decompress_squish.h"
 
 #include <squish.h>
 
@@ -76,83 +76,3 @@ void image_decompress_squish(Image *p_image) {
 		p_image->convert_ra_rgba8_to_rg();
 	}
 }
-
-void image_compress_squish(Image *p_image, float p_lossy_quality, Image::UsedChannels p_channels) {
-	if (p_image->get_format() >= Image::FORMAT_DXT1) {
-		return; //do not compress, already compressed
-	}
-
-	int w = p_image->get_width();
-	int h = p_image->get_height();
-
-	if (p_image->get_format() <= Image::FORMAT_RGBA8) {
-		int squish_comp = squish::kColourRangeFit;
-
-		if (p_lossy_quality > 0.85) {
-			squish_comp = squish::kColourIterativeClusterFit;
-		} else if (p_lossy_quality > 0.75) {
-			squish_comp = squish::kColourClusterFit;
-		}
-
-		Image::Format target_format = Image::FORMAT_RGBA8;
-
-		p_image->convert(Image::FORMAT_RGBA8); //still uses RGBA to convert
-
-		switch (p_channels) {
-			case Image::USED_CHANNELS_L: {
-				target_format = Image::FORMAT_DXT1;
-				squish_comp |= squish::kDxt1;
-			} break;
-			case Image::USED_CHANNELS_LA: {
-				target_format = Image::FORMAT_DXT5;
-				squish_comp |= squish::kDxt5;
-			} break;
-			case Image::USED_CHANNELS_R: {
-				target_format = Image::FORMAT_RGTC_R;
-				squish_comp |= squish::kBc4;
-			} break;
-			case Image::USED_CHANNELS_RG: {
-				target_format = Image::FORMAT_RGTC_RG;
-				squish_comp |= squish::kBc5;
-			} break;
-			case Image::USED_CHANNELS_RGB: {
-				target_format = Image::FORMAT_DXT1;
-				squish_comp |= squish::kDxt1;
-			} break;
-			case Image::USED_CHANNELS_RGBA: {
-				//TODO, should convert both, then measure which one does a better job
-				target_format = Image::FORMAT_DXT5;
-				squish_comp |= squish::kDxt5;
-
-			} break;
-			default: {
-				ERR_PRINT("Unknown image format, defaulting to RGBA8");
-				break;
-			}
-		}
-
-		Vector<uint8_t> data;
-		int target_size = Image::get_image_data_size(w, h, target_format, p_image->has_mipmaps());
-		int mm_count = p_image->has_mipmaps() ? Image::get_image_required_mipmaps(w, h, target_format) : 0;
-		data.resize(target_size);
-		int shift = Image::get_format_pixel_rshift(target_format);
-
-		const uint8_t *rb = p_image->get_data().ptr();
-		uint8_t *wb = data.ptrw();
-
-		int dst_ofs = 0;
-
-		for (int i = 0; i <= mm_count; i++) {
-			int bw = w % 4 != 0 ? w + (4 - w % 4) : w;
-			int bh = h % 4 != 0 ? h + (4 - h % 4) : h;
-
-			int src_ofs = p_image->get_mipmap_offset(i);
-			squish::CompressImage(&rb[src_ofs], w, h, &wb[dst_ofs], squish_comp);
-			dst_ofs += (MAX(4, bw) * MAX(4, bh)) >> shift;
-			w = MAX(w / 2, 1);
-			h = MAX(h / 2, 1);
-		}
-
-		p_image->create(p_image->get_width(), p_image->get_height(), p_image->has_mipmaps(), target_format, data);
-	}
-}
diff --git a/modules/squish/image_compress_squish.h b/modules/squish/image_decompress_squish.h
index 301d30fcf1..fff5839ac4 100644
--- a/modules/squish/image_compress_squish.h
+++ b/modules/squish/image_decompress_squish.h
@@ -1,5 +1,5 @@
 /*************************************************************************/
-/*  image_compress_squish.h                                              */
+/*  image_decompress_squish.h                                            */
 /*************************************************************************/
 /*                       This file is part of:                           */
 /*                           GODOT ENGINE                                */
@@ -28,12 +28,11 @@
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
 
-#ifndef IMAGE_COMPRESS_SQUISH_H
-#define IMAGE_COMPRESS_SQUISH_H
+#ifndef IMAGE_DECOMPRESS_SQUISH_H
+#define IMAGE_DECOMPRESS_SQUISH_H
 
 #include "core/io/image.h"
 
-void image_compress_squish(Image *p_image, float p_lossy_quality, Image::UsedChannels p_channels);
 void image_decompress_squish(Image *p_image);
 
-#endif // IMAGE_COMPRESS_SQUISH_H
+#endif // IMAGE_DECOMPRESS_SQUISH_H
diff --git a/modules/squish/register_types.cpp b/modules/squish/register_types.cpp
index 451e9d8e93..51aab040e7 100644
--- a/modules/squish/register_types.cpp
+++ b/modules/squish/register_types.cpp
@@ -29,10 +29,10 @@
 /*************************************************************************/
 
 #include "register_types.h"
-#include "image_compress_squish.h"
+
+#include "image_decompress_squish.h"
 
 void register_squish_types() {
-	Image::set_compress_bc_func(image_compress_squish);
 	Image::_image_decompress_bc = image_decompress_squish;
 }
 
diff --git a/modules/stb_vorbis/audio_stream_ogg_vorbis.cpp b/modules/stb_vorbis/audio_stream_ogg_vorbis.cpp
index 6732078efc..e8e481de2d 100644
--- a/modules/stb_vorbis/audio_stream_ogg_vorbis.cpp
+++ b/modules/stb_vorbis/audio_stream_ogg_vorbis.cpp
@@ -204,7 +204,7 @@ void AudioStreamOGGVorbis::set_data(const Vector<uint8_t> &p_data) {
 			clear_data();
 
 			data = memalloc(src_data_len);
-			copymem(data, src_datar, src_data_len);
+			memcpy(data, src_datar, src_data_len);
 			data_len = src_data_len;
 
 			break;
@@ -221,7 +221,7 @@ Vector<uint8_t> AudioStreamOGGVorbis::get_data() const {
 		vdata.resize(data_len);
 		{
 			uint8_t *w = vdata.ptrw();
-			copymem(w, data, data_len);
+			memcpy(w, data, data_len);
 		}
 	}
 
diff --git a/modules/text_server_adv/SCsub b/modules/text_server_adv/SCsub
index e7863f88a3..a55b6dc283 100644
--- a/modules/text_server_adv/SCsub
+++ b/modules/text_server_adv/SCsub
@@ -448,7 +448,7 @@ if env["builtin_icu"]:
     ]
     thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources]
 
-    icu_data_name = "icudt68l.dat"
+    icu_data_name = "icudt69l.dat"
 
     if env_icu["tools"]:
         env_icu.Depends("#thirdparty/icu4c/icudata.gen.h", "#thirdparty/icu4c/" + icu_data_name)
diff --git a/modules/theora/video_stream_theora.cpp b/modules/theora/video_stream_theora.cpp
index c5f6dc0d99..54f5b3f424 100644
--- a/modules/theora/video_stream_theora.cpp
+++ b/modules/theora/video_stream_theora.cpp
@@ -225,7 +225,7 @@ void VideoStreamPlaybackTheora::set_file(const String &p_file) {
 			/* identify the codec: try theora */
 			if (!theora_p && th_decode_headerin(&ti, &tc, &ts, &op) >= 0) {
 				/* it is theora */
-				copymem(&to, &test, sizeof(test));
+				memcpy(&to, &test, sizeof(test));
 				theora_p = 1;
 			} else if (!vorbis_p && vorbis_synthesis_headerin(&vi, &vc, &op) >= 0) {
 				/* it is vorbis */
@@ -238,7 +238,7 @@ void VideoStreamPlaybackTheora::set_file(const String &p_file) {
 
 					audio_track_skip--;
 				} else {
-					copymem(&vo, &test, sizeof(test));
+					memcpy(&vo, &test, sizeof(test));
 					vorbis_p = 1;
 				}
 			} else {
@@ -603,6 +603,7 @@ float VideoStreamPlaybackTheora::get_playback_position() const {
 };
 
 void VideoStreamPlaybackTheora::seek(float p_time) {
+	WARN_PRINT_ONCE("Seeking in Theora and WebM videos is not implemented yet (it's only supported for GDNative-provided video streams)."); // p_time is unused; the call only emits this warning.
 }
 
 void VideoStreamPlaybackTheora::set_mix_callback(AudioMixCallback p_callback, void *p_userdata) {
diff --git a/modules/tinyexr/image_saver_tinyexr.cpp b/modules/tinyexr/image_saver_tinyexr.cpp
index f747763248..6a2fb0f666 100644
--- a/modules/tinyexr/image_saver_tinyexr.cpp
+++ b/modules/tinyexr/image_saver_tinyexr.cpp
@@ -169,7 +169,7 @@ Error save_exr(const String &p_path, const Ref<Image> &p_img, bool p_grayscale)
 		{ 0 }, // R
 		{ 1, 0 }, // GR
 		{ 2, 1, 0 }, // BGR
-		{ 2, 1, 0, 3 } // BGRA
+		{ 3, 2, 1, 0 } // ABGR
 	};
 
 	int channel_count = get_channel_count(format);
diff --git a/modules/visual_script/visual_script.cpp b/modules/visual_script/visual_script.cpp
index 6d5fff88d9..765a5fe023 100644
--- a/modules/visual_script/visual_script.cpp
+++ b/modules/visual_script/visual_script.cpp
@@ -1537,7 +1537,7 @@ Variant VisualScriptInstance::_call_internal(const StringName &p_method, void *p
 				state->flow_stack_pos = flow_stack_pos;
 				state->stack.resize(p_stack_size);
 				state->pass = p_pass;
-				copymem(state->stack.ptrw(), p_stack, p_stack_size);
+				memcpy(state->stack.ptrw(), p_stack, p_stack_size);
 				// Step 2, run away, return directly.
 				r_error.error = Callable::CallError::CALL_OK;
 
@@ -1607,7 +1607,7 @@ Variant VisualScriptInstance::_call_internal(const StringName &p_method, void *p
 			}
 
 			next = node->sequence_outputs[output];
-			VSDEBUG("GOT NEXT NODE - " + (next ? itos(next->get_id()) : "NULL"));
+			VSDEBUG("GOT NEXT NODE - " + (next ? itos(next->get_id()) : "Null"));
 		}
 
 		if (flow_stack) {
@@ -1802,7 +1802,7 @@ Variant VisualScriptInstance::call(const StringName &p_method, const Variant **p
 		sequence_bits[i] = false; // All starts as false.
 	}
 
-	zeromem(pass_stack, f->pass_stack_size * sizeof(int));
+	memset(pass_stack, 0, f->pass_stack_size * sizeof(int));
 
 	Map<int, VisualScriptNodeInstance *>::Element *E = instances.find(f->node);
 	if (!E) {
diff --git a/modules/visual_script/visual_script_editor.cpp b/modules/visual_script/visual_script_editor.cpp
index 3cdf60708b..02ec9ccd06 100644
--- a/modules/visual_script/visual_script_editor.cpp
+++ b/modules/visual_script/visual_script_editor.cpp
@@ -710,7 +710,7 @@ void VisualScriptEditor::_update_graph(int p_only_id) {
 			has_gnode_text = true;
 			LineEdit *line_edit = memnew(LineEdit);
 			line_edit->set_text(node->get_text());
-			line_edit->set_expand_to_text_length(true);
+			line_edit->set_expand_to_text_length_enabled(true);
 			line_edit->add_theme_font_override("font", get_theme_font("source", "EditorFonts"));
 			gnode->add_child(line_edit);
 			line_edit->connect("text_changed", callable_mp(this, &VisualScriptEditor::_expression_text_changed), varray(E->get()));
@@ -843,7 +843,7 @@ void VisualScriptEditor::_update_graph(int p_only_id) {
 						hbc->add_child(name_box);
 						name_box->set_custom_minimum_size(Size2(60 * EDSCALE, 0));
 						name_box->set_text(left_name);
-						name_box->set_expand_to_text_length(true);
+						name_box->set_expand_to_text_length_enabled(true);
 						name_box->connect("resized", callable_mp(this, &VisualScriptEditor::_update_node_size), varray(E->get()));
 						name_box->connect("focus_exited", callable_mp(this, &VisualScriptEditor::_port_name_focus_out), varray(name_box, E->get(), i, true));
 					} else {
@@ -938,7 +938,7 @@ void VisualScriptEditor::_update_graph(int p_only_id) {
 						hbc->add_child(name_box);
 						name_box->set_custom_minimum_size(Size2(60 * EDSCALE, 0));
 						name_box->set_text(right_name);
-						name_box->set_expand_to_text_length(true);
+						name_box->set_expand_to_text_length_enabled(true);
 						name_box->connect("resized", callable_mp(this, &VisualScriptEditor::_update_node_size), varray(E->get()));
 						name_box->connect("focus_exited", callable_mp(this, &VisualScriptEditor::_port_name_focus_out), varray(name_box, E->get(), i, false));
 					} else {
@@ -4322,7 +4322,7 @@ VisualScriptEditor::VisualScriptEditor() {
 	function_name_box = memnew(LineEdit);
 	function_name_edit->add_child(function_name_box);
 	function_name_box->connect("gui_input", callable_mp(this, &VisualScriptEditor::_fn_name_box_input));
-	function_name_box->set_expand_to_text_length(true);
+	function_name_box->set_expand_to_text_length_enabled(true);
 	add_child(function_name_edit);
 
 	///       Actual Graph          ///
diff --git a/modules/webm/video_stream_webm.cpp b/modules/webm/video_stream_webm.cpp
index 101001cba0..a6b64b342e 100644
--- a/modules/webm/video_stream_webm.cpp
+++ b/modules/webm/video_stream_webm.cpp
@@ -194,7 +194,7 @@ float VideoStreamPlaybackWebm::get_playback_position() const {
 }
 
 void VideoStreamPlaybackWebm::seek(float p_time) {
-	//Not implemented
+	WARN_PRINT_ONCE("Seeking in Theora and WebM videos is not implemented yet (it's only supported for GDNative-provided video streams)."); // p_time is unused; the call only emits this warning.
 }
 
 void VideoStreamPlaybackWebm::set_audio_track(int p_idx) {
diff --git a/modules/webp/image_loader_webp.cpp b/modules/webp/image_loader_webp.cpp
index b304c4824f..6e62840a3e 100644
--- a/modules/webp/image_loader_webp.cpp
+++ b/modules/webp/image_loader_webp.cpp
@@ -68,7 +68,7 @@ static Vector<uint8_t> _webp_lossy_pack(const Ref<Image> &p_image, float p_quali
 	w[1] = 'E';
 	w[2] = 'B';
 	w[3] = 'P';
-	copymem(&w[4], dst_buff, dst_size);
+	memcpy(&w[4], dst_buff, dst_size);
 	free(dst_buff);
 
 	return dst;
diff --git a/modules/websocket/packet_buffer.h b/modules/websocket/packet_buffer.h
index ed756363cf..e99a379767 100644
--- a/modules/websocket/packet_buffer.h
+++ b/modules/websocket/packet_buffer.h
@@ -31,7 +31,6 @@
 #ifndef PACKET_BUFFER_H
 #define PACKET_BUFFER_H
 
-#include "core/os/copymem.h"
 #include "core/templates/ring_buffer.h"
 
 template <class T>
@@ -66,7 +65,7 @@ public:
 		if (p_info) {
 			_Packet p;
 			p.size = p_size;
-			copymem(&p.info, p_info, sizeof(T));
+			memcpy(&p.info, p_info, sizeof(T));
 			_packets.write(p);
 		}
 
@@ -86,7 +85,7 @@ public:
 		ERR_FAIL_COND_V(p_bytes < (int)p.size, ERR_OUT_OF_MEMORY);
 
 		r_read = p.size;
-		copymem(r_info, &p.info, sizeof(T));
+		memcpy(r_info, &p.info, sizeof(T));
 		_payload.read(r_payload, p.size);
 		return OK;
 	}
diff --git a/modules/websocket/websocket_client.cpp b/modules/websocket/websocket_client.cpp
index 425013f811..1e9183ebfa 100644
--- a/modules/websocket/websocket_client.cpp
+++ b/modules/websocket/websocket_client.cpp
@@ -43,34 +43,18 @@ Error WebSocketClient::connect_to_url(String p_url, const Vector<String> p_proto
 
 	String host = p_url;
 	String path = "/";
-	int p_len = -1;
+	String scheme = "";
 	int port = 80;
-	bool ssl = false;
-	if (host.begins_with("wss://")) {
-		ssl = true; // we should implement this
-		host = host.substr(6, host.length() - 6);
-		port = 443;
-	} else {
-		ssl = false;
-		if (host.begins_with("ws://")) {
-			host = host.substr(5, host.length() - 5);
-		}
-	}
+	Error err = p_url.parse_url(scheme, host, port, path);
+	ERR_FAIL_COND_V_MSG(err != OK, err, "Invalid URL: " + p_url);
 
-	// Path
-	p_len = host.find("/");
-	if (p_len != -1) {
-		path = host.substr(p_len, host.length() - p_len);
-		host = host.substr(0, p_len);
+	bool ssl = false;
+	if (scheme == "wss://") {
+		ssl = true;
 	}
-
-	// Port
-	p_len = host.rfind(":");
-	if (p_len != -1 && p_len == host.find(":")) {
-		port = host.substr(p_len, host.length() - p_len).to_int();
-		host = host.substr(0, p_len);
+	if (port == 0) {
+		port = ssl ? 443 : 80;
 	}
-
 	return connect_to_host(host, path, port, ssl, p_protocols, p_custom_headers);
 }
 
diff --git a/modules/websocket/websocket_multiplayer_peer.cpp b/modules/websocket/websocket_multiplayer_peer.cpp
index 758ed66c80..fa0ef7060f 100644
--- a/modules/websocket/websocket_multiplayer_peer.cpp
+++ b/modules/websocket/websocket_multiplayer_peer.cpp
@@ -99,6 +99,8 @@ Error WebSocketMultiplayerPeer::get_packet(const uint8_t **r_buffer, int &r_buff
 		_current_packet.data = nullptr;
 	}
 
+	ERR_FAIL_COND_V(_incoming_packets.size() == 0, ERR_UNAVAILABLE);
+
 	_current_packet = _incoming_packets.front()->get();
 	_incoming_packets.pop_front();
 
@@ -168,10 +170,10 @@ Vector<uint8_t> WebSocketMultiplayerPeer::_make_pkt(uint8_t p_type, int32_t p_fr
 	out.resize(PROTO_SIZE + p_data_size);
 
 	uint8_t *w = out.ptrw();
-	copymem(&w[0], &p_type, 1);
-	copymem(&w[1], &p_from, 4);
-	copymem(&w[5], &p_to, 4);
-	copymem(&w[PROTO_SIZE], p_data, p_data_size);
+	memcpy(&w[0], &p_type, 1);
+	memcpy(&w[1], &p_from, 4);
+	memcpy(&w[5], &p_to, 4);
+	memcpy(&w[PROTO_SIZE], p_data, p_data_size);
 
 	return out;
 }
@@ -211,7 +213,7 @@ void WebSocketMultiplayerPeer::_store_pkt(int32_t p_source, int32_t p_dest, cons
 	packet.size = p_data_size;
 	packet.source = p_source;
 	packet.destination = p_dest;
-	copymem(packet.data, &p_data[PROTO_SIZE], p_data_size);
+	memcpy(packet.data, &p_data[PROTO_SIZE], p_data_size);
 	_incoming_packets.push_back(packet);
 	emit_signal("peer_packet", p_source);
 }
@@ -263,9 +265,9 @@ void WebSocketMultiplayerPeer::_process_multiplayer(Ref<WebSocketPeer> p_peer, u
 	uint8_t type = 0;
 	uint32_t from = 0;
 	int32_t to = 0;
-	copymem(&type, in_buffer, 1);
-	copymem(&from, &in_buffer[1], 4);
-	copymem(&to, &in_buffer[5], 4);
+	memcpy(&type, in_buffer, 1);
+	memcpy(&from, &in_buffer[1], 4);
+	memcpy(&to, &in_buffer[5], 4);
 
 	if (is_server()) { // Server can resend
 
@@ -299,7 +301,7 @@ void WebSocketMultiplayerPeer::_process_multiplayer(Ref<WebSocketPeer> p_peer, u
 		// System message
 		ERR_FAIL_COND(data_size < 4);
 		int id = 0;
-		copymem(&id, &in_buffer[PROTO_SIZE], 4);
+		memcpy(&id, &in_buffer[PROTO_SIZE], 4);
 
 		switch (type) {
 			case SYS_ADD: // Add peer
diff --git a/modules/webxr/native/library_godot_webxr.js b/modules/webxr/native/library_godot_webxr.js
index 8e9ef8a73c..6e19a8ac6e 100644
--- a/modules/webxr/native/library_godot_webxr.js
+++ b/modules/webxr/native/library_godot_webxr.js
@@ -71,10 +71,8 @@ const GodotWebXR = {
 			// enabled or disabled. When using the WebXR API Emulator, this
 			// gets picked up automatically, however, in the Oculus Browser
 			// on the Quest, we need to pause and resume the main loop.
-			Browser.pauseAsyncCallbacks();
 			Browser.mainLoop.pause();
 			window.setTimeout(function () {
-				Browser.resumeAsyncCallbacks();
 				Browser.mainLoop.resume();
 			}, 0);
 		},
diff --git a/modules/webxr/webxr_interface_js.cpp b/modules/webxr/webxr_interface_js.cpp
index 4dce2c2b23..06f3fe6284 100644
--- a/modules/webxr/webxr_interface_js.cpp
+++ b/modules/webxr/webxr_interface_js.cpp
@@ -253,7 +253,7 @@ bool WebXRInterfaceJS::initialize() {
 void WebXRInterfaceJS::uninitialize() {
 	if (initialized) {
 		XRServer *xr_server = XRServer::get_singleton();
-		if (xr_server != NULL) {
+		if (xr_server != nullptr) {
 			// no longer our primary interface
 			xr_server->clear_primary_interface_if(this);
 		}
diff --git a/modules/xatlas_unwrap/register_types.cpp b/modules/xatlas_unwrap/register_types.cpp
index e1f9521a48..8913ef1b65 100644
--- a/modules/xatlas_unwrap/register_types.cpp
+++ b/modules/xatlas_unwrap/register_types.cpp
@@ -29,26 +29,19 @@
 /*************************************************************************/
 
 #include "register_types.h"
-
-#include "core/error/error_macros.h"
-
 #include "core/crypto/crypto_core.h"
-
 #include "thirdparty/xatlas/xatlas.h"
 
-#include <stdio.h>
-#include <stdlib.h>
+extern bool (*array_mesh_lightmap_unwrap_callback)(float p_texel_size, const float *p_vertices, const float *p_normals, int p_vertex_count, const int *p_indices, int p_index_count, const uint8_t *p_cache_data, bool *r_use_cache, uint8_t **r_mesh_cache, int *r_mesh_cache_size, float **r_uv, int **r_vertex, int *r_vertex_count, int **r_index, int *r_index_count, int *r_size_hint_x, int *r_size_hint_y);
 
-extern bool (*array_mesh_lightmap_unwrap_callback)(float p_texel_size, const float *p_vertices, const float *p_normals, int p_vertex_count, const int *p_indices, int p_index_count, float **r_uv, int **r_vertex, int *r_vertex_count, int **r_index, int *r_index_count, int *r_size_hint_x, int *r_size_hint_y, int *&r_cache_data, unsigned int &r_cache_size, bool &r_used_cache);
-
-bool xatlas_mesh_lightmap_unwrap_callback(float p_texel_size, const float *p_vertices, const float *p_normals, int p_vertex_count, const int *p_indices, int p_index_count, float **r_uvs, int **r_vertices, int *r_vertex_count, int **r_indices, int *r_index_count, int *r_size_hint_x, int *r_size_hint_y, int *&r_cache_data, unsigned int &r_cache_size, bool &r_used_cache) {
+bool xatlas_mesh_lightmap_unwrap_callback(float p_texel_size, const float *p_vertices, const float *p_normals, int p_vertex_count, const int *p_indices, int p_index_count, const uint8_t *p_cache_data, bool *r_use_cache, uint8_t **r_mesh_cache, int *r_mesh_cache_size, float **r_uv, int **r_vertex, int *r_vertex_count, int **r_index, int *r_index_count, int *r_size_hint_x, int *r_size_hint_y) {
 	CryptoCore::MD5Context ctx;
 	ctx.start();
 
 	ctx.update((unsigned char *)&p_texel_size, sizeof(float));
 	ctx.update((unsigned char *)p_indices, sizeof(int) * p_index_count);
-	ctx.update((unsigned char *)p_vertices, sizeof(float) * p_vertex_count);
-	ctx.update((unsigned char *)p_normals, sizeof(float) * p_vertex_count);
+	ctx.update((unsigned char *)p_vertices, sizeof(float) * p_vertex_count * 3);
+	ctx.update((unsigned char *)p_normals, sizeof(float) * p_vertex_count * 3);
 
 	unsigned char hash[16];
 	ctx.finish(hash);
@@ -56,38 +49,37 @@ bool xatlas_mesh_lightmap_unwrap_callback(float p_texel_size, const float *p_ver
 	bool cached = false;
 	unsigned int cache_idx = 0;
 
-	if (r_used_cache && r_cache_size) {
-		//Check if hash is in cache data
+	*r_mesh_cache = nullptr;
+	*r_mesh_cache_size = 0;
 
-		int *cache_data = r_cache_data;
+	if (p_cache_data) {
+		//Check if hash is in cache data
+		int *cache_data = (int *)p_cache_data;
 		int n_entries = cache_data[0];
-		unsigned int r_idx = 1;
+		unsigned int read_idx = 1;
 		for (int i = 0; i < n_entries; ++i) {
-			if (memcmp(&cache_data[r_idx], hash, 16) == 0) {
+			if (memcmp(&cache_data[read_idx], hash, 16) == 0) {
 				cached = true;
-				cache_idx = r_idx;
+				cache_idx = read_idx;
 				break;
 			}
 
-			r_idx += 4; // hash
-			r_idx += 2; // size hint
+			read_idx += 4; // hash
+			read_idx += 2; // size hint
 
-			int vertex_count = cache_data[r_idx];
-			r_idx += 1; // vertex count
-			r_idx += vertex_count; // vertex
-			r_idx += vertex_count * 2; // uvs
+			int vertex_count = cache_data[read_idx];
+			read_idx += 1; // vertex count
+			read_idx += vertex_count; // vertex
+			read_idx += vertex_count * 2; // uvs
 
-			int index_count = cache_data[r_idx];
-			r_idx += 1; // index count
-			r_idx += index_count; // indices
+			int index_count = cache_data[read_idx];
+			read_idx += 1; // index count
+			read_idx += index_count; // indices
 		}
 	}
 
-	if (r_used_cache && cached) {
-		int *cache_data = r_cache_data;
-
-		// Return cache data pointer to the caller
-		r_cache_data = &cache_data[cache_idx];
+	if (cached) {
+		int *cache_data = (int *)p_cache_data;
 
 		cache_idx += 4;
 
@@ -99,96 +91,103 @@ bool xatlas_mesh_lightmap_unwrap_callback(float p_texel_size, const float *p_ver
 		// Load vertices
 		*r_vertex_count = cache_data[cache_idx];
 		cache_idx++;
-		*r_vertices = &cache_data[cache_idx];
+		*r_vertex = &cache_data[cache_idx];
 		cache_idx += *r_vertex_count;
 
 		// Load UVs
-		*r_uvs = (float *)&cache_data[cache_idx];
+		*r_uv = (float *)&cache_data[cache_idx];
 		cache_idx += *r_vertex_count * 2;
 
 		// Load indices
 		*r_index_count = cache_data[cache_idx];
 		cache_idx++;
-		*r_indices = &cache_data[cache_idx];
-
-		// Return cache data size to the caller
-		r_cache_size = sizeof(int) * (4 + 2 + 1 + *r_vertex_count + (*r_vertex_count * 2) + 1 + *r_index_count); // hash + size hint + vertex_count + vertices + uvs + index_count + indices
-		r_used_cache = true;
-		return true;
-	}
-
-	//set up input mesh
-	xatlas::MeshDecl input_mesh;
-	input_mesh.indexData = p_indices;
-	input_mesh.indexCount = p_index_count;
-	input_mesh.indexFormat = xatlas::IndexFormat::UInt32;
-
-	input_mesh.vertexCount = p_vertex_count;
-	input_mesh.vertexPositionData = p_vertices;
-	input_mesh.vertexPositionStride = sizeof(float) * 3;
-	input_mesh.vertexNormalData = p_normals;
-	input_mesh.vertexNormalStride = sizeof(uint32_t) * 3;
-	input_mesh.vertexUvData = nullptr;
-	input_mesh.vertexUvStride = 0;
-
-	xatlas::ChartOptions chart_options;
-	xatlas::PackOptions pack_options;
-
-	pack_options.maxChartSize = 4096;
-	pack_options.blockAlign = true;
-	pack_options.padding = 1;
-	pack_options.texelsPerUnit = 1.0 / p_texel_size;
+		*r_index = &cache_data[cache_idx];
+	} else {
+		// set up input mesh
+		xatlas::MeshDecl input_mesh;
+		input_mesh.indexData = p_indices;
+		input_mesh.indexCount = p_index_count;
+		input_mesh.indexFormat = xatlas::IndexFormat::UInt32;
+
+		input_mesh.vertexCount = p_vertex_count;
+		input_mesh.vertexPositionData = p_vertices;
+		input_mesh.vertexPositionStride = sizeof(float) * 3;
+		input_mesh.vertexNormalData = p_normals;
+		input_mesh.vertexNormalStride = sizeof(float) * 3;
+		input_mesh.vertexUvData = nullptr;
+		input_mesh.vertexUvStride = 0;
+
+		xatlas::ChartOptions chart_options;
+		chart_options.fixWinding = true;
+
+		xatlas::PackOptions pack_options;
+		pack_options.padding = 1;
+		pack_options.maxChartSize = 4094; // Lightmap atlassing needs 2 for padding between meshes, so 4096-2
+		pack_options.blockAlign = true;
+		pack_options.texelsPerUnit = 1.0 / p_texel_size;
+
+		xatlas::Atlas *atlas = xatlas::Create();
+
+		xatlas::AddMeshError err = xatlas::AddMesh(atlas, input_mesh, 1);
+		if (err != xatlas::AddMeshError::Success) {
+			// Release the atlas before bailing out so it does not leak.
+			xatlas::Destroy(atlas);
+			ERR_FAIL_V_MSG(false, xatlas::StringForEnum(err));
+		}
+
+		xatlas::Generate(atlas, chart_options, pack_options);
+
+		*r_size_hint_x = atlas->width;
+		*r_size_hint_y = atlas->height;
+
+		float w = *r_size_hint_x;
+		float h = *r_size_hint_y;
+
+		if (w == 0 || h == 0) {
+			xatlas::Destroy(atlas);
+			return false; // Could not bake because there is no area.
+		}
 
-	xatlas::Atlas *atlas = xatlas::Create();
-	printf("Adding mesh..\n");
-	xatlas::AddMeshError err = xatlas::AddMesh(atlas, input_mesh, 1);
-	ERR_FAIL_COND_V_MSG(err != xatlas::AddMeshError::Success, false, xatlas::StringForEnum(err));
+		const xatlas::Mesh &output = atlas->meshes[0];
+
+		*r_vertex = (int *)memalloc(sizeof(int) * output.vertexCount);
+		*r_uv = (float *)memalloc(sizeof(float) * output.vertexCount * 2);
+		*r_index = (int *)memalloc(sizeof(int) * output.indexCount);
+		if (!*r_vertex || !*r_uv || !*r_index) {
+			// Free whichever buffers were allocated, and the atlas, so a failed unwrap leaks nothing.
+			if (*r_vertex) {
+				memfree(*r_vertex);
+				*r_vertex = nullptr;
+			}
+			if (*r_uv) {
+				memfree(*r_uv);
+				*r_uv = nullptr;
+			}
+			if (*r_index) {
+				memfree(*r_index);
+				*r_index = nullptr;
+			}
+			xatlas::Destroy(atlas);
+			ERR_FAIL_V_MSG(false, "Out of memory.");
+		}
+
+		for (uint32_t i = 0; i < output.vertexCount; i++) {
+			(*r_vertex)[i] = output.vertexArray[i].xref;
+			(*r_uv)[i * 2 + 0] = output.vertexArray[i].uv[0] / w;
+			(*r_uv)[i * 2 + 1] = output.vertexArray[i].uv[1] / h;
+		}
 
-	printf("Generate..\n");
-	xatlas::Generate(atlas, chart_options, pack_options);
+		*r_vertex_count = output.vertexCount;
 
-	*r_size_hint_x = atlas->width;
-	*r_size_hint_y = atlas->height;
+		for (uint32_t i = 0; i < output.indexCount; i++) {
+			(*r_index)[i] = output.indexArray[i];
+		}
 
-	float w = *r_size_hint_x;
-	float h = *r_size_hint_y;
+		*r_index_count = output.indexCount;
 
-	if (w == 0 || h == 0) {
 		xatlas::Destroy(atlas);
-		return false; //could not bake because there is no area
-	}
-
-	const xatlas::Mesh &output = atlas->meshes[0];
-
-	*r_vertices = (int *)malloc(sizeof(int) * output.vertexCount);
-	ERR_FAIL_NULL_V_MSG(*r_vertices, false, "Out of memory.");
-	*r_uvs = (float *)malloc(sizeof(float) * output.vertexCount * 2);
-	ERR_FAIL_NULL_V_MSG(*r_uvs, false, "Out of memory.");
-	*r_indices = (int *)malloc(sizeof(int) * output.indexCount);
-	ERR_FAIL_NULL_V_MSG(*r_indices, false, "Out of memory.");
-
-	float max_x = 0.0;
-	float max_y = 0.0;
-	for (uint32_t i = 0; i < output.vertexCount; i++) {
-		(*r_vertices)[i] = output.vertexArray[i].xref;
-		(*r_uvs)[i * 2 + 0] = output.vertexArray[i].uv[0] / w;
-		(*r_uvs)[i * 2 + 1] = output.vertexArray[i].uv[1] / h;
-		max_x = MAX(max_x, output.vertexArray[i].uv[0]);
-		max_y = MAX(max_y, output.vertexArray[i].uv[1]);
 	}
 
-	printf("Final texture size: %f,%f - max %f,%f\n", w, h, max_x, max_y);
-	*r_vertex_count = output.vertexCount;
+	if (*r_use_cache) {
+		// Build cache data for current mesh
 
-	for (uint32_t i = 0; i < output.indexCount; i++) {
-		(*r_indices)[i] = output.indexArray[i];
-	}
-
-	*r_index_count = output.indexCount;
-
-	xatlas::Destroy(atlas);
-
-	if (r_used_cache) {
 		unsigned int new_cache_size = 4 + 2 + 1 + *r_vertex_count + (*r_vertex_count * 2) + 1 + *r_index_count; // hash + size hint + vertex_count + vertices + uvs + index_count + indices
 		new_cache_size *= sizeof(int);
 		int *new_cache_data = (int *)memalloc(new_cache_size);
@@ -208,11 +207,11 @@ bool xatlas_mesh_lightmap_unwrap_callback(float p_texel_size, const float *p_ver
 		new_cache_idx++;
 
 		// vertices
-		memcpy(&new_cache_data[new_cache_idx], *r_vertices, sizeof(int) * *r_vertex_count);
+		memcpy(&new_cache_data[new_cache_idx], *r_vertex, sizeof(int) * (*r_vertex_count));
 		new_cache_idx += *r_vertex_count;
 
 		// uvs
-		memcpy(&new_cache_data[new_cache_idx], *r_uvs, sizeof(float) * *r_vertex_count * 2);
+		memcpy(&new_cache_data[new_cache_idx], *r_uv, sizeof(float) * (*r_vertex_count) * 2);
 		new_cache_idx += *r_vertex_count * 2;
 
 		// index count
@@ -220,15 +219,15 @@ bool xatlas_mesh_lightmap_unwrap_callback(float p_texel_size, const float *p_ver
 		new_cache_idx++;
 
 		// indices
-		memcpy(&new_cache_data[new_cache_idx], *r_indices, sizeof(int) * *r_index_count);
-		new_cache_idx += *r_index_count;
+		memcpy(&new_cache_data[new_cache_idx], *r_index, sizeof(int) * (*r_index_count));
 
 		// Return cache data to the caller
-		r_cache_data = new_cache_data;
-		r_cache_size = new_cache_size;
-		r_used_cache = false;
+		*r_mesh_cache = (uint8_t *)new_cache_data;
+		*r_mesh_cache_size = new_cache_size;
 	}
 
+	*r_use_cache = cached; // Return whether cache was used.
+
 	return true;
 }
 
diff --git a/platform/android/detect.py b/platform/android/detect.py
index 996b6dcf41..2a80a3c45b 100644
--- a/platform/android/detect.py
+++ b/platform/android/detect.py
@@ -258,8 +258,10 @@ def configure(env):
     env.Append(CPPFLAGS=["-isystem", env["ANDROID_NDK_ROOT"] + "/sources/cxx-stl/llvm-libc++abi/include"])
 
     # Disable exceptions and rtti on non-tools (template) builds
-    if env["tools"] or env["builtin_icu"]:
+    if env["tools"]:
         env.Append(CXXFLAGS=["-frtti"])
+    elif env["builtin_icu"]:
+        env.Append(CXXFLAGS=["-frtti", "-fno-exceptions"])
     else:
         env.Append(CXXFLAGS=["-fno-rtti", "-fno-exceptions"])
         # Don't use dynamic_cast, necessary with no-rtti.
diff --git a/platform/android/export/export.cpp b/platform/android/export/export.cpp
index 5f9f420b59..cd3f00f935 100644
--- a/platform/android/export/export.cpp
+++ b/platform/android/export/export.cpp
@@ -201,8 +201,10 @@ static const char *android_perms[] = {
 	nullptr
 };
 
-static const char *SPLASH_IMAGE_EXPORT_PATH = "res/drawable/splash.png";
-static const char *SPLASH_BG_COLOR_PATH = "res/drawable/splash_bg_color.png";
+static const char *SPLASH_IMAGE_EXPORT_PATH = "res/drawable-nodpi/splash.png";
+static const char *LEGACY_BUILD_SPLASH_IMAGE_EXPORT_PATH = "res/drawable-nodpi-v4/splash.png";
+static const char *SPLASH_BG_COLOR_PATH = "res/drawable-nodpi/splash_bg_color.png";
+static const char *LEGACY_BUILD_SPLASH_BG_COLOR_PATH = "res/drawable-nodpi-v4/splash_bg_color.png";
 static const char *SPLASH_CONFIG_PATH = "res://android/build/res/drawable/splash_drawable.xml";
 
 const String SPLASH_CONFIG_XML_CONTENT = R"SPLASH(<?xml version="1.0" encoding="utf-8"?>
@@ -210,7 +212,7 @@ const String SPLASH_CONFIG_XML_CONTENT = R"SPLASH(<?xml version="1.0" encoding="
 	<item android:drawable="@drawable/splash_bg_color" />
 	<item>
 		<bitmap
-				android:gravity="%s"
+				android:gravity="center"
 				android:filter="%s"
 				android:src="@drawable/splash" />
 	</item>
@@ -853,7 +855,6 @@ class EditorExportPlatformAndroid : public EditorExportPlatform {
 		bool screen_support_xlarge = p_preset->get("screen/support_xlarge");
 
 		int xr_mode_index = p_preset->get("xr_features/xr_mode");
-		bool focus_awareness = p_preset->get("xr_features/focus_awareness");
 
 		Vector<String> perms;
 		// Write permissions into the perms variable.
@@ -919,7 +920,6 @@ class EditorExportPlatformAndroid : public EditorExportPlatform {
 					String tname = string_table[name];
 					uint32_t attrcount = decode_uint32(&p_manifest[iofs + 20]);
 					iofs += 28;
-					bool is_focus_aware_metadata = false;
 
 					for (uint32_t i = 0; i < attrcount; i++) {
 						uint32_t attr_nspace = decode_uint32(&p_manifest[iofs]);
@@ -971,28 +971,6 @@ class EditorExportPlatformAndroid : public EditorExportPlatform {
 							}
 						}
 
-						// FIXME: `attr_value != 0xFFFFFFFF` below added as a stopgap measure for GH-32553,
-						// but the issue should be debugged further and properly addressed.
-						if (tname == "meta-data" && attrname == "name" && value == "xr_mode_metadata_name") {
-							// Update the meta-data 'android:name' attribute based on the selected XR mode.
-							if (xr_mode_index == 1 /* XRMode.OVR */) {
-								string_table.write[attr_value] = "com.samsung.android.vr.application.mode";
-							}
-						}
-
-						if (tname == "meta-data" && attrname == "value" && value == "xr_mode_metadata_value") {
-							// Update the meta-data 'android:value' attribute based on the selected XR mode.
-							if (xr_mode_index == 1 /* XRMode.OVR */) {
-								string_table.write[attr_value] = "vr_only";
-							}
-						}
-
-						if (tname == "meta-data" && attrname == "value" && is_focus_aware_metadata) {
-							// Update the focus awareness meta-data value
-							encode_uint32(xr_mode_index == /* XRMode.OVR */ 1 && focus_awareness ? 0xFFFFFFFF : 0, &p_manifest.write[iofs + 16]);
-						}
-
-						is_focus_aware_metadata = tname == "meta-data" && attrname == "name" && value == "com.oculus.vr.focusaware";
 						iofs += 20;
 					}
 
@@ -1008,15 +986,6 @@ class EditorExportPlatformAndroid : public EditorExportPlatform {
 						Vector<int> feature_versions;
 
 						if (xr_mode_index == 1 /* XRMode.OVR */) {
-							// Check for degrees of freedom
-							int dof_index = p_preset->get("xr_features/degrees_of_freedom"); // 0: none, 1: 3dof and 6dof, 2: 6dof
-
-							if (dof_index > 0) {
-								feature_names.push_back("android.hardware.vr.headtracking");
-								feature_required_list.push_back(dof_index == 2);
-								feature_versions.push_back(1);
-							}
-
 							// Check for hand tracking
 							int hand_tracking_index = p_preset->get("xr_features/hand_tracking"); // 0: none, 1: optional, 2: required
 							if (hand_tracking_index > 0) {
@@ -1502,6 +1471,21 @@ class EditorExportPlatformAndroid : public EditorExportPlatform {
 			splash_image = Ref<Image>(memnew(Image(boot_splash_png)));
 		}
 
+		if (scale_splash) {
+			Size2 screen_size = Size2(ProjectSettings::get_singleton()->get("display/window/size/width"), ProjectSettings::get_singleton()->get("display/window/size/height"));
+			int width, height;
+			if (screen_size.width > screen_size.height) {
+				// scale horizontally
+				height = screen_size.height;
+				width = splash_image->get_width() * screen_size.height / splash_image->get_height();
+			} else {
+				// scale vertically
+				width = screen_size.width;
+				height = splash_image->get_height() * screen_size.width / splash_image->get_width();
+			}
+			splash_image->resize(width, height);
+		}
+
 		// Setup the splash bg color
 		bool bg_color_valid;
 		Color bg_color = ProjectSettings::get_singleton()->get("application/boot_splash/bg_color", &bg_color_valid);
@@ -1514,8 +1498,7 @@ class EditorExportPlatformAndroid : public EditorExportPlatform {
 		splash_bg_color_image->create(splash_image->get_width(), splash_image->get_height(), false, splash_image->get_format());
 		splash_bg_color_image->fill(bg_color);
 
-		String gravity = scale_splash ? "fill" : "center";
-		String processed_splash_config_xml = vformat(SPLASH_CONFIG_XML_CONTENT, gravity, bool_to_string(apply_filter));
+		String processed_splash_config_xml = vformat(SPLASH_CONFIG_XML_CONTENT, bool_to_string(apply_filter));
 		return processed_splash_config_xml;
 	}
 
@@ -1692,9 +1675,7 @@ public:
 		r_options->push_back(ExportOption(PropertyInfo(Variant::BOOL, "graphics/opengl_debug"), false));
 
 		r_options->push_back(ExportOption(PropertyInfo(Variant::INT, "xr_features/xr_mode", PROPERTY_HINT_ENUM, "Regular,Oculus Mobile VR"), 0));
-		r_options->push_back(ExportOption(PropertyInfo(Variant::INT, "xr_features/degrees_of_freedom", PROPERTY_HINT_ENUM, "None,3DOF and 6DOF,6DOF"), 0));
 		r_options->push_back(ExportOption(PropertyInfo(Variant::INT, "xr_features/hand_tracking", PROPERTY_HINT_ENUM, "None,Optional,Required"), 0));
-		r_options->push_back(ExportOption(PropertyInfo(Variant::BOOL, "xr_features/focus_awareness"), false));
 
 		r_options->push_back(ExportOption(PropertyInfo(Variant::BOOL, "screen/immersive_mode"), true));
 		r_options->push_back(ExportOption(PropertyInfo(Variant::BOOL, "screen/support_small"), true));
@@ -1803,7 +1784,7 @@ public:
 			p_debug_flags |= DEBUG_FLAG_REMOTE_DEBUG_LOCALHOST;
 		}
 
-		String tmp_export_path = EditorSettings::get_singleton()->get_cache_dir().plus_file("tmpexport.apk");
+		String tmp_export_path = EditorSettings::get_singleton()->get_cache_dir().plus_file("tmpexport." + uitos(OS::get_singleton()->get_unix_time()) + ".apk");
 
 #define CLEANUP_AND_RETURN(m_err)                         \
 	{                                                     \
@@ -1820,6 +1801,7 @@ public:
 
 		List<String> args;
 		int rv;
+		String output;
 
 		bool remove_prev = p_preset->get("one_click_deploy/clear_previous_install");
 		String version_name = p_preset->get("version/name");
@@ -1837,7 +1819,9 @@ public:
 			args.push_back("uninstall");
 			args.push_back(get_package_name(package_name));
 
-			err = OS::get_singleton()->execute(adb, args, nullptr, &rv);
+			output.clear();
+			err = OS::get_singleton()->execute(adb, args, &output, &rv, true);
+			print_verbose(output);
 		}
 
 		print_line("Installing to device (please wait...): " + devices[p_device].name);
@@ -1852,7 +1836,9 @@ public:
 		args.push_back("-r");
 		args.push_back(tmp_export_path);
 
-		err = OS::get_singleton()->execute(adb, args, nullptr, &rv);
+		output.clear();
+		err = OS::get_singleton()->execute(adb, args, &output, &rv, true);
+		print_verbose(output);
 		if (err || rv != 0) {
 			EditorNode::add_io_error("Could not install to device.");
 			CLEANUP_AND_RETURN(ERR_CANT_CREATE);
@@ -1869,7 +1855,9 @@ public:
 				args.push_back(devices[p_device].id);
 				args.push_back("reverse");
 				args.push_back("--remove-all");
-				OS::get_singleton()->execute(adb, args, nullptr, &rv);
+				output.clear();
+				OS::get_singleton()->execute(adb, args, &output, &rv, true);
+				print_verbose(output);
 
 				if (p_debug_flags & DEBUG_FLAG_REMOTE_DEBUG) {
 					int dbg_port = EditorSettings::get_singleton()->get("network/debug/remote_port");
@@ -1880,7 +1868,9 @@ public:
 					args.push_back("tcp:" + itos(dbg_port));
 					args.push_back("tcp:" + itos(dbg_port));
 
-					OS::get_singleton()->execute(adb, args, nullptr, &rv);
+					output.clear();
+					OS::get_singleton()->execute(adb, args, &output, &rv, true);
+					print_verbose(output);
 					print_line("Reverse result: " + itos(rv));
 				}
 
@@ -1894,7 +1884,9 @@ public:
 					args.push_back("tcp:" + itos(fs_port));
 					args.push_back("tcp:" + itos(fs_port));
 
-					err = OS::get_singleton()->execute(adb, args, nullptr, &rv);
+					output.clear();
+					err = OS::get_singleton()->execute(adb, args, &output, &rv, true);
+					print_verbose(output);
 					print_line("Reverse result2: " + itos(rv));
 				}
 			} else {
@@ -1922,7 +1914,9 @@ public:
 		args.push_back("-n");
 		args.push_back(get_package_name(package_name) + "/com.godot.game.GodotApp");
 
-		err = OS::get_singleton()->execute(adb, args, nullptr, &rv);
+		output.clear();
+		err = OS::get_singleton()->execute(adb, args, &output, &rv, true);
+		print_verbose(output);
 		if (err || rv != 0) {
 			EditorNode::add_io_error("Could not execute on device.");
 			CLEANUP_AND_RETURN(ERR_CANT_CREATE);
@@ -2130,27 +2124,13 @@ public:
 
 		// Validate the Xr features are properly populated
 		int xr_mode_index = p_preset->get("xr_features/xr_mode");
-		int degrees_of_freedom = p_preset->get("xr_features/degrees_of_freedom");
 		int hand_tracking = p_preset->get("xr_features/hand_tracking");
-		bool focus_awareness = p_preset->get("xr_features/focus_awareness");
 		if (xr_mode_index != /* XRMode.OVR*/ 1) {
-			if (degrees_of_freedom > 0) {
-				valid = false;
-				err += TTR("\"Degrees Of Freedom\" is only valid when \"Xr Mode\" is \"Oculus Mobile VR\".");
-				err += "\n";
-			}
-
 			if (hand_tracking > 0) {
 				valid = false;
 				err += TTR("\"Hand Tracking\" is only valid when \"Xr Mode\" is \"Oculus Mobile VR\".");
 				err += "\n";
 			}
-
-			if (focus_awareness) {
-				valid = false;
-				err += TTR("\"Focus Awareness\" is only valid when \"Xr Mode\" is \"Oculus Mobile VR\".");
-				err += "\n";
-			}
 		}
 
 		if (int(p_preset->get("custom_template/export_format")) == EXPORT_FORMAT_AAB &&
@@ -2267,7 +2247,7 @@ public:
 				}
 				r_command_line_flags.resize(base + 4 + length);
 				encode_uint32(length, &r_command_line_flags.write[base]);
-				copymem(&r_command_line_flags.write[base + 4], command_line_argument.ptr(), length);
+				memcpy(&r_command_line_flags.write[base + 4], command_line_argument.ptr(), length);
 			}
 		}
 	}
@@ -2319,6 +2299,7 @@ public:
 			return ERR_FILE_CANT_OPEN;
 		}
 
+		String output;
 		List<String> args;
 		args.push_back("sign");
 		args.push_back("--verbose");
@@ -2334,7 +2315,9 @@ public:
 			print_verbose("Signing debug binary using: " + String("\n") + apksigner + " " + join_list(args, String(" ")));
 		}
 		int retval;
-		OS::get_singleton()->execute(apksigner, args, nullptr, &retval);
+		output.clear();
+		OS::get_singleton()->execute(apksigner, args, &output, &retval, true);
+		print_verbose(output);
 		if (retval) {
 			EditorNode::add_io_error("'apksigner' returned with error #" + itos(retval));
 			return ERR_CANT_CREATE;
@@ -2352,7 +2335,9 @@ public:
 			print_verbose("Verifying signed build using: " + String("\n") + apksigner + " " + join_list(args, String(" ")));
 		}
 
-		OS::get_singleton()->execute(apksigner, args, nullptr, &retval);
+		output.clear();
+		OS::get_singleton()->execute(apksigner, args, &output, &retval, true);
+		print_verbose(output);
 		if (retval) {
 			EditorNode::add_io_error("'apksigner' verification of " + export_label + " failed.");
 			return ERR_CANT_CREATE;
@@ -2657,7 +2642,7 @@ public:
 		FileAccess *dst_f = nullptr;
 		io2.opaque = &dst_f;
 
-		String tmp_unaligned_path = EditorSettings::get_singleton()->get_cache_dir().plus_file("tmpexport-unaligned.apk");
+		String tmp_unaligned_path = EditorSettings::get_singleton()->get_cache_dir().plus_file("tmpexport-unaligned." + uitos(OS::get_singleton()->get_unix_time()) + ".apk");
 
 #define CLEANUP_AND_RETURN(m_err)                            \
 	{                                                        \
@@ -2702,12 +2687,12 @@ public:
 			}
 
 			// Process the splash image
-			if (file == SPLASH_IMAGE_EXPORT_PATH && splash_image.is_valid() && !splash_image->is_empty()) {
+			if ((file == SPLASH_IMAGE_EXPORT_PATH || file == LEGACY_BUILD_SPLASH_IMAGE_EXPORT_PATH) && splash_image.is_valid() && !splash_image->is_empty()) {
 				_load_image_data(splash_image, data);
 			}
 
 			// Process the splash bg color image
-			if (file == SPLASH_BG_COLOR_PATH && splash_bg_color_image.is_valid() && !splash_bg_color_image->is_empty()) {
+			if ((file == SPLASH_BG_COLOR_PATH || file == LEGACY_BUILD_SPLASH_BG_COLOR_PATH) && splash_bg_color_image.is_valid() && !splash_bg_color_image->is_empty()) {
 				_load_image_data(splash_bg_color_image, data);
 			}
 
diff --git a/platform/android/export/gradle_export_util.h b/platform/android/export/gradle_export_util.h
index 40b8e90c6f..bbbb526af9 100644
--- a/platform/android/export/gradle_export_util.h
+++ b/platform/android/export/gradle_export_util.h
@@ -241,12 +241,6 @@ String _get_xr_features_tag(const Ref<EditorExportPreset> &p_preset) {
 	String manifest_xr_features;
 	bool uses_xr = (int)(p_preset->get("xr_features/xr_mode")) == 1;
 	if (uses_xr) {
-		int dof_index = p_preset->get("xr_features/degrees_of_freedom"); // 0: none, 1: 3dof and 6dof, 2: 6dof
-		if (dof_index == 1) {
-			manifest_xr_features += "    <uses-feature tools:node=\"replace\" android:name=\"android.hardware.vr.headtracking\" android:required=\"false\" android:version=\"1\" />\n";
-		} else if (dof_index == 2) {
-			manifest_xr_features += "    <uses-feature tools:node=\"replace\" android:name=\"android.hardware.vr.headtracking\" android:required=\"true\" android:version=\"1\" />\n";
-		}
 		int hand_tracking_index = p_preset->get("xr_features/hand_tracking"); // 0: none, 1: optional, 2: required
 		if (hand_tracking_index == 1) {
 			manifest_xr_features += "    <uses-feature tools:node=\"replace\" android:name=\"oculus.software.handtracking\" android:required=\"false\" />\n";
@@ -278,10 +272,7 @@ String _get_activity_tag(const Ref<EditorExportPreset> &p_preset) {
 			"tools:replace=\"android:screenOrientation\" "
 			"android:screenOrientation=\"%s\">\n",
 			orientation);
-	if (uses_xr) {
-		String focus_awareness = bool_to_string(p_preset->get("xr_features/focus_awareness"));
-		manifest_activity_text += vformat("            <meta-data tools:node=\"replace\" android:name=\"com.oculus.vr.focusaware\" android:value=\"%s\" />\n", focus_awareness);
-	} else {
+	if (!uses_xr) {
 		manifest_activity_text += "            <meta-data tools:node=\"remove\" android:name=\"com.oculus.vr.focusaware\" />\n";
 	}
 	manifest_activity_text += "        </activity>\n";
@@ -289,16 +280,11 @@ String _get_activity_tag(const Ref<EditorExportPreset> &p_preset) {
 }
 
 String _get_application_tag(const Ref<EditorExportPreset> &p_preset) {
-	bool uses_xr = (int)(p_preset->get("xr_features/xr_mode")) == 1;
 	String manifest_application_text =
 			"    <application android:label=\"@string/godot_project_name_string\"\n"
 			"        android:allowBackup=\"false\" tools:ignore=\"GoogleAppIndexingWarning\"\n"
-			"        android:icon=\"@mipmap/icon\">\n\n"
-			"        <meta-data tools:node=\"remove\" android:name=\"xr_mode_metadata_name\" />\n";
+			"        android:icon=\"@mipmap/icon\">\n\n";
 
-	if (uses_xr) {
-		manifest_application_text += "        <meta-data tools:node=\"replace\" android:name=\"com.samsung.android.vr.application.mode\" android:value=\"vr_only\" />\n";
-	}
 	manifest_application_text += _get_activity_tag(p_preset);
 	manifest_application_text += "    </application>\n";
 	return manifest_application_text;
diff --git a/platform/android/file_access_android.cpp b/platform/android/file_access_android.cpp
index e288c16777..705891713f 100644
--- a/platform/android/file_access_android.cpp
+++ b/platform/android/file_access_android.cpp
@@ -114,7 +114,7 @@ uint8_t FileAccessAndroid::get_8() const {
 }
 
 int FileAccessAndroid::get_buffer(uint8_t *p_dst, int p_length) const {
-	ERR_FAIL_COND_V(!p_dst, -1);
+	ERR_FAIL_COND_V(!p_dst && p_length > 0, -1);
 	ERR_FAIL_COND_V(p_length < 0, -1);
 
 	off_t r = AAsset_read(a, p_dst, p_length);
diff --git a/platform/android/java/app/AndroidManifest.xml b/platform/android/java/app/AndroidManifest.xml
index 948fa8c00b..15feea15a4 100644
--- a/platform/android/java/app/AndroidManifest.xml
+++ b/platform/android/java/app/AndroidManifest.xml
@@ -30,11 +30,6 @@
         <!-- The following metadata values are replaced when Godot exports, modifying them here has no effect. -->
         <!-- Do these changes in the export preset. Adding new ones is fine. -->
 
-        <!-- XR mode metadata. This is modified by the exporter based on the selected xr mode. DO NOT CHANGE the values here. -->
-        <meta-data
-            android:name="xr_mode_metadata_name"
-            android:value="xr_mode_metadata_value" />
-
         <activity
             android:name=".GodotApp"
             android:label="@string/godot_project_name_string"
@@ -45,8 +40,8 @@
             android:resizeableActivity="false"
             tools:ignore="UnusedAttribute" >
 
-            <!-- Focus awareness metadata is updated at export time if the user enables it in the 'Xr Features' section. -->
-            <meta-data android:name="com.oculus.vr.focusaware" android:value="false" />
+            <!-- Focus awareness metadata is removed at export time if the xr mode is not VR. -->
+            <meta-data android:name="com.oculus.vr.focusaware" android:value="true" />
 
             <intent-filter>
                 <action android:name="android.intent.action.MAIN" />
diff --git a/platform/android/java/app/build.gradle b/platform/android/java/app/build.gradle
index f103f22db2..1b1fb47bd8 100644
--- a/platform/android/java/app/build.gradle
+++ b/platform/android/java/app/build.gradle
@@ -157,7 +157,7 @@ android {
             aidl.srcDirs = ['aidl']
             assets.srcDirs = ['assets']
         }
-        debug.jniLibs.srcDirs = ['libs/debug']
+        debug.jniLibs.srcDirs = ['libs/debug', 'libs/debug/vulkan_validation_layers']
         release.jniLibs.srcDirs = ['libs/release']
     }
 
diff --git a/platform/android/java/app/config.gradle b/platform/android/java/app/config.gradle
index c0ae4007d2..b278d15bdf 100644
--- a/platform/android/java/app/config.gradle
+++ b/platform/android/java/app/config.gradle
@@ -3,7 +3,7 @@ ext.versions = [
     compileSdk         : 29,
     minSdk             : 18,
     targetSdk          : 29,
-    buildTools         : '30.0.1',
+    buildTools         : '30.0.3',
     supportCoreUtils   : '1.0.0',
     kotlinVersion      : '1.4.10',
     v4Support          : '1.0.0',
@@ -239,5 +239,5 @@ ext.shouldSign = { ->
 }
 
 ext.shouldNotStrip = { ->
-    return isAndroidStudio()
+    return isAndroidStudio() || project.hasProperty("doNotStrip")
 }
diff --git a/platform/android/java/app/res/drawable/splash.png b/platform/android/java/app/res/drawable-nodpi/splash.png
index 7bddd4325a..7bddd4325a 100644
--- a/platform/android/java/app/res/drawable/splash.png
+++ b/platform/android/java/app/res/drawable-nodpi/splash.png
diff --git a/platform/android/java/app/res/drawable/splash_bg_color.png b/platform/android/java/app/res/drawable-nodpi/splash_bg_color.png
index 004b6fd508..004b6fd508 100644
--- a/platform/android/java/app/res/drawable/splash_bg_color.png
+++ b/platform/android/java/app/res/drawable-nodpi/splash_bg_color.png
diff --git a/platform/android/java/build.gradle b/platform/android/java/build.gradle
index ec02b0fc7a..a7fe500be2 100644
--- a/platform/android/java/build.gradle
+++ b/platform/android/java/build.gradle
@@ -112,7 +112,7 @@ task copyReleaseAARToBin(type: Copy) {
  * The zip file also includes some gradle tools to allow building of the custom build.
  */
 task zipCustomBuild(type: Zip) {
-    dependsOn ':generateGodotTemplates'
+    onlyIf { generateGodotTemplates.state.executed || generateDevTemplate.state.executed }
     doFirst {
         logger.lifecycle("Generating Godot custom build template")
     }
@@ -122,16 +122,17 @@ task zipCustomBuild(type: Zip) {
     destinationDir(file(binDir))
 }
 
-/**
- * Master task used to coordinate the tasks defined above to generate the set of Godot templates.
- */
-task generateGodotTemplates(type: GradleBuild) {
+def templateExcludedBuildTask() {
     // We exclude these gradle tasks so we can run the scons command manually.
+    def excludedTasks = []
     for (String buildType : supportedTargets) {
-        startParameter.excludedTaskNames += ":lib:" + getSconsTaskName(buildType)
+        excludedTasks += ":lib:" + getSconsTaskName(buildType)
     }
+    return excludedTasks
+}
 
-    tasks = []
+def templateBuildTasks() {
+    def tasks = []
 
     // Only build the apks and aar files for which we have native shared libraries.
     for (String target : supportedTargets) {
@@ -152,6 +153,29 @@ task generateGodotTemplates(type: GradleBuild) {
         }
     }
 
+    return tasks
+}
+
+/**
+ * Master task used to coordinate the tasks defined above to generate the set of Godot templates.
+ */
+task generateGodotTemplates(type: GradleBuild) {
+    startParameter.excludedTaskNames = templateExcludedBuildTask()
+    tasks = templateBuildTasks()
+
+    finalizedBy 'zipCustomBuild'
+}
+
+/**
+ * Generates the same output as generateGodotTemplates but with dev symbols
+ */
+task generateDevTemplate (type: GradleBuild) {
+    // add parameter to set symbols to true
+    startParameter.projectProperties += [doNotStrip: true]
+
+    startParameter.excludedTaskNames = templateExcludedBuildTask()
+    tasks = templateBuildTasks()
+
     finalizedBy 'zipCustomBuild'
 }
 
diff --git a/platform/android/java/gradle.properties b/platform/android/java/gradle.properties
index 2dc069ad2f..6b3b62a9da 100644
--- a/platform/android/java/gradle.properties
+++ b/platform/android/java/gradle.properties
@@ -12,7 +12,7 @@ android.useAndroidX=true
 
 # Specifies the JVM arguments used for the daemon process.
 # The setting is particularly useful for tweaking memory settings.
-org.gradle.jvmargs=-Xmx1536m
+org.gradle.jvmargs=-Xmx4536m
 
 # When configured, Gradle will run in incubating parallel mode.
 # This option should only be used with decoupled projects. More details, visit
diff --git a/platform/android/java/lib/src/org/godotengine/godot/FullScreenGodotApp.java b/platform/android/java/lib/src/org/godotengine/godot/FullScreenGodotApp.java
index ec2ace4821..1ed16e04ca 100644
--- a/platform/android/java/lib/src/org/godotengine/godot/FullScreenGodotApp.java
+++ b/platform/android/java/lib/src/org/godotengine/godot/FullScreenGodotApp.java
@@ -63,30 +63,27 @@ public abstract class FullScreenGodotApp extends FragmentActivity implements God
 
 	@Override
 	public void onNewIntent(Intent intent) {
+		super.onNewIntent(intent);
 		if (godotFragment != null) {
 			godotFragment.onNewIntent(intent);
-		} else {
-			super.onNewIntent(intent);
 		}
 	}
 
 	@CallSuper
 	@Override
 	public void onActivityResult(int requestCode, int resultCode, Intent data) {
+		super.onActivityResult(requestCode, resultCode, data);
 		if (godotFragment != null) {
 			godotFragment.onActivityResult(requestCode, resultCode, data);
-		} else {
-			super.onActivityResult(requestCode, resultCode, data);
 		}
 	}
 
 	@CallSuper
 	@Override
 	public void onRequestPermissionsResult(int requestCode, String[] permissions, int[] grantResults) {
+		super.onRequestPermissionsResult(requestCode, permissions, grantResults);
 		if (godotFragment != null) {
 			godotFragment.onRequestPermissionsResult(requestCode, permissions, grantResults);
-		} else {
-			super.onRequestPermissionsResult(requestCode, permissions, grantResults);
 		}
 	}
 
diff --git a/platform/android/vulkan/vulkan_context_android.cpp b/platform/android/vulkan/vulkan_context_android.cpp
index 1bf85f07f1..63f2026fae 100644
--- a/platform/android/vulkan/vulkan_context_android.cpp
+++ b/platform/android/vulkan/vulkan_context_android.cpp
@@ -52,10 +52,10 @@ int VulkanContextAndroid::window_create(ANativeWindow *p_window, int p_width, in
 	return _window_create(DisplayServer::MAIN_WINDOW_ID, surface, p_width, p_height);
 }
 
-VulkanContextAndroid::VulkanContextAndroid() {
-	// TODO: fix validation layers
-	use_validation_layers = false;
-}
+bool VulkanContextAndroid::_use_validation_layers() {
+	uint32_t count = 0;
+	_get_preferred_validation_layers(&count, nullptr);
 
-VulkanContextAndroid::~VulkanContextAndroid() {
+	// On Android, we use validation layers automatically if they were explicitly linked with the app.
+	return count > 0;
 }
diff --git a/platform/android/vulkan/vulkan_context_android.h b/platform/android/vulkan/vulkan_context_android.h
index c608f2d665..5a84eaf8f3 100644
--- a/platform/android/vulkan/vulkan_context_android.h
+++ b/platform/android/vulkan/vulkan_context_android.h
@@ -36,13 +36,16 @@
 struct ANativeWindow;
 
 class VulkanContextAndroid : public VulkanContext {
-	virtual const char *_get_platform_surface_extension() const;
+	virtual const char *_get_platform_surface_extension() const override;
 
 public:
 	int window_create(ANativeWindow *p_window, int p_width, int p_height);
 
-	VulkanContextAndroid();
-	~VulkanContextAndroid();
+	VulkanContextAndroid() = default;
+	~VulkanContextAndroid() override = default;
+
+protected:
+	bool _use_validation_layers() override;
 };
 
 #endif // VULKAN_CONTEXT_ANDROID_H
diff --git a/platform/iphone/display_server_iphone.mm b/platform/iphone/display_server_iphone.mm
index 05dc78bb4d..b590ce065c 100644
--- a/platform/iphone/display_server_iphone.mm
+++ b/platform/iphone/display_server_iphone.mm
@@ -136,13 +136,13 @@ DisplayServerIPhone::~DisplayServerIPhone() {
 		if (rendering_device_vulkan) {
 			rendering_device_vulkan->finalize();
 			memdelete(rendering_device_vulkan);
-			rendering_device_vulkan = NULL;
+			rendering_device_vulkan = nullptr;
 		}
 
 		if (context_vulkan) {
 			context_vulkan->window_destroy(MAIN_WINDOW_ID);
 			memdelete(context_vulkan);
-			context_vulkan = NULL;
+			context_vulkan = nullptr;
 		}
 	}
 #endif
diff --git a/platform/iphone/godot_iphone.mm b/platform/iphone/godot_iphone.mm
index 62bc2e6e52..6c3e1eabde 100644
--- a/platform/iphone/godot_iphone.mm
+++ b/platform/iphone/godot_iphone.mm
@@ -50,7 +50,7 @@ int add_path(int p_argc, char **p_args) {
 
 	p_args[p_argc++] = (char *)"--path";
 	p_args[p_argc++] = (char *)[str cStringUsingEncoding:NSUTF8StringEncoding];
-	p_args[p_argc] = NULL;
+	p_args[p_argc] = nullptr;
 
 	return p_argc;
 };
@@ -69,7 +69,7 @@ int add_cmdline(int p_argc, char **p_args) {
 		p_args[p_argc++] = (char *)[str cStringUsingEncoding:NSUTF8StringEncoding];
 	};
 
-	p_args[p_argc] = NULL;
+	p_args[p_argc] = nullptr;
 
 	return p_argc;
 };
diff --git a/platform/iphone/godot_view.mm b/platform/iphone/godot_view.mm
index 468fa2928a..00a88d79c5 100644
--- a/platform/iphone/godot_view.mm
+++ b/platform/iphone/godot_view.mm
@@ -291,14 +291,14 @@ static const float earth_gravity = 9.80665;
 
 - (void)initTouches {
 	for (int i = 0; i < max_touches; i++) {
-		godot_touches[i] = NULL;
+		godot_touches[i] = nullptr;
 	}
 }
 
 - (int)getTouchIDForTouch:(UITouch *)p_touch {
 	int first = -1;
 	for (int i = 0; i < max_touches; i++) {
-		if (first == -1 && godot_touches[i] == NULL) {
+		if (first == -1 && godot_touches[i] == nullptr) {
 			first = i;
 			continue;
 		}
@@ -318,11 +318,11 @@ static const float earth_gravity = 9.80665;
 - (int)removeTouch:(UITouch *)p_touch {
 	int remaining = 0;
 	for (int i = 0; i < max_touches; i++) {
-		if (godot_touches[i] == NULL) {
+		if (godot_touches[i] == nullptr) {
 			continue;
 		}
 		if (godot_touches[i] == p_touch) {
-			godot_touches[i] = NULL;
+			godot_touches[i] = nullptr;
 		} else {
 			++remaining;
 		}
@@ -332,7 +332,7 @@ static const float earth_gravity = 9.80665;
 
 - (void)clearTouches {
 	for (int i = 0; i < max_touches; i++) {
-		godot_touches[i] = NULL;
+		godot_touches[i] = nullptr;
 	}
 }
 
diff --git a/platform/iphone/ios.mm b/platform/iphone/ios.mm
index cef03534c4..3430a9cba7 100644
--- a/platform/iphone/ios.mm
+++ b/platform/iphone/ios.mm
@@ -56,16 +56,16 @@ void iOS::alert(const char *p_alert, const char *p_title) {
 String iOS::get_model() const {
 	// [[UIDevice currentDevice] model] only returns "iPad" or "iPhone".
 	size_t size;
-	sysctlbyname("hw.machine", NULL, &size, NULL, 0);
+	sysctlbyname("hw.machine", nullptr, &size, nullptr, 0);
 	char *model = (char *)malloc(size);
-	if (model == NULL) {
+	if (model == nullptr) {
 		return "";
 	}
-	sysctlbyname("hw.machine", model, &size, NULL, 0);
+	sysctlbyname("hw.machine", model, &size, nullptr, 0);
 	NSString *platform = [NSString stringWithCString:model encoding:NSUTF8StringEncoding];
 	free(model);
 	const char *str = [platform UTF8String];
-	return String(str != NULL ? str : "");
+	return String(str != nullptr ? str : "");
 }
 
 String iOS::get_rate_url(int p_app_id) const {
diff --git a/platform/iphone/keyboard_input_view.mm b/platform/iphone/keyboard_input_view.mm
index 1408f78e90..0e5a98a3e6 100644
--- a/platform/iphone/keyboard_input_view.mm
+++ b/platform/iphone/keyboard_input_view.mm
@@ -88,13 +88,15 @@
 	self.text = existingString;
 	self.previousText = existingString;
 
+	NSInteger safeStartIndex = MAX(start, 0);
+
 	NSRange textRange;
 
 	// Either a simple cursor or a selection.
 	if (end > 0) {
-		textRange = NSMakeRange(start, end - start);
+		textRange = NSMakeRange(safeStartIndex, end - start);
 	} else {
-		textRange = NSMakeRange(start, 0);
+		textRange = NSMakeRange(safeStartIndex, 0);
 	}
 
 	self.selectedRange = textRange;
diff --git a/platform/iphone/plugin/godot_plugin_config.h b/platform/iphone/plugin/godot_plugin_config.h
index f4e30c8349..e2546e733c 100644
--- a/platform/iphone/plugin/godot_plugin_config.h
+++ b/platform/iphone/plugin/godot_plugin_config.h
@@ -218,8 +218,9 @@ static inline uint64_t get_plugin_modification_time(const PluginConfigIOS &plugi
 	} else {
 		String file_path = plugin_config.binary.get_base_dir();
 		String file_name = plugin_config.binary.get_basename().get_file();
-		String release_file_name = file_path.plus_file(file_name + ".release.a");
-		String debug_file_name = file_path.plus_file(file_name + ".debug.a");
+		String plugin_extension = plugin_config.binary.get_extension();
+		String release_file_name = file_path.plus_file(file_name + ".release." + plugin_extension);
+		String debug_file_name = file_path.plus_file(file_name + ".debug." + plugin_extension);
 
 		last_updated = MAX(last_updated, FileAccess::get_modified_time(release_file_name));
 		last_updated = MAX(last_updated, FileAccess::get_modified_time(debug_file_name));
diff --git a/platform/iphone/vulkan_context_iphone.mm b/platform/iphone/vulkan_context_iphone.mm
index b980ae99f0..08c9007fbb 100644
--- a/platform/iphone/vulkan_context_iphone.mm
+++ b/platform/iphone/vulkan_context_iphone.mm
@@ -38,13 +38,13 @@ const char *VulkanContextIPhone::_get_platform_surface_extension() const {
 Error VulkanContextIPhone::window_create(DisplayServer::WindowID p_window_id, CALayer *p_metal_layer, int p_width, int p_height) {
 	VkIOSSurfaceCreateInfoMVK createInfo;
 	createInfo.sType = VK_STRUCTURE_TYPE_IOS_SURFACE_CREATE_INFO_MVK;
-	createInfo.pNext = NULL;
+	createInfo.pNext = nullptr;
 	createInfo.flags = 0;
 	createInfo.pView = (__bridge const void *)p_metal_layer;
 
 	VkSurfaceKHR surface;
 	VkResult err =
-			vkCreateIOSSurfaceMVK(_get_instance(), &createInfo, NULL, &surface);
+			vkCreateIOSSurfaceMVK(_get_instance(), &createInfo, nullptr, &surface);
 	ERR_FAIL_COND_V(err, ERR_CANT_CREATE);
 
 	return _window_create(p_window_id, surface, p_width, p_height);
diff --git a/platform/javascript/api/javascript_tools_editor_plugin.cpp b/platform/javascript/api/javascript_tools_editor_plugin.cpp
index 8355faccc2..7a2c2b2335 100644
--- a/platform/javascript/api/javascript_tools_editor_plugin.cpp
+++ b/platform/javascript/api/javascript_tools_editor_plugin.cpp
@@ -65,10 +65,10 @@ void JavaScriptToolsEditorPlugin::_download_zip(Variant p_v) {
 
 	FileAccess *src_f;
 	zlib_filefunc_def io = zipio_create_io_from_file(&src_f);
-	zipFile zip = zipOpen2("/tmp/project.zip", APPEND_STATUS_CREATE, NULL, &io);
+	zipFile zip = zipOpen2("/tmp/project.zip", APPEND_STATUS_CREATE, nullptr, &io);
 	String base_path = resource_path.substr(0, resource_path.rfind("/")) + "/";
 	_zip_recursive(resource_path, base_path, zip);
-	zipClose(zip, NULL);
+	zipClose(zip, nullptr);
 	godot_js_editor_download_file("/tmp/project.zip", "project.zip", "application/zip");
 }
 
@@ -88,12 +88,12 @@ void JavaScriptToolsEditorPlugin::_zip_file(String p_path, String p_base_path, z
 	String path = p_path.replace_first(p_base_path, "");
 	zipOpenNewFileInZip(p_zip,
 			path.utf8().get_data(),
-			NULL,
-			NULL,
+			nullptr,
+			nullptr,
 			0,
-			NULL,
+			nullptr,
 			0,
-			NULL,
+			nullptr,
 			Z_DEFLATED,
 			Z_DEFAULT_COMPRESSION);
 	zipWriteInFileInZip(p_zip, data.ptr(), data.size());
@@ -116,12 +116,12 @@ void JavaScriptToolsEditorPlugin::_zip_recursive(String p_path, String p_base_pa
 			String path = cs.replace_first(p_base_path, "") + "/";
 			zipOpenNewFileInZip(p_zip,
 					path.utf8().get_data(),
-					NULL,
-					NULL,
+					nullptr,
+					nullptr,
 					0,
-					NULL,
+					nullptr,
 					0,
-					NULL,
+					nullptr,
 					Z_DEFLATED,
 					Z_DEFAULT_COMPRESSION);
 			zipCloseFileInZip(p_zip);
diff --git a/platform/javascript/detect.py b/platform/javascript/detect.py
index ac8d8de7e0..d01e8a8bd4 100644
--- a/platform/javascript/detect.py
+++ b/platform/javascript/detect.py
@@ -95,8 +95,9 @@ def configure(env):
         if env["initial_memory"] < 64:
             print("Editor build requires at least 64MiB of initial memory. Forcing it.")
             env["initial_memory"] = 64
-    elif env["builtin_icu"]:
         env.Append(CCFLAGS=["-frtti"])
+    elif env["builtin_icu"]:
+        env.Append(CCFLAGS=["-fno-exceptions", "-frtti"])
     else:
         # Disable exceptions and rtti on non-tools (template) builds
         # These flags help keep the file size down.
@@ -174,7 +175,7 @@ def configure(env):
     # Program() output consists of multiple files, so specify suffixes manually at builder.
     env["PROGSUFFIX"] = ""
     env["LIBPREFIX"] = "lib"
-    env["LIBSUFFIX"] = ".bc"
+    env["LIBSUFFIX"] = ".a"
     env["LIBPREFIXES"] = ["$LIBPREFIX"]
     env["LIBSUFFIXES"] = ["$LIBSUFFIX"]
 
@@ -228,8 +229,8 @@ def configure(env):
     # Allow use to take control of swapping WebGL buffers.
     env.Append(LINKFLAGS=["-s", "OFFSCREEN_FRAMEBUFFER=1"])
 
-    # callMain for manual start.
-    env.Append(LINKFLAGS=["-s", "EXTRA_EXPORTED_RUNTIME_METHODS=['callMain','cwrap']"])
+    # callMain for manual start, cwrap for the mono version.
+    env.Append(LINKFLAGS=["-s", "EXPORTED_RUNTIME_METHODS=['callMain','cwrap']"])
 
     # Add code that allow exiting runtime.
     env.Append(LINKFLAGS=["-s", "EXIT_RUNTIME=1"])
@@ -238,6 +239,6 @@ def configure(env):
     env.Append(
         LINKFLAGS=[
             "-s",
-            "EXPORTED_FUNCTIONS=['_main', '_emscripten_webgl_get_current_context', '_emscripten_webgl_commit_frame', '_emscripten_webgl_create_context']",
+            "EXPORTED_FUNCTIONS=['_main', '_emscripten_webgl_get_current_context']",
         ]
     )
diff --git a/platform/javascript/display_server_javascript.cpp b/platform/javascript/display_server_javascript.cpp
index fa6f5c1e9e..0031650360 100644
--- a/platform/javascript/display_server_javascript.cpp
+++ b/platform/javascript/display_server_javascript.cpp
@@ -399,7 +399,7 @@ void DisplayServerJavaScript::cursor_set_custom_image(const RES &p_cursor, Curso
 		godot_js_display_cursor_set_custom_shape(godot2dom_cursor(p_shape), png.ptr(), len, p_hotspot.x, p_hotspot.y);
 
 	} else {
-		godot_js_display_cursor_set_custom_shape(godot2dom_cursor(p_shape), NULL, 0, 0, 0);
+		godot_js_display_cursor_set_custom_shape(godot2dom_cursor(p_shape), nullptr, 0, 0, 0);
 	}
 
 	cursor_set_shape(cursor_shape);
@@ -771,8 +771,8 @@ DisplayServerJavaScript::DisplayServerJavaScript(const String &p_rendering_drive
 #define SET_EM_CALLBACK(target, ev, cb)                                  \
 	result = emscripten_set_##ev##_callback(target, nullptr, true, &cb); \
 	EM_CHECK(ev)
-#define SET_EM_WINDOW_CALLBACK(ev, cb)                                                         \
-	result = emscripten_set_##ev##_callback(EMSCRIPTEN_EVENT_TARGET_WINDOW, NULL, false, &cb); \
+#define SET_EM_WINDOW_CALLBACK(ev, cb)                                                            \
+	result = emscripten_set_##ev##_callback(EMSCRIPTEN_EVENT_TARGET_WINDOW, nullptr, false, &cb); \
 	EM_CHECK(ev)
 	// These callbacks from Emscripten's html5.h suffice to access most
 	// JavaScript APIs.
diff --git a/platform/javascript/emscripten_helpers.py b/platform/javascript/emscripten_helpers.py
index b3b15a1574..ab98838e20 100644
--- a/platform/javascript/emscripten_helpers.py
+++ b/platform/javascript/emscripten_helpers.py
@@ -1,4 +1,4 @@
-import os
+import os, json
 
 from SCons.Util import WhereIs
 
@@ -59,7 +59,23 @@ def create_template_zip(env, js, wasm, extra):
     if env["tools"]:
         # HTML
         html = "#misc/dist/html/editor.html"
-        subst_dict = {"@GODOT_VERSION@": get_build_version(), "@GODOT_NAME@": "GodotEngine"}
+        cache = [
+            "godot.tools.html",
+            "offline.html",
+            "godot.tools.js",
+            "godot.tools.worker.js",
+            "godot.tools.audio.worklet.js",
+            "logo.svg",
+            "favicon.png",
+        ]
+        opt_cache = ["godot.tools.wasm"]
+        subst_dict = {
+            "@GODOT_VERSION@": get_build_version(),
+            "@GODOT_NAME@": "GodotEngine",
+            "@GODOT_CACHE@": json.dumps(cache),
+            "@GODOT_OPT_CACHE@": json.dumps(opt_cache),
+            "@GODOT_OFFLINE_PAGE@": "offline.html",
+        }
         html = env.Substfile(target="#bin/godot${PROGSUFFIX}.html", source=html, SUBST_DICT=subst_dict)
         in_files.append(html)
         out_files.append(zip_dir.File(binary_name + ".html"))
@@ -82,6 +98,10 @@ def create_template_zip(env, js, wasm, extra):
         # HTML
         in_files.append("#misc/dist/html/full-size.html")
         out_files.append(zip_dir.File(binary_name + ".html"))
+        in_files.append(service_worker)
+        out_files.append(zip_dir.File(binary_name + ".service.worker.js"))
+        in_files.append("#misc/dist/html/offline-export.html")
+        out_files.append(zip_dir.File("godot.offline.html"))
 
     zip_files = env.InstallAs(out_files, in_files)
     env.Zip(
diff --git a/platform/javascript/export/export.cpp b/platform/javascript/export/export.cpp
index 3f04bedea2..14e279b45b 100644
--- a/platform/javascript/export/export.cpp
+++ b/platform/javascript/export/export.cpp
@@ -28,7 +28,9 @@
 /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
 /*************************************************************************/
 
+#include "core/io/image_loader.h"
 #include "core/io/json.h"
+#include "core/io/stream_peer_ssl.h"
 #include "core/io/tcp_server.h"
 #include "core/io/zip_io.h"
 #include "editor/editor_export.h"
@@ -40,20 +42,55 @@
 class EditorHTTPServer : public Reference {
 private:
 	Ref<TCP_Server> server;
-	Ref<StreamPeerTCP> connection;
+	Map<String, String> mimes;
+	Ref<StreamPeerTCP> tcp;
+	Ref<StreamPeerSSL> ssl;
+	Ref<StreamPeer> peer;
+	Ref<CryptoKey> key;
+	Ref<X509Certificate> cert;
+	bool use_ssl = false;
 	uint64_t time = 0;
 	uint8_t req_buf[4096];
 	int req_pos = 0;
 
 	void _clear_client() {
-		connection = Ref<StreamPeerTCP>();
+		peer = Ref<StreamPeer>();
+		ssl = Ref<StreamPeerSSL>();
+		tcp = Ref<StreamPeerTCP>();
 		memset(req_buf, 0, sizeof(req_buf));
 		time = 0;
 		req_pos = 0;
 	}
 
+	void _set_internal_certs(Ref<Crypto> p_crypto) { // Fills `key`/`cert` from the files cached in the editor cache dir, generating and saving a new pair when needed.
+		const String cache_path = EditorSettings::get_singleton()->get_cache_dir();
+		const String key_path = cache_path.plus_file("html5_server.key");
+		const String crt_path = cache_path.plus_file("html5_server.crt");
+		bool regen = !FileAccess::exists(key_path) || !FileAccess::exists(crt_path); // Regenerate when either cached file is missing.
+		if (!regen) {
+			key = Ref<CryptoKey>(CryptoKey::create());
+			cert = Ref<X509Certificate>(X509Certificate::create());
+			if (key->load(key_path) != OK || cert->load(crt_path) != OK) {
+				regen = true; // Cached files exist but could not be loaded; fall back to a fresh pair.
+			}
+		}
+		if (regen) {
+			key = p_crypto->generate_rsa(2048);
+			key->save(key_path); // NOTE(review): the results of both save() calls are ignored.
+			cert = p_crypto->generate_self_signed_certificate(key, "CN=godot-debug.local,O=A Game Dev,C=XXA", "20140101000000", "20340101000000"); // Presumably the last two arguments are the validity window -- confirm against the Crypto API.
+			cert->save(crt_path);
+		}
+	}
+
 public:
 	EditorHTTPServer() {
+		mimes["html"] = "text/html"; // Extension -> Content-Type table; requests whose extension is not listed are answered with 404.
+		mimes["js"] = "application/javascript";
+		mimes["json"] = "application/json";
+		mimes["pck"] = "application/octet-stream";
+		mimes["png"] = "image/png";
+		mimes["svg"] = "image/svg+xml"; // IANA-registered media type for SVG ("image/svg" is not registered).
+		mimes["wasm"] = "application/wasm";
 		server.instance();
 		stop();
 	}
@@ -63,7 +100,24 @@ public:
 		_clear_client();
 	}
 
-	Error listen(int p_port, IP_Address p_address) {
+	Error listen(int p_port, IP_Address p_address, bool p_use_ssl, String p_ssl_key, String p_ssl_cert) { // Prepares the key/certificate when p_use_ssl is set, then starts listening on p_address:p_port.
+		use_ssl = p_use_ssl;
+		if (use_ssl) {
+			Ref<Crypto> crypto = Crypto::create();
+			if (crypto.is_null()) {
+				return ERR_UNAVAILABLE; // No Crypto instance could be created.
+			}
+			if (!p_ssl_key.is_empty() && !p_ssl_cert.is_empty()) { // Both paths given: load the caller-provided key and certificate.
+				key = Ref<CryptoKey>(CryptoKey::create());
+				Error err = key->load(p_ssl_key);
+				ERR_FAIL_COND_V(err != OK, err);
+				cert = Ref<X509Certificate>(X509Certificate::create());
+				err = cert->load(p_ssl_cert);
+				ERR_FAIL_COND_V(err != OK, err);
+			} else { // At least one path is empty: use the cached or newly generated internal pair.
+				_set_internal_certs(crypto);
+			}
+		}
 		return server->listen(p_port, p_address);
 	}
 
@@ -82,51 +136,21 @@ public:
 		// Wrong protocol
 		ERR_FAIL_COND_MSG(req[0] != "GET" || req[2] != "HTTP/1.1", "Invalid method or HTTP version.");
 
-		const String cache_path = EditorSettings::get_singleton()->get_cache_dir();
-		const String basereq = "/tmp_js_export";
-		String filepath;
-		String ctype;
-		if (req[1] == basereq + ".html") {
-			filepath = cache_path.plus_file(req[1].get_file());
-			ctype = "text/html";
-		} else if (req[1] == basereq + ".js") {
-			filepath = cache_path.plus_file(req[1].get_file());
-			ctype = "application/javascript";
-		} else if (req[1] == basereq + ".audio.worklet.js") {
-			filepath = cache_path.plus_file(req[1].get_file());
-			ctype = "application/javascript";
-		} else if (req[1] == basereq + ".worker.js") {
-			filepath = cache_path.plus_file(req[1].get_file());
-			ctype = "application/javascript";
-		} else if (req[1] == basereq + ".pck") {
-			filepath = cache_path.plus_file(req[1].get_file());
-			ctype = "application/octet-stream";
-		} else if (req[1] == basereq + ".png" || req[1] == "/favicon.png") {
-			// Also allow serving the generated favicon for a smoother loading experience.
-			if (req[1] == "/favicon.png") {
-				filepath = EditorSettings::get_singleton()->get_cache_dir().plus_file("favicon.png");
-			} else {
-				filepath = basereq + ".png";
-			}
-			ctype = "image/png";
-		} else if (req[1] == basereq + ".side.wasm") {
-			filepath = cache_path.plus_file(req[1].get_file());
-			ctype = "application/wasm";
-		} else if (req[1] == basereq + ".wasm") {
-			filepath = cache_path.plus_file(req[1].get_file());
-			ctype = "application/wasm";
-		} else if (req[1].ends_with(".wasm")) {
-			filepath = cache_path.plus_file(req[1].get_file()); // TODO dangerous?
-			ctype = "application/wasm";
-		}
-		if (filepath.is_empty() || !FileAccess::exists(filepath)) {
+		const String req_file = req[1].get_file();
+		const String req_ext = req[1].get_extension();
+		const String cache_path = EditorSettings::get_singleton()->get_cache_dir().plus_file("web");
+		const String filepath = cache_path.plus_file(req_file);
+
+		if (!mimes.has(req_ext) || !FileAccess::exists(filepath)) {
 			String s = "HTTP/1.1 404 Not Found\r\n";
 			s += "Connection: Close\r\n";
 			s += "\r\n";
 			CharString cs = s.utf8();
-			connection->put_data((const uint8_t *)cs.get_data(), cs.size() - 1);
+			peer->put_data((const uint8_t *)cs.get_data(), cs.size() - 1);
 			return;
 		}
+		const String ctype = mimes[req_ext];
+
 		FileAccess *f = FileAccess::open(filepath, FileAccess::READ);
 		ERR_FAIL_COND(!f);
 		String s = "HTTP/1.1 200 OK\r\n";
@@ -138,7 +162,7 @@ public:
 		s += "Cache-Control: no-store, max-age=0\r\n";
 		s += "\r\n";
 		CharString cs = s.utf8();
-		Error err = connection->put_data((const uint8_t *)cs.get_data(), cs.size() - 1);
+		Error err = peer->put_data((const uint8_t *)cs.get_data(), cs.size() - 1);
 		if (err != OK) {
 			memdelete(f);
 			ERR_FAIL();
@@ -150,7 +174,7 @@ public:
 			if (read < 1) {
 				break;
 			}
-			err = connection->put_data(bytes, read);
+			err = peer->put_data(bytes, read);
 			if (err != OK) {
 				memdelete(f);
 				ERR_FAIL();
@@ -163,21 +187,43 @@ public:
 		if (!server->is_listening()) {
 			return;
 		}
-		if (connection.is_null()) {
+		if (tcp.is_null()) {
 			if (!server->is_connection_available()) {
 				return;
 			}
-			connection = server->take_connection();
+			tcp = server->take_connection();
+			peer = tcp;
 			time = OS::get_singleton()->get_ticks_usec();
 		}
 		if (OS::get_singleton()->get_ticks_usec() - time > 1000000) {
 			_clear_client();
 			return;
 		}
-		if (connection->get_status() != StreamPeerTCP::STATUS_CONNECTED) {
+		if (tcp->get_status() != StreamPeerTCP::STATUS_CONNECTED) {
 			return;
 		}
 
+		if (use_ssl) {
+			if (ssl.is_null()) {
+				ssl = Ref<StreamPeerSSL>(StreamPeerSSL::create());
+				peer = ssl;
+				ssl->set_blocking_handshake_enabled(false);
+				if (ssl->accept_stream(tcp, key, cert) != OK) {
+					_clear_client();
+					return;
+				}
+			}
+			ssl->poll();
+			if (ssl->get_status() == StreamPeerSSL::STATUS_HANDSHAKING) {
+				// Still handshaking, keep waiting.
+				return;
+			}
+			if (ssl->get_status() != StreamPeerSSL::STATUS_CONNECTED) {
+				_clear_client();
+				return;
+			}
+		}
+
 		while (true) {
 			char *r = (char *)req_buf;
 			int l = req_pos - 1;
@@ -189,7 +235,7 @@ public:
 
 			int read = 0;
 			ERR_FAIL_COND(req_pos >= 4096);
-			Error err = connection->get_partial_data(&req_buf[req_pos], 1, read);
+			Error err = peer->get_partial_data(&req_buf[req_pos], 1, read);
 			if (err != OK) {
 				// Got an error
 				_clear_client();
@@ -242,7 +288,32 @@ class EditorExportPlatformJavaScript : public EditorExportPlatform {
 		return name;
 	}
 
+	Ref<Image> _get_project_icon() const { // Returns the image set in "application/config/icon", or the editor theme's "DefaultProjectIcon" image as a fallback.
+		Ref<Image> icon;
+		icon.instance();
+		const String icon_path = String(GLOBAL_GET("application/config/icon")).strip_edges();
+		if (icon_path.is_empty() || ImageLoader::load_image(icon_path, icon) != OK) { // Setting is empty or the image failed to load.
+			return EditorNode::get_singleton()->get_editor_theme()->get_icon("DefaultProjectIcon", "EditorIcons")->get_image();
+		}
+		return icon;
+	}
+
+	Ref<Image> _get_project_splash() const { // Returns the image set in "application/boot_splash/image", or an Image built from boot_splash_png as a fallback.
+		Ref<Image> splash;
+		splash.instance();
+		const String splash_path = String(GLOBAL_GET("application/boot_splash/image")).strip_edges();
+		if (splash_path.is_empty() || ImageLoader::load_image(splash_path, splash) != OK) { // Setting is empty or the image failed to load.
+			return Ref<Image>(memnew(Image(boot_splash_png)));
+		}
+		return splash;
+	}
+
+	Error _extract_template(const String &p_template, const String &p_dir, const String &p_name, bool pwa);
+	void _replace_strings(Map<String, String> p_replaces, Vector<uint8_t> &r_template);
 	void _fix_html(Vector<uint8_t> &p_html, const Ref<EditorExportPreset> &p_preset, const String &p_name, bool p_debug, int p_flags, const Vector<SharedObject> p_shared_objects, const Dictionary &p_file_sizes);
+	Error _add_manifest_icon(const String &p_path, const String &p_icon, int p_size, Array &r_arr);
+	Error _build_pwa(const Ref<EditorExportPreset> &p_preset, const String p_path, const Vector<SharedObject> &p_shared_objects);
+	Error _write_or_error(const uint8_t *p_content, int p_len, String p_path);
 
 	static void _server_thread_poll(void *data);
 
@@ -281,10 +352,90 @@ public:
 	~EditorExportPlatformJavaScript();
 };
 
-void EditorExportPlatformJavaScript::_fix_html(Vector<uint8_t> &p_html, const Ref<EditorExportPreset> &p_preset, const String &p_name, bool p_debug, int p_flags, const Vector<SharedObject> p_shared_objects, const Dictionary &p_file_sizes) {
-	String str_template = String::utf8(reinterpret_cast<const char *>(p_html.ptr()), p_html.size());
-	String str_export;
+Error EditorExportPlatformJavaScript::_extract_template(const String &p_template, const String &p_dir, const String &p_name, bool pwa) { // Unpacks the entries of the zip at p_template into p_dir; returns OK or the first file error.
+	FileAccess *src_f = nullptr;
+	zlib_filefunc_def io = zipio_create_io_from_file(&src_f);
+	unzFile pkg = unzOpen2(p_template.utf8().get_data(), &io);
+
+	if (!pkg) {
+		EditorNode::get_singleton()->show_warning(TTR("Could not open template for export:") + "\n" + p_template);
+		return ERR_FILE_NOT_FOUND;
+	}
+
+	if (unzGoToFirstFile(pkg) != UNZ_OK) {
+		EditorNode::get_singleton()->show_warning(TTR("Invalid export template:") + "\n" + p_template);
+		unzClose(pkg);
+		return ERR_FILE_CORRUPT;
+	}
+
+	do {
+		// Get the current entry's file name and size information.
+		unz_file_info info;
+		char fname[16384];
+		unzGetCurrentFileInfo(pkg, &info, fname, 16384, nullptr, 0, nullptr, 0); // NOTE(review): return value is not checked.
+
+		String file = fname;
+
+		// Skip service worker and offline page if not exporting pwa.
+		if (!pwa && (file == "godot.service.worker.js" || file == "godot.offline.html")) {
+			continue; // In a do-while, `continue` jumps to the loop condition, so the next entry is still selected.
+		}
+		Vector<uint8_t> data;
+		data.resize(info.uncompressed_size);
+
+		// Read the whole entry into memory. NOTE(review): the unz* return values are not checked here.
+		unzOpenCurrentFile(pkg);
+		unzReadCurrentFile(pkg, data.ptrw(), data.size());
+		unzCloseCurrentFile(pkg);
+
+		// Write it into p_dir; every "godot" in the entry name is replaced with p_name.
+		String dst = p_dir.plus_file(file.replace("godot", p_name));
+		FileAccess *f = FileAccess::open(dst, FileAccess::WRITE);
+		if (!f) {
+			EditorNode::get_singleton()->show_warning(TTR("Could not write file:") + "\n" + dst);
+			unzClose(pkg);
+			return ERR_FILE_CANT_WRITE;
+		}
+		f->store_buffer(data.ptr(), data.size());
+		memdelete(f);
+
+	} while (unzGoToNextFile(pkg) == UNZ_OK);
+	unzClose(pkg);
+	return OK;
+}
+
+Error EditorExportPlatformJavaScript::_write_or_error(const uint8_t *p_content, int p_size, String p_path) {
+	FileAccess *f = FileAccess::open(p_path, FileAccess::WRITE);
+	if (!f) {
+		EditorNode::get_singleton()->show_warning(TTR("Could not write file:") + "\n" + p_path);
+		return ERR_FILE_CANT_WRITE;
+	}
+	f->store_buffer(p_content, p_size);
+	memdelete(f);
+	return OK;
+}
+
+void EditorExportPlatformJavaScript::_replace_strings(Map<String, String> p_replaces, Vector<uint8_t> &r_template) { // Applies every key -> value substitution in p_replaces to the UTF-8 text held in r_template, in place.
+	String str_template = String::utf8(reinterpret_cast<const char *>(r_template.ptr()), r_template.size());
+	String out;
 	Vector<String> lines = str_template.split("\n");
+	for (int i = 0; i < lines.size(); i++) {
+		String current_line = lines[i];
+		for (Map<String, String>::Element *E = p_replaces.front(); E; E = E->next()) { // Replacements are applied one after another, in map iteration order.
+			current_line = current_line.replace(E->key(), E->get());
+		}
+		out += current_line + "\n"; // Every line, including the last one, gets a trailing "\n".
+	}
+	CharString cs = out.utf8();
+	r_template.resize(cs.length()); // The buffer is resized to the new UTF-8 length and overwritten byte by byte.
+	for (int i = 0; i < cs.length(); i++) {
+		r_template.write[i] = cs[i];
+	}
+}
+
+void EditorExportPlatformJavaScript::_fix_html(Vector<uint8_t> &p_html, const Ref<EditorExportPreset> &p_preset, const String &p_name, bool p_debug, int p_flags, const Vector<SharedObject> p_shared_objects, const Dictionary &p_file_sizes) {
+	// Engine.js config
+	Dictionary config;
 	Array libs;
 	for (int i = 0; i < p_shared_objects.size(); i++) {
 		libs.push_back(p_shared_objects[i].path.get_file());
@@ -295,29 +446,172 @@ void EditorExportPlatformJavaScript::_fix_html(Vector<uint8_t> &p_html, const Re
 	for (int i = 0; i < flags.size(); i++) {
 		args.push_back(flags[i]);
 	}
-	Dictionary config;
 	config["canvasResizePolicy"] = p_preset->get("html/canvas_resize_policy");
 	config["experimentalVK"] = p_preset->get("html/experimental_virtual_keyboard");
 	config["gdnativeLibs"] = libs;
 	config["executable"] = p_name;
 	config["args"] = args;
 	config["fileSizes"] = p_file_sizes;
+
+	String head_include;
+	if (p_preset->get("html/export_icon")) {
+		head_include += "<link id='-gd-engine-icon' rel='icon' type='image/png' href='" + p_name + ".icon.png' />\n";
+		head_include += "<link rel='apple-touch-icon' href='" + p_name + ".apple-touch-icon.png'/>\n";
+	}
+	if (p_preset->get("progressive_web_app/enabled")) {
+		head_include += "<link rel='manifest' href='" + p_name + ".manifest.json'>\n";
+		head_include += "<script type='application/javascript'>window.addEventListener('load', () => {if ('serviceWorker' in navigator) {navigator.serviceWorker.register('" +
+						p_name + ".service.worker.js');}});</script>\n";
+	}
+
+	// Replaces HTML string
 	const String str_config = JSON::print(config);
+	const String custom_head_include = p_preset->get("html/head_include");
+	Map<String, String> replaces;
+	replaces["$GODOT_URL"] = p_name + ".js";
+	replaces["$GODOT_PROJECT_NAME"] = ProjectSettings::get_singleton()->get_setting("application/config/name");
+	replaces["$GODOT_HEAD_INCLUDE"] = head_include + custom_head_include;
+	replaces["$GODOT_CONFIG"] = str_config;
+	_replace_strings(replaces, p_html);
+}
 
-	for (int i = 0; i < lines.size(); i++) {
-		String current_line = lines[i];
-		current_line = current_line.replace("$GODOT_URL", p_name + ".js");
-		current_line = current_line.replace("$GODOT_PROJECT_NAME", ProjectSettings::get_singleton()->get_setting("application/config/name"));
-		current_line = current_line.replace("$GODOT_HEAD_INCLUDE", p_preset->get("html/head_include"));
-		current_line = current_line.replace("$GODOT_CONFIG", str_config);
-		str_export += current_line + "\n";
+Error EditorExportPlatformJavaScript::_add_manifest_icon(const String &p_path, const String &p_icon, int p_size, Array &r_arr) { // Saves a p_size x p_size PNG icon next to p_path and appends its manifest entry to r_arr.
+	const String name = p_path.get_file().get_basename();
+	const String icon_name = vformat("%s.%dx%d.png", name, p_size, p_size);
+	const String icon_dest = p_path.get_base_dir().plus_file(icon_name);
+
+	Ref<Image> icon;
+	if (!p_icon.is_empty()) { // An explicit icon path was given: load it and resize only if its dimensions differ.
+		icon.instance();
+		const Error err = ImageLoader::load_image(p_icon, icon);
+		if (err != OK) {
+			EditorNode::get_singleton()->show_warning(TTR("Could not read file:") + "\n" + p_icon);
+			return err;
+		}
+		if (icon->get_width() != p_size || icon->get_height() != p_size) {
+			icon->resize(p_size, p_size);
+		}
+	} else { // No path given: fall back to the project icon, resized unconditionally.
+		icon = _get_project_icon();
+		icon->resize(p_size, p_size);
 	}
+	const Error err = icon->save_png(icon_dest);
+	if (err != OK) {
+		EditorNode::get_singleton()->show_warning(TTR("Could not write file:") + "\n" + icon_dest);
+		return err;
+	}
+	Dictionary icon_dict;
+	icon_dict["sizes"] = vformat("%dx%d", p_size, p_size);
+	icon_dict["type"] = "image/png";
+	icon_dict["src"] = icon_name; // File name only, without the directory part of icon_dest.
+	r_arr.push_back(icon_dict);
+	return err; // Always OK at this point; failures returned earlier.
+}
 
-	CharString cs = str_export.utf8();
-	p_html.resize(cs.length());
-	for (int i = 0; i < cs.length(); i++) {
-		p_html.write[i] = cs[i];
+Error EditorExportPlatformJavaScript::_build_pwa(const Ref<EditorExportPreset> &p_preset, const String p_path, const Vector<SharedObject> &p_shared_objects) {
+	// Builds the PWA files next to the export at p_path: service worker, optional custom offline page, manifest icons and manifest.
+	const String dir = p_path.get_base_dir();
+	const String name = p_path.get_file().get_basename();
+	const ExportMode mode = (ExportMode)(int)p_preset->get("variant/export_type");
+	Map<String, String> replaces;
+	replaces["@GODOT_VERSION@"] = "1";
+	replaces["@GODOT_NAME@"] = name;
+	replaces["@GODOT_OFFLINE_PAGE@"] = name + ".offline.html";
+	Array files;
+	replaces["@GODOT_OPT_CACHE@"] = JSON::print(files); // `files` is still empty here, so this placeholder receives an empty array.
+	files.push_back(name + ".html");
+	files.push_back(name + ".js");
+	files.push_back(name + ".wasm");
+	files.push_back(name + ".pck");
+	files.push_back(name + ".offline.html");
+	if (p_preset->get("html/export_icon")) {
+		files.push_back(name + ".icon.png");
+		files.push_back(name + ".apple-touch-icon.png");
+	}
+	if (mode == EXPORT_MODE_THREADS) {
+		files.push_back(name + ".worker.js");
+		files.push_back(name + ".audio.worklet.js");
+	} else if (mode == EXPORT_MODE_GDNATIVE) {
+		files.push_back(name + ".side.wasm");
+		for (int i = 0; i < p_shared_objects.size(); i++) {
+			files.push_back(p_shared_objects[i].path.get_file());
+		}
+	}
+	replaces["@GODOT_CACHE@"] = JSON::print(files);
+
+	const String sw_path = dir.plus_file(name + ".service.worker.js");
+	Vector<uint8_t> sw;
+	{
+		FileAccess *f = FileAccess::open(sw_path, FileAccess::READ);
+		if (!f) {
+			EditorNode::get_singleton()->show_warning(TTR("Could not read file:") + "\n" + sw_path);
+			return ERR_FILE_CANT_READ;
+		}
+		sw.resize(f->get_len());
+		f->get_buffer(sw.ptrw(), sw.size());
+		memdelete(f);
+		f = nullptr;
 	}
+	_replace_strings(replaces, sw);
+	Error err = _write_or_error(sw.ptr(), sw.size(), sw_path); // Overwrite the service worker in place with the substituted text.
+	if (err != OK) {
+		return err;
+	}
+
+	// Custom offline page: when set, it is copied to "<name>.offline.html" in the export directory.
+	const String offline_page = p_preset->get("progressive_web_app/offline_page");
+	if (!offline_page.is_empty()) {
+		DirAccessRef da = DirAccess::create(DirAccess::ACCESS_FILESYSTEM); // Scoped wrapper frees the DirAccess on every exit path.
+		const String offline_dest = dir.plus_file(name + ".offline.html");
+		err = da->copy(ProjectSettings::get_singleton()->globalize_path(offline_page), offline_dest);
+		if (err != OK) {
+			EditorNode::get_singleton()->show_warning(TTR("Could not read file:") + "\n" + offline_dest);
+			return err;
+		}
+	}
+
+	// Manifest. The preset values index the arrays below, so they are clamped to the last valid index.
+	const char *modes[4] = { "fullscreen", "standalone", "minimal-ui", "browser" };
+	const char *orientations[3] = { "any", "landscape", "portrait" };
+	const int display = CLAMP(int(p_preset->get("progressive_web_app/display")), 0, 3);
+	const int orientation = CLAMP(int(p_preset->get("progressive_web_app/orientation")), 0, 2);
+
+	Dictionary manifest;
+	String proj_name = ProjectSettings::get_singleton()->get_setting("application/config/name");
+	if (proj_name.is_empty()) {
+		proj_name = "Godot Game";
+	}
+	manifest["name"] = proj_name;
+	manifest["start_url"] = "./" + name + ".html";
+	manifest["display"] = String::utf8(modes[display]);
+	manifest["orientation"] = String::utf8(orientations[orientation]);
+	manifest["background_color"] = "#" + p_preset->get("progressive_web_app/background_color").operator Color().to_html(false);
+
+	Array icons_arr;
+	const String icon144_path = p_preset->get("progressive_web_app/icon_144x144");
+	err = _add_manifest_icon(p_path, icon144_path, 144, icons_arr);
+	if (err != OK) {
+		return err;
+	}
+	const String icon180_path = p_preset->get("progressive_web_app/icon_180x180");
+	err = _add_manifest_icon(p_path, icon180_path, 180, icons_arr);
+	if (err != OK) {
+		return err;
+	}
+	const String icon512_path = p_preset->get("progressive_web_app/icon_512x512");
+	err = _add_manifest_icon(p_path, icon512_path, 512, icons_arr);
+	if (err != OK) {
+		return err;
+	}
+	manifest["icons"] = icons_arr;
+
+	CharString cs = JSON::print(manifest).utf8();
+	err = _write_or_error((const uint8_t *)cs.get_data(), cs.length(), dir.plus_file(name + ".manifest.json"));
+	if (err != OK) {
+		return err;
+	}
+
+	return OK;
 }
 
 void EditorExportPlatformJavaScript::get_preset_features(const Ref<EditorExportPreset> &p_preset, List<String> *r_features) {
@@ -350,10 +644,19 @@ void EditorExportPlatformJavaScript::get_export_options(List<ExportOption> *r_op
 	r_options->push_back(ExportOption(PropertyInfo(Variant::BOOL, "vram_texture_compression/for_desktop"), true)); // S3TC
 	r_options->push_back(ExportOption(PropertyInfo(Variant::BOOL, "vram_texture_compression/for_mobile"), false)); // ETC or ETC2, depending on renderer
 
+	r_options->push_back(ExportOption(PropertyInfo(Variant::BOOL, "html/export_icon"), true));
 	r_options->push_back(ExportOption(PropertyInfo(Variant::STRING, "html/custom_html_shell", PROPERTY_HINT_FILE, "*.html"), ""));
 	r_options->push_back(ExportOption(PropertyInfo(Variant::STRING, "html/head_include", PROPERTY_HINT_MULTILINE_TEXT), ""));
 	r_options->push_back(ExportOption(PropertyInfo(Variant::INT, "html/canvas_resize_policy", PROPERTY_HINT_ENUM, "None,Project,Adaptive"), 2));
 	r_options->push_back(ExportOption(PropertyInfo(Variant::BOOL, "html/experimental_virtual_keyboard"), false));
+	r_options->push_back(ExportOption(PropertyInfo(Variant::BOOL, "progressive_web_app/enabled"), false));
+	r_options->push_back(ExportOption(PropertyInfo(Variant::STRING, "progressive_web_app/offline_page", PROPERTY_HINT_FILE, "*.html"), ""));
+	r_options->push_back(ExportOption(PropertyInfo(Variant::INT, "progressive_web_app/display", PROPERTY_HINT_ENUM, "Fullscreen,Standalone,Minimal Ui,Browser"), 1));
+	r_options->push_back(ExportOption(PropertyInfo(Variant::INT, "progressive_web_app/orientation", PROPERTY_HINT_ENUM, "Any,Landscape,Portrait"), 0));
+	r_options->push_back(ExportOption(PropertyInfo(Variant::STRING, "progressive_web_app/icon_144x144", PROPERTY_HINT_FILE, "*.png,*.webp,*.svg,*.svgz"), ""));
+	r_options->push_back(ExportOption(PropertyInfo(Variant::STRING, "progressive_web_app/icon_180x180", PROPERTY_HINT_FILE, "*.png,*.webp,*.svg,*.svgz"), ""));
+	r_options->push_back(ExportOption(PropertyInfo(Variant::STRING, "progressive_web_app/icon_512x512", PROPERTY_HINT_FILE, "*.png,*.webp,*.svg,*.svgz"), ""));
+	r_options->push_back(ExportOption(PropertyInfo(Variant::COLOR, "progressive_web_app/background_color", PROPERTY_HINT_COLOR_NO_ALPHA), Color()));
 }
 
 String EditorExportPlatformJavaScript::get_name() const {
@@ -419,20 +722,25 @@ List<String> EditorExportPlatformJavaScript::get_binary_extensions(const Ref<Edi
 Error EditorExportPlatformJavaScript::export_project(const Ref<EditorExportPreset> &p_preset, bool p_debug, const String &p_path, int p_flags) {
 	ExportNotifier notifier(*this, p_preset, p_debug, p_path, p_flags);
 
-	String custom_debug = p_preset->get("custom_template/debug");
-	String custom_release = p_preset->get("custom_template/release");
-	String custom_html = p_preset->get("html/custom_html_shell");
+	const String custom_debug = p_preset->get("custom_template/debug");
+	const String custom_release = p_preset->get("custom_template/release");
+	const String custom_html = p_preset->get("html/custom_html_shell");
+	const bool export_icon = p_preset->get("html/export_icon");
+	const bool pwa = p_preset->get("progressive_web_app/enabled");
 
-	String template_path = p_debug ? custom_debug : custom_release;
+	const String base_dir = p_path.get_base_dir();
+	const String base_path = p_path.get_basename();
+	const String base_name = p_path.get_file().get_basename();
 
+	// Find the correct template
+	String template_path = p_debug ? custom_debug : custom_release;
 	template_path = template_path.strip_edges();
-
 	if (template_path == String()) {
 		ExportMode mode = (ExportMode)(int)p_preset->get("variant/export_type");
 		template_path = find_export_template(_get_template_name(mode, p_debug));
 	}
 
-	if (!DirAccess::exists(p_path.get_base_dir())) {
+	if (!DirAccess::exists(base_dir)) {
 		return ERR_FILE_BAD_PATH;
 	}
 
@@ -441,8 +749,9 @@ Error EditorExportPlatformJavaScript::export_project(const Ref<EditorExportPrese
 		return ERR_FILE_NOT_FOUND;
 	}
 
+	// Export pck and shared objects
 	Vector<SharedObject> shared_objects;
-	String pck_path = p_path.get_basename() + ".pck";
+	String pck_path = base_path + ".pck";
 	Error error = save_pack(p_preset, pck_path, &shared_objects);
 	if (error != OK) {
 		EditorNode::get_singleton()->show_warning(TTR("Could not write file:") + "\n" + pck_path);
@@ -450,7 +759,7 @@ Error EditorExportPlatformJavaScript::export_project(const Ref<EditorExportPrese
 	}
 	DirAccess *da = DirAccess::create(DirAccess::ACCESS_FILESYSTEM);
 	for (int i = 0; i < shared_objects.size(); i++) {
-		String dst = p_path.get_base_dir().plus_file(shared_objects[i].path.get_file());
+		String dst = base_dir.plus_file(shared_objects[i].path.get_file());
 		error = da->copy(shared_objects[i].path, dst);
 		if (error != OK) {
 			EditorNode::get_singleton()->show_warning(TTR("Could not write file:") + "\n" + shared_objects[i].path.get_file());
@@ -459,124 +768,54 @@ Error EditorExportPlatformJavaScript::export_project(const Ref<EditorExportPrese
 		}
 	}
 	memdelete(da);
+	da = nullptr;
 
-	FileAccess *src_f = nullptr;
-	zlib_filefunc_def io = zipio_create_io_from_file(&src_f);
-	unzFile pkg = unzOpen2(template_path.utf8().get_data(), &io);
-
-	if (!pkg) {
-		EditorNode::get_singleton()->show_warning(TTR("Could not open template for export:") + "\n" + template_path);
-		return ERR_FILE_NOT_FOUND;
-	}
-
-	if (unzGoToFirstFile(pkg) != UNZ_OK) {
-		EditorNode::get_singleton()->show_warning(TTR("Invalid export template:") + "\n" + template_path);
-		unzClose(pkg);
-		return ERR_FILE_CORRUPT;
+	// Extract templates.
+	error = _extract_template(template_path, base_dir, base_name, pwa);
+	if (error) {
+		return error;
 	}
 
-	Vector<uint8_t> html;
+	// Parse generated file sizes (pck and wasm, to help show a meaningful loading bar).
 	Dictionary file_sizes;
-	do {
-		//get filename
-		unz_file_info info;
-		char fname[16384];
-		unzGetCurrentFileInfo(pkg, &info, fname, 16384, nullptr, 0, nullptr, 0);
-
-		String file = fname;
-
-		// HTML is handled later
-		if (file == "godot.html") {
-			if (custom_html.is_empty()) {
-				html.resize(info.uncompressed_size);
-				unzOpenCurrentFile(pkg);
-				unzReadCurrentFile(pkg, html.ptrw(), html.size());
-				unzCloseCurrentFile(pkg);
-			}
-			continue;
-		}
-		Vector<uint8_t> data;
-		data.resize(info.uncompressed_size);
-
-		//read
-		unzOpenCurrentFile(pkg);
-		unzReadCurrentFile(pkg, data.ptrw(), data.size());
-		unzCloseCurrentFile(pkg);
-
-		//write
-
-		if (file == "godot.js") {
-			file = p_path.get_file().get_basename() + ".js";
-
-		} else if (file == "godot.worker.js") {
-			file = p_path.get_file().get_basename() + ".worker.js";
-
-		} else if (file == "godot.side.wasm") {
-			file = p_path.get_file().get_basename() + ".side.wasm";
-
-		} else if (file == "godot.audio.worklet.js") {
-			file = p_path.get_file().get_basename() + ".audio.worklet.js";
-
-		} else if (file == "godot.wasm") {
-			file = p_path.get_file().get_basename() + ".wasm";
-			file_sizes[file.get_file()] = (uint64_t)info.uncompressed_size;
-		}
-
-		String dst = p_path.get_base_dir().plus_file(file);
-		FileAccess *f = FileAccess::open(dst, FileAccess::WRITE);
-		if (!f) {
-			EditorNode::get_singleton()->show_warning(TTR("Could not write file:") + "\n" + dst);
-			unzClose(pkg);
-			return ERR_FILE_CANT_WRITE;
-		}
-		f->store_buffer(data.ptr(), data.size());
-		memdelete(f);
-
-	} while (unzGoToNextFile(pkg) == UNZ_OK);
-	unzClose(pkg);
-
-	if (!custom_html.is_empty()) {
-		FileAccess *f = FileAccess::open(custom_html, FileAccess::READ);
-		if (!f) {
-			EditorNode::get_singleton()->show_warning(TTR("Could not read custom HTML shell:") + "\n" + custom_html);
-			return ERR_FILE_CANT_READ;
-		}
-		html.resize(f->get_len());
-		f->get_buffer(html.ptrw(), html.size());
+	FileAccess *f = nullptr;
+	f = FileAccess::open(pck_path, FileAccess::READ);
+	if (f) {
+		file_sizes[pck_path.get_file()] = (uint64_t)f->get_len();
 		memdelete(f);
+		f = nullptr;
 	}
-	{
-		FileAccess *f = FileAccess::open(pck_path, FileAccess::READ);
-		if (f) {
-			file_sizes[pck_path.get_file()] = (uint64_t)f->get_len();
-			memdelete(f);
-			f = nullptr;
-		}
-		_fix_html(html, p_preset, p_path.get_file().get_basename(), p_debug, p_flags, shared_objects, file_sizes);
-		f = FileAccess::open(p_path, FileAccess::WRITE);
-		if (!f) {
-			EditorNode::get_singleton()->show_warning(TTR("Could not write file:") + "\n" + p_path);
-			return ERR_FILE_CANT_WRITE;
-		}
-		f->store_buffer(html.ptr(), html.size());
+	f = FileAccess::open(base_path + ".wasm", FileAccess::READ);
+	if (f) {
+		file_sizes[base_name + ".wasm"] = (uint64_t)f->get_len();
 		memdelete(f);
-		html.resize(0);
+		f = nullptr;
 	}
 
-	Ref<Image> splash;
-	const String splash_path = String(GLOBAL_GET("application/boot_splash/image")).strip_edges();
-	if (!splash_path.is_empty()) {
-		splash.instance();
-		const Error err = splash->load(splash_path);
-		if (err) {
-			EditorNode::get_singleton()->show_warning(TTR("Could not read boot splash image file:") + "\n" + splash_path + "\n" + TTR("Using default boot splash image."));
-			splash.unref();
-		}
-	}
-	if (splash.is_null()) {
-		splash = Ref<Image>(memnew(Image(boot_splash_png)));
+	// Read the HTML shell file (custom or from template).
+	const String html_path = custom_html.is_empty() ? base_path + ".html" : custom_html;
+	Vector<uint8_t> html;
+	f = FileAccess::open(html_path, FileAccess::READ);
+	if (!f) {
+		EditorNode::get_singleton()->show_warning(TTR("Could not read HTML shell:") + "\n" + html_path);
+		return ERR_FILE_CANT_READ;
+	}
+	html.resize(f->get_len());
+	f->get_buffer(html.ptrw(), html.size());
+	memdelete(f);
+	f = nullptr;
+
+	// Generate HTML file with replaced strings.
+	_fix_html(html, p_preset, base_name, p_debug, p_flags, shared_objects, file_sizes);
+	Error err = _write_or_error(html.ptr(), html.size(), p_path);
+	if (err != OK) {
+		return err;
 	}
-	const String splash_png_path = p_path.get_base_dir().plus_file(p_path.get_file().get_basename() + ".png");
+	html.resize(0);
+
+	// Export splash (why?)
+	Ref<Image> splash = _get_project_splash();
+	const String splash_png_path = base_path + ".png";
 	if (splash->save_png(splash_png_path) != OK) {
 		EditorNode::get_singleton()->show_warning(TTR("Could not write file:") + "\n" + splash_png_path);
 		return ERR_FILE_CANT_WRITE;
@@ -584,22 +823,27 @@ Error EditorExportPlatformJavaScript::export_project(const Ref<EditorExportPrese
 
 	// Save a favicon that can be accessed without waiting for the project to finish loading.
 	// This way, the favicon can be displayed immediately when loading the page.
-	Ref<Image> favicon;
-	const String favicon_path = String(GLOBAL_GET("application/config/icon")).strip_edges();
-	if (!favicon_path.is_empty()) {
-		favicon.instance();
-		const Error err = favicon->load(favicon_path);
-		if (err) {
-			favicon.unref();
-		}
-	}
-
-	if (favicon.is_valid()) {
-		const String favicon_png_path = p_path.get_base_dir().plus_file("favicon.png");
+	if (export_icon) {
+		Ref<Image> favicon = _get_project_icon();
+		const String favicon_png_path = base_path + ".icon.png";
 		if (favicon->save_png(favicon_png_path) != OK) {
 			EditorNode::get_singleton()->show_warning(TTR("Could not write file:") + "\n" + favicon_png_path);
 			return ERR_FILE_CANT_WRITE;
 		}
+		favicon->resize(180, 180);
+		const String apple_icon_png_path = base_path + ".apple-touch-icon.png";
+		if (favicon->save_png(apple_icon_png_path) != OK) {
+			EditorNode::get_singleton()->show_warning(TTR("Could not write file:") + "\n" + apple_icon_png_path);
+			return ERR_FILE_CANT_WRITE;
+		}
+	}
+
+	// Generate the PWA worker and manifest
+	if (pwa) {
+		err = _build_pwa(p_preset, p_path, shared_objects);
+		if (err != OK) {
+			return err;
+		}
 	}
 
 	return OK;
@@ -644,19 +888,31 @@ Error EditorExportPlatformJavaScript::run(const Ref<EditorExportPreset> &p_prese
 		return OK;
 	}
 
-	const String basepath = EditorSettings::get_singleton()->get_cache_dir().plus_file("tmp_js_export");
+	const String dest = EditorSettings::get_singleton()->get_cache_dir().plus_file("web");
+	DirAccessRef da = DirAccess::create(DirAccess::ACCESS_FILESYSTEM);
+	if (!da->dir_exists(dest)) {
+		Error err = da->make_dir_recursive(dest);
+		if (err != OK) {
+			EditorNode::get_singleton()->show_warning(TTR("Could not create HTTP server directory:") + "\n" + dest);
+			return err;
+		}
+	}
+	const String basepath = dest.plus_file("tmp_js_export");
 	Error err = export_project(p_preset, true, basepath + ".html", p_debug_flags);
 	if (err != OK) {
 		// Export generates several files, clean them up on failure.
 		DirAccess::remove_file_or_error(basepath + ".html");
+		DirAccess::remove_file_or_error(basepath + ".offline.html");
 		DirAccess::remove_file_or_error(basepath + ".js");
 		DirAccess::remove_file_or_error(basepath + ".worker.js");
 		DirAccess::remove_file_or_error(basepath + ".audio.worklet.js");
+		DirAccess::remove_file_or_error(basepath + ".service.worker.js");
 		DirAccess::remove_file_or_error(basepath + ".pck");
 		DirAccess::remove_file_or_error(basepath + ".png");
 		DirAccess::remove_file_or_error(basepath + ".side.wasm");
 		DirAccess::remove_file_or_error(basepath + ".wasm");
-		DirAccess::remove_file_or_error(EditorSettings::get_singleton()->get_cache_dir().plus_file("favicon.png"));
+		DirAccess::remove_file_or_error(basepath + ".icon.png");
+		DirAccess::remove_file_or_error(basepath + ".apple-touch-icon.png");
 		return err;
 	}
 
@@ -671,16 +927,23 @@ Error EditorExportPlatformJavaScript::run(const Ref<EditorExportPreset> &p_prese
 	}
 	ERR_FAIL_COND_V_MSG(!bind_ip.is_valid(), ERR_INVALID_PARAMETER, "Invalid editor setting 'export/web/http_host': '" + bind_host + "'. Try using '127.0.0.1'.");
 
+	const bool use_ssl = EDITOR_GET("export/web/use_ssl");
+	const String ssl_key = EDITOR_GET("export/web/ssl_key");
+	const String ssl_cert = EDITOR_GET("export/web/ssl_certificate");
+
 	// Restart server.
 	{
 		MutexLock lock(server_lock);
 
 		server->stop();
-		err = server->listen(bind_port, bind_ip);
+		err = server->listen(bind_port, bind_ip, use_ssl, ssl_key, ssl_cert);
+	}
+	if (err != OK) {
+		EditorNode::get_singleton()->show_warning(TTR("Error starting HTTP server:") + "\n" + itos(err));
+		return err;
 	}
-	ERR_FAIL_COND_V_MSG(err != OK, err, "Unable to start HTTP server.");
 
-	OS::get_singleton()->shell_open(String("http://" + bind_host + ":" + itos(bind_port) + "/tmp_js_export.html"));
+	OS::get_singleton()->shell_open(String((use_ssl ? "https://" : "http://") + bind_host + ":" + itos(bind_port) + "/tmp_js_export.html"));
 	// FIXME: Find out how to clean up export files after running the successfully
 	// exported game. Might not be trivial.
 	return OK;
@@ -730,7 +993,12 @@ EditorExportPlatformJavaScript::~EditorExportPlatformJavaScript() {
 void register_javascript_exporter() {
 	EDITOR_DEF("export/web/http_host", "localhost");
 	EDITOR_DEF("export/web/http_port", 8060);
+	EDITOR_DEF("export/web/use_ssl", false);
+	EDITOR_DEF("export/web/ssl_key", "");
+	EDITOR_DEF("export/web/ssl_certificate", "");
 	EditorSettings::get_singleton()->add_property_hint(PropertyInfo(Variant::INT, "export/web/http_port", PROPERTY_HINT_RANGE, "1,65535,1"));
+	EditorSettings::get_singleton()->add_property_hint(PropertyInfo(Variant::STRING, "export/web/ssl_key", PROPERTY_HINT_GLOBAL_FILE, "*.key"));
+	EditorSettings::get_singleton()->add_property_hint(PropertyInfo(Variant::STRING, "export/web/ssl_certificate", PROPERTY_HINT_GLOBAL_FILE, "*.crt,*.pem"));
 
 	Ref<EditorExportPlatformJavaScript> platform;
 	platform.instance();
diff --git a/platform/javascript/http_client_javascript.cpp b/platform/javascript/http_client_javascript.cpp
index b79c965854..a6cf4b0eb8 100644
--- a/platform/javascript/http_client_javascript.cpp
+++ b/platform/javascript/http_client_javascript.cpp
@@ -209,7 +209,7 @@ PackedByteArray HTTPClient::read_response_body_chunk() {
 		return chunk;
 	}
 	chunk.resize(read);
-	copymem(chunk.ptrw(), response_buffer.ptr(), read);
+	memcpy(chunk.ptrw(), response_buffer.ptr(), read);
 	return chunk;
 }
 
diff --git a/platform/linuxbsd/detect.py b/platform/linuxbsd/detect.py
index 6b527c6fb5..adbbcaac31 100644
--- a/platform/linuxbsd/detect.py
+++ b/platform/linuxbsd/detect.py
@@ -67,10 +67,10 @@ def get_opts():
         BoolVariable("use_static_cpp", "Link libgcc and libstdc++ statically for better portability", True),
         BoolVariable("use_coverage", "Test Godot coverage", False),
         BoolVariable("use_ubsan", "Use LLVM/GCC compiler undefined behavior sanitizer (UBSAN)", False),
-        BoolVariable("use_asan", "Use LLVM/GCC compiler address sanitizer (ASAN))", False),
-        BoolVariable("use_lsan", "Use LLVM/GCC compiler leak sanitizer (LSAN))", False),
-        BoolVariable("use_tsan", "Use LLVM/GCC compiler thread sanitizer (TSAN))", False),
-        BoolVariable("use_msan", "Use LLVM/GCC compiler memory sanitizer (MSAN))", False),
+        BoolVariable("use_asan", "Use LLVM/GCC compiler address sanitizer (ASAN)", False),
+        BoolVariable("use_lsan", "Use LLVM/GCC compiler leak sanitizer (LSAN)", False),
+        BoolVariable("use_tsan", "Use LLVM/GCC compiler thread sanitizer (TSAN)", False),
+        BoolVariable("use_msan", "Use LLVM compiler memory sanitizer (MSAN)", False),
         BoolVariable("pulseaudio", "Detect and use PulseAudio", True),
         BoolVariable("udev", "Use udev for gamepad connection callbacks", True),
         BoolVariable("debug_symbols", "Add debugging symbols to release/release_debug builds", True),
@@ -147,11 +147,23 @@ def configure(env):
         env.extra_suffix += "s"
 
         if env["use_ubsan"]:
-            env.Append(CCFLAGS=["-fsanitize=undefined"])
+            env.Append(
+                CCFLAGS=[
+                    "-fsanitize=undefined,shift,shift-exponent,integer-divide-by-zero,unreachable,vla-bound,null,return,signed-integer-overflow,bounds,float-divide-by-zero,float-cast-overflow,nonnull-attribute,returns-nonnull-attribute,bool,enum,vptr,pointer-overflow,builtin"
+                ]
+            )
             env.Append(LINKFLAGS=["-fsanitize=undefined"])
+            if env["use_llvm"]:
+                env.Append(
+                    CCFLAGS=[
+                        "-fsanitize=nullability-return,nullability-arg,function,nullability-assign,implicit-integer-sign-change"
+                    ]
+                )
+            else:
+                env.Append(CCFLAGS=["-fsanitize=bounds-strict"])
 
         if env["use_asan"]:
-            env.Append(CCFLAGS=["-fsanitize=address"])
+            env.Append(CCFLAGS=["-fsanitize=address,pointer-subtract,pointer-compare"])
             env.Append(LINKFLAGS=["-fsanitize=address"])
 
         if env["use_lsan"]:
@@ -162,8 +174,10 @@ def configure(env):
             env.Append(CCFLAGS=["-fsanitize=thread"])
             env.Append(LINKFLAGS=["-fsanitize=thread"])
 
-        if env["use_msan"]:
+        if env["use_msan"] and env["use_llvm"]:
             env.Append(CCFLAGS=["-fsanitize=memory"])
+            env.Append(CCFLAGS=["-fsanitize-memory-track-origins"])
+            env.Append(CCFLAGS=["-fsanitize-recover=memory"])
             env.Append(LINKFLAGS=["-fsanitize=memory"])
 
     if env["use_lto"]:
@@ -394,10 +408,7 @@ def configure(env):
 
     # Link those statically for portability
     if env["use_static_cpp"]:
-        # Workaround for GH-31743, Ubuntu 18.04 i386 crashes when it's used.
-        # That doesn't make any sense but it's likely a Ubuntu bug?
-        if is64 or env["bits"] == "64":
-            env.Append(LINKFLAGS=["-static-libgcc", "-static-libstdc++"])
+        env.Append(LINKFLAGS=["-static-libgcc", "-static-libstdc++"])
         if env["use_llvm"]:
             env["LINKCOM"] = env["LINKCOM"] + " -l:libatomic.a"
 
diff --git a/platform/osx/crash_handler_osx.mm b/platform/osx/crash_handler_osx.mm
index 147ce26779..0f128d504f 100644
--- a/platform/osx/crash_handler_osx.mm
+++ b/platform/osx/crash_handler_osx.mm
@@ -70,7 +70,7 @@ static uint64_t load_address() {
 }
 
 static void handle_crash(int sig) {
-	if (OS::get_singleton() == NULL) {
+	if (OS::get_singleton() == nullptr) {
 		abort();
 	}
 
@@ -105,7 +105,7 @@ static void handle_crash(int sig) {
 			if (dladdr(bt_buffer[i], &info) && info.dli_sname) {
 				if (info.dli_sname[0] == '_') {
 					int status;
-					char *demangled = abi::__cxa_demangle(info.dli_sname, NULL, 0, &status);
+					char *demangled = abi::__cxa_demangle(info.dli_sname, nullptr, 0, &status);
 
 					if (status == 0 && demangled) {
 						snprintf(fname, 1024, "%s", demangled);
@@ -167,9 +167,9 @@ void CrashHandler::disable() {
 		return;
 
 #ifdef CRASH_HANDLER_ENABLED
-	signal(SIGSEGV, NULL);
-	signal(SIGFPE, NULL);
-	signal(SIGILL, NULL);
+	signal(SIGSEGV, nullptr);
+	signal(SIGFPE, nullptr);
+	signal(SIGILL, nullptr);
 #endif
 
 	disabled = true;
diff --git a/platform/osx/detect.py b/platform/osx/detect.py
index 5b320da82f..317e79d0ea 100644
--- a/platform/osx/detect.py
+++ b/platform/osx/detect.py
@@ -34,9 +34,8 @@ def get_opts():
         BoolVariable("debug_symbols", "Add debugging symbols to release/release_debug builds", True),
         BoolVariable("separate_debug_symbols", "Create a separate file containing debugging symbols", False),
         BoolVariable("use_ubsan", "Use LLVM/GCC compiler undefined behavior sanitizer (UBSAN)", False),
-        BoolVariable("use_asan", "Use LLVM/GCC compiler address sanitizer (ASAN))", False),
-        BoolVariable("use_lsan", "Use LLVM/GCC compiler leak sanitizer (LSAN))", False),
-        BoolVariable("use_tsan", "Use LLVM/GCC compiler thread sanitizer (TSAN))", False),
+        BoolVariable("use_asan", "Use LLVM/GCC compiler address sanitizer (ASAN)", False),
+        BoolVariable("use_tsan", "Use LLVM/GCC compiler thread sanitizer (TSAN)", False),
     ]
 
 
@@ -132,21 +131,22 @@ def configure(env):
         env["AS"] = basecmd + "as"
         env.Append(CPPDEFINES=["__MACPORTS__"])  # hack to fix libvpx MM256_BROADCASTSI128_SI256 define
 
-    if env["use_ubsan"] or env["use_asan"] or env["use_lsan"] or env["use_tsan"]:
+    if env["use_ubsan"] or env["use_asan"] or env["use_tsan"]:
         env.extra_suffix += "s"
 
         if env["use_ubsan"]:
-            env.Append(CCFLAGS=["-fsanitize=undefined"])
+            env.Append(
+                CCFLAGS=[
+                    "-fsanitize=undefined,shift,shift-exponent,integer-divide-by-zero,unreachable,vla-bound,null,return,signed-integer-overflow,bounds,float-divide-by-zero,float-cast-overflow,nonnull-attribute,returns-nonnull-attribute,bool,enum,vptr,pointer-overflow,builtin"
+                ]
+            )
             env.Append(LINKFLAGS=["-fsanitize=undefined"])
+            env.Append(CCFLAGS=["-fsanitize=nullability-return,nullability-arg,function,nullability-assign"])
 
         if env["use_asan"]:
-            env.Append(CCFLAGS=["-fsanitize=address"])
+            env.Append(CCFLAGS=["-fsanitize=address,pointer-subtract,pointer-compare"])
             env.Append(LINKFLAGS=["-fsanitize=address"])
 
-        if env["use_lsan"]:
-            env.Append(CCFLAGS=["-fsanitize=leak"])
-            env.Append(LINKFLAGS=["-fsanitize=leak"])
-
         if env["use_tsan"]:
             env.Append(CCFLAGS=["-fsanitize=thread"])
             env.Append(LINKFLAGS=["-fsanitize=thread"])
diff --git a/platform/osx/display_server_osx.mm b/platform/osx/display_server_osx.mm
index 6b838b6d14..61bb26c2a8 100644
--- a/platform/osx/display_server_osx.mm
+++ b/platform/osx/display_server_osx.mm
@@ -1640,7 +1640,7 @@ String DisplayServerOSX::get_name() const {
 }
 
 const NSMenu *DisplayServerOSX::_get_menu_root(const String &p_menu_root) const {
-	const NSMenu *menu = NULL;
+	const NSMenu *menu = nullptr;
 	if (p_menu_root == "") {
 		// Main menu.x
 		menu = [NSApp mainMenu];
@@ -1655,13 +1655,13 @@ const NSMenu *DisplayServerOSX::_get_menu_root(const String &p_menu_root) const
 	}
 	if (menu == apple_menu) {
 		// Do not allow to change Apple menu.
-		return NULL;
+		return nullptr;
 	}
 	return menu;
 }
 
 NSMenu *DisplayServerOSX::_get_menu_root(const String &p_menu_root) {
-	NSMenu *menu = NULL;
+	NSMenu *menu = nullptr;
 	if (p_menu_root == "") {
 		// Main menu.
 		menu = [NSApp mainMenu];
@@ -1678,7 +1678,7 @@ NSMenu *DisplayServerOSX::_get_menu_root(const String &p_menu_root) {
 	}
 	if (menu == apple_menu) {
 		// Do not allow to change Apple menu.
-		return NULL;
+		return nullptr;
 	}
 	return menu;
 }
@@ -3029,7 +3029,7 @@ void DisplayServerOSX::cursor_set_shape(CursorShape p_shape) {
 		return;
 	}
 
-	if (cursors[p_shape] != NULL) {
+	if (cursors[p_shape] != nullptr) {
 		[cursors[p_shape] set];
 	} else {
 		switch (p_shape) {
@@ -3202,9 +3202,9 @@ void DisplayServerOSX::cursor_set_custom_image(const RES &p_cursor, CursorShape
 		[nsimage release];
 	} else {
 		// Reset to default system cursor
-		if (cursors[p_shape] != NULL) {
+		if (cursors[p_shape] != nullptr) {
 			[cursors[p_shape] release];
-			cursors[p_shape] = NULL;
+			cursors[p_shape] = nullptr;
 		}
 
 		CursorShape c = cursor_shape;
@@ -3759,12 +3759,12 @@ DisplayServerOSX::DisplayServerOSX(const String &p_rendering_driver, WindowMode
 
 	// Register to be notified on keyboard layout changes
 	CFNotificationCenterAddObserver(CFNotificationCenterGetDistributedCenter(),
-			NULL, keyboard_layout_changed,
-			kTISNotifySelectedKeyboardInputSourceChanged, NULL,
+			nullptr, keyboard_layout_changed,
+			kTISNotifySelectedKeyboardInputSourceChanged, nullptr,
 			CFNotificationSuspensionBehaviorDeliverImmediately);
 
 	// Register to be notified on displays arrangement changes
-	CGDisplayRegisterReconfigurationCallback(displays_arrangement_changed, NULL);
+	CGDisplayRegisterReconfigurationCallback(displays_arrangement_changed, nullptr);
 
 	// Menu bar setup must go between sharedApplication above and
 	// finishLaunching below, in order to properly emulate the behavior
@@ -3854,7 +3854,7 @@ DisplayServerOSX::DisplayServerOSX(const String &p_rendering_driver, WindowMode
 		context_vulkan = memnew(VulkanContextOSX);
 		if (context_vulkan->initialize() != OK) {
 			memdelete(context_vulkan);
-			context_vulkan = NULL;
+			context_vulkan = nullptr;
 			r_error = ERR_CANT_CREATE;
 			ERR_FAIL_MSG("Could not initialize Vulkan");
 		}
@@ -3926,8 +3926,8 @@ DisplayServerOSX::~DisplayServerOSX() {
 	}
 #endif
 
-	CFNotificationCenterRemoveObserver(CFNotificationCenterGetDistributedCenter(), NULL, kTISNotifySelectedKeyboardInputSourceChanged, NULL);
-	CGDisplayRemoveReconfigurationCallback(displays_arrangement_changed, NULL);
+	CFNotificationCenterRemoveObserver(CFNotificationCenterGetDistributedCenter(), nullptr, kTISNotifySelectedKeyboardInputSourceChanged, nullptr);
+	CGDisplayRemoveReconfigurationCallback(displays_arrangement_changed, nullptr);
 
 	cursors_cache.clear();
 }
diff --git a/platform/osx/export/export.cpp b/platform/osx/export/export.cpp
index aca9471849..51204bc8f6 100644
--- a/platform/osx/export/export.cpp
+++ b/platform/osx/export/export.cpp
@@ -215,7 +215,7 @@ void _rgba8_to_packbits_encode(int p_ch, int p_size, Vector<uint8_t> &p_source,
 			if ((p_source.ptr()[(i + 1) * 4 + p_ch] == cur) && (p_source.ptr()[(i + 2) * 4 + p_ch] == cur)) {
 				if (buf_size > 0) {
 					result.write[res_size++] = (uint8_t)(buf_size - 1);
-					copymem(&result.write[res_size], &buf, buf_size);
+					memcpy(&result.write[res_size], &buf, buf_size);
 					res_size += buf_size;
 					buf_size = 0;
 				}
@@ -241,7 +241,7 @@ void _rgba8_to_packbits_encode(int p_ch, int p_size, Vector<uint8_t> &p_source,
 				buf[buf_size++] = cur;
 				if (buf_size == 128) {
 					result.write[res_size++] = (uint8_t)(buf_size - 1);
-					copymem(&result.write[res_size], &buf, buf_size);
+					memcpy(&result.write[res_size], &buf, buf_size);
 					res_size += buf_size;
 					buf_size = 0;
 				}
@@ -249,7 +249,7 @@ void _rgba8_to_packbits_encode(int p_ch, int p_size, Vector<uint8_t> &p_source,
 		} else {
 			buf[buf_size++] = cur;
 			result.write[res_size++] = (uint8_t)(buf_size - 1);
-			copymem(&result.write[res_size], &buf, buf_size);
+			memcpy(&result.write[res_size], &buf, buf_size);
 			res_size += buf_size;
 			buf_size = 0;
 		}
@@ -259,7 +259,7 @@ void _rgba8_to_packbits_encode(int p_ch, int p_size, Vector<uint8_t> &p_source,
 
 	int ofs = p_dest.size();
 	p_dest.resize(p_dest.size() + res_size);
-	copymem(&p_dest.write[ofs], result.ptr(), res_size);
+	memcpy(&p_dest.write[ofs], result.ptr(), res_size);
 }
 
 void EditorExportPlatformOSX::_make_icon(const Ref<Image> &p_icon, Vector<uint8_t> &p_data) {
@@ -318,7 +318,7 @@ void EditorExportPlatformOSX::_make_icon(const Ref<Image> &p_icon, Vector<uint8_
 			memdelete(f);
 			len += 8;
 			len = BSWAP32(len);
-			copymem(&data.write[ofs], icon_infos[i].name, 4);
+			memcpy(&data.write[ofs], icon_infos[i].name, 4);
 			encode_uint32(len, &data.write[ofs + 4]);
 
 			// Clean up generated file.
@@ -338,7 +338,7 @@ void EditorExportPlatformOSX::_make_icon(const Ref<Image> &p_icon, Vector<uint8_
 
 				int len = data.size() - ofs;
 				len = BSWAP32(len);
-				copymem(&data.write[ofs], icon_infos[i].name, 4);
+				memcpy(&data.write[ofs], icon_infos[i].name, 4);
 				encode_uint32(len, &data.write[ofs + 4]);
 			}
 
@@ -353,7 +353,7 @@ void EditorExportPlatformOSX::_make_icon(const Ref<Image> &p_icon, Vector<uint8_
 				}
 				len += 8;
 				len = BSWAP32(len);
-				copymem(&data.write[ofs], icon_infos[i].mask_name, 4);
+				memcpy(&data.write[ofs], icon_infos[i].mask_name, 4);
 				encode_uint32(len, &data.write[ofs + 4]);
 			}
 		}
diff --git a/platform/osx/os_osx.mm b/platform/osx/os_osx.mm
index 7b5daf5bfb..e6feda5a9b 100644
--- a/platform/osx/os_osx.mm
+++ b/platform/osx/os_osx.mm
@@ -101,7 +101,7 @@ String OS_OSX::get_unique_id() const {
 
 	if (serial_number.is_empty()) {
 		io_service_t platformExpert = IOServiceGetMatchingService(kIOMasterPortDefault, IOServiceMatching("IOPlatformExpertDevice"));
-		CFStringRef serialNumberAsCFString = NULL;
+		CFStringRef serialNumberAsCFString = nullptr;
 		if (platformExpert) {
 			serialNumberAsCFString = (CFStringRef)IORegistryEntryCreateCFProperty(platformExpert, CFSTR(kIOPlatformSerialNumberKey), kCFAllocatorDefault, 0);
 			IOObjectRelease(platformExpert);
@@ -158,7 +158,7 @@ void OS_OSX::delete_main_loop() {
 	if (!main_loop)
 		return;
 	memdelete(main_loop);
-	main_loop = NULL;
+	main_loop = nullptr;
 }
 
 String OS_OSX::get_name() const {
@@ -346,7 +346,7 @@ Error OS_OSX::move_to_trash(const String &p_path) {
 }
 
 OS_OSX::OS_OSX() {
-	main_loop = NULL;
+	main_loop = nullptr;
 	force_quit = false;
 
 	Vector<Logger *> loggers;
diff --git a/platform/osx/vulkan_context_osx.mm b/platform/osx/vulkan_context_osx.mm
index 75a4fc990f..6b87fbd489 100644
--- a/platform/osx/vulkan_context_osx.mm
+++ b/platform/osx/vulkan_context_osx.mm
@@ -38,12 +38,12 @@ const char *VulkanContextOSX::_get_platform_surface_extension() const {
 Error VulkanContextOSX::window_create(DisplayServer::WindowID p_window_id, id p_window, int p_width, int p_height) {
 	VkMacOSSurfaceCreateInfoMVK createInfo;
 	createInfo.sType = VK_STRUCTURE_TYPE_MACOS_SURFACE_CREATE_INFO_MVK;
-	createInfo.pNext = NULL;
+	createInfo.pNext = nullptr;
 	createInfo.flags = 0;
 	createInfo.pView = p_window;
 
 	VkSurfaceKHR surface;
-	VkResult err = vkCreateMacOSSurfaceMVK(_get_instance(), &createInfo, NULL, &surface);
+	VkResult err = vkCreateMacOSSurfaceMVK(_get_instance(), &createInfo, nullptr, &surface);
 	ERR_FAIL_COND_V(err, ERR_CANT_CREATE);
 	return _window_create(p_window_id, surface, p_width, p_height);
 }
diff --git a/platform/server/detect.py b/platform/server/detect.py
index 16ddbe1768..478bcad212 100644
--- a/platform/server/detect.py
+++ b/platform/server/detect.py
@@ -35,11 +35,11 @@ def get_opts():
         BoolVariable("use_static_cpp", "Link libgcc and libstdc++ statically for better portability", True),
         BoolVariable("use_coverage", "Test Godot coverage", False),
         BoolVariable("use_ubsan", "Use LLVM/GCC compiler undefined behavior sanitizer (UBSAN)", False),
-        BoolVariable("use_asan", "Use LLVM/GCC compiler address sanitizer (ASAN))", False),
-        BoolVariable("use_lsan", "Use LLVM/GCC compiler leak sanitizer (LSAN))", False),
-        BoolVariable("use_tsan", "Use LLVM/GCC compiler thread sanitizer (TSAN))", False),
+        BoolVariable("use_asan", "Use LLVM/GCC compiler address sanitizer (ASAN)", False),
+        BoolVariable("use_lsan", "Use LLVM/GCC compiler leak sanitizer (LSAN)", False),
+        BoolVariable("use_tsan", "Use LLVM/GCC compiler thread sanitizer (TSAN)", False),
+        BoolVariable("use_msan", "Use LLVM compiler memory sanitizer (MSAN)", False),
         BoolVariable("debug_symbols", "Add debugging symbols to release/release_debug builds", True),
-        BoolVariable("use_msan", "Use LLVM/GCC compiler memory sanitizer (MSAN))", False),
         BoolVariable("separate_debug_symbols", "Create a separate file containing debugging symbols", False),
         BoolVariable("execinfo", "Use libexecinfo on systems where glibc is not available", False),
     ]
@@ -104,11 +104,23 @@ def configure(env):
         env.extra_suffix += "s"
 
         if env["use_ubsan"]:
-            env.Append(CCFLAGS=["-fsanitize=undefined"])
+            env.Append(
+                CCFLAGS=[
+                    "-fsanitize=undefined,shift,shift-exponent,integer-divide-by-zero,unreachable,vla-bound,null,return,signed-integer-overflow,bounds,float-divide-by-zero,float-cast-overflow,nonnull-attribute,returns-nonnull-attribute,bool,enum,vptr,pointer-overflow,builtin"
+                ]
+            )
             env.Append(LINKFLAGS=["-fsanitize=undefined"])
+            if env["use_llvm"]:
+                env.Append(
+                    CCFLAGS=[
+                        "-fsanitize=nullability-return,nullability-arg,function,nullability-assign,implicit-integer-sign-change"
+                    ]
+                )
+            else:
+                env.Append(CCFLAGS=["-fsanitize=bounds-strict"])
 
         if env["use_asan"]:
-            env.Append(CCFLAGS=["-fsanitize=address"])
+            env.Append(CCFLAGS=["-fsanitize=address,pointer-subtract,pointer-compare"])
             env.Append(LINKFLAGS=["-fsanitize=address"])
 
         if env["use_lsan"]:
@@ -119,8 +131,10 @@ def configure(env):
             env.Append(CCFLAGS=["-fsanitize=thread"])
             env.Append(LINKFLAGS=["-fsanitize=thread"])
 
-        if env["use_msan"]:
+        if env["use_msan"] and env["use_llvm"]:
             env.Append(CCFLAGS=["-fsanitize=memory"])
+            env.Append(CCFLAGS=["-fsanitize-memory-track-origins"])
+            env.Append(CCFLAGS=["-fsanitize-recover=memory"])
             env.Append(LINKFLAGS=["-fsanitize=memory"])
 
     if env["use_lto"]:
diff --git a/platform/uwp/export/export.cpp b/platform/uwp/export/export.cpp
index 2a0bc78440..217c119978 100644
--- a/platform/uwp/export/export.cpp
+++ b/platform/uwp/export/export.cpp
@@ -1177,6 +1177,8 @@ public:
 	}
 
 	virtual Error export_project(const Ref<EditorExportPreset> &p_preset, bool p_debug, const String &p_path, int p_flags = 0) override {
+		ExportNotifier notifier(*this, p_preset, p_debug, p_path, p_flags);
+
 		String src_appx;
 
 		EditorProgress ep("export", "Exporting for UWP", 7, true);
@@ -1334,7 +1336,7 @@ public:
 			int base = clf.size();
 			clf.resize(base + 4 + txt.length());
 			encode_uint32(txt.length(), &clf.write[base]);
-			copymem(&clf.write[base + 4], txt.ptr(), txt.length());
+			memcpy(&clf.write[base + 4], txt.ptr(), txt.length());
 			print_line(itos(i) + " param: " + cl[i]);
 		}
 
diff --git a/platform/windows/display_server_windows.cpp b/platform/windows/display_server_windows.cpp
index 86f20f1dd7..8789e6dfb4 100644
--- a/platform/windows/display_server_windows.cpp
+++ b/platform/windows/display_server_windows.cpp
@@ -332,7 +332,7 @@ static BOOL CALLBACK _MonitorEnumProcUsableSize(HMONITOR hMonitor, HDC hdcMonito
 	EnumRectData *data = (EnumRectData *)dwData;
 	if (data->count == data->screen) {
 		MONITORINFO minfo;
-		zeromem(&minfo, sizeof(MONITORINFO));
+		memset(&minfo, 0, sizeof(MONITORINFO));
 		minfo.cbSize = sizeof(MONITORINFO);
 		GetMonitorInfoA(hMonitor, &minfo);
 
@@ -1419,13 +1419,13 @@ void DisplayServerWindows::enable_for_stealing_focus(OS::ProcessID pid) {
 }
 
 int DisplayServerWindows::keyboard_get_layout_count() const {
-	return GetKeyboardLayoutList(0, NULL);
+	return GetKeyboardLayoutList(0, nullptr);
 }
 
 int DisplayServerWindows::keyboard_get_current_layout() const {
 	HKL cur_layout = GetKeyboardLayout(0);
 
-	int layout_count = GetKeyboardLayoutList(0, NULL);
+	int layout_count = GetKeyboardLayoutList(0, nullptr);
 	HKL *layouts = (HKL *)memalloc(layout_count * sizeof(HKL));
 	GetKeyboardLayoutList(layout_count, layouts);
 
@@ -1440,7 +1440,7 @@ int DisplayServerWindows::keyboard_get_current_layout() const {
 }
 
 void DisplayServerWindows::keyboard_set_current_layout(int p_index) {
-	int layout_count = GetKeyboardLayoutList(0, NULL);
+	int layout_count = GetKeyboardLayoutList(0, nullptr);
 
 	ERR_FAIL_INDEX(p_index, layout_count);
 
@@ -1451,7 +1451,7 @@ void DisplayServerWindows::keyboard_set_current_layout(int p_index) {
 }
 
 String DisplayServerWindows::keyboard_get_layout_language(int p_index) const {
-	int layout_count = GetKeyboardLayoutList(0, NULL);
+	int layout_count = GetKeyboardLayoutList(0, nullptr);
 
 	ERR_FAIL_INDEX_V(p_index, layout_count, "");
 
@@ -1481,7 +1481,7 @@ String _get_full_layout_name_from_registry(HKL p_layout) {
 
 	DWORD buffer = 1024;
 	DWORD vtype = REG_SZ;
-	if (RegQueryValueExW(hkey, L"Layout Text", NULL, &vtype, (LPBYTE)layout_text, &buffer) == ERROR_SUCCESS) {
+	if (RegQueryValueExW(hkey, L"Layout Text", nullptr, &vtype, (LPBYTE)layout_text, &buffer) == ERROR_SUCCESS) {
 		ret = String::utf16((const char16_t *)layout_text);
 	}
 	RegCloseKey(hkey);
@@ -1489,7 +1489,7 @@ String _get_full_layout_name_from_registry(HKL p_layout) {
 }
 
 String DisplayServerWindows::keyboard_get_layout_name(int p_index) const {
-	int layout_count = GetKeyboardLayoutList(0, NULL);
+	int layout_count = GetKeyboardLayoutList(0, nullptr);
 
 	ERR_FAIL_INDEX_V(p_index, layout_count, "");
 
diff --git a/scene/2d/animated_sprite_2d.cpp b/scene/2d/animated_sprite_2d.cpp
index 4aa079b013..9ee37670d1 100644
--- a/scene/2d/animated_sprite_2d.cpp
+++ b/scene/2d/animated_sprite_2d.cpp
@@ -272,7 +272,7 @@ void AnimatedSprite2D::set_sprite_frames(const Ref<SpriteFrames> &p_frames) {
 	notify_property_list_changed();
 	_reset_timeout();
 	update();
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 Ref<SpriteFrames> AnimatedSprite2D::get_sprite_frames() const {
@@ -440,17 +440,14 @@ StringName AnimatedSprite2D::get_animation() const {
 	return animation;
 }
 
-String AnimatedSprite2D::get_configuration_warning() const {
-	String warning = Node2D::get_configuration_warning();
+TypedArray<String> AnimatedSprite2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (frames.is_null()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("A SpriteFrames resource must be created or set in the \"Frames\" property in order for AnimatedSprite to display frames.");
+		warnings.push_back(TTR("A SpriteFrames resource must be created or set in the \"Frames\" property in order for AnimatedSprite to display frames."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void AnimatedSprite2D::_bind_methods() {
diff --git a/scene/2d/animated_sprite_2d.h b/scene/2d/animated_sprite_2d.h
index 14ecb18866..ef0027edf1 100644
--- a/scene/2d/animated_sprite_2d.h
+++ b/scene/2d/animated_sprite_2d.h
@@ -109,7 +109,7 @@ public:
 	void set_flip_v(bool p_flip);
 	bool is_flipped_v() const;
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 	AnimatedSprite2D();
 };
 
diff --git a/scene/2d/area_2d.cpp b/scene/2d/area_2d.cpp
index 49d1654e3f..9dfdd7bd0e 100644
--- a/scene/2d/area_2d.cpp
+++ b/scene/2d/area_2d.cpp
@@ -450,52 +450,6 @@ bool Area2D::overlaps_body(Node *p_body) const {
 	return E->get().in_tree;
 }
 
-void Area2D::set_collision_mask(uint32_t p_mask) {
-	collision_mask = p_mask;
-	PhysicsServer2D::get_singleton()->area_set_collision_mask(get_rid(), p_mask);
-}
-
-uint32_t Area2D::get_collision_mask() const {
-	return collision_mask;
-}
-
-void Area2D::set_collision_layer(uint32_t p_layer) {
-	collision_layer = p_layer;
-	PhysicsServer2D::get_singleton()->area_set_collision_layer(get_rid(), p_layer);
-}
-
-uint32_t Area2D::get_collision_layer() const {
-	return collision_layer;
-}
-
-void Area2D::set_collision_mask_bit(int p_bit, bool p_value) {
-	uint32_t mask = get_collision_mask();
-	if (p_value) {
-		mask |= 1 << p_bit;
-	} else {
-		mask &= ~(1 << p_bit);
-	}
-	set_collision_mask(mask);
-}
-
-bool Area2D::get_collision_mask_bit(int p_bit) const {
-	return get_collision_mask() & (1 << p_bit);
-}
-
-void Area2D::set_collision_layer_bit(int p_bit, bool p_value) {
-	uint32_t layer = get_collision_layer();
-	if (p_value) {
-		layer |= 1 << p_bit;
-	} else {
-		layer &= ~(1 << p_bit);
-	}
-	set_collision_layer(layer);
-}
-
-bool Area2D::get_collision_layer_bit(int p_bit) const {
-	return get_collision_layer() & (1 << p_bit);
-}
-
 void Area2D::set_audio_bus_override(bool p_override) {
 	audio_bus_override = p_override;
 }
@@ -557,18 +511,6 @@ void Area2D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_priority", "priority"), &Area2D::set_priority);
 	ClassDB::bind_method(D_METHOD("get_priority"), &Area2D::get_priority);
 
-	ClassDB::bind_method(D_METHOD("set_collision_mask", "collision_mask"), &Area2D::set_collision_mask);
-	ClassDB::bind_method(D_METHOD("get_collision_mask"), &Area2D::get_collision_mask);
-
-	ClassDB::bind_method(D_METHOD("set_collision_layer", "collision_layer"), &Area2D::set_collision_layer);
-	ClassDB::bind_method(D_METHOD("get_collision_layer"), &Area2D::get_collision_layer);
-
-	ClassDB::bind_method(D_METHOD("set_collision_mask_bit", "bit", "value"), &Area2D::set_collision_mask_bit);
-	ClassDB::bind_method(D_METHOD("get_collision_mask_bit", "bit"), &Area2D::get_collision_mask_bit);
-
-	ClassDB::bind_method(D_METHOD("set_collision_layer_bit", "bit", "value"), &Area2D::set_collision_layer_bit);
-	ClassDB::bind_method(D_METHOD("get_collision_layer_bit", "bit"), &Area2D::get_collision_layer_bit);
-
 	ClassDB::bind_method(D_METHOD("set_monitoring", "enable"), &Area2D::set_monitoring);
 	ClassDB::bind_method(D_METHOD("is_monitoring"), &Area2D::is_monitoring);
 
@@ -600,6 +542,11 @@ void Area2D::_bind_methods() {
 	ADD_SIGNAL(MethodInfo("area_entered", PropertyInfo(Variant::OBJECT, "area", PROPERTY_HINT_RESOURCE_TYPE, "Area2D")));
 	ADD_SIGNAL(MethodInfo("area_exited", PropertyInfo(Variant::OBJECT, "area", PROPERTY_HINT_RESOURCE_TYPE, "Area2D")));
 
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "monitoring"), "set_monitoring", "is_monitoring");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "monitorable"), "set_monitorable", "is_monitorable");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "priority", PROPERTY_HINT_RANGE, "0,128,1"), "set_priority", "get_priority");
+
+	ADD_GROUP("Physics Overrides", "");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "space_override", PROPERTY_HINT_ENUM, "Disabled,Combine,Combine-Replace,Replace,Replace-Combine"), "set_space_override_mode", "get_space_override_mode");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "gravity_point"), "set_gravity_is_point", "is_gravity_a_point");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "gravity_distance_scale", PROPERTY_HINT_EXP_RANGE, "0,1024,0.001,or_greater"), "set_gravity_distance_scale", "get_gravity_distance_scale");
@@ -607,12 +554,6 @@ void Area2D::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "gravity", PROPERTY_HINT_RANGE, "-1024,1024,0.001"), "set_gravity", "get_gravity");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "linear_damp", PROPERTY_HINT_RANGE, "0,100,0.001,or_greater"), "set_linear_damp", "get_linear_damp");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "angular_damp", PROPERTY_HINT_RANGE, "0,100,0.001,or_greater"), "set_angular_damp", "get_angular_damp");
-	ADD_PROPERTY(PropertyInfo(Variant::INT, "priority", PROPERTY_HINT_RANGE, "0,128,1"), "set_priority", "get_priority");
-	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "monitoring"), "set_monitoring", "is_monitoring");
-	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "monitorable"), "set_monitorable", "is_monitorable");
-	ADD_GROUP("Collision", "collision_");
-	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_layer", PROPERTY_HINT_LAYERS_2D_PHYSICS), "set_collision_layer", "get_collision_layer");
-	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_mask", PROPERTY_HINT_LAYERS_2D_PHYSICS), "set_collision_mask", "get_collision_mask");
 
 	ADD_GROUP("Audio Bus", "audio_bus_");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "audio_bus_override"), "set_audio_bus_override", "is_overriding_audio_bus");
diff --git a/scene/2d/area_2d.h b/scene/2d/area_2d.h
index 39b022fd2c..d6fcb2c2a5 100644
--- a/scene/2d/area_2d.h
+++ b/scene/2d/area_2d.h
@@ -54,8 +54,6 @@ private:
 	real_t gravity_distance_scale = 0.0;
 	real_t linear_damp = 0.1;
 	real_t angular_damp = 1.0;
-	uint32_t collision_mask = 1;
-	uint32_t collision_layer = 1;
 	int priority = 0;
 	bool monitoring = false;
 	bool monitorable = false;
@@ -163,18 +161,6 @@ public:
 	void set_monitorable(bool p_enable);
 	bool is_monitorable() const;
 
-	void set_collision_mask(uint32_t p_mask);
-	uint32_t get_collision_mask() const;
-
-	void set_collision_layer(uint32_t p_layer);
-	uint32_t get_collision_layer() const;
-
-	void set_collision_mask_bit(int p_bit, bool p_value);
-	bool get_collision_mask_bit(int p_bit) const;
-
-	void set_collision_layer_bit(int p_bit, bool p_value);
-	bool get_collision_layer_bit(int p_bit) const;
-
 	TypedArray<Node2D> get_overlapping_bodies() const; //function for script
 	TypedArray<Area2D> get_overlapping_areas() const; //function for script
 
diff --git a/scene/2d/canvas_modulate.cpp b/scene/2d/canvas_modulate.cpp
index 5d5aaae505..52eabefbcb 100644
--- a/scene/2d/canvas_modulate.cpp
+++ b/scene/2d/canvas_modulate.cpp
@@ -51,7 +51,7 @@ void CanvasModulate::_notification(int p_what) {
 			remove_from_group("_canvas_modulate_" + itos(get_canvas().get_id()));
 		}
 
-		update_configuration_warning();
+		update_configuration_warnings();
 	}
 }
 
@@ -73,24 +73,19 @@ Color CanvasModulate::get_color() const {
 	return color;
 }
 
-String CanvasModulate::get_configuration_warning() const {
-	if (!is_visible_in_tree() || !is_inside_tree()) {
-		return String();
-	}
-
-	String warning = Node2D::get_configuration_warning();
+TypedArray<String> CanvasModulate::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
-	List<Node *> nodes;
-	get_tree()->get_nodes_in_group("_canvas_modulate_" + itos(get_canvas().get_id()), &nodes);
+	if (is_visible_in_tree() && is_inside_tree()) {
+		List<Node *> nodes;
+		get_tree()->get_nodes_in_group("_canvas_modulate_" + itos(get_canvas().get_id()), &nodes);
 
-	if (nodes.size() > 1) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
+		if (nodes.size() > 1) {
+			warnings.push_back(TTR("Only one visible CanvasModulate is allowed per scene (or set of instanced scenes). The first created one will work, while the rest will be ignored."));
 		}
-		warning += TTR("Only one visible CanvasModulate is allowed per scene (or set of instanced scenes). The first created one will work, while the rest will be ignored.");
 	}
 
-	return warning;
+	return warnings;
 }
 
 CanvasModulate::CanvasModulate() {
diff --git a/scene/2d/canvas_modulate.h b/scene/2d/canvas_modulate.h
index 4d55a5d9cb..3d85a92a11 100644
--- a/scene/2d/canvas_modulate.h
+++ b/scene/2d/canvas_modulate.h
@@ -46,7 +46,7 @@ public:
 	void set_color(const Color &p_color);
 	Color get_color() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	CanvasModulate();
 	~CanvasModulate();
diff --git a/scene/2d/collision_object_2d.cpp b/scene/2d/collision_object_2d.cpp
index c83ed36917..de648d404c 100644
--- a/scene/2d/collision_object_2d.cpp
+++ b/scene/2d/collision_object_2d.cpp
@@ -100,6 +100,64 @@ void CollisionObject2D::_notification(int p_what) {
 	}
 }
 
+void CollisionObject2D::set_collision_layer(uint32_t p_layer) {
+	collision_layer = p_layer; // Cache the bitmask locally; get_collision_layer() returns this copy.
+	if (area) { // `area` picks the area_* server call; otherwise the body_* call is used for the same RID.
+		PhysicsServer2D::get_singleton()->area_set_collision_layer(get_rid(), p_layer);
+	} else {
+		PhysicsServer2D::get_singleton()->body_set_collision_layer(get_rid(), p_layer);
+	}
+}
+
+uint32_t CollisionObject2D::get_collision_layer() const {
+	return collision_layer; // Locally cached bitmask written by set_collision_layer(); no physics server query.
+}
+
+void CollisionObject2D::set_collision_mask(uint32_t p_mask) {
+	collision_mask = p_mask; // Cache the bitmask locally; get_collision_mask() returns this copy.
+	if (area) { // `area` picks the area_* server call; otherwise the body_* call is used for the same RID.
+		PhysicsServer2D::get_singleton()->area_set_collision_mask(get_rid(), p_mask);
+	} else {
+		PhysicsServer2D::get_singleton()->body_set_collision_mask(get_rid(), p_mask);
+	}
+}
+
+uint32_t CollisionObject2D::get_collision_mask() const {
+	return collision_mask; // Locally cached bitmask written by set_collision_mask(); no physics server query.
+}
+
+void CollisionObject2D::set_collision_layer_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision layer bit must be between 0 and 31 inclusive.");
+	uint32_t layer = get_collision_layer(); // Local working copy; deliberately not named like the `collision_layer` member.
+	if (p_value) {
+		layer |= 1U << p_bit; // Unsigned literal: setting bit 31 never shifts into a signed int's sign bit.
+	} else {
+		layer &= ~(1U << p_bit); // Clear only the requested bit, leaving the others untouched.
+	}
+	set_collision_layer(layer); // Route through the setter so the cached value and the physics server stay in sync.
+}
+
+bool CollisionObject2D::get_collision_layer_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision layer bit must be between 0 and 31 inclusive.");
+	return get_collision_layer() & (1U << p_bit); // Unsigned mask (safe for bit 31); non-zero result converts to true.
+}
+
+void CollisionObject2D::set_collision_mask_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision mask bit must be between 0 and 31 inclusive.");
+	uint32_t mask = get_collision_mask(); // Local working copy of the current mask.
+	if (p_value) {
+		mask |= 1U << p_bit; // Unsigned literal: setting bit 31 never shifts into a signed int's sign bit.
+	} else {
+		mask &= ~(1U << p_bit); // Clear only the requested bit, leaving the others untouched.
+	}
+	set_collision_mask(mask); // Route through the setter so the cached value and the physics server stay in sync.
+}
+
+bool CollisionObject2D::get_collision_mask_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision mask bit must be between 0 and 31 inclusive.");
+	return get_collision_mask() & (1U << p_bit); // Unsigned mask (safe for bit 31); non-zero result converts to true.
+}
+
 uint32_t CollisionObject2D::create_shape_owner(Object *p_owner) {
 	ShapeData sd;
 	uint32_t id;
@@ -363,22 +421,26 @@ void CollisionObject2D::_update_pickable() {
 	}
 }
 
-String CollisionObject2D::get_configuration_warning() const {
-	String warning = Node2D::get_configuration_warning();
+TypedArray<String> CollisionObject2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (shapes.is_empty()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("This node has no shape, so it can't collide or interact with other objects.\nConsider adding a CollisionShape2D or CollisionPolygon2D as a child to define its shape.");
+		warnings.push_back(TTR("This node has no shape, so it can't collide or interact with other objects.\nConsider adding a CollisionShape2D or CollisionPolygon2D as a child to define its shape."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void CollisionObject2D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_rid"), &CollisionObject2D::get_rid);
-
+	ClassDB::bind_method(D_METHOD("set_collision_layer", "layer"), &CollisionObject2D::set_collision_layer);
+	ClassDB::bind_method(D_METHOD("get_collision_layer"), &CollisionObject2D::get_collision_layer);
+	ClassDB::bind_method(D_METHOD("set_collision_mask", "mask"), &CollisionObject2D::set_collision_mask);
+	ClassDB::bind_method(D_METHOD("get_collision_mask"), &CollisionObject2D::get_collision_mask);
+	ClassDB::bind_method(D_METHOD("set_collision_layer_bit", "bit", "value"), &CollisionObject2D::set_collision_layer_bit);
+	ClassDB::bind_method(D_METHOD("get_collision_layer_bit", "bit"), &CollisionObject2D::get_collision_layer_bit);
+	ClassDB::bind_method(D_METHOD("set_collision_mask_bit", "bit", "value"), &CollisionObject2D::set_collision_mask_bit);
+	ClassDB::bind_method(D_METHOD("get_collision_mask_bit", "bit"), &CollisionObject2D::get_collision_mask_bit);
 	ClassDB::bind_method(D_METHOD("set_pickable", "enabled"), &CollisionObject2D::set_pickable);
 	ClassDB::bind_method(D_METHOD("is_pickable"), &CollisionObject2D::is_pickable);
 	ClassDB::bind_method(D_METHOD("create_shape_owner", "owner"), &CollisionObject2D::create_shape_owner);
@@ -407,9 +469,12 @@ void CollisionObject2D::_bind_methods() {
 	ADD_SIGNAL(MethodInfo("mouse_entered"));
 	ADD_SIGNAL(MethodInfo("mouse_exited"));
 
-	ADD_GROUP("Pickable", "input_");
+	ADD_GROUP("Collision", "collision_");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_layer", PROPERTY_HINT_LAYERS_2D_PHYSICS), "set_collision_layer", "get_collision_layer");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_mask", PROPERTY_HINT_LAYERS_2D_PHYSICS), "set_collision_mask", "get_collision_mask");
+
+	ADD_GROUP("Input", "input_");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "input_pickable"), "set_pickable", "is_pickable");
-	ADD_GROUP("", "");
 }
 
 CollisionObject2D::CollisionObject2D(RID p_rid, bool p_area) {
diff --git a/scene/2d/collision_object_2d.h b/scene/2d/collision_object_2d.h
index e82b61d441..bb1a9dfcf5 100644
--- a/scene/2d/collision_object_2d.h
+++ b/scene/2d/collision_object_2d.h
@@ -37,6 +37,9 @@
 class CollisionObject2D : public Node2D {
 	GDCLASS(CollisionObject2D, Node2D);
 
+	uint32_t collision_layer = 1;
+	uint32_t collision_mask = 1;
+
 	bool area = false;
 	RID rid;
 	bool pickable = false;
@@ -76,6 +79,18 @@ protected:
 	void set_only_update_transform_changes(bool p_enable);
 
 public:
+	void set_collision_layer(uint32_t p_layer);
+	uint32_t get_collision_layer() const;
+
+	void set_collision_mask(uint32_t p_mask);
+	uint32_t get_collision_mask() const;
+
+	void set_collision_layer_bit(int p_bit, bool p_value);
+	bool get_collision_layer_bit(int p_bit) const;
+
+	void set_collision_mask_bit(int p_bit, bool p_value);
+	bool get_collision_mask_bit(int p_bit) const;
+
 	uint32_t create_shape_owner(Object *p_owner);
 	void remove_shape_owner(uint32_t owner);
 	void get_shape_owners(List<uint32_t> *r_owners);
@@ -107,7 +122,7 @@ public:
 	void set_pickable(bool p_enabled);
 	bool is_pickable() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	_FORCE_INLINE_ RID get_rid() const { return rid; }
 
diff --git a/scene/2d/collision_polygon_2d.cpp b/scene/2d/collision_polygon_2d.cpp
index 38198c496e..a69ef73a54 100644
--- a/scene/2d/collision_polygon_2d.cpp
+++ b/scene/2d/collision_polygon_2d.cpp
@@ -204,7 +204,7 @@ void CollisionPolygon2D::set_polygon(const Vector<Point2> &p_polygon) {
 		_update_in_shape_owner();
 	}
 	update();
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 Vector<Point2> CollisionPolygon2D::get_polygon() const {
@@ -219,7 +219,7 @@ void CollisionPolygon2D::set_build_mode(BuildMode p_mode) {
 		_update_in_shape_owner();
 	}
 	update();
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 CollisionPolygon2D::BuildMode CollisionPolygon2D::get_build_mode() const {
@@ -240,40 +240,28 @@ bool CollisionPolygon2D::_edit_is_selected_on_click(const Point2 &p_point, doubl
 }
 #endif
 
-String CollisionPolygon2D::get_configuration_warning() const {
-	String warning = Node2D::get_configuration_warning();
+TypedArray<String> CollisionPolygon2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (!Object::cast_to<CollisionObject2D>(get_parent())) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("CollisionPolygon2D only serves to provide a collision shape to a CollisionObject2D derived node. Please only use it as a child of Area2D, StaticBody2D, RigidBody2D, KinematicBody2D, etc. to give them a shape.");
+		warnings.push_back(TTR("CollisionPolygon2D only serves to provide a collision shape to a CollisionObject2D derived node. Please only use it as a child of Area2D, StaticBody2D, RigidBody2D, KinematicBody2D, etc. to give them a shape."));
 	}
 
 	int polygon_count = polygon.size();
 	if (polygon_count == 0) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("An empty CollisionPolygon2D has no effect on collision.");
+		warnings.push_back(TTR("An empty CollisionPolygon2D has no effect on collision."));
 	} else {
 		bool solids = build_mode == BUILD_SOLIDS;
 		if (solids) {
 			if (polygon_count < 3) {
-				if (!warning.is_empty()) {
-					warning += "\n\n";
-				}
-				warning += TTR("Invalid polygon. At least 3 points are needed in 'Solids' build mode.");
+				warnings.push_back(TTR("Invalid polygon. At least 3 points are needed in 'Solids' build mode."));
 			}
 		} else if (polygon_count < 2) {
-			if (!warning.is_empty()) {
-				warning += "\n\n";
-			}
-			warning += TTR("Invalid polygon. At least 2 points are needed in 'Segments' build mode.");
+			warnings.push_back(TTR("Invalid polygon. At least 2 points are needed in 'Segments' build mode."));
 		}
 	}
 
-	return warning;
+	return warnings;
 }
 
 void CollisionPolygon2D::set_disabled(bool p_disabled) {
diff --git a/scene/2d/collision_polygon_2d.h b/scene/2d/collision_polygon_2d.h
index 9df9802629..95dd8c9e21 100644
--- a/scene/2d/collision_polygon_2d.h
+++ b/scene/2d/collision_polygon_2d.h
@@ -78,7 +78,7 @@ public:
 	void set_polygon(const Vector<Point2> &p_polygon);
 	Vector<Point2> get_polygon() const;
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	void set_disabled(bool p_disabled);
 	bool is_disabled() const;
diff --git a/scene/2d/collision_shape_2d.cpp b/scene/2d/collision_shape_2d.cpp
index 93949f741b..d9009ef85c 100644
--- a/scene/2d/collision_shape_2d.cpp
+++ b/scene/2d/collision_shape_2d.cpp
@@ -162,7 +162,7 @@ void CollisionShape2D::set_shape(const Ref<Shape2D> &p_shape) {
 		shape->connect("changed", callable_mp(this, &CollisionShape2D::_shape_changed));
 	}
 
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 Ref<Shape2D> CollisionShape2D::get_shape() const {
@@ -177,19 +177,23 @@ bool CollisionShape2D::_edit_is_selected_on_click(const Point2 &p_point, double
 	return shape->_edit_is_selected_on_click(p_point, p_tolerance);
 }
 
-String CollisionShape2D::get_configuration_warning() const {
+TypedArray<String> CollisionShape2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
+
 	if (!Object::cast_to<CollisionObject2D>(get_parent())) {
-		return TTR("CollisionShape2D only serves to provide a collision shape to a CollisionObject2D derived node. Please only use it as a child of Area2D, StaticBody2D, RigidBody2D, KinematicBody2D, etc. to give them a shape.");
+		warnings.push_back(TTR("CollisionShape2D only serves to provide a collision shape to a CollisionObject2D derived node. Please only use it as a child of Area2D, StaticBody2D, RigidBody2D, KinematicBody2D, etc. to give them a shape."));
 	}
 	if (!shape.is_valid()) {
-		return TTR("A shape must be provided for CollisionShape2D to function. Please create a shape resource for it!");
+		warnings.push_back(TTR("A shape must be provided for CollisionShape2D to function. Please create a shape resource for it!"));
 	}
+
 	Ref<ConvexPolygonShape2D> convex = shape;
 	Ref<ConcavePolygonShape2D> concave = shape;
 	if (convex.is_valid() || concave.is_valid()) {
-		return TTR("Polygon-based shapes are not meant be used nor edited directly through the CollisionShape2D node. Please use the CollisionPolygon2D node instead.");
+		warnings.push_back(TTR("Polygon-based shapes are not meant be used nor edited directly through the CollisionShape2D node. Please use the CollisionPolygon2D node instead."));
 	}
-	return String();
+
+	return warnings;
 }
 
 void CollisionShape2D::set_disabled(bool p_disabled) {
diff --git a/scene/2d/collision_shape_2d.h b/scene/2d/collision_shape_2d.h
index 695d0c6657..eaf72627c8 100644
--- a/scene/2d/collision_shape_2d.h
+++ b/scene/2d/collision_shape_2d.h
@@ -72,7 +72,7 @@ public:
 	void set_one_way_collision_margin(real_t p_margin);
 	real_t get_one_way_collision_margin() const;
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	CollisionShape2D();
 };
diff --git a/scene/2d/cpu_particles_2d.cpp b/scene/2d/cpu_particles_2d.cpp
index 6a69a4c618..1578643d14 100644
--- a/scene/2d/cpu_particles_2d.cpp
+++ b/scene/2d/cpu_particles_2d.cpp
@@ -244,18 +244,15 @@ bool CPUParticles2D::get_fractional_delta() const {
 	return fractional_delta;
 }
 
-String CPUParticles2D::get_configuration_warning() const {
-	String warnings = Node2D::get_configuration_warning();
+TypedArray<String> CPUParticles2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	CanvasItemMaterial *mat = Object::cast_to<CanvasItemMaterial>(get_material().ptr());
 
 	if (get_material().is_null() || (mat && !mat->get_particles_animation())) {
 		if (get_param(PARAM_ANIM_SPEED) != 0.0 || get_param(PARAM_ANIM_OFFSET) != 0.0 ||
 				get_param_curve(PARAM_ANIM_SPEED).is_valid() || get_param_curve(PARAM_ANIM_OFFSET).is_valid()) {
-			if (warnings != String()) {
-				warnings += "\n";
-			}
-			warnings += "- " + TTR("CPUParticles2D animation requires the usage of a CanvasItemMaterial with \"Particles Animation\" enabled.");
+			warnings.push_back(TTR("CPUParticles2D animation requires the usage of a CanvasItemMaterial with \"Particles Animation\" enabled."));
 		}
 	}
 
@@ -979,7 +976,7 @@ void CPUParticles2D::_update_particle_data_buffer() {
 			ptr[7] = t.elements[2][1];
 
 		} else {
-			zeromem(ptr, sizeof(float) * 8);
+			memset(ptr, 0, sizeof(float) * 8);
 		}
 
 		Color c = r[idx].color;
@@ -1083,7 +1080,7 @@ void CPUParticles2D::_notification(int p_what) {
 						ptr[7] = t.elements[2][1];
 
 					} else {
-						zeromem(ptr, sizeof(float) * 8);
+						memset(ptr, 0, sizeof(float) * 8);
 					}
 
 					ptr += 16;
diff --git a/scene/2d/cpu_particles_2d.h b/scene/2d/cpu_particles_2d.h
index ab04ee4a57..ba34a0f45d 100644
--- a/scene/2d/cpu_particles_2d.h
+++ b/scene/2d/cpu_particles_2d.h
@@ -275,7 +275,7 @@ public:
 	void set_gravity(const Vector2 &p_gravity);
 	Vector2 get_gravity() const;
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	void restart();
 
diff --git a/scene/2d/gpu_particles_2d.cpp b/scene/2d/gpu_particles_2d.cpp
index af70c47f7c..8a0631a614 100644
--- a/scene/2d/gpu_particles_2d.cpp
+++ b/scene/2d/gpu_particles_2d.cpp
@@ -137,7 +137,7 @@ void GPUParticles2D::set_process_material(const Ref<Material> &p_material) {
 	}
 	RS::get_singleton()->particles_set_process_material(particles, material_rid);
 
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 void GPUParticles2D::set_speed_scale(float p_scale) {
@@ -216,18 +216,15 @@ bool GPUParticles2D::get_fractional_delta() const {
 	return fractional_delta;
 }
 
-String GPUParticles2D::get_configuration_warning() const {
+TypedArray<String> GPUParticles2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
+
 	if (RenderingServer::get_singleton()->is_low_end()) {
-		return TTR("GPU-based particles are not supported by the GLES2 video driver.\nUse the CPUParticles2D node instead. You can use the \"Convert to CPUParticles2D\" option for this purpose.");
+		warnings.push_back(TTR("GPU-based particles are not supported by the GLES2 video driver.\nUse the CPUParticles2D node instead. You can use the \"Convert to CPUParticles2D\" option for this purpose."));
 	}
 
-	String warnings = Node2D::get_configuration_warning();
-
 	if (process_material.is_null()) {
-		if (warnings != String()) {
-			warnings += "\n";
-		}
-		warnings += "- " + TTR("A material to process the particles is not assigned, so no behavior is imprinted.");
+		warnings.push_back(TTR("A material to process the particles is not assigned, so no behavior is imprinted."));
 	} else {
 		CanvasItemMaterial *mat = Object::cast_to<CanvasItemMaterial>(get_material().ptr());
 
@@ -236,10 +233,7 @@ String GPUParticles2D::get_configuration_warning() const {
 			if (process &&
 					(process->get_param(ParticlesMaterial::PARAM_ANIM_SPEED) != 0.0 || process->get_param(ParticlesMaterial::PARAM_ANIM_OFFSET) != 0.0 ||
 							process->get_param_texture(ParticlesMaterial::PARAM_ANIM_SPEED).is_valid() || process->get_param_texture(ParticlesMaterial::PARAM_ANIM_OFFSET).is_valid())) {
-				if (warnings != String()) {
-					warnings += "\n";
-				}
-				warnings += "- " + TTR("Particles2D animation requires the usage of a CanvasItemMaterial with \"Particles Animation\" enabled.");
+				warnings.push_back(TTR("Particles2D animation requires the usage of a CanvasItemMaterial with \"Particles Animation\" enabled."));
 			}
 		}
 	}
diff --git a/scene/2d/gpu_particles_2d.h b/scene/2d/gpu_particles_2d.h
index 774cef9cc9..20f9f768ed 100644
--- a/scene/2d/gpu_particles_2d.h
+++ b/scene/2d/gpu_particles_2d.h
@@ -110,7 +110,7 @@ public:
 	void set_texture(const Ref<Texture2D> &p_texture);
 	Ref<Texture2D> get_texture() const;
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	void restart();
 	Rect2 capture_rect() const;
diff --git a/scene/2d/joints_2d.cpp b/scene/2d/joints_2d.cpp
index 7d9cdd52ac..8a4ccc2f96 100644
--- a/scene/2d/joints_2d.cpp
+++ b/scene/2d/joints_2d.cpp
@@ -66,6 +66,7 @@ void Joint2D::_update_joint(bool p_only_free) {
 	if (p_only_free || !is_inside_tree()) {
 		PhysicsServer2D::get_singleton()->joint_clear(joint);
 		warning = String();
+		update_configuration_warnings();
 		return;
 	}
 
@@ -76,43 +77,26 @@ void Joint2D::_update_joint(bool p_only_free) {
 	PhysicsBody2D *body_b = Object::cast_to<PhysicsBody2D>(node_b);
 
 	if (node_a && !body_a && node_b && !body_b) {
-		PhysicsServer2D::get_singleton()->joint_clear(joint);
 		warning = TTR("Node A and Node B must be PhysicsBody2Ds");
-		update_configuration_warning();
-		return;
-	}
-
-	if (node_a && !body_a) {
-		PhysicsServer2D::get_singleton()->joint_clear(joint);
+	} else if (node_a && !body_a) {
 		warning = TTR("Node A must be a PhysicsBody2D");
-		update_configuration_warning();
-		return;
-	}
-
-	if (node_b && !body_b) {
-		PhysicsServer2D::get_singleton()->joint_clear(joint);
+	} else if (node_b && !body_b) {
 		warning = TTR("Node B must be a PhysicsBody2D");
-		update_configuration_warning();
-		return;
-	}
-
-	if (!body_a || !body_b) {
-		PhysicsServer2D::get_singleton()->joint_clear(joint);
+	} else if (!body_a || !body_b) {
 		warning = TTR("Joint is not connected to two PhysicsBody2Ds");
-		update_configuration_warning();
-		return;
+	} else if (body_a == body_b) {
+		warning = TTR("Node A and Node B must be different PhysicsBody2Ds");
+	} else {
+		warning = String();
 	}
 
-	if (body_a == body_b) {
+	update_configuration_warnings();
+
+	if (!warning.is_empty()) {
 		PhysicsServer2D::get_singleton()->joint_clear(joint);
-		warning = TTR("Node A and Node B must be different PhysicsBody2Ds");
-		update_configuration_warning();
 		return;
 	}
 
-	warning = String();
-	update_configuration_warning();
-
 	if (body_a) {
 		body_a->force_update_transform();
 	}
@@ -211,17 +195,14 @@ bool Joint2D::get_exclude_nodes_from_collision() const {
 	return exclude_from_collision;
 }
 
-String Joint2D::get_configuration_warning() const {
-	String node_warning = Node2D::get_configuration_warning();
+TypedArray<String> Joint2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node2D::get_configuration_warnings();
 
 	if (!warning.is_empty()) {
-		if (!node_warning.is_empty()) {
-			node_warning += "\n\n";
-		}
-		node_warning += warning;
+		warnings.push_back(warning);
 	}
 
-	return node_warning;
+	return warnings;
 }
 
 void Joint2D::_bind_methods() {
diff --git a/scene/2d/joints_2d.h b/scene/2d/joints_2d.h
index 08e02ee29d..dc5a08f815 100644
--- a/scene/2d/joints_2d.h
+++ b/scene/2d/joints_2d.h
@@ -62,7 +62,7 @@ protected:
 	_FORCE_INLINE_ bool is_configured() const { return configured; }
 
 public:
-	virtual String get_configuration_warning() const override;
+	virtual TypedArray<String> get_configuration_warnings() const override;
 
 	void set_node_a(const NodePath &p_node_a);
 	NodePath get_node_a() const;
diff --git a/scene/2d/light_2d.cpp b/scene/2d/light_2d.cpp
index 99e35cad1d..8fb765f16b 100644
--- a/scene/2d/light_2d.cpp
+++ b/scene/2d/light_2d.cpp
@@ -373,7 +373,7 @@ void PointLight2D::set_texture(const Ref<Texture2D> &p_texture) {
 		RS::get_singleton()->canvas_light_set_texture(_get_light(), RID());
 	}
 
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 Ref<Texture2D> PointLight2D::get_texture() const {
@@ -390,17 +390,14 @@ Vector2 PointLight2D::get_texture_offset() const {
 	return texture_offset;
 }
 
-String PointLight2D::get_configuration_warning() const {
-	String warning = Node2D::get_configuration_warning();
+TypedArray<String> PointLight2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (!texture.is_valid()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("A texture with the shape of the light must be supplied to the \"Texture\" property.");
+		warnings.push_back(TTR("A texture with the shape of the light must be supplied to the \"Texture\" property."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void PointLight2D::set_texture_scale(real_t p_scale) {
diff --git a/scene/2d/light_2d.h b/scene/2d/light_2d.h
index ae6cf6d0a0..d9ecd81f1c 100644
--- a/scene/2d/light_2d.h
+++ b/scene/2d/light_2d.h
@@ -169,7 +169,7 @@ public:
 	void set_texture_scale(real_t p_scale);
 	real_t get_texture_scale() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	PointLight2D();
 };
diff --git a/scene/2d/light_occluder_2d.cpp b/scene/2d/light_occluder_2d.cpp
index 9589702e2e..fdc28f81c2 100644
--- a/scene/2d/light_occluder_2d.cpp
+++ b/scene/2d/light_occluder_2d.cpp
@@ -242,24 +242,18 @@ int LightOccluder2D::get_occluder_light_mask() const {
 	return mask;
 }
 
-String LightOccluder2D::get_configuration_warning() const {
-	String warning = Node2D::get_configuration_warning();
+TypedArray<String> LightOccluder2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (!occluder_polygon.is_valid()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("An occluder polygon must be set (or drawn) for this occluder to take effect.");
+		warnings.push_back(TTR("An occluder polygon must be set (or drawn) for this occluder to take effect."));
 	}
 
 	if (occluder_polygon.is_valid() && occluder_polygon->get_polygon().size() == 0) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("The occluder polygon for this occluder is empty. Please draw a polygon.");
+		warnings.push_back(TTR("The occluder polygon for this occluder is empty. Please draw a polygon."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void LightOccluder2D::set_as_sdf_collision(bool p_enable) {
diff --git a/scene/2d/light_occluder_2d.h b/scene/2d/light_occluder_2d.h
index f567c6d965..b4a48d1062 100644
--- a/scene/2d/light_occluder_2d.h
+++ b/scene/2d/light_occluder_2d.h
@@ -106,7 +106,7 @@ public:
 	void set_as_sdf_collision(bool p_enable);
 	bool is_set_as_sdf_collision() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	LightOccluder2D();
 	~LightOccluder2D();
diff --git a/scene/2d/navigation_agent_2d.cpp b/scene/2d/navigation_agent_2d.cpp
index 064fcc91a4..f9cbdbf377 100644
--- a/scene/2d/navigation_agent_2d.cpp
+++ b/scene/2d/navigation_agent_2d.cpp
@@ -35,6 +35,8 @@
 #include "servers/navigation_server_2d.h"
 
 void NavigationAgent2D::_bind_methods() {
+	ClassDB::bind_method(D_METHOD("get_rid"), &NavigationAgent2D::get_rid);
+
 	ClassDB::bind_method(D_METHOD("set_target_desired_distance", "desired_distance"), &NavigationAgent2D::set_target_desired_distance);
 	ClassDB::bind_method(D_METHOD("get_target_desired_distance"), &NavigationAgent2D::get_target_desired_distance);
 
@@ -88,9 +90,11 @@ void NavigationAgent2D::_notification(int p_what) {
 	switch (p_what) {
 		case NOTIFICATION_READY: {
 			agent_parent = Object::cast_to<Node2D>(get_parent());
-
-			NavigationServer2D::get_singleton()->agent_set_callback(agent, this, "_avoidance_done");
-
+			if (agent_parent != nullptr) {
+				// Place the agent on the navigation map first, or else the RVO agent callback creation fails silently later.
+				NavigationServer2D::get_singleton()->agent_set_map(get_rid(), agent_parent->get_world_2d()->get_navigation_map());
+				NavigationServer2D::get_singleton()->agent_set_callback(agent, this, "_avoidance_done");
+			}
 			set_physics_process_internal(true);
 		} break;
 		case NOTIFICATION_EXIT_TREE: {
@@ -100,12 +104,7 @@ void NavigationAgent2D::_notification(int p_what) {
 		case NOTIFICATION_INTERNAL_PHYSICS_PROCESS: {
 			if (agent_parent) {
 				NavigationServer2D::get_singleton()->agent_set_position(agent, agent_parent->get_global_transform().get_origin());
-				if (!target_reached) {
-					if (distance_to_target() < target_desired_distance) {
-						emit_signal("target_reached");
-						target_reached = true;
-					}
-				}
+				_check_distance_to_target();
 			}
 		} break;
 	}
@@ -239,17 +238,14 @@ void NavigationAgent2D::_avoidance_done(Vector3 p_new_velocity) {
 	emit_signal("velocity_computed", velocity);
 }
 
-String NavigationAgent2D::get_configuration_warning() const {
-	String warning = Node::get_configuration_warning();
+TypedArray<String> NavigationAgent2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (!Object::cast_to<Node2D>(get_parent())) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("The NavigationAgent2D can be used only under a Node2D node");
+		warnings.push_back(TTR("The NavigationAgent2D can be used only under a Node2D node"));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void NavigationAgent2D::update_navigation() {
@@ -304,6 +300,7 @@ void NavigationAgent2D::update_navigation() {
 		while (o.distance_to(navigation_path[nav_path_index]) < target_desired_distance) {
 			nav_path_index += 1;
 			if (nav_path_index == navigation_path.size()) {
+				_check_distance_to_target();
 				nav_path_index -= 1;
 				navigation_finished = true;
 				emit_signal("navigation_finished");
@@ -312,3 +309,17 @@ void NavigationAgent2D::update_navigation() {
 		}
 	}
 }
+
+// Emits "target_reached" the first time distance_to_target() is found to be
+// below target_desired_distance. Once the target_reached flag is set, further
+// calls return without doing anything.
+void NavigationAgent2D::_check_distance_to_target() {
+	if (target_reached) {
+		return;
+	}
+
+	if (distance_to_target() < target_desired_distance) {
+		emit_signal("target_reached");
+		target_reached = true;
+	}
+}
diff --git a/scene/2d/navigation_agent_2d.h b/scene/2d/navigation_agent_2d.h
index 153ede8cec..234cad333f 100644
--- a/scene/2d/navigation_agent_2d.h
+++ b/scene/2d/navigation_agent_2d.h
@@ -136,10 +136,11 @@ public:
 	void set_velocity(Vector2 p_velocity);
 	void _avoidance_done(Vector3 p_new_velocity);
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 private:
 	void update_navigation();
+	void _check_distance_to_target();
 };
 
 #endif
diff --git a/scene/2d/navigation_obstacle_2d.cpp b/scene/2d/navigation_obstacle_2d.cpp
index 965e2b6dc1..a06f7a9fd0 100644
--- a/scene/2d/navigation_obstacle_2d.cpp
+++ b/scene/2d/navigation_obstacle_2d.cpp
@@ -69,17 +69,14 @@ NavigationObstacle2D::~NavigationObstacle2D() {
 	agent = RID(); // Pointless
 }
 
-String NavigationObstacle2D::get_configuration_warning() const {
-	String warning = Node::get_configuration_warning();
+TypedArray<String> NavigationObstacle2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (!Object::cast_to<Node2D>(get_parent())) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("The NavigationObstacle2D only serves to provide collision avoidance to a Node2D object.");
+		warnings.push_back(TTR("The NavigationObstacle2D only serves to provide collision avoidance to a Node2D object."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void NavigationObstacle2D::update_agent_shape() {
diff --git a/scene/2d/navigation_obstacle_2d.h b/scene/2d/navigation_obstacle_2d.h
index 135ca4651e..9cffc2c0c3 100644
--- a/scene/2d/navigation_obstacle_2d.h
+++ b/scene/2d/navigation_obstacle_2d.h
@@ -52,7 +52,7 @@ public:
 		return agent;
 	}
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 private:
 	void update_agent_shape();
diff --git a/scene/2d/navigation_region_2d.cpp b/scene/2d/navigation_region_2d.cpp
index 8be8c8db4a..d2caf5bea8 100644
--- a/scene/2d/navigation_region_2d.cpp
+++ b/scene/2d/navigation_region_2d.cpp
@@ -491,7 +491,7 @@ void NavigationRegion2D::set_navigation_polygon(const Ref<NavigationPolygon> &p_
 	}
 	_navpoly_changed();
 
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 Ref<NavigationPolygon> NavigationRegion2D::get_navigation_polygon() const {
@@ -509,21 +509,19 @@ void NavigationRegion2D::_map_changed(RID p_map) {
 	}
 }
 
-String NavigationRegion2D::get_configuration_warning() const {
-	if (!is_visible_in_tree() || !is_inside_tree()) {
-		return String();
-	}
-
-	String warning = Node2D::get_configuration_warning();
+// Collects the configuration warnings for this region: everything reported by
+// Node2D, plus a missing-NavigationPolygon warning that is only added while the
+// node is inside the tree and visible.
+TypedArray<String> NavigationRegion2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node2D::get_configuration_warnings();
 
-	if (!navpoly.is_valid()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
+	if (is_visible_in_tree() && is_inside_tree()) {
+		if (!navpoly.is_valid()) {
+			warnings.push_back(TTR("A NavigationPolygon resource must be set or created for this node to work. Please set a property or draw a polygon."));
 		}
-		warning += TTR("A NavigationPolygon resource must be set or created for this node to work. Please set a property or draw a polygon.");
 	}
 
-	return warning;
+	return warnings;
 }
 
 void NavigationRegion2D::_bind_methods() {
diff --git a/scene/2d/navigation_region_2d.h b/scene/2d/navigation_region_2d.h
index 58f04599be..2db8d70791 100644
--- a/scene/2d/navigation_region_2d.h
+++ b/scene/2d/navigation_region_2d.h
@@ -120,7 +120,7 @@ public:
 	void set_navigation_polygon(const Ref<NavigationPolygon> &p_navpoly);
 	Ref<NavigationPolygon> get_navigation_polygon() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	NavigationRegion2D();
 	~NavigationRegion2D();
diff --git a/scene/2d/parallax_layer.cpp b/scene/2d/parallax_layer.cpp
index 725e858a43..228020d383 100644
--- a/scene/2d/parallax_layer.cpp
+++ b/scene/2d/parallax_layer.cpp
@@ -135,17 +135,14 @@ void ParallaxLayer::set_base_offset_and_scale(const Point2 &p_offset, real_t p_s
 	_update_mirroring();
 }
 
-String ParallaxLayer::get_configuration_warning() const {
-	String warning = Node2D::get_configuration_warning();
+TypedArray<String> ParallaxLayer::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (!Object::cast_to<ParallaxBackground>(get_parent())) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("ParallaxLayer node only works when set as child of a ParallaxBackground node.");
+		warnings.push_back(TTR("ParallaxLayer node only works when set as child of a ParallaxBackground node."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void ParallaxLayer::_bind_methods() {
diff --git a/scene/2d/parallax_layer.h b/scene/2d/parallax_layer.h
index e826e6da9c..cc2d2e096e 100644
--- a/scene/2d/parallax_layer.h
+++ b/scene/2d/parallax_layer.h
@@ -61,7 +61,7 @@ public:
 
 	void set_base_offset_and_scale(const Point2 &p_offset, real_t p_scale, const Point2 &p_screen_offset);
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 	ParallaxLayer();
 };
 
diff --git a/scene/2d/path_2d.cpp b/scene/2d/path_2d.cpp
index be160ee1dd..9912612c4f 100644
--- a/scene/2d/path_2d.cpp
+++ b/scene/2d/path_2d.cpp
@@ -249,21 +249,16 @@ void PathFollow2D::_validate_property(PropertyInfo &property) const {
 	}
 }
 
-String PathFollow2D::get_configuration_warning() const {
-	if (!is_visible_in_tree() || !is_inside_tree()) {
-		return String();
-	}
-
-	String warning = Node2D::get_configuration_warning();
+TypedArray<String> PathFollow2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
-	if (!Object::cast_to<Path2D>(get_parent())) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
+	if (is_visible_in_tree() && is_inside_tree()) {
+		if (!Object::cast_to<Path2D>(get_parent())) {
+			warnings.push_back(TTR("PathFollow2D only works when set as a child of a Path2D node."));
 		}
-		warning += TTR("PathFollow2D only works when set as a child of a Path2D node.");
 	}
 
-	return warning;
+	return warnings;
 }
 
 void PathFollow2D::_bind_methods() {
diff --git a/scene/2d/path_2d.h b/scene/2d/path_2d.h
index 671ab493c3..3b12f025fc 100644
--- a/scene/2d/path_2d.h
+++ b/scene/2d/path_2d.h
@@ -105,7 +105,7 @@ public:
 	void set_cubic_interpolation(bool p_enable);
 	bool get_cubic_interpolation() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	PathFollow2D() {}
 };
diff --git a/scene/2d/physics_body_2d.cpp b/scene/2d/physics_body_2d.cpp
index a615d96687..3f2f6d6b1c 100644
--- a/scene/2d/physics_body_2d.cpp
+++ b/scene/2d/physics_body_2d.cpp
@@ -42,70 +42,9 @@ void PhysicsBody2D::_notification(int p_what) {
 }
 
 void PhysicsBody2D::_bind_methods() {
-	ClassDB::bind_method(D_METHOD("set_collision_layer", "layer"), &PhysicsBody2D::set_collision_layer);
-	ClassDB::bind_method(D_METHOD("get_collision_layer"), &PhysicsBody2D::get_collision_layer);
-	ClassDB::bind_method(D_METHOD("set_collision_mask", "mask"), &PhysicsBody2D::set_collision_mask);
-	ClassDB::bind_method(D_METHOD("get_collision_mask"), &PhysicsBody2D::get_collision_mask);
-
-	ClassDB::bind_method(D_METHOD("set_collision_mask_bit", "bit", "value"), &PhysicsBody2D::set_collision_mask_bit);
-	ClassDB::bind_method(D_METHOD("get_collision_mask_bit", "bit"), &PhysicsBody2D::get_collision_mask_bit);
-
-	ClassDB::bind_method(D_METHOD("set_collision_layer_bit", "bit", "value"), &PhysicsBody2D::set_collision_layer_bit);
-	ClassDB::bind_method(D_METHOD("get_collision_layer_bit", "bit"), &PhysicsBody2D::get_collision_layer_bit);
-
 	ClassDB::bind_method(D_METHOD("get_collision_exceptions"), &PhysicsBody2D::get_collision_exceptions);
 	ClassDB::bind_method(D_METHOD("add_collision_exception_with", "body"), &PhysicsBody2D::add_collision_exception_with);
 	ClassDB::bind_method(D_METHOD("remove_collision_exception_with", "body"), &PhysicsBody2D::remove_collision_exception_with);
-
-	ADD_GROUP("Collision", "collision_");
-	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_layer", PROPERTY_HINT_LAYERS_2D_PHYSICS), "set_collision_layer", "get_collision_layer");
-	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_mask", PROPERTY_HINT_LAYERS_2D_PHYSICS), "set_collision_mask", "get_collision_mask");
-}
-
-void PhysicsBody2D::set_collision_layer(uint32_t p_layer) {
-	collision_layer = p_layer;
-	PhysicsServer2D::get_singleton()->body_set_collision_layer(get_rid(), p_layer);
-}
-
-uint32_t PhysicsBody2D::get_collision_layer() const {
-	return collision_layer;
-}
-
-void PhysicsBody2D::set_collision_mask(uint32_t p_mask) {
-	collision_mask = p_mask;
-	PhysicsServer2D::get_singleton()->body_set_collision_mask(get_rid(), p_mask);
-}
-
-uint32_t PhysicsBody2D::get_collision_mask() const {
-	return collision_mask;
-}
-
-void PhysicsBody2D::set_collision_mask_bit(int p_bit, bool p_value) {
-	uint32_t mask = get_collision_mask();
-	if (p_value) {
-		mask |= 1 << p_bit;
-	} else {
-		mask &= ~(1 << p_bit);
-	}
-	set_collision_mask(mask);
-}
-
-bool PhysicsBody2D::get_collision_mask_bit(int p_bit) const {
-	return get_collision_mask() & (1 << p_bit);
-}
-
-void PhysicsBody2D::set_collision_layer_bit(int p_bit, bool p_value) {
-	uint32_t collision_layer = get_collision_layer();
-	if (p_value) {
-		collision_layer |= 1 << p_bit;
-	} else {
-		collision_layer &= ~(1 << p_bit);
-	}
-	set_collision_layer(collision_layer);
-}
-
-bool PhysicsBody2D::get_collision_layer_bit(int p_bit) const {
-	return get_collision_layer() & (1 << p_bit);
 }
 
 PhysicsBody2D::PhysicsBody2D(PhysicsServer2D::BodyMode p_mode) :
@@ -332,6 +271,7 @@ bool RigidBody2D::_test_motion(const Vector2 &p_motion, bool p_infinite_inertia,
 void RigidBody2D::_direct_state_changed(Object *p_state) {
 #ifdef DEBUG_ENABLED
 	state = Object::cast_to<PhysicsDirectBodyState2D>(p_state);
+	ERR_FAIL_NULL_MSG(state, "Method '_direct_state_changed' must receive a valid PhysicsDirectBodyState2D object as argument");
 #else
 	state = (PhysicsDirectBodyState2D *)p_state; //trust it
 #endif
@@ -708,26 +648,23 @@ void RigidBody2D::_notification(int p_what) {
 
 	if (p_what == NOTIFICATION_LOCAL_TRANSFORM_CHANGED) {
 		if (Engine::get_singleton()->is_editor_hint()) {
-			update_configuration_warning();
+			update_configuration_warnings();
 		}
 	}
 
 #endif
 }
 
-String RigidBody2D::get_configuration_warning() const {
+TypedArray<String> RigidBody2D::get_configuration_warnings() const {
 	Transform2D t = get_transform();
 
-	String warning = CollisionObject2D::get_configuration_warning();
+	TypedArray<String> warnings = CollisionObject2D::get_configuration_warnings();
 
 	if ((get_mode() == MODE_RIGID || get_mode() == MODE_CHARACTER) && (ABS(t.elements[0].length() - 1.0) > 0.05 || ABS(t.elements[1].length() - 1.0) > 0.05)) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("Size changes to RigidBody2D (in character or rigid modes) will be overridden by the physics engine when running.\nChange the size in children collision shapes instead.");
+		warnings.push_back(TTR("Size changes to RigidBody2D (in character or rigid modes) will be overridden by the physics engine when running.\nChange the size in children collision shapes instead."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void RigidBody2D::_bind_methods() {
@@ -793,8 +730,6 @@ void RigidBody2D::_bind_methods() {
 
 	ClassDB::bind_method(D_METHOD("test_motion", "motion", "infinite_inertia", "margin", "result"), &RigidBody2D::_test_motion, DEFVAL(true), DEFVAL(0.08), DEFVAL(Variant()));
 
-	ClassDB::bind_method(D_METHOD("_direct_state_changed"), &RigidBody2D::_direct_state_changed);
-
 	ClassDB::bind_method(D_METHOD("get_colliding_bodies"), &RigidBody2D::get_colliding_bodies);
 
 	BIND_VMETHOD(MethodInfo("_integrate_forces", PropertyInfo(Variant::OBJECT, "state", PROPERTY_HINT_RESOURCE_TYPE, "PhysicsDirectBodyState2D")));
@@ -838,7 +773,7 @@ void RigidBody2D::_bind_methods() {
 
 RigidBody2D::RigidBody2D() :
 		PhysicsBody2D(PhysicsServer2D::BODY_MODE_RIGID) {
-	PhysicsServer2D::get_singleton()->body_set_force_integration_callback(get_rid(), this, "_direct_state_changed");
+	PhysicsServer2D::get_singleton()->body_set_force_integration_callback(get_rid(), callable_mp(this, &RigidBody2D::_direct_state_changed));
 }
 
 RigidBody2D::~RigidBody2D() {
@@ -1144,11 +1079,11 @@ void KinematicBody2D::set_sync_to_physics(bool p_enable) {
 	}
 
 	if (p_enable) {
-		PhysicsServer2D::get_singleton()->body_set_force_integration_callback(get_rid(), this, "_direct_state_changed");
+		PhysicsServer2D::get_singleton()->body_set_force_integration_callback(get_rid(), callable_mp(this, &KinematicBody2D::_direct_state_changed));
 		set_only_update_transform_changes(true);
 		set_notify_local_transform(true);
 	} else {
-		PhysicsServer2D::get_singleton()->body_set_force_integration_callback(get_rid(), nullptr, "");
+		PhysicsServer2D::get_singleton()->body_set_force_integration_callback(get_rid(), Callable());
 		set_only_update_transform_changes(false);
 		set_notify_local_transform(false);
 	}
@@ -1164,6 +1099,7 @@ void KinematicBody2D::_direct_state_changed(Object *p_state) {
 	}
 
 	PhysicsDirectBodyState2D *state = Object::cast_to<PhysicsDirectBodyState2D>(p_state);
+	ERR_FAIL_NULL_MSG(state, "Method '_direct_state_changed' must receive a valid PhysicsDirectBodyState2D object as argument");
 
 	last_valid_transform = state->get_transform();
 	set_notify_local_transform(false);
@@ -1217,8 +1153,6 @@ void KinematicBody2D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_sync_to_physics", "enable"), &KinematicBody2D::set_sync_to_physics);
 	ClassDB::bind_method(D_METHOD("is_sync_to_physics_enabled"), &KinematicBody2D::is_sync_to_physics_enabled);
 
-	ClassDB::bind_method(D_METHOD("_direct_state_changed"), &KinematicBody2D::_direct_state_changed);
-
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "collision/safe_margin", PROPERTY_HINT_RANGE, "0.001,256,0.001"), "set_safe_margin", "get_safe_margin");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "motion/sync_to_physics"), "set_sync_to_physics", "is_sync_to_physics_enabled");
 }
diff --git a/scene/2d/physics_body_2d.h b/scene/2d/physics_body_2d.h
index 2dc853b23b..e0fc0766bc 100644
--- a/scene/2d/physics_body_2d.h
+++ b/scene/2d/physics_body_2d.h
@@ -41,9 +41,6 @@ class KinematicCollision2D;
 class PhysicsBody2D : public CollisionObject2D {
 	GDCLASS(PhysicsBody2D, CollisionObject2D);
 
-	uint32_t collision_layer = 1;
-	uint32_t collision_mask = 1;
-
 protected:
 	void _notification(int p_what);
 	PhysicsBody2D(PhysicsServer2D::BodyMode p_mode);
@@ -51,18 +48,6 @@ protected:
 	static void _bind_methods();
 
 public:
-	void set_collision_layer(uint32_t p_layer);
-	uint32_t get_collision_layer() const;
-
-	void set_collision_mask(uint32_t p_mask);
-	uint32_t get_collision_mask() const;
-
-	void set_collision_mask_bit(int p_bit, bool p_value);
-	bool get_collision_mask_bit(int p_bit) const;
-
-	void set_collision_layer_bit(int p_bit, bool p_value);
-	bool get_collision_layer_bit(int p_bit) const;
-
 	TypedArray<PhysicsBody2D> get_collision_exceptions();
 	void add_collision_exception_with(Node *p_node); //must be physicsbody
 	void remove_collision_exception_with(Node *p_node);
@@ -246,7 +231,7 @@ public:
 
 	TypedArray<Node2D> get_colliding_bodies() const; //function for script
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	RigidBody2D();
 	~RigidBody2D();
diff --git a/scene/2d/ray_cast_2d.cpp b/scene/2d/ray_cast_2d.cpp
index 50625a0f39..f6740040c1 100644
--- a/scene/2d/ray_cast_2d.cpp
+++ b/scene/2d/ray_cast_2d.cpp
@@ -55,6 +55,7 @@ uint32_t RayCast2D::get_collision_mask() const {
 }
 
 void RayCast2D::set_collision_mask_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision mask bit must be between 0 and 31 inclusive.");
 	uint32_t mask = get_collision_mask();
 	if (p_value) {
 		mask |= 1 << p_bit;
@@ -65,6 +66,7 @@ void RayCast2D::set_collision_mask_bit(int p_bit, bool p_value) {
 }
 
 bool RayCast2D::get_collision_mask_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision mask bit must be between 0 and 31 inclusive.");
 	return get_collision_mask() & (1 << p_bit);
 }
 
diff --git a/scene/2d/remote_transform_2d.cpp b/scene/2d/remote_transform_2d.cpp
index f10714e28a..a7613dc009 100644
--- a/scene/2d/remote_transform_2d.cpp
+++ b/scene/2d/remote_transform_2d.cpp
@@ -138,7 +138,7 @@ void RemoteTransform2D::set_remote_node(const NodePath &p_remote_node) {
 		_update_remote();
 	}
 
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 NodePath RemoteTransform2D::get_remote_node() const {
@@ -185,17 +185,14 @@ void RemoteTransform2D::force_update_cache() {
 	_update_cache();
 }
 
-String RemoteTransform2D::get_configuration_warning() const {
-	String warning = Node2D::get_configuration_warning();
+TypedArray<String> RemoteTransform2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (!has_node(remote_node) || !Object::cast_to<Node2D>(get_node(remote_node))) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("Path property must point to a valid Node2D node to work.");
+		warnings.push_back(TTR("Path property must point to a valid Node2D node to work."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void RemoteTransform2D::_bind_methods() {
diff --git a/scene/2d/remote_transform_2d.h b/scene/2d/remote_transform_2d.h
index 4a26d7b339..36fddb58c7 100644
--- a/scene/2d/remote_transform_2d.h
+++ b/scene/2d/remote_transform_2d.h
@@ -70,7 +70,7 @@ public:
 
 	void force_update_cache();
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	RemoteTransform2D();
 };
diff --git a/scene/2d/skeleton_2d.cpp b/scene/2d/skeleton_2d.cpp
index 2d19d254b1..22180797f0 100644
--- a/scene/2d/skeleton_2d.cpp
+++ b/scene/2d/skeleton_2d.cpp
@@ -100,7 +100,7 @@ void Bone2D::set_rest(const Transform2D &p_rest) {
 		skeleton->_make_bone_setup_dirty();
 	}
 
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 Transform2D Bone2D::get_rest() const {
@@ -133,27 +133,21 @@ int Bone2D::get_index_in_skeleton() const {
 	return skeleton_index;
 }
 
-String Bone2D::get_configuration_warning() const {
-	String warning = Node2D::get_configuration_warning();
+TypedArray<String> Bone2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 	if (!skeleton) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
 		if (parent_bone) {
-			warning += TTR("This Bone2D chain should end at a Skeleton2D node.");
+			warnings.push_back(TTR("This Bone2D chain should end at a Skeleton2D node."));
 		} else {
-			warning += TTR("A Bone2D only works with a Skeleton2D or another Bone2D as parent node.");
+			warnings.push_back(TTR("A Bone2D only works with a Skeleton2D or another Bone2D as parent node."));
 		}
 	}
 
 	if (rest == Transform2D(0, 0, 0, 0, 0, 0)) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("This bone lacks a proper REST pose. Go to the Skeleton2D node and set one.");
+		warnings.push_back(TTR("This bone lacks a proper REST pose. Go to the Skeleton2D node and set one."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 Bone2D::Bone2D() {
diff --git a/scene/2d/skeleton_2d.h b/scene/2d/skeleton_2d.h
index 1f43ea742b..fd62b87bde 100644
--- a/scene/2d/skeleton_2d.h
+++ b/scene/2d/skeleton_2d.h
@@ -60,7 +60,7 @@ public:
 	void apply_rest();
 	Transform2D get_skeleton_rest() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	void set_default_length(real_t p_length);
 	real_t get_default_length() const;
diff --git a/scene/2d/tile_map.cpp b/scene/2d/tile_map.cpp
index 81a5b0b28c..4565543ec3 100644
--- a/scene/2d/tile_map.cpp
+++ b/scene/2d/tile_map.cpp
@@ -59,7 +59,7 @@ void TileMap::_notification(int p_what) {
 			RID space = get_world_2d()->get_space();
 			_update_quadrant_transform();
 			_update_quadrant_space(space);
-			update_configuration_warning();
+			update_configuration_warnings();
 
 		} break;
 
@@ -1250,6 +1250,7 @@ void TileMap::set_collision_mask(uint32_t p_mask) {
 }
 
 void TileMap::set_collision_layer_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision layer bit must be between 0 and 31 inclusive.");
 	uint32_t layer = get_collision_layer();
 	if (p_value) {
 		layer |= 1 << p_bit;
@@ -1260,6 +1261,7 @@ void TileMap::set_collision_layer_bit(int p_bit, bool p_value) {
 }
 
 void TileMap::set_collision_mask_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision mask bit must be between 0 and 31 inclusive.");
 	uint32_t mask = get_collision_mask();
 	if (p_value) {
 		mask |= 1 << p_bit;
@@ -1301,7 +1303,7 @@ void TileMap::set_collision_use_parent(bool p_use_parent) {
 
 	_recreate_quadrants();
 	notify_property_list_changed();
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 void TileMap::set_collision_friction(float p_friction) {
@@ -1352,10 +1354,12 @@ uint32_t TileMap::get_collision_mask() const {
 }
 
 bool TileMap::get_collision_layer_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision layer bit must be between 0 and 31 inclusive.");
 	return get_collision_layer() & (1 << p_bit);
 }
 
 bool TileMap::get_collision_mask_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision mask bit must be between 0 and 31 inclusive.");
 	return get_collision_mask() & (1 << p_bit);
 }
 
@@ -1693,17 +1697,14 @@ void TileMap::set_texture_repeat(CanvasItem::TextureRepeat p_texture_repeat) {
 	}
 }
 
-String TileMap::get_configuration_warning() const {
-	String warning = Node2D::get_configuration_warning();
+TypedArray<String> TileMap::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node2D::get_configuration_warnings(); // Chain to the direct base (as the replaced code did) so warnings from intermediate classes are kept.
 
 	if (use_parent && !collision_parent) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		return TTR("TileMap with Use Parent on needs a parent CollisionObject2D to give shapes to. Please use it as a child of Area2D, StaticBody2D, RigidBody2D, KinematicBody2D, etc. to give them a shape.");
+		warnings.push_back(TTR("TileMap with Use Parent on needs a parent CollisionObject2D to give shapes to. Please use it as a child of Area2D, StaticBody2D, RigidBody2D, KinematicBody2D, etc. to give them a shape."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void TileMap::_bind_methods() {
diff --git a/scene/2d/tile_map.h b/scene/2d/tile_map.h
index 26c84a0bb9..9d27053fee 100644
--- a/scene/2d/tile_map.h
+++ b/scene/2d/tile_map.h
@@ -340,7 +340,7 @@ public:
 	void set_clip_uv(bool p_enable);
 	bool get_clip_uv() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	virtual void set_texture_filter(CanvasItem::TextureFilter p_texture_filter) override;
 
diff --git a/scene/2d/visibility_notifier_2d.cpp b/scene/2d/visibility_notifier_2d.cpp
index 916038a1f3..8feb47f1cc 100644
--- a/scene/2d/visibility_notifier_2d.cpp
+++ b/scene/2d/visibility_notifier_2d.cpp
@@ -310,18 +310,15 @@ void VisibilityEnabler2D::_node_removed(Node *p_node) {
 	nodes.erase(p_node);
 }
 
-String VisibilityEnabler2D::get_configuration_warning() const {
-	String warning = VisibilityNotifier2D::get_configuration_warning();
+TypedArray<String> VisibilityEnabler2D::get_configuration_warnings() const {
+	TypedArray<String> warnings = VisibilityNotifier2D::get_configuration_warnings(); // Chain to the direct base (as the replaced code did) so warnings from intermediate classes are kept.
 
 #ifdef TOOLS_ENABLED
 	if (is_inside_tree() && get_parent() && (get_parent()->get_filename() == String() && get_parent() != get_tree()->get_edited_scene_root())) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("VisibilityEnabler2D works best when used with the edited scene root directly as parent.");
+		warnings.push_back(TTR("VisibilityEnabler2D works best when used with the edited scene root directly as parent."));
 	}
 #endif
-	return warning;
+	return warnings;
 }
 
 void VisibilityEnabler2D::_bind_methods() {
diff --git a/scene/2d/visibility_notifier_2d.h b/scene/2d/visibility_notifier_2d.h
index 3d1701a1e5..7f4a5bc193 100644
--- a/scene/2d/visibility_notifier_2d.h
+++ b/scene/2d/visibility_notifier_2d.h
@@ -102,7 +102,7 @@ public:
 	void set_enabler(Enabler p_enabler, bool p_enable);
 	bool is_enabler_enabled(Enabler p_enabler) const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	VisibilityEnabler2D();
 };
diff --git a/scene/3d/area_3d.cpp b/scene/3d/area_3d.cpp
index 749cf4ff9d..e187e06308 100644
--- a/scene/3d/area_3d.cpp
+++ b/scene/3d/area_3d.cpp
@@ -451,52 +451,6 @@ bool Area3D::overlaps_body(Node *p_body) const {
 	return E->get().in_tree;
 }
 
-void Area3D::set_collision_mask(uint32_t p_mask) {
-	collision_mask = p_mask;
-	PhysicsServer3D::get_singleton()->area_set_collision_mask(get_rid(), p_mask);
-}
-
-uint32_t Area3D::get_collision_mask() const {
-	return collision_mask;
-}
-
-void Area3D::set_collision_layer(uint32_t p_layer) {
-	collision_layer = p_layer;
-	PhysicsServer3D::get_singleton()->area_set_collision_layer(get_rid(), p_layer);
-}
-
-uint32_t Area3D::get_collision_layer() const {
-	return collision_layer;
-}
-
-void Area3D::set_collision_mask_bit(int p_bit, bool p_value) {
-	uint32_t mask = get_collision_mask();
-	if (p_value) {
-		mask |= 1 << p_bit;
-	} else {
-		mask &= ~(1 << p_bit);
-	}
-	set_collision_mask(mask);
-}
-
-bool Area3D::get_collision_mask_bit(int p_bit) const {
-	return get_collision_mask() & (1 << p_bit);
-}
-
-void Area3D::set_collision_layer_bit(int p_bit, bool p_value) {
-	uint32_t layer = get_collision_layer();
-	if (p_value) {
-		layer |= 1 << p_bit;
-	} else {
-		layer &= ~(1 << p_bit);
-	}
-	set_collision_layer(layer);
-}
-
-bool Area3D::get_collision_layer_bit(int p_bit) const {
-	return get_collision_layer() & (1 << p_bit);
-}
-
 void Area3D::set_audio_bus_override(bool p_override) {
 	audio_bus_override = p_override;
 }
@@ -595,18 +549,6 @@ void Area3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_priority", "priority"), &Area3D::set_priority);
 	ClassDB::bind_method(D_METHOD("get_priority"), &Area3D::get_priority);
 
-	ClassDB::bind_method(D_METHOD("set_collision_mask", "collision_mask"), &Area3D::set_collision_mask);
-	ClassDB::bind_method(D_METHOD("get_collision_mask"), &Area3D::get_collision_mask);
-
-	ClassDB::bind_method(D_METHOD("set_collision_layer", "collision_layer"), &Area3D::set_collision_layer);
-	ClassDB::bind_method(D_METHOD("get_collision_layer"), &Area3D::get_collision_layer);
-
-	ClassDB::bind_method(D_METHOD("set_collision_mask_bit", "bit", "value"), &Area3D::set_collision_mask_bit);
-	ClassDB::bind_method(D_METHOD("get_collision_mask_bit", "bit"), &Area3D::get_collision_mask_bit);
-
-	ClassDB::bind_method(D_METHOD("set_collision_layer_bit", "bit", "value"), &Area3D::set_collision_layer_bit);
-	ClassDB::bind_method(D_METHOD("get_collision_layer_bit", "bit"), &Area3D::get_collision_layer_bit);
-
 	ClassDB::bind_method(D_METHOD("set_monitorable", "enable"), &Area3D::set_monitorable);
 	ClassDB::bind_method(D_METHOD("is_monitorable"), &Area3D::is_monitorable);
 
@@ -650,6 +592,11 @@ void Area3D::_bind_methods() {
 	ADD_SIGNAL(MethodInfo("area_entered", PropertyInfo(Variant::OBJECT, "area", PROPERTY_HINT_RESOURCE_TYPE, "Area3D")));
 	ADD_SIGNAL(MethodInfo("area_exited", PropertyInfo(Variant::OBJECT, "area", PROPERTY_HINT_RESOURCE_TYPE, "Area3D")));
 
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "monitoring"), "set_monitoring", "is_monitoring");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "monitorable"), "set_monitorable", "is_monitorable");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "priority", PROPERTY_HINT_RANGE, "0,128,1"), "set_priority", "get_priority");
+
+	ADD_GROUP("Physics Overrides", "");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "space_override", PROPERTY_HINT_ENUM, "Disabled,Combine,Combine-Replace,Replace,Replace-Combine"), "set_space_override_mode", "get_space_override_mode");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "gravity_point"), "set_gravity_is_point", "is_gravity_a_point");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "gravity_distance_scale", PROPERTY_HINT_EXP_RANGE, "0,1024,0.001,or_greater"), "set_gravity_distance_scale", "get_gravity_distance_scale");
@@ -657,15 +604,11 @@ void Area3D::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "gravity", PROPERTY_HINT_RANGE, "-1024,1024,0.01"), "set_gravity", "get_gravity");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "linear_damp", PROPERTY_HINT_RANGE, "0,100,0.001,or_greater"), "set_linear_damp", "get_linear_damp");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "angular_damp", PROPERTY_HINT_RANGE, "0,100,0.001,or_greater"), "set_angular_damp", "get_angular_damp");
-	ADD_PROPERTY(PropertyInfo(Variant::INT, "priority", PROPERTY_HINT_RANGE, "0,128,1"), "set_priority", "get_priority");
-	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "monitoring"), "set_monitoring", "is_monitoring");
-	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "monitorable"), "set_monitorable", "is_monitorable");
-	ADD_GROUP("Collision", "collision_");
-	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_layer", PROPERTY_HINT_LAYERS_3D_PHYSICS), "set_collision_layer", "get_collision_layer");
-	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_mask", PROPERTY_HINT_LAYERS_3D_PHYSICS), "set_collision_mask", "get_collision_mask");
+
 	ADD_GROUP("Audio Bus", "audio_bus_");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "audio_bus_override"), "set_audio_bus_override", "is_overriding_audio_bus");
 	ADD_PROPERTY(PropertyInfo(Variant::STRING_NAME, "audio_bus_name", PROPERTY_HINT_ENUM, ""), "set_audio_bus_name", "get_audio_bus_name");
+
 	ADD_GROUP("Reverb Bus", "reverb_bus_");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "reverb_bus_enable"), "set_use_reverb_bus", "is_using_reverb_bus");
 	ADD_PROPERTY(PropertyInfo(Variant::STRING_NAME, "reverb_bus_name", PROPERTY_HINT_ENUM, ""), "set_reverb_bus", "get_reverb_bus");
diff --git a/scene/3d/area_3d.h b/scene/3d/area_3d.h
index 6d976115f7..9605a937af 100644
--- a/scene/3d/area_3d.h
+++ b/scene/3d/area_3d.h
@@ -54,8 +54,6 @@ private:
 	real_t gravity_distance_scale = 0.0;
 	real_t angular_damp = 0.1;
 	real_t linear_damp = 0.1;
-	uint32_t collision_mask = 1;
-	uint32_t collision_layer = 1;
 	int priority = 0;
 	bool monitoring = false;
 	bool monitorable = false;
@@ -169,18 +167,6 @@ public:
 	void set_monitorable(bool p_enable);
 	bool is_monitorable() const;
 
-	void set_collision_mask(uint32_t p_mask);
-	uint32_t get_collision_mask() const;
-
-	void set_collision_layer(uint32_t p_layer);
-	uint32_t get_collision_layer() const;
-
-	void set_collision_mask_bit(int p_bit, bool p_value);
-	bool get_collision_mask_bit(int p_bit) const;
-
-	void set_collision_layer_bit(int p_bit, bool p_value);
-	bool get_collision_layer_bit(int p_bit) const;
-
 	TypedArray<Node3D> get_overlapping_bodies() const;
 	TypedArray<Area3D> get_overlapping_areas() const; //function for script
 
diff --git a/scene/3d/baked_lightmap.cpp b/scene/3d/baked_lightmap.cpp
index 95ffbe48c1..ef648a126e 100644
--- a/scene/3d/baked_lightmap.cpp
+++ b/scene/3d/baked_lightmap.cpp
@@ -259,7 +259,7 @@ void BakedLightmap::_find_meshes_and_lights(Node *p_at_node, Vector<MeshesFound>
 					if (all_override.is_valid()) {
 						mf.overrides.push_back(all_override);
 					} else {
-						mf.overrides.push_back(mi->get_surface_material(i));
+						mf.overrides.push_back(mi->get_surface_override_material(i));
 					}
 				}
 
@@ -619,10 +619,6 @@ void BakedLightmap::_gen_new_positions_from_octree(const GenProbesOctree *p_cell
 }
 
 BakedLightmap::BakeError BakedLightmap::bake(Node *p_from_node, String p_image_data_path, Lightmapper::BakeStepFunc p_bake_step, void *p_bake_userdata) {
-	if (p_image_data_path == "" && (get_light_data().is_null() || !get_light_data()->get_path().is_resource_file())) {
-		return BAKE_ERROR_NO_SAVE_PATH;
-	}
-
 	if (p_image_data_path == "") {
 		if (get_light_data().is_null()) {
 			return BAKE_ERROR_NO_SAVE_PATH;
diff --git a/scene/3d/camera_3d.cpp b/scene/3d/camera_3d.cpp
index cd8d02233b..041da4f6ff 100644
--- a/scene/3d/camera_3d.cpp
+++ b/scene/3d/camera_3d.cpp
@@ -761,6 +761,7 @@ uint32_t ClippedCamera3D::get_collision_mask() const {
 }
 
 void ClippedCamera3D::set_collision_mask_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision mask bit must be between 0 and 31 inclusive."); // Reject out-of-range bits before shifting; message names the mask, not the layer.
 	uint32_t mask = get_collision_mask();
 	if (p_value) {
 		mask |= 1 << p_bit;
@@ -771,6 +772,7 @@ void ClippedCamera3D::set_collision_mask_bit(int p_bit, bool p_value) {
 }
 
 bool ClippedCamera3D::get_collision_mask_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision mask bit must be between 0 and 31 inclusive.");
 	return get_collision_mask() & (1 << p_bit);
 }
 
diff --git a/scene/3d/collision_object_3d.cpp b/scene/3d/collision_object_3d.cpp
index 39880db29c..688509a979 100644
--- a/scene/3d/collision_object_3d.cpp
+++ b/scene/3d/collision_object_3d.cpp
@@ -75,9 +75,72 @@ void CollisionObject3D::_notification(int p_what) {
 			}
 
 		} break;
+		case NOTIFICATION_PREDELETE: {
+			if (debug_shape_count > 0) {
+				_clear_debug_shapes();
+			}
+		} break;
+	}
+}
+
+void CollisionObject3D::set_collision_layer(uint32_t p_layer) {
+	collision_layer = p_layer; // Kept locally; get_collision_layer() returns this member.
+	if (area) { // Areas and bodies are updated through different PhysicsServer3D calls.
+		PhysicsServer3D::get_singleton()->area_set_collision_layer(get_rid(), p_layer);
+	} else {
+		PhysicsServer3D::get_singleton()->body_set_collision_layer(get_rid(), p_layer);
 	}
 }
 
+uint32_t CollisionObject3D::get_collision_layer() const {
+	return collision_layer; // Value stored by set_collision_layer(); the physics server is not queried.
+}
+
+void CollisionObject3D::set_collision_mask(uint32_t p_mask) {
+	collision_mask = p_mask; // Kept locally; get_collision_mask() returns this member.
+	if (area) { // Areas and bodies are updated through different PhysicsServer3D calls.
+		PhysicsServer3D::get_singleton()->area_set_collision_mask(get_rid(), p_mask);
+	} else {
+		PhysicsServer3D::get_singleton()->body_set_collision_mask(get_rid(), p_mask);
+	}
+}
+
+uint32_t CollisionObject3D::get_collision_mask() const {
+	return collision_mask; // Value stored by set_collision_mask(); the physics server is not queried.
+}
+
+void CollisionObject3D::set_collision_layer_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision layer bit must be between 0 and 31 inclusive."); // Bail out before shifting by an out-of-range amount.
+	uint32_t layer = get_collision_layer(); // Working copy; not named `collision_layer` so it does not shadow the member.
+	if (p_value) {
+		layer |= 1 << p_bit;
+	} else {
+		layer &= ~(1 << p_bit);
+	}
+	set_collision_layer(layer); // Go through the setter so the physics server is updated as well.
+}
+
+bool CollisionObject3D::get_collision_layer_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision layer bit must be between 0 and 31 inclusive."); // `false` is the value handed to the macro for the failure case.
+	return get_collision_layer() & (1 << p_bit); // Non-zero masked value converts to true.
+}
+
+void CollisionObject3D::set_collision_mask_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision mask bit must be between 0 and 31 inclusive."); // Bail out before shifting by an out-of-range amount.
+	uint32_t mask = get_collision_mask();
+	if (p_value) {
+		mask |= 1 << p_bit;
+	} else {
+		mask &= ~(1 << p_bit);
+	}
+	set_collision_mask(mask); // Go through the setter so the physics server is updated as well.
+}
+
+bool CollisionObject3D::get_collision_mask_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision mask bit must be between 0 and 31 inclusive."); // `false` is the value handed to the macro for the failure case.
+	return get_collision_mask() & (1 << p_bit); // Non-zero masked value converts to true.
+}
+
 void CollisionObject3D::_input_event(Node *p_camera, const Ref<InputEvent> &p_input_event, const Vector3 &p_pos, const Vector3 &p_normal, int p_shape) {
 	if (get_script_instance()) {
 		get_script_instance()->call(SceneStringNames::get_singleton()->_input_event, p_camera, p_input_event, p_pos, p_normal, p_shape);
@@ -116,11 +179,13 @@ void CollisionObject3D::_update_debug_shapes() {
 	for (Set<uint32_t>::Element *shapedata_idx = debug_shapes_to_update.front(); shapedata_idx; shapedata_idx = shapedata_idx->next()) {
 		if (shapes.has(shapedata_idx->get())) {
 			ShapeData &shapedata = shapes[shapedata_idx->get()];
+			ShapeData::ShapeBase *shapes = shapedata.shapes.ptrw();
 			for (int i = 0; i < shapedata.shapes.size(); i++) {
-				ShapeData::ShapeBase &s = shapedata.shapes.write[i];
+				ShapeData::ShapeBase &s = shapes[i];
 				if (s.debug_shape) {
 					s.debug_shape->queue_delete();
 					s.debug_shape = nullptr;
+					--debug_shape_count;
 				}
 				if (s.shape.is_null() || shapedata.disabled) {
 					continue;
@@ -133,12 +198,30 @@ void CollisionObject3D::_update_debug_shapes() {
 				add_child(mi);
 				mi->force_update_transform();
 				s.debug_shape = mi;
+				++debug_shape_count;
 			}
 		}
 	}
 	debug_shapes_to_update.clear();
 }
 
+void CollisionObject3D::_clear_debug_shapes() {
+	for (Map<uint32_t, ShapeData>::Element *E = shapes.front(); E; E = E->next()) {
+		ShapeData &shapedata = E->get();
+		ShapeData::ShapeBase *shape_bases = shapedata.shapes.ptrw(); // Not named `shapes`: that would shadow the member map being iterated.
+		for (int i = 0; i < shapedata.shapes.size(); i++) {
+			ShapeData::ShapeBase &s = shape_bases[i];
+			if (s.debug_shape) {
+				s.debug_shape->queue_delete();
+				s.debug_shape = nullptr;
+				--debug_shape_count;
+			}
+		}
+	}
+	// Every debug shape node has been queued for deletion; force the counter to zero even if it had drifted.
+	debug_shape_count = 0;
+}
+
 void CollisionObject3D::_update_shape_data(uint32_t p_owner) {
 	if (is_inside_tree() && get_tree()->is_debugging_collisions_hint() && !Engine::get_singleton()->is_editor_hint()) {
 		if (debug_shapes_to_update.is_empty()) {
@@ -158,6 +241,14 @@ bool CollisionObject3D::is_ray_pickable() const {
 }
 
 void CollisionObject3D::_bind_methods() {
+	ClassDB::bind_method(D_METHOD("set_collision_layer", "layer"), &CollisionObject3D::set_collision_layer);
+	ClassDB::bind_method(D_METHOD("get_collision_layer"), &CollisionObject3D::get_collision_layer);
+	ClassDB::bind_method(D_METHOD("set_collision_mask", "mask"), &CollisionObject3D::set_collision_mask);
+	ClassDB::bind_method(D_METHOD("get_collision_mask"), &CollisionObject3D::get_collision_mask);
+	ClassDB::bind_method(D_METHOD("set_collision_layer_bit", "bit", "value"), &CollisionObject3D::set_collision_layer_bit);
+	ClassDB::bind_method(D_METHOD("get_collision_layer_bit", "bit"), &CollisionObject3D::get_collision_layer_bit);
+	ClassDB::bind_method(D_METHOD("set_collision_mask_bit", "bit", "value"), &CollisionObject3D::set_collision_mask_bit);
+	ClassDB::bind_method(D_METHOD("get_collision_mask_bit", "bit"), &CollisionObject3D::get_collision_mask_bit);
 	ClassDB::bind_method(D_METHOD("set_ray_pickable", "ray_pickable"), &CollisionObject3D::set_ray_pickable);
 	ClassDB::bind_method(D_METHOD("is_ray_pickable"), &CollisionObject3D::is_ray_pickable);
 	ClassDB::bind_method(D_METHOD("set_capture_input_on_drag", "enable"), &CollisionObject3D::set_capture_input_on_drag);
@@ -187,6 +278,11 @@ void CollisionObject3D::_bind_methods() {
 	ADD_SIGNAL(MethodInfo("mouse_entered"));
 	ADD_SIGNAL(MethodInfo("mouse_exited"));
 
+	ADD_GROUP("Collision", "collision_");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_layer", PROPERTY_HINT_LAYERS_3D_PHYSICS), "set_collision_layer", "get_collision_layer");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_mask", PROPERTY_HINT_LAYERS_3D_PHYSICS), "set_collision_mask", "get_collision_mask");
+
+	ADD_GROUP("Input", "input_");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "input_ray_pickable"), "set_ray_pickable", "is_ray_pickable");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "input_capture_on_drag"), "set_capture_input_on_drag", "get_capture_input_on_drag");
 }
@@ -395,17 +491,14 @@ bool CollisionObject3D::get_capture_input_on_drag() const {
 	return capture_input_on_drag;
 }
 
-String CollisionObject3D::get_configuration_warning() const {
-	String warning = Node3D::get_configuration_warning();
+TypedArray<String> CollisionObject3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node3D::get_configuration_warnings(); // Chain to the direct base (as the replaced code did) so warnings from intermediate classes are kept.
 
 	if (shapes.is_empty()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("This node has no shape, so it can't collide or interact with other objects.\nConsider adding a CollisionShape3D or CollisionPolygon3D as a child to define its shape.");
+		warnings.push_back(TTR("This node has no shape, so it can't collide or interact with other objects.\nConsider adding a CollisionShape3D or CollisionPolygon3D as a child to define its shape."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 CollisionObject3D::CollisionObject3D() {
diff --git a/scene/3d/collision_object_3d.h b/scene/3d/collision_object_3d.h
index fe20176984..e3901979d3 100644
--- a/scene/3d/collision_object_3d.h
+++ b/scene/3d/collision_object_3d.h
@@ -37,6 +37,9 @@
 class CollisionObject3D : public Node3D {
 	GDCLASS(CollisionObject3D, Node3D);
 
+	uint32_t collision_layer = 1;
+	uint32_t collision_mask = 1;
+
 	bool area = false;
 
 	RID rid;
@@ -62,6 +65,7 @@ class CollisionObject3D : public Node3D {
 	bool ray_pickable = true;
 
 	Set<uint32_t> debug_shapes_to_update;
+	int debug_shape_count = 0;
 
 	void _update_pickable();
 
@@ -78,8 +82,21 @@ protected:
 	virtual void _mouse_exit();
 
 	void _update_debug_shapes();
+	void _clear_debug_shapes();
 
 public:
+	void set_collision_layer(uint32_t p_layer);
+	uint32_t get_collision_layer() const;
+
+	void set_collision_mask(uint32_t p_mask);
+	uint32_t get_collision_mask() const;
+
+	void set_collision_layer_bit(int p_bit, bool p_value);
+	bool get_collision_layer_bit(int p_bit) const;
+
+	void set_collision_mask_bit(int p_bit, bool p_value);
+	bool get_collision_mask_bit(int p_bit) const;
+
 	uint32_t create_shape_owner(Object *p_owner);
 	void remove_shape_owner(uint32_t owner);
 	void get_shape_owners(List<uint32_t> *r_owners);
@@ -110,7 +127,7 @@ public:
 
 	_FORCE_INLINE_ RID get_rid() const { return rid; }
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	CollisionObject3D();
 	~CollisionObject3D();
diff --git a/scene/3d/collision_polygon_3d.cpp b/scene/3d/collision_polygon_3d.cpp
index e3e2eb4669..ac715b22b2 100644
--- a/scene/3d/collision_polygon_3d.cpp
+++ b/scene/3d/collision_polygon_3d.cpp
@@ -121,7 +121,7 @@ void CollisionPolygon3D::set_polygon(const Vector<Point2> &p_polygon) {
 	if (parent) {
 		_build_polygon();
 	}
-	update_configuration_warning();
+	update_configuration_warnings();
 	update_gizmo();
 }
 
@@ -167,24 +167,18 @@ void CollisionPolygon3D::set_margin(real_t p_margin) {
 	}
 }
 
-String CollisionPolygon3D::get_configuration_warning() const {
-	String warning = Node3D::get_configuration_warning();
+TypedArray<String> CollisionPolygon3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node3D::get_configuration_warnings(); // Chain to the direct base (as the replaced code did) so warnings from intermediate classes are kept.
 
 	if (!Object::cast_to<CollisionObject3D>(get_parent())) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("CollisionPolygon3D only serves to provide a collision shape to a CollisionObject3D derived node. Please only use it as a child of Area3D, StaticBody3D, RigidBody3D, KinematicBody3D, etc. to give them a shape.");
+		warnings.push_back(TTR("CollisionPolygon3D only serves to provide a collision shape to a CollisionObject3D derived node. Please only use it as a child of Area3D, StaticBody3D, RigidBody3D, KinematicBody3D, etc. to give them a shape."));
 	}
 
 	if (polygon.is_empty()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("An empty CollisionPolygon3D has no effect on collision.");
+		warnings.push_back(TTR("An empty CollisionPolygon3D has no effect on collision."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 bool CollisionPolygon3D::_is_editable_3d_polygon() const {
diff --git a/scene/3d/collision_polygon_3d.h b/scene/3d/collision_polygon_3d.h
index 750751b509..73b8a8e0e3 100644
--- a/scene/3d/collision_polygon_3d.h
+++ b/scene/3d/collision_polygon_3d.h
@@ -74,7 +74,7 @@ public:
 	real_t get_margin() const;
 	void set_margin(real_t p_margin);
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	CollisionPolygon3D();
 };
diff --git a/scene/3d/collision_shape_3d.cpp b/scene/3d/collision_shape_3d.cpp
index 242d82ab4c..bec87914c0 100644
--- a/scene/3d/collision_shape_3d.cpp
+++ b/scene/3d/collision_shape_3d.cpp
@@ -120,34 +120,25 @@ void CollisionShape3D::resource_changed(RES res) {
 	update_gizmo();
 }
 
-String CollisionShape3D::get_configuration_warning() const {
-	String warning = Node3D::get_configuration_warning();
+TypedArray<String> CollisionShape3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node3D::get_configuration_warnings(); // Chain to the direct base (as the replaced code did) so warnings from intermediate classes are kept.
 
 	if (!Object::cast_to<CollisionObject3D>(get_parent())) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("CollisionShape3D only serves to provide a collision shape to a CollisionObject3D derived node. Please only use it as a child of Area3D, StaticBody3D, RigidBody3D, KinematicBody3D, etc. to give them a shape.");
+		warnings.push_back(TTR("CollisionShape3D only serves to provide a collision shape to a CollisionObject3D derived node. Please only use it as a child of Area3D, StaticBody3D, RigidBody3D, KinematicBody3D, etc. to give them a shape."));
 	}
 
 	if (!shape.is_valid()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("A shape must be provided for CollisionShape3D to function. Please create a shape resource for it.");
+		warnings.push_back(TTR("A shape must be provided for CollisionShape3D to function. Please create a shape resource for it."));
 	}
 
 	if (shape.is_valid() &&
 			Object::cast_to<RigidBody3D>(get_parent()) &&
 			Object::cast_to<ConcavePolygonShape3D>(*shape) &&
 			Object::cast_to<RigidBody3D>(get_parent())->get_mode() != RigidBody3D::MODE_STATIC) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("ConcavePolygonShape3D doesn't support RigidBody3D in another mode than static.");
+		warnings.push_back(TTR("ConcavePolygonShape3D doesn't support RigidBody3D in another mode than static."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void CollisionShape3D::_bind_methods() {
@@ -188,7 +179,7 @@ void CollisionShape3D::set_shape(const Ref<Shape3D> &p_shape) {
 	if (is_inside_tree()) {
 		_shape_changed();
 	}
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 Ref<Shape3D> CollisionShape3D::get_shape() const {
diff --git a/scene/3d/collision_shape_3d.h b/scene/3d/collision_shape_3d.h
index 5512417f75..56a4ae3039 100644
--- a/scene/3d/collision_shape_3d.h
+++ b/scene/3d/collision_shape_3d.h
@@ -64,7 +64,7 @@ public:
 	void set_disabled(bool p_disabled);
 	bool is_disabled() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	CollisionShape3D();
 	~CollisionShape3D();
diff --git a/scene/3d/cpu_particles_3d.cpp b/scene/3d/cpu_particles_3d.cpp
index d22d7ff3ab..aa29728c73 100644
--- a/scene/3d/cpu_particles_3d.cpp
+++ b/scene/3d/cpu_particles_3d.cpp
@@ -189,8 +189,8 @@ bool CPUParticles3D::get_fractional_delta() const {
 	return fractional_delta;
 }
 
-String CPUParticles3D::get_configuration_warning() const {
-	String warnings = GeometryInstance3D::get_configuration_warning();
+TypedArray<String> CPUParticles3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	bool mesh_found = false;
 	bool anim_material_found = false;
@@ -209,18 +209,12 @@ String CPUParticles3D::get_configuration_warning() const {
 	anim_material_found = anim_material_found || (spat && spat->get_billboard_mode() == StandardMaterial3D::BILLBOARD_PARTICLES);
 
 	if (!mesh_found) {
-		if (warnings != String()) {
-			warnings += "\n";
-		}
-		warnings += "- " + TTR("Nothing is visible because no mesh has been assigned.");
+		warnings.push_back(TTR("Nothing is visible because no mesh has been assigned."));
 	}
 
 	if (!anim_material_found && (get_param(PARAM_ANIM_SPEED) != 0.0 || get_param(PARAM_ANIM_OFFSET) != 0.0 ||
 										get_param_curve(PARAM_ANIM_SPEED).is_valid() || get_param_curve(PARAM_ANIM_OFFSET).is_valid())) {
-		if (warnings != String()) {
-			warnings += "\n";
-		}
-		warnings += "- " + TTR("CPUParticles3D animation requires the usage of a StandardMaterial3D whose Billboard Mode is set to \"Particle Billboard\".");
+		warnings.push_back(TTR("CPUParticles3D animation requires the usage of a StandardMaterial3D whose Billboard Mode is set to \"Particle Billboard\"."));
 	}
 
 	return warnings;
@@ -1056,7 +1050,7 @@ void CPUParticles3D::_update_particle_data_buffer() {
 			ptr[10] = t.basis.elements[2][2];
 			ptr[11] = t.origin.z;
 		} else {
-			zeromem(ptr, sizeof(float) * 12);
+			memset(ptr, 0, sizeof(float) * 12);
 		}
 
 		Color c = r[idx].color;
@@ -1161,7 +1155,7 @@ void CPUParticles3D::_notification(int p_what) {
 					ptr[10] = t.basis.elements[2][2];
 					ptr[11] = t.origin.z;
 				} else {
-					zeromem(ptr, sizeof(float) * 12);
+					memset(ptr, 0, sizeof(float) * 12);
 				}
 
 				ptr += 20;
diff --git a/scene/3d/cpu_particles_3d.h b/scene/3d/cpu_particles_3d.h
index 10ac32622d..c073c93c47 100644
--- a/scene/3d/cpu_particles_3d.h
+++ b/scene/3d/cpu_particles_3d.h
@@ -280,7 +280,7 @@ public:
 	void set_gravity(const Vector3 &p_gravity);
 	Vector3 get_gravity() const;
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	void restart();
 
diff --git a/scene/3d/gi_probe.cpp b/scene/3d/gi_probe.cpp
index 43f820e5d4..4d7fc29f15 100644
--- a/scene/3d/gi_probe.cpp
+++ b/scene/3d/gi_probe.cpp
@@ -343,7 +343,7 @@ void GIProbe::_find_meshes(Node *p_at_node, List<PlotMesh> &plot_meshes) {
 				pm.local_xform = xf;
 				pm.mesh = mesh;
 				for (int i = 0; i < mesh->get_surface_count(); i++) {
-					pm.instance_materials.push_back(mi->get_surface_material(i));
+					pm.instance_materials.push_back(mi->get_surface_override_material(i));
 				}
 				pm.override_material = mi->get_material_override();
 				plot_meshes.push_back(pm);
@@ -503,19 +503,15 @@ Vector<Face3> GIProbe::get_faces(uint32_t p_usage_flags) const {
 	return Vector<Face3>();
 }
 
-String GIProbe::get_configuration_warning() const {
-	String warning = VisualInstance3D::get_configuration_warning();
+TypedArray<String> GIProbe::get_configuration_warnings() const {
+	TypedArray<String> warnings = VisualInstance3D::get_configuration_warnings(); // Chain to the direct base (as the replaced code did) so warnings from intermediate classes are kept.
 
 	if (RenderingServer::get_singleton()->is_low_end()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("GIProbes are not supported by the GLES2 video driver.\nUse a BakedLightmap instead.");
+		warnings.push_back(TTR("GIProbes are not supported by the GLES2 video driver.\nUse a BakedLightmap instead."));
 	} else if (probe_data.is_null()) {
-		warning += TTR("No GIProbe data set, so this node is disabled. Bake static objects to enable GI.");
+		warnings.push_back(TTR("No GIProbe data set, so this node is disabled. Bake static objects to enable GI."));
 	}
-
-	return warning;
+	return warnings;
 }
 
 void GIProbe::_bind_methods() {
diff --git a/scene/3d/gi_probe.h b/scene/3d/gi_probe.h
index 534b425557..dac7dd3e17 100644
--- a/scene/3d/gi_probe.h
+++ b/scene/3d/gi_probe.h
@@ -165,7 +165,7 @@ public:
 	virtual AABB get_aabb() const override;
 	virtual Vector<Face3> get_faces(uint32_t p_usage_flags) const override;
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	GIProbe();
 	~GIProbe();
diff --git a/scene/3d/gpu_particles_3d.cpp b/scene/3d/gpu_particles_3d.cpp
index e2cfc2ed87..5339b8a8da 100644
--- a/scene/3d/gpu_particles_3d.cpp
+++ b/scene/3d/gpu_particles_3d.cpp
@@ -115,7 +115,7 @@ void GPUParticles3D::set_process_material(const Ref<Material> &p_material) {
 	}
 	RS::get_singleton()->particles_set_process_material(particles, material_rid);
 
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 void GPUParticles3D::set_speed_scale(float p_scale) {
@@ -181,12 +181,33 @@ void GPUParticles3D::set_draw_order(DrawOrder p_order) {
 	RS::get_singleton()->particles_set_draw_order(particles, RS::ParticlesDrawOrder(p_order));
 }
 
+void GPUParticles3D::set_enable_trail(bool p_enabled) { // Turns particle trails on or off.
+	trail_enabled = p_enabled;
+	RS::get_singleton()->particles_set_trails(particles, trail_enabled, trail_length); // Enabled flag and length are always sent together.
+	update_configuration_warnings(); // The trail-related warnings depend on trail_enabled.
+}
+void GPUParticles3D::set_trail_length(float p_seconds) { // Sets the trail length (p_seconds) and forwards it to the RenderingServer.
+	ERR_FAIL_COND(p_seconds < 0.001); // Values below 0.001 are rejected and the stored length is left unchanged.
+	trail_length = p_seconds;
+	RS::get_singleton()->particles_set_trails(particles, trail_enabled, trail_length); // Enabled flag and length are always sent together.
+}
+
+bool GPUParticles3D::is_trail_enabled() const { // Getter paired with set_enable_trail().
+	return trail_enabled;
+}
+float GPUParticles3D::get_trail_length() const { // Getter paired with set_trail_length(); returns the last accepted value.
+	return trail_length;
+}
+
 GPUParticles3D::DrawOrder GPUParticles3D::get_draw_order() const {
 	return draw_order;
 }
 
 void GPUParticles3D::set_draw_passes(int p_count) {
 	ERR_FAIL_COND(p_count < 1);
+	for (int i = p_count; i < draw_passes.size(); i++) {
+		set_draw_pass_mesh(i, Ref<Mesh>());
+	}
 	draw_passes.resize(p_count);
 	RS::get_singleton()->particles_set_draw_passes(particles, p_count);
 	notify_property_list_changed();
@@ -199,8 +220,16 @@ int GPUParticles3D::get_draw_passes() const {
 void GPUParticles3D::set_draw_pass_mesh(int p_pass, const Ref<Mesh> &p_mesh) {
 	ERR_FAIL_INDEX(p_pass, draw_passes.size());
 
+	if (Engine::get_singleton()->is_editor_hint() && draw_passes.write[p_pass].is_valid()) {
+		draw_passes.write[p_pass]->disconnect("changed", callable_mp((Node *)this, &Node::update_configuration_warnings));
+	}
+
 	draw_passes.write[p_pass] = p_mesh;
 
+	if (Engine::get_singleton()->is_editor_hint() && draw_passes.write[p_pass].is_valid()) {
+		draw_passes.write[p_pass]->connect("changed", callable_mp((Node *)this, &Node::update_configuration_warnings), varray(), CONNECT_DEFERRED);
+	}
+
 	RID mesh_rid;
 	if (p_mesh.is_valid()) {
 		mesh_rid = p_mesh->get_rid();
@@ -208,7 +237,8 @@ void GPUParticles3D::set_draw_pass_mesh(int p_pass, const Ref<Mesh> &p_mesh) {
 
 	RS::get_singleton()->particles_set_draw_pass_mesh(particles, p_pass, mesh_rid);
 
-	update_configuration_warning();
+	_skinning_changed();
+	update_configuration_warnings();
 }
 
 Ref<Mesh> GPUParticles3D::get_draw_pass_mesh(int p_pass) const {
@@ -235,13 +265,22 @@ bool GPUParticles3D::get_fractional_delta() const {
 	return fractional_delta;
 }
 
-String GPUParticles3D::get_configuration_warning() const {
+void GPUParticles3D::set_interpolate(bool p_enable) { // Stores the interpolation flag and forwards it to the RenderingServer.
+	interpolate = p_enable;
+	RS::get_singleton()->particles_set_interpolate(particles, p_enable);
+}
+
+bool GPUParticles3D::get_interpolate() const { // Getter paired with set_interpolate().
+	return interpolate;
+}
+
+TypedArray<String> GPUParticles3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
+
 	if (RenderingServer::get_singleton()->is_low_end()) {
-		return TTR("GPU-based particles are not supported by the GLES2 video driver.\nUse the CPUParticles3D node instead. You can use the \"Convert to CPUParticles3D\" option for this purpose.");
+		warnings.push_back(TTR("GPU-based particles are not supported by the GLES2 video driver.\nUse the CPUParticles3D node instead. You can use the \"Convert to CPUParticles3D\" option for this purpose."));
 	}
 
-	String warnings = GeometryInstance3D::get_configuration_warning();
-
 	bool meshes_found = false;
 	bool anim_material_found = false;
 
@@ -250,7 +289,7 @@ String GPUParticles3D::get_configuration_warning() const {
 			meshes_found = true;
 			for (int j = 0; j < draw_passes[i]->get_surface_count(); j++) {
 				anim_material_found = Object::cast_to<ShaderMaterial>(draw_passes[i]->surface_get_material(j).ptr()) != nullptr;
-				StandardMaterial3D *spat = Object::cast_to<StandardMaterial3D>(draw_passes[i]->surface_get_material(j).ptr());
+				BaseMaterial3D *spat = Object::cast_to<BaseMaterial3D>(draw_passes[i]->surface_get_material(j).ptr());
 				anim_material_found = anim_material_found || (spat && spat->get_billboard_mode() == StandardMaterial3D::BILLBOARD_PARTICLES);
 			}
 			if (anim_material_found) {
@@ -260,30 +299,73 @@ String GPUParticles3D::get_configuration_warning() const {
 	}
 
 	anim_material_found = anim_material_found || Object::cast_to<ShaderMaterial>(get_material_override().ptr()) != nullptr;
-	StandardMaterial3D *spat = Object::cast_to<StandardMaterial3D>(get_material_override().ptr());
-	anim_material_found = anim_material_found || (spat && spat->get_billboard_mode() == StandardMaterial3D::BILLBOARD_PARTICLES);
+	{
+		BaseMaterial3D *spat = Object::cast_to<BaseMaterial3D>(get_material_override().ptr());
+		anim_material_found = anim_material_found || (spat && spat->get_billboard_mode() == BaseMaterial3D::BILLBOARD_PARTICLES);
+	}
 
 	if (!meshes_found) {
-		if (warnings != String()) {
-			warnings += "\n";
-		}
-		warnings += "- " + TTR("Nothing is visible because meshes have not been assigned to draw passes.");
+		warnings.push_back(TTR("Nothing is visible because meshes have not been assigned to draw passes."));
 	}
 
 	if (process_material.is_null()) {
-		if (warnings != String()) {
-			warnings += "\n";
-		}
-		warnings += "- " + TTR("A material to process the particles is not assigned, so no behavior is imprinted.");
+		warnings.push_back(TTR("A material to process the particles is not assigned, so no behavior is imprinted."));
 	} else {
 		const ParticlesMaterial *process = Object::cast_to<ParticlesMaterial>(process_material.ptr());
 		if (!anim_material_found && process &&
 				(process->get_param(ParticlesMaterial::PARAM_ANIM_SPEED) != 0.0 || process->get_param(ParticlesMaterial::PARAM_ANIM_OFFSET) != 0.0 ||
 						process->get_param_texture(ParticlesMaterial::PARAM_ANIM_SPEED).is_valid() || process->get_param_texture(ParticlesMaterial::PARAM_ANIM_OFFSET).is_valid())) {
-			if (warnings != String()) {
-				warnings += "\n";
+			warnings.push_back(TTR("Particles animation requires the usage of a BaseMaterial3D whose Billboard Mode is set to \"Particle Billboard\"."));
+		}
+	}
+
+	if (trail_enabled) {
+		int dp_count = 0;
+		bool missing_trails = false;
+		bool no_materials = false;
+
+		for (int i = 0; i < draw_passes.size(); i++) {
+			Ref<Mesh> draw_pass = draw_passes[i];
+			if (draw_pass.is_valid() && draw_pass->get_builtin_bind_pose_count() > 0) {
+				dp_count++;
+			}
+
+			if (draw_pass.is_valid()) {
+				int mats_found = 0;
+				for (int j = 0; j < draw_passes[i]->get_surface_count(); j++) {
+					BaseMaterial3D *spat = Object::cast_to<BaseMaterial3D>(draw_passes[i]->surface_get_material(j).ptr());
+					if (spat) {
+						mats_found++;
+					}
+					if (spat && !spat->get_flag(BaseMaterial3D::FLAG_PARTICLE_TRAILS_MODE)) {
+						missing_trails = true;
+					}
+				}
+
+				if (mats_found != draw_passes[i]->get_surface_count()) {
+					no_materials = true;
+				}
 			}
-			warnings += "- " + TTR("Particles animation requires the usage of a StandardMaterial3D whose Billboard Mode is set to \"Particle Billboard\".");
+		}
+
+		BaseMaterial3D *spat = Object::cast_to<BaseMaterial3D>(get_material_override().ptr());
+		if (spat) {
+			no_materials = false;
+		}
+		if (spat && !spat->get_flag(BaseMaterial3D::FLAG_PARTICLE_TRAILS_MODE)) {
+			missing_trails = true;
+		}
+
+		if (dp_count && skin.is_valid()) {
+			warnings.push_back(TTR("Using Trail meshes with a skin causes Skin to override Trail poses. Suggest removing the Skin."));
+		} else if (dp_count == 0 && skin.is_null()) {
+			warnings.push_back(TTR("Trails active, but neither Trail meshes or a Skin were found."));
+		} else if (dp_count > 1) {
+			warnings.push_back(TTR("Only one Trail mesh is supported. If you want to use more than a single mesh, a Skin is needed (see documentation)."));
+		}
+
+		if ((dp_count || !skin.is_null()) && (missing_trails || no_materials)) {
+			warnings.push_back(TTR("Trails enabled, but one or more mesh materials are either missing or not set for trails rendering."));
 		}
 	}
 
@@ -375,6 +457,47 @@ void GPUParticles3D::_notification(int p_what) {
 	}
 }
 
+// Rebuilds the trail bind poses from the Skin, or else from the first draw pass mesh that has built-in bind poses.
+void GPUParticles3D::_skinning_changed() {
+	Vector<Transform> xforms; // Stays empty when neither the skin nor any mesh provides poses.
+	if (skin.is_valid()) {
+		xforms.resize(skin->get_bind_count());
+		for (int i = 0; i < skin->get_bind_count(); i++) {
+			xforms.write[i] = skin->get_bind_pose(i);
+		}
+	} else {
+		for (int i = 0; i < draw_passes.size(); i++) {
+			Ref<Mesh> draw_pass = draw_passes[i];
+			if (draw_pass.is_valid() && draw_pass->get_builtin_bind_pose_count() > 0) {
+				xforms.resize(draw_pass->get_builtin_bind_pose_count());
+				for (int j = 0; j < draw_pass->get_builtin_bind_pose_count(); j++) {
+					xforms.write[j] = draw_pass->get_builtin_bind_pose(j); // Index by pose (j); i is the draw pass index and may be out of range here.
+				}
+				break; // Only the first mesh with built-in poses is used.
+			}
+		}
+	}
+	RS::get_singleton()->particles_set_trail_bind_poses(particles, xforms);
+	update_configuration_warnings(); // Trail warnings depend on the skin and the mesh bind poses.
+}
+
+void GPUParticles3D::set_skin(const Ref<Skin> &p_skin) { // Assigns the skin used as the source of trail bind poses.
+	skin = p_skin;
+	_skinning_changed(); // Re-sends the bind poses and refreshes the configuration warnings.
+}
+Ref<Skin> GPUParticles3D::get_skin() const { // Getter paired with set_skin(); may be a null Ref.
+	return skin;
+}
+
+void GPUParticles3D::set_transform_align(TransformAlign p_align) { // Stores the alignment mode and forwards it to the RenderingServer.
+	ERR_FAIL_INDEX(uint32_t(p_align), 4); // 4 is the number of TransformAlign values; the unsigned cast makes negative inputs fail as well.
+	transform_align = p_align;
+	RS::get_singleton()->particles_set_transform_align(particles, RS::ParticlesTransformAlign(transform_align)); // Plain cast: the two enums are assumed to share numbering.
+}
+GPUParticles3D::TransformAlign GPUParticles3D::get_transform_align() const { // Getter paired with set_transform_align().
+	return transform_align;
+}
+
 void GPUParticles3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_emitting", "emitting"), &GPUParticles3D::set_emitting);
 	ClassDB::bind_method(D_METHOD("set_amount", "amount"), &GPUParticles3D::set_amount);
@@ -387,6 +510,7 @@ void GPUParticles3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_use_local_coordinates", "enable"), &GPUParticles3D::set_use_local_coordinates);
 	ClassDB::bind_method(D_METHOD("set_fixed_fps", "fps"), &GPUParticles3D::set_fixed_fps);
 	ClassDB::bind_method(D_METHOD("set_fractional_delta", "enable"), &GPUParticles3D::set_fractional_delta);
+	ClassDB::bind_method(D_METHOD("set_interpolate", "enable"), &GPUParticles3D::set_interpolate);
 	ClassDB::bind_method(D_METHOD("set_process_material", "material"), &GPUParticles3D::set_process_material);
 	ClassDB::bind_method(D_METHOD("set_speed_scale", "scale"), &GPUParticles3D::set_speed_scale);
 	ClassDB::bind_method(D_METHOD("set_collision_base_size", "size"), &GPUParticles3D::set_collision_base_size);
@@ -402,6 +526,7 @@ void GPUParticles3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_use_local_coordinates"), &GPUParticles3D::get_use_local_coordinates);
 	ClassDB::bind_method(D_METHOD("get_fixed_fps"), &GPUParticles3D::get_fixed_fps);
 	ClassDB::bind_method(D_METHOD("get_fractional_delta"), &GPUParticles3D::get_fractional_delta);
+	ClassDB::bind_method(D_METHOD("get_interpolate"), &GPUParticles3D::get_interpolate);
 	ClassDB::bind_method(D_METHOD("get_process_material"), &GPUParticles3D::get_process_material);
 	ClassDB::bind_method(D_METHOD("get_speed_scale"), &GPUParticles3D::get_speed_scale);
 	ClassDB::bind_method(D_METHOD("get_collision_base_size"), &GPUParticles3D::get_collision_base_size);
@@ -416,6 +541,9 @@ void GPUParticles3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_draw_passes"), &GPUParticles3D::get_draw_passes);
 	ClassDB::bind_method(D_METHOD("get_draw_pass_mesh", "pass"), &GPUParticles3D::get_draw_pass_mesh);
 
+	ClassDB::bind_method(D_METHOD("set_skin", "skin"), &GPUParticles3D::set_skin);
+	ClassDB::bind_method(D_METHOD("get_skin"), &GPUParticles3D::get_skin);
+
 	ClassDB::bind_method(D_METHOD("restart"), &GPUParticles3D::restart);
 	ClassDB::bind_method(D_METHOD("capture_aabb"), &GPUParticles3D::capture_aabb);
 
@@ -424,6 +552,15 @@ void GPUParticles3D::_bind_methods() {
 
 	ClassDB::bind_method(D_METHOD("emit_particle", "xform", "velocity", "color", "custom", "flags"), &GPUParticles3D::emit_particle);
 
+	ClassDB::bind_method(D_METHOD("set_enable_trail", "enabled"), &GPUParticles3D::set_enable_trail);
+	ClassDB::bind_method(D_METHOD("set_trail_length", "secs"), &GPUParticles3D::set_trail_length);
+
+	ClassDB::bind_method(D_METHOD("is_trail_enabled"), &GPUParticles3D::is_trail_enabled);
+	ClassDB::bind_method(D_METHOD("get_trail_length"), &GPUParticles3D::get_trail_length);
+
+	ClassDB::bind_method(D_METHOD("set_transform_align", "align"), &GPUParticles3D::set_transform_align);
+	ClassDB::bind_method(D_METHOD("get_transform_align"), &GPUParticles3D::get_transform_align);
+
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "emitting"), "set_emitting", "is_emitting");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "amount", PROPERTY_HINT_EXP_RANGE, "1,1000000,1"), "set_amount", "get_amount");
 	ADD_PROPERTY(PropertyInfo(Variant::NODE_PATH, "sub_emitter", PROPERTY_HINT_NODE_PATH_VALID_TYPES, "GPUParticles3D"), "set_sub_emitter", "get_sub_emitter");
@@ -435,6 +572,7 @@ void GPUParticles3D::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "explosiveness", PROPERTY_HINT_RANGE, "0,1,0.01"), "set_explosiveness_ratio", "get_explosiveness_ratio");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "randomness", PROPERTY_HINT_RANGE, "0,1,0.01"), "set_randomness_ratio", "get_randomness_ratio");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "fixed_fps", PROPERTY_HINT_RANGE, "0,1000,1"), "set_fixed_fps", "get_fixed_fps");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "interpolate"), "set_interpolate", "get_interpolate");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "fract_delta"), "set_fractional_delta", "get_fractional_delta");
 	ADD_GROUP("Collision", "collision_");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "collision_base_size", PROPERTY_HINT_RANGE, "0,128,0.01,or_greater"), "set_collision_base_size", "get_collision_base_size");
@@ -442,6 +580,10 @@ void GPUParticles3D::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::AABB, "visibility_aabb"), "set_visibility_aabb", "get_visibility_aabb");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "local_coords"), "set_use_local_coordinates", "get_use_local_coordinates");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "draw_order", PROPERTY_HINT_ENUM, "Index,Lifetime,View Depth"), "set_draw_order", "get_draw_order");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "transform_align", PROPERTY_HINT_ENUM, "Disabled,ZBillboard,YToVelocity,ZBillboardYToVelocity"), "set_transform_align", "get_transform_align");
+	ADD_GROUP("Trails", "trail_");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "trail_enabled"), "set_enable_trail", "is_trail_enabled");
+	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "trail_length_secs", PROPERTY_HINT_RANGE, "0.01,4,0.01"), "set_trail_length", "get_trail_length");
 	ADD_GROUP("Process Material", "");
 	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "process_material", PROPERTY_HINT_RESOURCE_TYPE, "ShaderMaterial,ParticlesMaterial"), "set_process_material", "get_process_material");
 	ADD_GROUP("Draw Passes", "draw_");
@@ -449,6 +591,7 @@ void GPUParticles3D::_bind_methods() {
 	for (int i = 0; i < MAX_DRAW_PASSES; i++) {
 		ADD_PROPERTYI(PropertyInfo(Variant::OBJECT, "draw_pass_" + itos(i + 1), PROPERTY_HINT_RESOURCE_TYPE, "Mesh"), "set_draw_pass_mesh", "get_draw_pass_mesh", i);
 	}
+	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "draw_skin", PROPERTY_HINT_RESOURCE_TYPE, "Skin"), "set_skin", "get_skin");
 
 	BIND_ENUM_CONSTANT(DRAW_ORDER_INDEX);
 	BIND_ENUM_CONSTANT(DRAW_ORDER_LIFETIME);
@@ -461,6 +604,11 @@ void GPUParticles3D::_bind_methods() {
 	BIND_ENUM_CONSTANT(EMIT_FLAG_CUSTOM);
 
 	BIND_CONSTANT(MAX_DRAW_PASSES);
+
+	BIND_ENUM_CONSTANT(TRANSFORM_ALIGN_DISABLED);
+	BIND_ENUM_CONSTANT(TRANSFORM_ALIGN_Z_BILLBOARD);
+	BIND_ENUM_CONSTANT(TRANSFORM_ALIGN_Y_TO_VELOCITY);
+	BIND_ENUM_CONSTANT(TRANSFORM_ALIGN_Z_BILLBOARD_Y_TO_VELOCITY);
 }
 
 GPUParticles3D::GPUParticles3D() {
@@ -471,17 +619,20 @@ GPUParticles3D::GPUParticles3D() {
 	set_one_shot(false);
 	set_amount(8);
 	set_lifetime(1);
-	set_fixed_fps(0);
+	set_fixed_fps(30);
 	set_fractional_delta(true);
+	set_interpolate(true);
 	set_pre_process_time(0);
 	set_explosiveness_ratio(0);
 	set_randomness_ratio(0);
+	set_trail_length(0.3);
 	set_visibility_aabb(AABB(Vector3(-4, -4, -4), Vector3(8, 8, 8)));
 	set_use_local_coordinates(true);
 	set_draw_passes(1);
 	set_draw_order(DRAW_ORDER_INDEX);
 	set_speed_scale(1);
 	set_collision_base_size(0.01);
+	set_transform_align(TRANSFORM_ALIGN_DISABLED);
 }
 
 GPUParticles3D::~GPUParticles3D() {
diff --git a/scene/3d/gpu_particles_3d.h b/scene/3d/gpu_particles_3d.h
index 0c1a1a510c..1f9cea79b6 100644
--- a/scene/3d/gpu_particles_3d.h
+++ b/scene/3d/gpu_particles_3d.h
@@ -34,6 +34,7 @@
 #include "core/templates/rid.h"
 #include "scene/3d/visual_instance_3d.h"
 #include "scene/resources/material.h"
+#include "scene/resources/skin.h"
 
 class GPUParticles3D : public GeometryInstance3D {
 private:
@@ -46,6 +47,13 @@ public:
 		DRAW_ORDER_VIEW_DEPTH,
 	};
 
+	enum TransformAlign { // Cast directly to RS::ParticlesTransformAlign in set_transform_align(); keep numbering in sync.
+		TRANSFORM_ALIGN_DISABLED, // Listed as "Disabled" in the transform_align property hint.
+		TRANSFORM_ALIGN_Z_BILLBOARD, // Listed as "ZBillboard".
+		TRANSFORM_ALIGN_Y_TO_VELOCITY, // Listed as "YToVelocity".
+		TRANSFORM_ALIGN_Z_BILLBOARD_Y_TO_VELOCITY // Listed as "ZBillboardYToVelocity"; set_transform_align() hard-codes a count of 4.
+	};
+
 	enum {
 		MAX_DRAW_PASSES = 4
 	};
@@ -64,17 +72,26 @@ private:
 	bool local_coords;
 	int fixed_fps;
 	bool fractional_delta;
+	bool interpolate = true;
 	NodePath sub_emitter;
 	float collision_base_size;
 
+	bool trail_enabled = false;
+	float trail_length = 0.3;
+
+	TransformAlign transform_align = TRANSFORM_ALIGN_DISABLED;
+
 	Ref<Material> process_material;
 
 	DrawOrder draw_order;
 
 	Vector<Ref<Mesh>> draw_passes;
+	Ref<Skin> skin;
 
 	void _attach_sub_emitter();
 
+	void _skinning_changed();
+
 protected:
 	static void _bind_methods();
 	void _notification(int p_what);
@@ -96,6 +113,8 @@ public:
 	void set_process_material(const Ref<Material> &p_material);
 	void set_speed_scale(float p_scale);
 	void set_collision_base_size(float p_ratio);
+	void set_enable_trail(bool p_enabled);
+	void set_trail_length(float p_seconds);
 
 	bool is_emitting() const;
 	int get_amount() const;
@@ -109,6 +128,8 @@ public:
 	Ref<Material> get_process_material() const;
 	float get_speed_scale() const;
 	float get_collision_base_size() const;
+	bool is_trail_enabled() const;
+	float get_trail_length() const;
 
 	void set_fixed_fps(int p_count);
 	int get_fixed_fps() const;
@@ -116,6 +137,9 @@ public:
 	void set_fractional_delta(bool p_enable);
 	bool get_fractional_delta() const;
 
+	void set_interpolate(bool p_enable);
+	bool get_interpolate() const;
+
 	void set_draw_order(DrawOrder p_order);
 	DrawOrder get_draw_order() const;
 
@@ -125,11 +149,17 @@ public:
 	void set_draw_pass_mesh(int p_pass, const Ref<Mesh> &p_mesh);
 	Ref<Mesh> get_draw_pass_mesh(int p_pass) const;
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	void set_sub_emitter(const NodePath &p_path);
 	NodePath get_sub_emitter() const;
 
+	void set_skin(const Ref<Skin> &p_skin);
+	Ref<Skin> get_skin() const;
+
+	void set_transform_align(TransformAlign p_align);
+	TransformAlign get_transform_align() const;
+
 	void restart();
 
 	enum EmitFlags {
@@ -148,6 +178,7 @@ public:
 };
 
 VARIANT_ENUM_CAST(GPUParticles3D::DrawOrder)
+VARIANT_ENUM_CAST(GPUParticles3D::TransformAlign)
 VARIANT_ENUM_CAST(GPUParticles3D::EmitFlags)
 
 #endif // PARTICLES_H
diff --git a/scene/3d/gpu_particles_collision_3d.cpp b/scene/3d/gpu_particles_collision_3d.cpp
index 97241be60f..628b823f89 100644
--- a/scene/3d/gpu_particles_collision_3d.cpp
+++ b/scene/3d/gpu_particles_collision_3d.cpp
@@ -346,7 +346,7 @@ void GPUParticlesCollisionSDF::_compute_sdf(ComputeSDFParams *params) {
 	ThreadWorkPool work_pool;
 	work_pool.init();
 	work_pool.begin_work(params->size.z, this, &GPUParticlesCollisionSDF::_compute_sdf_z, params);
-	while (work_pool.get_work_index() < (uint32_t)params->size.z) {
+	while (!work_pool.is_done_dispatching()) {
 		OS::get_singleton()->delay_usec(10000);
 		bake_step_function(work_pool.get_work_index() * 100 / params->size.z, "Baking SDF");
 	}
diff --git a/scene/3d/light_3d.cpp b/scene/3d/light_3d.cpp
index f109640aef..d45749d36b 100644
--- a/scene/3d/light_3d.cpp
+++ b/scene/3d/light_3d.cpp
@@ -48,7 +48,7 @@ void Light3D::set_param(Param p_param, float p_value) {
 		update_gizmo();
 
 		if (p_param == PARAM_SPOT_ANGLE) {
-			update_configuration_warning();
+			update_configuration_warnings();
 		}
 	}
 }
@@ -63,7 +63,7 @@ void Light3D::set_shadow(bool p_enable) {
 	RS::get_singleton()->light_set_shadow(light, p_enable);
 
 	if (type == RenderingServer::LIGHT_SPOT || type == RenderingServer::LIGHT_OMNI) {
-		update_configuration_warning();
+		update_configuration_warnings();
 	}
 
 	notify_property_list_changed();
@@ -153,7 +153,7 @@ void Light3D::set_projector(const Ref<Texture2D> &p_texture) {
 	projector = p_texture;
 	RID tex_id = projector.is_valid() ? projector->get_rid() : RID();
 	RS::get_singleton()->light_set_projector(light, tex_id);
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 Ref<Texture2D> Light3D::get_projector() const {
@@ -457,17 +457,14 @@ OmniLight3D::ShadowMode OmniLight3D::get_shadow_mode() const {
 	return shadow_mode;
 }
 
-String OmniLight3D::get_configuration_warning() const {
-	String warning = Light3D::get_configuration_warning();
+TypedArray<String> OmniLight3D::get_configuration_warnings() const { // Returns this light's configuration warnings, one array entry per warning.
+	TypedArray<String> warnings = Node::get_configuration_warnings(); // Start from the warnings Node itself reports.
 
 	if (!has_shadow() && get_projector().is_valid()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("Projector texture only works with shadows active.");
+		warnings.push_back(TTR("Projector texture only works with shadows active."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void OmniLight3D::_bind_methods() {
@@ -491,24 +488,18 @@ OmniLight3D::OmniLight3D() :
 	set_param(PARAM_SHADOW_NORMAL_BIAS, 2.0);
 }
 
-String SpotLight3D::get_configuration_warning() const {
-	String warning = Light3D::get_configuration_warning();
+TypedArray<String> SpotLight3D::get_configuration_warnings() const { // Returns this light's configuration warnings, one array entry per warning.
+	TypedArray<String> warnings = Node::get_configuration_warnings(); // Start from the warnings Node itself reports.
 
 	if (has_shadow() && get_param(PARAM_SPOT_ANGLE) >= 90.0) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("A SpotLight3D with an angle wider than 90 degrees cannot cast shadows.");
+		warnings.push_back(TTR("A SpotLight3D with an angle wider than 90 degrees cannot cast shadows.")); // Note the check is >= 90.0, so exactly 90 also warns.
 	}
 
 	if (!has_shadow() && get_projector().is_valid()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("Projector texture only works with shadows active.");
+		warnings.push_back(TTR("Projector texture only works with shadows active."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void SpotLight3D::_bind_methods() {
diff --git a/scene/3d/light_3d.h b/scene/3d/light_3d.h
index 311db54bce..e145b08b74 100644
--- a/scene/3d/light_3d.h
+++ b/scene/3d/light_3d.h
@@ -202,7 +202,7 @@ public:
 	void set_shadow_mode(ShadowMode p_mode);
 	ShadowMode get_shadow_mode() const;
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	OmniLight3D();
 };
@@ -216,7 +216,7 @@ protected:
 	static void _bind_methods();
 
 public:
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	SpotLight3D() :
 			Light3D(RenderingServer::LIGHT_SPOT) {}
diff --git a/scene/3d/lightmapper.cpp b/scene/3d/lightmapper.cpp
index c17ac52aa2..9e5078ba95 100644
--- a/scene/3d/lightmapper.cpp
+++ b/scene/3d/lightmapper.cpp
@@ -39,6 +39,15 @@ Ref<LightmapDenoiser> LightmapDenoiser::create() {
 	return Ref<LightmapDenoiser>();
 }
 
+LightmapRaycaster *(*LightmapRaycaster::create_function)() = nullptr;
+
+Ref<LightmapRaycaster> LightmapRaycaster::create() { // Instantiates a raycaster through the registered factory, if any.
+	if (create_function == nullptr) {
+		return Ref<LightmapRaycaster>(); // No factory registered: hand back a null reference.
+	}
+	return Ref<LightmapRaycaster>(create_function());
+}
+
 Lightmapper::CreateFunc Lightmapper::create_custom = nullptr;
 Lightmapper::CreateFunc Lightmapper::create_gpu = nullptr;
 Lightmapper::CreateFunc Lightmapper::create_cpu = nullptr;
diff --git a/scene/3d/lightmapper.h b/scene/3d/lightmapper.h
index a07a964c01..f63515f666 100644
--- a/scene/3d/lightmapper.h
+++ b/scene/3d/lightmapper.h
@@ -34,6 +34,16 @@
 #include "scene/resources/mesh.h"
 #include "servers/rendering/rendering_device.h"
 
+#if !defined(__aligned)
+// Alignment specifier used below: __declspec(align) when building with MSVC on Windows, __attribute__((aligned)) otherwise.
+#if defined(_WIN32) && defined(_MSC_VER)
+#define __aligned(...) __declspec(align(__VA_ARGS__))
+#else
+#define __aligned(...) __attribute__((aligned(__VA_ARGS__)))
+#endif
+
+#endif // !defined(__aligned)
+
 class LightmapDenoiser : public Reference {
 	GDCLASS(LightmapDenoiser, Reference)
 protected:
@@ -44,6 +54,73 @@ public:
 	static Ref<LightmapDenoiser> create();
 };
 
+class LightmapRaycaster : public Reference {
+	GDCLASS(LightmapRaycaster, Reference)
+protected:
+	static LightmapRaycaster *(*create_function)(); // Factory hook consulted by create().
+
+public:
+	// Compatible with embree3 rays; the member order and the 16-byte alignment below are part of that layout.
+	struct __aligned(16) Ray {
+		const static unsigned int INVALID_GEOMETRY_ID = ((unsigned int)-1); // from rtcore_common.h
+
+		/*! Default construction only sets geomID to INVALID_GEOMETRY_ID (a miss); no other member is explicitly initialized. */
+		_FORCE_INLINE_ Ray() :
+				geomID(INVALID_GEOMETRY_ID) {}
+
+		/*! Constructs a ray from origin, direction, and ray segment. Near
+		 *  has to be smaller than far. The id, flags and normal members are not set here. */
+		_FORCE_INLINE_ Ray(const Vector3 &org,
+				const Vector3 &dir,
+				float tnear = 0.0f,
+				float tfar = INFINITY) :
+				org(org),
+				tnear(tnear),
+				dir(dir),
+				time(0.0f),
+				tfar(tfar),
+				mask(-1),
+				u(0.0),
+				v(0.0),
+				primID(INVALID_GEOMETRY_ID),
+				geomID(INVALID_GEOMETRY_ID),
+				instID(INVALID_GEOMETRY_ID) {}
+
+		/*! Tests if we hit something, i.e. geomID is no longer INVALID_GEOMETRY_ID. */
+		_FORCE_INLINE_ explicit operator bool() const { return geomID != INVALID_GEOMETRY_ID; }
+
+	public:
+		Vector3 org; //!< Ray origin + tnear
+		float tnear; //!< Start of ray segment
+		Vector3 dir; //!< Ray direction + tfar
+		float time; //!< Time of this ray for motion blur.
+		float tfar; //!< End of ray segment
+		unsigned int mask; //!< used to mask out objects during traversal
+		unsigned int id; //!< ray ID (not initialized by either constructor)
+		unsigned int flags; //!< ray flags (not initialized by either constructor)
+
+		Vector3 normal; //!< Not normalized geometry normal
+		float u; //!< Barycentric u coordinate of hit
+		float v; //!< Barycentric v coordinate of hit
+		unsigned int primID; //!< primitive ID
+		unsigned int geomID; //!< geometry ID
+		unsigned int instID; //!< instance ID
+	};
+
+	virtual bool intersect(Ray &p_ray) = 0; // NOTE(review): presumably returns true on a hit and fills the hit members of p_ray; confirm against the implementation.
+
+	virtual void intersect(Vector<Ray> &r_rays) = 0; // Batch variant taking the rays by reference.
+
+	virtual void add_mesh(const Vector<Vector3> &p_vertices, const Vector<Vector3> &p_normals, const Vector<Vector2> &p_uv2s, unsigned int p_id) = 0;
+	virtual void set_mesh_alpha_texture(Ref<Image> p_alpha_texture, unsigned int p_id) = 0;
+	virtual void commit() = 0; // NOTE(review): presumably finalizes the added meshes before intersect() is used; confirm.
+
+	virtual void set_mesh_filter(const Set<int> &p_mesh_ids) = 0;
+	virtual void clear_mesh_filter() = 0;
+
+	static Ref<LightmapRaycaster> create(); // Returns an instance built by create_function.
+};
+
 class Lightmapper : public Reference {
 	GDCLASS(Lightmapper, Reference)
 public:
diff --git a/scene/3d/mesh_instance_3d.cpp b/scene/3d/mesh_instance_3d.cpp
index b997c64b29..27d5487a1a 100644
--- a/scene/3d/mesh_instance_3d.cpp
+++ b/scene/3d/mesh_instance_3d.cpp
@@ -51,13 +51,13 @@ bool MeshInstance3D::_set(const StringName &p_name, const Variant &p_value) {
 		return true;
 	}
 
-	if (p_name.operator String().begins_with("material/")) {
+	if (p_name.operator String().begins_with("surface_material_override/")) {
 		int idx = p_name.operator String().get_slicec('/', 1).to_int();
-		if (idx >= materials.size() || idx < 0) {
+		if (idx >= surface_override_materials.size() || idx < 0) {
 			return false;
 		}
 
-		set_surface_material(idx, p_value);
+		set_surface_override_material(idx, p_value);
 		return true;
 	}
 
@@ -75,12 +75,12 @@ bool MeshInstance3D::_get(const StringName &p_name, Variant &r_ret) const {
 		return true;
 	}
 
-	if (p_name.operator String().begins_with("material/")) {
+	if (p_name.operator String().begins_with("surface_material_override/")) {
 		int idx = p_name.operator String().get_slicec('/', 1).to_int();
-		if (idx >= materials.size() || idx < 0) {
+		if (idx >= surface_override_materials.size() || idx < 0) {
 			return false;
 		}
-		r_ret = materials[idx];
+		r_ret = surface_override_materials[idx];
 		return true;
 	}
 	return false;
@@ -100,7 +100,7 @@ void MeshInstance3D::_get_property_list(List<PropertyInfo> *p_list) const {
 
 	if (mesh.is_valid()) {
 		for (int i = 0; i < mesh->get_surface_count(); i++) {
-			p_list->push_back(PropertyInfo(Variant::OBJECT, "material/" + itos(i), PROPERTY_HINT_RESOURCE_TYPE, "ShaderMaterial,StandardMaterial3D", PROPERTY_USAGE_DEFAULT | PROPERTY_USAGE_DEFERRED_SET_RESOURCE));
+			p_list->push_back(PropertyInfo(Variant::OBJECT, "surface_material_override/" + itos(i), PROPERTY_HINT_RESOURCE_TYPE, "ShaderMaterial,StandardMaterial3D", PROPERTY_USAGE_DEFAULT | PROPERTY_USAGE_DEFERRED_SET_RESOURCE));
 		}
 	}
 }
@@ -126,7 +126,7 @@ void MeshInstance3D::set_mesh(const Ref<Mesh> &p_mesh) {
 		}
 
 		mesh->connect(CoreStringNames::get_singleton()->changed, callable_mp(this, &MeshInstance3D::_mesh_changed));
-		materials.resize(mesh->get_surface_count());
+		surface_override_materials.resize(mesh->get_surface_count());
 
 		set_base(mesh->get_rid());
 	} else {
@@ -271,32 +271,67 @@ void MeshInstance3D::create_convex_collision() {
 	}
 }
 
+Node *MeshInstance3D::create_multiple_convex_collisions_node() {
+	if (mesh.is_null()) {
+		return nullptr; // No mesh assigned, nothing to decompose.
+	}
+	// Decompose the mesh into convex shapes; give up when none are produced.
+	Vector<Ref<Shape3D>> hulls = mesh->convex_decompose();
+	const int hull_count = hulls.size();
+	if (hull_count == 0) {
+		return nullptr;
+	}
+	StaticBody3D *body = memnew(StaticBody3D); // Unparented; the caller takes ownership of the returned node.
+	for (int idx = 0; idx < hull_count; idx++) {
+		CollisionShape3D *hull_node = memnew(CollisionShape3D);
+		hull_node->set_shape(hulls[idx]);
+		body->add_child(hull_node);
+	}
+	return body;
+}
+
+void MeshInstance3D::create_multiple_convex_collisions() {
+	StaticBody3D *body = Object::cast_to<StaticBody3D>(create_multiple_convex_collisions_node());
+	ERR_FAIL_COND(!body);
+	body->set_name(String(get_name()) + "_col");
+	// Attach the body under this node, then hand it and its shapes to this node's owner (if any).
+	add_child(body);
+	Node *scene_owner = get_owner();
+	if (scene_owner) {
+		body->set_owner(scene_owner);
+		for (int idx = 0; idx < body->get_child_count(); idx++) {
+			CollisionShape3D *hull_node = Object::cast_to<CollisionShape3D>(body->get_child(idx));
+			hull_node->set_owner(scene_owner);
+		}
+	}
+}
+
 void MeshInstance3D::_notification(int p_what) {
 	if (p_what == NOTIFICATION_ENTER_TREE) {
 		_resolve_skeleton_path();
 	}
 }
 
-int MeshInstance3D::get_surface_material_count() const {
-	return materials.size();
+int MeshInstance3D::get_surface_override_material_count() const { // Number of per-surface override slots.
+	return surface_override_materials.size();
 }
 
-void MeshInstance3D::set_surface_material(int p_surface, const Ref<Material> &p_material) {
-	ERR_FAIL_INDEX(p_surface, materials.size());
+void MeshInstance3D::set_surface_override_material(int p_surface, const Ref<Material> &p_material) { // Stores the override for one surface and forwards it to the RenderingServer.
+	ERR_FAIL_INDEX(p_surface, surface_override_materials.size()); // Out-of-range surfaces are rejected before anything is stored.
 
-	materials.write[p_surface] = p_material;
+	surface_override_materials.write[p_surface] = p_material;
 
-	if (materials[p_surface].is_valid()) {
-		RS::get_singleton()->instance_set_surface_material(get_instance(), p_surface, materials[p_surface]->get_rid());
+	if (surface_override_materials[p_surface].is_valid()) {
+		RS::get_singleton()->instance_set_surface_override_material(get_instance(), p_surface, surface_override_materials[p_surface]->get_rid());
 	} else {
-		RS::get_singleton()->instance_set_surface_material(get_instance(), p_surface, RID());
+		RS::get_singleton()->instance_set_surface_override_material(get_instance(), p_surface, RID()); // A null material is sent as an empty RID.
 	}
 }
 
-Ref<Material> MeshInstance3D::get_surface_material(int p_surface) const {
-	ERR_FAIL_INDEX_V(p_surface, materials.size(), Ref<Material>());
+Ref<Material> MeshInstance3D::get_surface_override_material(int p_surface) const { // Returns the override stored for one surface.
+	ERR_FAIL_INDEX_V(p_surface, surface_override_materials.size(), Ref<Material>()); // Out-of-range surfaces yield a null Ref.
 
-	return materials[p_surface];
+	return surface_override_materials[p_surface];
 }
 
 Ref<Material> MeshInstance3D::get_active_material(int p_surface) const {
@@ -305,7 +340,7 @@ Ref<Material> MeshInstance3D::get_active_material(int p_surface) const {
 		return material_override;
 	}
 
-	Ref<Material> surface_material = get_surface_material(p_surface);
+	Ref<Material> surface_material = get_surface_override_material(p_surface);
 	if (surface_material.is_valid()) {
 		return surface_material;
 	}
@@ -320,7 +355,8 @@ Ref<Material> MeshInstance3D::get_active_material(int p_surface) const {
 
 void MeshInstance3D::_mesh_changed() {
 	ERR_FAIL_COND(mesh.is_null());
-	materials.resize(mesh->get_surface_count());
+	surface_override_materials.resize(mesh->get_surface_count());
+	update_gizmo();
 }
 
 void MeshInstance3D::create_debug_tangents() {
@@ -408,15 +444,17 @@ void MeshInstance3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_skin", "skin"), &MeshInstance3D::set_skin);
 	ClassDB::bind_method(D_METHOD("get_skin"), &MeshInstance3D::get_skin);
 
-	ClassDB::bind_method(D_METHOD("get_surface_material_count"), &MeshInstance3D::get_surface_material_count);
-	ClassDB::bind_method(D_METHOD("set_surface_material", "surface", "material"), &MeshInstance3D::set_surface_material);
-	ClassDB::bind_method(D_METHOD("get_surface_material", "surface"), &MeshInstance3D::get_surface_material);
+	ClassDB::bind_method(D_METHOD("get_surface_override_material_count"), &MeshInstance3D::get_surface_override_material_count);
+	ClassDB::bind_method(D_METHOD("set_surface_override_material", "surface", "material"), &MeshInstance3D::set_surface_override_material);
+	ClassDB::bind_method(D_METHOD("get_surface_override_material", "surface"), &MeshInstance3D::get_surface_override_material);
 	ClassDB::bind_method(D_METHOD("get_active_material", "surface"), &MeshInstance3D::get_active_material);
 
 	ClassDB::bind_method(D_METHOD("create_trimesh_collision"), &MeshInstance3D::create_trimesh_collision);
 	ClassDB::set_method_flags("MeshInstance3D", "create_trimesh_collision", METHOD_FLAGS_DEFAULT);
 	ClassDB::bind_method(D_METHOD("create_convex_collision"), &MeshInstance3D::create_convex_collision);
 	ClassDB::set_method_flags("MeshInstance3D", "create_convex_collision", METHOD_FLAGS_DEFAULT);
+	ClassDB::bind_method(D_METHOD("create_multiple_convex_collisions"), &MeshInstance3D::create_multiple_convex_collisions);
+	ClassDB::set_method_flags("MeshInstance3D", "create_multiple_convex_collisions", METHOD_FLAGS_DEFAULT);
 
 	ClassDB::bind_method(D_METHOD("create_debug_tangents"), &MeshInstance3D::create_debug_tangents);
 	ClassDB::set_method_flags("MeshInstance3D", "create_debug_tangents", METHOD_FLAGS_DEFAULT | METHOD_FLAG_EDITOR);
diff --git a/scene/3d/mesh_instance_3d.h b/scene/3d/mesh_instance_3d.h
index eb300784b1..9dea5804e0 100644
--- a/scene/3d/mesh_instance_3d.h
+++ b/scene/3d/mesh_instance_3d.h
@@ -52,7 +52,7 @@ protected:
 	};
 
 	Map<StringName, BlendShapeTrack> blend_shape_tracks;
-	Vector<Ref<Material>> materials;
+	Vector<Ref<Material>> surface_override_materials;
 
 	void _mesh_changed();
 	void _resolve_skeleton_path();
@@ -75,9 +75,9 @@ public:
 	void set_skeleton_path(const NodePath &p_skeleton);
 	NodePath get_skeleton_path();
 
-	int get_surface_material_count() const;
-	void set_surface_material(int p_surface, const Ref<Material> &p_material);
-	Ref<Material> get_surface_material(int p_surface) const;
+	int get_surface_override_material_count() const;
+	void set_surface_override_material(int p_surface, const Ref<Material> &p_material);
+	Ref<Material> get_surface_override_material(int p_surface) const;
 	Ref<Material> get_active_material(int p_surface) const;
 
 	Node *create_trimesh_collision_node();
@@ -86,6 +86,9 @@ public:
 	Node *create_convex_collision_node();
 	void create_convex_collision();
 
+	Node *create_multiple_convex_collisions_node();
+	void create_multiple_convex_collisions();
+
 	void create_debug_tangents();
 
 	virtual AABB get_aabb() const override;
diff --git a/scene/3d/navigation_agent_3d.cpp b/scene/3d/navigation_agent_3d.cpp
index 21ca3d70dd..64cfe4dca7 100644
--- a/scene/3d/navigation_agent_3d.cpp
+++ b/scene/3d/navigation_agent_3d.cpp
@@ -34,6 +34,8 @@
 #include "servers/navigation_server_3d.h"
 
 void NavigationAgent3D::_bind_methods() {
+	ClassDB::bind_method(D_METHOD("get_rid"), &NavigationAgent3D::get_rid);
+
 	ClassDB::bind_method(D_METHOD("set_target_desired_distance", "desired_distance"), &NavigationAgent3D::set_target_desired_distance);
 	ClassDB::bind_method(D_METHOD("get_target_desired_distance"), &NavigationAgent3D::get_target_desired_distance);
 
@@ -95,8 +97,11 @@ void NavigationAgent3D::_notification(int p_what) {
 	switch (p_what) {
 		case NOTIFICATION_READY: {
 			agent_parent = Object::cast_to<Node3D>(get_parent());
-
-			NavigationServer3D::get_singleton()->agent_set_callback(agent, this, "_avoidance_done");
+			if (agent_parent != nullptr) {
+				// place agent on navigation map first or else the RVO agent callback creation fails silently later
+				NavigationServer3D::get_singleton()->agent_set_map(get_rid(), agent_parent->get_world_3d()->get_navigation_map());
+				NavigationServer3D::get_singleton()->agent_set_callback(agent, this, "_avoidance_done");
+			}
 			set_physics_process_internal(true);
 		} break;
 		case NOTIFICATION_EXIT_TREE: {
@@ -106,12 +111,7 @@ void NavigationAgent3D::_notification(int p_what) {
 		case NOTIFICATION_INTERNAL_PHYSICS_PROCESS: {
 			if (agent_parent) {
 				NavigationServer3D::get_singleton()->agent_set_position(agent, agent_parent->get_global_transform().origin);
-				if (!target_reached) {
-					if (distance_to_target() < target_desired_distance) {
-						emit_signal("target_reached");
-						target_reached = true;
-					}
-				}
+				_check_distance_to_target();
 			}
 		} break;
 	}
@@ -245,17 +245,14 @@ void NavigationAgent3D::_avoidance_done(Vector3 p_new_velocity) {
 	emit_signal("velocity_computed", p_new_velocity);
 }
 
-String NavigationAgent3D::get_configuration_warning() const {
-	String warning = Node::get_configuration_warning();
+TypedArray<String> NavigationAgent3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (!Object::cast_to<Node3D>(get_parent())) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("The NavigationAgent3D can be used only under a spatial node.");
+		warnings.push_back(TTR("The NavigationAgent3D can be used only under a spatial node."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void NavigationAgent3D::update_navigation() {
@@ -312,6 +309,7 @@ void NavigationAgent3D::update_navigation() {
 		while (o.distance_to(navigation_path[nav_path_index] - Vector3(0, navigation_height_offset, 0)) < target_desired_distance) {
 			nav_path_index += 1;
 			if (nav_path_index == navigation_path.size()) {
+				_check_distance_to_target();
 				nav_path_index -= 1;
 				navigation_finished = true;
 				emit_signal("navigation_finished");
@@ -320,3 +318,12 @@ void NavigationAgent3D::update_navigation() {
 		}
 	}
 }
+
+void NavigationAgent3D::_check_distance_to_target() { // Emits "target_reached" when the agent is within target_desired_distance.
+	if (!target_reached) { // Nothing to do while the flag is already set.
+		if (distance_to_target() < target_desired_distance) {
+			emit_signal("target_reached");
+			target_reached = true; // Latch so this check does not emit the signal again.
+		}
+	}
+}
diff --git a/scene/3d/navigation_agent_3d.h b/scene/3d/navigation_agent_3d.h
index 22db889618..56da2d1acf 100644
--- a/scene/3d/navigation_agent_3d.h
+++ b/scene/3d/navigation_agent_3d.h
@@ -143,10 +143,11 @@ public:
 	void set_velocity(Vector3 p_velocity);
 	void _avoidance_done(Vector3 p_new_velocity);
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 private:
 	void update_navigation();
+	void _check_distance_to_target();
 };
 
 #endif
diff --git a/scene/3d/navigation_obstacle_3d.cpp b/scene/3d/navigation_obstacle_3d.cpp
index df03bca4fd..20ffc3b00e 100644
--- a/scene/3d/navigation_obstacle_3d.cpp
+++ b/scene/3d/navigation_obstacle_3d.cpp
@@ -76,17 +76,14 @@ NavigationObstacle3D::~NavigationObstacle3D() {
 	agent = RID(); // Pointless
 }
 
-String NavigationObstacle3D::get_configuration_warning() const {
-	String warning = Node::get_configuration_warning();
+TypedArray<String> NavigationObstacle3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
-	if (!parent_node3d) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("The NavigationObstacle3D only serves to provide collision avoidance to a spatial object.");
+	if (!Object::cast_to<Node3D>(get_parent())) {
+		warnings.push_back(TTR("The NavigationObstacle3D only serves to provide collision avoidance to a spatial object."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void NavigationObstacle3D::update_agent_shape() {
diff --git a/scene/3d/navigation_obstacle_3d.h b/scene/3d/navigation_obstacle_3d.h
index b1bb53724a..2f78f624a4 100644
--- a/scene/3d/navigation_obstacle_3d.h
+++ b/scene/3d/navigation_obstacle_3d.h
@@ -52,7 +52,7 @@ public:
 		return agent;
 	}
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 private:
 	void update_agent_shape();
diff --git a/scene/3d/navigation_region_3d.cpp b/scene/3d/navigation_region_3d.cpp
index 3ca704e4b8..0afad62404 100644
--- a/scene/3d/navigation_region_3d.cpp
+++ b/scene/3d/navigation_region_3d.cpp
@@ -135,7 +135,7 @@ void NavigationRegion3D::set_navigation_mesh(const Ref<NavigationMesh> &p_navmes
 	emit_signal("navigation_mesh_changed");
 
 	update_gizmo();
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 Ref<NavigationMesh> NavigationRegion3D::get_navigation_mesh() const {
@@ -177,21 +177,16 @@ void NavigationRegion3D::_bake_finished(Ref<NavigationMesh> p_nav_mesh) {
 	emit_signal("bake_finished");
 }
 
-String NavigationRegion3D::get_configuration_warning() const {
-	if (!is_visible_in_tree() || !is_inside_tree()) {
-		return String();
-	}
-
-	String warning = Node3D::get_configuration_warning();
+TypedArray<String> NavigationRegion3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
-	if (!navmesh.is_valid()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
+	if (is_visible_in_tree() && is_inside_tree()) {
+		if (!navmesh.is_valid()) {
+			warnings.push_back(TTR("A NavigationMesh resource must be set or created for this node to work."));
 		}
-		warning += TTR("A NavigationMesh resource must be set or created for this node to work.");
 	}
 
-	return warning;
+	return warnings;
 }
 
 void NavigationRegion3D::_bind_methods() {
@@ -217,7 +212,7 @@ void NavigationRegion3D::_bind_methods() {
 
 void NavigationRegion3D::_navigation_changed() {
 	update_gizmo();
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 NavigationRegion3D::NavigationRegion3D() {
diff --git a/scene/3d/navigation_region_3d.h b/scene/3d/navigation_region_3d.h
index 52fa2d6159..c2045215b1 100644
--- a/scene/3d/navigation_region_3d.h
+++ b/scene/3d/navigation_region_3d.h
@@ -66,7 +66,7 @@ public:
 	void bake_navigation_mesh();
 	void _bake_finished(Ref<NavigationMesh> p_nav_mesh);
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	NavigationRegion3D();
 	~NavigationRegion3D();
diff --git a/scene/3d/occluder_instance_3d.cpp b/scene/3d/occluder_instance_3d.cpp
new file mode 100644
index 0000000000..d3a256db34
--- /dev/null
+++ b/scene/3d/occluder_instance_3d.cpp
@@ -0,0 +1,335 @@
+/*************************************************************************/
+/*  occluder_instance_3d.cpp                                             */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "occluder_instance_3d.h"
+#include "core/core_string_names.h"
+#include "scene/3d/mesh_instance_3d.h"
+
+RID Occluder3D::get_rid() const { // Returns the rendering-server occluder RID, creating it on first call.
+	if (!occluder.is_valid()) {
+		occluder = RS::get_singleton()->occluder_create(); // `occluder` is declared mutable, so it can be created from this const getter.
+		RS::get_singleton()->occluder_set_mesh(occluder, vertices, indices); // Upload the geometry already stored on this resource.
+	}
+	return occluder;
+}
+
+void Occluder3D::set_vertices(PackedVector3Array p_vertices) { // Replaces the vertex array and refreshes derived state.
+	vertices = p_vertices;
+	if (occluder.is_valid()) { // Only push to the server once the occluder RID exists.
+		RS::get_singleton()->occluder_set_mesh(occluder, vertices, indices);
+	}
+	_update_changes(); // Recomputes the AABB, drops cached debug data and calls emit_changed().
+}
+
+PackedVector3Array Occluder3D::get_vertices() const { // Returns the stored vertex array.
+	return vertices;
+}
+
+void Occluder3D::set_indices(PackedInt32Array p_indices) { // Replaces the index array and refreshes derived state.
+	indices = p_indices;
+	if (occluder.is_valid()) { // Only push to the server once the occluder RID exists.
+		RS::get_singleton()->occluder_set_mesh(occluder, vertices, indices);
+	}
+	_update_changes(); // Recomputes the AABB, drops cached debug data and calls emit_changed().
+}
+
+PackedInt32Array Occluder3D::get_indices() const { // Returns the stored index array.
+	return indices;
+}
+
+void Occluder3D::_update_changes() { // Refreshes the cached AABB and invalidates debug data after a geometry change.
+	// Seed the box with the first vertex; starting from a default AABB would always pull the origin into the bounds.
+	aabb = vertices.is_empty() ? AABB() : AABB(vertices[0], Vector3());
+	const Vector3 *ptr = vertices.ptr();
+	for (int i = 1; i < vertices.size(); i++) {
+		aabb.expand_to(ptr[i]);
+	}
+
+	debug_lines.clear(); // Cached debug geometry is rebuilt on demand by get_debug_lines()/get_debug_mesh().
+	debug_mesh.unref();
+
+	emit_changed();
+}
+
+Vector<Vector3> Occluder3D::get_debug_lines() const { // Returns cached line-segment endpoints (two per triangle edge) for debug drawing.
+	if (!debug_lines.is_empty()) {
+		return debug_lines; // Reuse the list built by a previous call.
+	}
+	Vector<Vector3> lines; // Built locally so a failed index check never leaves a partial list in the cache.
+	if (indices.size() % 3 != 0) {
+		return Vector<Vector3>(); // Not a whole number of triangles.
+	}
+	for (int i = 0; i < indices.size() / 3; i++) {
+		for (int j = 0; j < 3; j++) {
+			int a = indices[i * 3 + j];
+			int b = indices[i * 3 + (j + 1) % 3]; // Next corner of the same triangle, wrapping back to the first.
+			ERR_FAIL_INDEX_V_MSG(a, vertices.size(), Vector<Vector3>(), "Occluder indices are out of range.");
+			ERR_FAIL_INDEX_V_MSG(b, vertices.size(), Vector<Vector3>(), "Occluder indices are out of range.");
+			lines.push_back(vertices[a]);
+			lines.push_back(vertices[b]);
+		}
+	}
+	debug_lines = lines; // Commit to the mutable cache only after every index validated.
+	return debug_lines;
+}
+
+Ref<ArrayMesh> Occluder3D::get_debug_mesh() const { // Returns a cached triangle mesh built from the stored vertices and indices.
+	if (debug_mesh.is_valid()) {
+		return debug_mesh; // Reuse the mesh built by a previous call.
+	}
+
+	if (indices.size() % 3 != 0) {
+		return debug_mesh; // Not a whole number of triangles: the still-invalid reference is returned.
+	}
+
+	Array arrays;
+	arrays.resize(Mesh::ARRAY_MAX);
+	arrays[Mesh::ARRAY_VERTEX] = vertices;
+	arrays[Mesh::ARRAY_INDEX] = indices;
+
+	debug_mesh.instance(); // `debug_mesh` is declared mutable, so the cache can be filled from this const getter.
+	debug_mesh->add_surface_from_arrays(Mesh::PRIMITIVE_TRIANGLES, arrays);
+	return debug_mesh;
+}
+
+AABB Occluder3D::get_aabb() const { // Returns the bounds last computed by _update_changes().
+	return aabb;
+}
+
+void Occluder3D::_bind_methods() { // Registers the vertex/index accessors and their properties with ClassDB.
+	ClassDB::bind_method(D_METHOD("set_vertices", "vertices"), &Occluder3D::set_vertices);
+	ClassDB::bind_method(D_METHOD("get_vertices"), &Occluder3D::get_vertices);
+
+	ClassDB::bind_method(D_METHOD("set_indices", "indices"), &Occluder3D::set_indices);
+	ClassDB::bind_method(D_METHOD("get_indices"), &Occluder3D::get_indices);
+
+	ADD_PROPERTY(PropertyInfo(Variant::PACKED_VECTOR3_ARRAY, "vertices", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "set_vertices", "get_vertices"); // Both properties are registered with PROPERTY_USAGE_NOEDITOR.
+	ADD_PROPERTY(PropertyInfo(Variant::PACKED_INT32_ARRAY, "indices", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR), "set_indices", "get_indices");
+}
+
+Occluder3D::Occluder3D() {
+}
+
+Occluder3D::~Occluder3D() { // Releases the server-side occluder if one was created.
+	if (occluder.is_valid()) { // The RID only exists once get_rid() has been called.
+		RS::get_singleton()->free(occluder);
+	}
+}
+/////////////////////////////////////////////////
+
+AABB OccluderInstance3D::get_aabb() const { // Bounds of the assigned occluder resource, or a default AABB when none is set.
+	if (occluder.is_valid()) {
+		return occluder->get_aabb();
+	}
+	return AABB();
+}
+
+Vector<Face3> OccluderInstance3D::get_faces(uint32_t p_usage_flags) const { // Always returns an empty list; p_usage_flags is not used.
+	return Vector<Face3>();
+}
+
+void OccluderInstance3D::set_occluder(const Ref<Occluder3D> &p_occluder) { // Swaps the occluder resource, rewiring the changed-signal connection and the instance base.
+	if (occluder == p_occluder) {
+		return; // Same resource: avoid disconnecting and reconnecting.
+	}
+
+	if (occluder.is_valid()) { // Stop listening to the resource being replaced.
+		occluder->disconnect(CoreStringNames::get_singleton()->changed, callable_mp(this, &OccluderInstance3D::_occluder_changed));
+	}
+
+	occluder = p_occluder;
+
+	if (occluder.is_valid()) {
+		set_base(occluder->get_rid()); // get_rid() creates the server-side occluder if it does not exist yet.
+		occluder->connect(CoreStringNames::get_singleton()->changed, callable_mp(this, &OccluderInstance3D::_occluder_changed));
+	} else {
+		set_base(RID()); // No resource: clear the instance base.
+	}
+
+	update_gizmo();
+}
+
+void OccluderInstance3D::_occluder_changed() { // Connected to the occluder resource's changed signal by set_occluder().
+	update_gizmo();
+}
+
+Ref<Occluder3D> OccluderInstance3D::get_occluder() const { // Returns the assigned occluder resource (may be a null reference).
+	return occluder;
+}
+
+void OccluderInstance3D::set_bake_mask(uint32_t p_mask) { // Stores the layer mask that _bake_node() tests mesh layers against.
+	bake_mask = p_mask;
+}
+
+uint32_t OccluderInstance3D::get_bake_mask() const { // Returns the stored bake layer mask.
+	return bake_mask;
+}
+
+void OccluderInstance3D::set_bake_mask_bit(int p_layer, bool p_enable) { // Sets or clears one bit (0-31) of the bake mask.
+	ERR_FAIL_INDEX(p_layer, 32);
+	if (p_enable) {
+		set_bake_mask(bake_mask | (1U << p_layer)); // Unsigned literal: a signed 1 shifted into bit 31 does not fit in int.
+	} else {
+		set_bake_mask(bake_mask & (~(1U << p_layer)));
+	}
+}
+
+bool OccluderInstance3D::get_bake_mask_bit(int p_layer) const { // Returns whether bit `p_layer` (0-31) of the bake mask is set.
+	ERR_FAIL_INDEX_V(p_layer, 32, false);
+	return (bake_mask & (1U << p_layer)); // Unsigned literal keeps the test in uint32_t, including for bit 31.
+}
+
+bool OccluderInstance3D::_bake_material_check(Ref<Material> p_material) { // Returns false for a StandardMaterial3D whose transparency is not disabled.
+	StandardMaterial3D *standard_mat = Object::cast_to<StandardMaterial3D>(p_material.ptr());
+	if (standard_mat && standard_mat->get_transparency() != StandardMaterial3D::TRANSPARENCY_DISABLED) {
+		return false;
+	}
+	return true; // Anything that does not cast to StandardMaterial3D is accepted.
+}
+
+void OccluderInstance3D::_bake_node(Node *p_node, PackedVector3Array &r_vertices, PackedInt32Array &r_indices) { // Recursively appends geometry from p_node and its owned descendants to the output arrays.
+	MeshInstance3D *mi = Object::cast_to<MeshInstance3D>(p_node);
+	if (mi && mi->is_visible_in_tree()) { // Only visible MeshInstance3D nodes contribute geometry.
+		Ref<Mesh> mesh = mi->get_mesh();
+		bool valid = true;
+
+		if (mesh.is_null()) {
+			valid = false;
+		}
+
+		if (valid && !_bake_material_check(mi->get_material_override())) { // A rejected override material excludes the whole mesh.
+			valid = false;
+		}
+
+		if ((mi->get_layer_mask() & bake_mask) == 0) { // No layer in common with the bake mask.
+			valid = false;
+		}
+
+		if (valid) {
+			Transform global_to_local = get_global_transform().affine_inverse() * mi->get_global_transform(); // Maps mesh-local points into this node's local space.
+
+			for (int i = 0; i < mesh->get_surface_count(); i++) {
+				if (mesh->surface_get_primitive_type(i) != Mesh::PRIMITIVE_TRIANGLES) {
+					continue; // Only triangle surfaces are baked.
+				}
+
+				if (mi->get_surface_override_material(i).is_valid()) { // A per-surface override, when set, is checked instead of the mesh's own material.
+					if (!_bake_material_check(mi->get_surface_override_material(i))) {
+						continue;
+					}
+				} else {
+					if (!_bake_material_check(mesh->surface_get_material(i))) {
+						continue;
+					}
+				}
+
+				Array arrays = mesh->surface_get_arrays(i);
+
+				int vertex_offset = r_vertices.size(); // Position of this surface's first vertex in the combined array.
+				PackedVector3Array vertices = arrays[Mesh::ARRAY_VERTEX];
+				r_vertices.resize(r_vertices.size() + vertices.size());
+
+				Vector3 *vtx_ptr = r_vertices.ptrw();
+				for (int j = 0; j < vertices.size(); j++) {
+					vtx_ptr[vertex_offset + j] = global_to_local.xform(vertices[j]);
+				}
+
+				int index_offset = r_indices.size();
+				PackedInt32Array indices = arrays[Mesh::ARRAY_INDEX]; // NOTE(review): presumably empty for non-indexed surfaces, leaving the vertices above unreferenced - confirm.
+				r_indices.resize(r_indices.size() + indices.size());
+
+				int *idx_ptr = r_indices.ptrw();
+				for (int j = 0; j < indices.size(); j++) {
+					idx_ptr[index_offset + j] = vertex_offset + indices[j]; // Rebase the surface's indices onto the combined vertex array.
+				}
+			}
+		}
+	}
+
+	for (int i = 0; i < p_node->get_child_count(); i++) { // Recurse regardless of whether p_node itself contributed geometry.
+		Node *child = p_node->get_child(i);
+		if (!child->get_owner()) {
+			continue; //maybe a helper
+		}
+
+		_bake_node(child, r_vertices, r_indices);
+	}
+}
+
+OccluderInstance3D::BakeError OccluderInstance3D::bake(Node *p_from_node, String p_occluder_path) { // Collects geometry under p_from_node into an Occluder3D and assigns it to this node.
+	if (p_occluder_path == "") {
+		if (get_occluder().is_null()) { // An empty path is only accepted when an existing occluder can be reused.
+			return BAKE_ERROR_NO_SAVE_PATH;
+		}
+	}
+
+	PackedVector3Array vertices;
+	PackedInt32Array indices;
+
+	_bake_node(p_from_node, vertices, indices);
+
+	if (vertices.is_empty() || indices.is_empty()) {
+		return BAKE_ERROR_NO_MESHES; // Nothing usable was collected.
+	}
+
+	Ref<Occluder3D> occ;
+	if (get_occluder().is_valid()) {
+		occ = get_occluder(); // Reuse the current resource; p_occluder_path is not applied in this branch.
+	} else {
+		occ.instance();
+		occ->set_path(p_occluder_path);
+	}
+
+	occ->set_vertices(vertices);
+	occ->set_indices(indices);
+	set_occluder(occ); // Returns early inside set_occluder() when the existing resource was reused.
+
+	return BAKE_ERROR_OK;
+}
+
+void OccluderInstance3D::_bind_methods() { // Registers the bake-mask and occluder accessors and their properties with ClassDB.
+	ClassDB::bind_method(D_METHOD("set_bake_mask", "mask"), &OccluderInstance3D::set_bake_mask);
+	ClassDB::bind_method(D_METHOD("get_bake_mask"), &OccluderInstance3D::get_bake_mask);
+	ClassDB::bind_method(D_METHOD("set_bake_mask_bit", "layer", "enabled"), &OccluderInstance3D::set_bake_mask_bit);
+	ClassDB::bind_method(D_METHOD("get_bake_mask_bit", "layer"), &OccluderInstance3D::get_bake_mask_bit);
+
+	ClassDB::bind_method(D_METHOD("set_occluder", "occluder"), &OccluderInstance3D::set_occluder);
+	ClassDB::bind_method(D_METHOD("get_occluder"), &OccluderInstance3D::get_occluder);
+
+	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "occluder", PROPERTY_HINT_RESOURCE_TYPE, "Occluder3D"), "set_occluder", "get_occluder");
+	ADD_GROUP("Bake", "bake_"); // Properties below with the "bake_" prefix are listed under the "Bake" group.
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "bake_mask", PROPERTY_HINT_LAYERS_3D_RENDER), "set_bake_mask", "get_bake_mask");
+}
+
+OccluderInstance3D::OccluderInstance3D() {
+}
+
+OccluderInstance3D::~OccluderInstance3D() {
+}
diff --git a/scene/3d/occluder_instance_3d.h b/scene/3d/occluder_instance_3d.h
new file mode 100644
index 0000000000..4bb468274d
--- /dev/null
+++ b/scene/3d/occluder_instance_3d.h
@@ -0,0 +1,108 @@
+/*************************************************************************/
+/*  occluder_instance_3d.h                                               */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef OCCLUDER_INSTANCE_3D_H
+#define OCCLUDER_INSTANCE_3D_H
+
+#include "scene/3d/visual_instance_3d.h"
+
+class Occluder3D : public Resource { // Resource holding indexed triangle geometry (vertices + indices) for an occluder.
+	GDCLASS(Occluder3D, Resource);
+	RES_BASE_EXTENSION("occ");
+
+	mutable RID occluder; // Created on demand by the const get_rid(), hence mutable.
+	mutable Ref<ArrayMesh> debug_mesh; // Cache filled by the const get_debug_mesh().
+	mutable Vector<Vector3> debug_lines; // Cache filled by the const get_debug_lines().
+	AABB aabb; // Recomputed by _update_changes().
+
+	PackedVector3Array vertices;
+	PackedInt32Array indices; // Consumed three at a time as triangles by the debug getters.
+
+	void _update_changes(); // Called by both setters after the geometry is replaced.
+
+protected:
+	static void _bind_methods();
+
+public:
+	void set_vertices(PackedVector3Array p_vertices);
+	PackedVector3Array get_vertices() const;
+
+	void set_indices(PackedInt32Array p_indices);
+	PackedInt32Array get_indices() const;
+
+	Vector<Vector3> get_debug_lines() const; // Pairs of points, one pair per triangle edge.
+	Ref<ArrayMesh> get_debug_mesh() const;
+	AABB get_aabb() const;
+
+	virtual RID get_rid() const override; // Creates the rendering-server occluder on first call.
+	Occluder3D();
+	~Occluder3D();
+};
+
+class OccluderInstance3D : public VisualInstance3D { // Scene node that instances an Occluder3D resource and can bake one from meshes.
+	GDCLASS(OccluderInstance3D, VisualInstance3D); // Must name the direct base class so the registered inheritance chain matches the C++ one.
+
+private:
+	Ref<Occluder3D> occluder;
+	uint32_t bake_mask = 0xFFFFFFFF; // All 32 layer bits set by default.
+
+	void _occluder_changed(); // Connected to the occluder resource's changed signal.
+
+	bool _bake_material_check(Ref<Material> p_material); // False for StandardMaterial3D with transparency enabled.
+	void _bake_node(Node *p_node, PackedVector3Array &r_vertices, PackedInt32Array &r_indices); // Recursive geometry collection used by bake().
+
+protected:
+	static void _bind_methods();
+
+public:
+	enum BakeError { // Result codes returned by bake().
+		BAKE_ERROR_OK,
+		BAKE_ERROR_NO_SAVE_PATH, // Empty path given and no occluder resource to reuse.
+		BAKE_ERROR_NO_MESHES, // No vertices or no indices were collected.
+	};
+
+	void set_occluder(const Ref<Occluder3D> &p_occluder);
+	Ref<Occluder3D> get_occluder() const;
+
+	virtual AABB get_aabb() const override;
+	virtual Vector<Face3> get_faces(uint32_t p_usage_flags) const override; // Always returns an empty list.
+
+	void set_bake_mask(uint32_t p_mask);
+	uint32_t get_bake_mask() const;
+
+	void set_bake_mask_bit(int p_layer, bool p_enable); // p_layer must be in 0-31.
+	bool get_bake_mask_bit(int p_layer) const;
+	BakeError bake(Node *p_from_node, String p_occluder_path = ""); // Path is only used when a new Occluder3D has to be created.
+
+	OccluderInstance3D();
+	~OccluderInstance3D();
+};
+
+#endif
diff --git a/scene/3d/path_3d.cpp b/scene/3d/path_3d.cpp
index 7e2601902b..4ec4ee6207 100644
--- a/scene/3d/path_3d.cpp
+++ b/scene/3d/path_3d.cpp
@@ -50,7 +50,7 @@ void Path3D::_curve_changed() {
 		for (int i = 0; i < get_child_count(); i++) {
 			PathFollow3D *child = Object::cast_to<PathFollow3D>(get_child(i));
 			if (child) {
-				child->update_configuration_warning();
+				child->update_configuration_warnings();
 			}
 		}
 	}
@@ -241,29 +241,21 @@ void PathFollow3D::_validate_property(PropertyInfo &property) const {
 	}
 }
 
-String PathFollow3D::get_configuration_warning() const {
-	if (!is_visible_in_tree() || !is_inside_tree()) {
-		return String();
-	}
-
-	String warning = Node3D::get_configuration_warning();
+TypedArray<String> PathFollow3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
-	if (!Object::cast_to<Path3D>(get_parent())) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("PathFollow3D only works when set as a child of a Path3D node.");
-	} else {
-		Path3D *path = Object::cast_to<Path3D>(get_parent());
-		if (path->get_curve().is_valid() && !path->get_curve()->is_up_vector_enabled() && rotation_mode == ROTATION_ORIENTED) {
-			if (!warning.is_empty()) {
-				warning += "\n\n";
+	if (is_visible_in_tree() && is_inside_tree()) {
+		if (!Object::cast_to<Path3D>(get_parent())) {
+			warnings.push_back(TTR("PathFollow3D only works when set as a child of a Path3D node."));
+		} else {
+			Path3D *path = Object::cast_to<Path3D>(get_parent());
+			if (path->get_curve().is_valid() && !path->get_curve()->is_up_vector_enabled() && rotation_mode == ROTATION_ORIENTED) {
+				warnings.push_back(TTR("PathFollow3D's ROTATION_ORIENTED requires \"Up Vector\" to be enabled in its parent Path3D's Curve resource."));
 			}
-			warning += TTR("PathFollow3D's ROTATION_ORIENTED requires \"Up Vector\" to be enabled in its parent Path3D's Curve resource.");
 		}
 	}
 
-	return warning;
+	return warnings;
 }
 
 void PathFollow3D::_bind_methods() {
@@ -368,7 +360,7 @@ float PathFollow3D::get_unit_offset() const {
 void PathFollow3D::set_rotation_mode(RotationMode p_rotation_mode) {
 	rotation_mode = p_rotation_mode;
 
-	update_configuration_warning();
+	update_configuration_warnings();
 	_update_transform();
 }
 
diff --git a/scene/3d/path_3d.h b/scene/3d/path_3d.h
index 17ee47593e..8545370a4a 100644
--- a/scene/3d/path_3d.h
+++ b/scene/3d/path_3d.h
@@ -104,7 +104,7 @@ public:
 	void set_cubic_interpolation(bool p_enable);
 	bool get_cubic_interpolation() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	PathFollow3D() {}
 };
diff --git a/scene/3d/physics_body_3d.cpp b/scene/3d/physics_body_3d.cpp
index e225c1f22d..93d3e946fd 100644
--- a/scene/3d/physics_body_3d.cpp
+++ b/scene/3d/physics_body_3d.cpp
@@ -55,52 +55,6 @@ real_t PhysicsBody3D::get_inverse_mass() const {
 	return 0;
 }
 
-void PhysicsBody3D::set_collision_layer(uint32_t p_layer) {
-	collision_layer = p_layer;
-	PhysicsServer3D::get_singleton()->body_set_collision_layer(get_rid(), p_layer);
-}
-
-uint32_t PhysicsBody3D::get_collision_layer() const {
-	return collision_layer;
-}
-
-void PhysicsBody3D::set_collision_mask(uint32_t p_mask) {
-	collision_mask = p_mask;
-	PhysicsServer3D::get_singleton()->body_set_collision_mask(get_rid(), p_mask);
-}
-
-uint32_t PhysicsBody3D::get_collision_mask() const {
-	return collision_mask;
-}
-
-void PhysicsBody3D::set_collision_mask_bit(int p_bit, bool p_value) {
-	uint32_t mask = get_collision_mask();
-	if (p_value) {
-		mask |= 1 << p_bit;
-	} else {
-		mask &= ~(1 << p_bit);
-	}
-	set_collision_mask(mask);
-}
-
-bool PhysicsBody3D::get_collision_mask_bit(int p_bit) const {
-	return get_collision_mask() & (1 << p_bit);
-}
-
-void PhysicsBody3D::set_collision_layer_bit(int p_bit, bool p_value) {
-	uint32_t mask = get_collision_layer();
-	if (p_value) {
-		mask |= 1 << p_bit;
-	} else {
-		mask &= ~(1 << p_bit);
-	}
-	set_collision_layer(mask);
-}
-
-bool PhysicsBody3D::get_collision_layer_bit(int p_bit) const {
-	return get_collision_layer() & (1 << p_bit);
-}
-
 TypedArray<PhysicsBody3D> PhysicsBody3D::get_collision_exceptions() {
 	List<RID> exceptions;
 	PhysicsServer3D::get_singleton()->body_get_collision_exceptions(get_rid(), &exceptions);
@@ -129,29 +83,11 @@ void PhysicsBody3D::remove_collision_exception_with(Node *p_node) {
 	PhysicsServer3D::get_singleton()->body_remove_collision_exception(get_rid(), collision_object->get_rid());
 }
 
-void PhysicsBody3D::_bind_methods() {
-	ClassDB::bind_method(D_METHOD("set_collision_layer", "layer"), &PhysicsBody3D::set_collision_layer);
-	ClassDB::bind_method(D_METHOD("get_collision_layer"), &PhysicsBody3D::get_collision_layer);
-
-	ClassDB::bind_method(D_METHOD("set_collision_mask", "mask"), &PhysicsBody3D::set_collision_mask);
-	ClassDB::bind_method(D_METHOD("get_collision_mask"), &PhysicsBody3D::get_collision_mask);
-
-	ClassDB::bind_method(D_METHOD("set_collision_mask_bit", "bit", "value"), &PhysicsBody3D::set_collision_mask_bit);
-	ClassDB::bind_method(D_METHOD("get_collision_mask_bit", "bit"), &PhysicsBody3D::get_collision_mask_bit);
-
-	ClassDB::bind_method(D_METHOD("set_collision_layer_bit", "bit", "value"), &PhysicsBody3D::set_collision_layer_bit);
-	ClassDB::bind_method(D_METHOD("get_collision_layer_bit", "bit"), &PhysicsBody3D::get_collision_layer_bit);
-
-	ADD_GROUP("Collision", "collision_");
-	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_layer", PROPERTY_HINT_LAYERS_3D_PHYSICS), "set_collision_layer", "get_collision_layer");
-	ADD_PROPERTY(PropertyInfo(Variant::INT, "collision_mask", PROPERTY_HINT_LAYERS_3D_PHYSICS), "set_collision_mask", "get_collision_mask");
-}
+void PhysicsBody3D::_bind_methods() {}
 
 PhysicsBody3D::PhysicsBody3D(PhysicsServer3D::BodyMode p_mode) :
 		CollisionObject3D(PhysicsServer3D::get_singleton()->body_create(), false) {
 	PhysicsServer3D::get_singleton()->body_set_mode(get_rid(), p_mode);
-	collision_layer = 1;
-	collision_mask = 1;
 }
 
 void StaticBody3D::set_physics_material_override(const Ref<PhysicsMaterial> &p_physics_material_override) {
@@ -338,6 +274,7 @@ struct _RigidBodyInOut {
 void RigidBody3D::_direct_state_changed(Object *p_state) {
 #ifdef DEBUG_ENABLED
 	state = Object::cast_to<PhysicsDirectBodyState3D>(p_state);
+	ERR_FAIL_NULL_MSG(state, "Method '_direct_state_changed' must receive a valid PhysicsDirectBodyState3D object as argument");
 #else
 	state = (PhysicsDirectBodyState3D *)p_state; //trust it
 #endif
@@ -444,7 +381,7 @@ void RigidBody3D::_notification(int p_what) {
 
 	if (p_what == NOTIFICATION_LOCAL_TRANSFORM_CHANGED) {
 		if (Engine::get_singleton()->is_editor_hint()) {
-			update_configuration_warning();
+			update_configuration_warnings();
 		}
 	}
 
@@ -469,7 +406,7 @@ void RigidBody3D::set_mode(Mode p_mode) {
 			PhysicsServer3D::get_singleton()->body_set_mode(get_rid(), PhysicsServer3D::BODY_MODE_KINEMATIC);
 		} break;
 	}
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 RigidBody3D::Mode RigidBody3D::get_mode() const {
@@ -709,19 +646,16 @@ Array RigidBody3D::get_colliding_bodies() const {
 	return ret;
 }
 
-String RigidBody3D::get_configuration_warning() const {
+TypedArray<String> RigidBody3D::get_configuration_warnings() const {
 	Transform t = get_transform();
 
-	String warning = CollisionObject3D::get_configuration_warning();
+	TypedArray<String> warnings = CollisionObject3D::get_configuration_warnings(); // Chain to the direct parent (as before the rename) so its warnings are not dropped.
 
 	if ((get_mode() == MODE_RIGID || get_mode() == MODE_CHARACTER) && (ABS(t.basis.get_axis(0).length() - 1.0) > 0.05 || ABS(t.basis.get_axis(1).length() - 1.0) > 0.05 || ABS(t.basis.get_axis(2).length() - 1.0) > 0.05)) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("Size changes to RigidBody3D (in character or rigid modes) will be overridden by the physics engine when running.\nChange the size in children collision shapes instead.");
+		warnings.push_back(TTR("Size changes to RigidBody3D (in character or rigid modes) will be overridden by the physics engine when running.\nChange the size in children collision shapes instead."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void RigidBody3D::_bind_methods() {
@@ -779,8 +713,6 @@ void RigidBody3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_can_sleep", "able_to_sleep"), &RigidBody3D::set_can_sleep);
 	ClassDB::bind_method(D_METHOD("is_able_to_sleep"), &RigidBody3D::is_able_to_sleep);
 
-	ClassDB::bind_method(D_METHOD("_direct_state_changed"), &RigidBody3D::_direct_state_changed);
-
 	ClassDB::bind_method(D_METHOD("set_axis_lock", "axis", "lock"), &RigidBody3D::set_axis_lock);
 	ClassDB::bind_method(D_METHOD("get_axis_lock", "axis"), &RigidBody3D::get_axis_lock);
 
@@ -826,7 +758,7 @@ void RigidBody3D::_bind_methods() {
 
 RigidBody3D::RigidBody3D() :
 		PhysicsBody3D(PhysicsServer3D::BODY_MODE_RIGID) {
-	PhysicsServer3D::get_singleton()->body_set_force_integration_callback(get_rid(), this, "_direct_state_changed");
+	PhysicsServer3D::get_singleton()->body_set_force_integration_callback(get_rid(), callable_mp(this, &RigidBody3D::_direct_state_changed));
 }
 
 RigidBody3D::~RigidBody3D() {
@@ -1160,8 +1092,6 @@ void KinematicBody3D::_notification(int p_what) {
 }
 
 void KinematicBody3D::_bind_methods() {
-	ClassDB::bind_method(D_METHOD("_direct_state_changed"), &KinematicBody3D::_direct_state_changed);
-
 	ClassDB::bind_method(D_METHOD("move_and_collide", "rel_vec", "infinite_inertia", "exclude_raycast_shapes", "test_only"), &KinematicBody3D::_move, DEFVAL(true), DEFVAL(true), DEFVAL(false));
 	ClassDB::bind_method(D_METHOD("move_and_slide", "linear_velocity", "up_direction", "stop_on_slope", "max_slides", "floor_max_angle", "infinite_inertia"), &KinematicBody3D::move_and_slide, DEFVAL(Vector3(0, 0, 0)), DEFVAL(false), DEFVAL(4), DEFVAL(Math::deg2rad((real_t)45.0)), DEFVAL(true));
 	ClassDB::bind_method(D_METHOD("move_and_slide_with_snap", "linear_velocity", "snap", "up_direction", "stop_on_slope", "max_slides", "floor_max_angle", "infinite_inertia"), &KinematicBody3D::move_and_slide_with_snap, DEFVAL(Vector3(0, 0, 0)), DEFVAL(false), DEFVAL(4), DEFVAL(Math::deg2rad((real_t)45.0)), DEFVAL(true));
@@ -1194,6 +1124,7 @@ void KinematicBody3D::_bind_methods() {
 void KinematicBody3D::_direct_state_changed(Object *p_state) {
 #ifdef DEBUG_ENABLED
 	PhysicsDirectBodyState3D *state = Object::cast_to<PhysicsDirectBodyState3D>(p_state);
+	ERR_FAIL_NULL_MSG(state, "Method '_direct_state_changed' must receive a valid PhysicsDirectBodyState3D object as argument");
 #else
 	PhysicsDirectBodyState3D *state = (PhysicsDirectBodyState3D *)p_state; //trust it
 #endif
@@ -1205,7 +1136,7 @@ void KinematicBody3D::_direct_state_changed(Object *p_state) {
 KinematicBody3D::KinematicBody3D() :
 		PhysicsBody3D(PhysicsServer3D::BODY_MODE_KINEMATIC) {
 	set_safe_margin(0.001);
-	PhysicsServer3D::get_singleton()->body_set_force_integration_callback(get_rid(), this, "_direct_state_changed");
+	PhysicsServer3D::get_singleton()->body_set_force_integration_callback(get_rid(), callable_mp(this, &KinematicBody3D::_direct_state_changed));
 }
 
 KinematicBody3D::~KinematicBody3D() {
@@ -2044,6 +1975,7 @@ void PhysicalBone3D::_direct_state_changed(Object *p_state) {
 
 #ifdef DEBUG_ENABLED
 	state = Object::cast_to<PhysicsDirectBodyState3D>(p_state);
+	ERR_FAIL_NULL_MSG(state, "Method '_direct_state_changed' must receive a valid PhysicsDirectBodyState3D object as argument");
 #else
 	state = (PhysicsDirectBodyState3D *)p_state; //trust it
 #endif
@@ -2066,8 +1998,6 @@ void PhysicalBone3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("apply_central_impulse", "impulse"), &PhysicalBone3D::apply_central_impulse);
 	ClassDB::bind_method(D_METHOD("apply_impulse", "impulse", "position"), &PhysicalBone3D::apply_impulse, Vector3());
 
-	ClassDB::bind_method(D_METHOD("_direct_state_changed"), &PhysicalBone3D::_direct_state_changed);
-
 	ClassDB::bind_method(D_METHOD("set_joint_type", "joint_type"), &PhysicalBone3D::set_joint_type);
 	ClassDB::bind_method(D_METHOD("get_joint_type"), &PhysicalBone3D::get_joint_type);
 
@@ -2546,7 +2476,7 @@ void PhysicalBone3D::_start_physics_simulation() {
 	PhysicsServer3D::get_singleton()->body_set_mode(get_rid(), PhysicsServer3D::BODY_MODE_RIGID);
 	PhysicsServer3D::get_singleton()->body_set_collision_layer(get_rid(), get_collision_layer());
 	PhysicsServer3D::get_singleton()->body_set_collision_mask(get_rid(), get_collision_mask());
-	PhysicsServer3D::get_singleton()->body_set_force_integration_callback(get_rid(), this, "_direct_state_changed");
+	PhysicsServer3D::get_singleton()->body_set_force_integration_callback(get_rid(), callable_mp(this, &PhysicalBone3D::_direct_state_changed));
 	set_as_top_level(true);
 	_internal_simulate_physics = true;
 }
@@ -2565,7 +2495,7 @@ void PhysicalBone3D::_stop_physics_simulation() {
 		PhysicsServer3D::get_singleton()->body_set_collision_mask(get_rid(), 0);
 	}
 	if (_internal_simulate_physics) {
-		PhysicsServer3D::get_singleton()->body_set_force_integration_callback(get_rid(), nullptr, "");
+		PhysicsServer3D::get_singleton()->body_set_force_integration_callback(get_rid(), Callable());
 		parent_skeleton->set_bone_global_pose_override(bone_id, Transform(), 0.0, false);
 		set_as_top_level(false);
 		_internal_simulate_physics = false;
diff --git a/scene/3d/physics_body_3d.h b/scene/3d/physics_body_3d.h
index 1450fce6a6..21afe66861 100644
--- a/scene/3d/physics_body_3d.h
+++ b/scene/3d/physics_body_3d.h
@@ -40,9 +40,6 @@
 class PhysicsBody3D : public CollisionObject3D {
 	GDCLASS(PhysicsBody3D, CollisionObject3D);
 
-	uint32_t collision_layer;
-	uint32_t collision_mask;
-
 protected:
 	static void _bind_methods();
 	PhysicsBody3D(PhysicsServer3D::BodyMode p_mode);
@@ -52,18 +49,6 @@ public:
 	virtual Vector3 get_angular_velocity() const;
 	virtual real_t get_inverse_mass() const;
 
-	void set_collision_layer(uint32_t p_layer);
-	uint32_t get_collision_layer() const;
-
-	void set_collision_mask(uint32_t p_mask);
-	uint32_t get_collision_mask() const;
-
-	void set_collision_layer_bit(int p_bit, bool p_value);
-	bool get_collision_layer_bit(int p_bit) const;
-
-	void set_collision_mask_bit(int p_bit, bool p_value);
-	bool get_collision_mask_bit(int p_bit) const;
-
 	TypedArray<PhysicsBody3D> get_collision_exceptions();
 	void add_collision_exception_with(Node *p_node); //must be physicsbody
 	void remove_collision_exception_with(Node *p_node);
@@ -238,7 +223,7 @@ public:
 	void apply_impulse(const Vector3 &p_impulse, const Vector3 &p_position = Vector3());
 	void apply_torque_impulse(const Vector3 &p_impulse);
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	RigidBody3D();
 	~RigidBody3D();
diff --git a/scene/3d/physics_joint_3d.cpp b/scene/3d/physics_joint_3d.cpp
index de9c75621b..3d58d1c10e 100644
--- a/scene/3d/physics_joint_3d.cpp
+++ b/scene/3d/physics_joint_3d.cpp
@@ -65,6 +65,7 @@ void Joint3D::_update_joint(bool p_only_free) {
 	if (p_only_free || !is_inside_tree()) {
 		PhysicsServer3D::get_singleton()->joint_clear(joint);
 		warning = String();
+		update_configuration_warnings();
 		return;
 	}
 
@@ -75,43 +76,26 @@ void Joint3D::_update_joint(bool p_only_free) {
 	PhysicsBody3D *body_b = Object::cast_to<PhysicsBody3D>(node_b);
 
 	if (node_a && !body_a && node_b && !body_b) {
-		PhysicsServer3D::get_singleton()->joint_clear(joint);
 		warning = TTR("Node A and Node B must be PhysicsBody3Ds");
-		update_configuration_warning();
-		return;
-	}
-
-	if (node_a && !body_a) {
-		PhysicsServer3D::get_singleton()->joint_clear(joint);
+	} else if (node_a && !body_a) {
 		warning = TTR("Node A must be a PhysicsBody3D");
-		update_configuration_warning();
-		return;
-	}
-
-	if (node_b && !body_b) {
-		PhysicsServer3D::get_singleton()->joint_clear(joint);
+	} else if (node_b && !body_b) {
 		warning = TTR("Node B must be a PhysicsBody3D");
-		update_configuration_warning();
-		return;
-	}
-
-	if (!body_a && !body_b) {
-		PhysicsServer3D::get_singleton()->joint_clear(joint);
+	} else if (!body_a && !body_b) {
 		warning = TTR("Joint is not connected to any PhysicsBody3Ds");
-		update_configuration_warning();
-		return;
+	} else if (body_a == body_b) {
+		warning = TTR("Node A and Node B must be different PhysicsBody3Ds");
+	} else {
+		warning = String();
 	}
 
-	if (body_a == body_b) {
+	update_configuration_warnings();
+
+	if (!warning.is_empty()) {
 		PhysicsServer3D::get_singleton()->joint_clear(joint);
-		warning = TTR("Node A and Node B must be different PhysicsBody3Ds");
-		update_configuration_warning();
 		return;
 	}
 
-	warning = String();
-	update_configuration_warning();
-
 	configured = true;
 
 	if (body_a) {
@@ -206,17 +190,14 @@ bool Joint3D::get_exclude_nodes_from_collision() const {
 	return exclude_from_collision;
 }
 
-String Joint3D::get_configuration_warning() const {
-	String node_warning = Node3D::get_configuration_warning();
+TypedArray<String> Joint3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node3D::get_configuration_warnings();
 
 	if (!warning.is_empty()) {
-		if (!node_warning.is_empty()) {
-			node_warning += "\n\n";
-		}
-		node_warning += warning;
+		warnings.push_back(warning);
 	}
 
-	return node_warning;
+	return warnings;
 }
 
 void Joint3D::_bind_methods() {
diff --git a/scene/3d/physics_joint_3d.h b/scene/3d/physics_joint_3d.h
index f624ba602b..3e0ea38a5c 100644
--- a/scene/3d/physics_joint_3d.h
+++ b/scene/3d/physics_joint_3d.h
@@ -63,7 +63,7 @@ protected:
 	_FORCE_INLINE_ bool is_configured() const { return configured; }
 
 public:
-	virtual String get_configuration_warning() const override;
+	virtual TypedArray<String> get_configuration_warnings() const override;
 
 	void set_node_a(const NodePath &p_node_a);
 	NodePath get_node_a() const;
diff --git a/scene/3d/ray_cast_3d.cpp b/scene/3d/ray_cast_3d.cpp
index 66f3e539a2..95638ce514 100644
--- a/scene/3d/ray_cast_3d.cpp
+++ b/scene/3d/ray_cast_3d.cpp
@@ -61,6 +61,7 @@ uint32_t RayCast3D::get_collision_mask() const {
 }
 
 void RayCast3D::set_collision_mask_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision mask bit must be between 0 and 31 inclusive.");
 	uint32_t mask = get_collision_mask();
 	if (p_value) {
 		mask |= 1 << p_bit;
@@ -71,6 +72,7 @@ void RayCast3D::set_collision_mask_bit(int p_bit, bool p_value) {
 }
 
 bool RayCast3D::get_collision_mask_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision mask bit must be between 0 and 31 inclusive.");
 	return get_collision_mask() & (1 << p_bit);
 }
 
diff --git a/scene/3d/remote_transform_3d.cpp b/scene/3d/remote_transform_3d.cpp
index 83ac813c53..29a407905b 100644
--- a/scene/3d/remote_transform_3d.cpp
+++ b/scene/3d/remote_transform_3d.cpp
@@ -133,7 +133,7 @@ void RemoteTransform3D::set_remote_node(const NodePath &p_remote_node) {
 		_update_remote();
 	}
 
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 NodePath RemoteTransform3D::get_remote_node() const {
@@ -179,17 +179,14 @@ void RemoteTransform3D::force_update_cache() {
 	_update_cache();
 }
 
-String RemoteTransform3D::get_configuration_warning() const {
-	String warning = Node3D::get_configuration_warning();
+TypedArray<String> RemoteTransform3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node3D::get_configuration_warnings(); // Chain to the direct parent (as before the rename), not straight to Node.
 
 	if (!has_node(remote_node) || !Object::cast_to<Node3D>(get_node(remote_node))) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("The \"Remote Path\" property must point to a valid Node3D or Node3D-derived node to work.");
+		warnings.push_back(TTR("The \"Remote Path\" property must point to a valid Node3D or Node3D-derived node to work."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void RemoteTransform3D::_bind_methods() {
diff --git a/scene/3d/remote_transform_3d.h b/scene/3d/remote_transform_3d.h
index 21005d92d1..321bd3b51e 100644
--- a/scene/3d/remote_transform_3d.h
+++ b/scene/3d/remote_transform_3d.h
@@ -70,7 +70,7 @@ public:
 
 	void force_update_cache();
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	RemoteTransform3D();
 };
diff --git a/scene/3d/skeleton_3d.cpp b/scene/3d/skeleton_3d.cpp
index db5fc7593e..ebbb8985c9 100644
--- a/scene/3d/skeleton_3d.cpp
+++ b/scene/3d/skeleton_3d.cpp
@@ -387,6 +387,7 @@ void Skeleton3D::_notification(int p_what) {
 void Skeleton3D::clear_bones_global_pose_override() {
 	for (int i = 0; i < bones.size(); i += 1) {
 		bones.write[i].global_pose_override_amount = 0;
+		bones.write[i].global_pose_override_reset = true;
 	}
 	_make_dirty();
 }
diff --git a/scene/3d/skeleton_ik_3d.cpp b/scene/3d/skeleton_ik_3d.cpp
index 6cde6a9b17..898f94ccc1 100644
--- a/scene/3d/skeleton_ik_3d.cpp
+++ b/scene/3d/skeleton_ik_3d.cpp
@@ -270,7 +270,6 @@ void FabrikInverseKinematic::solve(Task *p_task, real_t blending_delta, bool ove
 		return; // Skip solving
 	}
 
-	// This line below is part of the problem - removing it fixes the issue with BoneAttachment nodes...
 	p_task->skeleton->set_bone_global_pose_override(p_task->chain.chain_root.bone, Transform(), 0.0, true);
 
 	if (p_task->chain.middle_chain_item) {
@@ -567,6 +566,9 @@ void SkeletonIK3D::start(bool p_one_time) {
 
 void SkeletonIK3D::stop() {
 	set_process_internal(false);
+	if (skeleton) {
+		skeleton->clear_bones_global_pose_override();
+	}
 }
 
 Transform SkeletonIK3D::_get_target_transform() {
diff --git a/scene/3d/soft_body_3d.cpp b/scene/3d/soft_body_3d.cpp
index 3fde4d6ef3..dc4deb0570 100644
--- a/scene/3d/soft_body_3d.cpp
+++ b/scene/3d/soft_body_3d.cpp
@@ -85,11 +85,11 @@ void SoftBodyRenderingServerHandler::commit_changes() {
 }
 
 void SoftBodyRenderingServerHandler::set_vertex(int p_vertex_id, const void *p_vector3) {
-	copymem(&write_buffer[p_vertex_id * stride + offset_vertices], p_vector3, sizeof(float) * 3);
+	memcpy(&write_buffer[p_vertex_id * stride + offset_vertices], p_vector3, sizeof(float) * 3);
 }
 
 void SoftBodyRenderingServerHandler::set_normal(int p_vertex_id, const void *p_vector3) {
-	copymem(&write_buffer[p_vertex_id * stride + offset_normal], p_vector3, sizeof(float) * 3);
+	memcpy(&write_buffer[p_vertex_id * stride + offset_normal], p_vector3, sizeof(float) * 3);
 }
 
 void SoftBodyRenderingServerHandler::set_aabb(const AABB &p_aabb) {
@@ -249,7 +249,7 @@ void SoftBody3D::_softbody_changed() {
 	prepare_physics_server();
 	_reset_points_offsets();
 #ifdef TOOLS_ENABLED
-	update_configuration_warning();
+	update_configuration_warnings();
 #endif
 }
 
@@ -301,7 +301,7 @@ void SoftBody3D::_notification(int p_what) {
 
 	if (p_what == NOTIFICATION_LOCAL_TRANSFORM_CHANGED) {
 		if (Engine::get_singleton()->is_editor_hint()) {
-			update_configuration_warning();
+			update_configuration_warnings();
 		}
 	}
 
@@ -366,27 +366,19 @@ void SoftBody3D::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "ray_pickable"), "set_ray_pickable", "is_ray_pickable");
 }
 
-String SoftBody3D::get_configuration_warning() const {
-	String warning = MeshInstance3D::get_configuration_warning();
+TypedArray<String> SoftBody3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = MeshInstance3D::get_configuration_warnings(); // Chain to the direct parent (as before the rename) so its warnings are not dropped.
 
 	if (get_mesh().is_null()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-
-		warning += TTR("This body will be ignored until you set a mesh.");
+		warnings.push_back(TTR("This body will be ignored until you set a mesh."));
 	}
 
 	Transform t = get_transform();
 	if ((ABS(t.basis.get_axis(0).length() - 1.0) > 0.05 || ABS(t.basis.get_axis(1).length() - 1.0) > 0.05 || ABS(t.basis.get_axis(2).length() - 1.0) > 0.05)) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-
-		warning += TTR("Size changes to SoftBody3D will be overridden by the physics engine when running.\nChange the size in children collision shapes instead.");
+		warnings.push_back(TTR("Size changes to SoftBody3D will be overridden by the physics engine when running.\nChange the size in children collision shapes instead."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void SoftBody3D::_update_physics_server() {
@@ -460,7 +452,7 @@ void SoftBody3D::become_mesh_owner() {
 		mesh_owner = true;
 
 		Vector<Ref<Material>> copy_materials;
-		copy_materials.append_array(materials);
+		copy_materials.append_array(surface_override_materials);
 
 		ERR_FAIL_COND(!mesh->get_surface_count());
 
@@ -480,7 +472,7 @@ void SoftBody3D::become_mesh_owner() {
 		set_mesh(soft_mesh);
 
 		for (int i = copy_materials.size() - 1; 0 <= i; --i) {
-			set_surface_material(i, copy_materials[i]);
+			set_surface_override_material(i, copy_materials[i]);
 		}
 	}
 }
@@ -504,6 +496,7 @@ uint32_t SoftBody3D::get_collision_layer() const {
 }
 
 void SoftBody3D::set_collision_mask_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision mask bit must be between 0 and 31 inclusive.");
 	uint32_t mask = get_collision_mask();
 	if (p_value) {
 		mask |= 1 << p_bit;
@@ -514,10 +507,12 @@ void SoftBody3D::set_collision_mask_bit(int p_bit, bool p_value) {
 }
 
 bool SoftBody3D::get_collision_mask_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision mask bit must be between 0 and 31 inclusive.");
 	return get_collision_mask() & (1 << p_bit);
 }
 
 void SoftBody3D::set_collision_layer_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision layer bit must be between 0 and 31 inclusive.");
 	uint32_t layer = get_collision_layer();
 	if (p_value) {
 		layer |= 1 << p_bit;
@@ -528,6 +523,7 @@ void SoftBody3D::set_collision_layer_bit(int p_bit, bool p_value) {
 }
 
 bool SoftBody3D::get_collision_layer_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision layer bit must be between 0 and 31 inclusive.");
 	return get_collision_layer() & (1 << p_bit);
 }
 
diff --git a/scene/3d/soft_body_3d.h b/scene/3d/soft_body_3d.h
index f98df39209..0d0d39d48f 100644
--- a/scene/3d/soft_body_3d.h
+++ b/scene/3d/soft_body_3d.h
@@ -113,7 +113,7 @@ protected:
 	void _notification(int p_what);
 	static void _bind_methods();
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 protected:
 	void _update_physics_server();
diff --git a/scene/3d/sprite_3d.cpp b/scene/3d/sprite_3d.cpp
index 0be54e7243..33b8b488c6 100644
--- a/scene/3d/sprite_3d.cpp
+++ b/scene/3d/sprite_3d.cpp
@@ -928,7 +928,7 @@ void AnimatedSprite3D::set_sprite_frames(const Ref<SpriteFrames> &p_frames) {
 	notify_property_list_changed();
 	_reset_timeout();
 	_queue_update();
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 Ref<SpriteFrames> AnimatedSprite3D::get_sprite_frames() const {
@@ -1058,17 +1058,14 @@ StringName AnimatedSprite3D::get_animation() const {
 	return animation;
 }
 
-String AnimatedSprite3D::get_configuration_warning() const {
-	String warning = SpriteBase3D::get_configuration_warning();
+TypedArray<String> AnimatedSprite3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = SpriteBase3D::get_configuration_warnings(); // Chain to the direct parent (as before the rename) so its warnings are not dropped.
 
 	if (frames.is_null()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("A SpriteFrames resource must be created or set in the \"Frames\" property in order for AnimatedSprite3D to display frames.");
+		warnings.push_back(TTR("A SpriteFrames resource must be created or set in the \"Frames\" property in order for AnimatedSprite3D to display frames."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void AnimatedSprite3D::_bind_methods() {
diff --git a/scene/3d/sprite_3d.h b/scene/3d/sprite_3d.h
index d1bc8dc737..5e47e66bcb 100644
--- a/scene/3d/sprite_3d.h
+++ b/scene/3d/sprite_3d.h
@@ -236,7 +236,7 @@ public:
 
 	virtual Rect2 get_item_rect() const override;
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 	AnimatedSprite3D();
 };
 
diff --git a/scene/3d/vehicle_body_3d.cpp b/scene/3d/vehicle_body_3d.cpp
index 5b0b3b89d3..9493f686c4 100644
--- a/scene/3d/vehicle_body_3d.cpp
+++ b/scene/3d/vehicle_body_3d.cpp
@@ -102,17 +102,14 @@ void VehicleWheel3D::_notification(int p_what) {
 	}
 }
 
-String VehicleWheel3D::get_configuration_warning() const {
-	String warning = Node3D::get_configuration_warning();
+TypedArray<String> VehicleWheel3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node3D::get_configuration_warnings(); // Chain to the direct parent (as before the rename), not straight to Node.
 
 	if (!Object::cast_to<VehicleBody3D>(get_parent())) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("VehicleWheel3D serves to provide a wheel system to a VehicleBody3D. Please use it as a child of a VehicleBody3D.");
+		warnings.push_back(TTR("VehicleWheel3D serves to provide a wheel system to a VehicleBody3D. Please use it as a child of a VehicleBody3D."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void VehicleWheel3D::_update(PhysicsDirectBodyState3D *s) {
@@ -806,6 +803,7 @@ void VehicleBody3D::_direct_state_changed(Object *p_state) {
 	RigidBody3D::_direct_state_changed(p_state);
 
 	state = Object::cast_to<PhysicsDirectBodyState3D>(p_state);
+	ERR_FAIL_NULL_MSG(state, "Method '_direct_state_changed' must receive a valid PhysicsDirectBodyState3D object as argument");
 
 	real_t step = state->get_step();
 
@@ -925,7 +923,7 @@ void VehicleBody3D::_bind_methods() {
 
 VehicleBody3D::VehicleBody3D() {
 	exclude.insert(get_rid());
-	//PhysicsServer3D::get_singleton()->body_set_force_integration_callback(get_rid(), this, "_direct_state_changed");
+	//PhysicsServer3D::get_singleton()->body_set_force_integration_callback(get_rid(), callable_mp(this, &VehicleBody3D::_direct_state_changed));
 
 	set_mass(40);
 }
diff --git a/scene/3d/vehicle_body_3d.h b/scene/3d/vehicle_body_3d.h
index 860fa7e3b7..646071a363 100644
--- a/scene/3d/vehicle_body_3d.h
+++ b/scene/3d/vehicle_body_3d.h
@@ -145,7 +145,7 @@ public:
 	void set_steering(real_t p_steering);
 	real_t get_steering() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	VehicleWheel3D();
 };
diff --git a/scene/3d/visual_instance_3d.cpp b/scene/3d/visual_instance_3d.cpp
index 394c67e873..d81b09b86c 100644
--- a/scene/3d/visual_instance_3d.cpp
+++ b/scene/3d/visual_instance_3d.cpp
@@ -338,6 +338,15 @@ GeometryInstance3D::GIMode GeometryInstance3D::get_gi_mode() const {
 	return gi_mode;
 }
 
+void GeometryInstance3D::set_ignore_occlusion_culling(bool p_enabled) {
+	ignore_occlusion_culling = p_enabled;
+	RS::get_singleton()->instance_geometry_set_flag(get_instance(), RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING, ignore_occlusion_culling);
+}
+
+bool GeometryInstance3D::is_ignoring_occlusion_culling() {
+	return ignore_occlusion_culling;
+}
+
 void GeometryInstance3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_material_override", "material"), &GeometryInstance3D::set_material_override);
 	ClassDB::bind_method(D_METHOD("get_material_override"), &GeometryInstance3D::get_material_override);
@@ -345,21 +354,24 @@ void GeometryInstance3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_cast_shadows_setting", "shadow_casting_setting"), &GeometryInstance3D::set_cast_shadows_setting);
 	ClassDB::bind_method(D_METHOD("get_cast_shadows_setting"), &GeometryInstance3D::get_cast_shadows_setting);
 
+	ClassDB::bind_method(D_METHOD("set_lod_bias", "bias"), &GeometryInstance3D::set_lod_bias);
+	ClassDB::bind_method(D_METHOD("get_lod_bias"), &GeometryInstance3D::get_lod_bias);
+
 	ClassDB::bind_method(D_METHOD("set_lod_max_hysteresis", "mode"), &GeometryInstance3D::set_lod_max_hysteresis);
 	ClassDB::bind_method(D_METHOD("get_lod_max_hysteresis"), &GeometryInstance3D::get_lod_max_hysteresis);
 
 	ClassDB::bind_method(D_METHOD("set_lod_max_distance", "mode"), &GeometryInstance3D::set_lod_max_distance);
 	ClassDB::bind_method(D_METHOD("get_lod_max_distance"), &GeometryInstance3D::get_lod_max_distance);
 
-	ClassDB::bind_method(D_METHOD("set_shader_instance_uniform", "uniform", "value"), &GeometryInstance3D::set_shader_instance_uniform);
-	ClassDB::bind_method(D_METHOD("get_shader_instance_uniform", "uniform"), &GeometryInstance3D::get_shader_instance_uniform);
-
 	ClassDB::bind_method(D_METHOD("set_lod_min_hysteresis", "mode"), &GeometryInstance3D::set_lod_min_hysteresis);
 	ClassDB::bind_method(D_METHOD("get_lod_min_hysteresis"), &GeometryInstance3D::get_lod_min_hysteresis);
 
 	ClassDB::bind_method(D_METHOD("set_lod_min_distance", "mode"), &GeometryInstance3D::set_lod_min_distance);
 	ClassDB::bind_method(D_METHOD("get_lod_min_distance"), &GeometryInstance3D::get_lod_min_distance);
 
+	ClassDB::bind_method(D_METHOD("set_shader_instance_uniform", "uniform", "value"), &GeometryInstance3D::set_shader_instance_uniform);
+	ClassDB::bind_method(D_METHOD("get_shader_instance_uniform", "uniform"), &GeometryInstance3D::get_shader_instance_uniform);
+
 	ClassDB::bind_method(D_METHOD("set_extra_cull_margin", "margin"), &GeometryInstance3D::set_extra_cull_margin);
 	ClassDB::bind_method(D_METHOD("get_extra_cull_margin"), &GeometryInstance3D::get_extra_cull_margin);
 
@@ -369,8 +381,8 @@ void GeometryInstance3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_gi_mode", "mode"), &GeometryInstance3D::set_gi_mode);
 	ClassDB::bind_method(D_METHOD("get_gi_mode"), &GeometryInstance3D::get_gi_mode);
 
-	ClassDB::bind_method(D_METHOD("set_lod_bias", "bias"), &GeometryInstance3D::set_lod_bias);
-	ClassDB::bind_method(D_METHOD("get_lod_bias"), &GeometryInstance3D::get_lod_bias);
+	ClassDB::bind_method(D_METHOD("set_ignore_occlusion_culling", "ignore_culling"), &GeometryInstance3D::set_ignore_occlusion_culling);
+	ClassDB::bind_method(D_METHOD("is_ignoring_occlusion_culling"), &GeometryInstance3D::is_ignoring_occlusion_culling);
 
 	ClassDB::bind_method(D_METHOD("set_custom_aabb", "aabb"), &GeometryInstance3D::set_custom_aabb);
 
@@ -381,6 +393,7 @@ void GeometryInstance3D::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "cast_shadow", PROPERTY_HINT_ENUM, "Off,On,Double-Sided,Shadows Only"), "set_cast_shadows_setting", "get_cast_shadows_setting");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "extra_cull_margin", PROPERTY_HINT_RANGE, "0,16384,0.01"), "set_extra_cull_margin", "get_extra_cull_margin");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "lod_bias", PROPERTY_HINT_RANGE, "0.001,128,0.001"), "set_lod_bias", "get_lod_bias");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "ignore_occlusion_culling"), "set_ignore_occlusion_culling", "is_ignoring_occlusion_culling");
 	ADD_GROUP("Global Illumination", "gi_");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "gi_mode", PROPERTY_HINT_ENUM, "Disabled,Baked,Dynamic"), "set_gi_mode", "get_gi_mode");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "gi_lightmap_scale", PROPERTY_HINT_ENUM, "1x,2x,4x,8x"), "set_lightmap_scale", "get_lightmap_scale");
diff --git a/scene/3d/visual_instance_3d.h b/scene/3d/visual_instance_3d.h
index 7fed8095ef..68d29ef81e 100644
--- a/scene/3d/visual_instance_3d.h
+++ b/scene/3d/visual_instance_3d.h
@@ -120,6 +120,7 @@ private:
 	float extra_cull_margin = 0.0;
 	LightmapScale lightmap_scale = LIGHTMAP_SCALE_1X;
 	GIMode gi_mode = GI_MODE_DISABLED;
+	bool ignore_occlusion_culling = false;
 
 	const StringName *_instance_uniform_get_remap(const StringName p_name) const;
 
@@ -167,6 +168,9 @@ public:
 
 	void set_custom_aabb(AABB aabb);
 
+	void set_ignore_occlusion_culling(bool p_enabled);
+	bool is_ignoring_occlusion_culling();
+
 	GeometryInstance3D();
 };
 
diff --git a/scene/3d/world_environment.cpp b/scene/3d/world_environment.cpp
index 214ffd6bd5..829ecc5ec2 100644
--- a/scene/3d/world_environment.cpp
+++ b/scene/3d/world_environment.cpp
@@ -65,7 +65,7 @@ void WorldEnvironment::_update_current_environment() {
 	} else {
 		get_viewport()->find_world_3d()->set_environment(Ref<Environment>());
 	}
-	get_tree()->call_group("_world_environment_" + itos(get_viewport()->find_world_3d()->get_scenario().get_id()), "update_configuration_warning");
+	get_tree()->call_group("_world_environment_" + itos(get_viewport()->find_world_3d()->get_scenario().get_id()), "update_configuration_warnings");
 }
 
 void WorldEnvironment::_update_current_camera_effects() {
@@ -76,7 +76,7 @@ void WorldEnvironment::_update_current_camera_effects() {
 		get_viewport()->find_world_3d()->set_camera_effects(Ref<CameraEffects>());
 	}
 
-	get_tree()->call_group("_world_camera_effects_" + itos(get_viewport()->find_world_3d()->get_scenario().get_id()), "update_configuration_warning");
+	get_tree()->call_group("_world_camera_effects_" + itos(get_viewport()->find_world_3d()->get_scenario().get_id()), "update_configuration_warnings");
 }
 
 void WorldEnvironment::set_environment(const Ref<Environment> &p_environment) {
@@ -96,7 +96,7 @@ void WorldEnvironment::set_environment(const Ref<Environment> &p_environment) {
 	if (is_inside_tree()) {
 		_update_current_environment();
 	} else {
-		update_configuration_warning();
+		update_configuration_warnings();
 	}
 }
 
@@ -121,7 +121,7 @@ void WorldEnvironment::set_camera_effects(const Ref<CameraEffects> &p_camera_eff
 	if (is_inside_tree()) {
 		_update_current_camera_effects();
 	} else {
-		update_configuration_warning();
+		update_configuration_warnings();
 	}
 }
 
@@ -129,35 +129,26 @@ Ref<CameraEffects> WorldEnvironment::get_camera_effects() const {
 	return camera_effects;
 }
 
-String WorldEnvironment::get_configuration_warning() const {
-	String warning = Node::get_configuration_warning();
+TypedArray<String> WorldEnvironment::get_configuration_warnings() const { // Returns one array entry per warning.
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (!environment.is_valid()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("WorldEnvironment requires its \"Environment\" property to contain an Environment to have a visible effect.");
+		warnings.push_back(TTR("WorldEnvironment requires its \"Environment\" property to contain an Environment to have a visible effect."));
 	}
 
 	if (!is_inside_tree()) {
-		return warning;
+		return warnings; // The remaining checks query get_viewport(), so they are skipped outside the tree.
 	}
 
 	if (environment.is_valid() && get_viewport()->find_world_3d()->get_environment() != environment) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("Only the first Environment has an effect in a scene (or set of instantiated scenes).");
+		warnings.push_back(TTR("Only the first Environment has an effect in a scene (or set of instantiated scenes)."));
 	}
 
 	if (camera_effects.is_valid() && get_viewport()->find_world_3d()->get_camera_effects() != camera_effects) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("Only the first CameraEffects has an effect in a scene (or set of instantiated scenes).");
+		warnings.push_back(TTR("Only the first CameraEffects has an effect in a scene (or set of instantiated scenes)."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void WorldEnvironment::_bind_methods() {
diff --git a/scene/3d/world_environment.h b/scene/3d/world_environment.h
index e3f28d6d6b..9e85982381 100644
--- a/scene/3d/world_environment.h
+++ b/scene/3d/world_environment.h
@@ -55,7 +55,7 @@ public:
 	void set_camera_effects(const Ref<CameraEffects> &p_camera_effects);
 	Ref<CameraEffects> get_camera_effects() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	WorldEnvironment();
 };
diff --git a/scene/3d/xr_nodes.cpp b/scene/3d/xr_nodes.cpp
index 63be4352d5..b5037f9be7 100644
--- a/scene/3d/xr_nodes.cpp
+++ b/scene/3d/xr_nodes.cpp
@@ -55,23 +55,18 @@ void XRCamera3D::_notification(int p_what) {
 	};
 };
 
-String XRCamera3D::get_configuration_warning() const {
-	if (!is_visible() || !is_inside_tree()) {
-		return String();
+TypedArray<String> XRCamera3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Camera3D::get_configuration_warnings(); // Chain through the direct base, as the replaced code did.
+
+	if (is_visible() && is_inside_tree()) {
+		// Must be a child node of XROrigin3D.
+		XROrigin3D *origin = Object::cast_to<XROrigin3D>(get_parent());
+		if (origin == nullptr) {
+			warnings.push_back(TTR("XRCamera3D must have an XROrigin3D node as its parent."));
+		}
 	}
 
-	String warning = Camera3D::get_configuration_warning();
-
-	// must be child node of XROrigin3D!
-	XROrigin3D *origin = Object::cast_to<XROrigin3D>(get_parent());
-	if (origin == nullptr) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("XRCamera3D must have an XROrigin3D node as its parent.");
-	};
-
-	return warning;
+	return warnings;
 };
 
 Vector3 XRCamera3D::project_local_ray_normal(const Point2 &p_pos) const {
@@ -265,7 +260,7 @@ void XRController3D::set_controller_id(int p_controller_id) {
 	// We don't check any bounds here, this controller may not yet be active and just be a place holder until it is.
 	// Note that setting this to 0 means this node is not bound to a controller yet.
 	controller_id = p_controller_id;
-	update_configuration_warning();
+	update_configuration_warnings();
 };
 
 int XRController3D::get_controller_id() const {
@@ -362,30 +357,22 @@ XRPositionalTracker::TrackerHand XRController3D::get_tracker_hand() const {
 	return tracker->get_tracker_hand();
 };
 
-String XRController3D::get_configuration_warning() const {
-	if (!is_visible() || !is_inside_tree()) {
-		return String();
-	}
-
-	String warning = Node3D::get_configuration_warning();
+TypedArray<String> XRController3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node3D::get_configuration_warnings(); // Chain through the direct base, as the replaced code did.
 
-	// must be child node of XROrigin!
-	XROrigin3D *origin = Object::cast_to<XROrigin3D>(get_parent());
-	if (origin == nullptr) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
+	if (is_visible() && is_inside_tree()) {
+		// Must be a child node of XROrigin3D.
+		XROrigin3D *origin = Object::cast_to<XROrigin3D>(get_parent());
+		if (origin == nullptr) {
+			warnings.push_back(TTR("XRController3D must have an XROrigin3D node as its parent."));
 		}
-		warning += TTR("XRController3D must have an XROrigin3D node as its parent.");
-	};
 
-	if (controller_id == 0) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
+		if (controller_id == 0) {
+			warnings.push_back(TTR("The controller ID must not be 0 or this controller won't be bound to an actual controller."));
 		}
-		warning += TTR("The controller ID must not be 0 or this controller won't be bound to an actual controller.");
-	};
+	}
 
-	return warning;
+	return warnings;
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -459,7 +446,7 @@ void XRAnchor3D::set_anchor_id(int p_anchor_id) {
 	// We don't check any bounds here, this anchor may not yet be active and just be a place holder until it is.
 	// Note that setting this to 0 means this node is not bound to an anchor yet.
 	anchor_id = p_anchor_id;
-	update_configuration_warning();
+	update_configuration_warnings();
 };
 
 int XRAnchor3D::get_anchor_id() const {
@@ -487,30 +474,22 @@ bool XRAnchor3D::get_is_active() const {
 	return is_active;
 };
 
-String XRAnchor3D::get_configuration_warning() const {
-	if (!is_visible() || !is_inside_tree()) {
-		return String();
-	}
-
-	String warning = Node3D::get_configuration_warning();
+TypedArray<String> XRAnchor3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node3D::get_configuration_warnings(); // Chain through the direct base, as the replaced code did.
 
-	// must be child node of XROrigin3D!
-	XROrigin3D *origin = Object::cast_to<XROrigin3D>(get_parent());
-	if (origin == nullptr) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
+	if (is_visible() && is_inside_tree()) {
+		// Must be a child node of XROrigin3D.
+		XROrigin3D *origin = Object::cast_to<XROrigin3D>(get_parent());
+		if (origin == nullptr) {
+			warnings.push_back(TTR("XRAnchor3D must have an XROrigin3D node as its parent."));
 		}
-		warning += TTR("XRAnchor3D must have an XROrigin3D node as its parent.");
-	};
 
-	if (anchor_id == 0) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
+		if (anchor_id == 0) {
+			warnings.push_back(TTR("The anchor ID must not be 0 or this anchor won't be bound to an actual anchor."));
 		}
-		warning += TTR("The anchor ID must not be 0 or this anchor won't be bound to an actual anchor.");
-	};
+	}
 
-	return warning;
+	return warnings;
 };
 
 Plane XRAnchor3D::get_plane() const {
@@ -528,21 +507,16 @@ Ref<Mesh> XRAnchor3D::get_mesh() const {
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
-String XROrigin3D::get_configuration_warning() const {
-	if (!is_visible() || !is_inside_tree()) {
-		return String();
-	}
-
-	String warning = Node3D::get_configuration_warning();
+TypedArray<String> XROrigin3D::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node3D::get_configuration_warnings(); // Chain through the direct base, as the replaced code did.
 
-	if (tracked_camera == nullptr) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
+	if (is_visible() && is_inside_tree()) { // Node-specific checks only run for visible nodes inside the tree.
+		if (tracked_camera == nullptr) {
+			warnings.push_back(TTR("XROrigin3D requires an XRCamera3D child node."));
 		}
-		warning += TTR("XROrigin3D requires an XRCamera3D child node.");
 	}
 
-	return warning;
+	return warnings;
 };
 
 void XROrigin3D::_bind_methods() {
diff --git a/scene/3d/xr_nodes.h b/scene/3d/xr_nodes.h
index 7cd6e2ac57..90079f5fe9 100644
--- a/scene/3d/xr_nodes.h
+++ b/scene/3d/xr_nodes.h
@@ -50,7 +50,7 @@ protected:
 	void _notification(int p_what);
 
 public:
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	virtual Vector3 project_local_ray_normal(const Point2 &p_pos) const override;
 	virtual Point2 unproject_position(const Vector3 &p_pos) const override;
@@ -97,7 +97,7 @@ public:
 
 	Ref<Mesh> get_mesh() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	XRController3D() {}
 	~XRController3D() {}
@@ -133,7 +133,7 @@ public:
 
 	Ref<Mesh> get_mesh() const;
 
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	XRAnchor3D() {}
 	~XRAnchor3D() {}
@@ -158,7 +158,7 @@ protected:
 	static void _bind_methods();
 
 public:
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	void set_tracked_camera(XRCamera3D *p_tracked_camera);
 	void clear_tracked_camera_if(XRCamera3D *p_tracked_camera);
diff --git a/scene/animation/animation_tree.cpp b/scene/animation/animation_tree.cpp
index 4b4d3943c9..44f2d38a84 100644
--- a/scene/animation/animation_tree.cpp
+++ b/scene/animation/animation_tree.cpp
@@ -458,7 +458,7 @@ void AnimationTree::set_tree_root(const Ref<AnimationNode> &p_root) {
 
 	properties_dirty = true;
 
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 Ref<AnimationNode> AnimationTree::get_tree_root() const {
@@ -1262,7 +1262,7 @@ void AnimationTree::_notification(int p_what) {
 
 void AnimationTree::set_animation_player(const NodePath &p_player) {
 	animation_player = p_player;
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 NodePath AnimationTree::get_animation_player() const {
@@ -1281,38 +1281,26 @@ uint64_t AnimationTree::get_last_process_pass() const {
 	return process_pass;
 }
 
-String AnimationTree::get_configuration_warning() const {
-	String warning = Node::get_configuration_warning();
+TypedArray<String> AnimationTree::get_configuration_warnings() const { // Returns one array entry per configuration problem found.
+	TypedArray<String> warnings = Node::get_configuration_warnings(); // Start from the warnings reported by Node.
 
 	if (!root.is_valid()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("No root AnimationNode for the graph is set.");
+		warnings.push_back(TTR("No root AnimationNode for the graph is set."));
 	}
 
 	if (!has_node(animation_player)) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("Path to an AnimationPlayer node containing animations is not set.");
+		warnings.push_back(TTR("Path to an AnimationPlayer node containing animations is not set."));
 	} else {
 		AnimationPlayer *player = Object::cast_to<AnimationPlayer>(get_node(animation_player));
 
 		if (!player) {
-			if (!warning.is_empty()) {
-				warning += "\n\n";
-			}
-			warning += TTR("Path set for AnimationPlayer does not lead to an AnimationPlayer node.");
+			warnings.push_back(TTR("Path set for AnimationPlayer does not lead to an AnimationPlayer node."));
 		} else if (!player->has_node(player->get_root())) {
-			if (!warning.is_empty()) {
-				warning += "\n\n";
-			}
-			warning += TTR("The AnimationPlayer root node is not a valid node.");
+			warnings.push_back(TTR("The AnimationPlayer root node is not a valid node."));
 		}
 	}
 
-	return warning;
+	return warnings; // Callers receive the warnings as separate entries instead of one "\n\n"-joined string.
 }
 
 void AnimationTree::set_root_motion_track(const NodePath &p_track) {
diff --git a/scene/animation/animation_tree.h b/scene/animation/animation_tree.h
index 1c5aec26ab..700ff1cb5b 100644
--- a/scene/animation/animation_tree.h
+++ b/scene/animation/animation_tree.h
@@ -300,7 +300,7 @@ public:
 	void set_animation_player(const NodePath &p_player);
 	NodePath get_animation_player() const;
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	bool is_state_invalid() const;
 	String get_invalid_state_reason() const;
diff --git a/scene/gui/color_picker.cpp b/scene/gui/color_picker.cpp
index 78524a856a..b78f9cad24 100644
--- a/scene/gui/color_picker.cpp
+++ b/scene/gui/color_picker.cpp
@@ -143,6 +143,30 @@ void ColorPicker::_update_controls() {
 		scroll[3]->hide();
 		labels[3]->hide();
 	}
+
+	switch (picker_type) {
+		case SHAPE_HSV_RECTANGLE:
+			wheel_edit->hide();
+			w_edit->show();
+			uv_edit->show();
+			break;
+		case SHAPE_HSV_WHEEL:
+			wheel_edit->show();
+			w_edit->hide();
+			uv_edit->hide();
+
+			wheel->set_material(wheel_mat);
+			break;
+		case SHAPE_VHS_CIRCLE:
+			wheel_edit->show();
+			w_edit->show();
+			uv_edit->hide();
+
+			wheel->set_material(circle_mat);
+			break;
+		default: {
+		}
+	}
 }
 
 void ColorPicker::_set_pick_color(const Color &p_color, bool p_update_sliders) {
@@ -267,6 +291,8 @@ void ColorPicker::_update_color(bool p_update_sliders) {
 	for (int i = 0; i < 4; i++) {
 		scroll[i]->update();
 	}
+	wheel->update();
+	wheel_uv->update();
 	updating = false;
 }
 
@@ -309,6 +335,18 @@ Color ColorPicker::get_pick_color() const {
 	return color;
 }
 
+void ColorPicker::set_picker_shape(PickerShapeType p_picker_type) { // Stores the picker shape and refreshes the controls and color display.
+	ERR_FAIL_INDEX(p_picker_type, SHAPE_MAX); // Bail out on a shape value outside the valid range.
+	picker_type = p_picker_type;
+
+	_update_controls(); // Shows/hides wheel_edit, w_edit and uv_edit according to picker_type.
+	_update_color(); // Redraws the sliders and the wheel controls.
+}
+
+ColorPicker::PickerShapeType ColorPicker::get_picker_shape() const { // Getter bound to the "picker_shape" property.
+	return picker_type;
+}
+
 void ColorPicker::add_preset(const Color &p_color) {
 	if (presets.find(p_color)) {
 		presets.move_to_back(presets.find(p_color));
@@ -421,7 +459,7 @@ void ColorPicker::_update_text_value() {
 }
 
 void ColorPicker::_sample_draw() {
-	const Rect2 r = Rect2(Point2(), Size2(uv_edit->get_size().width, sample->get_size().height * 0.95));
+	const Rect2 r = Rect2(Point2(), Size2(sample->get_size().width, sample->get_size().height * 0.95));
 
 	if (color.a < 1.0) {
 		sample->draw_texture_rect(get_theme_icon("preset_bg", "ColorPicker"), r, true);
@@ -441,42 +479,131 @@ void ColorPicker::_hsv_draw(int p_which, Control *c) {
 	}
 	if (p_which == 0) {
 		Vector<Point2> points;
-		points.push_back(Vector2());
-		points.push_back(Vector2(c->get_size().x, 0));
-		points.push_back(c->get_size());
-		points.push_back(Vector2(0, c->get_size().y));
 		Vector<Color> colors;
-		colors.push_back(Color(1, 1, 1, 1));
-		colors.push_back(Color(1, 1, 1, 1));
-		colors.push_back(Color(0, 0, 0, 1));
-		colors.push_back(Color(0, 0, 0, 1));
-		c->draw_polygon(points, colors);
 		Vector<Color> colors2;
 		Color col = color;
+		Vector2 center = c->get_size() / 2.0;
+
+		switch (picker_type) {
+			case SHAPE_HSV_WHEEL: {
+				points.resize(4);
+				colors.resize(4);
+				colors2.resize(4);
+				real_t ring_radius_x = Math_SQRT12 * c->get_size().width * 0.42;
+				real_t ring_radius_y = Math_SQRT12 * c->get_size().height * 0.42;
+
+				points.set(0, center - Vector2(ring_radius_x, ring_radius_y));
+				points.set(1, center + Vector2(ring_radius_x, -ring_radius_y));
+				points.set(2, center + Vector2(ring_radius_x, ring_radius_y));
+				points.set(3, center + Vector2(-ring_radius_x, ring_radius_y));
+				colors.set(0, Color(1, 1, 1, 1));
+				colors.set(1, Color(1, 1, 1, 1));
+				colors.set(2, Color(0, 0, 0, 1));
+				colors.set(3, Color(0, 0, 0, 1));
+				c->draw_polygon(points, colors);
+
+				col.set_hsv(h, 1, 1);
+				col.a = 0;
+				colors2.set(0, col);
+				col.a = 1;
+				colors2.set(1, col);
+				col.set_hsv(h, 1, 0);
+				colors2.set(2, col);
+				col.a = 0;
+				colors2.set(3, col);
+				c->draw_polygon(points, colors2);
+				break;
+			}
+			case SHAPE_HSV_RECTANGLE: {
+				points.resize(4);
+				colors.resize(4);
+				colors2.resize(4);
+				points.set(0, Vector2());
+				points.set(1, Vector2(c->get_size().x, 0));
+				points.set(2, c->get_size());
+				points.set(3, Vector2(0, c->get_size().y));
+				colors.set(0, Color(1, 1, 1, 1));
+				colors.set(1, Color(1, 1, 1, 1));
+				colors.set(2, Color(0, 0, 0, 1));
+				colors.set(3, Color(0, 0, 0, 1));
+				c->draw_polygon(points, colors);
+				col = color;
+				col.set_hsv(h, 1, 1);
+				col.a = 0;
+				colors2.set(0, col);
+				col.a = 1;
+				colors2.set(1, col);
+				col.set_hsv(h, 1, 0);
+				colors2.set(2, col);
+				col.a = 0;
+				colors2.set(3, col);
+				c->draw_polygon(points, colors2);
+				break;
+			}
+			default: {
+			}
+		}
+		Ref<Texture2D> cursor = get_theme_icon("picker_cursor", "ColorPicker");
+		int x;
+		int y;
+		if (picker_type == SHAPE_VHS_CIRCLE) {
+			x = center.x + (center.x * Math::cos(h * Math_TAU) * s) - (cursor->get_width() / 2);
+			y = center.y + (center.y * Math::sin(h * Math_TAU) * s) - (cursor->get_height() / 2);
+		} else {
+			real_t corner_x = (c == wheel_uv) ? center.x - Math_SQRT12 * c->get_size().width * 0.42 : 0;
+			real_t corner_y = (c == wheel_uv) ? center.y - Math_SQRT12 * c->get_size().height * 0.42 : 0;
+
+			Size2 real_size(c->get_size().x - corner_x * 2, c->get_size().y - corner_y * 2);
+			x = CLAMP(real_size.x * s, 0, real_size.x) + corner_x - (cursor->get_width() / 2);
+			y = CLAMP(real_size.y - real_size.y * v, 0, real_size.y) + corner_y - (cursor->get_height() / 2);
+		}
+		c->draw_texture(cursor, Point2(x, y));
+
 		col.set_hsv(h, 1, 1);
-		col.a = 0;
-		colors2.push_back(col);
-		col.a = 1;
-		colors2.push_back(col);
-		col.set_hsv(h, 1, 0);
-		colors2.push_back(col);
-		col.a = 0;
-		colors2.push_back(col);
-		c->draw_polygon(points, colors2);
-		int x = CLAMP(c->get_size().x * s, 0, c->get_size().x);
-		int y = CLAMP(c->get_size().y - c->get_size().y * v, 0, c->get_size().y);
-		col = color;
-		col.a = 1;
-		c->draw_line(Point2(x, 0), Point2(x, c->get_size().y), col.inverted());
-		c->draw_line(Point2(0, y), Point2(c->get_size().x, y), col.inverted());
-		c->draw_line(Point2(x, y), Point2(x, y), Color(1, 1, 1), 2);
+		if (picker_type == SHAPE_HSV_WHEEL) {
+			points.resize(4);
+			double h1 = h - (0.5 / 360);
+			double h2 = h + (0.5 / 360);
+			points.set(0, Point2(center.x + (center.x * Math::cos(h1 * Math_TAU)), center.y + (center.y * Math::sin(h1 * Math_TAU))));
+			points.set(1, Point2(center.x + (center.x * Math::cos(h1 * Math_TAU) * 0.84), center.y + (center.y * Math::sin(h1 * Math_TAU) * 0.84)));
+			points.set(2, Point2(center.x + (center.x * Math::cos(h2 * Math_TAU)), center.y + (center.y * Math::sin(h2 * Math_TAU))));
+			points.set(3, Point2(center.x + (center.x * Math::cos(h2 * Math_TAU) * 0.84), center.y + (center.y * Math::sin(h2 * Math_TAU) * 0.84)));
+			c->draw_multiline(points, col.inverted());
+		}
+
 	} else if (p_which == 1) {
-		Ref<Texture2D> hue = get_theme_icon("color_hue", "ColorPicker");
-		c->draw_texture_rect(hue, Rect2(Point2(), c->get_size()));
-		int y = c->get_size().y - c->get_size().y * (1.0 - h);
-		Color col = Color();
-		col.set_hsv(h, 1, 1);
-		c->draw_line(Point2(0, y), Point2(c->get_size().x, y), col.inverted());
+		if (picker_type == SHAPE_HSV_RECTANGLE) {
+			Ref<Texture2D> hue = get_theme_icon("color_hue", "ColorPicker");
+			c->draw_texture_rect(hue, Rect2(Point2(), c->get_size()));
+			int y = c->get_size().y - c->get_size().y * (1.0 - h);
+			Color col;
+			col.set_hsv(h, 1, 1);
+			c->draw_line(Point2(0, y), Point2(c->get_size().x, y), col.inverted());
+		} else if (picker_type == SHAPE_VHS_CIRCLE) {
+			Vector<Point2> points;
+			Vector<Color> colors;
+			Color col;
+			col.set_hsv(h, s, 1);
+			points.resize(4);
+			colors.resize(4);
+			points.set(0, Vector2());
+			points.set(1, Vector2(c->get_size().x, 0));
+			points.set(2, c->get_size());
+			points.set(3, Vector2(0, c->get_size().y));
+			colors.set(0, col);
+			colors.set(1, col);
+			colors.set(2, Color(0, 0, 0));
+			colors.set(3, Color(0, 0, 0));
+			c->draw_polygon(points, colors);
+			int y = c->get_size().y - c->get_size().y * CLAMP(v, 0, 1);
+			col.set_hsv(h, 1, v);
+			c->draw_line(Point2(0, y), Point2(c->get_size().x, y), col.inverted());
+		}
+	} else if (p_which == 2) {
+		c->draw_rect(Rect2(Point2(), c->get_size()), Color(1, 1, 1));
+		if (picker_type == SHAPE_VHS_CIRCLE) {
+			circle_mat->set_shader_param("v", v);
+		}
 	}
 }
 
@@ -543,16 +670,51 @@ void ColorPicker::_slider_draw(int p_which) {
 	scroll[p_which]->draw_polygon(pos, col);
 }
 
-void ColorPicker::_uv_input(const Ref<InputEvent> &p_event) {
+void ColorPicker::_uv_input(const Ref<InputEvent> &p_event, Control *c) { // c is the control the event was bound to (uv_edit or wheel_uv).
 	Ref<InputEventMouseButton> bev = p_event;
 
 	if (bev.is_valid()) {
 		if (bev->is_pressed() && bev->get_button_index() == MOUSE_BUTTON_LEFT) {
+			Vector2 center = c->get_size() / 2.0;
+			if (picker_type == SHAPE_VHS_CIRCLE) { // Circle: angle selects hue, distance from center selects saturation.
+				real_t dist = center.distance_to(bev->get_position());
+
+				if (dist <= center.x) {
+					real_t rad = Math::atan2(bev->get_position().y - center.y, bev->get_position().x - center.x);
+					h = ((rad >= 0) ? rad : (Math_TAU + rad)) / Math_TAU; // Map atan2's (-PI, PI] range onto [0, 1).
+					s = CLAMP(dist / center.x, 0, 1);
+				} else {
+					return; // Click landed outside the circle.
+				}
+			} else {
+				real_t corner_x = (c == wheel_uv) ? center.x - Math_SQRT12 * c->get_size().width * 0.42 : 0; // Inset of the inner square when drawn inside the wheel.
+				real_t corner_y = (c == wheel_uv) ? center.y - Math_SQRT12 * c->get_size().height * 0.42 : 0;
+				Size2 real_size(c->get_size().x - corner_x * 2, c->get_size().y - corner_y * 2);
+
+				if (bev->get_position().x < corner_x || bev->get_position().x > c->get_size().x - corner_x ||
+						bev->get_position().y < corner_y || bev->get_position().y > c->get_size().y - corner_y) {
+					{
+						real_t dist = center.distance_to(bev->get_position());
+
+						if (dist >= center.x * 0.84 && dist <= center.x) { // Inside the hue ring: start dragging the hue.
+							real_t rad = Math::atan2(bev->get_position().y - center.y, bev->get_position().x - center.x);
+							h = ((rad >= 0) ? rad : (Math_TAU + rad)) / Math_TAU;
+							spinning = true;
+						} else {
+							return;
+						}
+					}
+				}
+
+				if (!spinning) {
+					real_t x = CLAMP(bev->get_position().x, corner_x, c->get_size().x - corner_x);
+					real_t y = CLAMP(bev->get_position().y, corner_y, c->get_size().y - corner_y); // Y is bounded by corner_y, not corner_x.
+
+					s = (x - corner_x) / real_size.x; // Same mapping as the mouse-motion branch, which uses the event position directly.
+					v = 1.0 - (y - corner_y) / real_size.y;
+				}
+			}
 			changing_color = true;
-			float x = CLAMP((float)bev->get_position().x, 0, uv_edit->get_size().width);
-			float y = CLAMP((float)bev->get_position().y, 0, uv_edit->get_size().height);
-			s = x / uv_edit->get_size().width;
-			v = 1.0 - y / uv_edit->get_size().height;
 			color.set_hsv(h, s, v, color.a);
 			last_hsv = color;
 			set_pick_color(color);
@@ -563,8 +725,10 @@ void ColorPicker::_uv_input(const Ref<InputEvent> &p_event) {
 		} else if (deferred_mode_enabled && !bev->is_pressed() && bev->get_button_index() == MOUSE_BUTTON_LEFT) {
 			emit_signal("color_changed", color);
 			changing_color = false;
+			spinning = false;
 		} else {
 			changing_color = false;
+			spinning = false;
 		}
 	}
 
@@ -574,10 +738,30 @@ void ColorPicker::_uv_input(const Ref<InputEvent> &p_event) {
 		if (!changing_color) {
 			return;
 		}
-		float x = CLAMP((float)mev->get_position().x, 0, uv_edit->get_size().width);
-		float y = CLAMP((float)mev->get_position().y, 0, uv_edit->get_size().height);
-		s = x / uv_edit->get_size().width;
-		v = 1.0 - y / uv_edit->get_size().height;
+
+		Vector2 center = c->get_size() / 2.0;
+		if (picker_type == SHAPE_VHS_CIRCLE) { // Circle: angle selects hue, distance from center selects saturation.
+			real_t dist = center.distance_to(mev->get_position());
+			real_t rad = Math::atan2(mev->get_position().y - center.y, mev->get_position().x - center.x);
+			h = ((rad >= 0) ? rad : (Math_TAU + rad)) / Math_TAU; // Map atan2's (-PI, PI] range onto [0, 1).
+			s = CLAMP(dist / center.x, 0, 1);
+		} else {
+			if (spinning) { // Drag started on the hue ring: only the hue follows the pointer.
+				real_t rad = Math::atan2(mev->get_position().y - center.y, mev->get_position().x - center.x);
+				h = ((rad >= 0) ? rad : (Math_TAU + rad)) / Math_TAU;
+			} else {
+				real_t corner_x = (c == wheel_uv) ? center.x - Math_SQRT12 * c->get_size().width * 0.42 : 0; // Inset of the inner square when drawn inside the wheel.
+				real_t corner_y = (c == wheel_uv) ? center.y - Math_SQRT12 * c->get_size().height * 0.42 : 0;
+				Size2 real_size(c->get_size().x - corner_x * 2, c->get_size().y - corner_y * 2);
+
+				real_t x = CLAMP(mev->get_position().x, corner_x, c->get_size().x - corner_x);
+				real_t y = CLAMP(mev->get_position().y, corner_y, c->get_size().y - corner_y); // Y is bounded by corner_y, not corner_x.
+
+				s = (x - corner_x) / real_size.x;
+				v = 1.0 - (y - corner_y) / real_size.y;
+			}
+		}
+
 		color.set_hsv(h, s, v, color.a);
 		last_hsv = color;
 		set_pick_color(color);
@@ -595,7 +779,11 @@ void ColorPicker::_w_input(const Ref<InputEvent> &p_event) {
 		if (bev->is_pressed() && bev->get_button_index() == MOUSE_BUTTON_LEFT) {
 			changing_color = true;
 			float y = CLAMP((float)bev->get_position().y, 0, w_edit->get_size().height);
-			h = y / w_edit->get_size().height;
+			if (picker_type == SHAPE_VHS_CIRCLE) {
+				v = 1.0 - (y / w_edit->get_size().height);
+			} else {
+				h = y / w_edit->get_size().height;
+			}
 		} else {
 			changing_color = false;
 		}
@@ -617,7 +805,11 @@ void ColorPicker::_w_input(const Ref<InputEvent> &p_event) {
 			return;
 		}
 		float y = CLAMP((float)mev->get_position().y, 0, w_edit->get_size().height);
-		h = y / w_edit->get_size().height;
+		if (picker_type == SHAPE_VHS_CIRCLE) {
+			v = 1.0 - (y / w_edit->get_size().height);
+		} else {
+			h = y / w_edit->get_size().height;
+		}
 		color.set_hsv(h, s, v, color.a);
 		last_hsv = color;
 		set_pick_color(color);
@@ -801,18 +993,25 @@ void ColorPicker::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("add_preset", "color"), &ColorPicker::add_preset);
 	ClassDB::bind_method(D_METHOD("erase_preset", "color"), &ColorPicker::erase_preset);
 	ClassDB::bind_method(D_METHOD("get_presets"), &ColorPicker::get_presets);
+	ClassDB::bind_method(D_METHOD("set_picker_shape", "picker"), &ColorPicker::set_picker_shape);
+	ClassDB::bind_method(D_METHOD("get_picker_shape"), &ColorPicker::get_picker_shape);
 
 	ADD_PROPERTY(PropertyInfo(Variant::COLOR, "color"), "set_pick_color", "get_pick_color");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "edit_alpha"), "set_edit_alpha", "is_editing_alpha");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "hsv_mode"), "set_hsv_mode", "is_hsv_mode");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "raw_mode"), "set_raw_mode", "is_raw_mode");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "deferred_mode"), "set_deferred_mode", "is_deferred_mode");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "picker_shape", PROPERTY_HINT_ENUM, "HSV Rectangle,HSV Rectangle Wheel,VHS Circle"), "set_picker_shape", "get_picker_shape");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "presets_enabled"), "set_presets_enabled", "are_presets_enabled");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "presets_visible"), "set_presets_visible", "are_presets_visible");
 
 	ADD_SIGNAL(MethodInfo("color_changed", PropertyInfo(Variant::COLOR, "color")));
 	ADD_SIGNAL(MethodInfo("preset_added", PropertyInfo(Variant::COLOR, "color")));
 	ADD_SIGNAL(MethodInfo("preset_removed", PropertyInfo(Variant::COLOR, "color")));
+
+	BIND_ENUM_CONSTANT(SHAPE_HSV_RECTANGLE);
+	BIND_ENUM_CONSTANT(SHAPE_HSV_WHEEL);
+	BIND_ENUM_CONSTANT(SHAPE_VHS_CIRCLE);
 }
 
 ColorPicker::ColorPicker() :
@@ -821,32 +1020,21 @@ ColorPicker::ColorPicker() :
 	add_child(hb_edit);
 	hb_edit->set_v_size_flags(SIZE_EXPAND_FILL);
 
-	uv_edit = memnew(Control);
 	hb_edit->add_child(uv_edit);
-	uv_edit->connect("gui_input", callable_mp(this, &ColorPicker::_uv_input));
+	uv_edit->connect("gui_input", callable_mp(this, &ColorPicker::_uv_input), make_binds(uv_edit));
 	uv_edit->set_mouse_filter(MOUSE_FILTER_PASS);
 	uv_edit->set_h_size_flags(SIZE_EXPAND_FILL);
 	uv_edit->set_v_size_flags(SIZE_EXPAND_FILL);
 	uv_edit->set_custom_minimum_size(Size2(get_theme_constant("sv_width"), get_theme_constant("sv_height")));
 	uv_edit->connect("draw", callable_mp(this, &ColorPicker::_hsv_draw), make_binds(0, uv_edit));
 
-	w_edit = memnew(Control);
-	hb_edit->add_child(w_edit);
-	w_edit->set_custom_minimum_size(Size2(get_theme_constant("h_width"), 0));
-	w_edit->set_h_size_flags(SIZE_FILL);
-	w_edit->set_v_size_flags(SIZE_EXPAND_FILL);
-	w_edit->connect("gui_input", callable_mp(this, &ColorPicker::_w_input));
-	w_edit->connect("draw", callable_mp(this, &ColorPicker::_hsv_draw), make_binds(1, w_edit));
-
 	HBoxContainer *hb_smpl = memnew(HBoxContainer);
 	add_child(hb_smpl);
 
-	sample = memnew(TextureRect);
 	hb_smpl->add_child(sample);
 	sample->set_h_size_flags(SIZE_EXPAND_FILL);
 	sample->connect("draw", callable_mp(this, &ColorPicker::_sample_draw));
 
-	btn_pick = memnew(Button);
 	btn_pick->set_flat(true);
 	hb_smpl->add_child(btn_pick);
 	btn_pick->set_toggle_mode(true);
@@ -896,17 +1084,14 @@ ColorPicker::ColorPicker() :
 	HBoxContainer *hhb = memnew(HBoxContainer);
 	vbr->add_child(hhb);
 
-	btn_hsv = memnew(CheckButton);
 	hhb->add_child(btn_hsv);
 	btn_hsv->set_text(RTR("HSV"));
 	btn_hsv->connect("toggled", callable_mp(this, &ColorPicker::set_hsv_mode));
 
-	btn_raw = memnew(CheckButton);
 	hhb->add_child(btn_raw);
 	btn_raw->set_text(RTR("Raw"));
 	btn_raw->connect("toggled", callable_mp(this, &ColorPicker::set_raw_mode));
 
-	text_type = memnew(Button);
 	hhb->add_child(text_type);
 	text_type->set_text("#");
 	text_type->set_tooltip(TTR("Switch between hexadecimal and code values."));
@@ -920,34 +1105,68 @@ ColorPicker::ColorPicker() :
 		text_type->set_mouse_filter(MOUSE_FILTER_IGNORE);
 	}
 
-	c_text = memnew(LineEdit);
 	hhb->add_child(c_text);
 	c_text->set_h_size_flags(SIZE_EXPAND_FILL);
 	c_text->connect("text_entered", callable_mp(this, &ColorPicker::_html_entered));
 	c_text->connect("focus_entered", callable_mp(this, &ColorPicker::_focus_enter));
 	c_text->connect("focus_exited", callable_mp(this, &ColorPicker::_html_focus_exit));
 
+	wheel_edit->set_h_size_flags(SIZE_EXPAND_FILL);
+	wheel_edit->set_v_size_flags(SIZE_EXPAND_FILL);
+	wheel_edit->set_custom_minimum_size(Size2(get_theme_constant("sv_width"), get_theme_constant("sv_height")));
+	hb_edit->add_child(wheel_edit);
+
+	wheel_mat.instance();
+	circle_mat.instance();
+
+	Ref<Shader> wheel_shader(memnew(Shader));
+	wheel_shader->set_code("shader_type canvas_item;const float TAU=6.28318530718;void fragment(){float x=UV.x-0.5;float y=UV.y-0.5;float a=atan(y,x);x+=0.001;y+=0.001;float b=float(sqrt(x*x+y*y)<0.5)*float(sqrt(x*x+y*y)>0.42);x-=0.002;float b2=float(sqrt(x*x+y*y)<0.5)*float(sqrt(x*x+y*y)>0.42);y-=0.002;float b3=float(sqrt(x*x+y*y)<0.5)*float(sqrt(x*x+y*y)>0.42);x+=0.002;float b4=float(sqrt(x*x+y*y)<0.5)*float(sqrt(x*x+y*y)>0.42);COLOR=vec4(clamp((abs(fract(((a-TAU)/TAU)+vec3(3.0,2.0,1.0)/3.0)*6.0-3.0)-1.0),0.0,1.0),(b+b2+b3+b4)/4.00);}");
+	wheel_mat->set_shader(wheel_shader);
+
+	Ref<Shader> circle_shader(memnew(Shader));
+	circle_shader->set_code("shader_type canvas_item;const float TAU=6.28318530718;uniform float v=1.0;void fragment(){float x=UV.x-0.5;float y=UV.y-0.5;float a=atan(y,x);x+=0.001;y+=0.001;float b=float(sqrt(x*x+y*y)<0.5);x-=0.002;float b2=float(sqrt(x*x+y*y)<0.5);y-=0.002;float b3=float(sqrt(x*x+y*y)<0.5);x+=0.002;float b4=float(sqrt(x*x+y*y)<0.5);COLOR=vec4(mix(vec3(1.0),clamp(abs(fract(vec3((a-TAU)/TAU)+vec3(1.0,2.0/3.0,1.0/3.0))*6.0-vec3(3.0))-vec3(1.0),0.0,1.0),((float(sqrt(x*x+y*y))*2.0))/1.0)*vec3(v),(b+b2+b3+b4)/4.00);}");
+	circle_mat->set_shader(circle_shader);
+
+	MarginContainer *wheel_margin(memnew(MarginContainer));
+#ifdef TOOLS_ENABLED
+	wheel_margin->add_theme_constant_override("margin_bottom", 8 * EDSCALE);
+#else
+	wheel_margin->add_theme_constant_override("margin_bottom", 8);
+#endif
+	wheel_edit->add_child(wheel_margin);
+
+	wheel_margin->add_child(wheel);
+	wheel->set_mouse_filter(MOUSE_FILTER_PASS);
+	wheel->connect("draw", callable_mp(this, &ColorPicker::_hsv_draw), make_binds(2, wheel));
+
+	wheel_margin->add_child(wheel_uv);
+	wheel_uv->connect("gui_input", callable_mp(this, &ColorPicker::_uv_input), make_binds(wheel_uv));
+	wheel_uv->connect("draw", callable_mp(this, &ColorPicker::_hsv_draw), make_binds(0, wheel_uv));
+
+	hb_edit->add_child(w_edit);
+	w_edit->set_custom_minimum_size(Size2(get_theme_constant("h_width"), 0));
+	w_edit->set_h_size_flags(SIZE_FILL);
+	w_edit->set_v_size_flags(SIZE_EXPAND_FILL);
+	w_edit->connect("gui_input", callable_mp(this, &ColorPicker::_w_input));
+	w_edit->connect("draw", callable_mp(this, &ColorPicker::_hsv_draw), make_binds(1, w_edit));
+
+	picker_type = SHAPE_HSV_RECTANGLE;
 	_update_controls();
 	updating = false;
 
 	set_pick_color(Color(1, 1, 1));
 
-	preset_separator = memnew(HSeparator);
 	add_child(preset_separator);
 
-	preset_container = memnew(HBoxContainer);
 	preset_container->set_h_size_flags(SIZE_EXPAND_FILL);
 	add_child(preset_container);
 
-	preset = memnew(TextureRect);
 	preset_container->add_child(preset);
 	preset->connect("gui_input", callable_mp(this, &ColorPicker::_preset_input));
 	preset->connect("draw", callable_mp(this, &ColorPicker::_update_presets));
 
-	preset_container2 = memnew(HBoxContainer);
 	preset_container2->set_h_size_flags(SIZE_EXPAND_FILL);
 	add_child(preset_container2);
-	bt_add_preset = memnew(Button);
 	preset_container2->add_child(bt_add_preset);
 	bt_add_preset->set_tooltip(RTR("Add current color as a preset."));
 	bt_add_preset->connect("pressed", callable_mp(this, &ColorPicker::_add_preset_pressed));
diff --git a/scene/gui/color_picker.h b/scene/gui/color_picker.h
index 24e1746c41..a0d2aa95ca 100644
--- a/scene/gui/color_picker.h
+++ b/scene/gui/color_picker.h
@@ -31,6 +31,7 @@
 #ifndef COLOR_PICKER_H
 #define COLOR_PICKER_H
 
+#include "scene/gui/aspect_ratio_container.h"
 #include "scene/gui/box_container.h"
 #include "scene/gui/button.h"
 #include "scene/gui/check_button.h"
@@ -45,29 +46,44 @@
 class ColorPicker : public BoxContainer {
 	GDCLASS(ColorPicker, BoxContainer);
 
+public:
+	enum PickerShapeType {
+		SHAPE_HSV_RECTANGLE,
+		SHAPE_HSV_WHEEL,
+		SHAPE_VHS_CIRCLE,
+
+		SHAPE_MAX
+	};
+
 private:
 	Control *screen = nullptr;
-	Control *uv_edit;
-	Control *w_edit;
-	TextureRect *sample;
-	TextureRect *preset;
-	HBoxContainer *preset_container;
-	HBoxContainer *preset_container2;
-	HSeparator *preset_separator;
-	Button *bt_add_preset;
+	Control *uv_edit = memnew(Control);
+	Control *w_edit = memnew(Control);
+	AspectRatioContainer *wheel_edit = memnew(AspectRatioContainer);
+	Ref<ShaderMaterial> wheel_mat;
+	Ref<ShaderMaterial> circle_mat;
+	Control *wheel = memnew(Control);
+	Control *wheel_uv = memnew(Control);
+	TextureRect *sample = memnew(TextureRect);
+	TextureRect *preset = memnew(TextureRect);
+	HBoxContainer *preset_container = memnew(HBoxContainer);
+	HBoxContainer *preset_container2 = memnew(HBoxContainer);
+	HSeparator *preset_separator = memnew(HSeparator);
+	Button *bt_add_preset = memnew(Button);
 	List<Color> presets;
-	Button *btn_pick;
-	CheckButton *btn_hsv;
-	CheckButton *btn_raw;
+	Button *btn_pick = memnew(Button);
+	CheckButton *btn_hsv = memnew(CheckButton);
+	CheckButton *btn_raw = memnew(CheckButton);
 	HSlider *scroll[4];
 	SpinBox *values[4];
 	Label *labels[4];
-	Button *text_type;
-	LineEdit *c_text;
+	Button *text_type = memnew(Button);
+	LineEdit *c_text = memnew(LineEdit);
 	bool edit_alpha = true;
 	Size2i ms;
 	bool text_is_constructor = false;
 	int presets_per_row = 0;
+	PickerShapeType picker_type = SHAPE_HSV_WHEEL;
 
 	Color color;
 	bool raw_mode_enabled = false;
@@ -75,6 +91,7 @@ private:
 	bool deferred_mode_enabled = false;
 	bool updating = true;
 	bool changing_color = false;
+	bool spinning = false;
 	bool presets_enabled = true;
 	bool presets_visible = true;
 	float h = 0.0;
@@ -93,7 +110,7 @@ private:
 	void _hsv_draw(int p_which, Control *c);
 	void _slider_draw(int p_which);
 
-	void _uv_input(const Ref<InputEvent> &p_event);
+	void _uv_input(const Ref<InputEvent> &p_event, Control *c);
 	void _w_input(const Ref<InputEvent> &p_event);
 	void _preset_input(const Ref<InputEvent> &p_event);
 	void _screen_input(const Ref<InputEvent> &p_event);
@@ -115,6 +132,9 @@ public:
 	void set_pick_color(const Color &p_color);
 	Color get_pick_color() const;
 
+	void set_picker_shape(PickerShapeType p_picker_type);
+	PickerShapeType get_picker_shape() const;
+
 	void add_preset(const Color &p_color);
 	void erase_preset(const Color &p_color);
 	PackedColorArray get_presets() const;
@@ -175,4 +195,5 @@ public:
 	ColorPickerButton();
 };
 
+VARIANT_ENUM_CAST(ColorPicker::PickerShapeType);
 #endif // COLOR_PICKER_H
diff --git a/scene/gui/container.cpp b/scene/gui/container.cpp
index 2e6b798eea..dea69aae6b 100644
--- a/scene/gui/container.cpp
+++ b/scene/gui/container.cpp
@@ -159,16 +159,14 @@ void Container::_notification(int p_what) {
 	}
 }
 
-String Container::get_configuration_warning() const {
-	String warning = Control::get_configuration_warning();
+TypedArray<String> Container::get_configuration_warnings() const {
+	TypedArray<String> warnings = Control::get_configuration_warnings();
 
 	if (get_class() == "Container" && get_script().is_null()) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("Container by itself serves no purpose unless a script configures its children placement behavior.\nIf you don't intend to add a script, use a plain Control node instead.");
+		warnings.push_back(TTR("Container by itself serves no purpose unless a script configures its children placement behavior.\nIf you don't intend to add a script, use a plain Control node instead."));
 	}
-	return warning;
+
+	return warnings;
 }
 
 void Container::_bind_methods() {
diff --git a/scene/gui/container.h b/scene/gui/container.h
index a4f392a3ae..bce3085f0c 100644
--- a/scene/gui/container.h
+++ b/scene/gui/container.h
@@ -56,7 +56,7 @@ public:
 
 	void fit_child_in_rect(Control *p_child, const Rect2 &p_rect);
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	Container();
 };
diff --git a/scene/gui/control.cpp b/scene/gui/control.cpp
index 300201c0db..191f94b2b8 100644
--- a/scene/gui/control.cpp
+++ b/scene/gui/control.cpp
@@ -2183,7 +2183,7 @@ Ref<Theme> Control::get_theme() const {
 
 void Control::set_tooltip(const String &p_tooltip) {
 	data.tooltip = p_tooltip;
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 String Control::get_tooltip(const Point2 &p_pos) const {
@@ -2468,7 +2468,7 @@ int Control::get_v_size_flags() const {
 void Control::set_mouse_filter(MouseFilter p_filter) {
 	ERR_FAIL_INDEX(p_filter, 3);
 	data.mouse_filter = p_filter;
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 Control::MouseFilter Control::get_mouse_filter() const {
@@ -2688,15 +2688,15 @@ void Control::get_argument_options(const StringName &p_function, int p_idx, List
 	if (p_idx == 0) {
 		List<StringName> sn;
 		String pf = p_function;
-		if (pf == "add_color_override" || pf == "has_color" || pf == "has_color_override" || pf == "get_color") {
+		if (pf == "add_theme_color_override" || pf == "has_theme_color" || pf == "has_theme_color_override" || pf == "get_theme_color") {
 			Theme::get_default()->get_color_list(get_class(), &sn);
-		} else if (pf == "add_style_override" || pf == "has_style" || pf == "has_style_override" || pf == "get_style") {
+		} else if (pf == "add_theme_style_override" || pf == "has_theme_style" || pf == "has_theme_style_override" || pf == "get_theme_style") {
 			Theme::get_default()->get_stylebox_list(get_class(), &sn);
-		} else if (pf == "add_font_override" || pf == "has_font" || pf == "has_font_override" || pf == "get_font") {
+		} else if (pf == "add_theme_font_override" || pf == "has_theme_font" || pf == "has_theme_font_override" || pf == "get_theme_font") {
 			Theme::get_default()->get_font_list(get_class(), &sn);
-		} else if (pf == "add_font_size_override" || pf == "has_font_size" || pf == "has_font_size_override" || pf == "get_font_size") {
+		} else if (pf == "add_theme_font_size_override" || pf == "has_theme_font_size" || pf == "has_theme_font_size_override" || pf == "get_theme_font_size") {
 			Theme::get_default()->get_font_size_list(get_class(), &sn);
-		} else if (pf == "add_constant_override" || pf == "has_constant" || pf == "has_constant_override" || pf == "get_constant") {
+		} else if (pf == "add_theme_constant_override" || pf == "has_theme_constant" || pf == "has_theme_constant_override" || pf == "get_theme_constant") {
 			Theme::get_default()->get_constant_list(get_class(), &sn);
 		}
 
@@ -2707,17 +2707,14 @@ void Control::get_argument_options(const StringName &p_function, int p_idx, List
 	}
 }
 
-String Control::get_configuration_warning() const {
-	String warning = CanvasItem::get_configuration_warning();
+TypedArray<String> Control::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (data.mouse_filter == MOUSE_FILTER_IGNORE && data.tooltip != "") {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("The Hint Tooltip won't be displayed as the control's Mouse Filter is set to \"Ignore\". To solve this, set the Mouse Filter to \"Stop\" or \"Pass\".");
+		warnings.push_back(TTR("The Hint Tooltip won't be displayed as the control's Mouse Filter is set to \"Ignore\". To solve this, set the Mouse Filter to \"Stop\" or \"Pass\"."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void Control::set_clip_contents(bool p_clip) {
diff --git a/scene/gui/control.h b/scene/gui/control.h
index 184b2df6d3..1f397df589 100644
--- a/scene/gui/control.h
+++ b/scene/gui/control.h
@@ -524,7 +524,7 @@ public:
 	bool is_visibility_clip_disabled() const;
 
 	virtual void get_argument_options(const StringName &p_function, int p_idx, List<String> *r_options) const override;
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	Control() {}
 };
diff --git a/scene/gui/line_edit.cpp b/scene/gui/line_edit.cpp
index 2d8eb3191c..1aff5d5390 100644
--- a/scene/gui/line_edit.cpp
+++ b/scene/gui/line_edit.cpp
@@ -51,13 +51,13 @@ void LineEdit::_swap_current_input_direction() {
 	} else {
 		input_direction = TEXT_DIRECTION_LTR;
 	}
-	set_cursor_position(get_cursor_position());
+	set_caret_column(get_caret_column());
 	update();
 }
 
-void LineEdit::_move_cursor_left(bool p_select, bool p_move_by_word) {
+void LineEdit::_move_caret_left(bool p_select, bool p_move_by_word) {
 	if (selection.enabled && !p_select) {
-		set_cursor_position(selection.begin);
+		set_caret_column(selection.begin);
 		deselect();
 		return;
 	}
@@ -65,7 +65,7 @@ void LineEdit::_move_cursor_left(bool p_select, bool p_move_by_word) {
 	shift_selection_check_pre(p_select);
 
 	if (p_move_by_word) {
-		int cc = cursor_pos;
+		int cc = caret_column;
 
 		Vector<Vector2i> words = TS->shaped_text_get_word_breaks(text_rid);
 		for (int i = words.size() - 1; i >= 0; i--) {
@@ -75,21 +75,21 @@ void LineEdit::_move_cursor_left(bool p_select, bool p_move_by_word) {
 			}
 		}
 
-		set_cursor_position(cc);
+		set_caret_column(cc);
 	} else {
-		if (mid_grapheme_caret_enabled) {
-			set_cursor_position(get_cursor_position() - 1);
+		if (caret_mid_grapheme_enabled) {
+			set_caret_column(get_caret_column() - 1);
 		} else {
-			set_cursor_position(TS->shaped_text_prev_grapheme_pos(text_rid, get_cursor_position()));
+			set_caret_column(TS->shaped_text_prev_grapheme_pos(text_rid, get_caret_column()));
 		}
 	}
 
 	shift_selection_check_post(p_select);
 }
 
-void LineEdit::_move_cursor_right(bool p_select, bool p_move_by_word) {
+void LineEdit::_move_caret_right(bool p_select, bool p_move_by_word) {
 	if (selection.enabled && !p_select) {
-		set_cursor_position(selection.end);
+		set_caret_column(selection.end);
 		deselect();
 		return;
 	}
@@ -97,7 +97,7 @@ void LineEdit::_move_cursor_right(bool p_select, bool p_move_by_word) {
 	shift_selection_check_pre(p_select);
 
 	if (p_move_by_word) {
-		int cc = cursor_pos;
+		int cc = caret_column;
 
 		Vector<Vector2i> words = TS->shaped_text_get_word_breaks(text_rid);
 		for (int i = 0; i < words.size(); i++) {
@@ -107,27 +107,27 @@ void LineEdit::_move_cursor_right(bool p_select, bool p_move_by_word) {
 			}
 		}
 
-		set_cursor_position(cc);
+		set_caret_column(cc);
 	} else {
-		if (mid_grapheme_caret_enabled) {
-			set_cursor_position(get_cursor_position() + 1);
+		if (caret_mid_grapheme_enabled) {
+			set_caret_column(get_caret_column() + 1);
 		} else {
-			set_cursor_position(TS->shaped_text_next_grapheme_pos(text_rid, get_cursor_position()));
+			set_caret_column(TS->shaped_text_next_grapheme_pos(text_rid, get_caret_column()));
 		}
 	}
 
 	shift_selection_check_post(p_select);
 }
 
-void LineEdit::_move_cursor_start(bool p_select) {
+void LineEdit::_move_caret_start(bool p_select) {
 	shift_selection_check_pre(p_select);
-	set_cursor_position(0);
+	set_caret_column(0);
 	shift_selection_check_post(p_select);
 }
 
-void LineEdit::_move_cursor_end(bool p_select) {
+void LineEdit::_move_caret_end(bool p_select) {
 	shift_selection_check_pre(p_select);
-	set_cursor_position(text.length());
+	set_caret_column(text.length());
 	shift_selection_check_post(p_select);
 }
 
@@ -138,7 +138,7 @@ void LineEdit::_backspace(bool p_word, bool p_all_to_left) {
 
 	if (p_all_to_left) {
 		deselect();
-		text = text.substr(0, cursor_pos);
+		text = text.substr(0, caret_column);
 		_text_changed();
 		return;
 	}
@@ -149,18 +149,19 @@ void LineEdit::_backspace(bool p_word, bool p_all_to_left) {
 	}
 
 	if (p_word) {
-		int cc = cursor_pos;
+		int cc = caret_column;
 
 		Vector<Vector2i> words = TS->shaped_text_get_word_breaks(text_rid);
 		for (int i = words.size() - 1; i >= 0; i--) {
 			if (words[i].x < cc) {
 				cc = words[i].x;
+				break;
 			}
 		}
 
-		delete_text(cc, cursor_pos);
+		delete_text(cc, caret_column);
 
-		set_cursor_position(cc);
+		set_caret_column(cc);
 	} else {
 		delete_char();
 	}
@@ -173,9 +174,9 @@ void LineEdit::_delete(bool p_word, bool p_all_to_right) {
 
 	if (p_all_to_right) {
 		deselect();
-		text = text.substr(cursor_pos, text.length() - cursor_pos);
+		text = text.substr(caret_column, text.length() - caret_column);
 		_shape();
-		set_cursor_position(0);
+		set_caret_column(0);
 		_text_changed();
 		return;
 	}
@@ -187,12 +188,12 @@ void LineEdit::_delete(bool p_word, bool p_all_to_right) {
 
 	int text_len = text.length();
 
-	if (cursor_pos == text_len) {
+	if (caret_column == text_len) {
 		return; // Nothing to do.
 	}
 
 	if (p_word) {
-		int cc = cursor_pos;
+		int cc = caret_column;
 		Vector<Vector2i> words = TS->shaped_text_get_word_breaks(text_rid);
 		for (int i = 0; i < words.size(); i++) {
 			if (words[i].y > cc) {
@@ -201,15 +202,16 @@ void LineEdit::_delete(bool p_word, bool p_all_to_right) {
 			}
 		}
 
-		delete_text(cursor_pos, cc);
+		delete_text(caret_column, cc);
+		set_caret_column(caret_column);
 	} else {
-		if (mid_grapheme_caret_enabled) {
-			set_cursor_position(cursor_pos + 1);
+		if (caret_mid_grapheme_enabled) {
+			set_caret_column(caret_column + 1);
 			delete_char();
 		} else {
-			int cc = cursor_pos;
-			set_cursor_position(TS->shaped_text_next_grapheme_pos(text_rid, cursor_pos));
-			delete_text(cc, cursor_pos);
+			int cc = caret_column;
+			set_caret_column(TS->shaped_text_next_grapheme_pos(text_rid, caret_column));
+			delete_text(cc, caret_column);
 		}
 	}
 }
@@ -250,10 +252,10 @@ void LineEdit::_gui_input(Ref<InputEvent> p_event) {
 
 			shift_selection_check_pre(b->get_shift());
 
-			set_cursor_at_pixel_pos(b->get_position().x);
+			set_caret_at_pixel_pos(b->get_position().x);
 
 			if (b->get_shift()) {
-				selection_fill_at_cursor();
+				selection_fill_at_caret();
 				selection.creating = true;
 
 			} else {
@@ -265,18 +267,18 @@ void LineEdit::_gui_input(Ref<InputEvent> p_event) {
 						selection.end = text.length();
 						selection.doubleclick = true;
 						selection.last_dblclk = 0;
-						cursor_pos = selection.begin;
+						caret_column = selection.begin;
 					} else if (b->is_doubleclick()) {
 						// Double-click select word.
 						Vector<Vector2i> words = TS->shaped_text_get_word_breaks(text_rid);
 						for (int i = 0; i < words.size(); i++) {
-							if (words[i].x < cursor_pos && words[i].y > cursor_pos) {
+							if (words[i].x < caret_column && words[i].y > caret_column) {
 								selection.enabled = true;
 								selection.begin = words[i].x;
 								selection.end = words[i].y;
 								selection.doubleclick = true;
 								selection.last_dblclk = OS::get_singleton()->get_ticks_msec();
-								cursor_pos = selection.end;
+								caret_column = selection.end;
 								break;
 							}
 						}
@@ -285,9 +287,9 @@ void LineEdit::_gui_input(Ref<InputEvent> p_event) {
 
 				selection.drag_attempt = false;
 
-				if ((cursor_pos < selection.begin) || (cursor_pos > selection.end) || !selection.enabled) {
+				if ((caret_column < selection.begin) || (caret_column > selection.end) || !selection.enabled) {
 					deselect();
-					selection.cursor_start = cursor_pos;
+					selection.start_column = caret_column;
 					selection.creating = true;
 				} else if (selection.enabled) {
 					selection.drag_attempt = true;
@@ -331,8 +333,8 @@ void LineEdit::_gui_input(Ref<InputEvent> p_event) {
 
 		if (m->get_button_mask() & MOUSE_BUTTON_LEFT) {
 			if (selection.creating) {
-				set_cursor_at_pixel_pos(m->get_position().x);
-				selection_fill_at_cursor();
+				set_caret_at_pixel_pos(m->get_position().x);
+				selection_fill_at_caret();
 			}
 		}
 	}
@@ -346,7 +348,7 @@ void LineEdit::_gui_input(Ref<InputEvent> p_event) {
 
 		if (context_menu_enabled) {
 			if (k->is_action("ui_menu", true)) {
-				Point2 pos = Point2(get_cursor_pixel_pos().x, (get_size().y + get_theme_font("font")->get_height(get_theme_font_size("font_size"))) / 2);
+				Point2 pos = Point2(get_caret_pixel_pos().x, (get_size().y + get_theme_font("font")->get_height(get_theme_font_size("font_size"))) / 2);
 				menu->set_position(get_global_transform().xform(pos));
 				menu->set_size(Vector2(1, 1));
 				_generate_context_menu();
@@ -445,34 +447,34 @@ void LineEdit::_gui_input(Ref<InputEvent> p_event) {
 		k->set_shift(false);
 
 		if (k->is_action("ui_text_caret_word_left", true)) {
-			_move_cursor_left(shift_pressed, true);
+			_move_caret_left(shift_pressed, true);
 			accept_event();
 			return;
 		}
 		if (k->is_action("ui_text_caret_left", true)) {
-			_move_cursor_left(shift_pressed);
+			_move_caret_left(shift_pressed);
 			accept_event();
 			return;
 		}
 		if (k->is_action("ui_text_caret_word_right", true)) {
-			_move_cursor_right(shift_pressed, true);
+			_move_caret_right(shift_pressed, true);
 			accept_event();
 			return;
 		}
 		if (k->is_action("ui_text_caret_right", true)) {
-			_move_cursor_right(shift_pressed, false);
+			_move_caret_right(shift_pressed, false);
 			accept_event();
 			return;
 		}
 
 		// Up = Home, Down = End
 		if (k->is_action("ui_text_caret_up", true) || k->is_action("ui_text_caret_line_start", true) || k->is_action("ui_text_caret_page_up", true)) {
-			_move_cursor_start(shift_pressed);
+			_move_caret_start(shift_pressed);
 			accept_event();
 			return;
 		}
 		if (k->is_action("ui_text_caret_down", true) || k->is_action("ui_text_caret_line_end", true) || k->is_action("ui_text_caret_page_down", true)) {
-			_move_cursor_end(shift_pressed);
+			_move_caret_end(shift_pressed);
 			accept_event();
 			return;
 		}
@@ -495,7 +497,7 @@ void LineEdit::_gui_input(Ref<InputEvent> p_event) {
 			selection_delete();
 			char32_t ucodestr[2] = { (char32_t)k->get_unicode(), 0 };
 			int prev_len = text.length();
-			append_at_cursor(ucodestr);
+			insert_text_at_caret(ucodestr);
 			if (text.length() != prev_len) {
 				_text_changed();
 			}
@@ -542,15 +544,15 @@ void LineEdit::drop_data(const Point2 &p_point, const Variant &p_data) {
 	Control::drop_data(p_point, p_data);
 
 	if (p_data.get_type() == Variant::STRING) {
-		set_cursor_at_pixel_pos(p_point.x);
+		set_caret_at_pixel_pos(p_point.x);
 		int selected = selection.end - selection.begin;
 
 		text.erase(selection.begin, selected);
 		_shape();
 
-		append_at_cursor(p_data);
-		selection.begin = cursor_pos - selected;
-		selection.end = cursor_pos;
+		insert_text_at_caret(p_data);
+		selection.begin = caret_column - selected;
+		selection.end = caret_column;
 	}
 }
 
@@ -575,8 +577,8 @@ void LineEdit::_notification(int p_what) {
 #ifdef TOOLS_ENABLED
 		case NOTIFICATION_ENTER_TREE: {
 			if (Engine::get_singleton()->is_editor_hint() && !get_tree()->is_node_being_edited(this)) {
-				cursor_set_blink_enabled(EDITOR_DEF("text_editor/cursor/caret_blink", false));
-				cursor_set_blink_speed(EDITOR_DEF("text_editor/cursor/caret_blink_speed", 0.65));
+				set_caret_blink_enabled(EDITOR_DEF("text_editor/cursor/caret_blink", false));
+				set_caret_blink_speed(EDITOR_DEF("text_editor/cursor/caret_blink_speed", 0.65));
 
 				if (!EditorSettings::get_singleton()->is_connected("settings_changed", callable_mp(this, &LineEdit::_editor_settings_changed))) {
 					EditorSettings::get_singleton()->connect("settings_changed", callable_mp(this, &LineEdit::_editor_settings_changed));
@@ -587,7 +589,7 @@ void LineEdit::_notification(int p_what) {
 		case NOTIFICATION_RESIZED: {
 			_fit_to_width();
 			scroll_offset = 0;
-			set_cursor_position(get_cursor_position());
+			set_caret_column(get_caret_column());
 		} break;
 		case NOTIFICATION_LAYOUT_DIRECTION_CHANGED:
 		case NOTIFICATION_THEME_CHANGED: {
@@ -674,7 +676,7 @@ void LineEdit::_notification(int p_what) {
 			Color selection_color = get_theme_color("selection_color");
 			Color font_color = is_editable() ? get_theme_color("font_color") : get_theme_color("font_uneditable_color");
 			Color font_selected_color = get_theme_color("font_selected_color");
-			Color cursor_color = get_theme_color("cursor_color");
+			Color caret_color = get_theme_color("caret_color");
 
 			// Draw placeholder color.
 			if (using_placeholder) {
@@ -778,7 +780,7 @@ void LineEdit::_notification(int p_what) {
 					// Normal caret.
 					Rect2 l_caret, t_caret;
 					TextServer::Direction l_dir, t_dir;
-					TS->shaped_text_get_carets(text_rid, cursor_pos, l_caret, l_dir, t_caret, t_dir);
+					TS->shaped_text_get_carets(text_rid, caret_column, l_caret, l_dir, t_caret, t_dir);
 
 					if (l_caret == Rect2() && t_caret == Rect2()) {
 						// No carets, add one at the start.
@@ -791,28 +793,28 @@ void LineEdit::_notification(int p_what) {
 							l_dir = TextServer::DIRECTION_LTR;
 							l_caret = Rect2(Vector2(x_ofs, y), Size2(caret_width, h));
 						}
-						RenderingServer::get_singleton()->canvas_item_add_rect(ci, l_caret, cursor_color);
+						RenderingServer::get_singleton()->canvas_item_add_rect(ci, l_caret, caret_color);
 					} else {
 						if (l_caret != Rect2() && l_dir == TextServer::DIRECTION_AUTO) {
 							// Draw extra marker on top of mid caret.
 							Rect2 trect = Rect2(l_caret.position.x - 3 * caret_width, l_caret.position.y, 6 * caret_width, caret_width);
 							trect.position += ofs;
-							RenderingServer::get_singleton()->canvas_item_add_rect(ci, trect, cursor_color);
+							RenderingServer::get_singleton()->canvas_item_add_rect(ci, trect, caret_color);
 						}
 
 						l_caret.position += ofs;
 						l_caret.size.x = caret_width;
-						RenderingServer::get_singleton()->canvas_item_add_rect(ci, l_caret, cursor_color);
+						RenderingServer::get_singleton()->canvas_item_add_rect(ci, l_caret, caret_color);
 
 						t_caret.position += ofs;
 						t_caret.size.x = caret_width;
 
-						RenderingServer::get_singleton()->canvas_item_add_rect(ci, t_caret, cursor_color);
+						RenderingServer::get_singleton()->canvas_item_add_rect(ci, t_caret, caret_color);
 					}
 				} else {
 					{
 						// IME intermediate text range.
-						Vector<Vector2> sel = TS->shaped_text_get_selection(text_rid, cursor_pos, cursor_pos + ime_text.length());
+						Vector<Vector2> sel = TS->shaped_text_get_selection(text_rid, caret_column, caret_column + ime_text.length());
 						for (int i = 0; i < sel.size(); i++) {
 							Rect2 rect = Rect2(sel[i].x + ofs.x, ofs.y, sel[i].y - sel[i].x, text_height);
 							if (rect.position.x + rect.size.x <= x_ofs || rect.position.x > ofs_max) {
@@ -825,12 +827,12 @@ void LineEdit::_notification(int p_what) {
 								rect.size.x = ofs_max - rect.position.x;
 							}
 							rect.size.y = caret_width;
-							RenderingServer::get_singleton()->canvas_item_add_rect(ci, rect, cursor_color);
+							RenderingServer::get_singleton()->canvas_item_add_rect(ci, rect, caret_color);
 						}
 					}
 					{
 						// IME caret.
-						Vector<Vector2> sel = TS->shaped_text_get_selection(text_rid, cursor_pos + ime_selection.x, cursor_pos + ime_selection.x + ime_selection.y);
+						Vector<Vector2> sel = TS->shaped_text_get_selection(text_rid, caret_column + ime_selection.x, caret_column + ime_selection.x + ime_selection.y);
 						for (int i = 0; i < sel.size(); i++) {
 							Rect2 rect = Rect2(sel[i].x + ofs.x, ofs.y, sel[i].y - sel[i].x, text_height);
 							if (rect.position.x + rect.size.x <= x_ofs || rect.position.x > ofs_max) {
@@ -843,7 +845,7 @@ void LineEdit::_notification(int p_what) {
 								rect.size.x = ofs_max - rect.position.x;
 							}
 							rect.size.y = caret_width * 3;
-							RenderingServer::get_singleton()->canvas_item_add_rect(ci, rect, cursor_color);
+							RenderingServer::get_singleton()->canvas_item_add_rect(ci, rect, caret_color);
 						}
 					}
 				}
@@ -869,8 +871,8 @@ void LineEdit::_notification(int p_what) {
 
 			if (get_viewport()->get_window_id() != DisplayServer::INVALID_WINDOW_ID && DisplayServer::get_singleton()->has_feature(DisplayServer::FEATURE_IME)) {
 				DisplayServer::get_singleton()->window_set_ime_active(true, get_viewport()->get_window_id());
-				Point2 cursor_pos = Point2(get_cursor_position(), 1) * get_minimum_size().height;
-				DisplayServer::get_singleton()->window_set_ime_position(get_global_position() + cursor_pos, get_viewport()->get_window_id());
+				Point2 caret_column = Point2(get_caret_column(), 1) * get_minimum_size().height;
+				DisplayServer::get_singleton()->window_set_ime_position(get_global_position() + caret_column, get_viewport()->get_window_id());
 			}
 
 			show_virtual_keyboard();
@@ -887,7 +889,7 @@ void LineEdit::_notification(int p_what) {
 			ime_text = "";
 			ime_selection = Point2();
 			_shape();
-			set_cursor_position(cursor_pos); // Update scroll_offset
+			set_caret_column(caret_column); // Update scroll_offset
 
 			if (DisplayServer::get_singleton()->has_feature(DisplayServer::FEATURE_VIRTUAL_KEYBOARD) && virtual_keyboard_enabled) {
 				DisplayServer::get_singleton()->virtual_keyboard_hide();
@@ -899,7 +901,7 @@ void LineEdit::_notification(int p_what) {
 				ime_text = DisplayServer::get_singleton()->ime_get_text();
 				ime_selection = DisplayServer::get_singleton()->ime_get_selection();
 				_shape();
-				set_cursor_position(cursor_pos); // Update scroll_offset
+				set_caret_column(caret_column); // Update scroll_offset
 
 				update();
 			}
@@ -933,7 +935,7 @@ void LineEdit::paste_text() {
 		if (selection.enabled) {
 			selection_delete();
 		}
-		append_at_cursor(paste_buffer);
+		insert_text_at_caret(paste_buffer);
 
 		if (!text_changed_dirty) {
 			if (is_inside_tree() && text.length() != prev_len) {
@@ -961,7 +963,7 @@ void LineEdit::undo() {
 	TextOperation op = undo_stack_pos->get();
 	text = op.text;
 	scroll_offset = op.scroll_offset;
-	set_cursor_position(op.cursor_pos);
+	set_caret_column(op.caret_column);
 
 	_shape();
 	_emit_text_change();
@@ -982,7 +984,7 @@ void LineEdit::redo() {
 	TextOperation op = undo_stack_pos->get();
 	text = op.text;
 	scroll_offset = op.scroll_offset;
-	set_cursor_position(op.cursor_pos);
+	set_caret_column(op.caret_column);
 
 	_shape();
 	_emit_text_change();
@@ -990,7 +992,7 @@ void LineEdit::redo() {
 
 void LineEdit::shift_selection_check_pre(bool p_shift) {
 	if (!selection.enabled && p_shift) {
-		selection.cursor_start = cursor_pos;
+		selection.start_column = caret_column;
 	}
 	if (!p_shift) {
 		deselect();
@@ -999,11 +1001,11 @@ void LineEdit::shift_selection_check_pre(bool p_shift) {
 
 void LineEdit::shift_selection_check_post(bool p_shift) {
 	if (p_shift) {
-		selection_fill_at_cursor();
+		selection_fill_at_caret();
 	}
 }
 
-void LineEdit::set_cursor_at_pixel_pos(int p_x) {
+void LineEdit::set_caret_at_pixel_pos(int p_x) {
 	Ref<StyleBox> style = get_theme_stylebox("normal");
 	bool rtl = is_layout_rtl();
 
@@ -1048,10 +1050,10 @@ void LineEdit::set_cursor_at_pixel_pos(int p_x) {
 	}
 
 	int ofs = TS->shaped_text_hit_test_position(text_rid, p_x - x_ofs - scroll_offset);
-	set_cursor_position(ofs);
+	set_caret_column(ofs);
 }
 
-Vector2i LineEdit::get_cursor_pixel_pos() {
+Vector2i LineEdit::get_caret_pixel_pos() {
 	Ref<StyleBox> style = get_theme_stylebox("normal");
 	bool rtl = is_layout_rtl();
 
@@ -1100,9 +1102,9 @@ Vector2i LineEdit::get_cursor_pixel_pos() {
 	TextServer::Direction l_dir, t_dir;
 	// Get position of the start of caret.
 	if (ime_text.length() != 0 && ime_selection.x != 0) {
-		TS->shaped_text_get_carets(text_rid, cursor_pos + ime_selection.x, l_caret, l_dir, t_caret, t_dir);
+		TS->shaped_text_get_carets(text_rid, caret_column + ime_selection.x, l_caret, l_dir, t_caret, t_dir);
 	} else {
-		TS->shaped_text_get_carets(text_rid, cursor_pos, l_caret, l_dir, t_caret, t_dir);
+		TS->shaped_text_get_carets(text_rid, caret_column, l_caret, l_dir, t_caret, t_dir);
 	}
 
 	if ((l_caret != Rect2() && (l_dir == TextServer::DIRECTION_AUTO || l_dir == (TextServer::Direction)input_direction)) || (t_caret == Rect2())) {
@@ -1114,9 +1116,9 @@ Vector2i LineEdit::get_cursor_pixel_pos() {
 	// Get position of the end of caret.
 	if (ime_text.length() != 0) {
 		if (ime_selection.y != 0) {
-			TS->shaped_text_get_carets(text_rid, cursor_pos + ime_selection.x + ime_selection.y, l_caret, l_dir, t_caret, t_dir);
+			TS->shaped_text_get_carets(text_rid, caret_column + ime_selection.x + ime_selection.y, l_caret, l_dir, t_caret, t_dir);
 		} else {
-			TS->shaped_text_get_carets(text_rid, cursor_pos + ime_text.size(), l_caret, l_dir, t_caret, t_dir);
+			TS->shaped_text_get_carets(text_rid, caret_column + ime_text.size(), l_caret, l_dir, t_caret, t_dir);
 		}
 		if ((l_caret != Rect2() && (l_dir == TextServer::DIRECTION_AUTO || l_dir == (TextServer::Direction)input_direction)) || (t_caret == Rect2())) {
 			ret.y = x_ofs + l_caret.position.x + scroll_offset;
@@ -1130,19 +1132,19 @@ Vector2i LineEdit::get_cursor_pixel_pos() {
 	return ret;
 }
 
-void LineEdit::set_mid_grapheme_caret_enabled(const bool p_enabled) {
-	mid_grapheme_caret_enabled = p_enabled;
+void LineEdit::set_caret_mid_grapheme_enabled(const bool p_enabled) {
+	caret_mid_grapheme_enabled = p_enabled;
 }
 
-bool LineEdit::get_mid_grapheme_caret_enabled() const {
-	return mid_grapheme_caret_enabled;
+bool LineEdit::is_caret_mid_grapheme_enabled() const {
+	return caret_mid_grapheme_enabled;
 }
 
-bool LineEdit::cursor_get_blink_enabled() const {
+bool LineEdit::is_caret_blink_enabled() const {
 	return caret_blink_enabled;
 }
 
-void LineEdit::cursor_set_blink_enabled(const bool p_enabled) {
+void LineEdit::set_caret_blink_enabled(const bool p_enabled) {
 	caret_blink_enabled = p_enabled;
 
 	if (has_focus() || caret_force_displayed) {
@@ -1160,21 +1162,21 @@ void LineEdit::cursor_set_blink_enabled(const bool p_enabled) {
 	notify_property_list_changed();
 }
 
-bool LineEdit::cursor_get_force_displayed() const {
+bool LineEdit::is_caret_force_displayed() const {
 	return caret_force_displayed;
 }
 
-void LineEdit::cursor_set_force_displayed(const bool p_enabled) {
+void LineEdit::set_caret_force_displayed(const bool p_enabled) {
 	caret_force_displayed = p_enabled;
-	cursor_set_blink_enabled(caret_blink_enabled);
+	set_caret_blink_enabled(caret_blink_enabled);
 	update();
 }
 
-float LineEdit::cursor_get_blink_speed() const {
+float LineEdit::get_caret_blink_speed() const {
 	return caret_blink_timer->get_wait_time();
 }
 
-void LineEdit::cursor_set_blink_speed(const float p_speed) {
+void LineEdit::set_caret_blink_speed(const float p_speed) {
 	ERR_FAIL_COND(p_speed <= 0);
 	caret_blink_timer->set_wait_time(p_speed);
 }
@@ -1198,14 +1200,14 @@ void LineEdit::_toggle_draw_caret() {
 }
 
 void LineEdit::delete_char() {
-	if ((text.length() <= 0) || (cursor_pos == 0)) {
+	if ((text.length() <= 0) || (caret_column == 0)) {
 		return;
 	}
 
-	text.erase(cursor_pos - 1, 1);
+	text.erase(caret_column - 1, 1);
 	_shape();
 
-	set_cursor_position(get_cursor_position() - 1);
+	set_caret_column(get_caret_column() - 1);
 
 	_text_changed();
 }
@@ -1217,10 +1219,10 @@ void LineEdit::delete_text(int p_from_column, int p_to_column) {
 	text.erase(p_from_column, p_to_column - p_from_column);
 	_shape();
 
-	cursor_pos -= CLAMP(cursor_pos - p_from_column, 0, p_to_column - p_from_column);
+	caret_column -= CLAMP(caret_column - p_from_column, 0, p_to_column - p_from_column);
 
-	if (cursor_pos >= text.length()) {
-		cursor_pos = text.length();
+	if (caret_column >= text.length()) {
+		caret_column = text.length();
 	}
 
 	if (!text_changed_dirty) {
@@ -1233,10 +1235,11 @@ void LineEdit::delete_text(int p_from_column, int p_to_column) {
 
 void LineEdit::set_text(String p_text) {
 	clear_internal();
-	append_at_cursor(p_text);
+	insert_text_at_caret(p_text);
+	_create_undo_state(); // clear_internal() cleared the undo stack; record the new text as an undo state (taken before the caret/scroll reset below).
 
 	update();
-	cursor_pos = 0;
+	caret_column = 0;
 	scroll_offset = 0;
 }
 
@@ -1346,7 +1349,7 @@ void LineEdit::show_virtual_keyboard() {
 		if (selection.enabled) {
 			DisplayServer::get_singleton()->virtual_keyboard_show(text, get_global_rect(), false, max_length, selection.begin, selection.end);
 		} else {
-			DisplayServer::get_singleton()->virtual_keyboard_show(text, get_global_rect(), false, max_length, cursor_pos);
+			DisplayServer::get_singleton()->virtual_keyboard_show(text, get_global_rect(), false, max_length, caret_column);
 		}
 	}
 }
@@ -1375,16 +1378,16 @@ float LineEdit::get_placeholder_alpha() const {
 	return placeholder_alpha;
 }
 
-void LineEdit::set_cursor_position(int p_pos) {
-	if (p_pos > (int)text.length()) {
-		p_pos = text.length();
+void LineEdit::set_caret_column(int p_column) {
+	if (p_column > (int)text.length()) {
+		p_column = text.length();
 	}
 
-	if (p_pos < 0) {
-		p_pos = 0;
+	if (p_column < 0) {
+		p_column = 0;
 	}
 
-	cursor_pos = p_pos;
+	caret_column = p_column;
 
 	// Fit to window.
 
@@ -1439,7 +1442,7 @@ void LineEdit::set_cursor_position(int p_pos) {
 	}
 
 	// Note: Use two coordinates to fit IME input range.
-	Vector2i primary_catret_offset = get_cursor_pixel_pos();
+	Vector2i primary_catret_offset = get_caret_pixel_pos();
 
 	if (MIN(primary_catret_offset.x, primary_catret_offset.y) <= x_ofs) {
 		scroll_offset += (x_ofs - MIN(primary_catret_offset.x, primary_catret_offset.y));
@@ -1451,8 +1454,8 @@ void LineEdit::set_cursor_position(int p_pos) {
 	update();
 }
 
-int LineEdit::get_cursor_position() const {
-	return cursor_pos;
+int LineEdit::get_caret_column() const {
+	return caret_column;
 }
 
 void LineEdit::set_scroll_offset(int p_pos) {
@@ -1466,17 +1469,17 @@ int LineEdit::get_scroll_offset() const {
 	return scroll_offset;
 }
 
-void LineEdit::append_at_cursor(String p_text) {
+void LineEdit::insert_text_at_caret(String p_text) {
 	if ((max_length <= 0) || (text.length() + p_text.length() <= max_length)) {
-		String pre = text.substr(0, cursor_pos);
-		String post = text.substr(cursor_pos, text.length() - cursor_pos);
+		String pre = text.substr(0, caret_column);
+		String post = text.substr(caret_column, text.length() - caret_column);
 		text = pre + p_text + post;
 		_shape();
-		TextServer::Direction dir = TS->shaped_text_get_dominant_direciton_in_range(text_rid, cursor_pos, cursor_pos + p_text.length());
+		TextServer::Direction dir = TS->shaped_text_get_dominant_direciton_in_range(text_rid, caret_column, caret_column + p_text.length());
 		if (dir != TextServer::DIRECTION_AUTO) {
 			input_direction = (TextDirection)dir;
 		}
-		set_cursor_position(cursor_pos + p_text.length());
+		set_caret_column(caret_column + p_text.length());
 	} else {
 		emit_signal("text_change_rejected");
 	}
@@ -1485,7 +1488,7 @@ void LineEdit::append_at_cursor(String p_text) {
 void LineEdit::clear_internal() {
 	deselect();
 	_clear_undo_stack();
-	cursor_pos = 0;
+	caret_column = 0;
 	scroll_offset = 0;
 	undo_text = "";
 	text = "";
@@ -1505,7 +1508,7 @@ Size2 LineEdit::get_minimum_size() const {
 	min_size.width = get_theme_constant("minimum_character_width") * em_space_size;
 
 	if (expand_to_text_length) {
-		// Add a space because some fonts are too exact, and because cursor needs a bit more when at the end.
+		// Add a space because some fonts are too exact, and because caret needs a bit more when at the end.
 		min_size.width = MAX(min_size.width, full_width + em_space_size);
 	}
 
@@ -1526,7 +1529,7 @@ Size2 LineEdit::get_minimum_size() const {
 void LineEdit::deselect() {
 	selection.begin = 0;
 	selection.end = 0;
-	selection.cursor_start = 0;
+	selection.start_column = 0;
 	selection.enabled = false;
 	selection.creating = false;
 	selection.doubleclick = false;
@@ -1551,13 +1554,13 @@ int LineEdit::get_max_length() const {
 	return max_length;
 }
 
-void LineEdit::selection_fill_at_cursor() {
+void LineEdit::selection_fill_at_caret() {
 	if (!selecting_enabled) {
 		return;
 	}
 
-	selection.begin = cursor_pos;
-	selection.end = selection.cursor_start;
+	selection.begin = caret_column;
+	selection.end = selection.start_column;
 
 	if (selection.end < selection.begin) {
 		int aux = selection.end;
@@ -1714,82 +1717,82 @@ void LineEdit::menu_option(int p_option) {
 		} break;
 		case MENU_INSERT_LRM: {
 			if (editable) {
-				append_at_cursor(String::chr(0x200E));
+				insert_text_at_caret(String::chr(0x200E));
 			}
 		} break;
 		case MENU_INSERT_RLM: {
 			if (editable) {
-				append_at_cursor(String::chr(0x200F));
+				insert_text_at_caret(String::chr(0x200F));
 			}
 		} break;
 		case MENU_INSERT_LRE: {
 			if (editable) {
-				append_at_cursor(String::chr(0x202A));
+				insert_text_at_caret(String::chr(0x202A));
 			}
 		} break;
 		case MENU_INSERT_RLE: {
 			if (editable) {
-				append_at_cursor(String::chr(0x202B));
+				insert_text_at_caret(String::chr(0x202B));
 			}
 		} break;
 		case MENU_INSERT_LRO: {
 			if (editable) {
-				append_at_cursor(String::chr(0x202D));
+				insert_text_at_caret(String::chr(0x202D));
 			}
 		} break;
 		case MENU_INSERT_RLO: {
 			if (editable) {
-				append_at_cursor(String::chr(0x202E));
+				insert_text_at_caret(String::chr(0x202E));
 			}
 		} break;
 		case MENU_INSERT_PDF: {
 			if (editable) {
-				append_at_cursor(String::chr(0x202C));
+				insert_text_at_caret(String::chr(0x202C));
 			}
 		} break;
 		case MENU_INSERT_ALM: {
 			if (editable) {
-				append_at_cursor(String::chr(0x061C));
+				insert_text_at_caret(String::chr(0x061C));
 			}
 		} break;
 		case MENU_INSERT_LRI: {
 			if (editable) {
-				append_at_cursor(String::chr(0x2066));
+				insert_text_at_caret(String::chr(0x2066));
 			}
 		} break;
 		case MENU_INSERT_RLI: {
 			if (editable) {
-				append_at_cursor(String::chr(0x2067));
+				insert_text_at_caret(String::chr(0x2067));
 			}
 		} break;
 		case MENU_INSERT_FSI: {
 			if (editable) {
-				append_at_cursor(String::chr(0x2068));
+				insert_text_at_caret(String::chr(0x2068));
 			}
 		} break;
 		case MENU_INSERT_PDI: {
 			if (editable) {
-				append_at_cursor(String::chr(0x2069));
+				insert_text_at_caret(String::chr(0x2069));
 			}
 		} break;
 		case MENU_INSERT_ZWJ: {
 			if (editable) {
-				append_at_cursor(String::chr(0x200D));
+				insert_text_at_caret(String::chr(0x200D));
 			}
 		} break;
 		case MENU_INSERT_ZWNJ: {
 			if (editable) {
-				append_at_cursor(String::chr(0x200C));
+				insert_text_at_caret(String::chr(0x200C));
 			}
 		} break;
 		case MENU_INSERT_WJ: {
 			if (editable) {
-				append_at_cursor(String::chr(0x2060));
+				insert_text_at_caret(String::chr(0x2060));
 			}
 		} break;
 		case MENU_INSERT_SHY: {
 			if (editable) {
-				append_at_cursor(String::chr(0x00AD));
+				insert_text_at_caret(String::chr(0x00AD));
 			}
 		}
 	}
@@ -1809,18 +1812,18 @@ PopupMenu *LineEdit::get_menu() const {
 
 void LineEdit::_editor_settings_changed() {
 #ifdef TOOLS_ENABLED
-	cursor_set_blink_enabled(EDITOR_DEF("text_editor/cursor/caret_blink", false));
-	cursor_set_blink_speed(EDITOR_DEF("text_editor/cursor/caret_blink_speed", 0.65));
+	set_caret_blink_enabled(EDITOR_DEF("text_editor/cursor/caret_blink", false));
+	set_caret_blink_speed(EDITOR_DEF("text_editor/cursor/caret_blink_speed", 0.65));
 #endif
 }
 
-void LineEdit::set_expand_to_text_length(bool p_enabled) {
+void LineEdit::set_expand_to_text_length_enabled(bool p_enabled) {
 	expand_to_text_length = p_enabled;
 	minimum_size_changed();
-	set_cursor_position(cursor_pos);
+	set_caret_column(caret_column);
 }
 
-bool LineEdit::get_expand_to_text_length() const {
+bool LineEdit::is_expand_to_text_length_enabled() const {
 	return expand_to_text_length;
 }
 
@@ -1905,7 +1908,7 @@ void LineEdit::_shape() {
 		t = secret_character.repeat(text.length() + ime_text.length());
 	} else {
 		if (ime_text.length() > 0) {
-			t = text.substr(0, cursor_pos) + ime_text + text.substr(cursor_pos, text.length());
+			t = text.substr(0, caret_column) + ime_text + text.substr(caret_column, text.length());
 		} else {
 			t = text;
 		}
@@ -1970,7 +1973,7 @@ void LineEdit::_clear_undo_stack() {
 void LineEdit::_create_undo_state() {
 	TextOperation op;
 	op.text = text;
-	op.cursor_pos = cursor_pos;
+	op.caret_column = caret_column;
 	op.scroll_offset = scroll_offset;
 	undo_stack.push_back(op);
 }
@@ -2115,23 +2118,23 @@ void LineEdit::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_placeholder"), &LineEdit::get_placeholder);
 	ClassDB::bind_method(D_METHOD("set_placeholder_alpha", "alpha"), &LineEdit::set_placeholder_alpha);
 	ClassDB::bind_method(D_METHOD("get_placeholder_alpha"), &LineEdit::get_placeholder_alpha);
-	ClassDB::bind_method(D_METHOD("set_cursor_position", "position"), &LineEdit::set_cursor_position);
-	ClassDB::bind_method(D_METHOD("get_cursor_position"), &LineEdit::get_cursor_position);
+	ClassDB::bind_method(D_METHOD("set_caret_column", "position"), &LineEdit::set_caret_column);
+	ClassDB::bind_method(D_METHOD("get_caret_column"), &LineEdit::get_caret_column);
 	ClassDB::bind_method(D_METHOD("get_scroll_offset"), &LineEdit::get_scroll_offset);
-	ClassDB::bind_method(D_METHOD("set_expand_to_text_length", "enabled"), &LineEdit::set_expand_to_text_length);
-	ClassDB::bind_method(D_METHOD("get_expand_to_text_length"), &LineEdit::get_expand_to_text_length);
-	ClassDB::bind_method(D_METHOD("cursor_set_blink_enabled", "enabled"), &LineEdit::cursor_set_blink_enabled);
-	ClassDB::bind_method(D_METHOD("cursor_get_blink_enabled"), &LineEdit::cursor_get_blink_enabled);
-	ClassDB::bind_method(D_METHOD("set_mid_grapheme_caret_enabled", "enabled"), &LineEdit::set_mid_grapheme_caret_enabled);
-	ClassDB::bind_method(D_METHOD("get_mid_grapheme_caret_enabled"), &LineEdit::get_mid_grapheme_caret_enabled);
-	ClassDB::bind_method(D_METHOD("cursor_set_force_displayed", "enabled"), &LineEdit::cursor_set_force_displayed);
-	ClassDB::bind_method(D_METHOD("cursor_get_force_displayed"), &LineEdit::cursor_get_force_displayed);
-	ClassDB::bind_method(D_METHOD("cursor_set_blink_speed", "blink_speed"), &LineEdit::cursor_set_blink_speed);
-	ClassDB::bind_method(D_METHOD("cursor_get_blink_speed"), &LineEdit::cursor_get_blink_speed);
+	ClassDB::bind_method(D_METHOD("set_expand_to_text_length_enabled", "enabled"), &LineEdit::set_expand_to_text_length_enabled);
+	ClassDB::bind_method(D_METHOD("is_expand_to_text_length_enabled"), &LineEdit::is_expand_to_text_length_enabled);
+	ClassDB::bind_method(D_METHOD("set_caret_blink_enabled", "enabled"), &LineEdit::set_caret_blink_enabled);
+	ClassDB::bind_method(D_METHOD("is_caret_blink_enabled"), &LineEdit::is_caret_blink_enabled);
+	ClassDB::bind_method(D_METHOD("set_caret_mid_grapheme_enabled", "enabled"), &LineEdit::set_caret_mid_grapheme_enabled);
+	ClassDB::bind_method(D_METHOD("is_caret_mid_grapheme_enabled"), &LineEdit::is_caret_mid_grapheme_enabled);
+	ClassDB::bind_method(D_METHOD("set_caret_force_displayed", "enabled"), &LineEdit::set_caret_force_displayed);
+	ClassDB::bind_method(D_METHOD("is_caret_force_displayed"), &LineEdit::is_caret_force_displayed);
+	ClassDB::bind_method(D_METHOD("set_caret_blink_speed", "blink_speed"), &LineEdit::set_caret_blink_speed);
+	ClassDB::bind_method(D_METHOD("get_caret_blink_speed"), &LineEdit::get_caret_blink_speed);
 	ClassDB::bind_method(D_METHOD("set_max_length", "chars"), &LineEdit::set_max_length);
 	ClassDB::bind_method(D_METHOD("get_max_length"), &LineEdit::get_max_length);
-	ClassDB::bind_method(D_METHOD("append_at_cursor", "text"), &LineEdit::append_at_cursor);
-	ClassDB::bind_method(D_METHOD("delete_char_at_cursor"), &LineEdit::delete_char);
+	ClassDB::bind_method(D_METHOD("insert_text_at_caret", "text"), &LineEdit::insert_text_at_caret);
+	ClassDB::bind_method(D_METHOD("delete_char_at_caret"), &LineEdit::delete_char);
 	ClassDB::bind_method(D_METHOD("delete_text", "from_column", "to_column"), &LineEdit::delete_text);
 	ClassDB::bind_method(D_METHOD("set_editable", "enabled"), &LineEdit::set_editable);
 	ClassDB::bind_method(D_METHOD("is_editable"), &LineEdit::is_editable);
@@ -2199,7 +2202,7 @@ void LineEdit::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "editable"), "set_editable", "is_editable");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "secret"), "set_secret", "is_secret");
 	ADD_PROPERTY(PropertyInfo(Variant::STRING, "secret_character"), "set_secret_character", "get_secret_character");
-	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "expand_to_text_length"), "set_expand_to_text_length", "get_expand_to_text_length");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "expand_to_text_length"), "set_expand_to_text_length_enabled", "is_expand_to_text_length_enabled");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "context_menu_enabled"), "set_context_menu_enabled", "is_context_menu_enabled");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "virtual_keyboard_enabled"), "set_virtual_keyboard_enabled", "is_virtual_keyboard_enabled");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "clear_button_enabled"), "set_clear_button_enabled", "is_clear_button_enabled");
@@ -2216,11 +2219,11 @@ void LineEdit::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::STRING, "placeholder_text"), "set_placeholder", "get_placeholder");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "placeholder_alpha", PROPERTY_HINT_RANGE, "0,1,0.001"), "set_placeholder_alpha", "get_placeholder_alpha");
 	ADD_GROUP("Caret", "caret_");
-	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "caret_blink"), "cursor_set_blink_enabled", "cursor_get_blink_enabled");
-	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "caret_blink_speed", PROPERTY_HINT_RANGE, "0.1,10,0.01"), "cursor_set_blink_speed", "cursor_get_blink_speed");
-	ADD_PROPERTY(PropertyInfo(Variant::INT, "caret_position"), "set_cursor_position", "get_cursor_position");
-	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "caret_force_displayed"), "cursor_set_force_displayed", "cursor_get_force_displayed");
-	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "caret_mid_grapheme"), "set_mid_grapheme_caret_enabled", "get_mid_grapheme_caret_enabled");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "caret_blink"), "set_caret_blink_enabled", "is_caret_blink_enabled");
+	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "caret_blink_speed", PROPERTY_HINT_RANGE, "0.1,10,0.01"), "set_caret_blink_speed", "get_caret_blink_speed");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "caret_column"), "set_caret_column", "get_caret_column");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "caret_force_displayed"), "set_caret_force_displayed", "is_caret_force_displayed");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "caret_mid_grapheme"), "set_caret_mid_grapheme_enabled", "is_caret_mid_grapheme_enabled");
 }
 
 LineEdit::LineEdit() {
@@ -2236,7 +2239,7 @@ LineEdit::LineEdit() {
 	add_child(caret_blink_timer);
 	caret_blink_timer->set_wait_time(0.65);
 	caret_blink_timer->connect("timeout", callable_mp(this, &LineEdit::_toggle_draw_caret));
-	cursor_set_blink_enabled(false);
+	set_caret_blink_enabled(false);
 
 	menu = memnew(PopupMenu);
 	add_child(menu);
diff --git a/scene/gui/line_edit.h b/scene/gui/line_edit.h
index ef36377f2e..f4f0ff0629 100644
--- a/scene/gui/line_edit.h
+++ b/scene/gui/line_edit.h
@@ -103,9 +103,9 @@ private:
 	PopupMenu *menu_dir = nullptr;
 	PopupMenu *menu_ctl = nullptr;
 
-	bool mid_grapheme_caret_enabled = false;
+	bool caret_mid_grapheme_enabled = false;
 
-	int cursor_pos = 0;
+	int caret_column = 0;
 	int scroll_offset = 0;
 	int max_length = 0; // 0 for no maximum.
 
@@ -131,7 +131,7 @@ private:
 	struct Selection {
 		int begin = 0;
 		int end = 0;
-		int cursor_start = 0;
+		int start_column = 0;
 		bool enabled = false;
 		bool creating = false;
 		bool doubleclick = false;
@@ -140,7 +140,7 @@ private:
 	} selection;
 
 	struct TextOperation {
-		int cursor_pos = 0;
+		int caret_column = 0;
 		int scroll_offset = 0;
 		int cached_width = 0;
 		String text;
@@ -175,12 +175,12 @@ private:
 	void shift_selection_check_pre(bool);
 	void shift_selection_check_post(bool);
 
-	void selection_fill_at_cursor();
+	void selection_fill_at_caret();
 	void set_scroll_offset(int p_pos);
 	int get_scroll_offset() const;
 
-	void set_cursor_at_pixel_pos(int p_x);
-	Vector2i get_cursor_pixel_pos();
+	void set_caret_at_pixel_pos(int p_x);
+	Vector2i get_caret_pixel_pos();
 
 	void _reset_caret_blink_timer();
 	void _toggle_draw_caret();
@@ -191,10 +191,10 @@ private:
 	void _editor_settings_changed();
 
 	void _swap_current_input_direction();
-	void _move_cursor_left(bool p_select, bool p_move_by_word = false);
-	void _move_cursor_right(bool p_select, bool p_move_by_word = false);
-	void _move_cursor_start(bool p_select);
-	void _move_cursor_end(bool p_select);
+	void _move_caret_left(bool p_select, bool p_move_by_word = false);
+	void _move_caret_right(bool p_select, bool p_move_by_word = false);
+	void _move_caret_start(bool p_select);
+	void _move_caret_end(bool p_select);
 	void _backspace(bool p_word = false, bool p_all_to_left = false);
 	void _delete(bool p_word = false, bool p_all_to_right = false);
 
@@ -259,26 +259,26 @@ public:
 	void set_placeholder_alpha(float p_alpha);
 	float get_placeholder_alpha() const;
 
-	void set_cursor_position(int p_pos);
-	int get_cursor_position() const;
+	void set_caret_column(int p_column);
+	int get_caret_column() const;
 
 	void set_max_length(int p_max_length);
 	int get_max_length() const;
 
-	void append_at_cursor(String p_text);
+	void insert_text_at_caret(String p_text);
 	void clear();
 
-	void set_mid_grapheme_caret_enabled(const bool p_enabled);
-	bool get_mid_grapheme_caret_enabled() const;
+	void set_caret_mid_grapheme_enabled(const bool p_enabled);
+	bool is_caret_mid_grapheme_enabled() const;
 
-	bool cursor_get_blink_enabled() const;
-	void cursor_set_blink_enabled(const bool p_enabled);
+	bool is_caret_blink_enabled() const;
+	void set_caret_blink_enabled(const bool p_enabled);
 
-	float cursor_get_blink_speed() const;
-	void cursor_set_blink_speed(const float p_speed);
+	float get_caret_blink_speed() const;
+	void set_caret_blink_speed(const float p_speed);
 
-	bool cursor_get_force_displayed() const;
-	void cursor_set_force_displayed(const bool p_enabled);
+	void set_caret_force_displayed(const bool p_enabled);
+	bool is_caret_force_displayed() const;
 
 	void copy_text();
 	void cut_text();
@@ -297,8 +297,8 @@ public:
 
 	virtual Size2 get_minimum_size() const override;
 
-	void set_expand_to_text_length(bool p_enabled);
-	bool get_expand_to_text_length() const;
+	void set_expand_to_text_length_enabled(bool p_enabled);
+	bool is_expand_to_text_length_enabled() const;
 
 	void set_clear_button_enabled(bool p_enabled);
 	bool is_clear_button_enabled() const;
diff --git a/scene/gui/range.cpp b/scene/gui/range.cpp
index 86b775e795..adc1ed67ca 100644
--- a/scene/gui/range.cpp
+++ b/scene/gui/range.cpp
@@ -30,17 +30,14 @@
 
 #include "range.h"
 
-String Range::get_configuration_warning() const {
-	String warning = Control::get_configuration_warning();
+TypedArray<String> Range::get_configuration_warnings() const {
+	TypedArray<String> warnings = Control::get_configuration_warnings(); // Start from the warnings reported through Control, so none from intermediate classes are skipped.
 
 	if (shared->exp_ratio && shared->min <= 0) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("If \"Exp Edit\" is enabled, \"Min Value\" must be greater than 0.");
+		warnings.push_back(TTR("If \"Exp Edit\" is enabled, \"Min Value\" must be greater than 0."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void Range::_value_changed_notify() {
@@ -106,7 +103,7 @@ void Range::set_min(double p_min) {
 
 	shared->emit_changed("min");
 
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 void Range::set_max(double p_max) {
@@ -181,7 +178,6 @@ double Range::get_as_ratio() const {
 		double v = Math::log(value) / Math::log((double)2);
 
 		return CLAMP((v - exp_min) / (exp_max - exp_min), 0, 1);
-
 	} else {
 		float value = CLAMP(get_value(), shared->min, shared->max);
 		return CLAMP((value - get_min()) / (get_max() - get_min()), 0, 1);
@@ -287,7 +283,7 @@ bool Range::is_using_rounded_values() const {
 void Range::set_exp_ratio(bool p_enable) {
 	shared->exp_ratio = p_enable;
 
-	update_configuration_warning();
+	update_configuration_warnings();
 }
 
 bool Range::is_ratio_exp() const {
diff --git a/scene/gui/range.h b/scene/gui/range.h
index 1072a109c6..7a129e88d6 100644
--- a/scene/gui/range.h
+++ b/scene/gui/range.h
@@ -97,7 +97,7 @@ public:
 	void share(Range *p_range);
 	void unshare();
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	Range();
 	~Range();
diff --git a/scene/gui/scroll_container.cpp b/scene/gui/scroll_container.cpp
index 757a0841ea..73c6371658 100644
--- a/scene/gui/scroll_container.cpp
+++ b/scene/gui/scroll_container.cpp
@@ -544,8 +544,8 @@ void ScrollContainer::set_follow_focus(bool p_follow) {
 	follow_focus = p_follow;
 }
 
-String ScrollContainer::get_configuration_warning() const {
-	String warning = Container::get_configuration_warning();
+TypedArray<String> ScrollContainer::get_configuration_warnings() const {
+	TypedArray<String> warnings = Container::get_configuration_warnings();
 
 	int found = 0;
 
@@ -565,12 +565,10 @@ String ScrollContainer::get_configuration_warning() const {
 	}
 
 	if (found != 1) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("ScrollContainer is intended to work with a single child control.\nUse a container as child (VBox, HBox, etc.), or a Control and set the custom minimum size manually.");
+		warnings.push_back(TTR("ScrollContainer is intended to work with a single child control.\nUse a container as child (VBox, HBox, etc.), or a Control and set the custom minimum size manually."));
 	}
-	return warning;
+
+	return warnings;
 }
 
 HScrollBar *ScrollContainer::get_h_scrollbar() {
diff --git a/scene/gui/scroll_container.h b/scene/gui/scroll_container.h
index 9d3ce39345..e7d73bab0a 100644
--- a/scene/gui/scroll_container.h
+++ b/scene/gui/scroll_container.h
@@ -103,7 +103,7 @@ public:
 
 	virtual bool clips_input() const override;
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	ScrollContainer();
 };
diff --git a/scene/gui/text_edit.cpp b/scene/gui/text_edit.cpp
index f54ab004c6..0713df9955 100644
--- a/scene/gui/text_edit.cpp
+++ b/scene/gui/text_edit.cpp
@@ -1973,7 +1973,7 @@ void TextEdit::backspace_at_cursor() {
 		}
 	}
 
-	cursor_set_line(prev_line, true, true);
+	cursor_set_line(prev_line, false, true);
 	cursor_set_column(prev_column);
 }
 
@@ -2054,6 +2054,7 @@ void TextEdit::indent_selected_lines_left() {
 	if (is_selection_active() && get_selection_to_column() == 0) {
 		end_line--;
 	}
+	String first_line_text = get_line(start_line);
 	String last_line_text = get_line(end_line);
 
 	for (int i = start_line; i <= end_line; i++) {
@@ -2078,10 +2079,17 @@ void TextEdit::indent_selected_lines_left() {
 		}
 	}
 
-	// Fix selection and cursor being off by one on the last line.
-	if (is_selection_active() && last_line_text != get_line(end_line)) {
-		select(selection.from_line, selection.from_column - removed_characters,
-				selection.to_line, initial_selection_end_column - removed_characters);
+	if (is_selection_active()) {
+		// Fix selection being off by one on the first line.
+		if (first_line_text != get_line(start_line)) {
+			select(selection.from_line, selection.from_column - removed_characters,
+					selection.to_line, initial_selection_end_column);
+		}
+		// Fix selection being off by one on the last line.
+		if (last_line_text != get_line(end_line)) {
+			select(selection.from_line, selection.from_column,
+					selection.to_line, initial_selection_end_column - removed_characters);
+		}
 	}
 	cursor_set_column(initial_cursor_column - removed_characters, false);
 	end_complex_operation();
@@ -2207,7 +2215,7 @@ void TextEdit::_new_line(bool p_split_current_line, bool p_above) {
 	if (!p_split_current_line) {
 		if (p_above) {
 			if (cursor.line > 0) {
-				cursor_set_line(cursor.line - 1);
+				cursor_set_line(cursor.line - 1, false);
 				cursor_set_column(text[cursor.line].length());
 			} else {
 				cursor_set_column(0);
@@ -2223,7 +2231,7 @@ void TextEdit::_new_line(bool p_split_current_line, bool p_above) {
 	if (first_line) {
 		cursor_set_line(0);
 	} else if (brace_indent) {
-		cursor_set_line(cursor.line - 1);
+		cursor_set_line(cursor.line - 1, false);
 		cursor_set_column(text[cursor.line].length());
 	}
 	end_complex_operation();
@@ -2573,7 +2581,7 @@ void TextEdit::_backspace(bool p_word, bool p_all_to_left) {
 
 		_remove_text(line, column, cursor.line, cursor.column);
 
-		cursor_set_line(line);
+		cursor_set_line(line, false);
 		cursor_set_column(column);
 	} else {
 		// One character.
@@ -2640,7 +2648,7 @@ void TextEdit::_delete_selection() {
 		selection.active = false;
 		update();
 		_remove_text(selection.from_line, selection.from_column, selection.to_line, selection.to_column);
-		cursor_set_line(selection.from_line, true, false);
+		cursor_set_line(selection.from_line, false, false);
 		cursor_set_column(selection.from_column);
 		update();
 	}
@@ -3261,7 +3269,7 @@ void TextEdit::_gui_input(const Ref<InputEvent> &p_gui_input) {
 				accept_event();
 				return;
 			}
-			if (k->is_action("ui_accept", true) || k->is_action("ui_text_completion_accept", true)) {
+			if (k->is_action("ui_text_completion_accept", true)) {
 				_confirm_completion();
 				accept_event();
 				return;
@@ -3851,7 +3859,7 @@ void TextEdit::_insert_text_at_cursor(const String &p_text) {
 	int new_column, new_line;
 	_insert_text(cursor.line, cursor.column, p_text, &new_line, &new_column);
 	_update_scrollbars();
-	cursor_set_line(new_line);
+	cursor_set_line(new_line, false);
 	cursor_set_column(new_column);
 
 	update();
@@ -4425,7 +4433,7 @@ int TextEdit::get_column_x_offset_for_line(int p_char, int p_line) const {
 
 void TextEdit::insert_text_at_cursor(const String &p_text) {
 	if (selection.active) {
-		cursor_set_line(selection.from_line);
+		cursor_set_line(selection.from_line, false);
 		cursor_set_column(selection.from_column);
 
 		_remove_text(selection.from_line, selection.from_column, selection.to_line, selection.to_column);
@@ -5042,7 +5050,7 @@ void TextEdit::cut() {
 		DisplayServer::get_singleton()->clipboard_set(clipboard);
 
 		_remove_text(selection.from_line, selection.from_column, selection.to_line, selection.to_column);
-		cursor_set_line(selection.from_line); // Set afterwards else it causes the view to be offset.
+		cursor_set_line(selection.from_line, false); // Set afterwards else it causes the view to be offset.
 		cursor_set_column(selection.from_column);
 
 		selection.active = false;
@@ -5078,7 +5086,7 @@ void TextEdit::paste() {
 		selection.active = false;
 		selection.selecting_mode = SelectionMode::SELECTION_MODE_NONE;
 		_remove_text(selection.from_line, selection.from_column, selection.to_line, selection.to_column);
-		cursor_set_line(selection.from_line);
+		cursor_set_line(selection.from_line, false);
 		cursor_set_column(selection.from_column);
 
 	} else if (!cut_copy_line.is_empty() && cut_copy_line == clipboard) {
@@ -5817,11 +5825,11 @@ void TextEdit::undo() {
 
 	_update_scrollbars();
 	if (undo_stack_pos->get().type == TextOperation::TYPE_REMOVE) {
-		cursor_set_line(undo_stack_pos->get().to_line);
+		cursor_set_line(undo_stack_pos->get().to_line, false);
 		cursor_set_column(undo_stack_pos->get().to_column);
 		_cancel_code_hint();
 	} else {
-		cursor_set_line(undo_stack_pos->get().from_line);
+		cursor_set_line(undo_stack_pos->get().from_line, false);
 		cursor_set_column(undo_stack_pos->get().from_column);
 	}
 	update();
@@ -5856,7 +5864,7 @@ void TextEdit::redo() {
 	}
 
 	_update_scrollbars();
-	cursor_set_line(undo_stack_pos->get().to_line);
+	cursor_set_line(undo_stack_pos->get().to_line, false);
 	cursor_set_column(undo_stack_pos->get().to_column);
 	undo_stack_pos = undo_stack_pos->next();
 	update();
diff --git a/scene/main/http_request.cpp b/scene/main/http_request.cpp
index 64df37654b..884696d58d 100644
--- a/scene/main/http_request.cpp
+++ b/scene/main/http_request.cpp
@@ -40,9 +40,7 @@ Error HTTPRequest::_request() {
 }
 
 Error HTTPRequest::_parse_url(const String &p_url) {
-	url = p_url;
 	use_ssl = false;
-
 	request_string = "";
 	port = 80;
 	request_sent = false;
@@ -52,35 +50,20 @@ Error HTTPRequest::_parse_url(const String &p_url) {
 	downloaded.set(0);
 	redirections = 0;
 
-	String url_lower = url.to_lower();
-	if (url_lower.begins_with("http://")) {
-		url = url.substr(7, url.length() - 7);
-	} else if (url_lower.begins_with("https://")) {
-		url = url.substr(8, url.length() - 8);
+	String scheme;
+	Error err = p_url.parse_url(scheme, url, port, request_string);
+	ERR_FAIL_COND_V_MSG(err != OK, err, "Error parsing URL: " + p_url + ".");
+	if (scheme == "https://") {
 		use_ssl = true;
-		port = 443;
-	} else {
-		ERR_FAIL_V_MSG(ERR_INVALID_PARAMETER, "Malformed URL: " + url + ".");
+	} else if (scheme != "http://") {
+		ERR_FAIL_V_MSG(ERR_INVALID_PARAMETER, "Invalid URL scheme: " + scheme + ".");
 	}
-
-	ERR_FAIL_COND_V_MSG(url.length() < 1, ERR_INVALID_PARAMETER, "URL too short: " + url + ".");
-
-	int slash_pos = url.find("/");
-
-	if (slash_pos != -1) {
-		request_string = url.substr(slash_pos, url.length());
-		url = url.substr(0, slash_pos);
-	} else {
-		request_string = "/";
+	if (port == 0) {
+		port = use_ssl ? 443 : 80;
 	}
-
-	int colon_pos = url.find(":");
-	if (colon_pos != -1) {
-		port = url.substr(colon_pos + 1, url.length()).to_int();
-		url = url.substr(0, colon_pos);
-		ERR_FAIL_COND_V(port < 1 || port > 65535, ERR_INVALID_PARAMETER);
+	if (request_string.is_empty()) {
+		request_string = "/";
 	}
-
 	return OK;
 }
 
@@ -123,7 +106,7 @@ Error HTTPRequest::request(const String &p_url, const Vector<String> &p_custom_h
 	size_t len = charstr.length();
 	raw_data.resize(len);
 	uint8_t *w = raw_data.ptrw();
-	copymem(w, charstr.ptr(), len);
+	memcpy(w, charstr.ptr(), len);
 
 	return request_raw(p_url, p_custom_headers, p_ssl_validate_domain, p_method, raw_data);
 }
diff --git a/scene/main/node.cpp b/scene/main/node.cpp
index 27712242d1..c90d3e4a32 100644
--- a/scene/main/node.cpp
+++ b/scene/main/node.cpp
@@ -591,7 +591,7 @@ Variant Node::_rpc_bind(const Variant **p_args, int p_argcount, Callable::CallEr
 	if (p_args[0]->get_type() != Variant::STRING_NAME) {
 		r_error.error = Callable::CallError::CALL_ERROR_INVALID_ARGUMENT;
 		r_error.argument = 0;
-		r_error.expected = Variant::STRING;
+		r_error.expected = Variant::STRING_NAME;
 		return Variant();
 	}
 
@@ -620,7 +620,7 @@ Variant Node::_rpc_id_bind(const Variant **p_args, int p_argcount, Callable::Cal
 	if (p_args[1]->get_type() != Variant::STRING_NAME) {
 		r_error.error = Callable::CallError::CALL_ERROR_INVALID_ARGUMENT;
 		r_error.argument = 1;
-		r_error.expected = Variant::STRING;
+		r_error.expected = Variant::STRING_NAME;
 		return Variant();
 	}
 
@@ -643,7 +643,7 @@ Variant Node::_rpc_unreliable_bind(const Variant **p_args, int p_argcount, Calla
 	if (p_args[0]->get_type() != Variant::STRING_NAME) {
 		r_error.error = Callable::CallError::CALL_ERROR_INVALID_ARGUMENT;
 		r_error.argument = 0;
-		r_error.expected = Variant::STRING;
+		r_error.expected = Variant::STRING_NAME;
 		return Variant();
 	}
 
@@ -672,7 +672,7 @@ Variant Node::_rpc_unreliable_id_bind(const Variant **p_args, int p_argcount, Ca
 	if (p_args[1]->get_type() != Variant::STRING_NAME) {
 		r_error.error = Callable::CallError::CALL_ERROR_INVALID_ARGUMENT;
 		r_error.argument = 1;
-		r_error.expected = Variant::STRING;
+		r_error.expected = Variant::STRING_NAME;
 		return Variant();
 	}
 
@@ -2634,15 +2634,27 @@ void Node::clear_internal_tree_resource_paths() {
 	}
 }
 
-String Node::get_configuration_warning() const {
+TypedArray<String> Node::get_configuration_warnings() const {
 	if (get_script_instance() && get_script_instance()->get_script().is_valid() &&
-			get_script_instance()->get_script()->is_tool() && get_script_instance()->has_method("_get_configuration_warning")) {
-		return get_script_instance()->call("_get_configuration_warning");
+			get_script_instance()->get_script()->is_tool() && get_script_instance()->has_method("_get_configuration_warnings")) {
+		return get_script_instance()->call("_get_configuration_warnings");
 	}
-	return String();
+	return Array();
 }
 
-void Node::update_configuration_warning() {
+String Node::get_configuration_warnings_as_string() const {
+	// Flatten the warning list into one string, with a blank line between entries.
+	TypedArray<String> warning_list = get_configuration_warnings();
+	String joined;
+	String separator; // Empty for the first entry, "\n\n" for every later one.
+	for (int idx = 0; idx < warning_list.size(); idx++) {
+		joined += separator + String(warning_list[idx]);
+		separator = "\n\n";
+	}
+	return joined;
+}
+
+void Node::update_configuration_warnings() {
 #ifdef TOOLS_ENABLED
 	if (!is_inside_tree()) {
 		return;
@@ -2798,7 +2810,7 @@ void Node::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("rset_unreliable", "property", "value"), &Node::rset_unreliable);
 	ClassDB::bind_method(D_METHOD("rset_unreliable_id", "peer_id", "property", "value"), &Node::rset_unreliable_id);
 
-	ClassDB::bind_method(D_METHOD("update_configuration_warning"), &Node::update_configuration_warning);
+	ClassDB::bind_method(D_METHOD("update_configuration_warnings"), &Node::update_configuration_warnings);
 
 	BIND_CONSTANT(NOTIFICATION_ENTER_TREE);
 	BIND_CONSTANT(NOTIFICATION_EXIT_TREE);
@@ -2874,7 +2886,7 @@ void Node::_bind_methods() {
 	BIND_VMETHOD(MethodInfo("_input", PropertyInfo(Variant::OBJECT, "event", PROPERTY_HINT_RESOURCE_TYPE, "InputEvent")));
 	BIND_VMETHOD(MethodInfo("_unhandled_input", PropertyInfo(Variant::OBJECT, "event", PROPERTY_HINT_RESOURCE_TYPE, "InputEvent")));
 	BIND_VMETHOD(MethodInfo("_unhandled_key_input", PropertyInfo(Variant::OBJECT, "event", PROPERTY_HINT_RESOURCE_TYPE, "InputEventKey")));
-	BIND_VMETHOD(MethodInfo(Variant::STRING, "_get_configuration_warning"));
+	BIND_VMETHOD(MethodInfo(PropertyInfo(Variant::ARRAY, "", PROPERTY_HINT_ARRAY_TYPE, "String"), "_get_configuration_warnings"));
 }
 
 String Node::_get_name_num_separator() {
diff --git a/scene/main/node.h b/scene/main/node.h
index b1e51d2aee..6ca2317d9e 100644
--- a/scene/main/node.h
+++ b/scene/main/node.h
@@ -412,9 +412,10 @@ public:
 
 	_FORCE_INLINE_ Viewport *get_viewport() const { return data.viewport; }
 
-	virtual String get_configuration_warning() const;
+	virtual TypedArray<String> get_configuration_warnings() const;
+	String get_configuration_warnings_as_string() const;
 
-	void update_configuration_warning();
+	void update_configuration_warnings();
 
 	void set_display_folded(bool p_folded);
 	bool is_displayed_folded() const;
diff --git a/scene/main/scene_tree.cpp b/scene/main/scene_tree.cpp
index 66f3a2ebde..387af3703b 100644
--- a/scene/main/scene_tree.cpp
+++ b/scene/main/scene_tree.cpp
@@ -1349,6 +1349,8 @@ SceneTree::SceneTree() {
 
 	GLOBAL_DEF("debug/shapes/collision/draw_2d_outlines", true);
 
+	Math::randomize();
+
 	// Create with mainloop.
 
 	root = memnew(Window);
@@ -1376,6 +1378,9 @@ SceneTree::SceneTree() {
 	const bool use_debanding = GLOBAL_DEF("rendering/anti_aliasing/quality/use_debanding", false);
 	root->set_use_debanding(use_debanding);
 
+	const bool use_occlusion_culling = GLOBAL_DEF("rendering/occlusion_culling/use_occlusion_culling", false);
+	root->set_use_occlusion_culling(use_occlusion_culling);
+
 	float lod_threshold = GLOBAL_DEF("rendering/mesh_lod/lod_change/threshold_pixels", 1.0);
 	ProjectSettings::get_singleton()->set_custom_property_info("rendering/mesh_lod/lod_change/threshold_pixels", PropertyInfo(Variant::FLOAT, "rendering/mesh_lod/lod_change/threshold_pixels", PROPERTY_HINT_RANGE, "0,1024,0.1"));
 	root->set_lod_threshold(lod_threshold);
diff --git a/scene/main/shader_globals_override.cpp b/scene/main/shader_globals_override.cpp
index b6b2982155..cb3b2cb392 100644
--- a/scene/main/shader_globals_override.cpp
+++ b/scene/main/shader_globals_override.cpp
@@ -232,7 +232,7 @@ void ShaderGlobalsOverride::_activate() {
 			}
 		}
 
-		update_configuration_warning(); //may have activated
+		update_configuration_warnings(); //may have activated
 	}
 }
 
@@ -260,17 +260,14 @@ void ShaderGlobalsOverride::_notification(int p_what) {
 	}
 }
 
-String ShaderGlobalsOverride::get_configuration_warning() const {
-	String warning = Node::get_configuration_warning();
+TypedArray<String> ShaderGlobalsOverride::get_configuration_warnings() const {
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (!active) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("ShaderGlobalsOverride is not active because another node of the same type is in the scene.");
+		warnings.push_back(TTR("ShaderGlobalsOverride is not active because another node of the same type is in the scene."));
 	}
 
-	return warning;
+	return warnings;
 }
 
 void ShaderGlobalsOverride::_bind_methods() {
diff --git a/scene/main/shader_globals_override.h b/scene/main/shader_globals_override.h
index 8d8794d465..2d9c3c76bd 100644
--- a/scene/main/shader_globals_override.h
+++ b/scene/main/shader_globals_override.h
@@ -58,7 +58,7 @@ protected:
 	static void _bind_methods();
 
 public:
-	String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	ShaderGlobalsOverride();
 };
diff --git a/scene/main/viewport.cpp b/scene/main/viewport.cpp
index a3effef99a..90f556cf1b 100644
--- a/scene/main/viewport.cpp
+++ b/scene/main/viewport.cpp
@@ -1412,6 +1412,16 @@ void Viewport::_update_canvas_items(Node *p_node) {
 	}
 }
 
+void Viewport::set_use_xr(bool p_use_xr) {
+	// Store the flag, then hand the same value to the rendering server viewport.
+	use_xr = p_use_xr;
+	RS::get_singleton()->viewport_set_use_xr(viewport, p_use_xr);
+}
+
+bool Viewport::is_using_xr() {
+	return use_xr;
+}
+
 Ref<ViewportTexture> Viewport::get_texture() const {
 	return default_texture;
 }
@@ -3175,20 +3185,17 @@ Variant Viewport::gui_get_drag_data() const {
 	return gui.drag_data;
 }
 
-String Viewport::get_configuration_warning() const {
+TypedArray<String> Viewport::get_configuration_warnings() const {
 	/*if (get_parent() && !Object::cast_to<Control>(get_parent()) && !render_target) {
 		return TTR("This viewport is not set as render target. If you intend for it to display its contents directly to the screen, make it a child of a Control so it can obtain a size. Otherwise, make it a RenderTarget and assign its internal texture to some node for display.");
 	}*/
 
-	String warning = Node::get_configuration_warning();
+	TypedArray<String> warnings = Node::get_configuration_warnings();
 
 	if (size.x == 0 || size.y == 0) {
-		if (!warning.is_empty()) {
-			warning += "\n\n";
-		}
-		warning += TTR("Viewport size must be greater than 0 to render anything.");
+		warnings.push_back(TTR("Viewport size must be greater than 0 to render anything."));
 	}
-	return warning;
+	return warnings;
 }
 
 void Viewport::gui_reset_canvas_sort_index() {
@@ -3245,6 +3252,21 @@ float Viewport::get_lod_threshold() const {
 	return lod_threshold;
 }
 
+void Viewport::set_use_occlusion_culling(bool p_use_occlusion_culling) {
+	// Nothing to do unless the requested state differs from the stored one.
+	if (use_occlusion_culling != p_use_occlusion_culling) {
+		use_occlusion_culling = p_use_occlusion_culling;
+
+		// Mirror the new state onto the rendering server viewport.
+		RS::get_singleton()->viewport_set_use_occlusion_culling(viewport, use_occlusion_culling);
+		notify_property_list_changed();
+	}
+}
+
+bool Viewport::is_using_occlusion_culling() const {
+	return use_occlusion_culling;
+}
+
 void Viewport::set_debug_draw(DebugDraw p_debug_draw) {
 	debug_draw = p_debug_draw;
 	RS::get_singleton()->viewport_set_debug_draw(viewport, RS::ViewportDebugDraw(p_debug_draw));
@@ -3334,9 +3356,6 @@ bool Viewport::is_handling_input_locally() const {
 	return handle_input_locally;
 }
 
-void Viewport::_validate_property(PropertyInfo &property) const {
-}
-
 void Viewport::set_default_canvas_item_texture_filter(DefaultCanvasItemTextureFilter p_filter) {
 	ERR_FAIL_INDEX(p_filter, DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_MAX);
 
@@ -3481,11 +3500,17 @@ void Viewport::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_use_debanding", "enable"), &Viewport::set_use_debanding);
 	ClassDB::bind_method(D_METHOD("is_using_debanding"), &Viewport::is_using_debanding);
 
+	ClassDB::bind_method(D_METHOD("set_use_occlusion_culling", "enable"), &Viewport::set_use_occlusion_culling);
+	ClassDB::bind_method(D_METHOD("is_using_occlusion_culling"), &Viewport::is_using_occlusion_culling);
+
 	ClassDB::bind_method(D_METHOD("set_debug_draw", "debug_draw"), &Viewport::set_debug_draw);
 	ClassDB::bind_method(D_METHOD("get_debug_draw"), &Viewport::get_debug_draw);
 
 	ClassDB::bind_method(D_METHOD("get_render_info", "info"), &Viewport::get_render_info);
 
+	ClassDB::bind_method(D_METHOD("set_use_xr", "use"), &Viewport::set_use_xr);
+	ClassDB::bind_method(D_METHOD("is_using_xr"), &Viewport::is_using_xr);
+
 	ClassDB::bind_method(D_METHOD("get_texture"), &Viewport::get_texture);
 
 	ClassDB::bind_method(D_METHOD("set_physics_object_picking", "enable"), &Viewport::set_physics_object_picking);
@@ -3566,6 +3591,7 @@ void Viewport::_bind_methods() {
 
 	ClassDB::bind_method(D_METHOD("_process_picking"), &Viewport::_process_picking);
 
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "use_xr"), "set_use_xr", "is_using_xr");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "own_world_3d"), "set_use_own_world_3d", "is_using_own_world_3d");
 	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "world_3d", PROPERTY_HINT_RESOURCE_TYPE, "World3D"), "set_world_3d", "get_world_3d");
 	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "world_2d", PROPERTY_HINT_RESOURCE_TYPE, "World2D", 0), "set_world_2d", "get_world_2d");
@@ -3577,6 +3603,7 @@ void Viewport::_bind_methods() {
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "msaa", PROPERTY_HINT_ENUM, "Disabled,2x,4x,8x,16x,AndroidVR 2x,AndroidVR 4x"), "set_msaa", "get_msaa");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "screen_space_aa", PROPERTY_HINT_ENUM, "Disabled,FXAA"), "set_screen_space_aa", "get_screen_space_aa");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "use_debanding"), "set_use_debanding", "is_using_debanding");
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "use_occlusion_culling"), "set_use_occlusion_culling", "is_using_occlusion_culling");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "lod_threshold", PROPERTY_HINT_RANGE, "0,1024,0.1"), "set_lod_threshold", "get_lod_threshold");
 	ADD_PROPERTY(PropertyInfo(Variant::INT, "debug_draw", PROPERTY_HINT_ENUM, "Disabled,Unshaded,Overdraw,Wireframe"), "set_debug_draw", "get_debug_draw");
 	ADD_GROUP("Canvas Items", "canvas_item_");
@@ -3658,6 +3685,7 @@ void Viewport::_bind_methods() {
 	BIND_ENUM_CONSTANT(DEBUG_DRAW_CLUSTER_SPOT_LIGHTS);
 	BIND_ENUM_CONSTANT(DEBUG_DRAW_CLUSTER_DECALS);
 	BIND_ENUM_CONSTANT(DEBUG_DRAW_CLUSTER_REFLECTION_PROBES);
+	BIND_ENUM_CONSTANT(DEBUG_DRAW_OCCLUDERS);
 
 	BIND_ENUM_CONSTANT(DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_NEAREST);
 	BIND_ENUM_CONSTANT(DEFAULT_CANVAS_ITEM_TEXTURE_FILTER_LINEAR);
@@ -3732,16 +3760,6 @@ Viewport::~Viewport() {
 
 /////////////////////////////////
 
-void SubViewport::set_use_xr(bool p_use_xr) {
-	xr = p_use_xr;
-
-	RS::get_singleton()->viewport_set_use_xr(get_viewport_rid(), xr);
-}
-
-bool SubViewport::is_using_xr() {
-	return xr;
-}
-
 void SubViewport::set_size(const Size2i &p_size) {
 	_set_size(p_size, _get_size_2d_override(), Rect2i(), _stretch_transform(), true);
 }
@@ -3814,9 +3832,6 @@ void SubViewport::_notification(int p_what) {
 }
 
 void SubViewport::_bind_methods() {
-	ClassDB::bind_method(D_METHOD("set_use_xr", "use"), &SubViewport::set_use_xr);
-	ClassDB::bind_method(D_METHOD("is_using_xr"), &SubViewport::is_using_xr);
-
 	ClassDB::bind_method(D_METHOD("set_size", "size"), &SubViewport::set_size);
 	ClassDB::bind_method(D_METHOD("get_size"), &SubViewport::get_size);
 
@@ -3832,7 +3847,6 @@ void SubViewport::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_clear_mode", "mode"), &SubViewport::set_clear_mode);
 	ClassDB::bind_method(D_METHOD("get_clear_mode"), &SubViewport::get_clear_mode);
 
-	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "xr"), "set_use_xr", "is_using_xr");
 	ADD_PROPERTY(PropertyInfo(Variant::VECTOR2, "size"), "set_size", "get_size");
 	ADD_PROPERTY(PropertyInfo(Variant::VECTOR2, "size_2d_override"), "set_size_2d_override", "get_size_2d_override");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "size_2d_override_stretch"), "set_size_2d_override_stretch", "is_size_2d_override_stretch_enabled");
diff --git a/scene/main/viewport.h b/scene/main/viewport.h
index 8e79b50385..2e88e1251d 100644
--- a/scene/main/viewport.h
+++ b/scene/main/viewport.h
@@ -147,6 +147,7 @@ public:
 		DEBUG_DRAW_CLUSTER_SPOT_LIGHTS,
 		DEBUG_DRAW_CLUSTER_DECALS,
 		DEBUG_DRAW_CLUSTER_REFLECTION_PROBES,
+		DEBUG_DRAW_OCCLUDERS,
 	};
 
 	enum DefaultCanvasItemTextureFilter {
@@ -233,6 +234,7 @@ private:
 	Size2i size;
 	Size2i size_2d_override;
 	bool size_allocated = false;
+	bool use_xr = false;
 
 	RID contact_2d_debug;
 	RID contact_3d_debug_multimesh;
@@ -304,6 +306,7 @@ private:
 	ScreenSpaceAA screen_space_aa = SCREEN_SPACE_AA_DISABLED;
 	bool use_debanding = false;
 	float lod_threshold = 1.0;
+	bool use_occlusion_culling = false;
 
 	Ref<ViewportTexture> default_texture;
 	Set<ViewportTexture *> viewport_textures;
@@ -480,7 +483,6 @@ protected:
 	void _notification(int p_what);
 	void _process_picking();
 	static void _bind_methods();
-	virtual void _validate_property(PropertyInfo &property) const override;
 
 public:
 	uint64_t get_processed_events_count() const { return event_count; }
@@ -533,6 +535,9 @@ public:
 	void set_transparent_background(bool p_enable);
 	bool has_transparent_background() const;
 
+	void set_use_xr(bool p_use_xr);
+	bool is_using_xr();
+
 	Ref<ViewportTexture> get_texture() const;
 
 	void set_shadow_atlas_size(int p_size);
@@ -556,6 +561,9 @@ public:
 	void set_lod_threshold(float p_pixels);
 	float get_lod_threshold() const;
 
+	void set_use_occlusion_culling(bool p_use_occlusion_culling);
+	bool is_using_occlusion_culling() const;
+
 	Vector2 get_camera_coords(const Vector2 &p_viewport_coords) const;
 	Vector2 get_camera_rect_size() const;
 
@@ -580,7 +588,7 @@ public:
 	void gui_reset_canvas_sort_index();
 	int gui_get_canvas_sort_index();
 
-	virtual String get_configuration_warning() const override;
+	TypedArray<String> get_configuration_warnings() const override;
 
 	void set_debug_draw(DebugDraw p_debug_draw);
 	DebugDraw get_debug_draw() const;
@@ -652,7 +660,6 @@ public:
 private:
 	UpdateMode update_mode = UPDATE_WHEN_VISIBLE;
 	ClearMode clear_mode = CLEAR_MODE_ALWAYS;
-	bool xr = false;
 	bool size_2d_override_stretch = false;
 
 protected:
@@ -668,9 +675,6 @@ public:
 	void set_size_2d_override(const Size2i &p_size);
 	Size2i get_size_2d_override() const;
 
-	void set_use_xr(bool p_use_xr);
-	bool is_using_xr();
-
 	void set_size_2d_override_stretch(bool p_enable);
 	bool is_size_2d_override_stretch_enabled() const;
 
diff --git a/scene/register_scene_types.cpp b/scene/register_scene_types.cpp
index 232ad278dd..b16532676f 100644
--- a/scene/register_scene_types.cpp
+++ b/scene/register_scene_types.cpp
@@ -208,6 +208,7 @@
 #include "scene/3d/navigation_agent_3d.h"
 #include "scene/3d/navigation_obstacle_3d.h"
 #include "scene/3d/navigation_region_3d.h"
+#include "scene/3d/occluder_instance_3d.h"
 #include "scene/3d/path_3d.h"
 #include "scene/3d/physics_body_3d.h"
 #include "scene/3d/physics_joint_3d.h"
@@ -442,6 +443,8 @@ void register_scene_types() {
 	ClassDB::register_class<XRAnchor3D>();
 	ClassDB::register_class<XROrigin3D>();
 	ClassDB::register_class<MeshInstance3D>();
+	ClassDB::register_class<OccluderInstance3D>();
+	ClassDB::register_class<Occluder3D>();
 	ClassDB::register_class<ImmediateGeometry3D>();
 	ClassDB::register_virtual_class<SpriteBase3D>();
 	ClassDB::register_class<Sprite3D>();
@@ -686,6 +689,8 @@ void register_scene_types() {
 	ClassDB::register_class<PrismMesh>();
 	ClassDB::register_class<QuadMesh>();
 	ClassDB::register_class<SphereMesh>();
+	ClassDB::register_class<TubeTrailMesh>();
+	ClassDB::register_class<RibbonTrailMesh>();
 	ClassDB::register_class<PointMesh>();
 	ClassDB::register_virtual_class<Material>();
 	ClassDB::register_virtual_class<BaseMaterial3D>();
@@ -726,7 +731,6 @@ void register_scene_types() {
 	ClassDB::register_class<ImageTexture>();
 	ClassDB::register_class<AtlasTexture>();
 	ClassDB::register_class<MeshTexture>();
-	ClassDB::register_class<LargeTexture>();
 	ClassDB::register_class<CurveTexture>();
 	ClassDB::register_class<GradientTexture>();
 	ClassDB::register_class<ProxyTexture>();
diff --git a/scene/resources/audio_stream_sample.cpp b/scene/resources/audio_stream_sample.cpp
index 06a91fb2f8..9a9f019dda 100644
--- a/scene/resources/audio_stream_sample.cpp
+++ b/scene/resources/audio_stream_sample.cpp
@@ -490,9 +490,9 @@ void AudioStreamSample::set_data(const Vector<uint8_t> &p_data) {
 		const uint8_t *r = p_data.ptr();
 		int alloc_len = datalen + DATA_PAD * 2;
 		data = memalloc(alloc_len); //alloc with some padding for interpolation
-		zeromem(data, alloc_len);
+		memset(data, 0, alloc_len);
 		uint8_t *dataptr = (uint8_t *)data;
-		copymem(dataptr + DATA_PAD, r, datalen);
+		memcpy(dataptr + DATA_PAD, r, datalen);
 		data_bytes = datalen;
 	}
 
@@ -507,7 +507,7 @@ Vector<uint8_t> AudioStreamSample::get_data() const {
 		{
 			uint8_t *w = pv.ptrw();
 			uint8_t *dataptr = (uint8_t *)data;
-			copymem(w, dataptr + DATA_PAD, data_bytes);
+			memcpy(w, dataptr + DATA_PAD, data_bytes);
 		}
 	}
 
diff --git a/scene/resources/bit_map.cpp b/scene/resources/bit_map.cpp
index 3cc1af59ae..e9bfac3653 100644
--- a/scene/resources/bit_map.cpp
+++ b/scene/resources/bit_map.cpp
@@ -39,7 +39,7 @@ void BitMap::create(const Size2 &p_size) {
 	width = p_size.width;
 	height = p_size.height;
 	bitmask.resize(((width * height) / 8) + 1);
-	zeromem(bitmask.ptrw(), bitmask.size());
+	memset(bitmask.ptrw(), 0, bitmask.size());
 }
 
 void BitMap::create_from_image_alpha(const Ref<Image> &p_image, float p_threshold) {
diff --git a/scene/resources/curve.cpp b/scene/resources/curve.cpp
index bc479e557a..846da39221 100644
--- a/scene/resources/curve.cpp
+++ b/scene/resources/curve.cpp
@@ -445,10 +445,10 @@ void Curve::set_bake_resolution(int p_resolution) {
 	_baked_cache_dirty = true;
 }
 
-real_t Curve::interpolate_baked(real_t offset) {
+real_t Curve::interpolate_baked(real_t offset) const {
 	if (_baked_cache_dirty) {
 		// Last-second bake if not done already
-		bake();
+		const_cast<Curve *>(this)->bake();
 	}
 
 	// Special cases if the cache is too small
diff --git a/scene/resources/curve.h b/scene/resources/curve.h
index 402c893cd8..746c6fa597 100644
--- a/scene/resources/curve.h
+++ b/scene/resources/curve.h
@@ -122,7 +122,7 @@ public:
 	void bake();
 	int get_bake_resolution() const { return _bake_resolution; }
 	void set_bake_resolution(int p_resolution);
-	real_t interpolate_baked(real_t offset);
+	real_t interpolate_baked(real_t offset) const;
 
 	void ensure_default_setup(float p_min, float p_max);
 
diff --git a/scene/resources/default_theme/default_theme.cpp b/scene/resources/default_theme/default_theme.cpp
index 85d097aa19..7c00c6d146 100644
--- a/scene/resources/default_theme/default_theme.cpp
+++ b/scene/resources/default_theme/default_theme.cpp
@@ -438,7 +438,7 @@ void fill_default_theme(Ref<Theme> &theme, const Ref<Font> &default_font, const
 	theme->set_color("font_selected_color", "LineEdit", Color(0, 0, 0));
 	theme->set_color("font_uneditable_color", "LineEdit", Color(control_font_color.r, control_font_color.g, control_font_color.b, 0.5f));
 	theme->set_color("font_outline_color", "LineEdit", Color(1, 1, 1));
-	theme->set_color("cursor_color", "LineEdit", control_font_hover_color);
+	theme->set_color("caret_color", "LineEdit", control_font_hover_color);
 	theme->set_color("selection_color", "LineEdit", control_selection_color);
 	theme->set_color("clear_button_color", "LineEdit", control_font_color);
 	theme->set_color("clear_button_color_pressed", "LineEdit", control_font_pressed_color);
@@ -830,7 +830,6 @@ void fill_default_theme(Ref<Theme> &theme, const Ref<Font> &default_font, const
 	theme->set_stylebox("tab_selected", "Tabs", sb_expand(make_stylebox(tab_current_png, 4, 3, 4, 1, 16, 3, 16, 2), 2, 2, 2, 2));
 	theme->set_stylebox("tab_unselected", "Tabs", sb_expand(make_stylebox(tab_behind_png, 5, 4, 5, 1, 16, 5, 16, 2), 3, 3, 3, 3));
 	theme->set_stylebox("tab_disabled", "Tabs", sb_expand(make_stylebox(tab_disabled_png, 5, 5, 5, 1, 16, 6, 16, 4), 3, 0, 3, 3));
-	theme->set_stylebox("panel", "Tabs", tc_sb);
 	theme->set_stylebox("button_pressed", "Tabs", make_stylebox(button_pressed_png, 4, 4, 4, 4));
 	theme->set_stylebox("button", "Tabs", make_stylebox(button_normal_png, 4, 4, 4, 4));
 
@@ -891,6 +890,7 @@ void fill_default_theme(Ref<Theme> &theme, const Ref<Font> &default_font, const
 	theme->set_icon("preset_bg", "ColorPicker", make_icon(mini_checkerboard_png));
 	theme->set_icon("overbright_indicator", "ColorPicker", make_icon(overbright_indicator_png));
 	theme->set_icon("bar_arrow", "ColorPicker", make_icon(bar_arrow_png));
+	theme->set_icon("picker_cursor", "ColorPicker", make_icon(picker_cursor_png));
 
 	theme->set_icon("bg", "ColorPickerButton", make_icon(mini_checkerboard_png));
 
diff --git a/scene/resources/default_theme/picker_cursor.png b/scene/resources/default_theme/picker_cursor.png
new file mode 100644
index 0000000000..2f403492d2
--- /dev/null
+++ b/scene/resources/default_theme/picker_cursor.png
diff --git a/scene/resources/default_theme/theme_data.h b/scene/resources/default_theme/theme_data.h
index 5d4dbd0758..190f2a03d9 100644
--- a/scene/resources/default_theme/theme_data.h
+++ b/scene/resources/default_theme/theme_data.h
@@ -266,6 +266,10 @@ static const unsigned char panel_bg_png[] = {
 	0x89, 0x50, 0x4e, 0x47, 0xd, 0xa, 0x1a, 0xa, 0x0, 0x0, 0x0, 0xd, 0x49, 0x48, 0x44, 0x52, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0, 0x8, 0x1, 0x3, 0x0, 0x0, 0x0, 0xfe, 0xc1, 0x2c, 0xc8, 0x0, 0x0, 0x0, 0x6, 0x50, 0x4c, 0x54, 0x45, 0x25, 0x25, 0x2a, 0x35, 0x32, 0x3b, 0x4a, 0x73, 0x58, 0x4a, 0x0, 0x0, 0x0, 0xa, 0x49, 0x44, 0x41, 0x54, 0x78, 0xda, 0x63, 0x40, 0x3, 0x0, 0x0, 0x10, 0x0, 0x1, 0xb3, 0xac, 0xe2, 0xd0, 0x0, 0x0, 0x0, 0x0, 0x49, 0x45, 0x4e, 0x44, 0xae, 0x42, 0x60, 0x82
 };
 
+static const unsigned char picker_cursor_png[] = {
+	0x89, 0x50, 0x4e, 0x47, 0xd, 0xa, 0x1a, 0xa, 0x0, 0x0, 0x0, 0xd, 0x49, 0x48, 0x44, 0x52, 0x0, 0x0, 0x0, 0xc, 0x0, 0x0, 0x0, 0xc, 0x8, 0x4, 0x0, 0x0, 0x0, 0xfc, 0x7c, 0x94, 0x6c, 0x0, 0x0, 0x0, 0x9, 0x70, 0x48, 0x59, 0x73, 0x0, 0x0, 0xe, 0xc3, 0x0, 0x0, 0xe, 0xc3, 0x1, 0xc7, 0x6f, 0xa8, 0x64, 0x0, 0x0, 0x0, 0x19, 0x74, 0x45, 0x58, 0x74, 0x53, 0x6f, 0x66, 0x74, 0x77, 0x61, 0x72, 0x65, 0x0, 0x77, 0x77, 0x77, 0x2e, 0x69, 0x6e, 0x6b, 0x73, 0x63, 0x61, 0x70, 0x65, 0x2e, 0x6f, 0x72, 0x67, 0x9b, 0xee, 0x3c, 0x1a, 0x0, 0x0, 0x0, 0xb9, 0x49, 0x44, 0x41, 0x54, 0x18, 0xd3, 0x6d, 0x8f, 0x3d, 0x8a, 0xc2, 0x50, 0x18, 0x45, 0xcf, 0x6b, 0x92, 0x2a, 0x19, 0xd4, 0xa4, 0x72, 0x47, 0x3, 0x42, 0xc0, 0x9f, 0x55, 0x44, 0x17, 0x24, 0x88, 0xee, 0x24, 0x53, 0x4d, 0x7e, 0xa, 0xbf, 0x94, 0xd6, 0x71, 0x5, 0xf2, 0x5e, 0x7f, 0x2d, 0xa2, 0xa2, 0xe0, 0x29, 0xef, 0xb9, 0xcd, 0x1, 0x40, 0xb1, 0x76, 0x6a, 0x14, 0x14, 0xd4, 0x68, 0xab, 0x98, 0x11, 0xcd, 0xd5, 0xef, 0x9b, 0xac, 0x27, 0x10, 0x32, 0x3b, 0xb4, 0x32, 0xcd, 0xc7, 0x77, 0xff, 0xfb, 0xc7, 0xc0, 0x92, 0x84, 0x84, 0x82, 0xcb, 0xa2, 0x92, 0x29, 0x46, 0xbb, 0x7d, 0xc3, 0xc0, 0x94, 0x27, 0x13, 0x86, 0x63, 0xa7, 0x12, 0xb5, 0x59, 0xcf, 0x8a, 0x77, 0xd6, 0xb9, 0xa9, 0x46, 0xde, 0x5, 0x92, 0xf, 0x91, 0x3a, 0x2f, 0xff, 0x4d, 0xfc, 0x38, 0xaf, 0x1b, 0x6a, 0x33, 0xa3, 0xf8, 0x10, 0x9b, 0xfc, 0xac, 0x1a, 0x6d, 0xf, 0x2d, 0x17, 0x26, 0xaf, 0x79, 0xc6, 0xf5, 0xd4, 0xa9, 0x44, 0xb1, 0x6c, 0x51, 0x31, 0xb0, 0x26, 0x25, 0x65, 0xc3, 0xb5, 0xa8, 0x64, 0x8a, 0xc6, 0x40, 0x3b, 0x76, 0xb9, 0xb9, 0xe0, 0x42, 0x7e, 0x3e, 0x75, 0x8f, 0x40, 0x0, 0x45, 0x2a, 0x55, 0xcb, 0xcb, 0xeb, 0x5f, 0xa5, 0x22, 0x80, 0x3b, 0xa0, 0x2c, 0x6c, 0xa1, 0x40, 0x2f, 0xda, 0xfc, 0x0, 0x0, 0x0, 0x0, 0x49, 0x45, 0x4e, 0x44, 0xae, 0x42, 0x60, 0x82
+};
+
 static const unsigned char popup_bg_png[] = {
 	0x89, 0x50, 0x4e, 0x47, 0xd, 0xa, 0x1a, 0xa, 0x0, 0x0, 0x0, 0xd, 0x49, 0x48, 0x44, 0x52, 0x0, 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x10, 0x8, 0x3, 0x0, 0x0, 0x0, 0x28, 0x2d, 0xf, 0x53, 0x0, 0x0, 0x0, 0xa2, 0x50, 0x4c, 0x54, 0x45, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3b, 0x3b, 0x43, 0x42, 0x42, 0x4b, 0x3e, 0x3e, 0x47, 0x3e, 0x3e, 0x46, 0x41, 0x41, 0x4a, 0x0, 0x0, 0x0, 0x3d, 0x3d, 0x45, 0x3b, 0x3b, 0x43, 0x3a, 0x3a, 0x42, 0x38, 0x38, 0x41, 0x37, 0x37, 0x3e, 0x36, 0x36, 0x3d, 0x35, 0x35, 0x3c, 0x0, 0x0, 0x0, 0x38, 0x38, 0x40, 0x38, 0x38, 0x40, 0x31, 0x31, 0x38, 0x34, 0x34, 0x3b, 0x34, 0x34, 0x3b, 0x39, 0x39, 0x3f, 0x31, 0x31, 0x38, 0x2f, 0x2f, 0x36, 0x2d, 0x2d, 0x33, 0x2c, 0x2c, 0x32, 0x2b, 0x2b, 0x31, 0x2a, 0x2a, 0x31, 0x2a, 0x2a, 0x30, 0x29, 0x29, 0x30, 0x29, 0x29, 0x2f, 0x28, 0x28, 0x2e, 0x28, 0x28, 0x2d, 0x27, 0x27, 0x2d, 0x27, 0x27, 0x2c, 0x29, 0x29, 0x2e, 0x26, 0x26, 0x2c, 0x36, 0xc6, 0xc8, 0x93, 0x0, 0x0, 0x0, 0x28, 0x74, 0x52, 0x4e, 0x53, 0x0, 0x1, 0x3, 0x5, 0x8, 0xa, 0xb, 0x4, 0x13, 0x19, 0x1f, 0x22, 0x23, 0x16, 0x27, 0x35, 0x3f, 0x45, 0x46, 0x94, 0xf5, 0xfa, 0xfb, 0xf5, 0x40, 0xfc, 0xfb, 0xfb, 0xfb, 0xfb, 0xfc, 0xfc, 0x1a, 0xf5, 0xf6, 0x95, 0xfa, 0xfb, 0xf4, 0x94, 0x71, 0xda, 0xac, 0x92, 0x0, 0x0, 0x0, 0x7f, 0x49, 0x44, 0x41, 0x54, 0x78, 0xda, 0x65, 0x8f, 0x35, 0x82, 0xc3, 0x0, 0xc, 0x4, 0x77, 0x24, 0x85, 0xba, 0xe3, 0xff, 0xff, 0xee, 0xca, 0x74, 0x41, 0xdb, 0x32, 0xf3, 0x94, 0x82, 0x85, 0x10, 0x1d, 0x92, 0xb2, 0x3, 0x8e, 0x95, 0x77, 0x93, 0x6c, 0x28, 0xed, 0x15, 0x54, 0x67, 0xa6, 0x41, 0x3e, 0x8, 0x9c, 0xc3, 0xf4, 0xf2, 0xf6, 0x2a, 0x80, 0xf8, 0x44, 0x2d, 0x79, 0x2d, 0x20, 0xe0, 0x2, 0xa8, 0xc3, 0x2e, 0x6f, 0xc, 0x9e, 0x4c, 0x3c, 0x21, 0x4, 0xd8, 0xf0, 0x2, 0x28, 0x24, 0xcd, 0x3, 0xa9, 0x19, 0x64, 0xce, 0x83, 0x4c, 
0x45, 0xe6, 0x69, 0x1a, 0xd8, 0xe9, 0x99, 0x96, 0x7f, 0x77, 0x37, 0x59, 0x83, 0xcc, 0xef, 0x7f, 0x89, 0x1f, 0x8e, 0xbf, 0x95, 0xd3, 0x1d, 0xf0, 0xff, 0x7a, 0x63, 0x7e, 0x86, 0xcb, 0x73, 0x8c, 0x5e, 0xee, 0xca, 0xb1, 0xad, 0x5f, 0x3, 0xaf, 0xdb, 0x49, 0x94, 0x4b, 0x90, 0x40, 0xdf, 0x0, 0x0, 0x0, 0x0, 0x49, 0x45, 0x4e, 0x44, 0xae, 0x42, 0x60, 0x82
 };
diff --git a/scene/resources/height_map_shape_3d.cpp b/scene/resources/height_map_shape_3d.cpp
index 5593bb766f..de5da944bc 100644
--- a/scene/resources/height_map_shape_3d.cpp
+++ b/scene/resources/height_map_shape_3d.cpp
@@ -41,10 +41,10 @@ Vector<Vector3> HeightMapShape3D::get_debug_mesh_lines() const {
 		Vector2 size(map_width - 1, map_depth - 1);
 		Vector2 start = size * -0.5;
 
-		const real_t *r = map_data.ptr();
+		const float *r = map_data.ptr();
 
 		// reserve some memory for our points..
-		points.resize(((map_width - 1) * map_depth * 2) + (map_width * (map_depth - 1) * 2));
+		points.resize(((map_width - 1) * map_depth * 2) + (map_width * (map_depth - 1) * 2) + ((map_width - 1) * (map_depth - 1) * 2));
 
 		// now set our points
 		int r_offset = 0;
@@ -65,6 +65,11 @@ Vector<Vector3> HeightMapShape3D::get_debug_mesh_lines() const {
 					points.write[w_offset++] = Vector3(height.x, r[r_offset + map_width - 1], height.z + 1.0);
 				}
 
+				if ((w != map_width - 1) && (d != map_depth - 1)) {
+					points.write[w_offset++] = Vector3(height.x + 1.0, r[r_offset], height.z);
+					points.write[w_offset++] = Vector3(height.x, r[r_offset + map_width - 1], height.z + 1.0);
+				}
+
 				height.x += 1.0;
 			}
 
@@ -100,7 +105,7 @@ void HeightMapShape3D::set_map_width(int p_new) {
 		int new_size = map_width * map_depth;
 		map_data.resize(map_width * map_depth);
 
-		real_t *w = map_data.ptrw();
+		float *w = map_data.ptrw();
 		while (was_size < new_size) {
 			w[was_size++] = 0.0;
 		}
@@ -124,7 +129,7 @@ void HeightMapShape3D::set_map_depth(int p_new) {
 		int new_size = map_width * map_depth;
 		map_data.resize(new_size);
 
-		real_t *w = map_data.ptrw();
+		float *w = map_data.ptrw();
 		while (was_size < new_size) {
 			w[was_size++] = 0.0;
 		}
@@ -146,8 +151,8 @@ void HeightMapShape3D::set_map_data(PackedFloat32Array p_new) {
 	}
 
 	// copy
-	real_t *w = map_data.ptrw();
-	const real_t *r = p_new.ptr();
+	float *w = map_data.ptrw();
+	const float *r = p_new.ptr();
 	for (int i = 0; i < size; i++) {
 		float val = r[i];
 		w[i] = val;
@@ -189,7 +194,7 @@ void HeightMapShape3D::_bind_methods() {
 HeightMapShape3D::HeightMapShape3D() :
 		Shape3D(PhysicsServer3D::get_singleton()->shape_create(PhysicsServer3D::SHAPE_HEIGHTMAP)) {
 	map_data.resize(map_width * map_depth);
-	real_t *w = map_data.ptrw();
+	float *w = map_data.ptrw();
 	w[0] = 0.0;
 	w[1] = 0.0;
 	w[2] = 0.0;
diff --git a/scene/resources/height_map_shape_3d.h b/scene/resources/height_map_shape_3d.h
index 6fc88cff90..1219791c56 100644
--- a/scene/resources/height_map_shape_3d.h
+++ b/scene/resources/height_map_shape_3d.h
@@ -39,8 +39,8 @@ class HeightMapShape3D : public Shape3D {
 	int map_width = 2;
 	int map_depth = 2;
 	PackedFloat32Array map_data;
-	float min_height = 0.0;
-	float max_height = 0.0;
+	real_t min_height = 0.0;
+	real_t max_height = 0.0;
 
 protected:
 	static void _bind_methods();
diff --git a/scene/resources/material.cpp b/scene/resources/material.cpp
index 0d02bde90d..e8157c7165 100644
--- a/scene/resources/material.cpp
+++ b/scene/resources/material.cpp
@@ -543,6 +543,9 @@ void BaseMaterial3D::_update_shader() {
 	if (flags[FLAG_DISABLE_DEPTH_TEST]) {
 		code += ",depth_test_disabled";
 	}
+	if (flags[FLAG_PARTICLE_TRAILS_MODE]) {
+		code += ",particle_trails";
+	}
 	if (shading_mode == SHADING_MODE_PER_VERTEX) {
 		code += ",vertex_lighting";
 	}
@@ -672,7 +675,7 @@ void BaseMaterial3D::_update_shader() {
 		code += "uniform sampler2D texture_flowmap : hint_aniso," + texfilter_str + ";\n";
 	}
 	if (features[FEATURE_AMBIENT_OCCLUSION]) {
-		code += "uniform sampler2D texture_ambient_occlusion : hint_white;\n";
+		code += "uniform sampler2D texture_ambient_occlusion : hint_white, " + texfilter_str + ";\n";
 		code += "uniform vec4 ao_texture_channel;\n";
 		code += "uniform float ao_light_affect;\n";
 	}
@@ -829,16 +832,26 @@ void BaseMaterial3D::_update_shader() {
 	}
 
 	if (flags[FLAG_UV1_USE_TRIPLANAR]) {
-		code += "\tuv1_power_normal=pow(abs(NORMAL),vec3(uv1_blend_sharpness));\n";
+		if (flags[FLAG_UV1_USE_WORLD_TRIPLANAR]) {
+			code += "\tuv1_power_normal=pow(abs(mat3(WORLD_MATRIX) * NORMAL),vec3(uv1_blend_sharpness));\n";
+			code += "\tuv1_triplanar_pos = (WORLD_MATRIX * vec4(VERTEX, 1.0f)).xyz * uv1_scale + uv1_offset;\n";
+		} else {
+			code += "\tuv1_power_normal=pow(abs(NORMAL),vec3(uv1_blend_sharpness));\n";
+			code += "\tuv1_triplanar_pos = VERTEX * uv1_scale + uv1_offset;\n";
+		}
 		code += "\tuv1_power_normal/=dot(uv1_power_normal,vec3(1.0));\n";
-		code += "\tuv1_triplanar_pos = VERTEX * uv1_scale + uv1_offset;\n";
 		code += "\tuv1_triplanar_pos *= vec3(1.0,-1.0, 1.0);\n";
 	}
 
 	if (flags[FLAG_UV2_USE_TRIPLANAR]) {
-		code += "\tuv2_power_normal=pow(abs(NORMAL), vec3(uv2_blend_sharpness));\n";
+		if (flags[FLAG_UV2_USE_WORLD_TRIPLANAR]) {
+			code += "\tuv2_power_normal=pow(abs(mat3(WORLD_MATRIX) * NORMAL), vec3(uv2_blend_sharpness));\n";
+			code += "\tuv2_triplanar_pos = (WORLD_MATRIX * vec4(VERTEX, 1.0f)).xyz * uv2_scale + uv2_offset;\n";
+		} else {
+			code += "\tuv2_power_normal=pow(abs(NORMAL), vec3(uv2_blend_sharpness));\n";
+			code += "\tuv2_triplanar_pos = VERTEX * uv2_scale + uv2_offset;\n";
+		}
 		code += "\tuv2_power_normal/=dot(uv2_power_normal,vec3(1.0));\n";
-		code += "\tuv2_triplanar_pos = VERTEX * uv2_scale + uv2_offset;\n";
 		code += "\tuv2_triplanar_pos *= vec3(1.0,-1.0, 1.0);\n";
 	}
 
@@ -1587,6 +1600,9 @@ void BaseMaterial3D::set_flag(Flags p_flag, bool p_enabled) {
 	if (p_flag == FLAG_USE_SHADOW_TO_OPACITY || p_flag == FLAG_USE_TEXTURE_REPEAT || p_flag == FLAG_SUBSURFACE_MODE_SKIN || p_flag == FLAG_USE_POINT_SIZE) {
 		notify_property_list_changed();
 	}
+	if (p_flag == FLAG_PARTICLE_TRAILS_MODE) {
+		update_configuration_warning();
+	}
 	_queue_shader_change();
 }
 
@@ -2167,6 +2183,8 @@ Shader::Mode BaseMaterial3D::get_shader_mode() const {
 }
 
 void BaseMaterial3D::_bind_methods() {
+	static_assert(sizeof(MaterialKey) == 16, "MaterialKey should be 16 bytes");
+
 	ClassDB::bind_method(D_METHOD("set_albedo", "albedo"), &BaseMaterial3D::set_albedo);
 	ClassDB::bind_method(D_METHOD("get_albedo"), &BaseMaterial3D::get_albedo);
 
@@ -2524,6 +2542,7 @@ void BaseMaterial3D::_bind_methods() {
 	ADD_PROPERTYI(PropertyInfo(Variant::BOOL, "fixed_size"), "set_flag", "get_flag", FLAG_FIXED_SIZE);
 	ADD_PROPERTYI(PropertyInfo(Variant::BOOL, "use_point_size"), "set_flag", "get_flag", FLAG_USE_POINT_SIZE);
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "point_size", PROPERTY_HINT_RANGE, "0.1,128,0.1"), "set_point_size", "get_point_size");
+	ADD_PROPERTYI(PropertyInfo(Variant::BOOL, "use_particle_trails"), "set_flag", "get_flag", FLAG_PARTICLE_TRAILS_MODE);
 	ADD_GROUP("Proximity Fade", "proximity_fade_");
 	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "proximity_fade_enable"), "set_proximity_fade", "is_proximity_fade_enabled");
 	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "proximity_fade_distance", PROPERTY_HINT_RANGE, "0,4096,0.01"), "set_proximity_fade_distance", "get_proximity_fade_distance");
@@ -2625,6 +2644,7 @@ void BaseMaterial3D::_bind_methods() {
 	BIND_ENUM_CONSTANT(FLAG_USE_TEXTURE_REPEAT);
 	BIND_ENUM_CONSTANT(FLAG_INVERT_HEIGHTMAP);
 	BIND_ENUM_CONSTANT(FLAG_SUBSURFACE_MODE_SKIN);
+	BIND_ENUM_CONSTANT(FLAG_PARTICLE_TRAILS_MODE);
 	BIND_ENUM_CONSTANT(FLAG_MAX);
 
 	BIND_ENUM_CONSTANT(DIFFUSE_BURLEY);
diff --git a/scene/resources/material.h b/scene/resources/material.h
index 70452a5f74..ad1b7b3e33 100644
--- a/scene/resources/material.h
+++ b/scene/resources/material.h
@@ -235,6 +235,7 @@ public:
 		FLAG_USE_TEXTURE_REPEAT,
 		FLAG_INVERT_HEIGHTMAP,
 		FLAG_SUBSURFACE_MODE_SKIN,
+		FLAG_PARTICLE_TRAILS_MODE,
 		FLAG_MAX
 	};
 
@@ -305,16 +306,15 @@ private:
 		uint64_t roughness_channel : get_num_bits(TEXTURE_CHANNEL_MAX - 1);
 		uint64_t emission_op : get_num_bits(EMISSION_OP_MAX - 1);
 		uint64_t distance_fade : get_num_bits(DISTANCE_FADE_MAX - 1);
-
-		// flag bitfield
-		uint64_t feature_mask : FEATURE_MAX - 1;
-		uint64_t flags : FLAG_MAX - 1;
-
 		// booleans
 		uint64_t deep_parallax : 1;
 		uint64_t grow : 1;
 		uint64_t proximity_fade : 1;
 
+		// flag bitfield
+		uint32_t feature_mask;
+		uint32_t flags;
+
 		MaterialKey() {
 			memset(this, 0, sizeof(MaterialKey));
 		}
diff --git a/scene/resources/mesh.cpp b/scene/resources/mesh.cpp
index f8e1ce6a61..33ad15b938 100644
--- a/scene/resources/mesh.cpp
+++ b/scene/resources/mesh.cpp
@@ -579,6 +579,13 @@ Vector<Ref<Shape3D>> Mesh::convex_decompose() const {
 	return ret;
 }
 
+int Mesh::get_builtin_bind_pose_count() const {
+	return 0;
+}
+Transform Mesh::get_builtin_bind_pose(int p_index) const {
+	return Transform();
+}
+
 Mesh::Mesh() {
 }
 
@@ -1394,7 +1401,7 @@ void ArrayMesh::regen_normal_maps() {
 }
 
 //dirty hack
-bool (*array_mesh_lightmap_unwrap_callback)(float p_texel_size, const float *p_vertices, const float *p_normals, int p_vertex_count, const int *p_indices, int p_index_count, float **r_uv, int **r_vertex, int *r_vertex_count, int **r_index, int *r_index_count, int *r_size_hint_x, int *r_size_hint_y, int *&r_cache_data, unsigned int &r_cache_size, bool &r_used_cache);
+bool (*array_mesh_lightmap_unwrap_callback)(float p_texel_size, const float *p_vertices, const float *p_normals, int p_vertex_count, const int *p_indices, int p_index_count, const uint8_t *p_cache_data, bool *r_use_cache, uint8_t **r_mesh_cache, int *r_mesh_cache_size, float **r_uv, int **r_vertex, int *r_vertex_count, int **r_index, int *r_index_count, int *r_size_hint_x, int *r_size_hint_y) = nullptr;
 
 struct ArrayMeshLightmapSurface {
 	Ref<Material> material;
@@ -1404,28 +1411,28 @@ struct ArrayMeshLightmapSurface {
 };
 
 Error ArrayMesh::lightmap_unwrap(const Transform &p_base_transform, float p_texel_size) {
-	int *cache_data = nullptr;
-	unsigned int cache_size = 0;
-	bool use_cache = false; // Don't use cache
-	return lightmap_unwrap_cached(cache_data, cache_size, use_cache, p_base_transform, p_texel_size);
+	Vector<uint8_t> null_cache;
+	return lightmap_unwrap_cached(p_base_transform, p_texel_size, null_cache, null_cache, false);
 }
 
-Error ArrayMesh::lightmap_unwrap_cached(int *&r_cache_data, unsigned int &r_cache_size, bool &r_used_cache, const Transform &p_base_transform, float p_texel_size) {
+Error ArrayMesh::lightmap_unwrap_cached(const Transform &p_base_transform, float p_texel_size, const Vector<uint8_t> &p_src_cache, Vector<uint8_t> &r_dst_cache, bool p_generate_cache) {
 	ERR_FAIL_COND_V(!array_mesh_lightmap_unwrap_callback, ERR_UNCONFIGURED);
 	ERR_FAIL_COND_V_MSG(blend_shapes.size() != 0, ERR_UNAVAILABLE, "Can't unwrap mesh with blend shapes.");
 
-	Vector<float> vertices;
-	Vector<float> normals;
-	Vector<int> indices;
-	Vector<float> uv;
-	Vector<Pair<int, int>> uv_indices;
+	LocalVector<float> vertices;
+	LocalVector<float> normals;
+	LocalVector<int> indices;
+	LocalVector<float> uv;
+	LocalVector<Pair<int, int>> uv_indices;
 
 	Vector<ArrayMeshLightmapSurface> lightmap_surfaces;
 
 	// Keep only the scale
-	Transform transform = p_base_transform;
-	transform.origin = Vector3();
-	transform.looking_at(Vector3(1, 0, 0), Vector3(0, 1, 0));
+	Basis basis = p_base_transform.get_basis();
+	Vector3 scale = Vector3(basis.get_axis(0).length(), basis.get_axis(1).length(), basis.get_axis(2).length());
+
+	Transform transform;
+	transform.scale(scale);
 
 	Basis normal_basis = transform.basis.inverse().transposed();
 
@@ -1439,14 +1446,12 @@ Error ArrayMesh::lightmap_unwrap_cached(int *&r_cache_data, unsigned int &r_cach
 
 		Array arrays = surface_get_arrays(i);
 		s.material = surface_get_material(i);
-		SurfaceTool::create_vertex_array_from_triangle_arrays(arrays, s.vertices);
+		SurfaceTool::create_vertex_array_from_triangle_arrays(arrays, s.vertices, &s.format);
 
-		Vector<Vector3> rvertices = arrays[Mesh::ARRAY_VERTEX];
+		PackedVector3Array rvertices = arrays[Mesh::ARRAY_VERTEX];
 		int vc = rvertices.size();
-		const Vector3 *r = rvertices.ptr();
 
-		Vector<Vector3> rnormals = arrays[Mesh::ARRAY_NORMAL];
-		const Vector3 *rn = rnormals.ptr();
+		PackedVector3Array rnormals = arrays[Mesh::ARRAY_NORMAL];
 
 		int vertex_ofs = vertices.size() / 3;
 
@@ -1455,24 +1460,29 @@ Error ArrayMesh::lightmap_unwrap_cached(int *&r_cache_data, unsigned int &r_cach
 		uv_indices.resize(vertex_ofs + vc);
 
 		for (int j = 0; j < vc; j++) {
-			Vector3 v = transform.xform(r[j]);
-			Vector3 n = normal_basis.xform(rn[j]).normalized();
-
-			vertices.write[(j + vertex_ofs) * 3 + 0] = v.x;
-			vertices.write[(j + vertex_ofs) * 3 + 1] = v.y;
-			vertices.write[(j + vertex_ofs) * 3 + 2] = v.z;
-			normals.write[(j + vertex_ofs) * 3 + 0] = n.x;
-			normals.write[(j + vertex_ofs) * 3 + 1] = n.y;
-			normals.write[(j + vertex_ofs) * 3 + 2] = n.z;
-			uv_indices.write[j + vertex_ofs] = Pair<int, int>(i, j);
+			Vector3 v = transform.xform(rvertices[j]);
+			Vector3 n = normal_basis.xform(rnormals[j]).normalized();
+
+			vertices[(j + vertex_ofs) * 3 + 0] = v.x;
+			vertices[(j + vertex_ofs) * 3 + 1] = v.y;
+			vertices[(j + vertex_ofs) * 3 + 2] = v.z;
+			normals[(j + vertex_ofs) * 3 + 0] = n.x;
+			normals[(j + vertex_ofs) * 3 + 1] = n.y;
+			normals[(j + vertex_ofs) * 3 + 2] = n.z;
+			uv_indices[j + vertex_ofs] = Pair<int, int>(i, j);
 		}
 
-		Vector<int> rindices = arrays[Mesh::ARRAY_INDEX];
+		PackedInt32Array rindices = arrays[Mesh::ARRAY_INDEX];
 		int ic = rindices.size();
 
+		float eps = 1.19209290e-7F; // Taken from xatlas.h
 		if (ic == 0) {
 			for (int j = 0; j < vc / 3; j++) {
-				if (Face3(r[j * 3 + 0], r[j * 3 + 1], r[j * 3 + 2]).is_degenerate()) {
+				Vector3 p0 = transform.xform(rvertices[j * 3 + 0]);
+				Vector3 p1 = transform.xform(rvertices[j * 3 + 1]);
+				Vector3 p2 = transform.xform(rvertices[j * 3 + 2]);
+
+				if ((p0 - p1).length_squared() < eps || (p1 - p2).length_squared() < eps || (p2 - p0).length_squared() < eps) {
 					continue;
 				}
 
@@ -1482,15 +1492,18 @@ Error ArrayMesh::lightmap_unwrap_cached(int *&r_cache_data, unsigned int &r_cach
 			}
 
 		} else {
-			const int *ri = rindices.ptr();
-
 			for (int j = 0; j < ic / 3; j++) {
-				if (Face3(r[ri[j * 3 + 0]], r[ri[j * 3 + 1]], r[ri[j * 3 + 2]]).is_degenerate()) {
+				Vector3 p0 = transform.xform(rvertices[rindices[j * 3 + 0]]);
+				Vector3 p1 = transform.xform(rvertices[rindices[j * 3 + 1]]);
+				Vector3 p2 = transform.xform(rvertices[rindices[j * 3 + 2]]);
+
+				if ((p0 - p1).length_squared() < eps || (p1 - p2).length_squared() < eps || (p2 - p0).length_squared() < eps) {
 					continue;
 				}
-				indices.push_back(vertex_ofs + ri[j * 3 + 0]);
-				indices.push_back(vertex_ofs + ri[j * 3 + 1]);
-				indices.push_back(vertex_ofs + ri[j * 3 + 2]);
+
+				indices.push_back(vertex_ofs + rindices[j * 3 + 0]);
+				indices.push_back(vertex_ofs + rindices[j * 3 + 1]);
+				indices.push_back(vertex_ofs + rindices[j * 3 + 2]);
 			}
 		}
 
@@ -1499,6 +1512,9 @@ Error ArrayMesh::lightmap_unwrap_cached(int *&r_cache_data, unsigned int &r_cach
 
 	//unwrap
 
+	bool use_cache = p_generate_cache; // In: requests cache generation. Out: overwritten by the callback (passed by pointer) to report whether the cache was used.
+	uint8_t *gen_cache = nullptr; // Callback out-param; initialized so the post-call size check never sees an indeterminate value if the callback leaves it unwritten.
+	int gen_cache_size = 0; // Stays 0 (no cache copied into r_dst_cache) unless the callback reports generated cache data.
 	float *gen_uvs;
 	int *gen_vertices;
 	int *gen_indices;
@@ -1507,17 +1523,16 @@ Error ArrayMesh::lightmap_unwrap_cached(int *&r_cache_data, unsigned int &r_cach
 	int size_x;
 	int size_y;
 
-	bool ok = array_mesh_lightmap_unwrap_callback(p_texel_size, vertices.ptr(), normals.ptr(), vertices.size() / 3, indices.ptr(), indices.size(), &gen_uvs, &gen_vertices, &gen_vertex_count, &gen_indices, &gen_index_count, &size_x, &size_y, r_cache_data, r_cache_size, r_used_cache);
+	bool ok = array_mesh_lightmap_unwrap_callback(p_texel_size, vertices.ptr(), normals.ptr(), vertices.size() / 3, indices.ptr(), indices.size(), p_src_cache.ptr(), &use_cache, &gen_cache, &gen_cache_size, &gen_uvs, &gen_vertices, &gen_vertex_count, &gen_indices, &gen_index_count, &size_x, &size_y);
 
 	if (!ok) {
 		return ERR_CANT_CREATE;
 	}
 
-	//remove surfaces
 	clear_surfaces();
 
 	//create surfacetools for each surface..
-	Vector<Ref<SurfaceTool>> surfaces_tools;
+	LocalVector<Ref<SurfaceTool>> surfaces_tools;
 
 	for (int i = 0; i < lightmap_surfaces.size(); i++) {
 		Ref<SurfaceTool> st;
@@ -1528,11 +1543,12 @@ Error ArrayMesh::lightmap_unwrap_cached(int *&r_cache_data, unsigned int &r_cach
 	}
 
 	print_verbose("Mesh: Gen indices: " + itos(gen_index_count));
+
 	//go through all indices
 	for (int i = 0; i < gen_index_count; i += 3) {
-		ERR_FAIL_INDEX_V(gen_vertices[gen_indices[i + 0]], uv_indices.size(), ERR_BUG);
-		ERR_FAIL_INDEX_V(gen_vertices[gen_indices[i + 1]], uv_indices.size(), ERR_BUG);
-		ERR_FAIL_INDEX_V(gen_vertices[gen_indices[i + 2]], uv_indices.size(), ERR_BUG);
+		ERR_FAIL_INDEX_V(gen_vertices[gen_indices[i + 0]], (int)uv_indices.size(), ERR_BUG);
+		ERR_FAIL_INDEX_V(gen_vertices[gen_indices[i + 1]], (int)uv_indices.size(), ERR_BUG);
+		ERR_FAIL_INDEX_V(gen_vertices[gen_indices[i + 2]], (int)uv_indices.size(), ERR_BUG);
 
 		ERR_FAIL_COND_V(uv_indices[gen_vertices[gen_indices[i + 0]]].first != uv_indices[gen_vertices[gen_indices[i + 1]]].first || uv_indices[gen_vertices[gen_indices[i + 0]]].first != uv_indices[gen_vertices[gen_indices[i + 2]]].first, ERR_BUG);
 
@@ -1542,48 +1558,53 @@ Error ArrayMesh::lightmap_unwrap_cached(int *&r_cache_data, unsigned int &r_cach
 			SurfaceTool::Vertex v = lightmap_surfaces[surface].vertices[uv_indices[gen_vertices[gen_indices[i + j]]].second];
 
 			if (lightmap_surfaces[surface].format & ARRAY_FORMAT_COLOR) {
-				surfaces_tools.write[surface]->set_color(v.color);
+				surfaces_tools[surface]->set_color(v.color);
 			}
 			if (lightmap_surfaces[surface].format & ARRAY_FORMAT_TEX_UV) {
-				surfaces_tools.write[surface]->set_uv(v.uv);
+				surfaces_tools[surface]->set_uv(v.uv);
 			}
 			if (lightmap_surfaces[surface].format & ARRAY_FORMAT_NORMAL) {
-				surfaces_tools.write[surface]->set_normal(v.normal);
+				surfaces_tools[surface]->set_normal(v.normal);
 			}
 			if (lightmap_surfaces[surface].format & ARRAY_FORMAT_TANGENT) {
 				Plane t;
 				t.normal = v.tangent;
 				t.d = v.binormal.dot(v.normal.cross(v.tangent)) < 0 ? -1 : 1;
-				surfaces_tools.write[surface]->set_tangent(t);
+				surfaces_tools[surface]->set_tangent(t);
 			}
 			if (lightmap_surfaces[surface].format & ARRAY_FORMAT_BONES) {
-				surfaces_tools.write[surface]->set_bones(v.bones);
+				surfaces_tools[surface]->set_bones(v.bones);
 			}
 			if (lightmap_surfaces[surface].format & ARRAY_FORMAT_WEIGHTS) {
-				surfaces_tools.write[surface]->set_weights(v.weights);
+				surfaces_tools[surface]->set_weights(v.weights);
 			}
 
 			Vector2 uv2(gen_uvs[gen_indices[i + j] * 2 + 0], gen_uvs[gen_indices[i + j] * 2 + 1]);
-			surfaces_tools.write[surface]->set_uv2(uv2);
+			surfaces_tools[surface]->set_uv2(uv2);
 
-			surfaces_tools.write[surface]->add_vertex(v.vertex);
+			surfaces_tools[surface]->add_vertex(v.vertex);
 		}
 	}
 
 	//generate surfaces
-
-	for (int i = 0; i < surfaces_tools.size(); i++) {
-		surfaces_tools.write[i]->index();
-		surfaces_tools.write[i]->commit(Ref<ArrayMesh>((ArrayMesh *)this), lightmap_surfaces[i].format);
+	for (unsigned int i = 0; i < surfaces_tools.size(); i++) {
+		surfaces_tools[i]->index();
+		surfaces_tools[i]->commit(Ref<ArrayMesh>((ArrayMesh *)this), lightmap_surfaces[i].format);
 	}
 
 	set_lightmap_size_hint(Size2(size_x, size_y));
 
-	if (!r_used_cache) {
-		//free stuff
-		::free(gen_vertices);
-		::free(gen_indices);
-		::free(gen_uvs);
+	if (gen_cache_size > 0) {
+		r_dst_cache.resize(gen_cache_size);
+		memcpy(r_dst_cache.ptrw(), gen_cache, gen_cache_size);
+		memfree(gen_cache);
+	}
+
+	if (!use_cache) {
+		// Cache was not used, free the buffers
+		memfree(gen_vertices);
+		memfree(gen_indices);
+		memfree(gen_uvs);
 	}
 
 	return OK;
diff --git a/scene/resources/mesh.h b/scene/resources/mesh.h
index 9a462d5719..aa830d7b50 100644
--- a/scene/resources/mesh.h
+++ b/scene/resources/mesh.h
@@ -165,6 +165,9 @@ public:
 
 	Vector<Ref<Shape3D>> convex_decompose() const;
 
+	virtual int get_builtin_bind_pose_count() const;
+	virtual Transform get_builtin_bind_pose(int p_index) const;
+
 	Mesh();
 };
 
@@ -260,7 +263,7 @@ public:
 	void regen_normal_maps();
 
 	Error lightmap_unwrap(const Transform &p_base_transform = Transform(), float p_texel_size = 0.05);
-	Error lightmap_unwrap_cached(int *&r_cache_data, unsigned int &r_cache_size, bool &r_used_cache, const Transform &p_base_transform = Transform(), float p_texel_size = 0.05);
+	Error lightmap_unwrap_cached(const Transform &p_base_transform, float p_texel_size, const Vector<uint8_t> &p_src_cache, Vector<uint8_t> &r_dst_cache, bool p_generate_cache = true);
 
 	virtual void reload_from_file() override;
 
diff --git a/scene/resources/navigation_mesh.cpp b/scene/resources/navigation_mesh.cpp
index 8c12f59a00..0a25bb2ed1 100644
--- a/scene/resources/navigation_mesh.cpp
+++ b/scene/resources/navigation_mesh.cpp
@@ -92,6 +92,7 @@ uint32_t NavigationMesh::get_collision_mask() const {
 }
 
 void NavigationMesh::set_collision_mask_bit(int p_bit, bool p_value) {
+	ERR_FAIL_INDEX_MSG(p_bit, 32, "Collision mask bit must be between 0 and 31 inclusive.");
 	uint32_t mask = get_collision_mask();
 	if (p_value) {
 		mask |= 1 << p_bit;
@@ -102,6 +103,7 @@ void NavigationMesh::set_collision_mask_bit(int p_bit, bool p_value) {
 }
 
 bool NavigationMesh::get_collision_mask_bit(int p_bit) const {
+	ERR_FAIL_INDEX_V_MSG(p_bit, 32, false, "Collision mask bit must be between 0 and 31 inclusive.");
 	return get_collision_mask() & (1 << p_bit);
 }
 
diff --git a/scene/resources/particles_material.cpp b/scene/resources/particles_material.cpp
index 195ce070a7..59e699326d 100644
--- a/scene/resources/particles_material.cpp
+++ b/scene/resources/particles_material.cpp
@@ -289,7 +289,7 @@ void ParticlesMaterial::_update_shader() {
 	code += "}\n";
 	code += "\n";
 
-	code += "void compute() {\n";
+	code += "void start() {\n";
 	code += "	uint base_number = NUMBER;\n";
 	code += "	uint alt_seed = hash(base_number + uint(1) + RANDOM_SEED);\n";
 	code += "	float angle_rand = rand_from_seed(alt_seed);\n";
@@ -305,97 +305,94 @@ void ParticlesMaterial::_update_shader() {
 		code += "	ivec2 emission_tex_size = textureSize(emission_texture_points, 0);\n";
 		code += "	ivec2 emission_tex_ofs = ivec2(point % emission_tex_size.x, point / emission_tex_size.x);\n";
 	}
-	code += "	float tv = 0.0;\n";
-	code += "	if (RESTART) {\n";
-
 	if (tex_parameters[PARAM_ANGLE].is_valid()) {
-		code += "		float tex_angle = textureLod(angle_texture, vec2(0.0, 0.0), 0.0).r;\n";
+		code += "	float tex_angle = textureLod(angle_texture, vec2(0.0, 0.0), 0.0).r;\n";
 	} else {
-		code += "		float tex_angle = 0.0;\n";
+		code += "	float tex_angle = 0.0;\n";
 	}
 
 	if (tex_parameters[PARAM_ANIM_OFFSET].is_valid()) {
-		code += "		float tex_anim_offset = textureLod(anim_offset_texture, vec2(0.0, 0.0), 0.0).r;\n";
+		code += "	float tex_anim_offset = textureLod(anim_offset_texture, vec2(0.0, 0.0), 0.0).r;\n";
 	} else {
-		code += "		float tex_anim_offset = 0.0;\n";
+		code += "	float tex_anim_offset = 0.0;\n";
 	}
 
-	code += "		float spread_rad = spread * degree_to_rad;\n";
+	code += "	float spread_rad = spread * degree_to_rad;\n";
 
-	code += "		if (RESTART_VELOCITY) {\n";
+	code += "	if (RESTART_VELOCITY) {\n";
 
 	if (tex_parameters[PARAM_INITIAL_LINEAR_VELOCITY].is_valid()) {
-		code += "			float tex_linear_velocity = textureLod(linear_velocity_texture, vec2(0.0, 0.0), 0.0).r;\n";
+		code += "		float tex_linear_velocity = textureLod(linear_velocity_texture, vec2(0.0, 0.0), 0.0).r;\n";
 	} else {
-		code += "			float tex_linear_velocity = 0.0;\n";
+		code += "		float tex_linear_velocity = 0.0;\n";
 	}
 
 	if (particle_flags[PARTICLE_FLAG_DISABLE_Z]) {
-		code += "			float angle1_rad = rand_from_seed_m1_p1(alt_seed) * spread_rad;\n";
-		code += "			angle1_rad += direction.x != 0.0 ? atan(direction.y, direction.x) : sign(direction.y) * (pi / 2.0);\n";
-		code += "			vec3 rot = vec3(cos(angle1_rad), sin(angle1_rad), 0.0);\n";
-		code += "			VELOCITY = rot * initial_linear_velocity * mix(1.0, rand_from_seed(alt_seed), initial_linear_velocity_random);\n";
+		code += "		float angle1_rad = rand_from_seed_m1_p1(alt_seed) * spread_rad;\n";
+		code += "		angle1_rad += direction.x != 0.0 ? atan(direction.y, direction.x) : sign(direction.y) * (pi / 2.0);\n";
+		code += "		vec3 rot = vec3(cos(angle1_rad), sin(angle1_rad), 0.0);\n";
+		code += "		VELOCITY = rot * initial_linear_velocity * mix(1.0, rand_from_seed(alt_seed), initial_linear_velocity_random);\n";
 
 	} else {
 		//initiate velocity spread in 3D
-		code += "			float angle1_rad = rand_from_seed_m1_p1(alt_seed) * spread_rad;\n";
-		code += "			float angle2_rad = rand_from_seed_m1_p1(alt_seed) * spread_rad * (1.0 - flatness);\n";
-		code += "			vec3 direction_xz = vec3(sin(angle1_rad), 0.0, cos(angle1_rad));\n";
-		code += "			vec3 direction_yz = vec3(0.0, sin(angle2_rad), cos(angle2_rad));\n";
-		code += "			direction_yz.z = direction_yz.z / max(0.0001,sqrt(abs(direction_yz.z))); // better uniform distribution\n";
-		code += "			vec3 spread_direction = vec3(direction_xz.x * direction_yz.z, direction_yz.y, direction_xz.z * direction_yz.z);\n";
-		code += "			vec3 direction_nrm = normalize(direction);\n";
-		code += "			// rotate spread to direction\n";
-		code += "			vec3 binormal = cross(vec3(0.0, 1.0, 0.0), direction_nrm);\n";
-		code += "			if (length(binormal) < 0.0001) {\n";
-		code += "				// direction is parallel to Y. Choose Z as the binormal.\n";
-		code += "				binormal = vec3(0.0, 0.0, 1.0);\n";
-		code += "			}\n";
-		code += "			binormal = normalize(binormal);\n";
-		code += "			vec3 normal = cross(binormal, direction_nrm);\n";
-		code += "			spread_direction = binormal * spread_direction.x + normal * spread_direction.y + direction_nrm * spread_direction.z;\n";
-		code += "			VELOCITY = spread_direction * initial_linear_velocity * mix(1.0, rand_from_seed(alt_seed), initial_linear_velocity_random);\n";
+		code += "		float angle1_rad = rand_from_seed_m1_p1(alt_seed) * spread_rad;\n";
+		code += "		float angle2_rad = rand_from_seed_m1_p1(alt_seed) * spread_rad * (1.0 - flatness);\n";
+		code += "		vec3 direction_xz = vec3(sin(angle1_rad), 0.0, cos(angle1_rad));\n";
+		code += "		vec3 direction_yz = vec3(0.0, sin(angle2_rad), cos(angle2_rad));\n";
+		code += "		direction_yz.z = direction_yz.z / max(0.0001,sqrt(abs(direction_yz.z))); // better uniform distribution\n";
+		code += "		vec3 spread_direction = vec3(direction_xz.x * direction_yz.z, direction_yz.y, direction_xz.z * direction_yz.z);\n";
+		code += "		vec3 direction_nrm = normalize(direction);\n";
+		code += "		// rotate spread to direction\n";
+		code += "		vec3 binormal = cross(vec3(0.0, 1.0, 0.0), direction_nrm);\n";
+		code += "		if (length(binormal) < 0.0001) {\n";
+		code += "			// direction is parallel to Y. Choose Z as the binormal.\n";
+		code += "			binormal = vec3(0.0, 0.0, 1.0);\n";
+		code += "		}\n";
+		code += "		binormal = normalize(binormal);\n";
+		code += "		vec3 normal = cross(binormal, direction_nrm);\n";
+		code += "		spread_direction = binormal * spread_direction.x + normal * spread_direction.y + direction_nrm * spread_direction.z;\n";
+		code += "		VELOCITY = spread_direction * initial_linear_velocity * mix(1.0, rand_from_seed(alt_seed), initial_linear_velocity_random);\n";
 	}
-	code += "		}\n";
+	code += "	}\n";
 
-	code += "		float base_angle = (initial_angle + tex_angle) * mix(1.0, angle_rand, initial_angle_random);\n";
-	code += "		CUSTOM.x = base_angle * degree_to_rad;\n"; // angle
-	code += "		CUSTOM.y = 0.0;\n"; // phase
-	code += "		CUSTOM.w = (1.0 - lifetime_randomness * rand_from_seed(alt_seed));\n";
-	code += "		CUSTOM.z = (anim_offset + tex_anim_offset) * mix(1.0, anim_offset_rand, anim_offset_random);\n"; // animation offset (0-1)
+	code += "	float base_angle = (initial_angle + tex_angle) * mix(1.0, angle_rand, initial_angle_random);\n";
+	code += "	CUSTOM.x = base_angle * degree_to_rad;\n"; // angle
+	code += "	CUSTOM.y = 0.0;\n"; // phase
+	code += "	CUSTOM.w = (1.0 - lifetime_randomness * rand_from_seed(alt_seed));\n";
+	code += "	CUSTOM.z = (anim_offset + tex_anim_offset) * mix(1.0, anim_offset_rand, anim_offset_random);\n"; // animation offset (0-1)
 
-	code += "		if (RESTART_POSITION) {\n";
+	code += "	if (RESTART_POSITION) {\n";
 
 	switch (emission_shape) {
 		case EMISSION_SHAPE_POINT: {
 			//do none, identity (will later be multiplied by emission transform)
-			code += "			TRANSFORM = mat4(vec4(1,0,0,0),vec4(0,1,0,0),vec4(0,0,1,0),vec4(0,0,0,1));\n";
+			code += "		TRANSFORM = mat4(vec4(1,0,0,0),vec4(0,1,0,0),vec4(0,0,1,0),vec4(0,0,0,1));\n";
 		} break;
 		case EMISSION_SHAPE_SPHERE: {
-			code += "			float s = rand_from_seed(alt_seed) * 2.0 - 1.0;\n";
-			code += "			float t = rand_from_seed(alt_seed) * 2.0 * pi;\n";
-			code += "			float radius = emission_sphere_radius * sqrt(1.0 - s * s);\n";
-			code += "			TRANSFORM[3].xyz = vec3(radius * cos(t), radius * sin(t), emission_sphere_radius * s);\n";
+			code += "		float s = rand_from_seed(alt_seed) * 2.0 - 1.0;\n";
+			code += "		float t = rand_from_seed(alt_seed) * 2.0 * pi;\n";
+			code += "		float radius = emission_sphere_radius * sqrt(1.0 - s * s);\n";
+			code += "		TRANSFORM[3].xyz = vec3(radius * cos(t), radius * sin(t), emission_sphere_radius * s);\n";
 		} break;
 		case EMISSION_SHAPE_BOX: {
-			code += "			TRANSFORM[3].xyz = vec3(rand_from_seed(alt_seed) * 2.0 - 1.0, rand_from_seed(alt_seed) * 2.0 - 1.0, rand_from_seed(alt_seed) * 2.0 - 1.0) * emission_box_extents;\n";
+			code += "		TRANSFORM[3].xyz = vec3(rand_from_seed(alt_seed) * 2.0 - 1.0, rand_from_seed(alt_seed) * 2.0 - 1.0, rand_from_seed(alt_seed) * 2.0 - 1.0) * emission_box_extents;\n";
 		} break;
 		case EMISSION_SHAPE_POINTS:
 		case EMISSION_SHAPE_DIRECTED_POINTS: {
-			code += "			TRANSFORM[3].xyz = texelFetch(emission_texture_points, emission_tex_ofs, 0).xyz;\n";
+			code += "		TRANSFORM[3].xyz = texelFetch(emission_texture_points, emission_tex_ofs, 0).xyz;\n";
 
 			if (emission_shape == EMISSION_SHAPE_DIRECTED_POINTS) {
 				if (particle_flags[PARTICLE_FLAG_DISABLE_Z]) {
-					code += "			mat2 rotm;";
-					code += "			rotm[0] = texelFetch(emission_texture_normal, emission_tex_ofs, 0).xy;\n";
-					code += "			rotm[1] = rotm[0].yx * vec2(1.0, -1.0);\n";
-					code += "			if (RESTART_VELOCITY) VELOCITY.xy = rotm * VELOCITY.xy;\n";
+					code += "		mat2 rotm;\n";
+					code += "		rotm[0] = texelFetch(emission_texture_normal, emission_tex_ofs, 0).xy;\n";
+					code += "		rotm[1] = rotm[0].yx * vec2(1.0, -1.0);\n";
+					code += "		if (RESTART_VELOCITY) VELOCITY.xy = rotm * VELOCITY.xy;\n";
 				} else {
-					code += "			vec3 normal = texelFetch(emission_texture_normal, emission_tex_ofs, 0).xyz;\n";
-					code += "			vec3 v0 = abs(normal.z) < 0.999 ? vec3(0.0, 0.0, 1.0) : vec3(0.0, 1.0, 0.0);\n";
-					code += "			vec3 tangent = normalize(cross(v0, normal));\n";
-					code += "			vec3 bitangent = normalize(cross(tangent, normal));\n";
-					code += "			if (RESTART_VELOCITY) VELOCITY = mat3(tangent, bitangent, normal) * VELOCITY;\n";
+					code += "		vec3 normal = texelFetch(emission_texture_normal, emission_tex_ofs, 0).xyz;\n";
+					code += "		vec3 v0 = abs(normal.z) < 0.999 ? vec3(0.0, 0.0, 1.0) : vec3(0.0, 1.0, 0.0);\n";
+					code += "		vec3 tangent = normalize(cross(v0, normal));\n";
+					code += "		vec3 bitangent = normalize(cross(tangent, normal));\n";
+					code += "		if (RESTART_VELOCITY) VELOCITY = mat3(tangent, bitangent, normal) * VELOCITY;\n";
 				}
 			}
 		} break;
@@ -404,134 +401,144 @@ void ParticlesMaterial::_update_shader() {
 		}
 	}
 
-	code += "			if (RESTART_VELOCITY) VELOCITY = (EMISSION_TRANSFORM * vec4(VELOCITY, 0.0)).xyz;\n";
-	code += "			TRANSFORM = EMISSION_TRANSFORM * TRANSFORM;\n";
+	code += "	if (RESTART_VELOCITY) VELOCITY = (EMISSION_TRANSFORM * vec4(VELOCITY, 0.0)).xyz;\n";
+	code += "	TRANSFORM = EMISSION_TRANSFORM * TRANSFORM;\n";
 	if (particle_flags[PARTICLE_FLAG_DISABLE_Z]) {
-		code += "			VELOCITY.z = 0.0;\n";
-		code += "			TRANSFORM[3].z = 0.0;\n";
+		code += "	VELOCITY.z = 0.0;\n";
+		code += "	TRANSFORM[3].z = 0.0;\n";
 	}
-	code += "		}\n";
+	code += "	}\n";
+	code += "}\n\n";
 
-	code += "	} else {\n";
+	code += "void process() {\n";
+	code += "	uint base_number = NUMBER;\n";
+	code += "	uint alt_seed = hash(base_number + uint(1) + RANDOM_SEED);\n";
+	code += "	float angle_rand = rand_from_seed(alt_seed);\n";
+	code += "	float scale_rand = rand_from_seed(alt_seed);\n";
+	code += "	float hue_rot_rand = rand_from_seed(alt_seed);\n";
+	code += "	float anim_offset_rand = rand_from_seed(alt_seed);\n";
+	code += "	float pi = 3.14159;\n";
+	code += "	float degree_to_rad = pi / 180.0;\n";
+	code += "\n";
 
-	code += "		CUSTOM.y += DELTA / LIFETIME;\n";
-	code += "		tv = CUSTOM.y / CUSTOM.w;\n";
+	code += "	CUSTOM.y += DELTA / LIFETIME;\n";
+	code += "	float tv = CUSTOM.y / CUSTOM.w;\n";
 	if (tex_parameters[PARAM_INITIAL_LINEAR_VELOCITY].is_valid()) {
-		code += "		float tex_linear_velocity = textureLod(linear_velocity_texture, vec2(tv, 0.0), 0.0).r;\n";
+		code += "	float tex_linear_velocity = textureLod(linear_velocity_texture, vec2(tv, 0.0), 0.0).r;\n";
 	} else {
-		code += "		float tex_linear_velocity = 0.0;\n";
+		code += "	float tex_linear_velocity = 0.0;\n";
 	}
 
 	if (particle_flags[PARTICLE_FLAG_DISABLE_Z]) {
 		if (tex_parameters[PARAM_ORBIT_VELOCITY].is_valid()) {
-			code += "		float tex_orbit_velocity = textureLod(orbit_velocity_texture, vec2(tv, 0.0), 0.0).r;\n";
+			code += "	float tex_orbit_velocity = textureLod(orbit_velocity_texture, vec2(tv, 0.0), 0.0).r;\n";
 		} else {
-			code += "		float tex_orbit_velocity = 0.0;\n";
+			code += "	float tex_orbit_velocity = 0.0;\n";
 		}
 	}
 
 	if (tex_parameters[PARAM_ANGULAR_VELOCITY].is_valid()) {
-		code += "		float tex_angular_velocity = textureLod(angular_velocity_texture, vec2(tv, 0.0), 0.0).r;\n";
+		code += "	float tex_angular_velocity = textureLod(angular_velocity_texture, vec2(tv, 0.0), 0.0).r;\n";
 	} else {
-		code += "		float tex_angular_velocity = 0.0;\n";
+		code += "	float tex_angular_velocity = 0.0;\n";
 	}
 
 	if (tex_parameters[PARAM_LINEAR_ACCEL].is_valid()) {
-		code += "		float tex_linear_accel = textureLod(linear_accel_texture, vec2(tv, 0.0), 0.0).r;\n";
+		code += "	float tex_linear_accel = textureLod(linear_accel_texture, vec2(tv, 0.0), 0.0).r;\n";
 	} else {
-		code += "		float tex_linear_accel = 0.0;\n";
+		code += "	float tex_linear_accel = 0.0;\n";
 	}
 
 	if (tex_parameters[PARAM_RADIAL_ACCEL].is_valid()) {
-		code += "		float tex_radial_accel = textureLod(radial_accel_texture, vec2(tv, 0.0), 0.0).r;\n";
+		code += "	float tex_radial_accel = textureLod(radial_accel_texture, vec2(tv, 0.0), 0.0).r;\n";
 	} else {
-		code += "		float tex_radial_accel = 0.0;\n";
+		code += "	float tex_radial_accel = 0.0;\n";
 	}
 
 	if (tex_parameters[PARAM_TANGENTIAL_ACCEL].is_valid()) {
-		code += "		float tex_tangent_accel = textureLod(tangent_accel_texture, vec2(tv, 0.0), 0.0).r;\n";
+		code += "	float tex_tangent_accel = textureLod(tangent_accel_texture, vec2(tv, 0.0), 0.0).r;\n";
 	} else {
-		code += "		float tex_tangent_accel = 0.0;\n";
+		code += "	float tex_tangent_accel = 0.0;\n";
 	}
 
 	if (tex_parameters[PARAM_DAMPING].is_valid()) {
-		code += "		float tex_damping = textureLod(damping_texture, vec2(tv, 0.0), 0.0).r;\n";
+		code += "	float tex_damping = textureLod(damping_texture, vec2(tv, 0.0), 0.0).r;\n";
 	} else {
-		code += "		float tex_damping = 0.0;\n";
+		code += "	float tex_damping = 0.0;\n";
 	}
 
 	if (tex_parameters[PARAM_ANGLE].is_valid()) {
-		code += "		float tex_angle = textureLod(angle_texture, vec2(tv, 0.0), 0.0).r;\n";
+		code += "	float tex_angle = textureLod(angle_texture, vec2(tv, 0.0), 0.0).r;\n";
 	} else {
-		code += "		float tex_angle = 0.0;\n";
+		code += "	float tex_angle = 0.0;\n";
 	}
 
 	if (tex_parameters[PARAM_ANIM_SPEED].is_valid()) {
-		code += "		float tex_anim_speed = textureLod(anim_speed_texture, vec2(tv, 0.0), 0.0).r;\n";
+		code += "	float tex_anim_speed = textureLod(anim_speed_texture, vec2(tv, 0.0), 0.0).r;\n";
 	} else {
-		code += "		float tex_anim_speed = 0.0;\n";
+		code += "	float tex_anim_speed = 0.0;\n";
 	}
 
 	if (tex_parameters[PARAM_ANIM_OFFSET].is_valid()) {
-		code += "		float tex_anim_offset = textureLod(anim_offset_texture, vec2(tv, 0.0), 0.0).r;\n";
+		code += "	float tex_anim_offset = textureLod(anim_offset_texture, vec2(tv, 0.0), 0.0).r;\n";
 	} else {
-		code += "		float tex_anim_offset = 0.0;\n";
+		code += "	float tex_anim_offset = 0.0;\n";
 	}
 
-	code += "		vec3 force = gravity;\n";
-	code += "		vec3 pos = TRANSFORM[3].xyz;\n";
+	code += "	vec3 force = gravity;\n";
+	code += "	vec3 pos = TRANSFORM[3].xyz;\n";
 	if (particle_flags[PARTICLE_FLAG_DISABLE_Z]) {
-		code += "		pos.z = 0.0;\n";
-	}
-	code += "		// apply linear acceleration\n";
-	code += "		force += length(VELOCITY) > 0.0 ? normalize(VELOCITY) * (linear_accel + tex_linear_accel) * mix(1.0, rand_from_seed(alt_seed), linear_accel_random) : vec3(0.0);\n";
-	code += "		// apply radial acceleration\n";
-	code += "		vec3 org = EMISSION_TRANSFORM[3].xyz;\n";
-	code += "		vec3 diff = pos - org;\n";
-	code += "		force += length(diff) > 0.0 ? normalize(diff) * (radial_accel + tex_radial_accel) * mix(1.0, rand_from_seed(alt_seed), radial_accel_random) : vec3(0.0);\n";
-	code += "		// apply tangential acceleration;\n";
+		code += "	pos.z = 0.0;\n";
+	}
+	code += "	// apply linear acceleration\n";
+	code += "	force += length(VELOCITY) > 0.0 ? normalize(VELOCITY) * (linear_accel + tex_linear_accel) * mix(1.0, rand_from_seed(alt_seed), linear_accel_random) : vec3(0.0);\n";
+	code += "	// apply radial acceleration\n";
+	code += "	vec3 org = EMISSION_TRANSFORM[3].xyz;\n";
+	code += "	vec3 diff = pos - org;\n";
+	code += "	force += length(diff) > 0.0 ? normalize(diff) * (radial_accel + tex_radial_accel) * mix(1.0, rand_from_seed(alt_seed), radial_accel_random) : vec3(0.0);\n";
+	code += "	// apply tangential acceleration;\n";
 	if (particle_flags[PARTICLE_FLAG_DISABLE_Z]) {
-		code += "		force += length(diff.yx) > 0.0 ? vec3(normalize(diff.yx * vec2(-1.0, 1.0)), 0.0) * ((tangent_accel + tex_tangent_accel) * mix(1.0, rand_from_seed(alt_seed), tangent_accel_random)) : vec3(0.0);\n";
+		code += "	force += length(diff.yx) > 0.0 ? vec3(normalize(diff.yx * vec2(-1.0, 1.0)), 0.0) * ((tangent_accel + tex_tangent_accel) * mix(1.0, rand_from_seed(alt_seed), tangent_accel_random)) : vec3(0.0);\n";
 
 	} else {
-		code += "		vec3 crossDiff = cross(normalize(diff), normalize(gravity));\n";
-		code += "		force += length(crossDiff) > 0.0 ? normalize(crossDiff) * ((tangent_accel + tex_tangent_accel) * mix(1.0, rand_from_seed(alt_seed), tangent_accel_random)) : vec3(0.0);\n";
+		code += "	vec3 crossDiff = cross(normalize(diff), normalize(gravity));\n";
+		code += "	force += length(crossDiff) > 0.0 ? normalize(crossDiff) * ((tangent_accel + tex_tangent_accel) * mix(1.0, rand_from_seed(alt_seed), tangent_accel_random)) : vec3(0.0);\n";
 	}
 	if (attractor_interaction_enabled) {
-		code += "		force += ATTRACTOR_FORCE;\n\n";
+		code += "	force += ATTRACTOR_FORCE;\n\n";
 	}
 
-	code += "		// apply attractor forces\n";
-	code += "		VELOCITY += force * DELTA;\n";
-	code += "		// orbit velocity\n";
+	code += "	// apply attractor forces\n";
+	code += "	VELOCITY += force * DELTA;\n";
+	code += "	// orbit velocity\n";
 	if (particle_flags[PARTICLE_FLAG_DISABLE_Z]) {
-		code += "		float orbit_amount = (orbit_velocity + tex_orbit_velocity) * mix(1.0, rand_from_seed(alt_seed), orbit_velocity_random);\n";
-		code += "		if (orbit_amount != 0.0) {\n";
-		code += "		     float ang = orbit_amount * DELTA * pi * 2.0;\n";
-		code += "		     mat2 rot = mat2(vec2(cos(ang), -sin(ang)), vec2(sin(ang), cos(ang)));\n";
-		code += "		     TRANSFORM[3].xy -= diff.xy;\n";
-		code += "		     TRANSFORM[3].xy += rot * diff.xy;\n";
-		code += "		}\n";
+		code += "	float orbit_amount = (orbit_velocity + tex_orbit_velocity) * mix(1.0, rand_from_seed(alt_seed), orbit_velocity_random);\n";
+		code += "	if (orbit_amount != 0.0) {\n";
+		code += "	     float ang = orbit_amount * DELTA * pi * 2.0;\n";
+		code += "	     mat2 rot = mat2(vec2(cos(ang), -sin(ang)), vec2(sin(ang), cos(ang)));\n";
+		code += "	     TRANSFORM[3].xy -= diff.xy;\n";
+		code += "	     TRANSFORM[3].xy += rot * diff.xy;\n";
+		code += "	}\n";
 	}
 
 	if (tex_parameters[PARAM_INITIAL_LINEAR_VELOCITY].is_valid()) {
-		code += "		VELOCITY = normalize(VELOCITY) * tex_linear_velocity;\n";
-	}
-	code += "		if (damping + tex_damping > 0.0) {\n";
-	code += "			float v = length(VELOCITY);\n";
-	code += "			float damp = (damping + tex_damping) * mix(1.0, rand_from_seed(alt_seed), damping_random);\n";
-	code += "			v -= damp * DELTA;\n";
-	code += "			if (v < 0.0) {\n";
-	code += "				VELOCITY = vec3(0.0);\n";
-	code += "			} else {\n";
-	code += "				VELOCITY = normalize(VELOCITY) * v;\n";
-	code += "			}\n";
+		code += "	VELOCITY = normalize(VELOCITY) * tex_linear_velocity;\n";
+	}
+	code += "	if (damping + tex_damping > 0.0) {\n";
+	code += "		float v = length(VELOCITY);\n";
+	code += "		float damp = (damping + tex_damping) * mix(1.0, rand_from_seed(alt_seed), damping_random);\n";
+	code += "		v -= damp * DELTA;\n";
+	code += "		if (v < 0.0) {\n";
+	code += "			VELOCITY = vec3(0.0);\n";
+	code += "		} else {\n";
+	code += "			VELOCITY = normalize(VELOCITY) * v;\n";
 	code += "		}\n";
-	code += "		float base_angle = (initial_angle + tex_angle) * mix(1.0, angle_rand, initial_angle_random);\n";
-	code += "		base_angle += CUSTOM.y * LIFETIME * (angular_velocity + tex_angular_velocity) * mix(1.0, rand_from_seed(alt_seed) * 2.0 - 1.0, angular_velocity_random);\n";
-	code += "		CUSTOM.x = base_angle * degree_to_rad;\n"; // angle
-	code += "		CUSTOM.z = (anim_offset + tex_anim_offset) * mix(1.0, anim_offset_rand, anim_offset_random) + CUSTOM.y * (anim_speed + tex_anim_speed) * mix(1.0, rand_from_seed(alt_seed), anim_speed_random);\n"; // angle
 	code += "	}\n";
+	code += "	float base_angle = (initial_angle + tex_angle) * mix(1.0, angle_rand, initial_angle_random);\n";
+	code += "	base_angle += CUSTOM.y * LIFETIME * (angular_velocity + tex_angular_velocity) * mix(1.0, rand_from_seed(alt_seed) * 2.0 - 1.0, angular_velocity_random);\n";
+	code += "	CUSTOM.x = base_angle * degree_to_rad;\n"; // angle
+	code += "	CUSTOM.z = (anim_offset + tex_anim_offset) * mix(1.0, anim_offset_rand, anim_offset_random) + CUSTOM.y * (anim_speed + tex_anim_speed) * mix(1.0, rand_from_seed(alt_seed), anim_speed_random);\n"; // animation offset
+
 	// apply color
 	// apply hue rotation
 	if (tex_parameters[PARAM_SCALE].is_valid()) {
@@ -608,7 +615,7 @@ void ParticlesMaterial::_update_shader() {
 		}
 		// turn particle by rotation in Y
 		if (particle_flags[PARTICLE_FLAG_ROTATE_Y]) {
-			code += "	TRANSFORM = TRANSFORM * mat4(vec4(cos(CUSTOM.x), 0.0, -sin(CUSTOM.x), 0.0), vec4(0.0, 1.0, 0.0, 0.0), vec4(sin(CUSTOM.x), 0.0, cos(CUSTOM.x), 0.0), vec4(0.0, 0.0, 0.0, 1.0));\n";
+			code += "	TRANSFORM = mat4(vec4(cos(CUSTOM.x), 0.0, -sin(CUSTOM.x), 0.0), vec4(0.0, 1.0, 0.0, 0.0), vec4(sin(CUSTOM.x), 0.0, cos(CUSTOM.x), 0.0), vec4(0.0, 0.0, 0.0, 1.0));\n";
 		}
 	}
 	//scale by scale
@@ -659,7 +666,7 @@ void ParticlesMaterial::_update_shader() {
 		code += "	}";
 	}
 
-	code += "	if (CUSTOM.y > CUSTOM.w) {";
+	code += "	if (CUSTOM.y > CUSTOM.w) {\n";
 	code += "		ACTIVE = false;\n";
 	code += "	}\n";
 	code += "}\n";
diff --git a/scene/resources/primitive_meshes.cpp b/scene/resources/primitive_meshes.cpp
index 1be511e8f1..c3d84aeda2 100644
--- a/scene/resources/primitive_meshes.cpp
+++ b/scene/resources/primitive_meshes.cpp
@@ -1538,3 +1538,552 @@ void PointMesh::_create_mesh_array(Array &p_arr) const {
 PointMesh::PointMesh() {
 	primitive_type = PRIMITIVE_POINTS;
 }
+// TUBE TRAIL
+
+void TubeTrailMesh::set_radius(const float p_radius) { // Sets the base tube radius; the value is stored without validation.
+	radius = p_radius;
+	_request_update(); // Defined outside this view; presumably flags the mesh for regeneration.
+}
+float TubeTrailMesh::get_radius() const { // Returns the stored base radius.
+	return radius;
+}
+
+void TubeTrailMesh::set_radial_steps(const int p_radial_steps) { // Sets the number of segments around the tube's circumference.
+	ERR_FAIL_COND(p_radial_steps < 3 || p_radial_steps > 128); // Values outside [3, 128] are rejected and leave the member unchanged.
+	radial_steps = p_radial_steps;
+	_request_update(); // Defined outside this view; presumably flags the mesh for regeneration.
+}
+int TubeTrailMesh::get_radial_steps() const { // Returns the stored radial step count.
+	return radial_steps;
+}
+
+void TubeTrailMesh::set_sections(const int p_sections) { // Sets the number of sections; the mesh exposes sections + 1 bind poses.
+	ERR_FAIL_COND(p_sections < 2 || p_sections > 128); // Values outside [2, 128] are rejected and leave the member unchanged.
+	sections = p_sections;
+	_request_update(); // Defined outside this view; presumably flags the mesh for regeneration.
+}
+int TubeTrailMesh::get_sections() const { // Returns the stored section count.
+	return sections;
+}
+
+void TubeTrailMesh::set_section_length(float p_section_length) { // Sets the length of one section along Y; the value is not range-checked.
+	section_length = p_section_length;
+	_request_update(); // Defined outside this view; presumably flags the mesh for regeneration.
+}
+float TubeTrailMesh::get_section_length() const { // Returns the stored section length.
+	return section_length;
+}
+
+void TubeTrailMesh::set_section_rings(const int p_section_rings) { // Sets how many vertex rings are generated per section.
+	ERR_FAIL_COND(p_section_rings < 1 || p_section_rings > 1024); // NOTE(review): the editor range hint for this property stops at 128 — confirm which limit is intended.
+	section_rings = p_section_rings;
+	_request_update(); // Defined outside this view; presumably flags the mesh for regeneration.
+}
+int TubeTrailMesh::get_section_rings() const { // Returns the stored ring count per section.
+	return section_rings;
+}
+
+void TubeTrailMesh::set_curve(const Ref<Curve> &p_curve) { // Replaces the curve used to scale the radius along the tube.
+	if (curve == p_curve) {
+		return; // Same curve: keep the existing signal connection and skip the update request.
+	}
+	if (curve.is_valid()) {
+		curve->disconnect("changed", callable_mp(this, &TubeTrailMesh::_curve_changed)); // Stop listening to the curve being replaced.
+	}
+	curve = p_curve;
+	if (curve.is_valid()) {
+		curve->connect("changed", callable_mp(this, &TubeTrailMesh::_curve_changed)); // Listen for the new curve's "changed" signal.
+	}
+	_request_update(); // Requested even when the new curve is null, since the shape no longer uses the old one.
+}
+Ref<Curve> TubeTrailMesh::get_curve() const { // Returns the current curve reference, which may be invalid (unset).
+	return curve;
+}
+
+void TubeTrailMesh::_curve_changed() { // Handler connected to the curve's "changed" signal by set_curve().
+	_request_update(); // Defined outside this view; presumably flags the mesh for regeneration.
+}
+int TubeTrailMesh::get_builtin_bind_pose_count() const { // One bind pose per section boundary, both ends included.
+	return sections + 1;
+}
+
+Transform TubeTrailMesh::get_builtin_bind_pose(int p_index) const {
+	// Section boundaries sit on the Y axis; boundary 0 lies at +half the total length.
+	const float total_length = section_length * sections;
+
+	// A bind pose is an inverse transform, so the Y offset is stored negated.
+	Transform bind_pose;
+	bind_pose.origin.y = -(total_length / 2.0 - section_length * float(p_index));
+	return bind_pose;
+}
+
+void TubeTrailMesh::_create_mesh_array(Array &p_arr) const {
+	// Builds a tube along the Y axis, weighted to sections + 1 bones, plus an end cap wherever the end radius is non-zero.
+	PackedVector3Array points;
+	PackedVector3Array normals;
+	PackedFloat32Array tangents;
+	PackedVector2Array uvs;
+	PackedInt32Array bone_indices;
+	PackedFloat32Array bone_weights;
+	PackedInt32Array indices;
+
+	int point = 0; // Number of vertices emitted so far.
+
+#define ADD_TANGENT(m_x, m_y, m_z, m_d) \
+	tangents.push_back(m_x);            \
+	tangents.push_back(m_y);            \
+	tangents.push_back(m_z);            \
+	tangents.push_back(m_d);
+
+	int thisrow = 0; // Index of the first vertex of the ring being generated.
+	int prevrow = 0; // Index of the first vertex of the previous ring.
+	int total_rings = section_rings * sections;
+	float depth = section_length * sections; // Total tube length along Y.
+
+	for (int j = 0; j <= total_rings; j++) {
+		float v = j;
+		v /= total_rings; // Normalized position along the tube: 0 on the first ring, 1 on the last.
+
+		float y = depth * v;
+		y = (depth * 0.5) - y; // Rings run from +depth / 2 down to -depth / 2.
+
+		int bone = j / section_rings;
+		float blend = 1.0 - float(j % section_rings) / float(section_rings); // Weight of `bone`; the remainder goes to the next bone.
+		// The ring radius depends only on v, so the curve is sampled once per ring instead of once per vertex.
+		float r = radius;
+		if (curve.is_valid() && curve->get_point_count() > 0) {
+			r *= curve->interpolate_baked(v);
+		}
+		for (int i = 0; i <= radial_steps; i++) {
+			float u = i;
+			u /= radial_steps; // Fraction of a full turn; the first and last vertex of a ring coincide.
+
+			float x = sin(u * Math_TAU);
+			float z = cos(u * Math_TAU);
+
+			Vector3 p = Vector3(x * r, y, z * r);
+			points.push_back(p);
+			normals.push_back(Vector3(x, 0, z)); // Radial unit vector; the slope introduced by the curve is ignored.
+			ADD_TANGENT(z, 0.0, -x, 1.0)
+			uvs.push_back(Vector2(u, v * 0.5)); // The side uses the V range [0, 0.5]; the caps use [0.5, 1].
+			point++;
+			{
+				bone_indices.push_back(bone);
+				bone_indices.push_back(MIN(sections, bone + 1)); // Clamped so the last ring never references a bone past `sections`.
+				bone_indices.push_back(0);
+				bone_indices.push_back(0);
+
+				bone_weights.push_back(blend);
+				bone_weights.push_back(1.0 - blend);
+				bone_weights.push_back(0);
+				bone_weights.push_back(0);
+			}
+
+			if (i > 0 && j > 0) { // Two triangles joining this vertex and its predecessor to the previous ring.
+				indices.push_back(prevrow + i - 1);
+				indices.push_back(prevrow + i);
+				indices.push_back(thisrow + i - 1);
+
+				indices.push_back(prevrow + i);
+				indices.push_back(thisrow + i);
+				indices.push_back(thisrow + i - 1);
+			}
+		}
+
+		prevrow = thisrow;
+		thisrow = point;
+	}
+
+	// Top cap at +depth / 2; skipped when the curve scales the radius there to (nearly) zero.
+	float scale_pos = 1.0;
+	if (curve.is_valid() && curve->get_point_count() > 0) {
+		scale_pos = curve->interpolate_baked(0);
+	}
+
+	if (scale_pos > CMP_EPSILON) {
+		float y = depth * 0.5;
+
+		thisrow = point; // Index of the cap's center vertex.
+		points.push_back(Vector3(0.0, y, 0));
+		normals.push_back(Vector3(0.0, 1.0, 0.0));
+		ADD_TANGENT(1.0, 0.0, 0.0, 1.0)
+		uvs.push_back(Vector2(0.25, 0.75));
+		point++;
+
+		bone_indices.push_back(0); // The whole top cap follows bone 0.
+		bone_indices.push_back(0);
+		bone_indices.push_back(0);
+		bone_indices.push_back(0);
+
+		bone_weights.push_back(1.0);
+		bone_weights.push_back(0);
+		bone_weights.push_back(0);
+		bone_weights.push_back(0);
+
+		float rm = radius * scale_pos;
+
+		for (int i = 0; i <= radial_steps; i++) {
+			float r = i;
+			r /= radial_steps; // Fraction of a full turn (not a radius, despite the name).
+
+			float x = sin(r * Math_TAU);
+			float z = cos(r * Math_TAU);
+
+			float u = ((x + 1.0) * 0.25); // Cap UVs fill the square [0, 0.5] x [0.5, 1].
+			float v = 0.5 + ((z + 1.0) * 0.25);
+
+			Vector3 p = Vector3(x * rm, y, z * rm);
+			points.push_back(p);
+			normals.push_back(Vector3(0.0, 1.0, 0.0));
+			ADD_TANGENT(1.0, 0.0, 0.0, 1.0)
+			uvs.push_back(Vector2(u, v));
+			point++;
+
+			bone_indices.push_back(0);
+			bone_indices.push_back(0);
+			bone_indices.push_back(0);
+			bone_indices.push_back(0);
+
+			bone_weights.push_back(1.0);
+			bone_weights.push_back(0);
+			bone_weights.push_back(0);
+			bone_weights.push_back(0);
+
+			if (i > 0) { // Fan triangle: center, current rim vertex, previous rim vertex.
+				indices.push_back(thisrow);
+				indices.push_back(point - 1);
+				indices.push_back(point - 2);
+			};
+		};
+	};
+
+	float scale_neg = 1.0;
+	if (curve.is_valid() && curve->get_point_count() > 0) {
+		scale_neg = curve->interpolate_baked(1.0);
+	}
+
+	// Bottom cap at -depth / 2; skipped when the curve scales the radius there to (nearly) zero.
+	if (scale_neg > CMP_EPSILON) {
+		float y = depth * -0.5;
+
+		thisrow = point; // Index of the cap's center vertex.
+		points.push_back(Vector3(0.0, y, 0.0));
+		normals.push_back(Vector3(0.0, -1.0, 0.0));
+		ADD_TANGENT(1.0, 0.0, 0.0, 1.0)
+		uvs.push_back(Vector2(0.75, 0.75));
+		point++;
+
+		bone_indices.push_back(sections); // The whole bottom cap follows the last bone.
+		bone_indices.push_back(0);
+		bone_indices.push_back(0);
+		bone_indices.push_back(0);
+
+		bone_weights.push_back(1.0);
+		bone_weights.push_back(0);
+		bone_weights.push_back(0);
+		bone_weights.push_back(0);
+
+		float rm = radius * scale_neg;
+
+		for (int i = 0; i <= radial_steps; i++) {
+			float r = i;
+			r /= radial_steps; // Fraction of a full turn (not a radius, despite the name).
+
+			float x = sin(r * Math_TAU);
+			float z = cos(r * Math_TAU);
+
+			float u = 0.5 + ((x + 1.0) * 0.25); // Cap UVs fill the square [0.5, 1] x [0.5, 1].
+			float v = 1.0 - ((z + 1.0) * 0.25);
+
+			Vector3 p = Vector3(x * rm, y, z * rm);
+			points.push_back(p);
+			normals.push_back(Vector3(0.0, -1.0, 0.0));
+			ADD_TANGENT(1.0, 0.0, 0.0, 1.0)
+			uvs.push_back(Vector2(u, v));
+			point++;
+
+			bone_indices.push_back(sections);
+			bone_indices.push_back(0);
+			bone_indices.push_back(0);
+			bone_indices.push_back(0);
+
+			bone_weights.push_back(1.0);
+			bone_weights.push_back(0);
+			bone_weights.push_back(0);
+			bone_weights.push_back(0);
+
+			if (i > 0) { // Same fan as the top cap, with the rim order swapped because this cap faces -Y.
+				indices.push_back(thisrow);
+				indices.push_back(point - 2);
+				indices.push_back(point - 1);
+			};
+		};
+	};
+
+	p_arr[RS::ARRAY_VERTEX] = points;
+	p_arr[RS::ARRAY_NORMAL] = normals;
+	p_arr[RS::ARRAY_TANGENT] = tangents;
+	p_arr[RS::ARRAY_TEX_UV] = uvs;
+	p_arr[RS::ARRAY_BONES] = bone_indices;
+	p_arr[RS::ARRAY_WEIGHTS] = bone_weights;
+	p_arr[RS::ARRAY_INDEX] = indices;
+}
+
+void TubeTrailMesh::_bind_methods() { // Registers the accessors with ClassDB and exposes them as properties.
+	ClassDB::bind_method(D_METHOD("set_radius", "radius"), &TubeTrailMesh::set_radius);
+	ClassDB::bind_method(D_METHOD("get_radius"), &TubeTrailMesh::get_radius);
+
+	ClassDB::bind_method(D_METHOD("set_radial_steps", "radial_steps"), &TubeTrailMesh::set_radial_steps);
+	ClassDB::bind_method(D_METHOD("get_radial_steps"), &TubeTrailMesh::get_radial_steps);
+
+	ClassDB::bind_method(D_METHOD("set_sections", "sections"), &TubeTrailMesh::set_sections);
+	ClassDB::bind_method(D_METHOD("get_sections"), &TubeTrailMesh::get_sections);
+
+	ClassDB::bind_method(D_METHOD("set_section_length", "section_length"), &TubeTrailMesh::set_section_length);
+	ClassDB::bind_method(D_METHOD("get_section_length"), &TubeTrailMesh::get_section_length);
+
+	ClassDB::bind_method(D_METHOD("set_section_rings", "section_rings"), &TubeTrailMesh::set_section_rings);
+	ClassDB::bind_method(D_METHOD("get_section_rings"), &TubeTrailMesh::get_section_rings);
+
+	ClassDB::bind_method(D_METHOD("set_curve", "curve"), &TubeTrailMesh::set_curve);
+	ClassDB::bind_method(D_METHOD("get_curve"), &TubeTrailMesh::get_curve);
+
+	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "radius", PROPERTY_HINT_RANGE, "0.001,100.0,0.001,or_greater"), "set_radius", "get_radius");
+
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "radial_steps", PROPERTY_HINT_RANGE, "3,128,1"), "set_radial_steps", "get_radial_steps"); // Same bounds as the check in set_radial_steps().
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "sections", PROPERTY_HINT_RANGE, "2,128,1"), "set_sections", "get_sections"); // Same bounds as the check in set_sections().
+
+	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "section_length", PROPERTY_HINT_RANGE, "0.001,1024.0,0.001,or_greater"), "set_section_length", "get_section_length");
+
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "section_rings", PROPERTY_HINT_RANGE, "1,128,1"), "set_section_rings", "get_section_rings"); // NOTE(review): set_section_rings() accepts up to 1024 — confirm the 128 cap here is intended.
+
+	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "curve", PROPERTY_HINT_RESOURCE_TYPE, "Curve"), "set_curve", "get_curve");
+}
+
+TubeTrailMesh::TubeTrailMesh() { // Intentionally empty: the members get their defaults in the class declaration.
+}
+
+// RIBBON TRAIL
+
+void RibbonTrailMesh::set_shape(Shape p_shape) { // Selects between the flat and the cross-shaped ribbon.
+	shape = p_shape;
+	_request_update(); // Defined outside this view; presumably flags the mesh for regeneration.
+}
+RibbonTrailMesh::Shape RibbonTrailMesh::get_shape() const { // Returns the stored shape.
+	return shape;
+}
+
+void RibbonTrailMesh::set_size(const float p_size) { // Sets the base ribbon width; the value is stored without validation.
+	size = p_size;
+	_request_update(); // Defined outside this view; presumably flags the mesh for regeneration.
+}
+float RibbonTrailMesh::get_size() const { // Returns the stored base width.
+	return size;
+}
+
+void RibbonTrailMesh::set_sections(const int p_sections) { // Sets the number of sections; the mesh exposes sections + 1 bind poses.
+	ERR_FAIL_COND(p_sections < 2 || p_sections > 128); // Values outside [2, 128] are rejected and leave the member unchanged.
+	sections = p_sections;
+	_request_update(); // Defined outside this view; presumably flags the mesh for regeneration.
+}
+int RibbonTrailMesh::get_sections() const { // Returns the stored section count.
+	return sections;
+}
+
+void RibbonTrailMesh::set_section_length(float p_section_length) { // Sets the length of one section along Y; the value is not range-checked.
+	section_length = p_section_length;
+	_request_update(); // Defined outside this view; presumably flags the mesh for regeneration.
+}
+float RibbonTrailMesh::get_section_length() const { // Returns the stored section length.
+	return section_length;
+}
+
+void RibbonTrailMesh::set_section_segments(const int p_section_segments) { // Sets how many vertex rows are generated per section.
+	ERR_FAIL_COND(p_section_segments < 1 || p_section_segments > 1024); // NOTE(review): the editor range hint for this property stops at 128 — confirm which limit is intended.
+	section_segments = p_section_segments;
+	_request_update(); // Defined outside this view; presumably flags the mesh for regeneration.
+}
+int RibbonTrailMesh::get_section_segments() const { // Returns the stored segment count per section.
+	return section_segments;
+}
+
+void RibbonTrailMesh::set_curve(const Ref<Curve> &p_curve) { // Replaces the curve used to scale the width along the ribbon.
+	if (curve == p_curve) {
+		return; // Same curve: keep the existing signal connection and skip the update request.
+	}
+	if (curve.is_valid()) {
+		curve->disconnect("changed", callable_mp(this, &RibbonTrailMesh::_curve_changed)); // Stop listening to the curve being replaced.
+	}
+	curve = p_curve;
+	if (curve.is_valid()) {
+		curve->connect("changed", callable_mp(this, &RibbonTrailMesh::_curve_changed)); // Listen for the new curve's "changed" signal.
+	}
+	_request_update(); // Requested even when the new curve is null, since the shape no longer uses the old one.
+}
+Ref<Curve> RibbonTrailMesh::get_curve() const { // Returns the current curve reference, which may be invalid (unset).
+	return curve;
+}
+
+void RibbonTrailMesh::_curve_changed() { // Handler connected to the curve's "changed" signal by set_curve().
+	_request_update(); // Defined outside this view; presumably flags the mesh for regeneration.
+}
+int RibbonTrailMesh::get_builtin_bind_pose_count() const { // One bind pose per section boundary, both ends included.
+	return sections + 1;
+}
+
+Transform RibbonTrailMesh::get_builtin_bind_pose(int p_index) const {
+	// Section boundaries sit on the Y axis; boundary 0 lies at +half the total length.
+	const float total_length = section_length * sections;
+
+	// A bind pose is an inverse transform, so the Y offset is stored negated.
+	Transform bind_pose;
+	bind_pose.origin.y = -(total_length / 2.0 - section_length * float(p_index));
+	return bind_pose;
+}
+
+void RibbonTrailMesh::_create_mesh_array(Array &p_arr) const {
+	// Emits one row of vertices per segment boundary along the Y axis (two
+	// vertices for a flat ribbon, four for a cross), weights every row between
+	// the two bones of its section, and joins consecutive rows with triangles.
+	PackedVector3Array points;
+	PackedVector3Array normals;
+	PackedFloat32Array tangents;
+	PackedVector2Array uvs;
+	PackedInt32Array bone_indices;
+	PackedFloat32Array bone_weights;
+	PackedInt32Array indices;
+
+// Appends one four-component tangent to `tangents`. Must stay token-identical
+// to the earlier ADD_TANGENT definition, which is still in effect here.
+#define ADD_TANGENT(m_x, m_y, m_z, m_d) \
+	tangents.push_back(m_x);            \
+	tangents.push_back(m_y);            \
+	tangents.push_back(m_z);            \
+	tangents.push_back(m_d);
+
+	// A cross adds a second, perpendicular strip, doubling the vertices per row.
+	const bool is_cross = shape == SHAPE_CROSS;
+	const int row_verts = is_cross ? 4 : 2;
+
+	const int last_row = section_segments * sections;
+	const float length = section_length * sections;
+
+	for (int row = 0; row <= last_row; row++) {
+		// Normalized position along the ribbon: 0 on the first row, 1 on the last.
+		const float t = float(row) / last_row;
+
+		// Rows run from +length / 2 down to -length / 2.
+		float y = length * t;
+		y = (length * 0.5) - y;
+
+		// `blend` is the weight of `bone`; the remainder goes to `next_bone`.
+		const int bone = row / section_segments;
+		const int next_bone = MIN(sections, bone + 1);
+		const float blend = 1.0 - float(row % section_segments) / float(section_segments);
+
+		// The optional curve scales the ribbon width along its length.
+		float width = size;
+		if (curve.is_valid() && curve->get_point_count() > 0) {
+			width *= curve->interpolate_baked(t);
+		}
+
+		// First strip: lies in the XY plane, normal along +Z.
+		points.push_back(Vector3(-width * 0.5, y, 0));
+		points.push_back(Vector3(+width * 0.5, y, 0));
+		normals.push_back(Vector3(0, 0, 1));
+		normals.push_back(Vector3(0, 0, 1));
+		if (is_cross) {
+			// Second strip: lies in the YZ plane, normal along +X.
+			points.push_back(Vector3(0, y, -width * 0.5));
+			points.push_back(Vector3(0, y, +width * 0.5));
+			normals.push_back(Vector3(1, 0, 0));
+			normals.push_back(Vector3(1, 0, 0));
+		}
+
+		// The remaining attributes follow one pattern for every vertex of the row.
+		for (int k = 0; k < row_verts; k++) {
+			// U alternates 0, 1 across each strip; V follows the row.
+			uvs.push_back(Vector2(k % 2, t));
+			// The tangent points along +Y for every vertex.
+			ADD_TANGENT(0.0, 1.0, 0.0, 1.0)
+
+			// At most the first two of the four influence slots carry weight.
+			bone_indices.push_back(bone);
+			bone_indices.push_back(next_bone);
+			bone_indices.push_back(0);
+			bone_indices.push_back(0);
+
+			bone_weights.push_back(blend);
+			bone_weights.push_back(1.0 - blend);
+			bone_weights.push_back(0);
+			bone_weights.push_back(0);
+		}
+
+		if (row == 0) {
+			continue; // The first row has no previous row to connect to.
+		}
+
+		// For each strip, with (a, b) on the previous row and (c, d) on this one:
+		//   a --- b
+		//   |   / |
+		//   c --- d
+		// the quad is split into the triangles (a, b, c) and (b, d, c).
+		const int prev_base = (row - 1) * row_verts;
+		for (int strip = 0; strip < row_verts / 2; strip++) {
+			const int a = prev_base + strip * 2;
+			const int b = a + 1;
+			const int c = a + row_verts;
+			const int d = c + 1;
+
+			indices.push_back(a);
+			indices.push_back(b);
+			indices.push_back(c);
+
+			indices.push_back(b);
+			indices.push_back(d);
+			indices.push_back(c);
+		}
+	}
+
+	// Publish each array under its RS::ARRAY_* slot of the output.
+	p_arr[RS::ARRAY_VERTEX] = points;
+	p_arr[RS::ARRAY_NORMAL] = normals;
+	p_arr[RS::ARRAY_TANGENT] = tangents;
+	p_arr[RS::ARRAY_TEX_UV] = uvs;
+	p_arr[RS::ARRAY_BONES] = bone_indices;
+	p_arr[RS::ARRAY_WEIGHTS] = bone_weights;
+	p_arr[RS::ARRAY_INDEX] = indices;
+}
+
+void RibbonTrailMesh::_bind_methods() { // Registers the accessors with ClassDB and exposes them as properties.
+	ClassDB::bind_method(D_METHOD("set_size", "size"), &RibbonTrailMesh::set_size);
+	ClassDB::bind_method(D_METHOD("get_size"), &RibbonTrailMesh::get_size);
+
+	ClassDB::bind_method(D_METHOD("set_sections", "sections"), &RibbonTrailMesh::set_sections);
+	ClassDB::bind_method(D_METHOD("get_sections"), &RibbonTrailMesh::get_sections);
+
+	ClassDB::bind_method(D_METHOD("set_section_length", "section_length"), &RibbonTrailMesh::set_section_length);
+	ClassDB::bind_method(D_METHOD("get_section_length"), &RibbonTrailMesh::get_section_length);
+
+	ClassDB::bind_method(D_METHOD("set_section_segments", "section_segments"), &RibbonTrailMesh::set_section_segments);
+	ClassDB::bind_method(D_METHOD("get_section_segments"), &RibbonTrailMesh::get_section_segments);
+
+	ClassDB::bind_method(D_METHOD("set_curve", "curve"), &RibbonTrailMesh::set_curve);
+	ClassDB::bind_method(D_METHOD("get_curve"), &RibbonTrailMesh::get_curve);
+
+	ClassDB::bind_method(D_METHOD("set_shape", "shape"), &RibbonTrailMesh::set_shape);
+	ClassDB::bind_method(D_METHOD("get_shape"), &RibbonTrailMesh::get_shape);
+
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "shape", PROPERTY_HINT_ENUM, "Flat,Cross"), "set_shape", "get_shape"); // Names follow the order of the Shape enum (SHAPE_FLAT, SHAPE_CROSS).
+	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "size", PROPERTY_HINT_RANGE, "0.001,100.0,0.001,or_greater"), "set_size", "get_size");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "sections", PROPERTY_HINT_RANGE, "2,128,1"), "set_sections", "get_sections"); // Same bounds as the check in set_sections().
+	ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "section_length", PROPERTY_HINT_RANGE, "0.001,1024.0,0.001,or_greater"), "set_section_length", "get_section_length");
+	ADD_PROPERTY(PropertyInfo(Variant::INT, "section_segments", PROPERTY_HINT_RANGE, "1,128,1"), "set_section_segments", "get_section_segments"); // NOTE(review): set_section_segments() accepts up to 1024 — confirm the 128 cap here is intended.
+	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "curve", PROPERTY_HINT_RESOURCE_TYPE, "Curve"), "set_curve", "get_curve");
+
+	BIND_ENUM_CONSTANT(SHAPE_FLAT)
+	BIND_ENUM_CONSTANT(SHAPE_CROSS)
+}
+
+RibbonTrailMesh::RibbonTrailMesh() { // Intentionally empty: the members get their defaults in the class declaration.
+}
diff --git a/scene/resources/primitive_meshes.h b/scene/resources/primitive_meshes.h
index 65ecdfc19d..ec5806489e 100644
--- a/scene/resources/primitive_meshes.h
+++ b/scene/resources/primitive_meshes.h
@@ -336,4 +336,98 @@ public:
 	PointMesh();
 };
 
+class TubeTrailMesh : public PrimitiveMesh { // Tube primitive along the Y axis with one built-in bind pose per section boundary.
+	GDCLASS(TubeTrailMesh, PrimitiveMesh);
+
+private:
+	float radius = 1.0; // Base radius, optionally scaled along the tube by `curve`.
+	int radial_steps = 8; // Segments around the circumference (setter accepts 3 to 128).
+	int sections = 5; // Sections along the tube (setter accepts 2 to 128).
+	float section_length = 0.2; // Length of one section along Y.
+	int section_rings = 3; // Vertex rings generated per section (setter accepts 1 to 1024).
+
+	Ref<Curve> curve; // Optional radius multiplier, sampled from 0 to 1 along the tube.
+
+	void _curve_changed(); // Connected to the curve's "changed" signal.
+
+protected:
+	static void _bind_methods();
+	virtual void _create_mesh_array(Array &p_arr) const override;
+
+public:
+	void set_radius(const float p_radius);
+	float get_radius() const;
+
+	void set_radial_steps(const int p_radial_steps);
+	int get_radial_steps() const;
+
+	void set_sections(const int p_sections);
+	int get_sections() const;
+
+	void set_section_length(float p_section_length);
+	float get_section_length() const;
+
+	void set_section_rings(const int p_section_rings);
+	int get_section_rings() const;
+
+	void set_curve(const Ref<Curve> &p_curve);
+	Ref<Curve> get_curve() const;
+
+	virtual int get_builtin_bind_pose_count() const override; // Returns sections + 1.
+	virtual Transform get_builtin_bind_pose(int p_index) const override; // Translation-only pose on the Y axis.
+
+	TubeTrailMesh();
+};
+
+class RibbonTrailMesh : public PrimitiveMesh { // Ribbon primitive along the Y axis with one built-in bind pose per section boundary.
+	GDCLASS(RibbonTrailMesh, PrimitiveMesh);
+
+public:
+	enum Shape {
+		SHAPE_FLAT, // A single strip in the XY plane.
+		SHAPE_CROSS // Adds a second strip in the YZ plane.
+	};
+
+private:
+	float size = 1.0; // Base width, optionally scaled along the ribbon by `curve`.
+	int sections = 5; // Sections along the ribbon (setter accepts 2 to 128).
+	float section_length = 0.2; // Length of one section along Y.
+	int section_segments = 3; // Vertex rows generated per section (setter accepts 1 to 1024).
+
+	Shape shape = SHAPE_CROSS;
+
+	Ref<Curve> curve; // Optional width multiplier, sampled from 0 to 1 along the ribbon.
+
+	void _curve_changed(); // Connected to the curve's "changed" signal.
+
+protected:
+	static void _bind_methods();
+	virtual void _create_mesh_array(Array &p_arr) const override;
+
+public:
+	void set_shape(Shape p_shape);
+	Shape get_shape() const;
+
+	void set_size(const float p_size);
+	float get_size() const;
+
+	void set_sections(const int p_sections);
+	int get_sections() const;
+
+	void set_section_length(float p_section_length);
+	float get_section_length() const;
+
+	void set_section_segments(const int p_section_segments);
+	int get_section_segments() const;
+
+	void set_curve(const Ref<Curve> &p_curve);
+	Ref<Curve> get_curve() const;
+
+	virtual int get_builtin_bind_pose_count() const override; // Returns sections + 1.
+	virtual Transform get_builtin_bind_pose(int p_index) const override; // Translation-only pose on the Y axis.
+
+	RibbonTrailMesh();
+};
+
+VARIANT_ENUM_CAST(RibbonTrailMesh::Shape)
 #endif
diff --git a/scene/resources/sky_material.cpp b/scene/resources/sky_material.cpp
index b2efecb1cb..f50ee9c4c8 100644
--- a/scene/resources/sky_material.cpp
+++ b/scene/resources/sky_material.cpp
@@ -194,7 +194,7 @@ ProceduralSkyMaterial::ProceduralSkyMaterial() {
 	code += "uniform float sun_angle_max = 1.74;\n";
 	code += "uniform float sun_curve : hint_range(0, 1) = 0.05;\n\n";
 	code += "const float PI = 3.1415926535897932384626433833;\n\n";
-	code += "void fragment() {\n";
+	code += "void sky() {\n";
 	code += "\tfloat v_angle = acos(clamp(EYEDIR.y, -1.0, 1.0));\n";
 	code += "\tfloat c = (1.0 - v_angle / (PI * 0.5));\n";
 	code += "\tvec3 sky = mix(sky_horizon_color.rgb, sky_top_color.rgb, clamp(1.0 - pow(1.0 - c, 1.0 / sky_curve), 0.0, 1.0));\n";
@@ -301,7 +301,7 @@ PanoramaSkyMaterial::PanoramaSkyMaterial() {
 	String code = "shader_type sky;\n\n";
 
 	code += "uniform sampler2D source_panorama : filter_linear;\n";
-	code += "void fragment() {\n";
+	code += "void sky() {\n";
 	code += "\tCOLOR = texture(source_panorama, SKY_COORDS).rgb;\n";
 	code += "}";
 
@@ -521,7 +521,7 @@ PhysicalSkyMaterial::PhysicalSkyMaterial() {
 	code += "\treturn fract(p.x * p.y * p.z * (p.x + p.y + p.z));\n";
 	code += "}\n\n";
 
-	code += "void fragment() {\n";
+	code += "void sky() {\n";
 	code += "\tif (LIGHT0_ENABLED) {\n";
 	code += "\t\tfloat zenith_angle = clamp( dot(UP, normalize(LIGHT0_DIRECTION)), -1.0, 1.0 );\n";
 	code += "\t\tfloat sun_energy = max(0.0, 1.0 - exp(-((PI * 0.5) - acos(zenith_angle)))) * SUN_ENERGY * LIGHT0_ENERGY;\n";
diff --git a/scene/resources/surface_tool.cpp b/scene/resources/surface_tool.cpp
index 3d3900ecc5..c30bd7927d 100644
--- a/scene/resources/surface_tool.cpp
+++ b/scene/resources/surface_tool.cpp
@@ -1105,7 +1105,7 @@ void SurfaceTool::optimize_indices_for_cache() {
 	ERR_FAIL_COND(index_array.size() == 0);
 
 	LocalVector old_index_array = index_array;
-	zeromem(index_array.ptr(), index_array.size() * sizeof(int));
+	memset(index_array.ptr(), 0, index_array.size() * sizeof(int));
 	optimize_vertex_cache_func((unsigned int *)index_array.ptr(), (unsigned int *)old_index_array.ptr(), old_index_array.size(), vertex_array.size());
 }
 
diff --git a/scene/resources/texture.cpp b/scene/resources/texture.cpp
index b6a2f24b8b..624eae0411 100644
--- a/scene/resources/texture.cpp
+++ b/scene/resources/texture.cpp
@@ -410,7 +410,7 @@ Ref<Image> StreamTexture2D::load_image_from_file(FileAccess *f, int p_size_limit
 					Vector<uint8_t> id = mipmap_images[i]->get_data();
 					int len = id.size();
 					const uint8_t *r = id.ptr();
-					copymem(&wr[ofs], r, len);
+					memcpy(&wr[ofs], r, len);
 					ofs += len;
 				}
 			}
@@ -1405,185 +1405,6 @@ MeshTexture::MeshTexture() {
 
 //////////////////////////////////////////
 
-int LargeTexture::get_width() const {
-	return size.width;
-}
-
-int LargeTexture::get_height() const {
-	return size.height;
-}
-
-RID LargeTexture::get_rid() const {
-	return RID();
-}
-
-bool LargeTexture::has_alpha() const {
-	for (int i = 0; i < pieces.size(); i++) {
-		if (pieces[i].texture->has_alpha()) {
-			return true;
-		}
-	}
-
-	return false;
-}
-
-int LargeTexture::add_piece(const Point2 &p_offset, const Ref<Texture2D> &p_texture) {
-	ERR_FAIL_COND_V(p_texture.is_null(), -1);
-	Piece p;
-	p.offset = p_offset;
-	p.texture = p_texture;
-	pieces.push_back(p);
-
-	return pieces.size() - 1;
-}
-
-void LargeTexture::set_piece_offset(int p_idx, const Point2 &p_offset) {
-	ERR_FAIL_INDEX(p_idx, pieces.size());
-	pieces.write[p_idx].offset = p_offset;
-};
-
-void LargeTexture::set_piece_texture(int p_idx, const Ref<Texture2D> &p_texture) {
-	ERR_FAIL_COND(p_texture == this);
-	ERR_FAIL_COND(p_texture.is_null());
-	ERR_FAIL_INDEX(p_idx, pieces.size());
-	pieces.write[p_idx].texture = p_texture;
-};
-
-void LargeTexture::set_size(const Size2 &p_size) {
-	size = p_size;
-}
-
-void LargeTexture::clear() {
-	pieces.clear();
-	size = Size2i();
-}
-
-Array LargeTexture::_get_data() const {
-	Array arr;
-	for (int i = 0; i < pieces.size(); i++) {
-		arr.push_back(pieces[i].offset);
-		arr.push_back(pieces[i].texture);
-	}
-	arr.push_back(Size2(size));
-	return arr;
-}
-
-void LargeTexture::_set_data(const Array &p_array) {
-	ERR_FAIL_COND(p_array.size() < 1);
-	ERR_FAIL_COND(!(p_array.size() & 1));
-	clear();
-	for (int i = 0; i < p_array.size() - 1; i += 2) {
-		add_piece(p_array[i], p_array[i + 1]);
-	}
-	size = Size2(p_array[p_array.size() - 1]);
-}
-
-int LargeTexture::get_piece_count() const {
-	return pieces.size();
-}
-
-Vector2 LargeTexture::get_piece_offset(int p_idx) const {
-	ERR_FAIL_INDEX_V(p_idx, pieces.size(), Vector2());
-	return pieces[p_idx].offset;
-}
-
-Ref<Texture2D> LargeTexture::get_piece_texture(int p_idx) const {
-	ERR_FAIL_INDEX_V(p_idx, pieces.size(), Ref<Texture2D>());
-	return pieces[p_idx].texture;
-}
-
-Ref<Image> LargeTexture::to_image() const {
-	Ref<Image> img = memnew(Image(this->get_width(), this->get_height(), false, Image::FORMAT_RGBA8));
-	for (int i = 0; i < pieces.size(); i++) {
-		Ref<Image> src_img = pieces[i].texture->get_image();
-		img->blit_rect(src_img, Rect2(0, 0, src_img->get_width(), src_img->get_height()), pieces[i].offset);
-	}
-
-	return img;
-}
-
-void LargeTexture::_bind_methods() {
-	ClassDB::bind_method(D_METHOD("add_piece", "ofs", "texture"), &LargeTexture::add_piece);
-	ClassDB::bind_method(D_METHOD("set_piece_offset", "idx", "ofs"), &LargeTexture::set_piece_offset);
-	ClassDB::bind_method(D_METHOD("set_piece_texture", "idx", "texture"), &LargeTexture::set_piece_texture);
-	ClassDB::bind_method(D_METHOD("set_size", "size"), &LargeTexture::set_size);
-	ClassDB::bind_method(D_METHOD("clear"), &LargeTexture::clear);
-
-	ClassDB::bind_method(D_METHOD("get_piece_count"), &LargeTexture::get_piece_count);
-	ClassDB::bind_method(D_METHOD("get_piece_offset", "idx"), &LargeTexture::get_piece_offset);
-	ClassDB::bind_method(D_METHOD("get_piece_texture", "idx"), &LargeTexture::get_piece_texture);
-
-	ClassDB::bind_method(D_METHOD("_set_data", "data"), &LargeTexture::_set_data);
-	ClassDB::bind_method(D_METHOD("_get_data"), &LargeTexture::_get_data);
-
-	ADD_PROPERTY(PropertyInfo(Variant::ARRAY, "_data", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR | PROPERTY_USAGE_INTERNAL), "_set_data", "_get_data");
-}
-
-void LargeTexture::draw(RID p_canvas_item, const Point2 &p_pos, const Color &p_modulate, bool p_transpose) const {
-	for (int i = 0; i < pieces.size(); i++) {
-		// TODO
-		pieces[i].texture->draw(p_canvas_item, pieces[i].offset + p_pos, p_modulate, p_transpose);
-	}
-}
-
-void LargeTexture::draw_rect(RID p_canvas_item, const Rect2 &p_rect, bool p_tile, const Color &p_modulate, bool p_transpose) const {
-	//tiling not supported for this
-	if (size.x == 0 || size.y == 0) {
-		return;
-	}
-
-	Size2 scale = p_rect.size / size;
-
-	for (int i = 0; i < pieces.size(); i++) {
-		// TODO
-		pieces[i].texture->draw_rect(p_canvas_item, Rect2(pieces[i].offset * scale + p_rect.position, pieces[i].texture->get_size() * scale), false, p_modulate, p_transpose);
-	}
-}
-
-void LargeTexture::draw_rect_region(RID p_canvas_item, const Rect2 &p_rect, const Rect2 &p_src_rect, const Color &p_modulate, bool p_transpose, bool p_clip_uv) const {
-	//tiling not supported for this
-	if (p_src_rect.size.x == 0 || p_src_rect.size.y == 0) {
-		return;
-	}
-
-	Size2 scale = p_rect.size / p_src_rect.size;
-
-	for (int i = 0; i < pieces.size(); i++) {
-		// TODO
-		Rect2 rect(pieces[i].offset, pieces[i].texture->get_size());
-		if (!p_src_rect.intersects(rect)) {
-			continue;
-		}
-		Rect2 local = p_src_rect.intersection(rect);
-		Rect2 target = local;
-		target.size *= scale;
-		target.position = p_rect.position + (p_src_rect.position + rect.position) * scale;
-		local.position -= rect.position;
-		pieces[i].texture->draw_rect_region(p_canvas_item, target, local, p_modulate, p_transpose, false);
-	}
-}
-
-bool LargeTexture::is_pixel_opaque(int p_x, int p_y) const {
-	for (int i = 0; i < pieces.size(); i++) {
-		// TODO
-		if (!pieces[i].texture.is_valid()) {
-			continue;
-		}
-
-		Rect2 rect(pieces[i].offset, pieces[i].texture->get_size());
-		if (rect.has_point(Point2(p_x, p_y))) {
-			return pieces[i].texture->is_pixel_opaque(p_x - rect.position.x, p_y - rect.position.y);
-		}
-	}
-
-	return true;
-}
-
-LargeTexture::LargeTexture() {
-}
-
-///////////////////
-
 void CurveTexture::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("set_width", "width"), &CurveTexture::set_width);
 
diff --git a/scene/resources/texture.h b/scene/resources/texture.h
index 16c98f2891..264d85d187 100644
--- a/scene/resources/texture.h
+++ b/scene/resources/texture.h
@@ -297,51 +297,6 @@ public:
 	MeshTexture();
 };
 
-class LargeTexture : public Texture2D {
-	GDCLASS(LargeTexture, Texture2D);
-	RES_BASE_EXTENSION("largetex");
-
-protected:
-	struct Piece {
-		Point2 offset;
-		Ref<Texture2D> texture;
-	};
-
-	Vector<Piece> pieces;
-	Size2i size;
-
-	Array _get_data() const;
-	void _set_data(const Array &p_array);
-	static void _bind_methods();
-
-public:
-	virtual int get_width() const override;
-	virtual int get_height() const override;
-	virtual RID get_rid() const override;
-
-	virtual bool has_alpha() const override;
-
-	int add_piece(const Point2 &p_offset, const Ref<Texture2D> &p_texture);
-	void set_piece_offset(int p_idx, const Point2 &p_offset);
-	void set_piece_texture(int p_idx, const Ref<Texture2D> &p_texture);
-
-	void set_size(const Size2 &p_size);
-	void clear();
-
-	int get_piece_count() const;
-	Vector2 get_piece_offset(int p_idx) const;
-	Ref<Texture2D> get_piece_texture(int p_idx) const;
-	Ref<Image> to_image() const;
-
-	virtual void draw(RID p_canvas_item, const Point2 &p_pos, const Color &p_modulate = Color(1, 1, 1), bool p_transpose = false) const override;
-	virtual void draw_rect(RID p_canvas_item, const Rect2 &p_rect, bool p_tile = false, const Color &p_modulate = Color(1, 1, 1), bool p_transpose = false) const override;
-	virtual void draw_rect_region(RID p_canvas_item, const Rect2 &p_rect, const Rect2 &p_src_rect, const Color &p_modulate = Color(1, 1, 1), bool p_transpose = false, bool p_clip_uv = true) const override;
-
-	bool is_pixel_opaque(int p_x, int p_y) const override;
-
-	LargeTexture();
-};
-
 class TextureLayered : public Texture {
 	GDCLASS(TextureLayered, Texture);
 
diff --git a/scene/resources/theme.cpp b/scene/resources/theme.cpp
index 036d11574c..e8b203417e 100644
--- a/scene/resources/theme.cpp
+++ b/scene/resources/theme.cpp
@@ -508,6 +508,10 @@ bool Theme::has_icon(const StringName &p_name, const StringName &p_node_type) co
 	return (icon_map.has(p_node_type) && icon_map[p_node_type].has(p_name) && icon_map[p_node_type][p_name].is_valid());
 }
 
+bool Theme::has_icon_nocheck(const StringName &p_name, const StringName &p_node_type) const { // Entry-existence test only.
+	return (icon_map.has(p_node_type) && icon_map[p_node_type].has(p_name)); // Unlike has_icon(), the stored texture is not tested with is_valid().
+}
+
 void Theme::rename_icon(const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type) {
 	ERR_FAIL_COND_MSG(!icon_map.has(p_node_type), "Cannot rename the icon '" + String(p_old_name) + "' because the node type '" + String(p_node_type) + "' does not exist.");
 	ERR_FAIL_COND_MSG(icon_map[p_node_type].has(p_name), "Cannot rename the icon '" + String(p_old_name) + "' because the new name '" + String(p_name) + "' already exists.");
@@ -592,6 +596,10 @@ bool Theme::has_stylebox(const StringName &p_name, const StringName &p_node_type
 	return (style_map.has(p_node_type) && style_map[p_node_type].has(p_name) && style_map[p_node_type][p_name].is_valid());
 }
 
+bool Theme::has_stylebox_nocheck(const StringName &p_name, const StringName &p_node_type) const { // Entry-existence test only.
+	return (style_map.has(p_node_type) && style_map[p_node_type].has(p_name)); // Unlike has_stylebox(), the stored stylebox is not tested with is_valid().
+}
+
 void Theme::rename_stylebox(const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type) {
 	ERR_FAIL_COND_MSG(!style_map.has(p_node_type), "Cannot rename the stylebox '" + String(p_old_name) + "' because the node type '" + String(p_node_type) + "' does not exist.");
 	ERR_FAIL_COND_MSG(style_map[p_node_type].has(p_name), "Cannot rename the stylebox '" + String(p_old_name) + "' because the new name '" + String(p_name) + "' already exists.");
@@ -678,6 +686,10 @@ bool Theme::has_font(const StringName &p_name, const StringName &p_node_type) co
 	return ((font_map.has(p_node_type) && font_map[p_node_type].has(p_name) && font_map[p_node_type][p_name].is_valid()) || default_theme_font.is_valid());
 }
 
+bool Theme::has_font_nocheck(const StringName &p_name, const StringName &p_node_type) const { // Entry-existence test only.
+	return (font_map.has(p_node_type) && font_map[p_node_type].has(p_name)); // Unlike has_font(): no is_valid() test on the entry and no default_theme_font fallback.
+}
+
 void Theme::rename_font(const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type) {
 	ERR_FAIL_COND_MSG(!font_map.has(p_node_type), "Cannot rename the font '" + String(p_old_name) + "' because the node type '" + String(p_node_type) + "' does not exist.");
 	ERR_FAIL_COND_MSG(font_map[p_node_type].has(p_name), "Cannot rename the font '" + String(p_old_name) + "' because the new name '" + String(p_name) + "' already exists.");
@@ -755,6 +767,10 @@ bool Theme::has_font_size(const StringName &p_name, const StringName &p_node_typ
 	return ((font_size_map.has(p_node_type) && font_size_map[p_node_type].has(p_name) && (font_size_map[p_node_type][p_name] > 0)) || (default_theme_font_size > 0));
 }
 
+bool Theme::has_font_size_nocheck(const StringName &p_name, const StringName &p_node_type) const { // Entry-existence test only.
+	return (font_size_map.has(p_node_type) && font_size_map[p_node_type].has(p_name)); // Unlike has_font_size(): no `> 0` test on the entry and no default_theme_font_size fallback.
+}
+
 void Theme::rename_font_size(const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type) {
 	ERR_FAIL_COND_MSG(!font_size_map.has(p_node_type), "Cannot rename the font size '" + String(p_old_name) + "' because the node type '" + String(p_node_type) + "' does not exist.");
 	ERR_FAIL_COND_MSG(font_size_map[p_node_type].has(p_name), "Cannot rename the font size '" + String(p_old_name) + "' because the new name '" + String(p_name) + "' already exists.");
@@ -826,6 +842,10 @@ bool Theme::has_color(const StringName &p_name, const StringName &p_node_type) c
 	return (color_map.has(p_node_type) && color_map[p_node_type].has(p_name));
 }
 
+bool Theme::has_color_nocheck(const StringName &p_name, const StringName &p_node_type) const { // Entry-existence test only.
+	return (color_map.has(p_node_type) && color_map[p_node_type].has(p_name)); // Same expression as has_color(); provided so every data type has a *_nocheck counterpart.
+}
+
 void Theme::rename_color(const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type) {
 	ERR_FAIL_COND_MSG(!color_map.has(p_node_type), "Cannot rename the color '" + String(p_old_name) + "' because the node type '" + String(p_node_type) + "' does not exist.");
 	ERR_FAIL_COND_MSG(color_map[p_node_type].has(p_name), "Cannot rename the color '" + String(p_old_name) + "' because the new name '" + String(p_name) + "' already exists.");
@@ -896,6 +916,10 @@ bool Theme::has_constant(const StringName &p_name, const StringName &p_node_type
 	return (constant_map.has(p_node_type) && constant_map[p_node_type].has(p_name));
 }
 
+bool Theme::has_constant_nocheck(const StringName &p_name, const StringName &p_node_type) const { // Entry-existence test only.
+	return (constant_map.has(p_node_type) && constant_map[p_node_type].has(p_name)); // Same expression as has_constant(); provided so every data type has a *_nocheck counterpart.
+}
+
 void Theme::rename_constant(const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type) {
 	ERR_FAIL_COND_MSG(!constant_map.has(p_node_type), "Cannot rename the constant '" + String(p_old_name) + "' because the node type '" + String(p_node_type) + "' does not exist.");
 	ERR_FAIL_COND_MSG(constant_map[p_node_type].has(p_name), "Cannot rename the constant '" + String(p_old_name) + "' because the new name '" + String(p_name) + "' already exists.");
@@ -1029,6 +1053,27 @@ bool Theme::has_theme_item(DataType p_data_type, const StringName &p_name, const
 	return false;
 }
 
+bool Theme::has_theme_item_nocheck(DataType p_data_type, const StringName &p_name, const StringName &p_node_type) const { // Routes to the has_*_nocheck() method matching p_data_type.
+	switch (p_data_type) {
+		case DATA_TYPE_COLOR:
+			return has_color_nocheck(p_name, p_node_type);
+		case DATA_TYPE_CONSTANT:
+			return has_constant_nocheck(p_name, p_node_type);
+		case DATA_TYPE_FONT:
+			return has_font_nocheck(p_name, p_node_type);
+		case DATA_TYPE_FONT_SIZE:
+			return has_font_size_nocheck(p_name, p_node_type);
+		case DATA_TYPE_ICON:
+			return has_icon_nocheck(p_name, p_node_type);
+		case DATA_TYPE_STYLEBOX:
+			return has_stylebox_nocheck(p_name, p_node_type);
+		case DATA_TYPE_MAX:
+			break; // Can't happen, but silences warning.
+	}
+
+	return false; // Reached for DATA_TYPE_MAX or any value not listed in the switch.
+}
+
 void Theme::rename_theme_item(DataType p_data_type, const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type) {
 	switch (p_data_type) {
 		case DATA_TYPE_COLOR:
diff --git a/scene/resources/theme.h b/scene/resources/theme.h
index eb918fac69..7e887b6343 100644
--- a/scene/resources/theme.h
+++ b/scene/resources/theme.h
@@ -119,6 +119,7 @@ public:
 	void set_icon(const StringName &p_name, const StringName &p_node_type, const Ref<Texture2D> &p_icon);
 	Ref<Texture2D> get_icon(const StringName &p_name, const StringName &p_node_type) const;
 	bool has_icon(const StringName &p_name, const StringName &p_node_type) const;
+	bool has_icon_nocheck(const StringName &p_name, const StringName &p_node_type) const;
 	void rename_icon(const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type);
 	void clear_icon(const StringName &p_name, const StringName &p_node_type);
 	void get_icon_list(StringName p_node_type, List<StringName> *p_list) const;
@@ -128,6 +129,7 @@ public:
 	void set_stylebox(const StringName &p_name, const StringName &p_node_type, const Ref<StyleBox> &p_style);
 	Ref<StyleBox> get_stylebox(const StringName &p_name, const StringName &p_node_type) const;
 	bool has_stylebox(const StringName &p_name, const StringName &p_node_type) const;
+	bool has_stylebox_nocheck(const StringName &p_name, const StringName &p_node_type) const;
 	void rename_stylebox(const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type);
 	void clear_stylebox(const StringName &p_name, const StringName &p_node_type);
 	void get_stylebox_list(StringName p_node_type, List<StringName> *p_list) const;
@@ -137,6 +139,7 @@ public:
 	void set_font(const StringName &p_name, const StringName &p_node_type, const Ref<Font> &p_font);
 	Ref<Font> get_font(const StringName &p_name, const StringName &p_node_type) const;
 	bool has_font(const StringName &p_name, const StringName &p_node_type) const;
+	bool has_font_nocheck(const StringName &p_name, const StringName &p_node_type) const;
 	void rename_font(const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type);
 	void clear_font(const StringName &p_name, const StringName &p_node_type);
 	void get_font_list(StringName p_node_type, List<StringName> *p_list) const;
@@ -146,6 +149,7 @@ public:
 	void set_font_size(const StringName &p_name, const StringName &p_node_type, int p_font_size);
 	int get_font_size(const StringName &p_name, const StringName &p_node_type) const;
 	bool has_font_size(const StringName &p_name, const StringName &p_node_type) const;
+	bool has_font_size_nocheck(const StringName &p_name, const StringName &p_node_type) const;
 	void rename_font_size(const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type);
 	void clear_font_size(const StringName &p_name, const StringName &p_node_type);
 	void get_font_size_list(StringName p_node_type, List<StringName> *p_list) const;
@@ -155,6 +159,7 @@ public:
 	void set_color(const StringName &p_name, const StringName &p_node_type, const Color &p_color);
 	Color get_color(const StringName &p_name, const StringName &p_node_type) const;
 	bool has_color(const StringName &p_name, const StringName &p_node_type) const;
+	bool has_color_nocheck(const StringName &p_name, const StringName &p_node_type) const;
 	void rename_color(const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type);
 	void clear_color(const StringName &p_name, const StringName &p_node_type);
 	void get_color_list(StringName p_node_type, List<StringName> *p_list) const;
@@ -164,6 +169,7 @@ public:
 	void set_constant(const StringName &p_name, const StringName &p_node_type, int p_constant);
 	int get_constant(const StringName &p_name, const StringName &p_node_type) const;
 	bool has_constant(const StringName &p_name, const StringName &p_node_type) const;
+	bool has_constant_nocheck(const StringName &p_name, const StringName &p_node_type) const;
 	void rename_constant(const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type);
 	void clear_constant(const StringName &p_name, const StringName &p_node_type);
 	void get_constant_list(StringName p_node_type, List<StringName> *p_list) const;
@@ -173,6 +179,7 @@ public:
 	void set_theme_item(DataType p_data_type, const StringName &p_name, const StringName &p_node_type, const Variant &p_value);
 	Variant get_theme_item(DataType p_data_type, const StringName &p_name, const StringName &p_node_type) const;
 	bool has_theme_item(DataType p_data_type, const StringName &p_name, const StringName &p_node_type) const;
+	bool has_theme_item_nocheck(DataType p_data_type, const StringName &p_name, const StringName &p_node_type) const;
 	void rename_theme_item(DataType p_data_type, const StringName &p_old_name, const StringName &p_name, const StringName &p_node_type);
 	void clear_theme_item(DataType p_data_type, const StringName &p_name, const StringName &p_node_type);
 	void get_theme_item_list(DataType p_data_type, StringName p_node_type, List<StringName> *p_list) const;
diff --git a/scene/resources/visual_shader.cpp b/scene/resources/visual_shader.cpp
index e1e24ddab2..b810f9562e 100644
--- a/scene/resources/visual_shader.cpp
+++ b/scene/resources/visual_shader.cpp
@@ -300,6 +300,30 @@ String VisualShaderNodeCustom::generate_global_per_node(Shader::Mode p_mode, Vis
 	return "";
 }
 
+void VisualShaderNodeCustom::set_input_port_default_value(int p_port, const Variant &p_value) { // Forwards to the base class only while is_initialized is false.
+	if (!is_initialized) {
+		VisualShaderNode::set_input_port_default_value(p_port, p_value);
+	} // Otherwise the call is silently ignored (no error is reported).
+}
+
+void VisualShaderNodeCustom::set_default_input_values(const Array &p_values) { // Forwards to the base class only while is_initialized is false.
+	if (!is_initialized) {
+		VisualShaderNode::set_default_input_values(p_values);
+	} // Otherwise the call is silently ignored (no error is reported).
+}
+
+void VisualShaderNodeCustom::_set_input_port_default_value(int p_port, const Variant &p_value) { // Unguarded variant: ignores is_initialized.
+	VisualShaderNode::set_input_port_default_value(p_port, p_value); // Qualified call goes straight to the base implementation, skipping this class's guarded override.
+}
+
+bool VisualShaderNodeCustom::_is_initialized() { // Getter for the is_initialized flag. NOTE(review): could be const; the declaration is not visible here, so left unchanged.
+	return is_initialized;
+}
+
+void VisualShaderNodeCustom::_set_initialized(bool p_enabled) { // Setter for the is_initialized flag.
+	is_initialized = p_enabled; // While true, set_input_port_default_value() and set_default_input_values() become no-ops.
+}
+
 void VisualShaderNodeCustom::_bind_methods() {
 	BIND_VMETHOD(MethodInfo(Variant::STRING, "_get_name"));
 	BIND_VMETHOD(MethodInfo(Variant::STRING, "_get_description"));
@@ -314,6 +338,12 @@ void VisualShaderNodeCustom::_bind_methods() {
 	BIND_VMETHOD(MethodInfo(Variant::STRING, "_get_code", PropertyInfo(Variant::ARRAY, "input_vars"), PropertyInfo(Variant::ARRAY, "output_vars"), PropertyInfo(Variant::INT, "mode"), PropertyInfo(Variant::INT, "type")));
 	BIND_VMETHOD(MethodInfo(Variant::STRING, "_get_global_code", PropertyInfo(Variant::INT, "mode")));
 	BIND_VMETHOD(MethodInfo(Variant::BOOL, "_is_highend"));
+
+	ClassDB::bind_method(D_METHOD("_set_initialized", "enabled"), &VisualShaderNodeCustom::_set_initialized);
+	ClassDB::bind_method(D_METHOD("_is_initialized"), &VisualShaderNodeCustom::_is_initialized);
+	ClassDB::bind_method(D_METHOD("_set_input_port_default_value", "port", "value"), &VisualShaderNodeCustom::_set_input_port_default_value);
+
+	ADD_PROPERTY(PropertyInfo(Variant::BOOL, "initialized", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NOEDITOR | PROPERTY_USAGE_INTERNAL), "_set_initialized", "_is_initialized");
 }
 
 VisualShaderNodeCustom::VisualShaderNodeCustom() {
@@ -961,7 +991,8 @@ static const char *type_string[VisualShader::TYPE_MAX] = {
 	"light",
 	"emit",
 	"process",
-	"end"
+	"end",
+	"sky",
 };
 
 bool VisualShader::_set(const StringName &p_name, const Variant &p_value) {
@@ -1476,7 +1507,7 @@ void VisualShader::_update_shader() const {
 		global_code += "render_mode " + render_mode + ";\n\n";
 	}
 
-	static const char *func_name[TYPE_MAX] = { "vertex", "fragment", "light", "emit", "process", "end" };
+	static const char *func_name[TYPE_MAX] = { "vertex", "fragment", "light", "emit", "process", "end", "sky" };
 
 	String global_expressions;
 	Set<String> used_uniform_names;
@@ -1667,6 +1698,7 @@ void VisualShader::_bind_methods() {
 	BIND_ENUM_CONSTANT(TYPE_EMIT);
 	BIND_ENUM_CONSTANT(TYPE_PROCESS);
 	BIND_ENUM_CONSTANT(TYPE_END);
+	BIND_ENUM_CONSTANT(TYPE_SKY);
 	BIND_ENUM_CONSTANT(TYPE_MAX);
 
 	BIND_CONSTANT(NODE_ID_INVALID);
@@ -1698,7 +1730,6 @@ const VisualShaderNodeInput::Port VisualShaderNodeInput::ports[] = {
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_VECTOR, "color", "COLOR.rgb" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_SCALAR, "alpha", "COLOR.a" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_SCALAR, "point_size", "POINT_SIZE" },
-
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_TRANSFORM, "world", "WORLD_MATRIX" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_TRANSFORM, "modelview", "MODELVIEW_MATRIX" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_TRANSFORM, "camera", "CAMERA_MATRIX" },
@@ -1721,10 +1752,8 @@ const VisualShaderNodeInput::Port VisualShaderNodeInput::ports[] = {
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "color", "COLOR.rgb" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_SCALAR, "alpha", "COLOR.a" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "point_coord", "vec3(POINT_COORD, 0.0)" },
-
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "screen_uv", "vec3(SCREEN_UV, 0.0)" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_SCALAR, "side", "float(FRONT_FACING ? 1.0 : 0.0)" },
-
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_TRANSFORM, "world", "WORLD_MATRIX" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_TRANSFORM, "inv_camera", "INV_CAMERA_MATRIX" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_TRANSFORM, "camera", "CAMERA_MATRIX" },
@@ -1750,7 +1779,6 @@ const VisualShaderNodeInput::Port VisualShaderNodeInput::ports[] = {
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_LIGHT, VisualShaderNode::PORT_TYPE_VECTOR, "specular", "SPECULAR_LIGHT" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_LIGHT, VisualShaderNode::PORT_TYPE_SCALAR, "roughness", "ROUGHNESS" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_LIGHT, VisualShaderNode::PORT_TYPE_SCALAR, "metallic", "METALLIC" },
-
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_LIGHT, VisualShaderNode::PORT_TYPE_TRANSFORM, "world", "WORLD_MATRIX" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_LIGHT, VisualShaderNode::PORT_TYPE_TRANSFORM, "inv_camera", "INV_CAMERA_MATRIX" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_LIGHT, VisualShaderNode::PORT_TYPE_TRANSFORM, "camera", "CAMERA_MATRIX" },
@@ -1759,6 +1787,7 @@ const VisualShaderNodeInput::Port VisualShaderNodeInput::ports[] = {
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_LIGHT, VisualShaderNode::PORT_TYPE_SCALAR, "time", "TIME" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_LIGHT, VisualShaderNode::PORT_TYPE_VECTOR, "viewport_size", "vec3(VIEWPORT_SIZE, 0.0)" },
 	{ Shader::MODE_SPATIAL, VisualShader::TYPE_LIGHT, VisualShaderNode::PORT_TYPE_BOOLEAN, "output_is_srgb", "OUTPUT_IS_SRGB" },
+
 	// Canvas Item, Vertex
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_VECTOR, "vertex", "vec3(VERTEX, 0.0)" },
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_VECTOR, "uv", "vec3(UV, 0.0)" },
@@ -1766,12 +1795,12 @@ const VisualShaderNodeInput::Port VisualShaderNodeInput::ports[] = {
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_SCALAR, "alpha", "COLOR.a" },
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_SCALAR, "point_size", "POINT_SIZE" },
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_VECTOR, "texture_pixel_size", "vec3(TEXTURE_PIXEL_SIZE, 1.0)" },
-
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_TRANSFORM, "world", "WORLD_MATRIX" },
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_TRANSFORM, "canvas", "CANVAS_MATRIX" },
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_TRANSFORM, "screen", "SCREEN_MATRIX" },
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_SCALAR, "time", "TIME" },
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_VERTEX, VisualShaderNode::PORT_TYPE_BOOLEAN, "at_light_pass", "AT_LIGHT_PASS" },
+
 	// Canvas Item, Fragment
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "fragcoord", "FRAGCOORD.xyz" },
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "uv", "vec3(UV, 0.0)" },
@@ -1789,6 +1818,7 @@ const VisualShaderNodeInput::Port VisualShaderNodeInput::ports[] = {
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "specular_shininess", "SPECULAR_SHININESS.rgb" },
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_SCALAR, "specular_shininess_alpha", "SPECULAR_SHININESS.a" },
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_SAMPLER, "specular_shininess_texture", "SPECULAR_SHININESS_TEXTURE" },
+
 	// Canvas Item, Light
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_LIGHT, VisualShaderNode::PORT_TYPE_VECTOR, "fragcoord", "FRAGCOORD.xyz" },
 	{ Shader::MODE_CANVAS_ITEM, VisualShader::TYPE_LIGHT, VisualShaderNode::PORT_TYPE_VECTOR, "uv", "vec3(UV, 0.0)" },
@@ -1856,36 +1886,36 @@ const VisualShaderNodeInput::Port VisualShaderNodeInput::ports[] = {
 	{ Shader::MODE_PARTICLES, VisualShader::TYPE_END, VisualShaderNode::PORT_TYPE_TRANSFORM, "emission_transform", "EMISSION_TRANSFORM" },
 	{ Shader::MODE_PARTICLES, VisualShader::TYPE_END, VisualShaderNode::PORT_TYPE_SCALAR, "time", "TIME" },
 
-	// Sky, Fragment
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_BOOLEAN, "at_cubemap_pass", "AT_CUBEMAP_PASS" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_BOOLEAN, "at_half_res_pass", "AT_HALF_RES_PASS" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_BOOLEAN, "at_quarter_res_pass", "AT_QUARTER_RES_PASS" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "eyedir", "EYEDIR" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "half_res_color", "HALF_RES_COLOR.rgb" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_SCALAR, "half_res_alpha", "HALF_RES_COLOR.a" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "light0_color", "LIGHT0_COLOR" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "light0_direction", "LIGHT0_DIRECTION" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_BOOLEAN, "light0_enabled", "LIGHT0_ENABLED" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_SCALAR, "light0_energy", "LIGHT0_ENERGY" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "light1_color", "LIGHT1_COLOR" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "light1_direction", "LIGHT1_DIRECTION" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_BOOLEAN, "light1_enabled", "LIGHT1_ENABLED" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_SCALAR, "light1_energy", "LIGHT1_ENERGY" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "light2_color", "LIGHT2_COLOR" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "light2_direction", "LIGHT2_DIRECTION" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_BOOLEAN, "light2_enabled", "LIGHT2_ENABLED" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_SCALAR, "light2_energy", "LIGHT2_ENERGY" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "light3_color", "LIGHT3_COLOR" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "light3_direction", "LIGHT3_DIRECTION" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_BOOLEAN, "light3_enabled", "LIGHT3_ENABLED" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_SCALAR, "light3_energy", "LIGHT3_ENERGY" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "position", "POSITION" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "quarter_res_color", "QUARTER_RES_COLOR.rgb" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_SCALAR, "quarter_res_alpha", "QUARTER_RES_COLOR.a" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_SAMPLER, "radiance", "RADIANCE" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "screen_uv", "vec3(SCREEN_UV, 0.0)" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "sky_coords", "vec3(SKY_COORDS, 0.0)" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_SCALAR, "time", "TIME" },
+	// Sky, Sky
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_BOOLEAN, "at_cubemap_pass", "AT_CUBEMAP_PASS" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_BOOLEAN, "at_half_res_pass", "AT_HALF_RES_PASS" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_BOOLEAN, "at_quarter_res_pass", "AT_QUARTER_RES_PASS" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "eyedir", "EYEDIR" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "half_res_color", "HALF_RES_COLOR.rgb" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_SCALAR, "half_res_alpha", "HALF_RES_COLOR.a" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "light0_color", "LIGHT0_COLOR" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "light0_direction", "LIGHT0_DIRECTION" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_BOOLEAN, "light0_enabled", "LIGHT0_ENABLED" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_SCALAR, "light0_energy", "LIGHT0_ENERGY" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "light1_color", "LIGHT1_COLOR" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "light1_direction", "LIGHT1_DIRECTION" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_BOOLEAN, "light1_enabled", "LIGHT1_ENABLED" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_SCALAR, "light1_energy", "LIGHT1_ENERGY" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "light2_color", "LIGHT2_COLOR" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "light2_direction", "LIGHT2_DIRECTION" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_BOOLEAN, "light2_enabled", "LIGHT2_ENABLED" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_SCALAR, "light2_energy", "LIGHT2_ENERGY" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "light3_color", "LIGHT3_COLOR" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "light3_direction", "LIGHT3_DIRECTION" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_BOOLEAN, "light3_enabled", "LIGHT3_ENABLED" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_SCALAR, "light3_energy", "LIGHT3_ENERGY" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "position", "POSITION" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "quarter_res_color", "QUARTER_RES_COLOR.rgb" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_SCALAR, "quarter_res_alpha", "QUARTER_RES_COLOR.a" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_SAMPLER, "radiance", "RADIANCE" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "screen_uv", "vec3(SCREEN_UV, 0.0)" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "sky_coords", "vec3(SKY_COORDS, 0.0)" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_SCALAR, "time", "TIME" },
 
 	{ Shader::MODE_MAX, VisualShader::TYPE_MAX, VisualShaderNode::PORT_TYPE_TRANSFORM, nullptr, nullptr },
 };
@@ -2449,9 +2479,9 @@ const VisualShaderNodeOutput::Port VisualShaderNodeOutput::ports[] = {
 	{ Shader::MODE_PARTICLES, VisualShader::TYPE_END, VisualShaderNode::PORT_TYPE_SCALAR, "custom_alpha", "CUSTOM.a" },
 	{ Shader::MODE_PARTICLES, VisualShader::TYPE_END, VisualShaderNode::PORT_TYPE_TRANSFORM, "transform", "TRANSFORM" },
 	{ Shader::MODE_PARTICLES, VisualShader::TYPE_END, VisualShaderNode::PORT_TYPE_BOOLEAN, "active", "ACTIVE" },
-	// Sky, Fragment
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_VECTOR, "color", "COLOR" },
-	{ Shader::MODE_SKY, VisualShader::TYPE_FRAGMENT, VisualShaderNode::PORT_TYPE_SCALAR, "alpha", "ALPHA" },
+	// Sky, Sky
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_VECTOR, "color", "COLOR" },
+	{ Shader::MODE_SKY, VisualShader::TYPE_SKY, VisualShaderNode::PORT_TYPE_SCALAR, "alpha", "ALPHA" },
 
 	{ Shader::MODE_MAX, VisualShader::TYPE_MAX, VisualShaderNode::PORT_TYPE_TRANSFORM, nullptr, nullptr },
 };
diff --git a/scene/resources/visual_shader.h b/scene/resources/visual_shader.h
index 54a5c19049..8af0fc9e44 100644
--- a/scene/resources/visual_shader.h
+++ b/scene/resources/visual_shader.h
@@ -54,6 +54,7 @@ public:
 		TYPE_EMIT,
 		TYPE_PROCESS,
 		TYPE_END,
+		TYPE_SKY,
 		TYPE_MAX
 	};
 
@@ -222,10 +223,10 @@ public:
 	virtual PortType get_input_port_type(int p_port) const = 0;
 	virtual String get_input_port_name(int p_port) const = 0;
 
-	void set_input_port_default_value(int p_port, const Variant &p_value);
+	virtual void set_input_port_default_value(int p_port, const Variant &p_value);
 	Variant get_input_port_default_value(int p_port) const; // if NIL (default if node does not set anything) is returned, it means no default value is wanted if disconnected, thus no input var must be supplied (empty string will be supplied)
 	Array get_default_input_values() const;
-	void set_default_input_values(const Array &p_values);
+	virtual void set_default_input_values(const Array &p_values);
 
 	virtual int get_output_port_count() const = 0;
 	virtual PortType get_output_port_type(int p_port) const = 0;
@@ -271,6 +272,7 @@ class VisualShaderNodeCustom : public VisualShaderNode {
 		int type = 0;
 	};
 
+	bool is_initialized = false;
 	List<Port> input_ports;
 	List<Port> output_ports;
 
@@ -287,7 +289,12 @@ protected:
 	virtual PortType get_output_port_type(int p_port) const override;
 	virtual String get_output_port_name(int p_port) const override;
 
+	virtual void set_input_port_default_value(int p_port, const Variant &p_value) override;
+	virtual void set_default_input_values(const Array &p_values) override;
+
 protected:
+	void _set_input_port_default_value(int p_port, const Variant &p_value);
+
 	virtual String generate_code(Shader::Mode p_mode, VisualShader::Type p_type, int p_id, const String *p_input_vars, const String *p_output_vars, bool p_for_preview = false) const override;
 	virtual String generate_global_per_node(Shader::Mode p_mode, VisualShader::Type p_type, int p_id) const override;
 
@@ -296,6 +303,9 @@ protected:
 public:
 	VisualShaderNodeCustom();
 	void update_ports();
+
+	bool _is_initialized();
+	void _set_initialized(bool p_enabled);
 };
 
 /////
diff --git a/scene/resources/world_3d.cpp b/scene/resources/world_3d.cpp
index 0e9f7a6cf2..e811cbf57a 100644
--- a/scene/resources/world_3d.cpp
+++ b/scene/resources/world_3d.cpp
@@ -321,7 +321,7 @@ void World3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_environment"), &World3D::get_environment);
 	ClassDB::bind_method(D_METHOD("set_fallback_environment", "env"), &World3D::set_fallback_environment);
 	ClassDB::bind_method(D_METHOD("get_fallback_environment"), &World3D::get_fallback_environment);
-	ClassDB::bind_method(D_METHOD("set_camera_effects", "env"), &World3D::set_camera_effects);
+	ClassDB::bind_method(D_METHOD("set_camera_effects", "effects"), &World3D::set_camera_effects);
 	ClassDB::bind_method(D_METHOD("get_camera_effects"), &World3D::get_camera_effects);
 	ClassDB::bind_method(D_METHOD("get_direct_space_state"), &World3D::get_direct_space_state);
 	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "environment", PROPERTY_HINT_RESOURCE_TYPE, "Environment"), "set_environment", "get_environment");
@@ -348,7 +348,7 @@ World3D::World3D() {
 	navigation_map = NavigationServer3D::get_singleton()->map_create();
 	NavigationServer3D::get_singleton()->map_set_active(navigation_map, true);
 	NavigationServer3D::get_singleton()->map_set_cell_size(navigation_map, GLOBAL_DEF("navigation/3d/default_cell_size", 0.3));
-	NavigationServer3D::get_singleton()->map_set_edge_connection_margin(navigation_map, GLOBAL_DEF("navigation/3d/default_edge_connection_margin", 5.0)); // Five meters, depends a lot on the agent's radius
+	NavigationServer3D::get_singleton()->map_set_edge_connection_margin(navigation_map, GLOBAL_DEF("navigation/3d/default_edge_connection_margin", 0.3));
 
 #ifdef _3D_DISABLED
 	indexer = nullptr;
diff --git a/scene/scene_string_names.cpp b/scene/scene_string_names.cpp
index 892802c103..7575ccd5c3 100644
--- a/scene/scene_string_names.cpp
+++ b/scene/scene_string_names.cpp
@@ -190,10 +190,6 @@ SceneStringNames::SceneStringNames() {
 
 	_default = StaticCString::create("default");
 
-	for (int i = 0; i < MAX_MATERIALS; i++) {
-		mesh_materials[i] = "material/" + itos(i);
-	}
-
 	_window_group = StaticCString::create("_window_group");
 	_window_input = StaticCString::create("_window_input");
 	window_input = StaticCString::create("window_input");
diff --git a/scene/scene_string_names.h b/scene/scene_string_names.h
index 655e49c6f9..a5b489eddc 100644
--- a/scene/scene_string_names.h
+++ b/scene/scene_string_names.h
@@ -216,10 +216,6 @@ public:
 	StringName use_in_baked_light;
 	StringName use_dynamic_gi;
 #endif
-	enum {
-		MAX_MATERIALS = 32
-	};
-	StringName mesh_materials[MAX_MATERIALS];
 };
 
 #endif // SCENE_STRING_NAMES_H
diff --git a/servers/audio_server.cpp b/servers/audio_server.cpp
index 138cb6e1f8..08c482553b 100644
--- a/servers/audio_server.cpp
+++ b/servers/audio_server.cpp
@@ -246,6 +246,7 @@ void AudioServer::_driver_process(int p_frames, int32_t *p_buffer) {
 		init_channels_and_buffers();
 	}
 
+	ERR_FAIL_COND_MSG(buses.is_empty() && todo, "AudioServer bus count is less than 1.");
 	while (todo) {
 		if (to_mix == 0) {
 			_mix_step();
diff --git a/servers/physics_2d/body_2d_sw.cpp b/servers/physics_2d/body_2d_sw.cpp
index d0636047b7..9306fea70c 100644
--- a/servers/physics_2d/body_2d_sw.cpp
+++ b/servers/physics_2d/body_2d_sw.cpp
@@ -591,16 +591,17 @@ void Body2DSW::call_queries() {
 		Variant v = dbs;
 		const Variant *vp[2] = { &v, &fi_callback->callback_udata };
 
-		Object *obj = ObjectDB::get_instance(fi_callback->id);
+		Object *obj = fi_callback->callable.get_object();
 		if (!obj) {
-			set_force_integration_callback(ObjectID(), StringName());
+			set_force_integration_callback(Callable());
 		} else {
 			Callable::CallError ce;
+			Variant rv;
 			if (fi_callback->callback_udata.get_type() != Variant::NIL) {
-				obj->call(fi_callback->method, vp, 2, ce);
+				fi_callback->callable.call(vp, 2, rv, ce);
 
 			} else {
-				obj->call(fi_callback->method, vp, 1, ce);
+				fi_callback->callable.call(vp, 1, rv, ce);
 			}
 		}
 	}
@@ -625,16 +626,15 @@ bool Body2DSW::sleep_test(real_t p_step) {
 	}
 }
 
-void Body2DSW::set_force_integration_callback(ObjectID p_id, const StringName &p_method, const Variant &p_udata) {
+void Body2DSW::set_force_integration_callback(const Callable &p_callable, const Variant &p_udata) {
 	if (fi_callback) {
 		memdelete(fi_callback);
 		fi_callback = nullptr;
 	}
 
-	if (p_id.is_valid()) {
+	if (p_callable.get_object()) { // A callable with no target object leaves the callback cleared (fi_callback stays nullptr).
 		fi_callback = memnew(ForceIntegrationCallback);
-		fi_callback->id = p_id;
-		fi_callback->method = p_method;
+		fi_callback->callable = p_callable;
 		fi_callback->callback_udata = p_udata;
 	}
 }
@@ -658,8 +658,6 @@ Body2DSW::Body2DSW() :
 	omit_force_integration = false;
 	applied_torque = 0;
 	island_step = 0;
-	island_next = nullptr;
-	island_list_next = nullptr;
 	_set_static(false);
 	first_time_kinematic = false;
 	linear_damp = -1;
diff --git a/servers/physics_2d/body_2d_sw.h b/servers/physics_2d/body_2d_sw.h
index 60d55ab8bd..b4a95651cb 100644
--- a/servers/physics_2d/body_2d_sw.h
+++ b/servers/physics_2d/body_2d_sw.h
@@ -117,23 +117,20 @@ class Body2DSW : public CollisionObject2DSW {
 	int contact_count;
 
 	struct ForceIntegrationCallback {
-		ObjectID id;
-		StringName method;
+		Callable callable;
 		Variant callback_udata;
 	};
 
 	ForceIntegrationCallback *fi_callback;
 
 	uint64_t island_step;
-	Body2DSW *island_next;
-	Body2DSW *island_list_next;
 
 	_FORCE_INLINE_ void _compute_area_gravity_and_dampenings(const Area2DSW *p_area);
 
 	friend class PhysicsDirectBodyState2DSW; // i give up, too many functions to expose
 
 public:
-	void set_force_integration_callback(ObjectID p_id, const StringName &p_method, const Variant &p_udata = Variant());
+	void set_force_integration_callback(const Callable &p_callable, const Variant &p_udata = Variant());
 
 	_FORCE_INLINE_ void add_area(Area2DSW *p_area) {
 		int index = areas.find(AreaCMP(p_area));
@@ -175,12 +172,6 @@ public:
 	_FORCE_INLINE_ uint64_t get_island_step() const { return island_step; }
 	_FORCE_INLINE_ void set_island_step(uint64_t p_step) { island_step = p_step; }
 
-	_FORCE_INLINE_ Body2DSW *get_island_next() const { return island_next; }
-	_FORCE_INLINE_ void set_island_next(Body2DSW *p_next) { island_next = p_next; }
-
-	_FORCE_INLINE_ Body2DSW *get_island_list_next() const { return island_list_next; }
-	_FORCE_INLINE_ void set_island_list_next(Body2DSW *p_next) { island_list_next = p_next; }
-
 	_FORCE_INLINE_ void add_constraint(Constraint2DSW *p_constraint, int p_pos) { constraint_list.push_back({ p_constraint, p_pos }); }
 	_FORCE_INLINE_ void remove_constraint(Constraint2DSW *p_constraint, int p_pos) { constraint_list.erase({ p_constraint, p_pos }); }
 	const List<Pair<Constraint2DSW *, int>> &get_constraint_list() const { return constraint_list; }
diff --git a/servers/physics_2d/broad_phase_2d_hash_grid.cpp b/servers/physics_2d/broad_phase_2d_hash_grid.cpp
index 6cfe6908d1..35447c5389 100644
--- a/servers/physics_2d/broad_phase_2d_hash_grid.cpp
+++ b/servers/physics_2d/broad_phase_2d_hash_grid.cpp
@@ -35,6 +35,12 @@
 #define LARGE_ELEMENT_FI 1.01239812
 
 void BroadPhase2DHashGrid::_pair_attempt(Element *p_elem, Element *p_with) {
+	if (p_elem->owner == p_with->owner) {
+		return;
+	}
+	if (!_test_collision_mask(p_elem->collision_mask, p_elem->collision_layer, p_with->collision_mask, p_with->collision_layer)) {
+		return;
+	}
 	Map<Element *, PairData *>::Element *E = p_elem->paired.find(p_with);
 
 	ERR_FAIL_COND(p_elem->_static && p_with->_static);
@@ -49,6 +55,12 @@ void BroadPhase2DHashGrid::_pair_attempt(Element *p_elem, Element *p_with) {
 }
 
 void BroadPhase2DHashGrid::_unpair_attempt(Element *p_elem, Element *p_with) {
+	if (p_elem->owner == p_with->owner) {
+		return;
+	}
+	if (!_test_collision_mask(p_elem->collision_mask, p_elem->collision_layer, p_with->collision_mask, p_with->collision_layer)) {
+		return;
+	}
 	Map<Element *, PairData *>::Element *E = p_elem->paired.find(p_with);
 
 	ERR_FAIL_COND(!E); //this should really be paired..
@@ -74,24 +86,22 @@ void BroadPhase2DHashGrid::_check_motion(Element *p_elem) {
 		bool physical_collision = p_elem->aabb.intersects(E->key()->aabb);
 		bool logical_collision = p_elem->owner->test_collision_mask(E->key()->owner);
 
-		if (physical_collision) {
-			if (!E->get()->colliding || (logical_collision && !E->get()->ud && pair_callback)) {
+		if (physical_collision && logical_collision) {
+			if (!E->get()->colliding && pair_callback) {
 				E->get()->ud = pair_callback(p_elem->owner, p_elem->subindex, E->key()->owner, E->key()->subindex, pair_userdata);
-			} else if (E->get()->colliding && !logical_collision && E->get()->ud && unpair_callback) {
-				unpair_callback(p_elem->owner, p_elem->subindex, E->key()->owner, E->key()->subindex, E->get()->ud, unpair_userdata);
-				E->get()->ud = nullptr;
 			}
 			E->get()->colliding = true;
-		} else { // No physcial_collision
+		} else { // No collision
 			if (E->get()->colliding && unpair_callback) {
 				unpair_callback(p_elem->owner, p_elem->subindex, E->key()->owner, E->key()->subindex, E->get()->ud, unpair_userdata);
+				E->get()->ud = nullptr;
 			}
 			E->get()->colliding = false;
 		}
 	}
 }
 
-void BroadPhase2DHashGrid::_enter_grid(Element *p_elem, const Rect2 &p_rect, bool p_static) {
+void BroadPhase2DHashGrid::_enter_grid(Element *p_elem, const Rect2 &p_rect, bool p_static, bool p_force_enter) {
 	Vector2 sz = (p_rect.size / cell_size * LARGE_ELEMENT_FI); //use magic number to avoid floating point issues
 	if (sz.width * sz.height > large_object_min_surface) {
 		//large object, do not use grid, must check against all elements
@@ -99,9 +109,6 @@ void BroadPhase2DHashGrid::_enter_grid(Element *p_elem, const Rect2 &p_rect, boo
 			if (E->key() == p_elem->self) {
 				continue; // do not pair against itself
 			}
-			if (E->get().owner == p_elem->owner) {
-				continue;
-			}
 			if (E->get()._static && p_static) {
 				continue;
 			}
@@ -133,7 +140,7 @@ void BroadPhase2DHashGrid::_enter_grid(Element *p_elem, const Rect2 &p_rect, boo
 				pb = pb->next;
 			}
 
-			bool entered = false;
+			bool entered = p_force_enter;
 
 			if (!pb) {
 				//does not exist, create!
@@ -155,17 +162,11 @@ void BroadPhase2DHashGrid::_enter_grid(Element *p_elem, const Rect2 &p_rect, boo
 
 			if (entered) {
 				for (Map<Element *, RC>::Element *E = pb->object_set.front(); E; E = E->next()) {
-					if (E->key()->owner == p_elem->owner) {
-						continue;
-					}
 					_pair_attempt(p_elem, E->key());
 				}
 
 				if (!p_static) {
 					for (Map<Element *, RC>::Element *E = pb->static_object_set.front(); E; E = E->next()) {
-						if (E->key()->owner == p_elem->owner) {
-							continue;
-						}
 						_pair_attempt(p_elem, E->key());
 					}
 				}
@@ -179,18 +180,14 @@ void BroadPhase2DHashGrid::_enter_grid(Element *p_elem, const Rect2 &p_rect, boo
 		if (E->key() == p_elem) {
 			continue; // do not pair against itself
 		}
-		if (E->key()->owner == p_elem->owner) {
-			continue;
-		}
 		if (E->key()->_static && p_static) {
 			continue;
 		}
-
 		_pair_attempt(E->key(), p_elem);
 	}
 }
 
-void BroadPhase2DHashGrid::_exit_grid(Element *p_elem, const Rect2 &p_rect, bool p_static) {
+void BroadPhase2DHashGrid::_exit_grid(Element *p_elem, const Rect2 &p_rect, bool p_static, bool p_force_exit) {
 	Vector2 sz = (p_rect.size / cell_size * LARGE_ELEMENT_FI);
 	if (sz.width * sz.height > large_object_min_surface) {
 		//unpair all elements, instead of checking all, just check what is already paired, so we at least save from checking static vs static
@@ -229,7 +226,7 @@ void BroadPhase2DHashGrid::_exit_grid(Element *p_elem, const Rect2 &p_rect, bool
 
 			ERR_CONTINUE(!pb); //should exist!!
 
-			bool exited = false;
+			bool exited = p_force_exit;
 
 			if (p_static) {
 				if (pb->static_object_set[p_elem].dec() == 0) {
@@ -245,17 +242,11 @@ void BroadPhase2DHashGrid::_exit_grid(Element *p_elem, const Rect2 &p_rect, bool
 
 			if (exited) {
 				for (Map<Element *, RC>::Element *E = pb->object_set.front(); E; E = E->next()) {
-					if (E->key()->owner == p_elem->owner) {
-						continue;
-					}
 					_unpair_attempt(p_elem, E->key());
 				}
 
 				if (!p_static) {
 					for (Map<Element *, RC>::Element *E = pb->static_object_set.front(); E; E = E->next()) {
-						if (E->key()->owner == p_elem->owner) {
-							continue;
-						}
 						_unpair_attempt(p_elem, E->key());
 					}
 				}
@@ -288,9 +279,6 @@ void BroadPhase2DHashGrid::_exit_grid(Element *p_elem, const Rect2 &p_rect, bool
 		if (E->key() == p_elem) {
 			continue; // do not pair against itself
 		}
-		if (E->key()->owner == p_elem->owner) {
-			continue;
-		}
 		if (E->key()->_static && p_static) {
 			continue;
 		}
@@ -306,6 +294,8 @@ BroadPhase2DHashGrid::ID BroadPhase2DHashGrid::create(CollisionObject2DSW *p_obj
 	Element e;
 	e.owner = p_object;
 	e._static = false;
+	e.collision_mask = p_object->get_collision_mask();
+	e.collision_layer = p_object->get_collision_layer();
 	e.subindex = p_subindex;
 	e.self = current;
 	e.pass = 0;
@@ -319,13 +309,26 @@ void BroadPhase2DHashGrid::move(ID p_id, const Rect2 &p_aabb) {
 	ERR_FAIL_COND(!E);
 
 	Element &e = E->get();
+	bool layer_changed = e.collision_mask != e.owner->get_collision_mask() || e.collision_layer != e.owner->get_collision_layer();
 
-	if (p_aabb != e.aabb) {
+	if (p_aabb != e.aabb || layer_changed) {
+		uint32_t old_mask = e.collision_mask;
+		uint32_t old_layer = e.collision_layer;
 		if (p_aabb != Rect2()) {
-			_enter_grid(&e, p_aabb, e._static);
+			e.collision_mask = e.owner->get_collision_mask();
+			e.collision_layer = e.owner->get_collision_layer();
+
+			_enter_grid(&e, p_aabb, e._static, layer_changed);
 		}
 		if (e.aabb != Rect2()) {
-			_exit_grid(&e, e.aabb, e._static);
+			// Need _exit_grid to remove from cells based on the old layer values.
+			e.collision_mask = old_mask;
+			e.collision_layer = old_layer;
+
+			_exit_grid(&e, e.aabb, e._static, layer_changed);
+
+			e.collision_mask = e.owner->get_collision_mask();
+			e.collision_layer = e.owner->get_collision_layer();
 		}
 		e.aabb = p_aabb;
 	}
@@ -344,13 +347,13 @@ void BroadPhase2DHashGrid::set_static(ID p_id, bool p_static) {
 	}
 
 	if (e.aabb != Rect2()) {
-		_exit_grid(&e, e.aabb, e._static);
+		_exit_grid(&e, e.aabb, e._static, false);
 	}
 
 	e._static = p_static;
 
 	if (e.aabb != Rect2()) {
-		_enter_grid(&e, e.aabb, e._static);
+		_enter_grid(&e, e.aabb, e._static, false);
 		_check_motion(&e);
 	}
 }
@@ -362,7 +365,7 @@ void BroadPhase2DHashGrid::remove(ID p_id) {
 	Element &e = E->get();
 
 	if (e.aabb != Rect2()) {
-		_exit_grid(&e, e.aabb, e._static);
+		_exit_grid(&e, e.aabb, e._static, false);
 	}
 
 	element_map.erase(p_id);
diff --git a/servers/physics_2d/broad_phase_2d_hash_grid.h b/servers/physics_2d/broad_phase_2d_hash_grid.h
index eb7c8879ac..bb7c03b989 100644
--- a/servers/physics_2d/broad_phase_2d_hash_grid.h
+++ b/servers/physics_2d/broad_phase_2d_hash_grid.h
@@ -51,6 +51,9 @@ class BroadPhase2DHashGrid : public BroadPhase2DSW {
 		CollisionObject2DSW *owner;
 		bool _static;
 		Rect2 aabb;
+		// Owner's collision_mask/layer, used to detect changes in layers.
+		uint32_t collision_mask;
+		uint32_t collision_layer;
 		int subindex;
 		uint64_t pass;
 		Map<Element *, PairData *> paired;
@@ -115,8 +118,12 @@ class BroadPhase2DHashGrid : public BroadPhase2DSW {
 	UnpairCallback unpair_callback;
 	void *unpair_userdata;
 
-	void _enter_grid(Element *p_elem, const Rect2 &p_rect, bool p_static);
-	void _exit_grid(Element *p_elem, const Rect2 &p_rect, bool p_static);
+	static _FORCE_INLINE_ bool _test_collision_mask(uint32_t p_mask1, uint32_t p_layer1, uint32_t p_mask2, uint32_t p_layer2) {
+		return ((p_mask1 & p_layer2) != 0) || ((p_mask2 & p_layer1) != 0); // True when either element's mask overlaps the other element's layer.
+	}
+
+	void _enter_grid(Element *p_elem, const Rect2 &p_rect, bool p_static, bool p_force_enter);
+	void _exit_grid(Element *p_elem, const Rect2 &p_rect, bool p_static, bool p_force_exit);
 	template <bool use_aabb, bool use_segment>
 	_FORCE_INLINE_ void _cull(const Point2i p_cell, const Rect2 &p_aabb, const Point2 &p_from, const Point2 &p_to, CollisionObject2DSW **p_results, int p_max_results, int *p_result_indices, int &index);
 
diff --git a/servers/physics_2d/constraint_2d_sw.h b/servers/physics_2d/constraint_2d_sw.h
index 49ae4dd848..b724deb48e 100644
--- a/servers/physics_2d/constraint_2d_sw.h
+++ b/servers/physics_2d/constraint_2d_sw.h
@@ -37,8 +37,6 @@ class Constraint2DSW {
 	Body2DSW **_body_ptr;
 	int _body_count;
 	uint64_t island_step;
-	Constraint2DSW *island_next;
-	Constraint2DSW *island_list_next;
 	bool disabled_collisions_between_bodies;
 
 	RID self;
@@ -58,12 +56,6 @@ public:
 	_FORCE_INLINE_ uint64_t get_island_step() const { return island_step; }
 	_FORCE_INLINE_ void set_island_step(uint64_t p_step) { island_step = p_step; }
 
-	_FORCE_INLINE_ Constraint2DSW *get_island_next() const { return island_next; }
-	_FORCE_INLINE_ void set_island_next(Constraint2DSW *p_next) { island_next = p_next; }
-
-	_FORCE_INLINE_ Constraint2DSW *get_island_list_next() const { return island_list_next; }
-	_FORCE_INLINE_ void set_island_list_next(Constraint2DSW *p_next) { island_list_next = p_next; }
-
 	_FORCE_INLINE_ Body2DSW **get_body_ptr() const { return _body_ptr; }
 	_FORCE_INLINE_ int get_body_count() const { return _body_count; }
 
diff --git a/servers/physics_2d/joints_2d_sw.cpp b/servers/physics_2d/joints_2d_sw.cpp
index c7b556deba..20d4b9aa1a 100644
--- a/servers/physics_2d/joints_2d_sw.cpp
+++ b/servers/physics_2d/joints_2d_sw.cpp
@@ -97,8 +97,13 @@ normal_relative_velocity(Body2DSW *a, Body2DSW *b, Vector2 rA, Vector2 rB, Vecto
 }
 
 bool PinJoint2DSW::setup(real_t p_step) {
+	if ((A->get_mode() <= PhysicsServer2D::BODY_MODE_KINEMATIC) && (!B || B->get_mode() <= PhysicsServer2D::BODY_MODE_KINEMATIC)) { // B may be null (see the `B ? ... : anchor_B` below); skip the joint when no attached body is dynamic.
+		return false;
+	}
+
 	Space2DSW *space = A->get_space();
 	ERR_FAIL_COND_V(!space, false);
+
 	rA = A->get_transform().basis_xform(anchor_A);
 	rB = B ? B->get_transform().basis_xform(anchor_B) : anchor_B;
 
@@ -257,6 +262,10 @@ mult_k(const Vector2 &vr, const Vector2 &k1, const Vector2 &k2) {
 }
 
 bool GrooveJoint2DSW::setup(real_t p_step) {
+	if ((A->get_mode() <= PhysicsServer2D::BODY_MODE_KINEMATIC) && (B->get_mode() <= PhysicsServer2D::BODY_MODE_KINEMATIC)) {
+		return false;
+	}
+
 	// calculate endpoints in worldspace
 	Vector2 ta = A->get_transform().xform(A_groove_1);
 	Vector2 tb = A->get_transform().xform(A_groove_2);
@@ -342,6 +351,10 @@ GrooveJoint2DSW::GrooveJoint2DSW(const Vector2 &p_a_groove1, const Vector2 &p_a_
 //////////////////////////////////////////////
 
 bool DampedSpringJoint2DSW::setup(real_t p_step) {
+	if ((A->get_mode() <= PhysicsServer2D::BODY_MODE_KINEMATIC) && (B->get_mode() <= PhysicsServer2D::BODY_MODE_KINEMATIC)) {
+		return false;
+	}
+
 	rA = A->get_transform().basis_xform(anchor_A);
 	rB = B->get_transform().basis_xform(anchor_B);
 
diff --git a/servers/physics_2d/physics_server_2d_sw.cpp b/servers/physics_2d/physics_server_2d_sw.cpp
index 1040437ca7..6d64f4126c 100644
--- a/servers/physics_2d/physics_server_2d_sw.cpp
+++ b/servers/physics_2d/physics_server_2d_sw.cpp
@@ -927,10 +927,10 @@ int PhysicsServer2DSW::body_get_max_contacts_reported(RID p_body) const {
 	return body->get_max_contacts_reported();
 }
 
-void PhysicsServer2DSW::body_set_force_integration_callback(RID p_body, Object *p_receiver, const StringName &p_method, const Variant &p_udata) {
+void PhysicsServer2DSW::body_set_force_integration_callback(RID p_body, const Callable &p_callable, const Variant &p_udata) {
 	Body2DSW *body = body_owner.getornull(p_body);
 	ERR_FAIL_COND(!body);
-	body->set_force_integration_callback(p_receiver ? p_receiver->get_instance_id() : ObjectID(), p_method, p_udata);
+	body->set_force_integration_callback(p_callable, p_udata);
 }
 
 bool PhysicsServer2DSW::body_collide_shape(RID p_body, int p_body_shape, RID p_shape, const Transform2D &p_shape_xform, const Vector2 &p_motion, Vector2 *r_results, int p_result_max, int &r_result_count) {
diff --git a/servers/physics_2d/physics_server_2d_sw.h b/servers/physics_2d/physics_server_2d_sw.h
index 65c5df0fce..efa0784245 100644
--- a/servers/physics_2d/physics_server_2d_sw.h
+++ b/servers/physics_2d/physics_server_2d_sw.h
@@ -242,7 +242,7 @@ public:
 	virtual void body_set_max_contacts_reported(RID p_body, int p_contacts) override;
 	virtual int body_get_max_contacts_reported(RID p_body) const override;
 
-	virtual void body_set_force_integration_callback(RID p_body, Object *p_receiver, const StringName &p_method, const Variant &p_udata = Variant()) override;
+	virtual void body_set_force_integration_callback(RID p_body, const Callable &p_callable, const Variant &p_udata = Variant()) override;
 	virtual bool body_collide_shape(RID p_body, int p_body_shape, RID p_shape, const Transform2D &p_shape_xform, const Vector2 &p_motion, Vector2 *r_results, int p_result_max, int &r_result_count) override;
 
 	virtual void body_set_pickable(RID p_body, bool p_pickable) override;
diff --git a/servers/physics_2d/physics_server_2d_wrap_mt.h b/servers/physics_2d/physics_server_2d_wrap_mt.h
index 3577f706de..88ac742e40 100644
--- a/servers/physics_2d/physics_server_2d_wrap_mt.h
+++ b/servers/physics_2d/physics_server_2d_wrap_mt.h
@@ -245,7 +245,7 @@ public:
 	FUNC2(body_set_omit_force_integration, RID, bool);
 	FUNC1RC(bool, body_is_omitting_force_integration, RID);
 
-	FUNC4(body_set_force_integration_callback, RID, Object *, const StringName &, const Variant &);
+	FUNC3(body_set_force_integration_callback, RID, const Callable &, const Variant &);
 
 	bool body_collide_shape(RID p_body, int p_body_shape, RID p_shape, const Transform2D &p_shape_xform, const Vector2 &p_motion, Vector2 *r_results, int p_result_max, int &r_result_count) override {
 		return physics_2d_server->body_collide_shape(p_body, p_body_shape, p_shape, p_shape_xform, p_motion, r_results, p_result_max, r_result_count);
diff --git a/servers/physics_2d/step_2d_sw.cpp b/servers/physics_2d/step_2d_sw.cpp
index 6613d19729..406d750776 100644
--- a/servers/physics_2d/step_2d_sw.cpp
+++ b/servers/physics_2d/step_2d_sw.cpp
@@ -31,19 +31,23 @@
 #include "step_2d_sw.h"
 #include "core/os/os.h"
 
-void Step2DSW::_populate_island(Body2DSW *p_body, Body2DSW **p_island, Constraint2DSW **p_constraint_island) {
+#define BODY_ISLAND_COUNT_RESERVE 128
+#define BODY_ISLAND_SIZE_RESERVE 512
+#define ISLAND_COUNT_RESERVE 128
+#define ISLAND_SIZE_RESERVE 512
+
+void Step2DSW::_populate_island(Body2DSW *p_body, LocalVector<Body2DSW *> &p_body_island, LocalVector<Constraint2DSW *> &p_constraint_island) {
 	p_body->set_island_step(_step);
-	p_body->set_island_next(*p_island);
-	*p_island = p_body;
+	p_body_island.push_back(p_body);
 
-	for (const List<Pair<Constraint2DSW *, int>>::Element *E = p_body->get_constraint_list().front(); E; E = E->next()) {
+	// Faster with reversed iterations.
+	for (const List<Pair<Constraint2DSW *, int>>::Element *E = p_body->get_constraint_list().back(); E; E = E->prev()) {
 		Constraint2DSW *c = (Constraint2DSW *)E->get().first;
 		if (c->get_island_step() == _step) {
 			continue; //already processed
 		}
 		c->set_island_step(_step);
-		c->set_island_next(*p_constraint_island);
-		*p_constraint_island = c;
+		p_constraint_island.push_back(c);
 
 		for (int i = 0; i < c->get_body_count(); i++) {
 			if (i == E->get().second) {
@@ -53,78 +57,62 @@ void Step2DSW::_populate_island(Body2DSW *p_body, Body2DSW **p_island, Constrain
 			if (b->get_island_step() == _step || b->get_mode() == PhysicsServer2D::BODY_MODE_STATIC || b->get_mode() == PhysicsServer2D::BODY_MODE_KINEMATIC) {
 				continue; //no go
 			}
-			_populate_island(c->get_body_ptr()[i], p_island, p_constraint_island);
+			_populate_island(c->get_body_ptr()[i], p_body_island, p_constraint_island);
 		}
 	}
 }
 
-bool Step2DSW::_setup_island(Constraint2DSW *p_island, real_t p_delta) {
-	Constraint2DSW *ci = p_island;
-	Constraint2DSW *prev_ci = nullptr;
-	bool removed_root = false;
-	while (ci) {
-		bool process = ci->setup(p_delta);
-
-		if (!process) {
-			//remove from island if process fails
-			if (prev_ci) {
-				prev_ci->set_island_next(ci->get_island_next());
-			} else {
-				removed_root = true;
-				prev_ci = ci;
-			}
-		} else {
-			prev_ci = ci;
+void Step2DSW::_setup_island(LocalVector<Constraint2DSW *> &p_constraint_island, real_t p_delta) {
+	// Compact in place: constraints whose setup() returns true move to the front, the rest are dropped.
+	const uint32_t total = p_constraint_island.size();
+	uint32_t kept = 0;
+	for (uint32_t i = 0; i < total; ++i) {
+		Constraint2DSW *constraint = p_constraint_island[i];
+		if (constraint->setup(p_delta)) {
+			p_constraint_island[kept++] = constraint;
 		}
-		ci = ci->get_island_next();
 	}
-
-	return removed_root;
+	p_constraint_island.resize(kept);
 }
 
-void Step2DSW::_solve_island(Constraint2DSW *p_island, int p_iterations, real_t p_delta) {
+void Step2DSW::_solve_island(LocalVector<Constraint2DSW *> &p_constraint_island, int p_iterations, real_t p_delta) {
 	for (int i = 0; i < p_iterations; i++) {
-		Constraint2DSW *ci = p_island;
-		while (ci) {
-			ci->solve(p_delta);
-			ci = ci->get_island_next();
+		uint32_t constraint_count = p_constraint_island.size(); // Read again at the start of every iteration.
+		for (uint32_t constraint_index = 0; constraint_index < constraint_count; ++constraint_index) {
+			p_constraint_island[constraint_index]->solve(p_delta); // Each iteration calls solve() once on every constraint of the island.
 		}
 	}
 }
 
-void Step2DSW::_check_suspend(Body2DSW *p_island, real_t p_delta) {
+void Step2DSW::_check_suspend(const LocalVector<Body2DSW *> &p_body_island, real_t p_delta) {
 	bool can_sleep = true;
 
-	Body2DSW *b = p_island;
-	while (b) {
-		if (b->get_mode() == PhysicsServer2D::BODY_MODE_STATIC || b->get_mode() == PhysicsServer2D::BODY_MODE_KINEMATIC) {
-			b = b->get_island_next();
-			continue; //ignore for static
+	uint32_t body_count = p_body_island.size(); // First pass: the island may sleep only if every non-static, non-kinematic body passes sleep_test().
+	for (uint32_t body_index = 0; body_index < body_count; ++body_index) {
+		Body2DSW *body = p_body_island[body_index];
+
+		if (body->get_mode() == PhysicsServer2D::BODY_MODE_STATIC || body->get_mode() == PhysicsServer2D::BODY_MODE_KINEMATIC) {
+			continue; // Static and kinematic bodies are not sleep-tested.
 		}
 
-		if (!b->sleep_test(p_delta)) {
+		if (!body->sleep_test(p_delta)) { // No early exit: sleep_test() is called on every remaining body.
 			can_sleep = false;
 		}
-
-		b = b->get_island_next();
 	}
 
-	//put all to sleep or wake up everyoen
+	// Second pass: put the whole island to sleep or wake all of it up, touching only bodies whose state must change.
+	for (uint32_t body_index = 0; body_index < body_count; ++body_index) {
+		Body2DSW *body = p_body_island[body_index];
 
-	b = p_island;
-	while (b) {
-		if (b->get_mode() == PhysicsServer2D::BODY_MODE_STATIC || b->get_mode() == PhysicsServer2D::BODY_MODE_KINEMATIC) {
-			b = b->get_island_next();
-			continue; //ignore for static
+		if (body->get_mode() == PhysicsServer2D::BODY_MODE_STATIC || body->get_mode() == PhysicsServer2D::BODY_MODE_KINEMATIC) {
+			continue; // Static and kinematic bodies keep their current state.
 		}
 
-		bool active = b->is_active();
+		bool active = body->is_active();
 
 		if (active == can_sleep) {
-			b->set_active(!can_sleep);
+			body->set_active(!can_sleep); // Deactivate when the island can sleep, activate otherwise.
 		}
-
-		b = b->get_island_next();
 	}
 }
 
@@ -159,33 +147,46 @@ void Step2DSW::step(Space2DSW *p_space, real_t p_delta, int p_iterations) {
 
 	/* GENERATE CONSTRAINT ISLANDS */
 
-	Body2DSW *island_list = nullptr;
-	Constraint2DSW *constraint_island_list = nullptr;
 	b = body_list->first();
 
-	int island_count = 0;
+	// Number of island slots in use for this step. The member vectors are grown on demand
+	// below and each slot is cleared before it is reused.
+	uint32_t body_island_count = 0;
+	uint32_t island_count = 0;
 
 	while (b) {
 		Body2DSW *body = b->self();
 
 		if (body->get_island_step() != _step) {
-			Body2DSW *island = nullptr;
-			Constraint2DSW *constraint_island = nullptr;
-			_populate_island(body, &island, &constraint_island);
+			++body_island_count;
+			if (body_islands.size() < body_island_count) {
+				body_islands.resize(body_island_count);
+			}
+			LocalVector<Body2DSW *> &body_island = body_islands[body_island_count - 1];
+			body_island.clear();
+			body_island.reserve(BODY_ISLAND_SIZE_RESERVE);
 
-			island->set_island_list_next(island_list);
-			island_list = island;
+			// Tentatively claim a constraint island slot; it is released below if it stays empty.
+			++island_count;
+			if (constraint_islands.size() < island_count) {
+				constraint_islands.resize(island_count);
+			}
+			LocalVector<Constraint2DSW *> &constraint_island = constraint_islands[island_count - 1];
+			constraint_island.clear();
+			constraint_island.reserve(ISLAND_SIZE_RESERVE);
 
-			if (constraint_island) {
-				constraint_island->set_island_list_next(constraint_island_list);
-				constraint_island_list = constraint_island;
-				island_count++;
+			// Fills both slots in place. body_island is already an element of body_islands, so it must
+			_populate_island(body, body_island, constraint_island);
+
+			if (constraint_island.is_empty()) {
+				// No constraint was collected for this body: give the slot back.
+				--island_count;
 			}
 		}
 		b = b->next();
 	}
 
-	p_space->set_island_count(island_count);
+	p_space->set_island_count((int)island_count);
 
 	const SelfList<Area2DSW>::List &aml = p_space->get_moved_area_list();
 
@@ -196,9 +194,13 @@ void Step2DSW::step(Space2DSW *p_space, real_t p_delta, int p_iterations) {
 				continue;
 			}
 			c->set_island_step(_step);
-			c->set_island_next(nullptr);
-			c->set_island_list_next(constraint_island_list);
-			constraint_island_list = c;
+			++island_count;
+			if (constraint_islands.size() < island_count) {
+				constraint_islands.resize(island_count);
+			}
+			LocalVector<Constraint2DSW *> &constraint_island = constraint_islands[island_count - 1];
+			constraint_island.clear();
+			constraint_island.push_back(c);
 		}
 		p_space->area_remove_from_moved_list((SelfList<Area2DSW> *)aml.first()); //faster to remove here
 	}
@@ -211,39 +213,8 @@ void Step2DSW::step(Space2DSW *p_space, real_t p_delta, int p_iterations) {
 
 	/* SETUP CONSTRAINT ISLANDS */
 
-	{
-		Constraint2DSW *ci = constraint_island_list;
-		Constraint2DSW *prev_ci = nullptr;
-		while (ci) {
-			if (_setup_island(ci, p_delta)) {
-				//removed the root from the island graph because it is not to be processed
-
-				Constraint2DSW *next = ci->get_island_next();
-
-				if (next) {
-					//root from list being deleted no longer exists, replace by next
-					next->set_island_list_next(ci->get_island_list_next());
-					if (prev_ci) {
-						prev_ci->set_island_list_next(next);
-					} else {
-						constraint_island_list = next;
-					}
-					prev_ci = next;
-				} else {
-					//list is empty, just skip
-					if (prev_ci) {
-						prev_ci->set_island_list_next(ci->get_island_list_next());
-
-					} else {
-						constraint_island_list = ci->get_island_list_next();
-					}
-				}
-			} else {
-				prev_ci = ci;
-			}
-
-			ci = ci->get_island_list_next();
-		}
+	for (uint32_t island_index = 0; island_index < island_count; ++island_index) {
+		_setup_island(constraint_islands[island_index], p_delta);
 	}
 
 	{ //profile
@@ -254,13 +225,8 @@ void Step2DSW::step(Space2DSW *p_space, real_t p_delta, int p_iterations) {
 
 	/* SOLVE CONSTRAINT ISLANDS */
 
-	{
-		Constraint2DSW *ci = constraint_island_list;
-		while (ci) {
-			//iterating each island separatedly improves cache efficiency
-			_solve_island(ci, p_iterations, p_delta);
-			ci = ci->get_island_list_next();
-		}
+	for (uint32_t island_index = 0; island_index < island_count; ++island_index) {
+		_solve_island(constraint_islands[island_index], p_iterations, p_delta);
 	}
 
 	{ //profile
@@ -280,12 +246,8 @@ void Step2DSW::step(Space2DSW *p_space, real_t p_delta, int p_iterations) {
 
 	/* SLEEP / WAKE UP ISLANDS */
 
-	{
-		Body2DSW *bi = island_list;
-		while (bi) {
-			_check_suspend(bi, p_delta);
-			bi = bi->get_island_list_next();
-		}
+	for (uint32_t island_index = 0; island_index < body_island_count; ++island_index) {
+		_check_suspend(body_islands[island_index], p_delta);
 	}
 
 	{ //profile
@@ -301,4 +263,7 @@ void Step2DSW::step(Space2DSW *p_space, real_t p_delta, int p_iterations) {
 
 Step2DSW::Step2DSW() {
 	_step = 1;
+
+	body_islands.reserve(BODY_ISLAND_COUNT_RESERVE); // Pre-allocate capacity for the body island list.
+	constraint_islands.reserve(ISLAND_COUNT_RESERVE); // Pre-allocate capacity for the constraint island list.
 }
diff --git a/servers/physics_2d/step_2d_sw.h b/servers/physics_2d/step_2d_sw.h
index 83b9130608..5af4a36f52 100644
--- a/servers/physics_2d/step_2d_sw.h
+++ b/servers/physics_2d/step_2d_sw.h
@@ -33,13 +33,18 @@
 
 #include "space_2d_sw.h"
 
+#include "core/templates/local_vector.h"
+
 class Step2DSW {
 	uint64_t _step;
 
-	void _populate_island(Body2DSW *p_body, Body2DSW **p_island, Constraint2DSW **p_constraint_island);
-	bool _setup_island(Constraint2DSW *p_island, real_t p_delta);
-	void _solve_island(Constraint2DSW *p_island, int p_iterations, real_t p_delta);
-	void _check_suspend(Body2DSW *p_island, real_t p_delta);
+	LocalVector<LocalVector<Body2DSW *>> body_islands;
+	LocalVector<LocalVector<Constraint2DSW *>> constraint_islands;
+
+	void _populate_island(Body2DSW *p_body, LocalVector<Body2DSW *> &p_body_island, LocalVector<Constraint2DSW *> &p_constraint_island);
+	void _setup_island(LocalVector<Constraint2DSW *> &p_constraint_island, real_t p_delta);
+	void _solve_island(LocalVector<Constraint2DSW *> &p_constraint_island, int p_iterations, real_t p_delta);
+	void _check_suspend(const LocalVector<Body2DSW *> &p_body_island, real_t p_delta);
 
 public:
 	void step(Space2DSW *p_space, real_t p_delta, int p_iterations);
diff --git a/servers/physics_3d/body_3d_sw.cpp b/servers/physics_3d/body_3d_sw.cpp
index 64ba0cb09d..d54345821d 100644
--- a/servers/physics_3d/body_3d_sw.cpp
+++ b/servers/physics_3d/body_3d_sw.cpp
@@ -693,15 +693,16 @@ void Body3DSW::call_queries() {
 
 		Variant v = dbs;
 
-		Object *obj = ObjectDB::get_instance(fi_callback->id);
+		Object *obj = fi_callback->callable.get_object();
 		if (!obj) {
-			set_force_integration_callback(ObjectID(), StringName());
+			set_force_integration_callback(Callable());
 		} else {
 			const Variant *vp[2] = { &v, &fi_callback->udata };
 
 			Callable::CallError ce;
 			int argc = (fi_callback->udata.get_type() == Variant::NIL) ? 1 : 2;
-			obj->call(fi_callback->method, vp, argc, ce);
+			Variant rv;
+			fi_callback->callable.call(vp, argc, rv, ce);
 		}
 	}
 }
@@ -725,16 +726,15 @@ bool Body3DSW::sleep_test(real_t p_step) {
 	}
 }
 
-void Body3DSW::set_force_integration_callback(ObjectID p_id, const StringName &p_method, const Variant &p_udata) {
+void Body3DSW::set_force_integration_callback(const Callable &p_callable, const Variant &p_udata) {
 	if (fi_callback) {
 		memdelete(fi_callback);
 		fi_callback = nullptr;
 	}
 
-	if (p_id.is_valid()) {
+	if (p_callable.get_object()) { // A Callable with no target object leaves the callback cleared.
 		fi_callback = memnew(ForceIntegrationCallback);
-		fi_callback->id = p_id;
-		fi_callback->method = p_method;
+		fi_callback->callable = p_callable; // Stored by value together with the user data below.
 		fi_callback->udata = p_udata;
 	}
 }
@@ -761,8 +761,6 @@ Body3DSW::Body3DSW() :
 	omit_force_integration = false;
 	//applied_torque=0;
 	island_step = 0;
-	island_next = nullptr;
-	island_list_next = nullptr;
 	first_time_kinematic = false;
 	first_integration = false;
 	_set_static(false);
diff --git a/servers/physics_3d/body_3d_sw.h b/servers/physics_3d/body_3d_sw.h
index e87ff2364b..9afb8cd56f 100644
--- a/servers/physics_3d/body_3d_sw.h
+++ b/servers/physics_3d/body_3d_sw.h
@@ -127,16 +127,13 @@ class Body3DSW : public CollisionObject3DSW {
 	int contact_count;
 
 	struct ForceIntegrationCallback {
-		ObjectID id;
-		StringName method;
+		Callable callable;
 		Variant udata;
 	};
 
 	ForceIntegrationCallback *fi_callback;
 
 	uint64_t island_step;
-	Body3DSW *island_next;
-	Body3DSW *island_list_next;
 
 	_FORCE_INLINE_ void _compute_area_gravity_and_dampenings(const Area3DSW *p_area);
 
@@ -145,7 +142,7 @@ class Body3DSW : public CollisionObject3DSW {
 	friend class PhysicsDirectBodyState3DSW; // i give up, too many functions to expose
 
 public:
-	void set_force_integration_callback(ObjectID p_id, const StringName &p_method, const Variant &p_udata = Variant());
+	void set_force_integration_callback(const Callable &p_callable, const Variant &p_udata = Variant());
 
 	void set_kinematic_margin(real_t p_margin);
 	_FORCE_INLINE_ real_t get_kinematic_margin() { return kinematic_safe_margin; }
@@ -189,12 +186,6 @@ public:
 	_FORCE_INLINE_ uint64_t get_island_step() const { return island_step; }
 	_FORCE_INLINE_ void set_island_step(uint64_t p_step) { island_step = p_step; }
 
-	_FORCE_INLINE_ Body3DSW *get_island_next() const { return island_next; }
-	_FORCE_INLINE_ void set_island_next(Body3DSW *p_next) { island_next = p_next; }
-
-	_FORCE_INLINE_ Body3DSW *get_island_list_next() const { return island_list_next; }
-	_FORCE_INLINE_ void set_island_list_next(Body3DSW *p_next) { island_list_next = p_next; }
-
 	_FORCE_INLINE_ void add_constraint(Constraint3DSW *p_constraint, int p_pos) { constraint_map[p_constraint] = p_pos; }
 	_FORCE_INLINE_ void remove_constraint(Constraint3DSW *p_constraint) { constraint_map.erase(p_constraint); }
 	const Map<Constraint3DSW *, int> &get_constraint_map() const { return constraint_map; }
diff --git a/servers/physics_3d/body_pair_3d_sw.cpp b/servers/physics_3d/body_pair_3d_sw.cpp
index 36114c0c91..28c854466f 100644
--- a/servers/physics_3d/body_pair_3d_sw.cpp
+++ b/servers/physics_3d/body_pair_3d_sw.cpp
@@ -281,6 +281,8 @@ bool BodyPair3DSW::setup(real_t p_step) {
 
 	real_t inv_dt = 1.0 / p_step;
 
+	bool do_process = false;
+
 	for (int i = 0; i < contact_count; i++) {
 		Contact &c = contacts[i];
 		c.active = false;
@@ -323,6 +325,7 @@ bool BodyPair3DSW::setup(real_t p_step) {
 		}
 
 		c.active = true;
+		do_process = true;
 
 		// Precompute normal mass, tangent mass, and bias.
 		Vector3 inertia_A = A->get_inv_inertia_tensor().xform(c.rA.cross(c.normal));
@@ -350,7 +353,7 @@ bool BodyPair3DSW::setup(real_t p_step) {
 		}
 	}
 
-	return true;
+	return do_process;
 }
 
 void BodyPair3DSW::solve(real_t p_step) {
@@ -594,6 +597,8 @@ bool BodySoftBodyPair3DSW::setup(real_t p_step) {
 
 	real_t inv_dt = 1.0 / p_step;
 
+	bool do_process = false;
+
 	uint32_t contact_count = contacts.size();
 	for (uint32_t contact_index = 0; contact_index < contact_count; ++contact_index) {
 		Contact &c = contacts[contact_index];
@@ -614,6 +619,7 @@ bool BodySoftBodyPair3DSW::setup(real_t p_step) {
 		}
 
 		c.active = true;
+		do_process = true;
 
 #ifdef DEBUG_ENABLED
 
@@ -645,7 +651,7 @@ bool BodySoftBodyPair3DSW::setup(real_t p_step) {
 		c.depth = depth;
 
 		Vector3 j_vec = c.normal * c.acc_normal_impulse + c.acc_tangent_impulse;
-		body->apply_impulse(c.rA + body->get_center_of_mass(), -j_vec);
+		body->apply_impulse(-j_vec, c.rA + body->get_center_of_mass());
 		soft_body->apply_node_impulse(c.index_B, j_vec);
 		c.acc_bias_impulse = 0;
 		c.acc_bias_impulse_center_of_mass = 0;
@@ -661,7 +667,7 @@ bool BodySoftBodyPair3DSW::setup(real_t p_step) {
 		}
 	}
 
-	return true;
+	return do_process;
 }
 
 void BodySoftBodyPair3DSW::solve(real_t p_step) {
@@ -691,7 +697,7 @@ void BodySoftBodyPair3DSW::solve(real_t p_step) {
 
 			Vector3 jb = c.normal * (c.acc_bias_impulse - jbnOld);
 
-			body->apply_bias_impulse(c.rA + body->get_center_of_mass(), -jb, MAX_BIAS_ROTATION / p_step);
+			body->apply_bias_impulse(-jb, c.rA + body->get_center_of_mass(), MAX_BIAS_ROTATION / p_step);
 			soft_body->apply_node_bias_impulse(c.index_B, jb);
 
 			crbA = body->get_biased_angular_velocity().cross(c.rA);
@@ -706,8 +712,8 @@ void BodySoftBodyPair3DSW::solve(real_t p_step) {
 
 				Vector3 jb_com = c.normal * (c.acc_bias_impulse_center_of_mass - jbnOld_com);
 
-				body->apply_bias_impulse(body->get_center_of_mass(), -jb_com, 0.0f);
-				soft_body->apply_node_bias_impulse(c.index_B, -jb_com);
+				body->apply_bias_impulse(-jb_com, body->get_center_of_mass(), 0.0f);
+				soft_body->apply_node_bias_impulse(c.index_B, jb_com);
 			}
 
 			c.active = true;
@@ -726,7 +732,7 @@ void BodySoftBodyPair3DSW::solve(real_t p_step) {
 
 			Vector3 j = c.normal * (c.acc_normal_impulse - jnOld);
 
-			body->apply_impulse(c.rA + body->get_center_of_mass(), -j);
+			body->apply_impulse(-j, c.rA + body->get_center_of_mass());
 			soft_body->apply_node_impulse(c.index_B, j);
 
 			c.active = true;
@@ -767,7 +773,7 @@ void BodySoftBodyPair3DSW::solve(real_t p_step) {
 
 			jt = c.acc_tangent_impulse - jtOld;
 
-			body->apply_impulse(c.rA + body->get_center_of_mass(), -jt);
+			body->apply_impulse(-jt, c.rA + body->get_center_of_mass());
 			soft_body->apply_node_impulse(c.index_B, jt);
 
 			c.active = true;
diff --git a/servers/physics_3d/constraint_3d_sw.h b/servers/physics_3d/constraint_3d_sw.h
index 2571335c43..16a31e167d 100644
--- a/servers/physics_3d/constraint_3d_sw.h
+++ b/servers/physics_3d/constraint_3d_sw.h
@@ -37,8 +37,6 @@ class Constraint3DSW {
 	Body3DSW **_body_ptr;
 	int _body_count;
 	uint64_t island_step;
-	Constraint3DSW *island_next;
-	Constraint3DSW *island_list_next;
 	int priority;
 	bool disabled_collisions_between_bodies;
 
@@ -60,12 +58,6 @@ public:
 	_FORCE_INLINE_ uint64_t get_island_step() const { return island_step; }
 	_FORCE_INLINE_ void set_island_step(uint64_t p_step) { island_step = p_step; }
 
-	_FORCE_INLINE_ Constraint3DSW *get_island_next() const { return island_next; }
-	_FORCE_INLINE_ void set_island_next(Constraint3DSW *p_next) { island_next = p_next; }
-
-	_FORCE_INLINE_ Constraint3DSW *get_island_list_next() const { return island_list_next; }
-	_FORCE_INLINE_ void set_island_list_next(Constraint3DSW *p_next) { island_list_next = p_next; }
-
 	_FORCE_INLINE_ Body3DSW **get_body_ptr() const { return _body_ptr; }
 	_FORCE_INLINE_ int get_body_count() const { return _body_count; }
 
diff --git a/servers/physics_3d/joints/cone_twist_joint_3d_sw.cpp b/servers/physics_3d/joints/cone_twist_joint_3d_sw.cpp
index 9c4493f4a2..167f797bfe 100644
--- a/servers/physics_3d/joints/cone_twist_joint_3d_sw.cpp
+++ b/servers/physics_3d/joints/cone_twist_joint_3d_sw.cpp
@@ -109,6 +109,10 @@ ConeTwistJoint3DSW::ConeTwistJoint3DSW(Body3DSW *rbA, Body3DSW *rbB, const Trans
 }
 
 bool ConeTwistJoint3DSW::setup(real_t p_timestep) {
+	if ((A->get_mode() <= PhysicsServer3D::BODY_MODE_KINEMATIC) && (B->get_mode() <= PhysicsServer3D::BODY_MODE_KINEMATIC)) {
+		return false;
+	}
+
 	m_appliedImpulse = real_t(0.);
 
 	//set bias, sign, clear accumulator
diff --git a/servers/physics_3d/joints/generic_6dof_joint_3d_sw.cpp b/servers/physics_3d/joints/generic_6dof_joint_3d_sw.cpp
index 13b389251f..a86e8b4e76 100644
--- a/servers/physics_3d/joints/generic_6dof_joint_3d_sw.cpp
+++ b/servers/physics_3d/joints/generic_6dof_joint_3d_sw.cpp
@@ -303,6 +303,10 @@ bool Generic6DOFJoint3DSW::testAngularLimitMotor(int axis_index) {
 }
 
 bool Generic6DOFJoint3DSW::setup(real_t p_timestep) {
+	if ((A->get_mode() <= PhysicsServer3D::BODY_MODE_KINEMATIC) && (B->get_mode() <= PhysicsServer3D::BODY_MODE_KINEMATIC)) {
+		return false;
+	}
+
 	// Clear accumulated impulses for the next simulation step
 	m_linearLimits.m_accumulatedImpulse = Vector3(real_t(0.), real_t(0.), real_t(0.));
 	int i;
diff --git a/servers/physics_3d/joints/hinge_joint_3d_sw.cpp b/servers/physics_3d/joints/hinge_joint_3d_sw.cpp
index 2b9f0038b4..90b82f4680 100644
--- a/servers/physics_3d/joints/hinge_joint_3d_sw.cpp
+++ b/servers/physics_3d/joints/hinge_joint_3d_sw.cpp
@@ -155,6 +155,10 @@ HingeJoint3DSW::HingeJoint3DSW(Body3DSW *rbA, Body3DSW *rbB, const Vector3 &pivo
 }
 
 bool HingeJoint3DSW::setup(real_t p_step) {
+	if ((A->get_mode() <= PhysicsServer3D::BODY_MODE_KINEMATIC) && (B->get_mode() <= PhysicsServer3D::BODY_MODE_KINEMATIC)) {
+		return false;
+	}
+
 	m_appliedImpulse = real_t(0.);
 
 	if (!m_angularOnly) {
diff --git a/servers/physics_3d/joints/pin_joint_3d_sw.cpp b/servers/physics_3d/joints/pin_joint_3d_sw.cpp
index 9f708ce151..75d87992d1 100644
--- a/servers/physics_3d/joints/pin_joint_3d_sw.cpp
+++ b/servers/physics_3d/joints/pin_joint_3d_sw.cpp
@@ -50,6 +50,10 @@ subject to the following restrictions:
 #include "pin_joint_3d_sw.h"
 
 bool PinJoint3DSW::setup(real_t p_step) {
+	if ((A->get_mode() <= PhysicsServer3D::BODY_MODE_KINEMATIC) && (B->get_mode() <= PhysicsServer3D::BODY_MODE_KINEMATIC)) {
+		return false;
+	}
+
 	m_appliedImpulse = real_t(0.);
 
 	Vector3 normal(0, 0, 0);
diff --git a/servers/physics_3d/joints/slider_joint_3d_sw.cpp b/servers/physics_3d/joints/slider_joint_3d_sw.cpp
index 0adc471797..2e1ee8e770 100644
--- a/servers/physics_3d/joints/slider_joint_3d_sw.cpp
+++ b/servers/physics_3d/joints/slider_joint_3d_sw.cpp
@@ -127,6 +127,10 @@ SliderJoint3DSW::SliderJoint3DSW(Body3DSW *rbA, Body3DSW *rbB, const Transform &
 //-----------------------------------------------------------------------------
 
 bool SliderJoint3DSW::setup(real_t p_step) {
+	if ((A->get_mode() <= PhysicsServer3D::BODY_MODE_KINEMATIC) && (B->get_mode() <= PhysicsServer3D::BODY_MODE_KINEMATIC)) {
+		return false;
+	}
+
 	//calculate transforms
 	m_calculatedTransformA = A->get_transform() * m_frameInA;
 	m_calculatedTransformB = B->get_transform() * m_frameInB;
diff --git a/servers/physics_3d/physics_server_3d_sw.cpp b/servers/physics_3d/physics_server_3d_sw.cpp
index 3d0063b0fa..c08e2b5794 100644
--- a/servers/physics_3d/physics_server_3d_sw.cpp
+++ b/servers/physics_3d/physics_server_3d_sw.cpp
@@ -857,10 +857,10 @@ int PhysicsServer3DSW::body_get_max_contacts_reported(RID p_body) const {
 	return body->get_max_contacts_reported();
 }
 
-void PhysicsServer3DSW::body_set_force_integration_callback(RID p_body, Object *p_receiver, const StringName &p_method, const Variant &p_udata) {
+void PhysicsServer3DSW::body_set_force_integration_callback(RID p_body, const Callable &p_callable, const Variant &p_udata) {
 	Body3DSW *body = body_owner.getornull(p_body);
 	ERR_FAIL_COND(!body);
-	body->set_force_integration_callback(p_receiver ? p_receiver->get_instance_id() : ObjectID(), p_method, p_udata);
+	body->set_force_integration_callback(p_callable, p_udata); // Forward the Callable and user data to the body unchanged.
 }
 
 void PhysicsServer3DSW::body_set_ray_pickable(RID p_body, bool p_enable) {
diff --git a/servers/physics_3d/physics_server_3d_sw.h b/servers/physics_3d/physics_server_3d_sw.h
index f92652bfad..0b42f1d605 100644
--- a/servers/physics_3d/physics_server_3d_sw.h
+++ b/servers/physics_3d/physics_server_3d_sw.h
@@ -241,7 +241,7 @@ public:
 	virtual void body_set_max_contacts_reported(RID p_body, int p_contacts) override;
 	virtual int body_get_max_contacts_reported(RID p_body) const override;
 
-	virtual void body_set_force_integration_callback(RID p_body, Object *p_receiver, const StringName &p_method, const Variant &p_udata = Variant()) override;
+	virtual void body_set_force_integration_callback(RID p_body, const Callable &p_callable, const Variant &p_udata = Variant()) override;
 
 	virtual void body_set_ray_pickable(RID p_body, bool p_enable) override;
 
diff --git a/servers/physics_3d/physics_server_3d_wrap_mt.h b/servers/physics_3d/physics_server_3d_wrap_mt.h
index 49ae60db92..69d0fcf3ed 100644
--- a/servers/physics_3d/physics_server_3d_wrap_mt.h
+++ b/servers/physics_3d/physics_server_3d_wrap_mt.h
@@ -249,7 +249,7 @@ public:
 	FUNC2(body_set_omit_force_integration, RID, bool);
 	FUNC1RC(bool, body_is_omitting_force_integration, RID);
 
-	FUNC4(body_set_force_integration_callback, RID, Object *, const StringName &, const Variant &);
+	FUNC3(body_set_force_integration_callback, RID, const Callable &, const Variant &);
 
 	FUNC2(body_set_ray_pickable, RID, bool);
 
diff --git a/servers/physics_3d/shape_3d_sw.cpp b/servers/physics_3d/shape_3d_sw.cpp
index 4c14cb3162..ccd37ca742 100644
--- a/servers/physics_3d/shape_3d_sw.cpp
+++ b/servers/physics_3d/shape_3d_sw.cpp
@@ -30,10 +30,28 @@
 
 #include "shape_3d_sw.h"
 
+#include "core/io/image.h"
 #include "core/math/geometry_3d.h"
 #include "core/math/quick_hull.h"
 #include "core/templates/sort_array.h"
 
+// HeightMapShape3DSW is based on Bullet btHeightfieldTerrainShape.
+
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
 #define _EDGE_IS_VALID_SUPPORT_THRESHOLD 0.0002
 #define _FACE_IS_VALID_SUPPORT_THRESHOLD 0.9998
 
@@ -1617,7 +1635,7 @@ ConcavePolygonShape3DSW::ConcavePolygonShape3DSW() {
 
 /* HEIGHT MAP SHAPE */
 
-Vector<real_t> HeightMapShape3DSW::get_heights() const {
+Vector<float> HeightMapShape3DSW::get_heights() const { // Returns the stored height values.
 	return heights;
 }
 
@@ -1629,10 +1647,6 @@ int HeightMapShape3DSW::get_depth() const {
 	return depth;
 }
 
-real_t HeightMapShape3DSW::get_cell_size() const {
-	return cell_size;
-}
-
 void HeightMapShape3DSW::project_range(const Vector3 &p_normal, const Transform &p_transform, real_t &r_min, real_t &r_max) const {
 	//not very useful, but not very used either
 	p_transform.xform(get_aabb()).project_range_in_plane(Plane(p_normal, 0), r_min, r_max);
@@ -1643,7 +1657,198 @@ Vector3 HeightMapShape3DSW::get_support(const Vector3 &p_normal) const {
 	return get_aabb().get_support(p_normal);
 }
 
+struct _HeightmapSegmentCullParams {
+	Vector3 from; // Segment start, handed to FaceShape3DSW::intersect_segment().
+	Vector3 to; // Segment end, handed to FaceShape3DSW::intersect_segment().
+	Vector3 dir; // Normalized (to - from) as set by the caller; not read by the cull helpers in this file section.
+
+	Vector3 result; // Hit position, written when a face test succeeds.
+	Vector3 normal; // Hit normal, written when a face test succeeds.
+
+	const HeightMapShape3DSW *heightmap = nullptr; // Supplies the grid points through _get_point().
+	FaceShape3DSW *face = nullptr; // Scratch triangle that is refilled for every tested cell.
+};
+
+// Tests the segment in p_params against p_params.face; on a hit, records the hit point and normal in p_params.
+_FORCE_INLINE_ bool _heightmap_face_cull_segment(_HeightmapSegmentCullParams &p_params) {
+	Vector3 hit_point;
+	Vector3 hit_normal;
+	if (!p_params.face->intersect_segment(p_params.from, p_params.to, hit_point, hit_normal)) {
+		return false;
+	}
+	p_params.result = hit_point;
+	p_params.normal = hit_normal;
+	return true;
+}
+
+// Tests the segment against both triangles of grid cell (p_x, p_z), reusing p_params.face as scratch storage.
+_FORCE_INLINE_ bool _heightmap_cell_cull_segment(_HeightmapSegmentCullParams &p_params, int p_x, int p_z) {
+	const HeightMapShape3DSW *grid = p_params.heightmap;
+	FaceShape3DSW *tri = p_params.face;
+
+	// First triangle: grid points (x, z), (x + 1, z), (x, z + 1).
+	grid->_get_point(p_x, p_z, tri->vertex[0]);
+	grid->_get_point(p_x + 1, p_z, tri->vertex[1]);
+	grid->_get_point(p_x, p_z + 1, tri->vertex[2]);
+	tri->normal = Plane(tri->vertex[0], tri->vertex[1], tri->vertex[2]).normal;
+	if (_heightmap_face_cull_segment(p_params)) {
+		return true;
+	}
+
+	// Second triangle: (x + 1, z), (x + 1, z + 1), (x, z + 1); vertex[2] is kept from the first one.
+	tri->vertex[0] = tri->vertex[1];
+	grid->_get_point(p_x + 1, p_z + 1, tri->vertex[1]);
+	tri->normal = Plane(tri->vertex[0], tri->vertex[1], tri->vertex[2]).normal;
+	return _heightmap_face_cull_segment(p_params);
+}
+
 bool HeightMapShape3DSW::intersect_segment(const Vector3 &p_begin, const Vector3 &p_end, Vector3 &r_point, Vector3 &r_normal) const {
+	if (heights.is_empty()) {
+		return false;
+	}
+
+	Vector3 local_begin = p_begin + local_origin; // Offset copy; the cell indices below are derived from it.
+	Vector3 local_end = p_end + local_origin;
+
+	FaceShape3DSW face;
+	face.backface_collision = false;
+
+	_HeightmapSegmentCullParams params;
+	params.from = p_begin; // The face tests use the segment as given, without the local_origin offset.
+	params.to = p_end;
+	params.dir = (p_end - p_begin).normalized();
+	params.heightmap = this;
+	params.face = &face;
+
+	// Quantize the segment begin/end to integer cell coordinates.
+	int begin_x = floor(local_begin.x);
+	int begin_z = floor(local_begin.z);
+	int end_x = floor(local_end.x);
+	int end_z = floor(local_end.z);
+
+	if ((begin_x == end_x) && (begin_z == end_z)) {
+		// Simple case for rays that don't traverse the grid horizontally.
+		// Just perform a test on the given cell.
+		int x = CLAMP(begin_x, 0, width - 2);
+		int z = CLAMP(begin_z, 0, depth - 2);
+		if (_heightmap_cell_cull_segment(params, x, z)) {
+			r_point = params.result;
+			r_normal = params.normal;
+			return true;
+		}
+	} else {
+		// Perform grid query from projected ray.
+		Vector2 ray_dir_proj(local_end.x - local_begin.x, local_end.z - local_begin.z);
+		real_t ray_dist_proj = ray_dir_proj.length();
+
+		if (ray_dist_proj < CMP_EPSILON) {
+			ray_dir_proj = Vector2();
+		} else {
+			ray_dir_proj /= ray_dist_proj;
+		}
+
+		const int x_step = (ray_dir_proj.x > CMP_EPSILON) ? 1 : ((ray_dir_proj.x < -CMP_EPSILON) ? -1 : 0);
+		const int z_step = (ray_dir_proj.y > CMP_EPSILON) ? 1 : ((ray_dir_proj.y < -CMP_EPSILON) ? -1 : 0);
+
+		const real_t infinite = 1e20;
+		const real_t delta_x = (x_step != 0) ? 1.f / Math::abs(ray_dir_proj.x) : infinite;
+		const real_t delta_z = (z_step != 0) ? 1.f / Math::abs(ray_dir_proj.y) : infinite;
+
+		real_t cross_x; // Distance along the projected ray at which the next x-axis lane is crossed.
+		real_t cross_z; // Distance along the projected ray at which the next z-axis lane is crossed.
+
+		// X initialization.
+		if (x_step != 0) {
+			if (x_step == 1) {
+				cross_x = (ceil(local_begin.x) - local_begin.x) * delta_x;
+			} else {
+				cross_x = (local_begin.x - floor(local_begin.x)) * delta_x;
+			}
+		} else {
+			cross_x = infinite; // Will never cross on X.
+		}
+
+		// Z initialization.
+		if (z_step != 0) {
+			if (z_step == 1) {
+				cross_z = (ceil(local_begin.z) - local_begin.z) * delta_z;
+			} else {
+				cross_z = (local_begin.z - floor(local_begin.z)) * delta_z;
+			}
+		} else {
+			cross_z = infinite; // Will never cross on Z.
+		}
+
+		int x = floor(local_begin.x);
+		int z = floor(local_begin.z);
+
+		// Workaround cases where the ray starts at an integer position.
+		if (Math::abs(cross_x) < CMP_EPSILON) {
+			cross_x += delta_x;
+			// If going backwards, we should ignore the position we would get by the above flooring,
+			// because the ray is not heading in that direction.
+			if (x_step == -1) {
+				x -= 1;
+			}
+		}
+
+		if (Math::abs(cross_z) < CMP_EPSILON) {
+			cross_z += delta_z;
+			if (z_step == -1) {
+				z -= 1;
+			}
+		}
+
+		// Start inside the grid.
+		int x_start = CLAMP(x, 0, width - 2);
+		int z_start = CLAMP(z, 0, depth - 2);
+
+		// Adjust initial cross values.
+		cross_x += delta_x * x_step * (x_start - x);
+		cross_z += delta_z * z_step * (z_start - z);
+
+		x = x_start;
+		z = z_start;
+
+		if (_heightmap_cell_cull_segment(params, x, z)) {
+			r_point = params.result;
+			r_normal = params.normal;
+			return true;
+		}
+
+		real_t dist = 0.0; // Projected distance at the most recent lane crossing.
+		while (true) {
+			if (cross_x < cross_z) {
+				// X lane.
+				x += x_step;
+				// Assign before advancing the param,
+				// to be in sync with the initialization step.
+				dist = cross_x;
+				cross_x += delta_x;
+			} else {
+				// Z lane.
+				z += z_step;
+				dist = cross_z;
+				cross_z += delta_z;
+			}
+
+			// Stop when outside the grid.
+			if ((x < 0) || (z < 0) || (x >= width - 1) || (z >= depth - 1)) {
+				break;
+			}
+
+			if (_heightmap_cell_cull_segment(params, x, z)) {
+				r_point = params.result;
+				r_normal = params.normal;
+				return true;
+			}
+
+			if (dist > ray_dist_proj) { // The last crossing lies beyond the projected segment length.
+				break;
+			}
+		}
+	}
+
 	return false;
 }
 
@@ -1655,7 +1860,66 @@ Vector3 HeightMapShape3DSW::get_closest_point_to(const Vector3 &p_point) const {
 	return Vector3();
 }
 
+void HeightMapShape3DSW::_get_cell(const Vector3 &p_point, int &r_x, int &r_y, int &r_z) const {
+	const AABB &aabb = get_aabb();
+	// Quantizes p_point to integer cell indices after clamping it to the shape bounds offset by local_origin.
+	Vector3 pos_local = aabb.position + local_origin;
+	// Clamp every axis between its own lower corner and that corner plus its own extent.
+	Vector3 clamped_point(p_point);
+	clamped_point.x = CLAMP(p_point.x, pos_local.x, pos_local.x + aabb.size.x);
+	clamped_point.y = CLAMP(p_point.y, pos_local.y, pos_local.y + aabb.size.y);
+	clamped_point.z = CLAMP(p_point.z, pos_local.z, pos_local.z + aabb.size.z);
+	// Offsetting by 0.5 away from zero before the conversion to int rounds to the nearest index.
+	r_x = (clamped_point.x < 0.0) ? (clamped_point.x - 0.5) : (clamped_point.x + 0.5);
+	r_y = (clamped_point.y < 0.0) ? (clamped_point.y - 0.5) : (clamped_point.y + 0.5);
+	r_z = (clamped_point.z < 0.0) ? (clamped_point.z - 0.5) : (clamped_point.z + 0.5);
+}
+
 void HeightMapShape3DSW::cull(const AABB &p_local_aabb, Callback p_callback, void *p_userdata) const {
+	if (heights.is_empty()) {
+		return;
+	}
+	// Reports two triangles per grid cell near the query box, which is first offset by local_origin.
+	AABB shifted_aabb = p_local_aabb;
+	shifted_aabb.position += local_origin;
+
+	// Snap the two opposite corners of the box to grid cells.
+	int cell_lo[3];
+	int cell_hi[3];
+	_get_cell(shifted_aabb.position, cell_lo[0], cell_lo[1], cell_lo[2]);
+	_get_cell(shifted_aabb.position + shifted_aabb.size, cell_hi[0], cell_hi[1], cell_hi[2]);
+
+	// Widen the snapped range by one cell on every side,
+	// which catches a query box that falls between grid points.
+	for (int axis = 0; axis < 3; ++axis) {
+		cell_lo[axis] -= 1;
+		cell_hi[axis] += 1;
+	}
+
+	const int x_first = MAX(0, cell_lo[0]);
+	const int x_limit = MIN(width - 1, cell_hi[0]);
+	const int z_first = MAX(0, cell_lo[2]);
+	const int z_limit = MIN(depth - 1, cell_hi[2]);
+
+	FaceShape3DSW tri;
+	tri.backface_collision = true;
+
+	for (int cz = z_first; cz < z_limit; ++cz) {
+		for (int cx = x_first; cx < x_limit; ++cx) {
+			// Triangle over the corners (x, z), (x + 1, z) and (x, z + 1).
+			_get_point(cx, cz, tri.vertex[0]);
+			_get_point(cx + 1, cz, tri.vertex[1]);
+			_get_point(cx, cz + 1, tri.vertex[2]);
+			tri.normal = Plane(tri.vertex[0], tri.vertex[2], tri.vertex[1]).normal;
+			p_callback(p_userdata, &tri);
+
+			// Triangle over (x + 1, z), (x + 1, z + 1) and (x, z + 1); two corners are reused.
+			tri.vertex[0] = tri.vertex[1];
+			_get_point(cx + 1, cz + 1, tri.vertex[1]);
+			tri.normal = Plane(tri.vertex[0], tri.vertex[2], tri.vertex[1]).normal;
+			p_callback(p_userdata, &tri);
+		}
+	}
 }
 
 Vector3 HeightMapShape3DSW::get_moment_of_inertia(real_t p_mass) const {
@@ -1668,58 +1932,102 @@ Vector3 HeightMapShape3DSW::get_moment_of_inertia(real_t p_mass) const {
 			(p_mass / 3.0) * (extents.x * extents.x + extents.y * extents.y));
 }
 
-void HeightMapShape3DSW::_setup(Vector<real_t> p_heights, int p_width, int p_depth, real_t p_cell_size) {
+void HeightMapShape3DSW::_setup(const Vector<float> &p_heights, int p_width, int p_depth, real_t p_min_height, real_t p_max_height) {
 	heights = p_heights;
 	width = p_width;
 	depth = p_depth;
-	cell_size = p_cell_size;
-
-	const real_t *r = heights.ptr();
 
+	// Bounds cover p_width - 1 by p_depth - 1 units on X/Z and the given height range on Y.
 	AABB aabb;
+	aabb.position = Vector3(0.0, p_min_height, 0.0);
+	aabb.size = Vector3(p_width - 1, p_max_height - p_min_height, p_depth - 1);
 
-	for (int i = 0; i < depth; i++) {
-		for (int j = 0; j < width; j++) {
-			real_t h = r[i * width + j];
+	// The origin is the center of the bounds on X and Z; its Y component is reset to zero.
+	local_origin = aabb.position + 0.5 * aabb.size;
+	local_origin.y = 0.0;
 
-			Vector3 pos(j * cell_size, h, i * cell_size);
-			if (i == 0 || j == 0) {
-				aabb.position = pos;
-			} else {
-				aabb.expand_to(pos);
-			}
-		}
-	}
+	aabb.position -= local_origin; // Re-centers the bounds on X and Z; Y is unchanged because local_origin.y is zero.
 
 	configure(aabb);
 }
 
 void HeightMapShape3DSW::set_data(const Variant &p_data) {
 	ERR_FAIL_COND(p_data.get_type() != Variant::DICTIONARY);
+	// Required keys: "width", "depth", "heights". Optional pair: "min_height" and "max_height".
 	Dictionary d = p_data;
 	ERR_FAIL_COND(!d.has("width"));
 	ERR_FAIL_COND(!d.has("depth"));
-	ERR_FAIL_COND(!d.has("cell_size"));
 	ERR_FAIL_COND(!d.has("heights"));
 
 	int width = d["width"];
 	int depth = d["depth"];
-	real_t cell_size = d["cell_size"];
-	Vector<real_t> heights = d["heights"];
 
-	ERR_FAIL_COND(width <= 0);
-	ERR_FAIL_COND(depth <= 0);
-	ERR_FAIL_COND(cell_size <= CMP_EPSILON);
-	ERR_FAIL_COND(heights.size() != (width * depth));
-	_setup(heights, width, depth, cell_size);
+	ERR_FAIL_COND(width <= 0.0);
+	ERR_FAIL_COND(depth <= 0.0);
+
+	Variant heights_variant = d["heights"];
+	Vector<float> heights_buffer;
+	if (heights_variant.get_type() == Variant::PACKED_FLOAT32_ARRAY) {
+		// Ready-to-use heights can be passed.
+		heights_buffer = heights_variant;
+	} else if (heights_variant.get_type() == Variant::OBJECT) {
+		// If an image is passed, we have to convert it.
+		// This would be expensive to do with a script, so it's nice to have it here.
+		Ref<Image> image = heights_variant;
+		ERR_FAIL_COND(image.is_null());
+		ERR_FAIL_COND(image->get_format() != Image::FORMAT_RF);
+
+		PackedByteArray im_data = image->get_data();
+		heights_buffer.resize(image->get_width() * image->get_height());
+
+		float *w = heights_buffer.ptrw();
+		const float *rp = (const float *)im_data.ptr(); // Read-only view of the image bytes as floats.
+		for (int i = 0; i < heights_buffer.size(); ++i) {
+			w[i] = rp[i];
+		}
+	} else {
+		ERR_FAIL_MSG("Expected PackedFloat32Array or float Image.");
+	}
+
+	// Use the precomputed height range when both bounds are given, otherwise scan the new samples.
+	real_t min_height = 0.0; // Both bounds start at zero, so a scanned range always contains height 0.
+	real_t max_height = 0.0;
+	if (d.has("min_height") && d.has("max_height")) {
+		min_height = d["min_height"];
+		max_height = d["max_height"];
+	} else {
+		int heights_size = heights_buffer.size(); // Scan the incoming buffer, not the member that still holds the previous data.
+		for (int i = 0; i < heights_size; ++i) {
+			float h = heights_buffer[i];
+			if (h < min_height) {
+				min_height = h;
+			} else if (h > max_height) {
+				max_height = h;
+			}
+		}
+	}
+
+	ERR_FAIL_COND(min_height > max_height);
+
+	ERR_FAIL_COND(heights_buffer.size() != (width * depth));
+
+	// If specified, min and max height will be used as precomputed values.
+	_setup(heights_buffer, width, depth, min_height, max_height);
 }
 
 Variant HeightMapShape3DSW::get_data() const {
-	ERR_FAIL_V(Variant());
+	// Builds a Dictionary with the grid size, the height range read from the bounds, and the samples.
+	const AABB &bounds = get_aabb();
+	const real_t lowest = bounds.position.y;
+	const real_t highest = bounds.position.y + bounds.size.y;
+	Dictionary result;
+	result["width"] = width;
+	result["depth"] = depth;
+	result["min_height"] = lowest;
+	result["max_height"] = highest;
+	result["heights"] = heights;
+	return result;
 }
 
 HeightMapShape3DSW::HeightMapShape3DSW() {
-	width = 0;
-	depth = 0;
-	cell_size = 0;
 }
diff --git a/servers/physics_3d/shape_3d_sw.h b/servers/physics_3d/shape_3d_sw.h
index 988e76c699..4d2b6ffbed 100644
--- a/servers/physics_3d/shape_3d_sw.h
+++ b/servers/physics_3d/shape_3d_sw.h
@@ -81,7 +81,7 @@ public:
 
 	virtual PhysicsServer3D::ShapeType get_type() const = 0;
 
-	_FORCE_INLINE_ AABB get_aabb() const { return aabb; }
+	_FORCE_INLINE_ const AABB &get_aabb() const { return aabb; }
 	_FORCE_INLINE_ bool is_configured() const { return configured; }
 
 	virtual bool is_concave() const { return false; }
@@ -389,21 +389,29 @@ public:
 };
 
 struct HeightMapShape3DSW : public ConcaveShape3DSW {
-	Vector<real_t> heights;
-	int width;
-	int depth;
-	real_t cell_size;
+	Vector<float> heights;
+	int width = 0;
+	int depth = 0;
+	Vector3 local_origin;
 
-	//void _cull_segment(int p_idx,_SegmentCullParams *p_params) const;
-	//void _cull(int p_idx,_CullParams *p_params) const;
+	_FORCE_INLINE_ float _get_height(int p_x, int p_z) const {
+		return heights[(p_z * width) + p_x];
+	}
+
+	_FORCE_INLINE_ void _get_point(int p_x, int p_z, Vector3 &r_point) const {
+		r_point.x = p_x - 0.5 * (width - 1.0);
+		r_point.y = _get_height(p_x, p_z);
+		r_point.z = p_z - 0.5 * (depth - 1.0);
+	}
+
+	void _get_cell(const Vector3 &p_point, int &r_x, int &r_y, int &r_z) const;
 
-	void _setup(Vector<real_t> p_heights, int p_width, int p_depth, real_t p_cell_size);
+	void _setup(const Vector<float> &p_heights, int p_width, int p_depth, real_t p_min_height, real_t p_max_height);
 
 public:
-	Vector<real_t> get_heights() const;
+	Vector<float> get_heights() const;
 	int get_width() const;
 	int get_depth() const;
-	real_t get_cell_size() const;
 
 	virtual PhysicsServer3D::ShapeType get_type() const { return PhysicsServer3D::SHAPE_HEIGHTMAP; }
 
diff --git a/servers/physics_3d/step_3d_sw.cpp b/servers/physics_3d/step_3d_sw.cpp
index 2133a38670..06f3227eab 100644
--- a/servers/physics_3d/step_3d_sw.cpp
+++ b/servers/physics_3d/step_3d_sw.cpp
@@ -33,19 +33,23 @@
 
 #include "core/os/os.h"
 
-void Step3DSW::_populate_island(Body3DSW *p_body, Body3DSW **p_island, Constraint3DSW **p_constraint_island) {
+#define BODY_ISLAND_COUNT_RESERVE 128
+#define BODY_ISLAND_SIZE_RESERVE 512
+#define ISLAND_COUNT_RESERVE 128
+#define ISLAND_SIZE_RESERVE 512
+
+void Step3DSW::_populate_island(Body3DSW *p_body, LocalVector<Body3DSW *> &p_body_island, LocalVector<Constraint3DSW *> &p_constraint_island) {
 	p_body->set_island_step(_step);
-	p_body->set_island_next(*p_island);
-	*p_island = p_body;
+	p_body_island.push_back(p_body);
 
-	for (Map<Constraint3DSW *, int>::Element *E = p_body->get_constraint_map().front(); E; E = E->next()) {
+	// Faster with reversed iterations.
+	for (Map<Constraint3DSW *, int>::Element *E = p_body->get_constraint_map().back(); E; E = E->prev()) {
 		Constraint3DSW *c = (Constraint3DSW *)E->key();
 		if (c->get_island_step() == _step) {
 			continue; //already processed
 		}
 		c->set_island_step(_step);
-		c->set_island_next(*p_constraint_island);
-		*p_constraint_island = c;
+		p_constraint_island.push_back(c);
 
 		for (int i = 0; i < c->get_body_count(); i++) {
 			if (i == E->get()) {
@@ -55,87 +59,79 @@ void Step3DSW::_populate_island(Body3DSW *p_body, Body3DSW **p_island, Constrain
 			if (b->get_island_step() == _step || b->get_mode() == PhysicsServer3D::BODY_MODE_STATIC || b->get_mode() == PhysicsServer3D::BODY_MODE_KINEMATIC) {
 				continue; //no go
 			}
-			_populate_island(c->get_body_ptr()[i], p_island, p_constraint_island);
+			_populate_island(c->get_body_ptr()[i], p_body_island, p_constraint_island);
 		}
 	}
 }
 
-void Step3DSW::_setup_island(Constraint3DSW *p_island, real_t p_delta) {
-	Constraint3DSW *ci = p_island;
-	while (ci) {
-		ci->setup(p_delta);
-		//todo remove from island if process fails
-		ci = ci->get_island_next();
+void Step3DSW::_setup_island(LocalVector<Constraint3DSW *> &p_constraint_island, real_t p_delta) {
+	uint32_t constraint_count = p_constraint_island.size();
+	uint32_t valid_constraint_count = 0; // Write position for the in-place compaction below.
+	for (uint32_t constraint_index = 0; constraint_index < constraint_count; ++constraint_index) {
+		Constraint3DSW *constraint = p_constraint_island[constraint_index];
+		if (constraint->setup(p_delta)) {
+			// Keep this constraint for solving; constraints whose setup() returns false are dropped.
+			p_constraint_island[valid_constraint_count++] = constraint;
+		}
 	}
+	p_constraint_island.resize(valid_constraint_count); // Cut off the entries left behind the kept prefix.
 }
 
-void Step3DSW::_solve_island(Constraint3DSW *p_island, int p_iterations, real_t p_delta) {
-	int at_priority = 1;
+void Step3DSW::_solve_island(LocalVector<Constraint3DSW *> &p_constraint_island, int p_iterations, real_t p_delta) {
+	int current_priority = 1; // Threshold compared against get_priority() after each pass.
 
-	while (p_island) {
+	uint32_t constraint_count = p_constraint_island.size();
+	while (constraint_count > 0) {
 		for (int i = 0; i < p_iterations; i++) {
-			Constraint3DSW *ci = p_island;
-			while (ci) {
-				ci->solve(p_delta);
-				ci = ci->get_island_next();
+			// Solve each of the first constraint_count constraints once per iteration.
+			for (uint32_t constraint_index = 0; constraint_index < constraint_count; ++constraint_index) {
+				p_constraint_island[constraint_index]->solve(p_delta);
 			}
 		}
 
-		at_priority++;
-
-		{
-			Constraint3DSW *ci = p_island;
-			Constraint3DSW *prev = nullptr;
-			while (ci) {
-				if (ci->get_priority() < at_priority) {
-					if (prev) {
-						prev->set_island_next(ci->get_island_next()); //remove
-					} else {
-						p_island = ci->get_island_next();
-					}
-				} else {
-					prev = ci;
-				}
-
-				ci = ci->get_island_next();
+		// Raise the threshold, then move the constraints that still reach it to the front of the vector.
+		uint32_t priority_constraint_count = 0;
+		++current_priority;
+		for (uint32_t constraint_index = 0; constraint_index < constraint_count; ++constraint_index) {
+			Constraint3DSW *constraint = p_constraint_island[constraint_index];
+			if (constraint->get_priority() >= current_priority) {
+				// Keep this constraint for the next pass of the outer loop.
+				p_constraint_island[priority_constraint_count++] = constraint;
 			}
 		}
+		constraint_count = priority_constraint_count; // The vector is not resized; only this prefix is solved next.
 	}
 }
 
-void Step3DSW::_check_suspend(Body3DSW *p_island, real_t p_delta) {
+void Step3DSW::_check_suspend(const LocalVector<Body3DSW *> &p_body_island, real_t p_delta) {
 	bool can_sleep = true;
 
-	Body3DSW *b = p_island;
-	while (b) {
-		if (b->get_mode() == PhysicsServer3D::BODY_MODE_STATIC || b->get_mode() == PhysicsServer3D::BODY_MODE_KINEMATIC) {
-			b = b->get_island_next();
-			continue; //ignore for static
+	uint32_t body_count = p_body_island.size(); // First pass: can_sleep stays true only if no tested body fails sleep_test().
+	for (uint32_t body_index = 0; body_index < body_count; ++body_index) {
+		Body3DSW *body = p_body_island[body_index];
+
+		if (body->get_mode() == PhysicsServer3D::BODY_MODE_STATIC || body->get_mode() == PhysicsServer3D::BODY_MODE_KINEMATIC) {
+			continue; // Static and kinematic bodies do not take part in the sleep test.
 		}
 
-		if (!b->sleep_test(p_delta)) {
+		if (!body->sleep_test(p_delta)) {
 			can_sleep = false;
 		}
-
-		b = b->get_island_next();
 	}
 
-	//put all to sleep or wake up everyoen
+	// Second pass: apply the island-wide decision, putting all bodies to sleep or waking all of them up.
+	for (uint32_t body_index = 0; body_index < body_count; ++body_index) {
+		Body3DSW *body = p_body_island[body_index];
 
-	b = p_island;
-	while (b) {
-		if (b->get_mode() == PhysicsServer3D::BODY_MODE_STATIC || b->get_mode() == PhysicsServer3D::BODY_MODE_KINEMATIC) {
-			b = b->get_island_next();
-			continue; //ignore for static
+		if (body->get_mode() == PhysicsServer3D::BODY_MODE_STATIC || body->get_mode() == PhysicsServer3D::BODY_MODE_KINEMATIC) {
+			continue; // Static and kinematic bodies are left untouched.
 		}
 
-		bool active = b->is_active();
+		bool active = body->is_active();
 
 		if (active == can_sleep) {
-			b->set_active(!can_sleep);
+			body->set_active(!can_sleep); // Only reached when the current state disagrees with the decision.
 		}
-
-		b = b->get_island_next();
 	}
 }
 
@@ -181,33 +177,43 @@ void Step3DSW::step(Space3DSW *p_space, real_t p_delta, int p_iterations) {
 
 	/* GENERATE CONSTRAINT ISLANDS */
 
-	Body3DSW *island_list = nullptr;
-	Constraint3DSW *constraint_island_list = nullptr;
 	b = body_list->first();
 
-	int island_count = 0;
+	uint32_t body_island_count = 0;
+	uint32_t island_count = 0;
 
 	while (b) {
 		Body3DSW *body = b->self();
 
 		if (body->get_island_step() != _step) {
-			Body3DSW *island = nullptr;
-			Constraint3DSW *constraint_island = nullptr;
-			_populate_island(body, &island, &constraint_island);
+			++body_island_count;
+			if (body_islands.size() < body_island_count) {
+				body_islands.resize(body_island_count);
+			}
+			LocalVector<Body3DSW *> &body_island = body_islands[body_island_count - 1]; // Slot is filled in place.
+			body_island.clear();
+			body_island.reserve(BODY_ISLAND_SIZE_RESERVE);
 
-			island->set_island_list_next(island_list);
-			island_list = island;
+			++island_count;
+			if (constraint_islands.size() < island_count) {
+				constraint_islands.resize(island_count);
+			}
+			LocalVector<Constraint3DSW *> &constraint_island = constraint_islands[island_count - 1];
+			constraint_island.clear();
+			constraint_island.reserve(ISLAND_SIZE_RESERVE);
 
-			if (constraint_island) {
-				constraint_island->set_island_list_next(constraint_island_list);
-				constraint_island_list = constraint_island;
-				island_count++;
+			_populate_island(body, body_island, constraint_island);
+			// body_island refers to its slot inside body_islands, so the island is already stored there.
+			// Appending a copy would grow body_islands on every call with entries that are never read.
+
+			if (constraint_island.is_empty()) {
+				--island_count; // Give the unused constraint slot back so the next island takes it.
 			}
 		}
 		b = b->next();
 	}
 
-	p_space->set_island_count(island_count);
+	p_space->set_island_count((int)island_count);
 
 	const SelfList<Area3DSW>::List &aml = p_space->get_moved_area_list();
 
@@ -218,9 +224,13 @@ void Step3DSW::step(Space3DSW *p_space, real_t p_delta, int p_iterations) {
 				continue;
 			}
 			c->set_island_step(_step);
-			c->set_island_next(nullptr);
-			c->set_island_list_next(constraint_island_list);
-			constraint_island_list = c;
+			++island_count;
+			if (constraint_islands.size() < island_count) {
+				constraint_islands.resize(island_count);
+			}
+			LocalVector<Constraint3DSW *> &constraint_island = constraint_islands[island_count - 1];
+			constraint_island.clear();
+			constraint_island.push_back(c);
 		}
 		p_space->area_remove_from_moved_list((SelfList<Area3DSW> *)aml.first()); //faster to remove here
 	}
@@ -233,9 +243,13 @@ void Step3DSW::step(Space3DSW *p_space, real_t p_delta, int p_iterations) {
 				continue;
 			}
 			c->set_island_step(_step);
-			c->set_island_next(nullptr);
-			c->set_island_list_next(constraint_island_list);
-			constraint_island_list = c;
+			++island_count;
+			if (constraint_islands.size() < island_count) {
+				constraint_islands.resize(island_count);
+			}
+			LocalVector<Constraint3DSW *> &constraint_island = constraint_islands[island_count - 1];
+			constraint_island.clear();
+			constraint_island.push_back(c);
 		}
 		sb = sb->next();
 	}
@@ -248,12 +262,8 @@ void Step3DSW::step(Space3DSW *p_space, real_t p_delta, int p_iterations) {
 
 	/* SETUP CONSTRAINT ISLANDS */
 
-	{
-		Constraint3DSW *ci = constraint_island_list;
-		while (ci) {
-			_setup_island(ci, p_delta);
-			ci = ci->get_island_list_next();
-		}
+	for (uint32_t island_index = 0; island_index < island_count; ++island_index) {
+		_setup_island(constraint_islands[island_index], p_delta);
 	}
 
 	{ //profile
@@ -264,13 +274,10 @@ void Step3DSW::step(Space3DSW *p_space, real_t p_delta, int p_iterations) {
 
 	/* SOLVE CONSTRAINT ISLANDS */
 
-	{
-		Constraint3DSW *ci = constraint_island_list;
-		while (ci) {
-			//iterating each island separatedly improves cache efficiency
-			_solve_island(ci, p_iterations, p_delta);
-			ci = ci->get_island_list_next();
-		}
+	for (uint32_t island_index = 0; island_index < island_count; ++island_index) {
+		// Warning: _solve_island modifies the constraint islands for optimization purpose,
+		// their content is not reliable after these calls and shouldn't be used anymore.
+		_solve_island(constraint_islands[island_index], p_iterations, p_delta);
 	}
 
 	{ //profile
@@ -290,12 +297,8 @@ void Step3DSW::step(Space3DSW *p_space, real_t p_delta, int p_iterations) {
 
 	/* SLEEP / WAKE UP ISLANDS */
 
-	{
-		Body3DSW *bi = island_list;
-		while (bi) {
-			_check_suspend(bi, p_delta);
-			bi = bi->get_island_list_next();
-		}
+	for (uint32_t island_index = 0; island_index < body_island_count; ++island_index) {
+		_check_suspend(body_islands[island_index], p_delta);
 	}
 
 	/* UPDATE SOFT BODY CONSTRAINTS */
@@ -319,4 +322,7 @@ void Step3DSW::step(Space3DSW *p_space, real_t p_delta, int p_iterations) {
 
 Step3DSW::Step3DSW() {
 	_step = 1;
+	// Reserve capacity up front for the island containers that step() fills.
+	body_islands.reserve(BODY_ISLAND_COUNT_RESERVE);
+	constraint_islands.reserve(ISLAND_COUNT_RESERVE);
 }
diff --git a/servers/physics_3d/step_3d_sw.h b/servers/physics_3d/step_3d_sw.h
index 55c48ec0eb..f406c35c3a 100644
--- a/servers/physics_3d/step_3d_sw.h
+++ b/servers/physics_3d/step_3d_sw.h
@@ -33,13 +33,18 @@
 
 #include "space_3d_sw.h"
 
+#include "core/templates/local_vector.h"
+
 class Step3DSW {
 	uint64_t _step;
 
-	void _populate_island(Body3DSW *p_body, Body3DSW **p_island, Constraint3DSW **p_constraint_island);
-	void _setup_island(Constraint3DSW *p_island, real_t p_delta);
-	void _solve_island(Constraint3DSW *p_island, int p_iterations, real_t p_delta);
-	void _check_suspend(Body3DSW *p_island, real_t p_delta);
+	LocalVector<LocalVector<Body3DSW *>> body_islands;
+	LocalVector<LocalVector<Constraint3DSW *>> constraint_islands;
+
+	void _populate_island(Body3DSW *p_body, LocalVector<Body3DSW *> &p_body_island, LocalVector<Constraint3DSW *> &p_constraint_island);
+	void _setup_island(LocalVector<Constraint3DSW *> &p_constraint_island, real_t p_delta);
+	void _solve_island(LocalVector<Constraint3DSW *> &p_constraint_island, int p_iterations, real_t p_delta);
+	void _check_suspend(const LocalVector<Body3DSW *> &p_body_island, real_t p_delta);
 
 public:
 	void step(Space3DSW *p_space, real_t p_delta, int p_iterations);
diff --git a/servers/physics_server_2d.cpp b/servers/physics_server_2d.cpp
index 83ebc0c55b..384179f2c3 100644
--- a/servers/physics_server_2d.cpp
+++ b/servers/physics_server_2d.cpp
@@ -176,11 +176,11 @@ real_t PhysicsShapeQueryParameters2D::get_margin() const {
 	return margin;
 }
 
-void PhysicsShapeQueryParameters2D::set_collision_mask(int p_collision_mask) {
+void PhysicsShapeQueryParameters2D::set_collision_mask(uint32_t p_collision_mask) {
 	collision_mask = p_collision_mask;
 }
 
-int PhysicsShapeQueryParameters2D::get_collision_mask() const {
+uint32_t PhysicsShapeQueryParameters2D::get_collision_mask() const {
 	return collision_mask;
 }
 
@@ -647,7 +647,7 @@ void PhysicsServer2D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("body_set_omit_force_integration", "body", "enable"), &PhysicsServer2D::body_set_omit_force_integration);
 	ClassDB::bind_method(D_METHOD("body_is_omitting_force_integration", "body"), &PhysicsServer2D::body_is_omitting_force_integration);
 
-	ClassDB::bind_method(D_METHOD("body_set_force_integration_callback", "body", "receiver", "method", "userdata"), &PhysicsServer2D::body_set_force_integration_callback, DEFVAL(Variant()));
+	ClassDB::bind_method(D_METHOD("body_set_force_integration_callback", "body", "callable", "userdata"), &PhysicsServer2D::body_set_force_integration_callback, DEFVAL(Variant()));
 
 	ClassDB::bind_method(D_METHOD("body_test_motion", "body", "from", "motion", "infinite_inertia", "margin", "result"), &PhysicsServer2D::_body_test_motion, DEFVAL(0.08), DEFVAL(Variant()));
 
diff --git a/servers/physics_server_2d.h b/servers/physics_server_2d.h
index 28f22ce06b..a5cf3f3a46 100644
--- a/servers/physics_server_2d.h
+++ b/servers/physics_server_2d.h
@@ -128,8 +128,8 @@ public:
 	void set_margin(real_t p_margin);
 	real_t get_margin() const;
 
-	void set_collision_mask(int p_collision_mask);
-	int get_collision_mask() const;
+	void set_collision_mask(uint32_t p_collision_mask);
+	uint32_t get_collision_mask() const;
 
 	void set_collide_with_bodies(bool p_enable);
 	bool is_collide_with_bodies_enabled() const;
@@ -477,7 +477,7 @@ public:
 	virtual void body_set_omit_force_integration(RID p_body, bool p_omit) = 0;
 	virtual bool body_is_omitting_force_integration(RID p_body) const = 0;
 
-	virtual void body_set_force_integration_callback(RID p_body, Object *p_receiver, const StringName &p_method, const Variant &p_udata = Variant()) = 0;
+	virtual void body_set_force_integration_callback(RID p_body, const Callable &p_callable, const Variant &p_udata = Variant()) = 0;
 
 	virtual bool body_collide_shape(RID p_body, int p_body_shape, RID p_shape, const Transform2D &p_shape_xform, const Vector2 &p_motion, Vector2 *r_results, int p_result_max, int &r_result_count) = 0;
 
diff --git a/servers/physics_server_3d.cpp b/servers/physics_server_3d.cpp
index 586845de99..80a9bd4c0b 100644
--- a/servers/physics_server_3d.cpp
+++ b/servers/physics_server_3d.cpp
@@ -172,11 +172,11 @@ real_t PhysicsShapeQueryParameters3D::get_margin() const {
 	return margin;
 }
 
-void PhysicsShapeQueryParameters3D::set_collision_mask(int p_collision_mask) {
+void PhysicsShapeQueryParameters3D::set_collision_mask(uint32_t p_collision_mask) {
 	collision_mask = p_collision_mask;
 }
 
-int PhysicsShapeQueryParameters3D::get_collision_mask() const {
+uint32_t PhysicsShapeQueryParameters3D::get_collision_mask() const {
 	return collision_mask;
 }
 
@@ -550,7 +550,7 @@ void PhysicsServer3D::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("body_set_omit_force_integration", "body", "enable"), &PhysicsServer3D::body_set_omit_force_integration);
 	ClassDB::bind_method(D_METHOD("body_is_omitting_force_integration", "body"), &PhysicsServer3D::body_is_omitting_force_integration);
 
-	ClassDB::bind_method(D_METHOD("body_set_force_integration_callback", "body", "receiver", "method", "userdata"), &PhysicsServer3D::body_set_force_integration_callback, DEFVAL(Variant()));
+	ClassDB::bind_method(D_METHOD("body_set_force_integration_callback", "body", "callable", "userdata"), &PhysicsServer3D::body_set_force_integration_callback, DEFVAL(Variant()));
 
 	ClassDB::bind_method(D_METHOD("body_set_ray_pickable", "body", "enable"), &PhysicsServer3D::body_set_ray_pickable);
 
diff --git a/servers/physics_server_3d.h b/servers/physics_server_3d.h
index 69f5c1c0ad..c434109865 100644
--- a/servers/physics_server_3d.h
+++ b/servers/physics_server_3d.h
@@ -125,8 +125,8 @@ public:
 	void set_margin(real_t p_margin);
 	real_t get_margin() const;
 
-	void set_collision_mask(int p_collision_mask);
-	int get_collision_mask() const;
+	void set_collision_mask(uint32_t p_collision_mask);
+	uint32_t get_collision_mask() const;
 
 	void set_exclude(const Vector<RID> &p_exclude);
 	Vector<RID> get_exclude() const;
@@ -486,7 +486,7 @@ public:
 	virtual void body_set_omit_force_integration(RID p_body, bool p_omit) = 0;
 	virtual bool body_is_omitting_force_integration(RID p_body) const = 0;
 
-	virtual void body_set_force_integration_callback(RID p_body, Object *p_receiver, const StringName &p_method, const Variant &p_udata = Variant()) = 0;
+	virtual void body_set_force_integration_callback(RID p_body, const Callable &p_callable, const Variant &p_udata = Variant()) = 0;
 
 	virtual void body_set_ray_pickable(RID p_body, bool p_enable) = 0;
 
diff --git a/servers/rendering/renderer_rd/SCsub b/servers/rendering/renderer_rd/SCsub
index 9c95f538ac..64e613ab91 100644
--- a/servers/rendering/renderer_rd/SCsub
+++ b/servers/rendering/renderer_rd/SCsub
@@ -5,4 +5,5 @@ Import("env")
 env.add_source_files(env.servers_sources, "*.cpp")
 
 SConscript("forward_clustered/SCsub")
+SConscript("forward_mobile/SCsub")
 SConscript("shaders/SCsub")
diff --git a/servers/rendering/renderer_rd/cluster_builder_rd.cpp b/servers/rendering/renderer_rd/cluster_builder_rd.cpp
index 0fdd864d47..2669a73014 100644
--- a/servers/rendering/renderer_rd/cluster_builder_rd.cpp
+++ b/servers/rendering/renderer_rd/cluster_builder_rd.cpp
@@ -86,13 +86,13 @@ ClusterBuilderSharedDataRD::ClusterBuilderSharedDataRD() {
 
 		Vector<uint8_t> vertex_data;
 		vertex_data.resize(sizeof(float) * icosphere_vertex_count * 3);
-		copymem(vertex_data.ptrw(), icosphere_vertices, vertex_data.size());
+		memcpy(vertex_data.ptrw(), icosphere_vertices, vertex_data.size());
 
 		sphere_vertex_buffer = RD::get_singleton()->vertex_buffer_create(vertex_data.size(), vertex_data);
 
 		Vector<uint8_t> index_data;
 		index_data.resize(sizeof(uint32_t) * icosphere_triangle_count * 3);
-		copymem(index_data.ptrw(), icosphere_triangle_indices, index_data.size());
+		memcpy(index_data.ptrw(), icosphere_triangle_indices, index_data.size());
 
 		sphere_index_buffer = RD::get_singleton()->index_buffer_create(icosphere_triangle_count * 3, RD::INDEX_BUFFER_FORMAT_UINT32, index_data);
 
@@ -130,13 +130,13 @@ ClusterBuilderSharedDataRD::ClusterBuilderSharedDataRD() {
 
 		Vector<uint8_t> vertex_data;
 		vertex_data.resize(sizeof(float) * cone_vertex_count * 3);
-		copymem(vertex_data.ptrw(), cone_vertices, vertex_data.size());
+		memcpy(vertex_data.ptrw(), cone_vertices, vertex_data.size());
 
 		cone_vertex_buffer = RD::get_singleton()->vertex_buffer_create(vertex_data.size(), vertex_data);
 
 		Vector<uint8_t> index_data;
 		index_data.resize(sizeof(uint32_t) * cone_triangle_count * 3);
-		copymem(index_data.ptrw(), cone_triangle_indices, index_data.size());
+		memcpy(index_data.ptrw(), cone_triangle_indices, index_data.size());
 
 		cone_index_buffer = RD::get_singleton()->index_buffer_create(cone_triangle_count * 3, RD::INDEX_BUFFER_FORMAT_UINT32, index_data);
 
@@ -184,13 +184,13 @@ ClusterBuilderSharedDataRD::ClusterBuilderSharedDataRD() {
 
 		Vector<uint8_t> vertex_data;
 		vertex_data.resize(sizeof(float) * box_vertex_count * 3);
-		copymem(vertex_data.ptrw(), box_vertices, vertex_data.size());
+		memcpy(vertex_data.ptrw(), box_vertices, vertex_data.size());
 
 		box_vertex_buffer = RD::get_singleton()->vertex_buffer_create(vertex_data.size(), vertex_data);
 
 		Vector<uint8_t> index_data;
 		index_data.resize(sizeof(uint32_t) * box_triangle_count * 3);
-		copymem(index_data.ptrw(), box_triangle_indices, index_data.size());
+		memcpy(index_data.ptrw(), box_triangle_indices, index_data.size());
 
 		box_index_buffer = RD::get_singleton()->index_buffer_create(box_triangle_count * 3, RD::INDEX_BUFFER_FORMAT_UINT32, index_data);
 
diff --git a/servers/rendering/renderer_rd/effects_rd.cpp b/servers/rendering/renderer_rd/effects_rd.cpp
index bc304aedd8..563e08fdcb 100644
--- a/servers/rendering/renderer_rd/effects_rd.cpp
+++ b/servers/rendering/renderer_rd/effects_rd.cpp
@@ -226,7 +226,7 @@ RID EffectsRD::_get_compute_uniform_set_from_image_pair(RID p_texture1, RID p_te
 }
 
 void EffectsRD::copy_to_atlas_fb(RID p_source_rd_texture, RID p_dest_framebuffer, const Rect2 &p_uv_rect, RD::DrawListID p_draw_list, bool p_flip_y, bool p_panorama) {
-	zeromem(&copy_to_fb.push_constant, sizeof(CopyToFbPushConstant));
+	memset(&copy_to_fb.push_constant, 0, sizeof(CopyToFbPushConstant));
 
 	copy_to_fb.push_constant.use_section = true;
 	copy_to_fb.push_constant.section[0] = p_uv_rect.position.x;
@@ -247,7 +247,7 @@ void EffectsRD::copy_to_atlas_fb(RID p_source_rd_texture, RID p_dest_framebuffer
 }
 
 void EffectsRD::copy_to_fb_rect(RID p_source_rd_texture, RID p_dest_framebuffer, const Rect2i &p_rect, bool p_flip_y, bool p_force_luminance, bool p_alpha_to_zero, bool p_srgb, RID p_secondary) {
-	zeromem(&copy_to_fb.push_constant, sizeof(CopyToFbPushConstant));
+	memset(&copy_to_fb.push_constant, 0, sizeof(CopyToFbPushConstant));
 
 	if (p_flip_y) {
 		copy_to_fb.push_constant.flip_y = true;
@@ -275,7 +275,7 @@ void EffectsRD::copy_to_fb_rect(RID p_source_rd_texture, RID p_dest_framebuffer,
 }
 
 void EffectsRD::copy_to_rect(RID p_source_rd_texture, RID p_dest_texture, const Rect2i &p_rect, bool p_flip_y, bool p_force_luminance, bool p_all_source, bool p_8_bit_dst, bool p_alpha_to_one) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 	if (p_flip_y) {
 		copy.push_constant.flags |= COPY_FLAG_FLIP_Y;
 	}
@@ -309,7 +309,7 @@ void EffectsRD::copy_to_rect(RID p_source_rd_texture, RID p_dest_texture, const
 }
 
 void EffectsRD::copy_cubemap_to_panorama(RID p_source_cube, RID p_dest_panorama, const Size2i &p_panorama_size, float p_lod, bool p_is_array) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 
 	copy.push_constant.section[0] = 0;
 	copy.push_constant.section[1] = 0;
@@ -329,7 +329,7 @@ void EffectsRD::copy_cubemap_to_panorama(RID p_source_cube, RID p_dest_panorama,
 }
 
 void EffectsRD::copy_depth_to_rect_and_linearize(RID p_source_rd_texture, RID p_dest_texture, const Rect2i &p_rect, bool p_flip_y, float p_z_near, float p_z_far) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 	if (p_flip_y) {
 		copy.push_constant.flags |= COPY_FLAG_FLIP_Y;
 	}
@@ -353,7 +353,7 @@ void EffectsRD::copy_depth_to_rect_and_linearize(RID p_source_rd_texture, RID p_
 }
 
 void EffectsRD::copy_depth_to_rect(RID p_source_rd_texture, RID p_dest_texture, const Rect2i &p_rect, bool p_flip_y) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 	if (p_flip_y) {
 		copy.push_constant.flags |= COPY_FLAG_FLIP_Y;
 	}
@@ -375,7 +375,7 @@ void EffectsRD::copy_depth_to_rect(RID p_source_rd_texture, RID p_dest_texture,
 }
 
 void EffectsRD::set_color(RID p_dest_texture, const Color &p_color, const Rect2i &p_region, bool p_8bit_dst) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 
 	copy.push_constant.section[0] = 0;
 	copy.push_constant.section[1] = 0;
@@ -397,7 +397,7 @@ void EffectsRD::set_color(RID p_dest_texture, const Color &p_color, const Rect2i
 }
 
 void EffectsRD::gaussian_blur(RID p_source_rd_texture, RID p_texture, RID p_back_texture, const Rect2i &p_region, bool p_8bit_dst) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 
 	uint32_t base_flags = 0;
 	copy.push_constant.section[0] = p_region.position.x;
@@ -430,7 +430,7 @@ void EffectsRD::gaussian_blur(RID p_source_rd_texture, RID p_texture, RID p_back
 }
 
 void EffectsRD::gaussian_glow(RID p_source_rd_texture, RID p_back_texture, const Size2i &p_size, float p_strength, bool p_high_quality, bool p_first_pass, float p_luminance_cap, float p_exposure, float p_bloom, float p_hdr_bleed_treshold, float p_hdr_bleed_scale, RID p_auto_exposure, float p_auto_exposure_grey) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 
 	CopyMode copy_mode = p_first_pass && p_auto_exposure.is_valid() ? COPY_MODE_GAUSSIAN_GLOW_AUTO_EXPOSURE : COPY_MODE_GAUSSIAN_GLOW;
 	uint32_t base_flags = 0;
@@ -657,7 +657,7 @@ void EffectsRD::merge_specular(RID p_dest_framebuffer, RID p_specular, RID p_bas
 }
 
 void EffectsRD::make_mipmap(RID p_source_rd_texture, RID p_dest_texture, const Size2i &p_size) {
-	zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+	memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 
 	copy.push_constant.section[0] = 0;
 	copy.push_constant.section[1] = 0;
@@ -694,7 +694,7 @@ void EffectsRD::copy_cubemap_to_dp(RID p_source_rd_texture, RID p_dst_framebuffe
 }
 
 void EffectsRD::tonemapper(RID p_source_color, RID p_dst_framebuffer, const TonemapSettings &p_settings) {
-	zeromem(&tonemap.push_constant, sizeof(TonemapPushConstant));
+	memset(&tonemap.push_constant, 0, sizeof(TonemapPushConstant));
 
 	tonemap.push_constant.use_bcs = p_settings.use_bcs;
 	tonemap.push_constant.bcs[0] = p_settings.brightness;
@@ -1294,7 +1294,7 @@ void EffectsRD::roughness_limit(RID p_source_normal, RID p_roughness, const Size
 }
 
 void EffectsRD::cubemap_roughness(RID p_source_rd_texture, RID p_dest_framebuffer, uint32_t p_face_id, uint32_t p_sample_count, float p_roughness, float p_size) {
-	zeromem(&roughness.push_constant, sizeof(CubemapRoughnessPushConstant));
+	memset(&roughness.push_constant, 0, sizeof(CubemapRoughnessPushConstant));
 
 	roughness.push_constant.face_id = p_face_id > 9 ? 0 : p_face_id;
 	roughness.push_constant.roughness = p_roughness;
@@ -1368,7 +1368,7 @@ void EffectsRD::cubemap_filter(RID p_source_cubemap, Vector<RID> p_dest_cubemap,
 void EffectsRD::render_sky(RD::DrawListID p_list, float p_time, RID p_fb, RID p_samplers, RID p_fog, PipelineCacheRD *p_pipeline, RID p_uniform_set, RID p_texture_set, const CameraMatrix &p_camera, const Basis &p_orientation, float p_multiplier, const Vector3 &p_position) {
 	SkyPushConstant sky_push_constant;
 
-	zeromem(&sky_push_constant, sizeof(SkyPushConstant));
+	memset(&sky_push_constant, 0, sizeof(SkyPushConstant));
 
 	sky_push_constant.proj[0] = p_camera.matrix[2][0];
 	sky_push_constant.proj[1] = p_camera.matrix[0][0];
@@ -1510,7 +1510,7 @@ EffectsRD::EffectsRD() {
 		copy_modes.push_back("\n#define MODE_CUBEMAP_ARRAY_TO_PANORAMA\n");
 
 		copy.shader.initialize(copy_modes);
-		zeromem(&copy.push_constant, sizeof(CopyPushConstant));
+		memset(&copy.push_constant, 0, sizeof(CopyPushConstant));
 		copy.shader_version = copy.shader.version_create();
 
 		for (int i = 0; i < COPY_MODE_MAX; i++) {
diff --git a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp
index cdff3139eb..aadb7bac19 100644
--- a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp
+++ b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp
@@ -284,8 +284,6 @@ void RenderForwardClustered::_allocate_normal_roughness_texture(RenderBufferData
 		fb.push_back(rb->normal_roughness_buffer_msaa);
 		rb->depth_normal_roughness_fb = RD::get_singleton()->framebuffer_create(fb);
 	}
-
-	_render_buffers_clear_uniform_set(rb);
 }
 
 RendererSceneRenderRD::RenderBufferData *RenderForwardClustered::_create_render_buffer_data() {
@@ -466,6 +464,10 @@ void RenderForwardClustered::_render_list_template(RenderingDevice::DrawListID p
 		RD::get_singleton()->draw_list_set_push_constant(draw_list, &push_constant, sizeof(SceneState::PushConstant));
 
 		uint32_t instance_count = surf->owner->instance_count > 1 ? surf->owner->instance_count : element_info.repeat;
+		if (surf->flags & GeometryInstanceSurfaceDataCache::FLAG_USES_PARTICLE_TRAILS) {
+			instance_count /= surf->owner->trail_steps;
+		}
+
 		RD::get_singleton()->draw_list_draw(draw_list, index_array_rd.is_valid(), instance_count);
 		i += element_info.repeat - 1; //skip equal elements
 	}
@@ -941,7 +943,7 @@ void RenderForwardClustered::_fill_render_list(RenderListType p_render_list, con
 					uses_lightmap = true;
 				}
 
-			} else if (!low_end) {
+			} else {
 				if (p_using_opaque_gi) {
 					flags |= INSTANCE_DATA_FLAG_USE_GI_BUFFERS;
 				}
@@ -1126,6 +1128,7 @@ void RenderForwardClustered::_render_scene(RID p_render_buffer, const Transform
 	bool using_ssr = false;
 	bool using_sdfgi = false;
 	bool using_giprobe = false;
+	bool reverse_cull = false;
 
 	if (render_buffer) {
 		screen_size.x = render_buffer->width;
@@ -1133,7 +1136,7 @@ void RenderForwardClustered::_render_scene(RID p_render_buffer, const Transform
 
 		opaque_framebuffer = render_buffer->color_fb;
 
-		if (!low_end && p_gi_probes.size() > 0) {
+		if (p_gi_probes.size() > 0) {
 			using_giprobe = true;
 		}
 
@@ -1192,6 +1195,8 @@ void RenderForwardClustered::_render_scene(RID p_render_buffer, const Transform
 		if (storage->reflection_probe_is_interior(reflection_probe_instance_get_probe(p_reflection_probe))) {
 			p_environment = RID(); //no environment on interiors
 		}
+
+		reverse_cull = true; // for some reason our views are inverted
 	} else {
 		ERR_FAIL(); //bug?
 	}
@@ -1212,7 +1217,7 @@ void RenderForwardClustered::_render_scene(RID p_render_buffer, const Transform
 
 	RD::get_singleton()->draw_command_end_label();
 
-	bool using_sss = !low_end && render_buffer && scene_state.used_sss && sub_surface_scattering_get_quality() != RS::SUB_SURFACE_SCATTERING_QUALITY_DISABLED;
+	bool using_sss = render_buffer && scene_state.used_sss && sub_surface_scattering_get_quality() != RS::SUB_SURFACE_SCATTERING_QUALITY_DISABLED;
 
 	if (using_sss) {
 		using_separate_specular = true;
@@ -1296,7 +1301,7 @@ void RenderForwardClustered::_render_scene(RID p_render_buffer, const Transform
 
 	bool debug_giprobes = get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_GI_PROBE_ALBEDO || get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_GI_PROBE_LIGHTING || get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_GI_PROBE_EMISSION;
 	bool debug_sdfgi_probes = get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_SDFGI_PROBES;
-	bool depth_pre_pass = !low_end && depth_framebuffer.is_valid();
+	bool depth_pre_pass = depth_framebuffer.is_valid();
 
 	bool using_ssao = depth_pre_pass && p_render_buffer.is_valid() && p_environment.is_valid() && environment_is_ssao_enabled(p_environment);
 	bool continue_depth = false;
@@ -1321,7 +1326,7 @@ void RenderForwardClustered::_render_scene(RID p_render_buffer, const Transform
 		RID rp_uniform_set = _setup_render_pass_uniform_set(RENDER_LIST_OPAQUE, RID(), RID(), RID(), RID(), RID(), PagedArray<RID>(), PagedArray<RID>());
 
 		bool finish_depth = using_ssao || using_sdfgi || using_giprobe;
-		RenderListParameters render_list_params(render_list[RENDER_LIST_OPAQUE].elements.ptr(), render_list[RENDER_LIST_OPAQUE].element_info.ptr(), render_list[RENDER_LIST_OPAQUE].elements.size(), false, depth_pass_mode, render_buffer == nullptr, rp_uniform_set, get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_WIREFRAME, Vector2(), lod_camera_plane, lod_distance_multiplier, p_screen_lod_threshold);
+		RenderListParameters render_list_params(render_list[RENDER_LIST_OPAQUE].elements.ptr(), render_list[RENDER_LIST_OPAQUE].element_info.ptr(), render_list[RENDER_LIST_OPAQUE].elements.size(), reverse_cull, depth_pass_mode, render_buffer == nullptr, rp_uniform_set, get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_WIREFRAME, Vector2(), lod_camera_plane, lod_distance_multiplier, p_screen_lod_threshold);
 		_render_list_with_threads(&render_list_params, depth_framebuffer, needs_pre_resolve ? RD::INITIAL_ACTION_CONTINUE : RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_READ, needs_pre_resolve ? RD::INITIAL_ACTION_CONTINUE : RD::INITIAL_ACTION_CLEAR, finish_depth ? RD::FINAL_ACTION_READ : RD::FINAL_ACTION_CONTINUE, needs_pre_resolve ? Vector<Color>() : depth_pass_clear);
 
 		RD::get_singleton()->draw_command_end_label();
@@ -1379,7 +1384,7 @@ void RenderForwardClustered::_render_scene(RID p_render_buffer, const Transform
 		}
 
 		RID framebuffer = using_separate_specular ? opaque_specular_framebuffer : opaque_framebuffer;
-		RenderListParameters render_list_params(render_list[RENDER_LIST_OPAQUE].elements.ptr(), render_list[RENDER_LIST_OPAQUE].element_info.ptr(), render_list[RENDER_LIST_OPAQUE].elements.size(), false, using_separate_specular ? PASS_MODE_COLOR_SPECULAR : PASS_MODE_COLOR, render_buffer == nullptr, rp_uniform_set, get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_WIREFRAME, Vector2(), lod_camera_plane, lod_distance_multiplier, p_screen_lod_threshold);
+		RenderListParameters render_list_params(render_list[RENDER_LIST_OPAQUE].elements.ptr(), render_list[RENDER_LIST_OPAQUE].element_info.ptr(), render_list[RENDER_LIST_OPAQUE].elements.size(), reverse_cull, using_separate_specular ? PASS_MODE_COLOR_SPECULAR : PASS_MODE_COLOR, render_buffer == nullptr, rp_uniform_set, get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_WIREFRAME, Vector2(), lod_camera_plane, lod_distance_multiplier, p_screen_lod_threshold);
 		_render_list_with_threads(&render_list_params, framebuffer, keep_color ? RD::INITIAL_ACTION_KEEP : RD::INITIAL_ACTION_CLEAR, will_continue_color ? RD::FINAL_ACTION_CONTINUE : RD::FINAL_ACTION_READ, depth_pre_pass ? (continue_depth ? RD::INITIAL_ACTION_CONTINUE : RD::INITIAL_ACTION_KEEP) : RD::INITIAL_ACTION_CLEAR, will_continue_depth ? RD::FINAL_ACTION_CONTINUE : RD::FINAL_ACTION_READ, c, 1.0, 0);
 		if (will_continue_color && using_separate_specular) {
 			// close the specular framebuffer, as it's no longer used
@@ -1693,6 +1698,7 @@ void RenderForwardClustered::_render_uv2(const PagedArray<GeometryInstance *> &p
 			_render_list(draw_list, RD::get_singleton()->framebuffer_get_format(p_framebuffer), &render_list_params, 0, render_list_params.element_count); //first wireframe, for pseudo conservative
 		}
 		render_list_params.uv_offset = Vector2();
+		render_list_params.force_wireframe = false;
 		_render_list(draw_list, RD::get_singleton()->framebuffer_get_format(p_framebuffer), &render_list_params, 0, render_list_params.element_count); //second regular triangles
 
 		RD::get_singleton()->draw_list_end();
@@ -1903,7 +1909,7 @@ void RenderForwardClustered::_update_render_base_uniform_set() {
 			uniforms.push_back(u);
 		}
 
-		if (!low_end) {
+		{
 			RD::Uniform u;
 			u.uniform_type = RD::UNIFORM_TYPE_UNIFORM_BUFFER;
 			u.binding = 13;
@@ -2065,7 +2071,7 @@ RID RenderForwardClustered::_setup_render_pass_uniform_set(RenderListType p_rend
 		uniforms.push_back(u);
 	}
 
-	if (!low_end) {
+	{
 		{
 			RD::Uniform u;
 			u.binding = 11;
@@ -2298,15 +2304,6 @@ RID RenderForwardClustered::_setup_sdfgi_render_pass_uniform_set(RID p_albedo_te
 	return sdfgi_pass_uniform_set;
 }
 
-void RenderForwardClustered::_render_buffers_clear_uniform_set(RenderBufferDataForwardClustered *rb) {
-}
-
-void RenderForwardClustered::_render_buffers_uniform_set_changed(RID p_render_buffers) {
-	RenderBufferDataForwardClustered *rb = (RenderBufferDataForwardClustered *)render_buffers_get_data(p_render_buffers);
-
-	_render_buffers_clear_uniform_set(rb);
-}
-
 RID RenderForwardClustered::_render_buffers_get_normal_texture(RID p_render_buffers) {
 	RenderBufferDataForwardClustered *rb = (RenderBufferDataForwardClustered *)render_buffers_get_data(p_render_buffers);
 
@@ -2376,9 +2373,13 @@ void RenderForwardClustered::_geometry_instance_add_surface_with_material(Geomet
 		flags |= GeometryInstanceSurfaceDataCache::FLAG_PASS_SHADOW;
 	}
 
+	if (p_material->shader_data->uses_particle_trails) {
+		flags |= GeometryInstanceSurfaceDataCache::FLAG_USES_PARTICLE_TRAILS;
+	}
+
 	SceneShaderForwardClustered::MaterialData *material_shadow = nullptr;
 	void *surface_shadow = nullptr;
-	if (!p_material->shader_data->writes_modelview_or_projection && !p_material->shader_data->uses_vertex && !p_material->shader_data->uses_discard && !p_material->shader_data->uses_depth_pre_pass) {
+	if (!p_material->shader_data->uses_particle_trails && !p_material->shader_data->writes_modelview_or_projection && !p_material->shader_data->uses_vertex && !p_material->shader_data->uses_discard && !p_material->shader_data->uses_depth_pre_pass) {
 		flags |= GeometryInstanceSurfaceDataCache::FLAG_USES_SHARED_SHADOW_MATERIAL;
 		material_shadow = (SceneShaderForwardClustered::MaterialData *)storage->material_get_data(scene_shader.default_material, RendererStorageRD::SHADER_TYPE_3D);
 
@@ -2547,7 +2548,7 @@ void RenderForwardClustered::_geometry_instance_update(GeometryInstance *p_geome
 				}
 			}
 
-			ginstance->instance_count = storage->particles_get_amount(ginstance->data->base);
+			ginstance->instance_count = storage->particles_get_amount(ginstance->data->base, ginstance->trail_steps);
 
 		} break;
 
@@ -2561,42 +2562,26 @@ void RenderForwardClustered::_geometry_instance_update(GeometryInstance *p_geome
 
 	if (ginstance->data->base_type == RS::INSTANCE_MULTIMESH) {
 		ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH;
-		uint32_t stride;
 		if (storage->multimesh_get_transform_format(ginstance->data->base) == RS::MULTIMESH_TRANSFORM_2D) {
 			ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH_FORMAT_2D;
-			stride = 2;
-		} else {
-			stride = 3;
 		}
 		if (storage->multimesh_uses_colors(ginstance->data->base)) {
 			ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH_HAS_COLOR;
-			stride += 1;
 		}
 		if (storage->multimesh_uses_custom_data(ginstance->data->base)) {
 			ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH_HAS_CUSTOM_DATA;
-			stride += 1;
 		}
 
-		ginstance->base_flags |= (stride << INSTANCE_DATA_FLAGS_MULTIMESH_STRIDE_SHIFT);
 		ginstance->transforms_uniform_set = storage->multimesh_get_3d_uniform_set(ginstance->data->base, scene_shader.default_shader_rd, TRANSFORMS_UNIFORM_SET);
 
 	} else if (ginstance->data->base_type == RS::INSTANCE_PARTICLES) {
 		ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH;
-		uint32_t stride;
-		if (false) { // 2D particles
-			ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH_FORMAT_2D;
-			stride = 2;
-		} else {
-			stride = 3;
-		}
 
 		ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH_HAS_COLOR;
-		stride += 1;
-
 		ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH_HAS_CUSTOM_DATA;
-		stride += 1;
 
-		ginstance->base_flags |= (stride << INSTANCE_DATA_FLAGS_MULTIMESH_STRIDE_SHIFT);
+		// For particles, the flag bits formerly used for the multimesh stride now store the number of trail steps.
+		ginstance->base_flags |= (ginstance->trail_steps << INSTANCE_DATA_FLAGS_PARTICLE_TRAIL_SHIFT);
 
 		if (!storage->particles_is_using_local_coords(ginstance->data->base)) {
 			store_transform = false;
@@ -2605,7 +2590,6 @@ void RenderForwardClustered::_geometry_instance_update(GeometryInstance *p_geome
 
 	} else if (ginstance->data->base_type == RS::INSTANCE_MESH) {
 		if (storage->skeleton_is_valid(ginstance->data->skeleton)) {
-			ginstance->base_flags |= INSTANCE_DATA_FLAG_SKELETON;
 			ginstance->transforms_uniform_set = storage->skeleton_get_3d_uniform_set(ginstance->data->skeleton, scene_shader.default_shader_rd, TRANSFORMS_UNIFORM_SET);
 			if (ginstance->data->dirty_dependencies) {
 				storage->skeleton_update_dependency(ginstance->data->skeleton, &ginstance->data->dependency_tracker);
@@ -2616,7 +2600,7 @@ void RenderForwardClustered::_geometry_instance_update(GeometryInstance *p_geome
 	ginstance->store_transform_cache = store_transform;
 	ginstance->can_sdfgi = false;
 
-	if (!lightmap_instance_is_valid(ginstance->lightmap_instance) && !low_end) {
+	if (!lightmap_instance_is_valid(ginstance->lightmap_instance)) {
 		if (ginstance->gi_probes[0].is_null() && (ginstance->data->use_baked_light || ginstance->data->use_dynamic_gi)) {
 			ginstance->can_sdfgi = true;
 		}
@@ -2640,6 +2624,7 @@ void RenderForwardClustered::_geometry_instance_dependency_changed(RendererStora
 	switch (p_notification) {
 		case RendererStorage::DEPENDENCY_CHANGED_MATERIAL:
 		case RendererStorage::DEPENDENCY_CHANGED_MESH:
+		case RendererStorage::DEPENDENCY_CHANGED_PARTICLES:
 		case RendererStorage::DEPENDENCY_CHANGED_MULTIMESH:
 		case RendererStorage::DEPENDENCY_CHANGED_SKELETON_DATA: {
 			static_cast<RenderForwardClustered *>(singleton)->_geometry_instance_mark_dirty(static_cast<GeometryInstance *>(p_tracker->userdata));
@@ -2753,7 +2738,7 @@ void RenderForwardClustered::geometry_instance_set_lightmap_capture(GeometryInst
 			ginstance->lightmap_sh = geometry_instance_lightmap_sh.alloc();
 		}
 
-		copymem(ginstance->lightmap_sh->sh, p_sh9, sizeof(Color) * 9);
+		memcpy(ginstance->lightmap_sh->sh, p_sh9, sizeof(Color) * 9);
 	} else {
 		if (ginstance->lightmap_sh != nullptr) {
 			geometry_instance_lightmap_sh.free(ginstance->lightmap_sh);
@@ -2843,10 +2828,6 @@ RenderForwardClustered::RenderForwardClustered(RendererStorageRD *p_storage) :
 
 	{
 		String defines;
-		if (low_end) {
-			defines += "\n#define LOW_END_MODE \n";
-		}
-
 		defines += "\n#define MAX_ROUGHNESS_LOD " + itos(get_roughness_layers() - 1) + ".0\n";
 		if (is_using_radiance_cubemap_array()) {
 			defines += "\n#define USE_RADIANCE_CUBEMAP_ARRAY \n";
@@ -2856,7 +2837,7 @@ RenderForwardClustered::RenderForwardClustered(RendererStorageRD *p_storage) :
 
 		{
 			//lightmaps
-			scene_state.max_lightmaps = low_end ? 2 : MAX_LIGHTMAPS;
+			scene_state.max_lightmaps = MAX_LIGHTMAPS;
 			defines += "\n#define MAX_LIGHTMAP_TEXTURES " + itos(scene_state.max_lightmaps) + "\n";
 			defines += "\n#define MAX_LIGHTMAPS " + itos(scene_state.max_lightmaps) + "\n";
 
@@ -2872,7 +2853,7 @@ RenderForwardClustered::RenderForwardClustered(RendererStorageRD *p_storage) :
 			defines += "\n#define MATERIAL_UNIFORM_SET " + itos(MATERIAL_UNIFORM_SET) + "\n";
 		}
 
-		scene_shader.init(p_storage, defines, low_end);
+		scene_shader.init(p_storage, defines);
 	}
 
 	render_list_thread_threshold = GLOBAL_GET("rendering/limits/forward_renderer/threaded_render_minimum_instances");
diff --git a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h
index 72e84a6f24..4b998a9e76 100644
--- a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h
+++ b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h
@@ -118,8 +118,6 @@ class RenderForwardClustered : public RendererSceneRenderRD {
 	uint64_t lightmap_texture_array_version = 0xFFFFFFFF;
 
 	virtual void _base_uniforms_changed();
-	void _render_buffers_clear_uniform_set(RenderBufferDataForwardClustered *rb);
-	virtual void _render_buffers_uniform_set_changed(RID p_render_buffers);
 	virtual RID _render_buffers_get_normal_texture(RID p_render_buffers);
 
 	void _update_render_base_uniform_set();
@@ -196,12 +194,13 @@ class RenderForwardClustered : public RendererSceneRenderRD {
 		INSTANCE_DATA_FLAG_MULTIMESH_FORMAT_2D = 1 << 13,
 		INSTANCE_DATA_FLAG_MULTIMESH_HAS_COLOR = 1 << 14,
 		INSTANCE_DATA_FLAG_MULTIMESH_HAS_CUSTOM_DATA = 1 << 15,
-		INSTANCE_DATA_FLAGS_MULTIMESH_STRIDE_SHIFT = 16,
-		INSTANCE_DATA_FLAGS_MULTIMESH_STRIDE_MASK = 0x7,
-		INSTANCE_DATA_FLAG_SKELETON = 1 << 19,
+		INSTANCE_DATA_FLAGS_PARTICLE_TRAIL_SHIFT = 16,
+		INSTANCE_DATA_FLAGS_PARTICLE_TRAIL_MASK = 0xFF,
+		INSTANCE_DATA_FLAGS_NON_UNIFORM_SCALE = 1 << 24,
 	};
 
 	struct SceneState {
+		// This struct is loaded into Set 1 - Binding 0. It is populated at the start of rendering a frame and must match the shader code.
 		struct UBO {
 			float projection_matrix[16];
 			float inv_projection_matrix[16];
@@ -398,6 +397,7 @@ class RenderForwardClustered : public RendererSceneRenderRD {
 			FLAG_USES_DEPTH_TEXTURE = 8192,
 			FLAG_USES_NORMAL_TEXTURE = 16384,
 			FLAG_USES_DOUBLE_SIDED_SHADOWS = 32768,
+			FLAG_USES_PARTICLE_TRAILS = 65536,
 		};
 
 		union {
@@ -453,6 +453,7 @@ class RenderForwardClustered : public RendererSceneRenderRD {
 		uint32_t layer_mask = 1;
 		RID transforms_uniform_set;
 		uint32_t instance_count = 0;
+		uint32_t trail_steps = 1;
 		RID mesh_instance;
 		bool can_sdfgi = false;
 		//used during setup
diff --git a/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.cpp b/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.cpp
index cb8c6e0cf3..f7ed0205af 100644
--- a/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.cpp
+++ b/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.cpp
@@ -73,10 +73,14 @@ void SceneShaderForwardClustered::ShaderData::set_code(const String &p_code) {
 	uses_time = false;
 	writes_modelview_or_projection = false;
 	uses_world_coordinates = false;
+	uses_particle_trails = false;
 
 	int depth_drawi = DEPTH_DRAW_OPAQUE;
 
 	ShaderCompilerRD::IdentifierActions actions;
+	actions.entry_point_stages["vertex"] = ShaderCompilerRD::STAGE_VERTEX;
+	actions.entry_point_stages["fragment"] = ShaderCompilerRD::STAGE_FRAGMENT;
+	actions.entry_point_stages["light"] = ShaderCompilerRD::STAGE_FRAGMENT;
 
 	actions.render_mode_values["blend_add"] = Pair<int *, int>(&blend_mode, BLEND_MODE_ADD);
 	actions.render_mode_values["blend_mix"] = Pair<int *, int>(&blend_mode, BLEND_MODE_MIX);
@@ -98,6 +102,7 @@ void SceneShaderForwardClustered::ShaderData::set_code(const String &p_code) {
 
 	actions.render_mode_flags["unshaded"] = &unshaded;
 	actions.render_mode_flags["wireframe"] = &wireframe;
+	actions.render_mode_flags["particle_trails"] = &uses_particle_trails;
 
 	actions.usage_flag_pointers["ALPHA"] = &uses_alpha;
 	actions.render_mode_flags["depth_prepass_alpha"] = &uses_depth_pre_pass;
@@ -141,14 +146,19 @@ void SceneShaderForwardClustered::ShaderData::set_code(const String &p_code) {
 	for (int i = 0; i < gen_code.defines.size(); i++) {
 		print_line(gen_code.defines[i]);
 	}
+
+	Map<String, String>::Element * el = gen_code.code.front();
+	while (el) {
+		print_line("\n**code " + el->key() + ":\n" + el->value());
+
+		el = el->next();
+	}
+
 	print_line("\n**uniforms:\n" + gen_code.uniforms);
-	print_line("\n**vertex_globals:\n" + gen_code.vertex_global);
-	print_line("\n**vertex_code:\n" + gen_code.vertex);
-	print_line("\n**fragment_globals:\n" + gen_code.fragment_global);
-	print_line("\n**fragment_code:\n" + gen_code.fragment);
-	print_line("\n**light_code:\n" + gen_code.light);
+	print_line("\n**vertex_globals:\n" + gen_code.stage_globals[ShaderCompilerRD::STAGE_VERTEX]);
+	print_line("\n**fragment_globals:\n" + gen_code.stage_globals[ShaderCompilerRD::STAGE_FRAGMENT]);
 #endif
-	shader_singleton->shader.version_set_code(version, gen_code.uniforms, gen_code.vertex_global, gen_code.vertex, gen_code.fragment_global, gen_code.light, gen_code.fragment, gen_code.defines);
+	shader_singleton->shader.version_set_code(version, gen_code.code, gen_code.uniforms, gen_code.stage_globals[ShaderCompilerRD::STAGE_VERTEX], gen_code.stage_globals[ShaderCompilerRD::STAGE_FRAGMENT], gen_code.defines);
 	ERR_FAIL_COND(!shader_singleton->shader.version_is_valid(version));
 
 	ubo_size = gen_code.uniform_total_size;
@@ -544,7 +554,7 @@ SceneShaderForwardClustered::~SceneShaderForwardClustered() {
 	storage->free(default_material);
 }
 
-void SceneShaderForwardClustered::init(RendererStorageRD *p_storage, const String p_defines, bool p_is_low_end) {
+void SceneShaderForwardClustered::init(RendererStorageRD *p_storage, const String p_defines) {
 	storage = p_storage;
 
 	{
@@ -561,16 +571,6 @@ void SceneShaderForwardClustered::init(RendererStorageRD *p_storage, const Strin
 		shader_versions.push_back("\n#define USE_LIGHTMAP\n");
 		shader_versions.push_back("\n#define MODE_MULTIPLE_RENDER_TARGETS\n#define USE_LIGHTMAP\n");
 		shader.initialize(shader_versions, p_defines);
-
-		if (p_is_low_end) {
-			//disable the high end versions
-			shader.set_variant_enabled(SHADER_VERSION_DEPTH_PASS_WITH_NORMAL_AND_ROUGHNESS, false);
-			shader.set_variant_enabled(SHADER_VERSION_DEPTH_PASS_WITH_NORMAL_AND_ROUGHNESS_AND_GIPROBE, false);
-			shader.set_variant_enabled(SHADER_VERSION_DEPTH_PASS_WITH_SDF, false);
-			shader.set_variant_enabled(SHADER_VERSION_COLOR_PASS_WITH_FORWARD_GI, false);
-			shader.set_variant_enabled(SHADER_VERSION_COLOR_PASS_WITH_SEPARATE_SPECULAR, false);
-			shader.set_variant_enabled(SHADER_VERSION_LIGHTMAP_COLOR_PASS_WITH_SEPARATE_SPECULAR, false);
-		}
 	}
 
 	storage->shader_set_data_request_function(RendererStorageRD::SHADER_TYPE_3D, _create_shader_funcs);
@@ -709,6 +709,7 @@ void SceneShaderForwardClustered::init(RendererStorageRD *p_storage, const Strin
 		actions.render_mode_defines["ensure_correct_normals"] = "#define ENSURE_CORRECT_NORMALS\n";
 		actions.render_mode_defines["cull_front"] = "#define DO_SIDE_CHECK\n";
 		actions.render_mode_defines["cull_disabled"] = "#define DO_SIDE_CHECK\n";
+		actions.render_mode_defines["particle_trails"] = "#define USE_PARTICLE_TRAILS\n";
 
 		bool force_lambert = GLOBAL_GET("rendering/shading/overrides/force_lambert_over_burley");
 
@@ -764,9 +765,7 @@ void SceneShaderForwardClustered::init(RendererStorageRD *p_storage, const Strin
 
 		MaterialData *md = (MaterialData *)storage->material_get_data(default_material, RendererStorageRD::SHADER_TYPE_3D);
 		default_shader_rd = shader.version_get_shader(md->shader_data->version, SHADER_VERSION_COLOR_PASS);
-		if (!p_is_low_end) {
-			default_shader_sdfgi_rd = shader.version_get_shader(md->shader_data->version, SHADER_VERSION_DEPTH_PASS_WITH_SDF);
-		}
+		default_shader_sdfgi_rd = shader.version_get_shader(md->shader_data->version, SHADER_VERSION_DEPTH_PASS_WITH_SDF);
 	}
 
 	{
diff --git a/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.h b/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.h
index 368340e258..7c8879686b 100644
--- a/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.h
+++ b/servers/rendering/renderer_rd/forward_clustered/scene_shader_forward_clustered.h
@@ -126,6 +126,7 @@ public:
 		bool uses_discard;
 		bool uses_roughness;
 		bool uses_normal;
+		bool uses_particle_trails;
 
 		bool unshaded;
 		bool uses_vertex;
@@ -203,7 +204,7 @@ public:
 	SceneShaderForwardClustered();
 	~SceneShaderForwardClustered();
 
-	void init(RendererStorageRD *p_storage, const String p_defines, bool p_is_low_end);
+	void init(RendererStorageRD *p_storage, const String p_defines);
 };
 
 } // namespace RendererSceneRenderImplementation
diff --git a/servers/rendering/renderer_rd/forward_mobile/SCsub b/servers/rendering/renderer_rd/forward_mobile/SCsub
new file mode 100644
index 0000000000..86681f9c74
--- /dev/null
+++ b/servers/rendering/renderer_rd/forward_mobile/SCsub
@@ -0,0 +1,5 @@
+#!/usr/bin/env python
+
+Import("env")
+
+env.add_source_files(env.servers_sources, "*.cpp")
diff --git a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp
new file mode 100644
index 0000000000..b2aaa50421
--- /dev/null
+++ b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp
@@ -0,0 +1,2163 @@
+/*************************************************************************/
+/*  render_forward_mobile.cpp                                            */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "render_forward_mobile.h"
+#include "core/config/project_settings.h"
+#include "servers/rendering/rendering_device.h"
+#include "servers/rendering/rendering_server_default.h"
+
+using namespace RendererSceneRenderImplementation;
+
+/* Render buffer */
+
+void RenderForwardMobile::RenderBufferDataForwardMobile::clear() { // Frees the MSAA textures owned by this object and resets every stored RID.
+	if (color_msaa.is_valid()) {
+		RD::get_singleton()->free(color_msaa);
+		color_msaa = RID();
+	}
+
+	if (depth_msaa.is_valid()) {
+		RD::get_singleton()->free(depth_msaa);
+		depth_msaa = RID();
+	}
+
+	color = RID(); // Handle reset only: color/depth are handed in through configure() and are not freed here.
+	depth = RID();
+	color_fb = RID(); // NOTE(review): the framebuffer RID is dropped without an explicit free; presumably it is released with its attachments - confirm.
+}
+
+void RenderForwardMobile::RenderBufferDataForwardMobile::configure(RID p_color_buffer, RID p_depth_buffer, int p_width, int p_height, RS::ViewportMSAA p_msaa) { // Stores the target buffers/size and builds color_fb, either directly on the given buffers or on freshly created MSAA textures.
+	clear(); // Release whatever a previous configure() call created.
+
+	msaa = p_msaa;
+
+	width = p_width;
+	height = p_height;
+
+	color = p_color_buffer;
+	depth = p_depth_buffer;
+
+	// re-introduce setting up msaa? For now we ignore this... (NOTE(review): the else-branch below does create MSAA targets - confirm whether this note is stale.)
+
+	if (p_msaa == RS::VIEWPORT_MSAA_DISABLED) { // No MSAA: the framebuffer wraps the supplied color and depth buffers.
+		Vector<RID> fb;
+		fb.push_back(p_color_buffer);
+		fb.push_back(depth);
+
+		color_fb = RD::get_singleton()->framebuffer_create(fb);
+	} else { // MSAA: create multisampled color/depth textures and build the framebuffer from those instead.
+		RD::TextureFormat tf;
+		tf.format = RD::DATA_FORMAT_R16G16B16A16_SFLOAT;
+		tf.width = p_width;
+		tf.height = p_height;
+		tf.texture_type = RD::TEXTURE_TYPE_2D;
+		tf.usage_bits = RD::TEXTURE_USAGE_COLOR_ATTACHMENT_BIT | RD::TEXTURE_USAGE_CAN_COPY_FROM_BIT | RD::TEXTURE_USAGE_SAMPLING_BIT;
+
+		RD::TextureSamples ts[RS::VIEWPORT_MSAA_MAX] = { // Lookup table indexed by the RS::ViewportMSAA value (see ts[p_msaa] below).
+			RD::TEXTURE_SAMPLES_1,
+			RD::TEXTURE_SAMPLES_2,
+			RD::TEXTURE_SAMPLES_4,
+			RD::TEXTURE_SAMPLES_8,
+			RD::TEXTURE_SAMPLES_16
+		};
+
+		texture_samples = ts[p_msaa];
+		tf.samples = texture_samples;
+
+		color_msaa = RD::get_singleton()->texture_create(tf, RD::TextureView());
+
+		tf.format = RD::get_singleton()->texture_is_format_supported_for_usage(RD::DATA_FORMAT_D24_UNORM_S8_UINT, RD::TEXTURE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) ? RD::DATA_FORMAT_D24_UNORM_S8_UINT : RD::DATA_FORMAT_D32_SFLOAT_S8_UINT; // Use D24S8 when it is supported as a depth-stencil attachment, otherwise D32S8.
+		tf.usage_bits = RD::TEXTURE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | RD::TEXTURE_USAGE_CAN_COPY_FROM_BIT | RD::TEXTURE_USAGE_SAMPLING_BIT;
+
+		depth_msaa = RD::get_singleton()->texture_create(tf, RD::TextureView()); // Same size and sample count as color_msaa; only format and usage bits were changed.
+
+		{
+			Vector<RID> fb;
+			fb.push_back(color_msaa);
+			fb.push_back(depth_msaa);
+
+			color_fb = RD::get_singleton()->framebuffer_create(fb);
+		}
+	}
+}
+
+RenderForwardMobile::RenderBufferDataForwardMobile::~RenderBufferDataForwardMobile() {
+	clear(); // Make sure the MSAA textures are freed when the buffer data is destroyed.
+}
+
+RendererSceneRenderRD::RenderBufferData *RenderForwardMobile::_create_render_buffer_data() { // Factory for this renderer's render buffer data type.
+	return memnew(RenderBufferDataForwardMobile); // Heap-allocated with memnew; ownership passes to the caller.
+}
+
+bool RenderForwardMobile::free(RID p_rid) { // Returns true when the base class recognised and freed p_rid, false otherwise.
+	if (RendererSceneRenderRD::free(p_rid)) { // The base class is the only handler; nothing mobile-specific is freed here.
+		return true;
+	}
+	return false;
+}
+
+/* Render functions */
+
+RID RenderForwardMobile::_setup_render_pass_uniform_set(RenderListType p_render_list, RID p_render_buffers, RID p_radiance_texture, RID p_shadow_atlas, RID p_reflection_atlas, const PagedArray<RID> &p_lightmaps, bool p_use_directional_shadow_atlas, int p_index) { // Builds the per-pass uniform set for slot p_index and returns its RID; p_render_list is not referenced in this body.
+	//there should always be enough uniform buffers for render passes, otherwise bugs
+	ERR_FAIL_INDEX_V(p_index, (int)scene_state.uniform_buffers.size(), RID());
+
+	RenderBufferDataForwardMobile *rb = nullptr;
+	if (p_render_buffers.is_valid()) {
+		rb = (RenderBufferDataForwardMobile *)render_buffers_get_data(p_render_buffers);
+	}
+
+	// default render buffer and scene state uniform set
+	// loaded into set 1
+
+	Vector<RD::Uniform> uniforms;
+
+	{ // Binding 0: the scene-state uniform buffer belonging to this pass index.
+		RD::Uniform u;
+		u.binding = 0;
+		u.uniform_type = RD::UNIFORM_TYPE_UNIFORM_BUFFER;
+		u.ids.push_back(scene_state.uniform_buffers[p_index]);
+		uniforms.push_back(u);
+	}
+
+	{ // Binding 2: radiance texture, or a default black cubemap (array or plain, depending on is_using_radiance_cubemap_array()).
+		RID radiance_texture;
+		if (p_radiance_texture.is_valid()) {
+			radiance_texture = p_radiance_texture;
+		} else {
+			radiance_texture = storage->texture_rd_get_default(is_using_radiance_cubemap_array() ? RendererStorageRD::DEFAULT_RD_TEXTURE_CUBEMAP_ARRAY_BLACK : RendererStorageRD::DEFAULT_RD_TEXTURE_CUBEMAP_BLACK);
+		}
+		RD::Uniform u;
+		u.binding = 2;
+		u.uniform_type = RD::UNIFORM_TYPE_TEXTURE;
+		u.ids.push_back(radiance_texture);
+		uniforms.push_back(u);
+	}
+
+	{ // Binding 3: reflection atlas texture, or a default black cubemap array.
+		RID ref_texture = p_reflection_atlas.is_valid() ? reflection_atlas_get_texture(p_reflection_atlas) : RID();
+		RD::Uniform u;
+		u.binding = 3;
+		u.uniform_type = RD::UNIFORM_TYPE_TEXTURE;
+		if (ref_texture.is_valid()) {
+			u.ids.push_back(ref_texture);
+		} else {
+			u.ids.push_back(storage->texture_rd_get_default(RendererStorageRD::DEFAULT_RD_TEXTURE_CUBEMAP_ARRAY_BLACK));
+		}
+		uniforms.push_back(u);
+	}
+
+	{ // Binding 4: shadow atlas texture, or the default white texture.
+		RD::Uniform u;
+		u.binding = 4;
+		u.uniform_type = RD::UNIFORM_TYPE_TEXTURE;
+		RID texture;
+		if (p_shadow_atlas.is_valid()) {
+			texture = shadow_atlas_get_texture(p_shadow_atlas);
+		}
+		if (!texture.is_valid()) { // Covers both "no atlas given" and "atlas has no texture".
+			texture = storage->texture_rd_get_default(RendererStorageRD::DEFAULT_RD_TEXTURE_WHITE);
+		}
+		u.ids.push_back(texture);
+		uniforms.push_back(u);
+	}
+	{ // Binding 5: directional shadow texture when requested and available, else the default white texture.
+		RD::Uniform u;
+		u.binding = 5;
+		u.uniform_type = RD::UNIFORM_TYPE_TEXTURE;
+		if (p_use_directional_shadow_atlas && directional_shadow_get_texture().is_valid()) {
+			u.ids.push_back(directional_shadow_get_texture());
+		} else {
+			u.ids.push_back(storage->texture_rd_get_default(RendererStorageRD::DEFAULT_RD_TEXTURE_WHITE));
+		}
+		uniforms.push_back(u);
+	}
+
+	/* we have limited ability to keep textures like this so we're moving this to a set we change before drawing geometry and just pushing the needed texture in */
+	{ // Binding 6: fixed-size array of scene_state.max_lightmaps lightmap textures, padded with a default white 2D array texture.
+		RD::Uniform u;
+		u.binding = 6;
+		u.uniform_type = RD::UNIFORM_TYPE_TEXTURE;
+		u.ids.resize(scene_state.max_lightmaps);
+		RID default_tex = storage->texture_rd_get_default(RendererStorageRD::DEFAULT_RD_TEXTURE_2D_ARRAY_WHITE);
+		for (uint32_t i = 0; i < scene_state.max_lightmaps; i++) {
+			if (i < p_lightmaps.size()) {
+				RID base = lightmap_instance_get_lightmap(p_lightmaps[i]);
+				RID texture = storage->lightmap_get_texture(base);
+				RID rd_texture = storage->texture_get_rd_texture(texture);
+				u.ids.write[i] = rd_texture; // NOTE(review): unlike the other bindings, rd_texture is not checked for validity before use - confirm it can never be invalid.
+			} else {
+				u.ids.write[i] = default_tex;
+			}
+		}
+
+		uniforms.push_back(u);
+	}
+
+	/*
+	{
+		RD::Uniform u;
+		u.binding = 7;
+		u.uniform_type = RD::UNIFORM_TYPE_TEXTURE;
+		u.ids.resize(MAX_GI_PROBES);
+		RID default_tex = storage->texture_rd_get_default(RendererStorageRD::DEFAULT_RD_TEXTURE_3D_WHITE);
+		for (int i = 0; i < MAX_GI_PROBES; i++) {
+			if (i < (int)p_gi_probes.size()) {
+				RID tex = gi.gi_probe_instance_get_texture(p_gi_probes[i]);
+				if (!tex.is_valid()) {
+					tex = default_tex;
+				}
+				u.ids.write[i] = tex;
+			} else {
+				u.ids.write[i] = default_tex;
+			}
+		}
+
+		uniforms.push_back(u);
+	}
+
+	{
+		RD::Uniform u;
+		u.binding = 8;
+		u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
+		RID cb = p_cluster_buffer.is_valid() ? p_cluster_buffer : default_vec4_xform_buffer;
+		u.ids.push_back(cb);
+		uniforms.push_back(u);
+	}
+	*/
+
+	{ // Binding 9: depth texture slot.
+		RD::Uniform u;
+		u.binding = 9;
+		u.uniform_type = RD::UNIFORM_TYPE_TEXTURE;
+		RID texture = (false && rb && rb->depth.is_valid()) ? rb->depth : storage->texture_rd_get_default(RendererStorageRD::DEFAULT_RD_TEXTURE_WHITE); // The literal `false` short-circuits the condition, so the default white texture is always bound here.
+		u.ids.push_back(texture);
+		uniforms.push_back(u);
+	}
+	{ // Binding 10: back buffer texture when render buffers exist and provide one, else the default black texture.
+		RD::Uniform u;
+		u.binding = 10;
+		u.uniform_type = RD::UNIFORM_TYPE_TEXTURE;
+		RID bbt = rb ? render_buffers_get_back_buffer_texture(p_render_buffers) : RID();
+		RID texture = bbt.is_valid() ? bbt : storage->texture_rd_get_default(RendererStorageRD::DEFAULT_RD_TEXTURE_BLACK);
+		u.ids.push_back(texture);
+		uniforms.push_back(u);
+	}
+
+	if (p_index >= (int)render_pass_uniform_sets.size()) { // Grow the cache so slot p_index exists.
+		render_pass_uniform_sets.resize(p_index + 1);
+	}
+
+	if (render_pass_uniform_sets[p_index].is_valid() && RD::get_singleton()->uniform_set_is_valid(render_pass_uniform_sets[p_index])) { // Free the set previously stored in this slot before replacing it.
+		RD::get_singleton()->free(render_pass_uniform_sets[p_index]);
+	}
+
+	render_pass_uniform_sets[p_index] = RD::get_singleton()->uniform_set_create(uniforms, scene_shader.default_shader_rd, RENDER_PASS_UNIFORM_SET); // Created against the default shader at set index RENDER_PASS_UNIFORM_SET.
+	return render_pass_uniform_sets[p_index];
+}
+
+void RenderForwardMobile::_setup_lightmaps(const PagedArray<RID> &p_lightmaps, const Transform &p_cam_transform) { // Fills scene_state lightmap arrays (at most max_lightmaps entries) and uploads the used part to lightmap_buffer.
+	// This probably needs to change...
+	scene_state.lightmaps_used = 0;
+	for (int i = 0; i < (int)p_lightmaps.size(); i++) {
+		if (i >= (int)scene_state.max_lightmaps) { // Lightmaps beyond the supported maximum are ignored.
+			break;
+		}
+
+		RID lightmap = lightmap_instance_get_lightmap(p_lightmaps[i]);
+
+		Basis to_lm = lightmap_instance_get_transform(p_lightmaps[i]).basis.inverse() * p_cam_transform.basis; // Inverse lightmap basis combined with the camera basis.
+		to_lm = to_lm.inverse().transposed(); //will transform normals
+		RendererStorageRD::store_transform_3x3(to_lm, scene_state.lightmaps[i].normal_xform);
+		scene_state.lightmap_ids[i] = p_lightmaps[i];
+		scene_state.lightmap_has_sh[i] = storage->lightmap_uses_spherical_harmonics(lightmap);
+
+		scene_state.lightmaps_used++;
+	}
+	if (scene_state.lightmaps_used > 0) { // Skip the upload entirely when there is nothing to send.
+		RD::get_singleton()->buffer_update(scene_state.lightmap_buffer, 0, sizeof(LightmapData) * scene_state.lightmaps_used, scene_state.lightmaps, RD::BARRIER_MASK_RASTER);
+	}
+}
+
+void RenderForwardMobile::_render_scene(RID p_render_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_cluster_buffer, uint32_t p_cluster_size, uint32_t p_cluster_max_elements, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, const Color &p_default_bg_color, float p_screen_lod_threshold) { // Renders one view: setup, opaque pass, optional sky, transparent pass, then MSAA resolve. Targets either a render buffer or a reflection probe face.
+	// These are UNUSED here and will not have data parsed from RendererSceneRenderRD:
+	// - p_gi_probes
+	// - p_cluster_buffer
+	// - p_cluster_size
+	// - p_cluster_max_elements
+
+	RenderBufferDataForwardMobile *render_buffer = nullptr;
+	if (p_render_buffer.is_valid()) {
+		render_buffer = (RenderBufferDataForwardMobile *)render_buffers_get_data(p_render_buffer);
+	}
+	RendererSceneEnvironmentRD *env = get_environment(p_environment); // Looked up before p_environment may be cleared below for interior reflection probes.
+
+	RENDER_TIMESTAMP("Setup 3D Scene");
+
+	float lod_distance_multiplier = p_cam_projection.get_lod_multiplier();
+	Plane lod_camera_plane(p_cam_transform.get_origin(), -p_cam_transform.basis.get_axis(Vector3::AXIS_Z)); // Plane through the camera origin with the negated camera Z axis as normal.
+
+	if (get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_DISABLE_LOD) {
+		p_screen_lod_threshold = 0.0;
+	}
+
+	Vector2 vp_he = p_cam_projection.get_viewport_half_extents();
+	scene_state.ubo.viewport_size[0] = vp_he.x;
+	scene_state.ubo.viewport_size[1] = vp_he.y;
+	scene_state.ubo.directional_light_count = 0;
+
+	Size2i screen_size;
+	RID opaque_framebuffer;
+	RID alpha_framebuffer;
+	bool reverse_cull = false;
+
+	// I don't think we support either of these in our mobile renderer so probably should phase them out
+	bool using_ssr = false; // Always false in this function, so it never affects can_continue_* below.
+	bool using_sss = false;
+
+	if (render_buffer) {
+		// setup rendering to render buffer
+		screen_size.x = render_buffer->width;
+		screen_size.y = render_buffer->height;
+
+		opaque_framebuffer = render_buffer->color_fb;
+		alpha_framebuffer = opaque_framebuffer; // Opaque and transparent passes share one framebuffer.
+	} else if (p_reflection_probe.is_valid()) { // No render buffer: render into the reflection probe's framebuffer for the given pass.
+		uint32_t resolution = reflection_probe_instance_get_resolution(p_reflection_probe);
+		screen_size.x = resolution;
+		screen_size.y = resolution;
+
+		opaque_framebuffer = reflection_probe_instance_get_framebuffer(p_reflection_probe, p_reflection_probe_pass);
+		alpha_framebuffer = opaque_framebuffer;
+
+		if (storage->reflection_probe_is_interior(reflection_probe_instance_get_probe(p_reflection_probe))) {
+			p_environment = RID(); //no environment on interiors
+		}
+
+		reverse_cull = true;
+	} else {
+		ERR_FAIL(); //bug?
+	}
+
+	RD::get_singleton()->draw_command_begin_label("Render Setup");
+
+	_setup_lightmaps(p_lightmaps, p_cam_transform);
+	_setup_environment(p_environment, p_render_buffer, p_cam_projection, p_cam_transform, p_reflection_probe, p_reflection_probe.is_valid(), screen_size, p_shadow_atlas, !p_reflection_probe.is_valid(), p_default_bg_color, p_cam_projection.get_z_near(), p_cam_projection.get_z_far(), false);
+
+	_update_render_base_uniform_set(); //may have changed due to the above (light buffer enlarged, as an example)
+
+	_fill_render_list(RENDER_LIST_OPAQUE, p_instances, PASS_MODE_COLOR, p_cam_projection, p_cam_transform, lod_camera_plane, lod_distance_multiplier, p_screen_lod_threshold); // NOTE(review): presumably this also populates RENDER_LIST_ALPHA, which is sorted and drawn below without its own fill call - confirm.
+	render_list[RENDER_LIST_OPAQUE].sort_by_key();
+	render_list[RENDER_LIST_ALPHA].sort_by_depth();
+
+	// we no longer use this...
+	_fill_instance_data(RENDER_LIST_OPAQUE);
+	_fill_instance_data(RENDER_LIST_ALPHA);
+
+	RD::get_singleton()->draw_command_end_label();
+
+	// note, no depth prepass here!
+
+	// setup environment
+	RID radiance_texture;
+	bool draw_sky = false;
+	bool draw_sky_fog_only = false; // Never set to true in this function; the only assignments are inside commented-out code.
+
+	Color clear_color = p_default_bg_color;
+	bool keep_color = false;
+
+	if (get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_OVERDRAW) {
+		clear_color = Color(0, 0, 0, 1); //in overdraw mode, BG should always be black
+	} else if (is_environment(p_environment)) {
+		RS::EnvironmentBG bg_mode = environment_get_background(p_environment);
+		float bg_energy = environment_get_bg_energy(p_environment);
+		switch (bg_mode) {
+			case RS::ENV_BG_CLEAR_COLOR: {
+				clear_color = p_default_bg_color;
+				clear_color.r *= bg_energy;
+				clear_color.g *= bg_energy;
+				clear_color.b *= bg_energy;
+				/*
+				if (render_buffers_has_volumetric_fog(p_render_buffer) || environment_is_fog_enabled(p_environment)) {
+					draw_sky_fog_only = true;
+					storage->material_set_param(sky.sky_scene_state.fog_material, "clear_color", Variant(clear_color.to_linear()));
+				}
+				*/
+			} break;
+			case RS::ENV_BG_COLOR: {
+				clear_color = environment_get_bg_color(p_environment);
+				clear_color.r *= bg_energy;
+				clear_color.g *= bg_energy;
+				clear_color.b *= bg_energy;
+				/*
+				if (render_buffers_has_volumetric_fog(p_render_buffer) || environment_is_fog_enabled(p_environment)) {
+					draw_sky_fog_only = true;
+					storage->material_set_param(sky.sky_scene_state.fog_material, "clear_color", Variant(clear_color.to_linear()));
+				}
+				*/
+			} break;
+			case RS::ENV_BG_SKY: {
+				draw_sky = true;
+			} break;
+			case RS::ENV_BG_CANVAS: {
+				keep_color = true; // Existing color contents are kept instead of cleared (see INITIAL_ACTION_KEEP in the opaque pass).
+			} break;
+			case RS::ENV_BG_KEEP: {
+				keep_color = true;
+			} break;
+			case RS::ENV_BG_CAMERA_FEED: {
+			} break;
+			default: {
+			}
+		}
+		// setup sky if used for ambient, reflections, or background
+		if (draw_sky || draw_sky_fog_only || environment_get_reflection_source(p_environment) == RS::ENV_REFLECTION_SOURCE_SKY || environment_get_ambient_source(p_environment) == RS::ENV_AMBIENT_SOURCE_SKY) {
+			RENDER_TIMESTAMP("Setup Sky");
+			RD::get_singleton()->draw_command_begin_label("Setup Sky");
+			CameraMatrix projection = p_cam_projection;
+			if (p_reflection_probe.is_valid()) { // Reflection probes get a depth-corrected projection.
+				CameraMatrix correction;
+				correction.set_depth_correction(true);
+				projection = correction * p_cam_projection;
+			}
+
+			sky.setup(env, p_render_buffer, projection, p_cam_transform, screen_size, this);
+
+			RID sky_rid = env->sky;
+			if (sky_rid.is_valid()) {
+				sky.update(env, projection, p_cam_transform, time);
+				radiance_texture = sky.sky_get_radiance_texture_rd(sky_rid);
+			} else {
+				// do not try to draw sky if invalid
+				draw_sky = false;
+			}
+			RD::get_singleton()->draw_command_end_label();
+		}
+	} else {
+		clear_color = p_default_bg_color;
+	}
+
+	// opaque pass
+
+	// !BAS! Look into this, seems most of the code in here related to clustered only, may want to move this code into ForwardClustered/RenderForwardMobile before calling it from here
+	// does trigger shadow map rendering so kinda important
+	_pre_opaque_render(false, false, RID(), RID());
+
+	RD::get_singleton()->draw_command_begin_label("Render Opaque Pass");
+
+	scene_state.ubo.directional_light_count = _get_render_state_directional_light_count();
+
+	_setup_environment(p_environment, p_render_buffer, p_cam_projection, p_cam_transform, p_reflection_probe, p_reflection_probe.is_valid(), screen_size, p_shadow_atlas, !p_reflection_probe.is_valid(), p_default_bg_color, p_cam_projection.get_z_near(), p_cam_projection.get_z_far(), p_render_buffer.is_valid()); // Called again now that the directional light count has been updated.
+
+	RENDER_TIMESTAMP("Render Opaque Pass");
+
+	RID rp_uniform_set = _setup_render_pass_uniform_set(RENDER_LIST_OPAQUE, p_render_buffer, radiance_texture, p_shadow_atlas, p_reflection_atlas, p_lightmaps, true);
+
+	bool can_continue_color = !scene_state.used_screen_texture && !using_ssr && !using_sss; // With ssr/sss fixed to false this reduces to "no shader used the screen texture".
+	bool can_continue_depth = !scene_state.used_depth_texture && !using_ssr && !using_sss;
+
+	{
+		bool will_continue_color = (can_continue_color || draw_sky || draw_sky_fog_only); // A following sky draw also keeps the pass open.
+		bool will_continue_depth = (can_continue_depth || draw_sky || draw_sky_fog_only);
+
+		// regular forward for now
+		Vector<Color> c;
+		c.push_back(clear_color.to_linear());
+
+		RenderListParameters render_list_params(render_list[RENDER_LIST_OPAQUE].elements.ptr(), render_list[RENDER_LIST_OPAQUE].element_info.ptr(), render_list[RENDER_LIST_OPAQUE].elements.size(), reverse_cull, PASS_MODE_COLOR, rp_uniform_set, get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_WIREFRAME, Vector2(), lod_camera_plane, lod_distance_multiplier, p_screen_lod_threshold);
+		_render_list_with_threads(&render_list_params, opaque_framebuffer, keep_color ? RD::INITIAL_ACTION_KEEP : RD::INITIAL_ACTION_CLEAR, will_continue_color ? RD::FINAL_ACTION_CONTINUE : RD::FINAL_ACTION_READ, RD::INITIAL_ACTION_CLEAR, will_continue_depth ? RD::FINAL_ACTION_CONTINUE : RD::FINAL_ACTION_READ, c, 1.0, 0); // Depth is always cleared; color is cleared unless keep_color is set.
+	}
+
+	RD::get_singleton()->draw_command_end_label();
+
+	if (draw_sky || draw_sky_fog_only) {
+		RENDER_TIMESTAMP("Render Sky");
+
+		CameraMatrix projection = p_cam_projection;
+		if (p_reflection_probe.is_valid()) { // Same depth correction as in the sky setup above.
+			CameraMatrix correction;
+			correction.set_depth_correction(true);
+			projection = correction * p_cam_projection;
+		}
+		RD::get_singleton()->draw_command_begin_label("Draw Sky");
+		sky.draw(env, can_continue_color, can_continue_depth, opaque_framebuffer, projection, p_cam_transform, time);
+		RD::get_singleton()->draw_command_end_label();
+	}
+
+	if (render_buffer && !can_continue_color && render_buffer->msaa != RS::VIEWPORT_MSAA_DISABLED) { // Pass could not be continued: resolve MSAA color into the regular color buffer before the transparent pass.
+		RD::get_singleton()->texture_resolve_multisample(render_buffer->color_msaa, render_buffer->color);
+		/*
+		if (using_separate_specular) {
+			RD::get_singleton()->texture_resolve_multisample(render_buffer->specular_msaa, render_buffer->specular);
+		}
+		*/
+	}
+
+	if (render_buffer && !can_continue_depth && render_buffer->msaa != RS::VIEWPORT_MSAA_DISABLED) { // Likewise for depth.
+		RD::get_singleton()->texture_resolve_multisample(render_buffer->depth_msaa, render_buffer->depth);
+	}
+
+	// transparent pass
+	RENDER_TIMESTAMP("Render Transparent Pass");
+
+	RD::get_singleton()->draw_command_begin_label("Render Transparent Pass");
+
+	rp_uniform_set = _setup_render_pass_uniform_set(RENDER_LIST_ALPHA, p_render_buffer, radiance_texture, p_shadow_atlas, p_reflection_atlas, p_lightmaps, true); // Rebuilds the uniform set in the same slot that the opaque pass used (p_index is left at its default in both calls).
+
+	_setup_environment(p_environment, p_render_buffer, p_cam_projection, p_cam_transform, p_reflection_probe, p_reflection_probe.is_valid(), screen_size, p_shadow_atlas, !p_reflection_probe.is_valid(), p_default_bg_color, p_cam_projection.get_z_near(), p_cam_projection.get_z_far(), false);
+
+	{
+		RenderListParameters render_list_params(render_list[RENDER_LIST_ALPHA].elements.ptr(), render_list[RENDER_LIST_ALPHA].element_info.ptr(), render_list[RENDER_LIST_ALPHA].elements.size(), reverse_cull, PASS_MODE_COLOR, rp_uniform_set, get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_WIREFRAME, Vector2(), lod_camera_plane, lod_distance_multiplier, p_screen_lod_threshold);
+		_render_list_with_threads(&render_list_params, alpha_framebuffer, can_continue_color ? RD::INITIAL_ACTION_CONTINUE : RD::INITIAL_ACTION_KEEP, RD::FINAL_ACTION_READ, can_continue_depth ? RD::INITIAL_ACTION_CONTINUE : RD::INITIAL_ACTION_KEEP, RD::FINAL_ACTION_READ); // Nothing is cleared here: the pass either continues the previous one or keeps its contents.
+	}
+
+	RD::get_singleton()->draw_command_end_label();
+
+	RD::get_singleton()->draw_command_begin_label("Resolve");
+
+	if (render_buffer && render_buffer->msaa != RS::VIEWPORT_MSAA_DISABLED) { // Final resolve of the MSAA color target; depth is not resolved at this point.
+		RD::get_singleton()->texture_resolve_multisample(render_buffer->color_msaa, render_buffer->color);
+	}
+
+	RD::get_singleton()->draw_command_end_label();
+}
+
+/* these are being called from RendererSceneRenderRD::_pre_opaque_render */
+
+void RenderForwardMobile::_render_shadow_begin() { // Resets the queued shadow passes and the secondary render list before shadow passes are appended.
+	scene_state.shadow_passes.clear();
+	RD::get_singleton()->draw_command_begin_label("Shadow Setup"); // The matching end_label is issued in _render_shadow_process().
+	_update_render_base_uniform_set();
+
+	render_list[RENDER_LIST_SECONDARY].clear();
+}
+
+void RenderForwardMobile::_render_shadow_append(RID p_framebuffer, const PagedArray<GeometryInstance *> &p_instances, const CameraMatrix &p_projection, const Transform &p_transform, float p_zfar, float p_bias, float p_normal_bias, bool p_use_dp, bool p_use_dp_flip, bool p_use_pancake, const Plane &p_camera_plane, float p_lod_distance_multiplier, float p_screen_lod_threshold, const Rect2i &p_rect, bool p_flip_y, bool p_clear_region, bool p_begin, bool p_end) { // Queues one shadow pass: appends its elements to the secondary render list and records the pass in scene_state.shadow_passes. Nothing is drawn here.
+	uint32_t shadow_pass_index = scene_state.shadow_passes.size(); // Index the new pass will get once pushed; forwarded to _setup_environment below.
+
+	SceneState::ShadowPass shadow_pass;
+
+	scene_state.ubo.dual_paraboloid_side = p_use_dp_flip ? -1 : 1;
+
+	_setup_environment(RID(), RID(), p_projection, p_transform, RID(), true, Vector2(1, 1), RID(), !p_flip_y, Color(), 0, p_zfar, false, p_use_pancake, shadow_pass_index);
+
+	if (get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_DISABLE_LOD) {
+		p_screen_lod_threshold = 0.0;
+	}
+
+	PassMode pass_mode = p_use_dp ? PASS_MODE_SHADOW_DP : PASS_MODE_SHADOW;
+
+	uint32_t render_list_from = render_list[RENDER_LIST_SECONDARY].elements.size(); // Elements of earlier passes stay in the list; this pass starts after them.
+	_fill_render_list(RENDER_LIST_SECONDARY, p_instances, pass_mode, p_projection, p_transform, p_camera_plane, p_lod_distance_multiplier, p_screen_lod_threshold, true);
+	uint32_t render_list_size = render_list[RENDER_LIST_SECONDARY].elements.size() - render_list_from; // Number of elements just appended for this pass.
+	render_list[RENDER_LIST_SECONDARY].sort_by_key_range(render_list_from, render_list_size); // Sort only this pass's range.
+	_fill_instance_data(RENDER_LIST_SECONDARY, render_list_from, render_list_size, false);
+
+	{
+		//regular forward for now
+		bool flip_cull = p_use_dp_flip;
+		if (p_flip_y) { // A Y flip inverts the culling choice made for the dual-paraboloid side.
+			flip_cull = !flip_cull;
+		}
+
+		shadow_pass.element_from = render_list_from;
+		shadow_pass.element_count = render_list_size;
+		shadow_pass.flip_cull = flip_cull;
+		shadow_pass.pass_mode = pass_mode;
+
+		shadow_pass.rp_uniform_set = RID(); //will be filled later when instance buffer is complete
+		shadow_pass.camera_plane = p_camera_plane;
+		shadow_pass.screen_lod_threshold = p_screen_lod_threshold;
+		shadow_pass.lod_distance_multiplier = p_lod_distance_multiplier;
+
+		shadow_pass.framebuffer = p_framebuffer;
+		shadow_pass.initial_depth_action = p_begin ? (p_clear_region ? RD::INITIAL_ACTION_CLEAR_REGION : RD::INITIAL_ACTION_CLEAR) : (p_clear_region ? RD::INITIAL_ACTION_CLEAR_REGION_CONTINUE : RD::INITIAL_ACTION_CONTINUE); // p_begin picks clear vs. continue, p_clear_region picks the region variant.
+		shadow_pass.final_depth_action = p_end ? RD::FINAL_ACTION_READ : RD::FINAL_ACTION_CONTINUE;
+		shadow_pass.rect = p_rect;
+
+		scene_state.shadow_passes.push_back(shadow_pass);
+	}
+}
+
+void RenderForwardMobile::_render_shadow_process() { // Creates the render pass uniform set for every queued shadow pass.
+	//render shadows one after the other, so this can be done un-barriered and the driver can optimize (as well as allow us to run compute at the same time)
+
+	for (uint32_t i = 0; i < scene_state.shadow_passes.size(); i++) {
+		//render passes need to be configured after instance buffer is done, since they need the latest version
+		SceneState::ShadowPass &shadow_pass = scene_state.shadow_passes[i];
+		shadow_pass.rp_uniform_set = _setup_render_pass_uniform_set(RENDER_LIST_SECONDARY, RID(), RID(), RID(), RID(), PagedArray<RID>(), false, i); // The pass index is used as the uniform set slot, so each pass gets its own set.
+	}
+
+	RD::get_singleton()->draw_command_end_label(); // Closes the "Shadow Setup" label opened in _render_shadow_begin().
+}
+
+void RenderForwardMobile::_render_shadow_end(uint32_t p_barrier) { // Draws every queued shadow pass, then issues one barrier if p_barrier requests it.
+	RD::get_singleton()->draw_command_begin_label("Shadow Render");
+
+	for (uint32_t i = 0; i < scene_state.shadow_passes.size(); i++) {
+		SceneState::ShadowPass &shadow_pass = scene_state.shadow_passes[i];
+		RenderListParameters render_list_parameters(render_list[RENDER_LIST_SECONDARY].elements.ptr() + shadow_pass.element_from, render_list[RENDER_LIST_SECONDARY].element_info.ptr() + shadow_pass.element_from, shadow_pass.element_count, shadow_pass.flip_cull, shadow_pass.pass_mode, shadow_pass.rp_uniform_set, false, Vector2(), shadow_pass.camera_plane, shadow_pass.lod_distance_multiplier, shadow_pass.screen_lod_threshold, shadow_pass.element_from, RD::BARRIER_MASK_NO_BARRIER); // Points at this pass's slice of the secondary list; per-pass barriers are disabled.
+		_render_list_with_threads(&render_list_parameters, shadow_pass.framebuffer, RD::INITIAL_ACTION_DROP, RD::FINAL_ACTION_DISCARD, shadow_pass.initial_depth_action, shadow_pass.final_depth_action, Vector<Color>(), 1.0, 0, shadow_pass.rect); // Color is dropped/discarded; only the depth actions recorded for the pass apply.
+	}
+
+	if (p_barrier != RD::BARRIER_MASK_NO_BARRIER) {
+		RD::get_singleton()->barrier(RD::BARRIER_MASK_RASTER, p_barrier);
+	}
+	RD::get_singleton()->draw_command_end_label();
+}
+
+/* */
+
+void RenderForwardMobile::_render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region) { // Draws p_instances with PASS_MODE_DEPTH_MATERIAL into p_region of p_framebuffer; p_cam_ortogonal is not referenced in this body.
+	RENDER_TIMESTAMP("Setup Rendering Material");
+
+	RD::get_singleton()->draw_command_begin_label("Render Material");
+
+	_update_render_base_uniform_set();
+
+	scene_state.ubo.dual_paraboloid_side = 0;
+	scene_state.ubo.material_uv2_mode = false; // _render_uv2() sets this to true instead.
+
+	_setup_environment(RID(), RID(), p_cam_projection, p_cam_transform, RID(), true, Vector2(1, 1), RID(), false, Color(), 0, 0);
+
+	PassMode pass_mode = PASS_MODE_DEPTH_MATERIAL;
+	_fill_render_list(RENDER_LIST_SECONDARY, p_instances, pass_mode, p_cam_projection, p_cam_transform);
+	render_list[RENDER_LIST_SECONDARY].sort_by_key();
+	_fill_instance_data(RENDER_LIST_SECONDARY);
+
+	RID rp_uniform_set = _setup_render_pass_uniform_set(RENDER_LIST_SECONDARY, RID(), RID(), RID(), RID(), PagedArray<RID>()); // All texture inputs are empty RIDs, so the defaults are bound.
+
+	RENDER_TIMESTAMP("Render Material");
+
+	{
+		RenderListParameters render_list_params(render_list[RENDER_LIST_SECONDARY].elements.ptr(), render_list[RENDER_LIST_SECONDARY].element_info.ptr(), render_list[RENDER_LIST_SECONDARY].elements.size(), true, pass_mode, rp_uniform_set);
+		//regular forward for now
+		Vector<Color> clear; // Five transparent-black clear colors; presumably one per color attachment of p_framebuffer - confirm.
+		clear.push_back(Color(0, 0, 0, 0));
+		clear.push_back(Color(0, 0, 0, 0));
+		clear.push_back(Color(0, 0, 0, 0));
+		clear.push_back(Color(0, 0, 0, 0));
+		clear.push_back(Color(0, 0, 0, 0));
+		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_framebuffer, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_READ, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_READ, clear, 1.0, 0, p_region);
+		_render_list(draw_list, RD::get_singleton()->framebuffer_get_format(p_framebuffer), &render_list_params, 0, render_list_params.element_count); // Drawn directly with _render_list rather than _render_list_with_threads.
+		RD::get_singleton()->draw_list_end();
+	}
+
+	RD::get_singleton()->draw_command_end_label();
+}
+
+void RenderForwardMobile::_render_uv2(const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region) { // Draws p_instances in UV2 mode with PASS_MODE_DEPTH_MATERIAL, repeating the draw at small UV offsets before a final un-offset draw.
+	RENDER_TIMESTAMP("Setup Rendering UV2");
+
+	RD::get_singleton()->draw_command_begin_label("Render UV2");
+
+	_update_render_base_uniform_set();
+
+	scene_state.ubo.dual_paraboloid_side = 0;
+	scene_state.ubo.material_uv2_mode = true;
+
+	_setup_environment(RID(), RID(), CameraMatrix(), Transform(), RID(), true, Vector2(1, 1), RID(), false, Color(), 0, 0); // Default-constructed camera matrix and transform: no real camera is involved in this pass.
+
+	PassMode pass_mode = PASS_MODE_DEPTH_MATERIAL;
+	_fill_render_list(RENDER_LIST_SECONDARY, p_instances, pass_mode, CameraMatrix(), Transform());
+	render_list[RENDER_LIST_SECONDARY].sort_by_key();
+	_fill_instance_data(RENDER_LIST_SECONDARY);
+
+	RID rp_uniform_set = _setup_render_pass_uniform_set(RENDER_LIST_SECONDARY, RID(), RID(), RID(), RID(), PagedArray<RID>());
+
+	RENDER_TIMESTAMP("Render Material"); // NOTE(review): same timestamp label as in _render_material() - confirm this is intended for the UV2 pass.
+
+	{
+		RenderListParameters render_list_params(render_list[RENDER_LIST_SECONDARY].elements.ptr(), render_list[RENDER_LIST_SECONDARY].element_info.ptr(), render_list[RENDER_LIST_SECONDARY].elements.size(), true, pass_mode, rp_uniform_set, true); // The trailing `true` sits in the slot where _render_scene passes its wireframe debug flag.
+		//regular forward for now
+		Vector<Color> clear;
+		clear.push_back(Color(0, 0, 0, 0));
+		clear.push_back(Color(0, 0, 0, 0));
+		clear.push_back(Color(0, 0, 0, 0));
+		clear.push_back(Color(0, 0, 0, 0));
+		clear.push_back(Color(0, 0, 0, 0));
+		RD::DrawListID draw_list = RD::get_singleton()->draw_list_begin(p_framebuffer, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_READ, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_READ, clear, 1.0, 0, p_region);
+
+		const int uv_offset_count = 9;
+		static const Vector2 uv_offsets[uv_offset_count] = { // Eight neighbouring directions plus the zero offset, in units of one pixel before scaling.
+			Vector2(-1, 1),
+			Vector2(1, 1),
+			Vector2(1, -1),
+			Vector2(-1, -1),
+			Vector2(-1, 0),
+			Vector2(1, 0),
+			Vector2(0, -1),
+			Vector2(0, 1),
+			Vector2(0, 0),
+
+		};
+
+		for (int i = 0; i < uv_offset_count; i++) {
+			Vector2 ofs = uv_offsets[i];
+			ofs.x /= p_region.size.width; // Scale the offset to one pixel of the target region.
+			ofs.y /= p_region.size.height;
+			render_list_params.uv_offset = ofs;
+			_render_list(draw_list, RD::get_singleton()->framebuffer_get_format(p_framebuffer), &render_list_params, 0, render_list_params.element_count); //first wireframe, for pseudo conservative
+		}
+		render_list_params.uv_offset = Vector2(); // NOTE(review): only the offset is reset; the `true` passed at construction is left unchanged, so confirm the next draw really is non-wireframe.
+		_render_list(draw_list, RD::get_singleton()->framebuffer_get_format(p_framebuffer), &render_list_params, 0, render_list_params.element_count); //second regular triangles
+
+		RD::get_singleton()->draw_list_end();
+	}
+
+	RD::get_singleton()->draw_command_end_label();
+}
+
+// SDFGI render hook. Deliberately left empty in the mobile renderer: every argument is ignored
+// and nothing is drawn.
+void RenderForwardMobile::_render_sdfgi(RID p_render_buffers, const Vector3i &p_from, const Vector3i &p_size, const AABB &p_bounds, const PagedArray<GeometryInstance *> &p_instances, const RID &p_albedo_texture, const RID &p_emission_texture, const RID &p_emission_aniso_texture, const RID &p_geom_facing_texture) {
+	// we don't do GI in low end..
+}
+
+// Renders p_instances into p_fb with the shadow pass, seen through the given camera
+// transform/projection, to produce the particle collider heightfield.
+void RenderForwardMobile::_render_particle_collider_heightfield(RID p_fb, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, const PagedArray<GeometryInstance *> &p_instances) {
+	RENDER_TIMESTAMP("Setup Render Collider Heightfield");
+
+	RD *rd = RD::get_singleton();
+	rd->draw_command_begin_label("Render Collider Heightfield");
+
+	_update_render_base_uniform_set();
+	scene_state.ubo.dual_paraboloid_side = 0;
+
+	// No environment, render buffers, reflection probe or shadow atlas are passed; the screen size
+	// is 1x1, Y is flipped, z_near is 0 and z_far comes from the projection.
+	_setup_environment(RID(), RID(), p_cam_projection, p_cam_transform, RID(), true, Vector2(1, 1), RID(), true, Color(), 0, p_cam_projection.get_z_far(), false, false);
+
+	const PassMode pass_mode = PASS_MODE_SHADOW;
+
+	// Collect, sort and prepare per-element info in the secondary list.
+	_fill_render_list(RENDER_LIST_SECONDARY, p_instances, pass_mode, p_cam_projection, p_cam_transform);
+	RenderList &secondary = render_list[RENDER_LIST_SECONDARY];
+	secondary.sort_by_key();
+	_fill_instance_data(RENDER_LIST_SECONDARY);
+
+	const RID rp_uniform_set = _setup_render_pass_uniform_set(RENDER_LIST_SECONDARY, RID(), RID(), RID(), RID(), PagedArray<RID>());
+
+	RENDER_TIMESTAMP("Render Collider Heightfield");
+
+	{
+		//regular forward for now
+		RenderListParameters render_list_params(secondary.elements.ptr(), secondary.element_info.ptr(), secondary.elements.size(), false, pass_mode, rp_uniform_set);
+		// Color and depth are both cleared on load and kept readable afterwards.
+		_render_list_with_threads(&render_list_params, p_fb, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_READ, RD::INITIAL_ACTION_CLEAR, RD::FINAL_ACTION_READ);
+	}
+
+	rd->draw_command_end_label();
+}
+
+// Drops the cached base uniform set, freeing it on the rendering device if it is still alive,
+// so that the next _update_render_base_uniform_set() call has to rebuild it.
+void RenderForwardMobile::_base_uniforms_changed() {
+	const bool still_allocated = render_base_uniform_set.is_valid() && RD::get_singleton()->uniform_set_is_valid(render_base_uniform_set);
+	if (still_allocated) {
+		RD::get_singleton()->free(render_base_uniform_set);
+	}
+
+	// Forget the handle in every case; a null RID is what triggers the rebuild.
+	render_base_uniform_set = RID();
+}
+
+// Rebuilds the base uniform set (bound as SCENE_UNIFORM_SET) when the cached one is missing, is no
+// longer valid on the rendering device, or was built for an older lightmap array version.
+void RenderForwardMobile::_update_render_base_uniform_set() {
+	RD *rd = RD::get_singleton();
+
+	const bool have_set = render_base_uniform_set.is_valid() && rd->uniform_set_is_valid(render_base_uniform_set);
+	if (have_set && lightmap_texture_array_version == storage->lightmap_array_get_version()) {
+		// Cached set is alive and up to date.
+		return;
+	}
+
+	if (have_set) {
+		// Alive but stale: release it before building the replacement.
+		rd->free(render_base_uniform_set);
+	}
+
+	// This is all loaded into set 0
+
+	lightmap_texture_array_version = storage->lightmap_array_get_version();
+
+	Vector<RD::Uniform> uniforms;
+
+	{
+		// Binding 1: the default samplers. Slots 0-5 hold the six filters with repeat disabled,
+		// slots 6-11 the same six filters with repeat enabled.
+		static const RS::CanvasItemTextureFilter filters[6] = {
+			RS::CANVAS_ITEM_TEXTURE_FILTER_NEAREST,
+			RS::CANVAS_ITEM_TEXTURE_FILTER_LINEAR,
+			RS::CANVAS_ITEM_TEXTURE_FILTER_NEAREST_WITH_MIPMAPS,
+			RS::CANVAS_ITEM_TEXTURE_FILTER_LINEAR_WITH_MIPMAPS,
+			RS::CANVAS_ITEM_TEXTURE_FILTER_NEAREST_WITH_MIPMAPS_ANISOTROPIC,
+			RS::CANVAS_ITEM_TEXTURE_FILTER_LINEAR_WITH_MIPMAPS_ANISOTROPIC,
+		};
+		static const RS::CanvasItemTextureRepeat repeats[2] = {
+			RS::CANVAS_ITEM_TEXTURE_REPEAT_DISABLED,
+			RS::CANVAS_ITEM_TEXTURE_REPEAT_ENABLED,
+		};
+
+		RD::Uniform samplers;
+		samplers.uniform_type = RD::UNIFORM_TYPE_SAMPLER;
+		samplers.binding = 1;
+		samplers.ids.resize(12);
+
+		RID *slot = samplers.ids.ptrw();
+		for (int r = 0; r < 2; r++) {
+			for (int f = 0; f < 6; f++) {
+				*slot++ = storage->sampler_rd_get_default(filters[f], repeats[r]);
+			}
+		}
+		uniforms.push_back(samplers);
+	}
+
+	// Every remaining binding carries exactly one resource.
+	auto push_single = [&uniforms](RD::UniformType p_type, int p_binding, RID p_resource) {
+		RD::Uniform u;
+		u.uniform_type = p_type;
+		u.binding = p_binding;
+		u.ids.push_back(p_resource);
+		uniforms.push_back(u);
+	};
+
+	push_single(RD::UNIFORM_TYPE_SAMPLER, 2, scene_shader.shadow_sampler);
+
+	push_single(RD::UNIFORM_TYPE_STORAGE_BUFFER, 3, get_omni_light_buffer());
+	push_single(RD::UNIFORM_TYPE_STORAGE_BUFFER, 4, get_spot_light_buffer());
+
+	push_single(RD::UNIFORM_TYPE_STORAGE_BUFFER, 5, get_reflection_probe_buffer());
+	push_single(RD::UNIFORM_TYPE_UNIFORM_BUFFER, 6, get_directional_light_buffer());
+
+	push_single(RD::UNIFORM_TYPE_STORAGE_BUFFER, 7, scene_state.lightmap_buffer);
+	push_single(RD::UNIFORM_TYPE_STORAGE_BUFFER, 8, scene_state.lightmap_capture_buffer);
+
+	push_single(RD::UNIFORM_TYPE_TEXTURE, 9, storage->decal_atlas_get_texture());
+	push_single(RD::UNIFORM_TYPE_TEXTURE, 10, storage->decal_atlas_get_texture_srgb());
+	push_single(RD::UNIFORM_TYPE_STORAGE_BUFFER, 11, get_decal_buffer());
+
+	push_single(RD::UNIFORM_TYPE_STORAGE_BUFFER, 12, storage->global_variables_get_storage_buffer());
+
+	render_base_uniform_set = rd->uniform_set_create(uniforms, scene_shader.default_shader_rd, SCENE_UNIFORM_SET);
+}
+
+// Debug hook that would return the normal texture of the given render buffers. The mobile
+// renderer has no such buffer, so p_render_buffers is unused and a default-constructed RID is
+// always returned.
+RID RenderForwardMobile::_render_buffers_get_normal_texture(RID p_render_buffers) {
+	// RenderBufferDataForwardMobile *rb = (RenderBufferDataForwardMobile *)render_buffers_get_data(p_render_buffers);
+
+	// We don't have this. This is for debugging
+	// return rb->normal_roughness_buffer;
+	return RID();
+}
+
+// Fills render_list[p_render_list] with the surfaces of p_instances that take part in p_pass_mode.
+// In PASS_MODE_COLOR, surfaces flagged FLAG_PASS_ALPHA are additionally added to RENDER_LIST_ALPHA.
+//
+// Per instance this caches the depth relative to the camera near plane, the instance flags and
+// (for RENDER_LIST_OPAQUE only) the lightmap / lightmap capture offset. Per surface it selects the
+// LOD index, the lightmap sort bit and the depth layer.
+//
+// - p_cam_projection / p_cam_transform: used to derive the near plane and the depth range.
+// - p_lod_plane, p_lod_distance_multiplier, p_screen_lod_threshold: LOD selection inputs; LOD 0 is
+//   used when p_screen_lod_threshold <= 0 or the surface has no LODs.
+// - p_append: when false the target list is cleared first (and RENDER_LIST_ALPHA too when filling
+//   RENDER_LIST_OPAQUE).
+void RenderForwardMobile::_fill_render_list(RenderListType p_render_list, const PagedArray<GeometryInstance *> &p_instances, PassMode p_pass_mode, const CameraMatrix &p_cam_projection, const Transform &p_cam_transform, const Plane &p_lod_plane, float p_lod_distance_multiplier, float p_screen_lod_threshold, bool p_append) {
+	if (p_render_list == RENDER_LIST_OPAQUE) {
+		// The opaque fill recomputes these "feature used" flags from scratch.
+		scene_state.used_sss = false;
+		scene_state.used_screen_texture = false;
+		scene_state.used_normal_texture = false;
+		scene_state.used_depth_texture = false;
+	}
+	uint32_t lightmap_captures_used = 0;
+
+	// Plane facing along the camera's -Z axis, pushed out by z_near; instance depth is measured from it.
+	Plane near_plane(p_cam_transform.origin, -p_cam_transform.basis.get_axis(Vector3::AXIS_Z));
+	near_plane.d += p_cam_projection.get_z_near();
+	float z_max = p_cam_projection.get_z_far() - p_cam_projection.get_z_near();
+
+	RenderList *rl = &render_list[p_render_list];
+
+	// Parse any updates on our geometry, updates surface caches and such
+	_update_dirty_geometry_instances();
+
+	if (!p_append) {
+		rl->clear();
+		if (p_render_list == RENDER_LIST_OPAQUE) {
+			render_list[RENDER_LIST_ALPHA].clear(); //opaque fills alpha too
+		}
+	}
+
+	//fill list
+
+	for (int i = 0; i < (int)p_instances.size(); i++) {
+		GeometryInstanceForwardMobile *inst = static_cast<GeometryInstanceForwardMobile *>(p_instances[i]);
+
+		// Depth of the AABB point closest to the near plane, quantized into 16 layers over the depth range.
+		Vector3 support_min = inst->transformed_aabb.get_support(-near_plane.normal);
+		inst->depth = near_plane.distance_to(support_min);
+		uint32_t depth_layer = CLAMP(int(inst->depth * 16 / z_max), 0, 15);
+
+		uint32_t flags = inst->base_flags; //fill flags if appropriate
+
+		bool uses_lightmap = false;
+		// bool uses_gi = false;
+
+		if (p_render_list == RENDER_LIST_OPAQUE) {
+			if (inst->lightmap_instance.is_valid()) {
+				// Look the instance's lightmap up among the lightmaps in use.
+				int32_t lightmap_cull_index = -1;
+				for (uint32_t j = 0; j < scene_state.lightmaps_used; j++) {
+					if (scene_state.lightmap_ids[j] == inst->lightmap_instance) {
+						lightmap_cull_index = j;
+						break;
+					}
+				}
+				if (lightmap_cull_index >= 0) {
+					// High 16 bits: lightmap slice index, low bits: index into the used lightmaps.
+					inst->gi_offset_cache = inst->lightmap_slice_index << 16;
+					inst->gi_offset_cache |= lightmap_cull_index;
+					flags |= INSTANCE_DATA_FLAG_USE_LIGHTMAP;
+					if (scene_state.lightmap_has_sh[lightmap_cull_index]) {
+						flags |= INSTANCE_DATA_FLAG_USE_SH_LIGHTMAP;
+					}
+					uses_lightmap = true;
+				} else {
+					inst->gi_offset_cache = 0xFFFFFFFF;
+				}
+
+			} else if (inst->lightmap_sh) {
+				// Instances beyond max_lightmap_captures silently get no capture.
+				if (lightmap_captures_used < scene_state.max_lightmap_captures) {
+					const Color *src_capture = inst->lightmap_sh->sh;
+					LightmapCaptureData &lcd = scene_state.lightmap_captures[lightmap_captures_used];
+					// Flatten the 9 capture colors into 36 consecutive floats (r, g, b, a each).
+					for (int j = 0; j < 9; j++) {
+						lcd.sh[j * 4 + 0] = src_capture[j].r;
+						lcd.sh[j * 4 + 1] = src_capture[j].g;
+						lcd.sh[j * 4 + 2] = src_capture[j].b;
+						lcd.sh[j * 4 + 3] = src_capture[j].a;
+					}
+					flags |= INSTANCE_DATA_FLAG_USE_LIGHTMAP_CAPTURE;
+					inst->gi_offset_cache = lightmap_captures_used;
+					lightmap_captures_used++;
+					uses_lightmap = true;
+				}
+			}
+		}
+		inst->flags_cache = flags;
+
+		GeometryInstanceSurfaceDataCache *surf = inst->surface_caches;
+
+		// Walk the instance's linked list of surface caches.
+		while (surf) {
+			surf->sort.uses_lightmap = 0;
+
+			// LOD
+
+			if (p_screen_lod_threshold > 0.0 && storage->mesh_surface_has_lod(surf->surface)) {
+				//lod
+				Vector3 lod_support_min = inst->transformed_aabb.get_support(-p_lod_plane.normal);
+				Vector3 lod_support_max = inst->transformed_aabb.get_support(p_lod_plane.normal);
+
+				float distance_min = p_lod_plane.distance_to(lod_support_min);
+				float distance_max = p_lod_plane.distance_to(lod_support_max);
+
+				float distance = 0.0;
+
+				if (distance_min * distance_max < 0.0) {
+					//crossing plane
+					distance = 0.0;
+				} else if (distance_min >= 0.0) {
+					distance = distance_min;
+				} else if (distance_max <= 0.0) {
+					distance = -distance_max;
+				}
+
+				surf->lod_index = storage->mesh_surface_get_lod(surf->surface, inst->lod_model_scale * inst->lod_bias, distance * p_lod_distance_multiplier, p_screen_lod_threshold);
+			} else {
+				surf->lod_index = 0;
+			}
+
+			// ADD Element
+			if (p_pass_mode == PASS_MODE_COLOR) {
+				if (surf->flags & (GeometryInstanceSurfaceDataCache::FLAG_PASS_DEPTH | GeometryInstanceSurfaceDataCache::FLAG_PASS_OPAQUE)) {
+					rl->add_element(surf);
+				}
+				if (surf->flags & GeometryInstanceSurfaceDataCache::FLAG_PASS_ALPHA) {
+					render_list[RENDER_LIST_ALPHA].add_element(surf);
+					// if (uses_gi) {
+					//	surf->sort.uses_forward_gi = 1;
+					// }
+				}
+
+				if (uses_lightmap) {
+					surf->sort.uses_lightmap = 1; // This needs to become our lightmap index but we'll do that in a separate PR.
+				}
+
+				if (surf->flags & GeometryInstanceSurfaceDataCache::FLAG_USES_SUBSURFACE_SCATTERING) {
+					scene_state.used_sss = true;
+				}
+				if (surf->flags & GeometryInstanceSurfaceDataCache::FLAG_USES_SCREEN_TEXTURE) {
+					scene_state.used_screen_texture = true;
+				}
+				if (surf->flags & GeometryInstanceSurfaceDataCache::FLAG_USES_NORMAL_TEXTURE) {
+					scene_state.used_normal_texture = true;
+				}
+				if (surf->flags & GeometryInstanceSurfaceDataCache::FLAG_USES_DEPTH_TEXTURE) {
+					scene_state.used_depth_texture = true;
+				}
+
+			} else if (p_pass_mode == PASS_MODE_SHADOW || p_pass_mode == PASS_MODE_SHADOW_DP) {
+				if (surf->flags & GeometryInstanceSurfaceDataCache::FLAG_PASS_SHADOW) {
+					rl->add_element(surf);
+				}
+			} else {
+				if (surf->flags & (GeometryInstanceSurfaceDataCache::FLAG_PASS_DEPTH | GeometryInstanceSurfaceDataCache::FLAG_PASS_OPAQUE)) {
+					rl->add_element(surf);
+				}
+			}
+
+			surf->sort.depth_layer = depth_layer;
+
+			surf = surf->next;
+		}
+	}
+
+	// scene_state.lightmap_captures is only written on the CPU above. Copy the entries produced by
+	// this call into scene_state.lightmap_capture_buffer so that the gi_offset_cache indices stored
+	// on the instances refer to current data. Captures are only gathered for RENDER_LIST_OPAQUE,
+	// so the count is zero for every other list.
+	if (lightmap_captures_used > 0) {
+		RD::get_singleton()->buffer_update(scene_state.lightmap_capture_buffer, 0, sizeof(LightmapCaptureData) * lightmap_captures_used, &scene_state.lightmap_captures[0], RD::BARRIER_MASK_RASTER);
+	}
+}
+
+// Fills scene_state.ubo with the camera, shadow, ambient/reflection, SSAO, fog and roughness
+// limiter settings for one render pass, then uploads it to scene_state.uniform_buffers[p_index],
+// creating the uniform buffers up to p_index on demand.
+//
+// - p_environment: source of ambient, reflection, SSAO and fog settings when it is a valid environment.
+// - p_reflection_probe: only consulted without an environment; an interior probe disables ambient light.
+// - p_shadow_atlas: when valid, its size is used for shadow_atlas_pixel_size.
+// - p_flip_y: forwarded to the depth correction matrix applied to p_cam_projection.
+// - p_default_bg_color: ambient color for the clear-color background and for the no-environment path.
+// - p_opaque_render_buffers: gates SSAO and the roughness limiter.
+// NOTE(review): p_render_buffers and p_no_fog are not read by the active code in this function.
+void RenderForwardMobile::_setup_environment(RID p_environment, RID p_render_buffers, const CameraMatrix &p_cam_projection, const Transform &p_cam_transform, RID p_reflection_probe, bool p_no_fog, const Size2i &p_screen_size, RID p_shadow_atlas, bool p_flip_y, const Color &p_default_bg_color, float p_znear, float p_zfar, bool p_opaque_render_buffers, bool p_pancake_shadows, int p_index) {
+	//!BAS! need to go through this and find out what we don't need anymore
+
+	// This populates our UBO with main scene data that is pushed into set 1
+
+	//CameraMatrix projection = p_cam_projection;
+	//projection.flip_y(); // Vulkan and modern APIs use Y-Down
+	CameraMatrix correction;
+	correction.set_depth_correction(p_flip_y);
+	CameraMatrix projection = correction * p_cam_projection;
+
+	//store camera into ubo
+	RendererStorageRD::store_camera(projection, scene_state.ubo.projection_matrix);
+	RendererStorageRD::store_camera(projection.inverse(), scene_state.ubo.inv_projection_matrix);
+	RendererStorageRD::store_transform(p_cam_transform, scene_state.ubo.camera_matrix);
+	RendererStorageRD::store_transform(p_cam_transform.affine_inverse(), scene_state.ubo.inv_camera_matrix);
+
+	scene_state.ubo.z_far = p_zfar;
+	scene_state.ubo.z_near = p_znear;
+
+	scene_state.ubo.pancake_shadows = p_pancake_shadows;
+
+	RendererStorageRD::store_soft_shadow_kernel(directional_penumbra_shadow_kernel_get(), scene_state.ubo.directional_penumbra_shadow_kernel);
+	RendererStorageRD::store_soft_shadow_kernel(directional_soft_shadow_kernel_get(), scene_state.ubo.directional_soft_shadow_kernel);
+	RendererStorageRD::store_soft_shadow_kernel(penumbra_shadow_kernel_get(), scene_state.ubo.penumbra_shadow_kernel);
+	RendererStorageRD::store_soft_shadow_kernel(soft_shadow_kernel_get(), scene_state.ubo.soft_shadow_kernel);
+
+	scene_state.ubo.directional_penumbra_shadow_samples = directional_penumbra_shadow_samples_get();
+	scene_state.ubo.directional_soft_shadow_samples = directional_soft_shadow_samples_get();
+	scene_state.ubo.penumbra_shadow_samples = penumbra_shadow_samples_get();
+	scene_state.ubo.soft_shadow_samples = soft_shadow_samples_get();
+
+	// Size of one pixel in normalized screen units.
+	Size2 screen_pixel_size = Vector2(1.0, 1.0) / Size2(p_screen_size);
+	scene_state.ubo.screen_pixel_size[0] = screen_pixel_size.x;
+	scene_state.ubo.screen_pixel_size[1] = screen_pixel_size.y;
+
+	/*
+	scene_state.ubo.cluster_shift = get_shift_from_power_of_2(p_cluster_size);
+	scene_state.ubo.max_cluster_element_count_div_32 = p_max_cluster_elements / 32;
+	{
+		uint32_t cluster_screen_width = (p_screen_size.width - 1) / p_cluster_size + 1;
+		uint32_t cluster_screen_height = (p_screen_size.height - 1) / p_cluster_size + 1;
+		scene_state.ubo.cluster_type_size = cluster_screen_width * cluster_screen_height * (scene_state.ubo.max_cluster_element_count_div_32 + 32);
+		scene_state.ubo.cluster_width = cluster_screen_width;
+	}
+	*/
+
+	// Without a shadow atlas, shadow_atlas_pixel_size keeps whatever an earlier call stored.
+	if (p_shadow_atlas.is_valid()) {
+		Vector2 sas = shadow_atlas_get_size(p_shadow_atlas);
+		scene_state.ubo.shadow_atlas_pixel_size[0] = 1.0 / sas.x;
+		scene_state.ubo.shadow_atlas_pixel_size[1] = 1.0 / sas.y;
+	}
+	{
+		Vector2 dss = directional_shadow_get_size();
+		scene_state.ubo.directional_shadow_pixel_size[0] = 1.0 / dss.x;
+		scene_state.ubo.directional_shadow_pixel_size[1] = 1.0 / dss.y;
+	}
+
+	//time global variables
+	scene_state.ubo.time = time;
+
+	/*
+	scene_state.ubo.gi_upscale_for_msaa = false;
+	scene_state.ubo.volumetric_fog_enabled = false;
+	scene_state.ubo.fog_enabled = false;
+
+	if (p_render_buffers.is_valid()) {
+		RenderBufferDataForwardMobile *render_buffers = (RenderBufferDataForwardMobile *)render_buffers_get_data(p_render_buffers);
+		if (render_buffers->msaa != RS::VIEWPORT_MSAA_DISABLED) {
+			scene_state.ubo.gi_upscale_for_msaa = true;
+		}
+
+		if (render_buffers_has_volumetric_fog(p_render_buffers)) {
+			scene_state.ubo.volumetric_fog_enabled = true;
+			float fog_end = render_buffers_get_volumetric_fog_end(p_render_buffers);
+			if (fog_end > 0.0) {
+				scene_state.ubo.volumetric_fog_inv_length = 1.0 / fog_end;
+			} else {
+				scene_state.ubo.volumetric_fog_inv_length = 1.0;
+			}
+
+			float fog_detail_spread = render_buffers_get_volumetric_fog_detail_spread(p_render_buffers); //reverse lookup
+			if (fog_detail_spread > 0.0) {
+				scene_state.ubo.volumetric_fog_detail_spread = 1.0 / fog_detail_spread;
+			} else {
+				scene_state.ubo.volumetric_fog_detail_spread = 1.0;
+			}
+		}
+	}
+
+	*/
+
+	// scene_state.ubo persists between calls and only the environment branch below writes
+	// fog_enabled. Reset it here so the unshaded and no-environment paths never upload a fog state
+	// left over from a previous pass.
+	scene_state.ubo.fog_enabled = false;
+
+	if (get_debug_draw_mode() == RS::VIEWPORT_DEBUG_DRAW_UNSHADED) {
+		// Unshaded debug view: plain white ambient light and nothing else.
+		scene_state.ubo.use_ambient_light = true;
+		scene_state.ubo.ambient_light_color_energy[0] = 1;
+		scene_state.ubo.ambient_light_color_energy[1] = 1;
+		scene_state.ubo.ambient_light_color_energy[2] = 1;
+		scene_state.ubo.ambient_light_color_energy[3] = 1.0;
+		scene_state.ubo.use_ambient_cubemap = false;
+		scene_state.ubo.use_reflection_cubemap = false;
+		scene_state.ubo.ssao_enabled = false;
+
+	} else if (is_environment(p_environment)) {
+		RS::EnvironmentBG env_bg = environment_get_background(p_environment);
+		RS::EnvironmentAmbientSource ambient_src = environment_get_ambient_source(p_environment);
+
+		float bg_energy = environment_get_bg_energy(p_environment);
+		scene_state.ubo.ambient_light_color_energy[3] = bg_energy;
+
+		scene_state.ubo.ambient_color_sky_mix = environment_get_ambient_sky_contribution(p_environment);
+
+		//ambient
+		if (ambient_src == RS::ENV_AMBIENT_SOURCE_BG && (env_bg == RS::ENV_BG_CLEAR_COLOR || env_bg == RS::ENV_BG_COLOR)) {
+			// Ambient comes from a flat background color (clear color or the environment's own).
+			Color color = env_bg == RS::ENV_BG_CLEAR_COLOR ? p_default_bg_color : environment_get_bg_color(p_environment);
+			color = color.to_linear();
+
+			scene_state.ubo.ambient_light_color_energy[0] = color.r * bg_energy;
+			scene_state.ubo.ambient_light_color_energy[1] = color.g * bg_energy;
+			scene_state.ubo.ambient_light_color_energy[2] = color.b * bg_energy;
+			scene_state.ubo.use_ambient_light = true;
+			scene_state.ubo.use_ambient_cubemap = false;
+		} else {
+			float energy = environment_get_ambient_light_energy(p_environment);
+			Color color = environment_get_ambient_light_color(p_environment);
+			color = color.to_linear();
+			scene_state.ubo.ambient_light_color_energy[0] = color.r * energy;
+			scene_state.ubo.ambient_light_color_energy[1] = color.g * energy;
+			scene_state.ubo.ambient_light_color_energy[2] = color.b * energy;
+
+			// Inverse sky orientation combined with the camera basis, stored as the radiance transform.
+			Basis sky_transform = environment_get_sky_orientation(p_environment);
+			sky_transform = sky_transform.inverse() * p_cam_transform.basis;
+			RendererStorageRD::store_transform_3x3(sky_transform, scene_state.ubo.radiance_inverse_xform);
+
+			scene_state.ubo.use_ambient_cubemap = (ambient_src == RS::ENV_AMBIENT_SOURCE_BG && env_bg == RS::ENV_BG_SKY) || ambient_src == RS::ENV_AMBIENT_SOURCE_SKY;
+			scene_state.ubo.use_ambient_light = scene_state.ubo.use_ambient_cubemap || ambient_src == RS::ENV_AMBIENT_SOURCE_COLOR;
+		}
+
+		//specular
+		RS::EnvironmentReflectionSource ref_src = environment_get_reflection_source(p_environment);
+		if ((ref_src == RS::ENV_REFLECTION_SOURCE_BG && env_bg == RS::ENV_BG_SKY) || ref_src == RS::ENV_REFLECTION_SOURCE_SKY) {
+			scene_state.ubo.use_reflection_cubemap = true;
+		} else {
+			scene_state.ubo.use_reflection_cubemap = false;
+		}
+
+		scene_state.ubo.ssao_enabled = p_opaque_render_buffers && environment_is_ssao_enabled(p_environment);
+		scene_state.ubo.ssao_ao_affect = environment_get_ssao_ao_affect(p_environment);
+		scene_state.ubo.ssao_light_affect = environment_get_ssao_light_affect(p_environment);
+
+		Color ao_color = environment_get_ao_color(p_environment).to_linear();
+		scene_state.ubo.ao_color[0] = ao_color.r;
+		scene_state.ubo.ao_color[1] = ao_color.g;
+		scene_state.ubo.ao_color[2] = ao_color.b;
+		scene_state.ubo.ao_color[3] = ao_color.a;
+
+		scene_state.ubo.fog_enabled = environment_is_fog_enabled(p_environment);
+		scene_state.ubo.fog_density = environment_get_fog_density(p_environment);
+		scene_state.ubo.fog_height = environment_get_fog_height(p_environment);
+		scene_state.ubo.fog_height_density = environment_get_fog_height_density(p_environment);
+		// Store the reciprocal, except for values below 0.0001 which are kept as they are.
+		if (scene_state.ubo.fog_height_density >= 0.0001) {
+			scene_state.ubo.fog_height_density = 1.0 / scene_state.ubo.fog_height_density;
+		}
+		scene_state.ubo.fog_aerial_perspective = environment_get_fog_aerial_perspective(p_environment);
+
+		Color fog_color = environment_get_fog_light_color(p_environment).to_linear();
+		float fog_energy = environment_get_fog_light_energy(p_environment);
+
+		scene_state.ubo.fog_light_color[0] = fog_color.r * fog_energy;
+		scene_state.ubo.fog_light_color[1] = fog_color.g * fog_energy;
+		scene_state.ubo.fog_light_color[2] = fog_color.b * fog_energy;
+
+		scene_state.ubo.fog_sun_scatter = environment_get_fog_sun_scatter(p_environment);
+
+	} else {
+		// No environment: ambient light is the default background color, unless we are rendering
+		// an interior reflection probe, which gets no ambient light at all.
+		if (p_reflection_probe.is_valid() && storage->reflection_probe_is_interior(reflection_probe_instance_get_probe(p_reflection_probe))) {
+			scene_state.ubo.use_ambient_light = false;
+		} else {
+			scene_state.ubo.use_ambient_light = true;
+			Color clear_color = p_default_bg_color;
+			clear_color = clear_color.to_linear();
+			scene_state.ubo.ambient_light_color_energy[0] = clear_color.r;
+			scene_state.ubo.ambient_light_color_energy[1] = clear_color.g;
+			scene_state.ubo.ambient_light_color_energy[2] = clear_color.b;
+			scene_state.ubo.ambient_light_color_energy[3] = 1.0;
+		}
+
+		scene_state.ubo.use_ambient_cubemap = false;
+		scene_state.ubo.use_reflection_cubemap = false;
+		scene_state.ubo.ssao_enabled = false;
+	}
+
+	scene_state.ubo.roughness_limiter_enabled = p_opaque_render_buffers && screen_space_roughness_limiter_is_active();
+	scene_state.ubo.roughness_limiter_amount = screen_space_roughness_limiter_get_amount();
+	scene_state.ubo.roughness_limiter_limit = screen_space_roughness_limiter_get_limit();
+
+	// Grow the per-index uniform buffer (and render pass uniform set) arrays when p_index is new.
+	if (p_index >= (int)scene_state.uniform_buffers.size()) {
+		uint32_t from = scene_state.uniform_buffers.size();
+		scene_state.uniform_buffers.resize(p_index + 1);
+		render_pass_uniform_sets.resize(p_index + 1);
+		for (uint32_t i = from; i < scene_state.uniform_buffers.size(); i++) {
+			scene_state.uniform_buffers[i] = RD::get_singleton()->uniform_buffer_create(sizeof(SceneState::UBO));
+		}
+	}
+	RD::get_singleton()->buffer_update(scene_state.uniform_buffers[p_index], 0, sizeof(SceneState::UBO), &scene_state.ubo, RD::BARRIER_MASK_RASTER);
+}
+
+// Sizes the render list's element_info array and fills the entries for the elements in
+// [p_offset, p_offset + count), where count is p_max_elements or, when that is negative, the
+// size of the element list. Each entry receives the surface's LOD index and lightmap bit.
+//
+// Consecutive surfaces with identical sort keys form a run, unless the owner is a multimesh or
+// has a mesh instance. Within a run, `repeat` counts down from the run length on its first
+// element to 1 on its last.
+// NOTE(review): p_update_buffer is not read by this function.
+void RenderForwardMobile::_fill_instance_data(RenderListType p_render_list, uint32_t p_offset, int32_t p_max_elements, bool p_update_buffer) {
+	// !BAS! Rename this to make clear this is not the same as with the forward renderer and remove p_update_buffer?
+
+	RenderList &list = render_list[p_render_list];
+	const uint32_t count = (p_max_elements < 0) ? list.elements.size() : uint32_t(p_max_elements);
+
+	list.element_info.resize(p_offset + count);
+
+	uint32_t run_length = 0;
+	GeometryInstanceSurfaceDataCache *run_tail = nullptr; // last surface that may still be repeated
+
+	for (uint32_t idx = 0; idx < count; idx++) {
+		GeometryInstanceSurfaceDataCache *surf = list.elements[p_offset + idx];
+		GeometryInstanceForwardMobile *owner = surf->owner;
+
+		const bool cant_repeat = (owner->flags_cache & INSTANCE_DATA_FLAG_MULTIMESH) || owner->mesh_instance.is_valid();
+		const bool extends_run = run_tail != nullptr && !cant_repeat && run_tail->sort.sort_key1 == surf->sort.sort_key1 && run_tail->sort.sort_key2 == surf->sort.sort_key2;
+
+		if (extends_run) {
+			//same as the previous element, so it can be drawn with instancing
+			run_length++;
+		} else {
+			// The previous run ended just before idx: write its repeat counters back to front.
+			for (uint32_t back = 1; back <= run_length; back++) {
+				list.element_info[p_offset + idx - back].repeat = back;
+			}
+			run_length = 1;
+		}
+
+		RenderElementInfo &info = list.element_info[p_offset + idx];
+		info.lod_index = surf->lod_index;
+		info.uses_lightmap = surf->sort.uses_lightmap;
+
+		run_tail = cant_repeat ? nullptr : surf;
+	}
+
+	// Close the run that reaches the end of the range.
+	for (uint32_t back = 1; back <= run_length; back++) {
+		list.element_info[p_offset + count - back].repeat = back;
+	}
+}
+
+/// RENDERING ///
+
+// Draws elements [p_from_element, p_to_element) of p_params into p_draw_list by forwarding to the
+// _render_list_template instantiation that matches p_params->pass_mode.
+void RenderForwardMobile::_render_list(RenderingDevice::DrawListID p_draw_list, RenderingDevice::FramebufferFormatID p_framebuffer_Format, RenderListParameters *p_params, uint32_t p_from_element, uint32_t p_to_element) {
+	// The pass mode becomes a template argument so comparisons against it can be inlined.
+	switch (p_params->pass_mode) {
+		case PASS_MODE_COLOR:
+			_render_list_template<PASS_MODE_COLOR>(p_draw_list, p_framebuffer_Format, p_params, p_from_element, p_to_element);
+			return;
+		case PASS_MODE_COLOR_TRANSPARENT:
+			_render_list_template<PASS_MODE_COLOR_TRANSPARENT>(p_draw_list, p_framebuffer_Format, p_params, p_from_element, p_to_element);
+			return;
+		case PASS_MODE_SHADOW:
+			_render_list_template<PASS_MODE_SHADOW>(p_draw_list, p_framebuffer_Format, p_params, p_from_element, p_to_element);
+			return;
+		case PASS_MODE_SHADOW_DP:
+			_render_list_template<PASS_MODE_SHADOW_DP>(p_draw_list, p_framebuffer_Format, p_params, p_from_element, p_to_element);
+			return;
+		case PASS_MODE_DEPTH_MATERIAL:
+			_render_list_template<PASS_MODE_DEPTH_MATERIAL>(p_draw_list, p_framebuffer_Format, p_params, p_from_element, p_to_element);
+			return;
+	}
+}
+
+// Work pool callback: draws thread p_thread's contiguous share of p_params' elements into
+// thread_draw_lists[p_thread].
+void RenderForwardMobile::_render_list_thread_function(uint32_t p_thread, RenderListParameters *p_params) {
+	const uint32_t element_count = p_params->element_count;
+	const uint32_t thread_count = RendererThreadPool::singleton->thread_work_pool.get_thread_count();
+	const uint32_t next_thread = p_thread + 1;
+
+	const uint32_t first = p_thread * element_count / thread_count;
+	uint32_t last = element_count; // the last thread runs to the end of the list
+	if (next_thread != thread_count) {
+		last = next_thread * element_count / thread_count;
+	}
+
+	_render_list(thread_draw_lists[p_thread], p_params->framebuffer_format, p_params, first, last);
+}
+
+// Opens a draw list on p_framebuffer with the given load/store actions and clear values, renders
+// all elements of p_params into it, and ends the draw list with p_params->barrier. The
+// framebuffer's format is stored in p_params->framebuffer_format first.
+void RenderForwardMobile::_render_list_with_threads(RenderListParameters *p_params, RID p_framebuffer, RD::InitialAction p_initial_color_action, RD::FinalAction p_final_color_action, RD::InitialAction p_initial_depth_action, RD::FinalAction p_final_depth_action, const Vector<Color> &p_clear_color_values, float p_clear_depth, uint32_t p_clear_stencil, const Rect2 &p_region, const Vector<RID> &p_storage_textures) {
+	RD *rd = RD::get_singleton();
+
+	const RD::FramebufferFormatID fb_format = rd->framebuffer_get_format(p_framebuffer);
+	p_params->framebuffer_format = fb_format;
+
+	// secondary command buffers need more testing at this time, so the trailing `&& false` keeps
+	// the threaded path switched off regardless of the element count
+	const bool use_threads = (uint32_t)p_params->element_count > render_list_thread_threshold && false;
+
+	if (!use_threads) {
+		//single threaded
+		RD::DrawListID draw_list = rd->draw_list_begin(p_framebuffer, p_initial_color_action, p_final_color_action, p_initial_depth_action, p_final_depth_action, p_clear_color_values, p_clear_depth, p_clear_stencil, p_region, p_storage_textures);
+		_render_list(draw_list, fb_format, p_params, 0, p_params->element_count);
+	} else {
+		//multi threaded: one split draw list per work pool thread
+		thread_draw_lists.resize(RendererThreadPool::singleton->thread_work_pool.get_thread_count());
+		rd->draw_list_begin_split(p_framebuffer, thread_draw_lists.size(), thread_draw_lists.ptr(), p_initial_color_action, p_final_color_action, p_initial_depth_action, p_final_depth_action, p_clear_color_values, p_clear_depth, p_clear_stencil, p_region, p_storage_textures);
+		RendererThreadPool::singleton->thread_work_pool.do_work(thread_draw_lists.size(), this, &RenderForwardMobile::_render_list_thread_function, p_params);
+	}
+
+	// Both paths finish the same way.
+	rd->draw_list_end(p_params->barrier);
+}
+
+template <RenderForwardMobile::PassMode p_pass_mode>
+void RenderForwardMobile::_render_list_template(RenderingDevice::DrawListID p_draw_list, RenderingDevice::FramebufferFormatID p_framebuffer_Format, RenderListParameters *p_params, uint32_t p_from_element, uint32_t p_to_element) {
+	RD::DrawListID draw_list = p_draw_list;
+	RD::FramebufferFormatID framebuffer_format = p_framebuffer_Format;
+
+	//global scope bindings
+	RD::get_singleton()->draw_list_bind_uniform_set(draw_list, render_base_uniform_set, SCENE_UNIFORM_SET);
+	RD::get_singleton()->draw_list_bind_uniform_set(draw_list, p_params->render_pass_uniform_set, RENDER_PASS_UNIFORM_SET);
+	RD::get_singleton()->draw_list_bind_uniform_set(draw_list, scene_shader.default_vec4_xform_uniform_set, TRANSFORMS_UNIFORM_SET);
+
+	RID prev_material_uniform_set;
+
+	RID prev_vertex_array_rd;
+	RID prev_index_array_rd;
+	RID prev_pipeline_rd;
+	RID prev_xforms_uniform_set;
+
+	bool shadow_pass = (p_params->pass_mode == PASS_MODE_SHADOW) || (p_params->pass_mode == PASS_MODE_SHADOW_DP);
+
+	for (uint32_t i = p_from_element; i < p_to_element; i++) {
+		const GeometryInstanceSurfaceDataCache *surf = p_params->elements[i];
+		const RenderElementInfo &element_info = p_params->element_info[i];
+		const GeometryInstanceForwardMobile *inst = surf->owner;
+
+		// GeometryInstanceForwardMobile::PushConstant push_constant = inst->push_constant;
+		GeometryInstanceForwardMobile::PushConstant push_constant;
+
+		if (inst->store_transform_cache) {
+			RendererStorageRD::store_transform(inst->transform, push_constant.transform);
+		} else {
+			RendererStorageRD::store_transform(Transform(), push_constant.transform);
+		}
+
+		push_constant.flags = inst->flags_cache;
+		push_constant.gi_offset = inst->gi_offset_cache;
+		push_constant.layer_mask = inst->layer_mask;
+		push_constant.instance_uniforms_ofs = uint32_t(inst->shader_parameters_offset);
+
+		if (p_params->pass_mode == PASS_MODE_DEPTH_MATERIAL) {
+			// abuse lightmap_uv_scale[0] here, should not be needed here
+			push_constant.lightmap_uv_scale[0] = p_params->uv_offset.x;
+			push_constant.lightmap_uv_scale[1] = p_params->uv_offset.y;
+		} else {
+			push_constant.lightmap_uv_scale[0] = inst->lightmap_uv_scale.position.x;
+			push_constant.lightmap_uv_scale[1] = inst->lightmap_uv_scale.position.y;
+			push_constant.lightmap_uv_scale[2] = inst->lightmap_uv_scale.size.x;
+			push_constant.lightmap_uv_scale[3] = inst->lightmap_uv_scale.size.y;
+		};
+
+		_fill_instance_indices(inst->omni_lights, inst->omni_light_count, push_constant.omni_lights, inst->spot_lights, inst->spot_light_count, push_constant.spot_lights, inst->reflection_probes, inst->reflection_probe_count, push_constant.reflection_probes, inst->decals, inst->decals_count, push_constant.decals, push_constant.layer_mask);
+
+		RID material_uniform_set;
+		SceneShaderForwardMobile::ShaderData *shader;
+		void *mesh_surface;
+
+		if (shadow_pass) {
+			material_uniform_set = surf->material_uniform_set_shadow;
+			shader = surf->shader_shadow;
+			mesh_surface = surf->surface_shadow;
+
+		} else {
+			material_uniform_set = surf->material_uniform_set;
+			shader = surf->shader;
+			mesh_surface = surf->surface;
+		}
+
+		if (!mesh_surface) {
+			continue;
+		}
+
+		//find cull variant
+		SceneShaderForwardMobile::ShaderData::CullVariant cull_variant;
+
+		if (p_params->pass_mode == PASS_MODE_DEPTH_MATERIAL || ((p_params->pass_mode == PASS_MODE_SHADOW || p_params->pass_mode == PASS_MODE_SHADOW_DP) && surf->flags & GeometryInstanceSurfaceDataCache::FLAG_USES_DOUBLE_SIDED_SHADOWS)) {
+			cull_variant = SceneShaderForwardMobile::ShaderData::CULL_VARIANT_DOUBLE_SIDED;
+		} else {
+			bool mirror = surf->owner->mirror;
+			if (p_params->reverse_cull) {
+				mirror = !mirror;
+			}
+			cull_variant = mirror ? SceneShaderForwardMobile::ShaderData::CULL_VARIANT_REVERSED : SceneShaderForwardMobile::ShaderData::CULL_VARIANT_NORMAL;
+		}
+
+		RS::PrimitiveType primitive = surf->primitive;
+		RID xforms_uniform_set = surf->owner->transforms_uniform_set;
+
+		SceneShaderForwardMobile::ShaderVersion shader_version = SceneShaderForwardMobile::SHADER_VERSION_MAX; // Assigned to silence wrong -Wmaybe-initialized.
+
+		switch (p_params->pass_mode) {
+			case PASS_MODE_COLOR:
+			case PASS_MODE_COLOR_TRANSPARENT: {
+				if (element_info.uses_lightmap) {
+					shader_version = SceneShaderForwardMobile::SHADER_VERSION_LIGHTMAP_COLOR_PASS;
+				} else {
+					shader_version = SceneShaderForwardMobile::SHADER_VERSION_COLOR_PASS;
+				}
+			} break;
+			case PASS_MODE_SHADOW: {
+				shader_version = SceneShaderForwardMobile::SHADER_VERSION_SHADOW_PASS;
+			} break;
+			case PASS_MODE_SHADOW_DP: {
+				shader_version = SceneShaderForwardMobile::SHADER_VERSION_DEPTH_PASS_DP;
+			} break;
+			case PASS_MODE_DEPTH_MATERIAL: {
+				shader_version = SceneShaderForwardMobile::SHADER_VERSION_DEPTH_PASS_WITH_MATERIAL;
+			} break;
+		}
+
+		PipelineCacheRD *pipeline = nullptr;
+
+		pipeline = &shader->pipelines[cull_variant][primitive][shader_version];
+
+		RD::VertexFormatID vertex_format = -1;
+		RID vertex_array_rd;
+		RID index_array_rd;
+
+		//skeleton and blend shape
+		if (surf->owner->mesh_instance.is_valid()) {
+			storage->mesh_instance_surface_get_vertex_arrays_and_format(surf->owner->mesh_instance, surf->surface_index, pipeline->get_vertex_input_mask(), vertex_array_rd, vertex_format);
+		} else {
+			storage->mesh_surface_get_vertex_arrays_and_format(mesh_surface, pipeline->get_vertex_input_mask(), vertex_array_rd, vertex_format);
+		}
+
+		index_array_rd = storage->mesh_surface_get_index_array(mesh_surface, element_info.lod_index);
+
+		if (prev_vertex_array_rd != vertex_array_rd) {
+			RD::get_singleton()->draw_list_bind_vertex_array(draw_list, vertex_array_rd);
+			prev_vertex_array_rd = vertex_array_rd;
+		}
+
+		if (prev_index_array_rd != index_array_rd) {
+			if (index_array_rd.is_valid()) {
+				RD::get_singleton()->draw_list_bind_index_array(draw_list, index_array_rd);
+			}
+			prev_index_array_rd = index_array_rd;
+		}
+
+		RID pipeline_rd = pipeline->get_render_pipeline(vertex_format, framebuffer_format, p_params->force_wireframe);
+
+		if (pipeline_rd != prev_pipeline_rd) {
+			// checking with prev shader does not make so much sense, as
+			// the pipeline may still be different.
+			RD::get_singleton()->draw_list_bind_render_pipeline(draw_list, pipeline_rd);
+			prev_pipeline_rd = pipeline_rd;
+		}
+
+		if (xforms_uniform_set.is_valid() && prev_xforms_uniform_set != xforms_uniform_set) {
+			RD::get_singleton()->draw_list_bind_uniform_set(draw_list, xforms_uniform_set, TRANSFORMS_UNIFORM_SET);
+			prev_xforms_uniform_set = xforms_uniform_set;
+		}
+
+		if (material_uniform_set != prev_material_uniform_set) {
+			//update uniform set
+			if (material_uniform_set.is_valid()) {
+				RD::get_singleton()->draw_list_bind_uniform_set(draw_list, material_uniform_set, MATERIAL_UNIFORM_SET);
+			}
+
+			prev_material_uniform_set = material_uniform_set;
+		}
+
+		RD::get_singleton()->draw_list_set_push_constant(draw_list, &push_constant, sizeof(GeometryInstanceForwardMobile::PushConstant));
+
+		uint32_t instance_count = surf->owner->instance_count > 1 ? surf->owner->instance_count : element_info.repeat;
+		if (surf->flags & GeometryInstanceSurfaceDataCache::FLAG_USES_PARTICLE_TRAILS) {
+			instance_count /= surf->owner->trail_steps;
+		}
+
+		RD::get_singleton()->draw_list_draw(draw_list, index_array_rd.is_valid(), instance_count);
+		i += element_info.repeat - 1; //skip equal elements
+	}
+}
+
+/* Geometry instance */
+
+// Creates the renderer-side geometry instance for the base resource p_base.
+// Returns nullptr (after reporting an error) when the base type reported by storage is not
+// part of RS::INSTANCE_GEOMETRY_MASK. The new instance is marked dirty right away, so its
+// surface caches are built the next time dirty instances are processed.
+RendererSceneRender::GeometryInstance *RenderForwardMobile::geometry_instance_create(RID p_base) {
+	RS::InstanceType type = storage->get_base_type(p_base);
+	ERR_FAIL_COND_V(!((1 << type) & RS::INSTANCE_GEOMETRY_MASK), nullptr);
+
+	GeometryInstanceForwardMobile *ginstance = geometry_instance_alloc.alloc();
+	// Data is allocated separately with memnew(); geometry_instance_free() releases it with memdelete().
+	ginstance->data = memnew(GeometryInstanceForwardMobile::Data);
+
+	ginstance->data->base = p_base;
+	ginstance->data->base_type = type;
+
+	_geometry_instance_mark_dirty(ginstance);
+
+	return ginstance;
+}
+
+// Assigns the skeleton used by the instance. The instance is marked dirty and its dependencies
+// are flagged so they get re-registered during the next _geometry_instance_update().
+void RenderForwardMobile::geometry_instance_set_skeleton(GeometryInstance *p_geometry_instance, RID p_skeleton) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+	ginstance->data->skeleton = p_skeleton;
+
+	_geometry_instance_mark_dirty(ginstance);
+	ginstance->data->dirty_dependencies = true;
+}
+
+// Sets the material that replaces every surface material of the instance (an invalid RID
+// disables the override, see _geometry_instance_add_surface()). The instance is marked dirty
+// and its dependencies are flagged for re-registration.
+void RenderForwardMobile::geometry_instance_set_material_override(GeometryInstance *p_geometry_instance, RID p_override) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+	ginstance->data->material_override = p_override;
+
+	_geometry_instance_mark_dirty(ginstance);
+	ginstance->data->dirty_dependencies = true;
+}
+
+// Stores the per-surface material list of the instance. For mesh instances, a valid entry at
+// index j takes precedence over the mesh's own material for surface j (see
+// _geometry_instance_update()). The instance is marked dirty and its dependencies are flagged
+// for re-registration.
+void RenderForwardMobile::geometry_instance_set_surface_materials(GeometryInstance *p_geometry_instance, const Vector<RID> &p_materials) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+	ginstance->data->surface_materials = p_materials;
+
+	_geometry_instance_mark_dirty(ginstance);
+	ginstance->data->dirty_dependencies = true;
+}
+
+// Stores the mesh instance RID on the geometry instance and marks it dirty.
+// Unlike the material/skeleton setters, this does not flag the dependencies as dirty.
+void RenderForwardMobile::geometry_instance_set_mesh_instance(GeometryInstance *p_geometry_instance, RID p_mesh_instance) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+	ginstance->mesh_instance = p_mesh_instance;
+
+	_geometry_instance_mark_dirty(ginstance);
+}
+
+// Stores the instance transform together with the values derived from it:
+// - mirror: true when the basis determinant is negative.
+// - non_uniform_scale: true when the smallest per-axis scale is below 90% of the largest one.
+// - lod_model_scale: the largest per-axis scale.
+// p_aabb is kept in the instance data, p_transformed_aabb on the instance itself.
+// This setter does not mark the instance dirty.
+void RenderForwardMobile::geometry_instance_set_transform(GeometryInstance *p_geometry_instance, const Transform &p_transform, const AABB &p_aabb, const AABB &p_transformed_aabb) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+	ginstance->transform = p_transform;
+	ginstance->mirror = p_transform.basis.determinant() < 0;
+	ginstance->data->aabb = p_aabb;
+	ginstance->transformed_aabb = p_transformed_aabb;
+
+	// Per-axis scale as reported by Basis::get_scale_abs().
+	Vector3 model_scale_vec = p_transform.basis.get_scale_abs();
+
+	float max_scale = MAX(model_scale_vec.x, MAX(model_scale_vec.y, model_scale_vec.z));
+	float min_scale = MIN(model_scale_vec.x, MIN(model_scale_vec.y, model_scale_vec.z));
+	// The check must be strictly positive: with a fully collapsed basis (max_scale == 0) the
+	// ratio below would be 0 / 0. Such a basis is treated as uniformly scaled.
+	ginstance->non_uniform_scale = max_scale > 0.0 && (min_scale / max_scale) < 0.9;
+
+	ginstance->lod_model_scale = max_scale;
+}
+
+// Stores the render layer mask of the instance. The instance is not marked dirty.
+void RenderForwardMobile::geometry_instance_set_layer_mask(GeometryInstance *p_geometry_instance, uint32_t p_layer_mask) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+	ginstance->layer_mask = p_layer_mask;
+}
+
+// Stores the LOD bias of the instance. The instance is not marked dirty.
+void RenderForwardMobile::geometry_instance_set_lod_bias(GeometryInstance *p_geometry_instance, float p_lod_bias) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+	ginstance->lod_bias = p_lod_bias;
+}
+
+// Records in the instance data whether baked light is used, then marks the instance dirty.
+void RenderForwardMobile::geometry_instance_set_use_baked_light(GeometryInstance *p_geometry_instance, bool p_enable) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+	ginstance->data->use_baked_light = p_enable;
+
+	_geometry_instance_mark_dirty(ginstance);
+}
+
+// Currently a no-op: both arguments are ignored. The implementation is kept commented out
+// until it is decided whether this renderer handles dynamic GI at all
+// (is_dynamic_gi_supported() returns false).
+void RenderForwardMobile::geometry_instance_set_use_dynamic_gi(GeometryInstance *p_geometry_instance, bool p_enable) {
+	// !BAS! do we support this in mobile?
+	// GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	// ERR_FAIL_COND(!ginstance);
+	// ginstance->data->use_dynamic_gi = p_enable;
+	// _geometry_instance_mark_dirty(ginstance);
+}
+
+// Stores the lightmap instance, the lightmap UV rectangle and the lightmap slice index on the
+// geometry instance, then marks it dirty.
+void RenderForwardMobile::geometry_instance_set_use_lightmap(GeometryInstance *p_geometry_instance, RID p_lightmap_instance, const Rect2 &p_lightmap_uv_scale, int p_lightmap_slice_index) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+	ginstance->lightmap_instance = p_lightmap_instance;
+	ginstance->lightmap_uv_scale = p_lightmap_uv_scale;
+	ginstance->lightmap_slice_index = p_lightmap_slice_index;
+	_geometry_instance_mark_dirty(ginstance);
+}
+
+// Sets or clears the lightmap capture coefficients of an instance.
+// A non-null p_sh9 must point to 9 Color values, which are copied into the instance;
+// passing nullptr releases the coefficient storage. The instance is marked dirty either way.
+void RenderForwardMobile::geometry_instance_set_lightmap_capture(GeometryInstance *p_geometry_instance, const Color *p_sh9) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+
+	if (p_sh9 == nullptr) {
+		// Capture removed: hand the coefficient block back to its allocator, if there is one.
+		if (ginstance->lightmap_sh) {
+			geometry_instance_lightmap_sh.free(ginstance->lightmap_sh);
+			ginstance->lightmap_sh = nullptr;
+		}
+	} else {
+		// Allocate the coefficient block lazily, the first time a capture is provided.
+		if (!ginstance->lightmap_sh) {
+			ginstance->lightmap_sh = geometry_instance_lightmap_sh.alloc();
+		}
+		memcpy(ginstance->lightmap_sh->sh, p_sh9, sizeof(Color) * 9);
+	}
+
+	_geometry_instance_mark_dirty(ginstance);
+}
+
+// Stores the offset of the per-instance shader parameters and marks the instance dirty.
+// The draw code passes this value to the shader as push_constant.instance_uniforms_ofs.
+void RenderForwardMobile::geometry_instance_set_instance_shader_parameters_offset(GeometryInstance *p_geometry_instance, int32_t p_offset) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+	ginstance->shader_parameters_offset = p_offset;
+	_geometry_instance_mark_dirty(ginstance);
+}
+
+// Records whether the instance casts double sided shadows and marks it dirty. When surfaces are
+// rebuilt, the setting becomes FLAG_USES_DOUBLE_SIDED_SHADOWS on each surface cache entry.
+void RenderForwardMobile::geometry_instance_set_cast_double_sided_shadows(GeometryInstance *p_geometry_instance, bool p_enable) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+
+	ginstance->data->cast_double_sided_shadows = p_enable;
+	_geometry_instance_mark_dirty(ginstance);
+}
+
+// Returns the transform last stored with geometry_instance_set_transform().
+// A null instance reports an error and yields a default-constructed Transform.
+Transform RenderForwardMobile::geometry_instance_get_transform(GeometryInstance *p_instance) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_instance);
+	ERR_FAIL_COND_V(!ginstance, Transform());
+	return ginstance->transform;
+}
+
+// Returns the AABB kept in the instance data (the p_aabb argument of
+// geometry_instance_set_transform(), not the transformed one).
+// A null instance reports an error and yields a default-constructed AABB.
+AABB RenderForwardMobile::geometry_instance_get_aabb(GeometryInstance *p_instance) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_instance);
+	ERR_FAIL_COND_V(!ginstance, AABB());
+	return ginstance->data->aabb;
+}
+
+// Destroys a geometry instance: releases its lightmap capture block (if any), every cached
+// surface entry, the separately allocated Data and finally the instance itself.
+void RenderForwardMobile::geometry_instance_free(GeometryInstance *p_geometry_instance) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+
+	if (ginstance->lightmap_sh) {
+		geometry_instance_lightmap_sh.free(ginstance->lightmap_sh);
+	}
+
+	// Walk the singly linked list of surface caches; the successor is read before a node is freed.
+	for (GeometryInstanceSurfaceDataCache *node = ginstance->surface_caches; node != nullptr;) {
+		GeometryInstanceSurfaceDataCache *following = node->next;
+		geometry_instance_surface_alloc.free(node);
+		node = following;
+	}
+
+	memdelete(ginstance->data);
+	geometry_instance_alloc.free(ginstance);
+}
+
+// Returns the bitmask (one bit per RS instance type) of the instance types this renderer
+// pairs with geometry: lights, reflection probes and decals.
+uint32_t RenderForwardMobile::geometry_instance_get_pair_mask() {
+	const uint32_t light_bit = 1 << RS::INSTANCE_LIGHT;
+	const uint32_t reflection_probe_bit = 1 << RS::INSTANCE_REFLECTION_PROBE;
+	const uint32_t decal_bit = 1 << RS::INSTANCE_DECAL;
+
+	// The three bits are distinct, so combining them with OR equals their sum.
+	return light_bit | reflection_probe_bit | decal_bit;
+}
+
+// Replaces the omni and spot light lists of an instance with the lights found in
+// p_light_instances. Each list holds at most MAX_RDL_CULL entries; lights beyond that limit,
+// and lights of any other type, are dropped.
+void RenderForwardMobile::geometry_instance_pair_light_instances(GeometryInstance *p_geometry_instance, const RID *p_light_instances, uint32_t p_light_instance_count) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+
+	// Both lists are rebuilt from scratch on every call.
+	ginstance->omni_light_count = 0;
+	ginstance->spot_light_count = 0;
+
+	const uint32_t capacity = (uint32_t)MAX_RDL_CULL;
+
+	for (uint32_t idx = 0; idx < p_light_instance_count; idx++) {
+		const RID &light = p_light_instances[idx];
+		const RS::LightType light_type = light_instance_get_type(light);
+
+		if (light_type == RS::LIGHT_OMNI) {
+			if (ginstance->omni_light_count < capacity) {
+				ginstance->omni_lights[ginstance->omni_light_count++] = light;
+			}
+		} else if (light_type == RS::LIGHT_SPOT) {
+			if (ginstance->spot_light_count < capacity) {
+				ginstance->spot_lights[ginstance->spot_light_count++] = light;
+			}
+		}
+	}
+}
+
+// Replaces the reflection probe list of an instance with the first entries of
+// p_reflection_probe_instances, keeping at most MAX_RDL_CULL of them.
+void RenderForwardMobile::geometry_instance_pair_reflection_probe_instances(GeometryInstance *p_geometry_instance, const RID *p_reflection_probe_instances, uint32_t p_reflection_probe_instance_count) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+
+	// Clamp to the capacity of the per-instance array; surplus probes are ignored.
+	ginstance->reflection_probe_count = MIN(p_reflection_probe_instance_count, (uint32_t)MAX_RDL_CULL);
+
+	for (uint32_t slot = 0; slot < ginstance->reflection_probe_count; slot++) {
+		ginstance->reflection_probes[slot] = p_reflection_probe_instances[slot];
+	}
+}
+
+// Replaces the decal list of an instance with the first entries of p_decal_instances,
+// keeping at most MAX_RDL_CULL of them.
+void RenderForwardMobile::geometry_instance_pair_decal_instances(GeometryInstance *p_geometry_instance, const RID *p_decal_instances, uint32_t p_decal_instance_count) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+	ERR_FAIL_COND(!ginstance);
+
+	// Clamp to the capacity of the per-instance array; surplus decals are ignored.
+	ginstance->decals_count = MIN(p_decal_instance_count, (uint32_t)MAX_RDL_CULL);
+
+	for (uint32_t slot = 0; slot < ginstance->decals_count; slot++) {
+		ginstance->decals[slot] = p_decal_instances[slot];
+	}
+}
+
+// Intentionally empty: this renderer keeps no per-instance GI probe list, so all arguments
+// are ignored.
+void RenderForwardMobile::geometry_instance_pair_gi_probe_instances(GeometryInstance *p_geometry_instance, const RID *p_gi_probe_instances, uint32_t p_gi_probe_instance_count) {
+	// We do not have this here!
+}
+
+// Queues an instance for rebuilding. Its cached surface entries are released immediately and
+// the instance is added to the dirty list. An instance that is already in the dirty list is
+// left untouched.
+void RenderForwardMobile::_geometry_instance_mark_dirty(GeometryInstance *p_geometry_instance) {
+	GeometryInstanceForwardMobile *instance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+
+	if (!instance->dirty_list_element.in_list()) {
+		// Release every cached surface; the successor is read before a node is freed.
+		for (GeometryInstanceSurfaceDataCache *node = instance->surface_caches; node != nullptr;) {
+			GeometryInstanceSurfaceDataCache *following = node->next;
+			geometry_instance_surface_alloc.free(node);
+			node = following;
+		}
+		instance->surface_caches = nullptr;
+
+		geometry_instance_dirty_list.add(&instance->dirty_list_element);
+	}
+}
+
+// Creates the surface cache entry for one (surface, material pass) pair of an instance.
+//
+// The entry records which passes the surface takes part in (flags), the shader, material
+// uniform set and surface data for regular and for shadow rendering, and the sort key fields.
+// It is pushed at the head of ginstance->surface_caches.
+void RenderForwardMobile::_geometry_instance_add_surface_with_material(GeometryInstanceForwardMobile *ginstance, uint32_t p_surface, SceneShaderForwardMobile::MaterialData *p_material, uint32_t p_material_id, uint32_t p_shader_id, RID p_mesh) {
+	SceneShaderForwardMobile::ShaderData *sd = p_material->shader_data;
+
+	// Classify the material from the usage information of its shader.
+	const bool reads_screen = sd->uses_screen_texture || sd->uses_depth_texture || sd->uses_normal_texture;
+	const bool any_alpha = sd->uses_alpha || reads_screen || sd->uses_blend_alpha;
+	const bool depth_disabled = sd->depth_draw == SceneShaderForwardMobile::ShaderData::DEPTH_DRAW_DISABLED || sd->depth_test == SceneShaderForwardMobile::ShaderData::DEPTH_TEST_DISABLED;
+
+	uint32_t flags = 0;
+
+	// Pass membership.
+	if (any_alpha || depth_disabled) {
+		// Material is only meant for the alpha pass; it still writes depth and shadows when it
+		// requests a depth pre-pass and depth is not disabled.
+		flags |= GeometryInstanceSurfaceDataCache::FLAG_PASS_ALPHA;
+		if (sd->uses_depth_pre_pass && !depth_disabled) {
+			flags |= GeometryInstanceSurfaceDataCache::FLAG_PASS_DEPTH | GeometryInstanceSurfaceDataCache::FLAG_PASS_SHADOW;
+		}
+	} else {
+		flags |= GeometryInstanceSurfaceDataCache::FLAG_PASS_OPAQUE | GeometryInstanceSurfaceDataCache::FLAG_PASS_DEPTH | GeometryInstanceSurfaceDataCache::FLAG_PASS_SHADOW;
+	}
+
+	// Feature usage.
+	if (sd->uses_sss) {
+		flags |= GeometryInstanceSurfaceDataCache::FLAG_USES_SUBSURFACE_SCATTERING;
+	}
+	if (sd->uses_screen_texture) {
+		flags |= GeometryInstanceSurfaceDataCache::FLAG_USES_SCREEN_TEXTURE;
+	}
+	if (sd->uses_depth_texture) {
+		flags |= GeometryInstanceSurfaceDataCache::FLAG_USES_DEPTH_TEXTURE;
+	}
+	if (sd->uses_normal_texture) {
+		flags |= GeometryInstanceSurfaceDataCache::FLAG_USES_NORMAL_TEXTURE;
+	}
+	if (sd->uses_particle_trails) {
+		flags |= GeometryInstanceSurfaceDataCache::FLAG_USES_PARTICLE_TRAILS;
+	}
+	if (ginstance->data->cast_double_sided_shadows) {
+		flags |= GeometryInstanceSurfaceDataCache::FLAG_USES_DOUBLE_SIDED_SHADOWS;
+	}
+
+	// Shadow rendering uses the default material (and the shadow mesh, when one exists) unless
+	// the shader does something that requires the material's own shader.
+	const bool needs_own_shadow_material = sd->uses_particle_trails || sd->writes_modelview_or_projection || sd->uses_vertex || sd->uses_discard || sd->uses_depth_pre_pass;
+
+	SceneShaderForwardMobile::MaterialData *material_shadow = p_material;
+	void *surface_shadow = nullptr;
+
+	if (!needs_own_shadow_material) {
+		flags |= GeometryInstanceSurfaceDataCache::FLAG_USES_SHARED_SHADOW_MATERIAL;
+		material_shadow = (SceneShaderForwardMobile::MaterialData *)storage->material_get_data(scene_shader.default_material, RendererStorageRD::SHADER_TYPE_3D);
+
+		RID shadow_mesh = storage->mesh_get_shadow_mesh(p_mesh);
+		if (shadow_mesh.is_valid()) {
+			surface_shadow = storage->mesh_get_surface(shadow_mesh, p_surface);
+		}
+	}
+
+	GeometryInstanceSurfaceDataCache *entry = geometry_instance_surface_alloc.alloc();
+
+	entry->flags = flags;
+
+	// Regular rendering.
+	entry->shader = sd;
+	entry->material_uniform_set = p_material->uniform_set;
+	entry->surface = storage->mesh_get_surface(p_mesh, p_surface);
+	entry->primitive = storage->mesh_surface_get_primitive(entry->surface);
+	entry->surface_index = p_surface;
+
+	if (ginstance->data->dirty_dependencies) {
+		storage->base_update_dependency(p_mesh, &ginstance->data->dependency_tracker);
+	}
+
+	// Shadow rendering; falls back to the regular surface when there is no shadow surface.
+	entry->shader_shadow = material_shadow->shader_data;
+	entry->material_uniform_set_shadow = material_shadow->uniform_set;
+	if (surface_shadow) {
+		entry->surface_shadow = surface_shadow;
+	} else {
+		entry->surface_shadow = entry->surface;
+	}
+
+	// Link the entry at the head of the instance's list.
+	entry->owner = ginstance;
+	entry->next = ginstance->surface_caches;
+	ginstance->surface_caches = entry;
+
+	// Sort key: clear both key words first, then fill in the individual fields.
+	entry->sort.sort_key1 = 0;
+	entry->sort.sort_key2 = 0;
+
+	entry->sort.surface_index = p_surface;
+	entry->sort.material_id_low = p_material_id & 0x0000FFFF;
+	entry->sort.material_id_hi = p_material_id >> 16;
+	entry->sort.shader_id = p_shader_id;
+	entry->sort.geometry_id = p_mesh.get_local_index(); //only meshes can repeat anyway
+	// entry->sort.uses_forward_gi = ginstance->can_sdfgi;
+	entry->sort.priority = p_material->priority;
+}
+
+// Adds the surface cache entries for one surface of p_mesh: one for the resolved material and
+// one for every valid material reached by following its next_pass chain.
+//
+// The instance's material override, when valid, replaces p_material. If the chosen material
+// has no usable data or an invalid shader, the scene shader's default material is used.
+void RenderForwardMobile::_geometry_instance_add_surface(GeometryInstanceForwardMobile *ginstance, uint32_t p_surface, RID p_material, RID p_mesh) {
+	// The override, when set, wins over the surface material.
+	RID m_src = p_material;
+	if (ginstance->data->material_override.is_valid()) {
+		m_src = ginstance->data->material_override;
+	}
+
+	SceneShaderForwardMobile::MaterialData *material = nullptr;
+
+	if (m_src.is_valid()) {
+		SceneShaderForwardMobile::MaterialData *candidate = (SceneShaderForwardMobile::MaterialData *)storage->material_get_data(m_src, RendererStorageRD::SHADER_TYPE_3D);
+		if (candidate && candidate->shader_data->valid) {
+			material = candidate;
+		}
+	}
+
+	if (!material) {
+		// Nothing usable: fall back to the default material. It is not registered as a dependency.
+		m_src = scene_shader.default_material;
+		material = (SceneShaderForwardMobile::MaterialData *)storage->material_get_data(scene_shader.default_material, RendererStorageRD::SHADER_TYPE_3D);
+	} else if (ginstance->data->dirty_dependencies) {
+		storage->material_update_dependency(m_src, &ginstance->data->dependency_tracker);
+	}
+
+	ERR_FAIL_COND(!material);
+
+	_geometry_instance_add_surface_with_material(ginstance, p_surface, material, m_src.get_local_index(), storage->material_get_shader_id(m_src), p_mesh);
+
+	// Follow the next_pass chain; it stops at the first pass that is missing or has an invalid shader.
+	for (RID pass = material->next_pass; pass.is_valid(); pass = material->next_pass) {
+		material = (SceneShaderForwardMobile::MaterialData *)storage->material_get_data(pass, RendererStorageRD::SHADER_TYPE_3D);
+		if (!material || !material->shader_data->valid) {
+			break;
+		}
+		if (ginstance->data->dirty_dependencies) {
+			storage->material_update_dependency(pass, &ginstance->data->dependency_tracker);
+		}
+		_geometry_instance_add_surface_with_material(ginstance, p_surface, material, pass.get_local_index(), storage->material_get_shader_id(pass), p_mesh);
+	}
+}
+
+// Rebuilds the cached render state of a dirty geometry instance and removes it from the
+// dirty list.
+//
+// Steps, in order:
+// 1. When the dependencies are flagged dirty, they are re-registered between
+//    dependency_tracker.update_begin() and update_end().
+// 2. One surface cache entry is created per surface (and per material pass) of the base mesh,
+//    the multimesh's mesh or the particle draw-pass meshes, and instance_count is refreshed.
+// 3. base_flags, transforms_uniform_set and store_transform_cache are recomputed from the
+//    base type.
+void RenderForwardMobile::_geometry_instance_update(GeometryInstance *p_geometry_instance) {
+	GeometryInstanceForwardMobile *ginstance = static_cast<GeometryInstanceForwardMobile *>(p_geometry_instance);
+
+	if (ginstance->data->dirty_dependencies) {
+		ginstance->data->dependency_tracker.update_begin();
+	}
+
+	//add geometry for drawing
+	switch (ginstance->data->base_type) {
+		case RS::INSTANCE_MESH: {
+			const RID *materials = nullptr;
+			uint32_t surface_count;
+			RID mesh = ginstance->data->base;
+
+			materials = storage->mesh_get_surface_count_and_materials(mesh, surface_count);
+			if (materials) {
+				//if no materials, no surfaces.
+				const RID *inst_materials = ginstance->data->surface_materials.ptr();
+				uint32_t surf_mat_count = ginstance->data->surface_materials.size();
+
+				for (uint32_t j = 0; j < surface_count; j++) {
+					// A valid per-instance surface material takes precedence over the mesh's own.
+					RID material = (j < surf_mat_count && inst_materials[j].is_valid()) ? inst_materials[j] : materials[j];
+					_geometry_instance_add_surface(ginstance, j, material, mesh);
+				}
+			}
+
+			ginstance->instance_count = 1;
+
+		} break;
+
+		case RS::INSTANCE_MULTIMESH: {
+			RID mesh = storage->multimesh_get_mesh(ginstance->data->base);
+			if (mesh.is_valid()) {
+				const RID *materials = nullptr;
+				uint32_t surface_count;
+
+				materials = storage->mesh_get_surface_count_and_materials(mesh, surface_count);
+				if (materials) {
+					for (uint32_t j = 0; j < surface_count; j++) {
+						_geometry_instance_add_surface(ginstance, j, materials[j], mesh);
+					}
+				}
+
+				ginstance->instance_count = storage->multimesh_get_instances_to_draw(ginstance->data->base);
+			}
+
+		} break;
+#if 0
+		case RS::INSTANCE_IMMEDIATE: {
+			RasterizerStorageGLES3::Immediate *immediate = storage->immediate_owner.getornull(inst->base);
+			ERR_CONTINUE(!immediate);
+
+			_add_geometry(immediate, inst, nullptr, -1, p_depth_pass, p_shadow_pass);
+
+		} break;
+#endif
+		case RS::INSTANCE_PARTICLES: {
+			int draw_passes = storage->particles_get_draw_passes(ginstance->data->base);
+
+			for (int j = 0; j < draw_passes; j++) {
+				RID mesh = storage->particles_get_draw_pass_mesh(ginstance->data->base, j);
+				if (!mesh.is_valid()) {
+					continue;
+				}
+
+				const RID *materials = nullptr;
+				uint32_t surface_count;
+
+				materials = storage->mesh_get_surface_count_and_materials(mesh, surface_count);
+				if (materials) {
+					for (uint32_t k = 0; k < surface_count; k++) {
+						_geometry_instance_add_surface(ginstance, k, materials[k], mesh);
+					}
+				}
+			}
+
+			// trail_steps is passed to particles_get_amount() here and read back right below
+			// when the flags are built.
+			ginstance->instance_count = storage->particles_get_amount(ginstance->data->base, ginstance->trail_steps);
+
+		} break;
+
+		default: {
+		}
+	}
+
+	//Fill push constant
+
+	// Start from a clean slate: every flag below is OR-ed in, so bits left over from a previous
+	// update (a changed multimesh format, a different trail step count) would otherwise stick.
+	// NOTE(review): assumes base_flags has no writer other than this function - confirm.
+	ginstance->base_flags = 0;
+
+	bool store_transform = true;
+
+	if (ginstance->data->base_type == RS::INSTANCE_MULTIMESH) {
+		ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH;
+		if (storage->multimesh_get_transform_format(ginstance->data->base) == RS::MULTIMESH_TRANSFORM_2D) {
+			ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH_FORMAT_2D;
+		}
+		if (storage->multimesh_uses_colors(ginstance->data->base)) {
+			ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH_HAS_COLOR;
+		}
+		if (storage->multimesh_uses_custom_data(ginstance->data->base)) {
+			ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH_HAS_CUSTOM_DATA;
+		}
+
+		ginstance->transforms_uniform_set = storage->multimesh_get_3d_uniform_set(ginstance->data->base, scene_shader.default_shader_rd, TRANSFORMS_UNIFORM_SET);
+
+	} else if (ginstance->data->base_type == RS::INSTANCE_PARTICLES) {
+		// Particles reuse the multimesh flags, always with color and custom data.
+		ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH;
+		if (false) { // 2D particles
+			ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH_FORMAT_2D;
+		}
+
+		ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH_HAS_COLOR;
+		ginstance->base_flags |= INSTANCE_DATA_FLAG_MULTIMESH_HAS_CUSTOM_DATA;
+
+		//for particles, stride is the trail size
+		ginstance->base_flags |= (ginstance->trail_steps << INSTANCE_DATA_FLAGS_PARTICLE_TRAIL_SHIFT);
+
+		// Particles that do not use local coordinates do not store the instance transform.
+		if (!storage->particles_is_using_local_coords(ginstance->data->base)) {
+			store_transform = false;
+		}
+		ginstance->transforms_uniform_set = storage->particles_get_instance_buffer_uniform_set(ginstance->data->base, scene_shader.default_shader_rd, TRANSFORMS_UNIFORM_SET);
+
+	} else if (ginstance->data->base_type == RS::INSTANCE_MESH) {
+		if (storage->skeleton_is_valid(ginstance->data->skeleton)) {
+			ginstance->transforms_uniform_set = storage->skeleton_get_3d_uniform_set(ginstance->data->skeleton, scene_shader.default_shader_rd, TRANSFORMS_UNIFORM_SET);
+			if (ginstance->data->dirty_dependencies) {
+				storage->skeleton_update_dependency(ginstance->data->skeleton, &ginstance->data->dependency_tracker);
+			}
+		}
+	}
+
+	ginstance->store_transform_cache = store_transform;
+
+	if (ginstance->data->dirty_dependencies) {
+		ginstance->data->dependency_tracker.update_end();
+		ginstance->data->dirty_dependencies = false;
+	}
+
+	// Removing the element is what lets _update_dirty_geometry_instances() make progress.
+	ginstance->dirty_list_element.remove_from_list();
+}
+
+// Updates every instance in the dirty list. The loop always takes the first element and
+// terminates because _geometry_instance_update() removes the instance it processed from the list.
+void RenderForwardMobile::_update_dirty_geometry_instances() {
+	while (geometry_instance_dirty_list.first()) {
+		_geometry_instance_update(geometry_instance_dirty_list.first()->self());
+	}
+}
+
+// Dependency tracker callback, invoked when something an instance depends on has changed.
+// p_tracker->userdata holds the affected geometry instance.
+// - Material, mesh, particles, multimesh and skeleton data changes mark the instance dirty.
+// - A change of a multimesh's visible instance count only refreshes instance_count.
+// - All other notifications are ignored.
+void RenderForwardMobile::_geometry_instance_dependency_changed(RendererStorage::DependencyChangedNotification p_notification, RendererStorage::DependencyTracker *p_tracker) {
+	RenderForwardMobile *renderer = static_cast<RenderForwardMobile *>(singleton);
+
+	switch (p_notification) {
+		case RendererStorage::DEPENDENCY_CHANGED_MATERIAL:
+		case RendererStorage::DEPENDENCY_CHANGED_MESH:
+		case RendererStorage::DEPENDENCY_CHANGED_PARTICLES:
+		case RendererStorage::DEPENDENCY_CHANGED_MULTIMESH:
+		case RendererStorage::DEPENDENCY_CHANGED_SKELETON_DATA: {
+			renderer->_geometry_instance_mark_dirty(static_cast<GeometryInstance *>(p_tracker->userdata));
+		} break;
+		case RendererStorage::DEPENDENCY_CHANGED_MULTIMESH_VISIBLE_INSTANCES: {
+			GeometryInstanceForwardMobile *affected = static_cast<GeometryInstanceForwardMobile *>(p_tracker->userdata);
+			if (affected->data->base_type == RS::INSTANCE_MULTIMESH) {
+				// No rebuild needed, only the number of instances to draw changed.
+				affected->instance_count = renderer->storage->multimesh_get_instances_to_draw(affected->data->base);
+			}
+		} break;
+		default: {
+			//rest of notifications of no interest
+		} break;
+	}
+}
+// Dependency tracker callback, invoked when a dependency of an instance has been deleted.
+// The instance stored in p_tracker->userdata is marked dirty; p_dependency itself is not used.
+void RenderForwardMobile::_geometry_instance_dependency_deleted(const RID &p_dependency, RendererStorage::DependencyTracker *p_tracker) {
+	static_cast<RenderForwardMobile *>(singleton)->_geometry_instance_mark_dirty(static_cast<GeometryInstance *>(p_tracker->userdata));
+}
+
+/* misc */
+
+// Always false: this renderer reports dynamic GI as unsupported.
+bool RenderForwardMobile::is_dynamic_gi_supported() const {
+	return false;
+}
+
+// Always false: this renderer reports clustering as disabled.
+bool RenderForwardMobile::is_clustered_enabled() const {
+	return false;
+}
+
+// Always false: this renderer reports volumetric effects as unsupported.
+bool RenderForwardMobile::is_volumetric_supported() const {
+	return false;
+}
+
+// Returns the fixed element limit of this renderer (256).
+// NOTE(review): which elements this limit applies to is decided by the callers - confirm there.
+uint32_t RenderForwardMobile::get_max_elements() const {
+	return 256;
+}
+
+// Points at the renderer set up by the constructor; the dependency callbacks use it to reach the instance.
+RenderForwardMobile *RenderForwardMobile::singleton = nullptr;
+
+// Sets up the renderer: registers the singleton, creates the lightmap and lightmap capture
+// buffers, initializes the scene shader with the matching preprocessor defines and reads the
+// threaded rendering threshold from the project settings.
+RenderForwardMobile::RenderForwardMobile(RendererStorageRD *p_storage) :
+		RendererSceneRenderRD(p_storage) {
+	singleton = this;
+
+	// Preprocessor defines handed to the scene shader, accumulated in declaration order.
+	String defines;
+
+	defines += "\n#define MAX_ROUGHNESS_LOD " + itos(get_roughness_layers() - 1) + ".0\n";
+	if (is_using_radiance_cubemap_array()) {
+		defines += "\n#define USE_RADIANCE_CUBEMAP_ARRAY \n";
+	}
+	// defines += "\n#define SDFGI_OCT_SIZE " + itos(gi.sdfgi_get_lightprobe_octahedron_size()) + "\n";
+	defines += "\n#define MAX_DIRECTIONAL_LIGHT_DATA_STRUCTS " + itos(get_max_directional_lights()) + "\n";
+
+	// Lightmaps: the same limit is exposed to the shader under two names.
+	scene_state.max_lightmaps = 2;
+	const String max_lightmaps_text = itos(scene_state.max_lightmaps);
+	defines += "\n#define MAX_LIGHTMAP_TEXTURES " + max_lightmaps_text + "\n";
+	defines += "\n#define MAX_LIGHTMAPS " + max_lightmaps_text + "\n";
+	scene_state.lightmap_buffer = RD::get_singleton()->storage_buffer_create(sizeof(LightmapData) * scene_state.max_lightmaps);
+
+	// Lightmap captures: a CPU-side array plus a storage buffer of the same element count.
+	scene_state.max_lightmap_captures = 2048;
+	scene_state.lightmap_captures = memnew_arr(LightmapCaptureData, scene_state.max_lightmap_captures);
+	scene_state.lightmap_capture_buffer = RD::get_singleton()->storage_buffer_create(sizeof(LightmapCaptureData) * scene_state.max_lightmap_captures);
+
+	defines += "\n#define MATERIAL_UNIFORM_SET " + itos(MATERIAL_UNIFORM_SET) + "\n";
+
+	scene_shader.init(p_storage, defines);
+
+	// !BAS! maybe we need a mobile version of this setting?
+	render_list_thread_threshold = GLOBAL_GET("rendering/limits/forward_renderer/threaded_render_minimum_instances");
+}
+
+// Releases what the renderer owns: the directional shadow atlas (resized to 0), the render
+// pass uniform sets that are still valid, the scene state uniform buffers, both lightmap
+// buffers and the CPU-side lightmap capture array.
+RenderForwardMobile::~RenderForwardMobile() {
+	directional_shadow_atlas_set_size(0);
+
+	RD *device = RD::get_singleton();
+
+	//clear base uniform set if still valid
+	for (uint32_t set_idx = 0; set_idx < render_pass_uniform_sets.size(); set_idx++) {
+		if (!render_pass_uniform_sets[set_idx].is_valid()) {
+			continue;
+		}
+		if (device->uniform_set_is_valid(render_pass_uniform_sets[set_idx])) {
+			device->free(render_pass_uniform_sets[set_idx]);
+		}
+	}
+
+	for (uint32_t buffer_idx = 0; buffer_idx < scene_state.uniform_buffers.size(); buffer_idx++) {
+		device->free(scene_state.uniform_buffers[buffer_idx]);
+	}
+
+	device->free(scene_state.lightmap_buffer);
+	device->free(scene_state.lightmap_capture_buffer);
+	memdelete_arr(scene_state.lightmap_captures);
+}
diff --git a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h
new file mode 100644
index 0000000000..d356d88335
--- /dev/null
+++ b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h
@@ -0,0 +1,604 @@
+/*************************************************************************/
+/*  render_forward_mobile.h                                              */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef RENDERING_SERVER_SCENE_RENDER_FORWARD_MOBILE_H
+#define RENDERING_SERVER_SCENE_RENDER_FORWARD_MOBILE_H
+
+#include "core/templates/paged_allocator.h"
+#include "servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.h"
+#include "servers/rendering/renderer_rd/pipeline_cache_rd.h"
+#include "servers/rendering/renderer_rd/renderer_scene_render_rd.h"
+#include "servers/rendering/renderer_rd/renderer_storage_rd.h"
+
+namespace RendererSceneRenderImplementation {
+
+class RenderForwardMobile : public RendererSceneRenderRD {
+	friend SceneShaderForwardMobile;
+
+protected:
+	/* Scene Shader */
+
+	enum { // Uniform set indices used by the scene shader.
+		SCENE_UNIFORM_SET = 0,
+		RENDER_PASS_UNIFORM_SET = 1, // SceneState::UBO is documented as being loaded into set 1, binding 0.
+		TRANSFORMS_UNIFORM_SET = 2,
+		MATERIAL_UNIFORM_SET = 3 // Also handed to the shader source as a #define of the same name.
+	};
+
+	enum { // Compile-time limits.
+		MAX_LIGHTMAPS = 8, // sizes the fixed lightmap arrays in SceneState (lightmaps, lightmap_ids, lightmap_has_sh)
+		MAX_RDL_CULL = 8, // maximum number of reflection probes, decals or lights we can cull per geometry instance
+		INSTANCE_DATA_BUFFER_MIN_SIZE = 4096 // NOTE(review): not referenced anywhere else in this header - confirm it is still needed.
+	};
+
+	enum RenderListType { // Index into the render_list[] member.
+		RENDER_LIST_OPAQUE, //used for opaque objects
+		RENDER_LIST_ALPHA, //used for transparent objects
+		RENDER_LIST_SECONDARY, //used for shadows and other objects
+		RENDER_LIST_MAX // number of list types; used as the size of render_list[]
+	};
+
+	/* Scene Shader */
+
+	SceneShaderForwardMobile scene_shader;
+
+	/* Render Buffer */
+
+	struct RenderBufferDataForwardMobile : public RenderBufferData { // Render buffer data specific to this renderer.
+		RID color;
+		RID depth;
+		// RID normal_roughness_buffer;
+
+		RS::ViewportMSAA msaa; // no default initializer; presumably assigned by configure() - TODO confirm
+		RD::TextureSamples texture_samples; // no default initializer; presumably derived from msaa - TODO confirm
+
+		RID color_msaa; // presumably the multisampled counterpart of color - TODO confirm
+		RID depth_msaa; // presumably the multisampled counterpart of depth - TODO confirm
+		// RID normal_roughness_buffer_msaa;
+
+		RID color_fb;
+		int width, height; // no default initializers; presumably assigned by configure() - TODO confirm
+
+		void clear();
+		virtual void configure(RID p_color_buffer, RID p_depth_buffer, int p_width, int p_height, RS::ViewportMSAA p_msaa);
+
+		~RenderBufferDataForwardMobile();
+	};
+
+	virtual RenderBufferData *_create_render_buffer_data();
+
+	/* Rendering */
+
+	enum PassMode { // Kind of pass a render list is drawn with; also the template argument of _render_list_template.
+		PASS_MODE_COLOR, // default pass mode of RenderListParameters
+		// PASS_MODE_COLOR_SPECULAR,
+		PASS_MODE_COLOR_TRANSPARENT,
+		PASS_MODE_SHADOW,
+		PASS_MODE_SHADOW_DP,
+		// PASS_MODE_DEPTH,
+		// PASS_MODE_DEPTH_NORMAL_ROUGHNESS,
+		// PASS_MODE_DEPTH_NORMAL_ROUGHNESS_GIPROBE,
+		PASS_MODE_DEPTH_MATERIAL,
+		// PASS_MODE_SDF,
+	}; // The commented-out entries are modes that are not enabled in this renderer.
+
+	struct GeometryInstanceForwardMobile;
+	struct GeometryInstanceSurfaceDataCache;
+	struct RenderElementInfo;
+
+	struct RenderListParameters { // Bundle of arguments handed to the _render_list* functions by pointer.
+		GeometryInstanceSurfaceDataCache **elements = nullptr;
+		RenderElementInfo *element_info = nullptr;
+		int element_count = 0;
+		bool reverse_cull = false;
+		PassMode pass_mode = PASS_MODE_COLOR;
+		// bool no_gi = false;
+		RID render_pass_uniform_set;
+		bool force_wireframe = false;
+		Vector2 uv_offset;
+		Plane lod_plane;
+		float lod_distance_multiplier = 0.0;
+		float screen_lod_threshold = 0.0;
+		RD::FramebufferFormatID framebuffer_format = 0; // not assigned by the constructor below; stays 0 until set explicitly
+		uint32_t element_offset = 0;
+		uint32_t barrier = RD::BARRIER_MASK_ALL;
+
+		RenderListParameters(GeometryInstanceSurfaceDataCache **p_elements, RenderElementInfo *p_element_info, int p_element_count, bool p_reverse_cull, PassMode p_pass_mode, RID p_render_pass_uniform_set, bool p_force_wireframe = false, const Vector2 &p_uv_offset = Vector2(), const Plane &p_lod_plane = Plane(), float p_lod_distance_multiplier = 0.0, float p_screen_lod_threshold = 0.0, uint32_t p_element_offset = 0, uint32_t p_barrier = RD::BARRIER_MASK_ALL) {
+			elements = p_elements; // every argument is copied verbatim into the member of the same name
+			element_info = p_element_info;
+			element_count = p_element_count;
+			reverse_cull = p_reverse_cull;
+			pass_mode = p_pass_mode;
+			// no_gi = p_no_gi;
+			render_pass_uniform_set = p_render_pass_uniform_set;
+			force_wireframe = p_force_wireframe;
+			uv_offset = p_uv_offset;
+			lod_plane = p_lod_plane;
+			lod_distance_multiplier = p_lod_distance_multiplier;
+			screen_lod_threshold = p_screen_lod_threshold;
+			element_offset = p_element_offset;
+			barrier = p_barrier;
+		}
+	};
+
+	RID _setup_render_pass_uniform_set(RenderListType p_render_list, RID p_render_buffers, RID p_radiance_texture, RID p_shadow_atlas, RID p_reflection_atlas, const PagedArray<RID> &p_lightmaps, bool p_use_directional_shadow_atlas = false, int p_index = 0);
+	virtual void _render_scene(RID p_render_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_cluster_buffer, uint32_t p_cluster_size, uint32_t p_cluster_max_elements, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, const Color &p_default_bg_color, float p_screen_lod_threshold);
+
+	virtual void _render_shadow_begin();
+	virtual void _render_shadow_append(RID p_framebuffer, const PagedArray<GeometryInstance *> &p_instances, const CameraMatrix &p_projection, const Transform &p_transform, float p_zfar, float p_bias, float p_normal_bias, bool p_use_dp, bool p_use_dp_flip, bool p_use_pancake, const Plane &p_camera_plane = Plane(), float p_lod_distance_multiplier = 0.0, float p_screen_lod_threshold = 0.0, const Rect2i &p_rect = Rect2i(), bool p_flip_y = false, bool p_clear_region = true, bool p_begin = true, bool p_end = true);
+	virtual void _render_shadow_process();
+	virtual void _render_shadow_end(uint32_t p_barrier = RD::BARRIER_MASK_ALL);
+
+	virtual void _render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region);
+	virtual void _render_uv2(const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region);
+	virtual void _render_sdfgi(RID p_render_buffers, const Vector3i &p_from, const Vector3i &p_size, const AABB &p_bounds, const PagedArray<GeometryInstance *> &p_instances, const RID &p_albedo_texture, const RID &p_emission_texture, const RID &p_emission_aniso_texture, const RID &p_geom_facing_texture);
+	virtual void _render_particle_collider_heightfield(RID p_fb, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, const PagedArray<GeometryInstance *> &p_instances);
+
+	uint64_t lightmap_texture_array_version = 0xFFFFFFFF;
+
+	virtual void _base_uniforms_changed();
+	void _update_render_base_uniform_set();
+	virtual RID _render_buffers_get_normal_texture(RID p_render_buffers);
+
+	void _fill_render_list(RenderListType p_render_list, const PagedArray<GeometryInstance *> &p_instances, PassMode p_pass_mode, const CameraMatrix &p_cam_projection, const Transform &p_cam_transform, const Plane &p_lod_camera_plane = Plane(), float p_lod_distance_multiplier = 0.0, float p_screen_lod_threshold = 0.0, bool p_append = false);
+	void _fill_instance_data(RenderListType p_render_list, uint32_t p_offset = 0, int32_t p_max_elements = -1, bool p_update_buffer = true);
+	// void _update_instance_data_buffer(RenderListType p_render_list);
+
+	static RenderForwardMobile *singleton;
+
+	void _setup_environment(RID p_environment, RID p_render_buffers, const CameraMatrix &p_cam_projection, const Transform &p_cam_transform, RID p_reflection_probe, bool p_no_fog, const Size2i &p_screen_size, RID p_shadow_atlas, bool p_flip_y, const Color &p_default_bg_color, float p_znear, float p_zfar, bool p_opaque_render_buffers = false, bool p_pancake_shadows = false, int p_index = 0);
+	void _setup_lightmaps(const PagedArray<RID> &p_lightmaps, const Transform &p_cam_transform);
+
+	RID render_base_uniform_set;
+	LocalVector<RID> render_pass_uniform_sets;
+
+	/* Light map */
+
+	struct LightmapData { // Element type of scene_state.lightmaps; its size also determines the lightmap storage buffer size.
+		float normal_xform[12]; // 12 floats - presumably a 3x4 transform; TODO confirm against the shader
+	};
+
+	struct LightmapCaptureData { // Element type of scene_state.lightmap_captures.
+		float sh[9 * 4]; // 36 floats - presumably 9 coefficients of 4 floats each (cf. Color sh[9] in GeometryInstanceLightmapSH); TODO confirm
+	};
+
+	/* Scene state */
+
+	struct SceneState { // Renderer-wide scene state; instantiated once as the scene_state member declared at the closing brace.
+		// This struct is loaded into Set 1 - Binding 0, populated at start of rendering a frame, must match with shader code
+		struct UBO {
+			float projection_matrix[16];
+			float inv_projection_matrix[16];
+
+			float camera_matrix[16];
+			float inv_camera_matrix[16];
+
+			float viewport_size[2];
+			float screen_pixel_size[2];
+
+			float directional_penumbra_shadow_kernel[128]; //32 vec4s
+			float directional_soft_shadow_kernel[128];
+			float penumbra_shadow_kernel[128];
+			float soft_shadow_kernel[128];
+
+			uint32_t directional_penumbra_shadow_samples;
+			uint32_t directional_soft_shadow_samples;
+			uint32_t penumbra_shadow_samples;
+			uint32_t soft_shadow_samples;
+
+			float ambient_light_color_energy[4];
+
+			float ambient_color_sky_mix;
+			uint32_t use_ambient_light;
+			uint32_t use_ambient_cubemap;
+			uint32_t use_reflection_cubemap;
+
+			float radiance_inverse_xform[12];
+
+			float shadow_atlas_pixel_size[2];
+			float directional_shadow_pixel_size[2];
+
+			uint32_t directional_light_count;
+			float dual_paraboloid_side;
+			float z_far;
+			float z_near;
+
+			uint32_t ssao_enabled;
+			float ssao_light_affect;
+			float ssao_ao_affect;
+			uint32_t roughness_limiter_enabled;
+
+			float roughness_limiter_amount;
+			float roughness_limiter_limit;
+			uint32_t roughness_limiter_pad[2]; // presumably padding that completes this group of four values - TODO confirm
+
+			float ao_color[4];
+
+			// Fog
+			uint32_t fog_enabled;
+			float fog_density;
+			float fog_height;
+			float fog_height_density;
+
+			float fog_light_color[3];
+			float fog_sun_scatter;
+
+			float fog_aerial_perspective;
+			uint32_t material_uv2_mode;
+
+			float time;
+			float reflection_multiplier;
+
+			uint32_t pancake_shadows;
+			uint32_t pad1; // pad1..pad3 bring the member total to 640 four-byte values (2560 bytes, assuming 4-byte float), a multiple of 16 bytes
+			uint32_t pad2;
+			uint32_t pad3;
+		};
+
+		UBO ubo;
+
+		LocalVector<RID> uniform_buffers; // every entry is freed through the rendering device in ~RenderForwardMobile()
+
+		// !BAS! We need to change lightmaps, we're not going to do this with a buffer but pushing the used lightmap in
+		LightmapData lightmaps[MAX_LIGHTMAPS]; // these three arrays are sized by the compile-time MAX_LIGHTMAPS, not by max_lightmaps
+		RID lightmap_ids[MAX_LIGHTMAPS];
+		bool lightmap_has_sh[MAX_LIGHTMAPS];
+		uint32_t lightmaps_used = 0;
+		uint32_t max_lightmaps; // no default initializer; runtime limit assigned during renderer initialization
+		RID lightmap_buffer; // storage buffer of sizeof(LightmapData) * max_lightmaps bytes
+
+		LightmapCaptureData *lightmap_captures; // array of max_lightmap_captures entries; allocated with memnew_arr, released with memdelete_arr
+		uint32_t max_lightmap_captures; // no default initializer; assigned during renderer initialization
+		RID lightmap_capture_buffer; // storage buffer of sizeof(LightmapCaptureData) * max_lightmap_captures bytes
+
+		bool used_screen_texture = false;
+		bool used_normal_texture = false;
+		bool used_depth_texture = false;
+		bool used_sss = false;
+
+		struct ShadowPass { // One recorded shadow pass; none of its members has a default initializer.
+			uint32_t element_from;
+			uint32_t element_count;
+			bool flip_cull;
+			PassMode pass_mode;
+
+			RID rp_uniform_set;
+			Plane camera_plane;
+			float lod_distance_multiplier;
+			float screen_lod_threshold;
+
+			RID framebuffer;
+			RD::InitialAction initial_depth_action;
+			RD::FinalAction final_depth_action;
+			Rect2i rect;
+		};
+
+		LocalVector<ShadowPass> shadow_passes;
+	} scene_state;
+
+	/* Render List */
+
+	// !BAS! Render list can probably be reused between clustered and mobile?
+	struct RenderList { // List of surface caches to draw, plus the sort orders that can be applied to it.
+		LocalVector<GeometryInstanceSurfaceDataCache *> elements;
+		LocalVector<RenderElementInfo> element_info;
+
+		void clear() { // Empties both vectors.
+			elements.clear();
+			element_info.clear();
+		}
+
+		//should eventually be replaced by radix
+
+		struct SortByKey { // Orders by sort_key2 first; sort_key1 only breaks ties.
+			_FORCE_INLINE_ bool operator()(const GeometryInstanceSurfaceDataCache *A, const GeometryInstanceSurfaceDataCache *B) const {
+				return (A->sort.sort_key2 == B->sort.sort_key2) ? (A->sort.sort_key1 < B->sort.sort_key1) : (A->sort.sort_key2 < B->sort.sort_key2);
+			}
+		};
+
+		void sort_by_key() { // Sorts the whole elements vector with SortByKey.
+			SortArray<GeometryInstanceSurfaceDataCache *, SortByKey> sorter;
+			sorter.sort(elements.ptr(), elements.size());
+		}
+
+		void sort_by_key_range(uint32_t p_from, uint32_t p_size) { // Sorts only p_size elements starting at index p_from; no bounds check is done here.
+			SortArray<GeometryInstanceSurfaceDataCache *, SortByKey> sorter;
+			sorter.sort(elements.ptr() + p_from, p_size);
+		}
+
+		struct SortByDepth { // Ascending owner depth.
+			_FORCE_INLINE_ bool operator()(const GeometryInstanceSurfaceDataCache *A, const GeometryInstanceSurfaceDataCache *B) const {
+				return (A->owner->depth < B->owner->depth);
+			}
+		};
+
+		void sort_by_depth() { //used for shadows
+
+			SortArray<GeometryInstanceSurfaceDataCache *, SortByDepth> sorter;
+			sorter.sort(elements.ptr(), elements.size());
+		}
+
+		struct SortByReverseDepthAndPriority { // Ascending priority; within equal priority, descending owner depth.
+			_FORCE_INLINE_ bool operator()(const GeometryInstanceSurfaceDataCache *A, const GeometryInstanceSurfaceDataCache *B) const {
+				return (A->sort.priority == B->sort.priority) ? (A->owner->depth > B->owner->depth) : (A->sort.priority < B->sort.priority);
+			}
+		};
+
+		void sort_by_reverse_depth_and_priority(bool p_alpha) { //used for alpha; note that p_alpha is not read by the body
+
+			SortArray<GeometryInstanceSurfaceDataCache *, SortByReverseDepthAndPriority> sorter;
+			sorter.sort(elements.ptr(), elements.size());
+		}
+
+		_FORCE_INLINE_ void add_element(GeometryInstanceSurfaceDataCache *p_element) { // Appends to elements only; element_info is not touched here.
+			elements.push_back(p_element);
+		}
+	};
+
+	struct RenderElementInfo { // Per-element info packed into bit-fields that total 32 bits (22 + 1 + 8 + 1).
+		uint32_t repeat : 22;
+		uint32_t uses_lightmap : 1;
+		uint32_t lod_index : 8; // 8 bits, so values 0-255
+		uint32_t reserved : 1; // was uses_forward_gi but we don't use that here
+	};
+
+	template <PassMode p_pass_mode>
+	_FORCE_INLINE_ void _render_list_template(RenderingDevice::DrawListID p_draw_list, RenderingDevice::FramebufferFormatID p_framebuffer_Format, RenderListParameters *p_params, uint32_t p_from_element, uint32_t p_to_element);
+
+	void _render_list(RenderingDevice::DrawListID p_draw_list, RenderingDevice::FramebufferFormatID p_framebuffer_Format, RenderListParameters *p_params, uint32_t p_from_element, uint32_t p_to_element);
+
+	LocalVector<RD::DrawListID> thread_draw_lists;
+	void _render_list_thread_function(uint32_t p_thread, RenderListParameters *p_params);
+	void _render_list_with_threads(RenderListParameters *p_params, RID p_framebuffer, RD::InitialAction p_initial_color_action, RD::FinalAction p_final_color_action, RD::InitialAction p_initial_depth_action, RD::FinalAction p_final_depth_action, const Vector<Color> &p_clear_color_values = Vector<Color>(), float p_clear_depth = 1.0, uint32_t p_clear_stencil = 0, const Rect2 &p_region = Rect2(), const Vector<RID> &p_storage_textures = Vector<RID>());
+
+	uint32_t render_list_thread_threshold = 500;
+
+	RenderList render_list[RENDER_LIST_MAX];
+
+	/* Geometry instance */
+
+	// check which ones of these apply, probably all except GI and SDFGI
+	enum { // Instance data flag bits; the lowest bit defined here is bit 6.
+		INSTANCE_DATA_FLAG_USE_GI_BUFFERS = 1 << 6,
+		INSTANCE_DATA_FLAG_USE_SDFGI = 1 << 7,
+		INSTANCE_DATA_FLAG_USE_LIGHTMAP_CAPTURE = 1 << 8,
+		INSTANCE_DATA_FLAG_USE_LIGHTMAP = 1 << 9,
+		INSTANCE_DATA_FLAG_USE_SH_LIGHTMAP = 1 << 10,
+		INSTANCE_DATA_FLAG_USE_GIPROBE = 1 << 11,
+		INSTANCE_DATA_FLAG_MULTIMESH = 1 << 12,
+		INSTANCE_DATA_FLAG_MULTIMESH_FORMAT_2D = 1 << 13,
+		INSTANCE_DATA_FLAG_MULTIMESH_HAS_COLOR = 1 << 14,
+		INSTANCE_DATA_FLAG_MULTIMESH_HAS_CUSTOM_DATA = 1 << 15,
+		INSTANCE_DATA_FLAGS_PARTICLE_TRAIL_SHIFT = 16, // a shift amount, not a flag bit
+		INSTANCE_DATA_FLAGS_PARTICLE_TRAIL_MASK = 0xFF, // an 8-bit mask; shifted by 16 it would cover bits 16-23 - presumably (value & MASK) << SHIFT, TODO confirm
+		INSTANCE_DATA_FLAGS_NON_UNIFORM_SCALE = 1 << 24, // first bit above the 16-23 range
+	};
+
+	struct GeometryInstanceLightmapSH { // Allocated from the geometry_instance_lightmap_sh paged allocator; pointed to by GeometryInstanceForwardMobile::lightmap_sh.
+		Color sh[9]; // presumably filled from the p_sh9 argument of geometry_instance_set_lightmap_capture() - TODO confirm
+	};
+
+	// Cached data for drawing surfaces
+	struct GeometryInstanceSurfaceDataCache { // Element type stored (by pointer) in RenderList::elements.
+		enum { // Each value is a distinct power of two - presumably combined in the flags member below; TODO confirm
+			FLAG_PASS_DEPTH = 1,
+			FLAG_PASS_OPAQUE = 2,
+			FLAG_PASS_ALPHA = 4,
+			FLAG_PASS_SHADOW = 8,
+			FLAG_USES_SHARED_SHADOW_MATERIAL = 128,
+			FLAG_USES_SUBSURFACE_SCATTERING = 2048,
+			FLAG_USES_SCREEN_TEXTURE = 4096,
+			FLAG_USES_DEPTH_TEXTURE = 8192,
+			FLAG_USES_NORMAL_TEXTURE = 16384,
+			FLAG_USES_DOUBLE_SIDED_SHADOWS = 32768,
+			FLAG_USES_PARTICLE_TRAILS = 65536,
+		};
+
+		union { // Overlays the named bit-fields with two 64-bit keys that RenderList::SortByKey compares as whole values.
+			struct {
+				// !BAS! CHECK BITS!!! (review: the first group below uses 10 + 32 + 16 = 58 bits, the second 16 + 32 + 4 + 4 + 8 = 64 bits)
+
+				uint64_t surface_index : 10;
+				uint64_t geometry_id : 32;
+				uint64_t material_id_low : 16;
+
+				uint64_t material_id_hi : 16;
+				uint64_t shader_id : 32;
+				uint64_t uses_lightmap : 4; // sort by lightmap id here, not whether it's yes/no (is 4 bits enough?)
+				uint64_t depth_layer : 4;
+				uint64_t priority : 8; // also read directly by RenderList::SortByReverseDepthAndPriority
+
+				// uint64_t lod_index : 8; // no need to sort on LOD
+				// uint64_t uses_forward_gi : 1; // no GI here, remove
+			};
+			struct { // NOTE(review): which fields land in which key depends on the compiler's bit-field layout, which is implementation-defined.
+				uint64_t sort_key1;
+				uint64_t sort_key2;
+			};
+		} sort;
+
+		RS::PrimitiveType primitive = RS::PRIMITIVE_MAX;
+		uint32_t flags = 0;
+		uint32_t surface_index = 0; // separate from the 10-bit sort.surface_index bit-field above
+		uint32_t lod_index = 0;
+
+		void *surface = nullptr;
+		RID material_uniform_set;
+		SceneShaderForwardMobile::ShaderData *shader = nullptr;
+
+		void *surface_shadow = nullptr;
+		RID material_uniform_set_shadow;
+		SceneShaderForwardMobile::ShaderData *shader_shadow = nullptr;
+
+		GeometryInstanceSurfaceDataCache *next = nullptr; // presumably chains the caches of one instance (cf. surface_caches in GeometryInstanceForwardMobile) - TODO confirm
+		GeometryInstanceForwardMobile *owner = nullptr; // dereferenced by the depth-based comparators in RenderList (owner->depth)
+	};
+
+	// !BAS! GeometryInstanceForwardClustered and GeometryInstanceForwardMobile will likely have a lot of overlap
+	// may need to think about making this its own class like GeometryInstanceRD?
+
+	struct GeometryInstanceForwardMobile : public GeometryInstance { // Per-instance data of this renderer; allocated from geometry_instance_alloc.
+		// setup
+		uint32_t base_flags = 0;
+		uint32_t flags_cache = 0;
+
+		// this structure maps to our push constant in our shader and is populated right before our draw call
+		struct PushConstant {
+			float transform[16];
+			uint32_t flags;
+			uint32_t instance_uniforms_ofs; //base offset in global buffer for instance variables
+			uint32_t gi_offset; //GI information when using lightmapping (VCT or lightmap index)
+			uint32_t layer_mask = 1;
+			float lightmap_uv_scale[4]; // doubles as uv_offset when needed
+			uint32_t reflection_probes[2]; // packed reflection probes
+			uint32_t omni_lights[2]; // packed omni lights
+			uint32_t spot_lights[2]; // packed spot lights
+			uint32_t decals[2]; // packed decals
+		};
+
+		// PushConstant push_constant; // we populate this from our instance data
+
+		//used during rendering
+		uint32_t layer_mask = 1;
+		RID transforms_uniform_set;
+		float depth = 0; // compared by RenderList::SortByDepth and SortByReverseDepthAndPriority
+		bool mirror = false;
+		Transform transform;
+		bool store_transform_cache = true; // if true we copy our transform into our PushConstant, if false we use our transforms UBO and clear our PushConstants transform
+		bool non_uniform_scale = false;
+		AABB transformed_aabb; //needed for LOD
+		float lod_bias = 0.0;
+		float lod_model_scale = 1.0;
+		int32_t shader_parameters_offset = -1;
+		uint32_t instance_count = 0;
+		uint32_t trail_steps = 1;
+		RID mesh_instance;
+
+		// lightmap
+		uint32_t gi_offset_cache = 0; // !BAS! Should rename this to lightmap_offset_cache, in forward clustered this was shared between gi and lightmap
+		uint32_t lightmap_slice_index = 0; // zero-initialized so it never holds an indeterminate value before a lightmap is assigned
+		Rect2 lightmap_uv_scale;
+		RID lightmap_instance;
+		GeometryInstanceLightmapSH *lightmap_sh = nullptr;
+
+		// culled light info (each array holds up to MAX_RDL_CULL entries; counts start at 0 so an instance that was never paired reads as empty)
+		uint32_t reflection_probe_count = 0;
+		RID reflection_probes[MAX_RDL_CULL];
+		uint32_t omni_light_count = 0;
+		RID omni_lights[MAX_RDL_CULL];
+		uint32_t spot_light_count = 0;
+		RID spot_lights[MAX_RDL_CULL];
+		uint32_t decals_count = 0;
+		RID decals[MAX_RDL_CULL];
+
+		GeometryInstanceSurfaceDataCache *surface_caches = nullptr;
+
+		// do we use this?
+		SelfList<GeometryInstanceForwardMobile> dirty_list_element; // bound to this instance by the constructor below
+
+		struct Data {
+			//data used less often goes into regular heap
+			RID base;
+			RS::InstanceType base_type; // NOTE(review): no default initializer - confirm it is always assigned when Data is created
+
+			RID skeleton;
+			Vector<RID> surface_materials;
+			RID material_override;
+			AABB aabb;
+
+			bool use_baked_light = false;
+			bool cast_double_sided_shadows = false;
+			// bool mirror = false; // !BAS! Does not seem used, we already have this in the main struct
+
+			bool dirty_dependencies = false;
+
+			RendererStorage::DependencyTracker dependency_tracker;
+		};
+
+		Data *data = nullptr;
+
+		GeometryInstanceForwardMobile() :
+				dirty_list_element(this) {}
+	};
+
+public:
+	static void _geometry_instance_dependency_changed(RendererStorage::DependencyChangedNotification p_notification, RendererStorage::DependencyTracker *p_tracker);
+	static void _geometry_instance_dependency_deleted(const RID &p_dependency, RendererStorage::DependencyTracker *p_tracker);
+
+	SelfList<GeometryInstanceForwardMobile>::List geometry_instance_dirty_list;
+
+	PagedAllocator<GeometryInstanceForwardMobile> geometry_instance_alloc;
+	PagedAllocator<GeometryInstanceSurfaceDataCache> geometry_instance_surface_alloc;
+	PagedAllocator<GeometryInstanceLightmapSH> geometry_instance_lightmap_sh;
+
+	void _geometry_instance_add_surface_with_material(GeometryInstanceForwardMobile *ginstance, uint32_t p_surface, SceneShaderForwardMobile::MaterialData *p_material, uint32_t p_material_id, uint32_t p_shader_id, RID p_mesh);
+	void _geometry_instance_add_surface(GeometryInstanceForwardMobile *ginstance, uint32_t p_surface, RID p_material, RID p_mesh);
+	void _geometry_instance_mark_dirty(GeometryInstance *p_geometry_instance);
+	void _geometry_instance_update(GeometryInstance *p_geometry_instance);
+	void _update_dirty_geometry_instances();
+
+	virtual GeometryInstance *geometry_instance_create(RID p_base);
+	virtual void geometry_instance_set_skeleton(GeometryInstance *p_geometry_instance, RID p_skeleton);
+	virtual void geometry_instance_set_material_override(GeometryInstance *p_geometry_instance, RID p_override);
+	virtual void geometry_instance_set_surface_materials(GeometryInstance *p_geometry_instance, const Vector<RID> &p_materials);
+	virtual void geometry_instance_set_mesh_instance(GeometryInstance *p_geometry_instance, RID p_mesh_instance);
+	virtual void geometry_instance_set_transform(GeometryInstance *p_geometry_instance, const Transform &p_transform, const AABB &p_aabb, const AABB &p_transformed_aabb);
+	virtual void geometry_instance_set_layer_mask(GeometryInstance *p_geometry_instance, uint32_t p_layer_mask);
+	virtual void geometry_instance_set_lod_bias(GeometryInstance *p_geometry_instance, float p_lod_bias);
+	virtual void geometry_instance_set_use_baked_light(GeometryInstance *p_geometry_instance, bool p_enable);
+	virtual void geometry_instance_set_use_dynamic_gi(GeometryInstance *p_geometry_instance, bool p_enable);
+	virtual void geometry_instance_set_use_lightmap(GeometryInstance *p_geometry_instance, RID p_lightmap_instance, const Rect2 &p_lightmap_uv_scale, int p_lightmap_slice_index);
+	virtual void geometry_instance_set_lightmap_capture(GeometryInstance *p_geometry_instance, const Color *p_sh9);
+	virtual void geometry_instance_set_instance_shader_parameters_offset(GeometryInstance *p_geometry_instance, int32_t p_offset);
+	virtual void geometry_instance_set_cast_double_sided_shadows(GeometryInstance *p_geometry_instance, bool p_enable);
+
+	virtual Transform geometry_instance_get_transform(GeometryInstance *p_instance);
+	virtual AABB geometry_instance_get_aabb(GeometryInstance *p_instance);
+
+	virtual void geometry_instance_free(GeometryInstance *p_geometry_instance);
+
+	virtual uint32_t geometry_instance_get_pair_mask();
+	virtual void geometry_instance_pair_light_instances(GeometryInstance *p_geometry_instance, const RID *p_light_instances, uint32_t p_light_instance_count);
+	virtual void geometry_instance_pair_reflection_probe_instances(GeometryInstance *p_geometry_instance, const RID *p_reflection_probe_instances, uint32_t p_reflection_probe_instance_count);
+	virtual void geometry_instance_pair_decal_instances(GeometryInstance *p_geometry_instance, const RID *p_decal_instances, uint32_t p_decal_instance_count);
+	virtual void geometry_instance_pair_gi_probe_instances(GeometryInstance *p_geometry_instance, const RID *p_gi_probe_instances, uint32_t p_gi_probe_instance_count);
+
+	virtual bool free(RID p_rid);
+
+	virtual bool is_dynamic_gi_supported() const;
+	virtual bool is_clustered_enabled() const;
+	virtual bool is_volumetric_supported() const;
+	virtual uint32_t get_max_elements() const;
+
+	RenderForwardMobile(RendererStorageRD *p_storage);
+	~RenderForwardMobile();
+};
+} // namespace RendererSceneRenderImplementation
+#endif // !RENDERING_SERVER_SCENE_RENDER_FORWARD_MOBILE_H
diff --git a/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.cpp b/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.cpp
new file mode 100644
index 0000000000..b9220cc514
--- /dev/null
+++ b/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.cpp
@@ -0,0 +1,833 @@
+/*************************************************************************/
+/*  scene_shader_forward_mobile.cpp                                      */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "scene_shader_forward_mobile.h"
+#include "core/config/project_settings.h"
+#include "render_forward_mobile.h"
+
+using namespace RendererSceneRenderImplementation;
+
+/* ShaderData */
+
+void SceneShaderForwardMobile::ShaderData::set_code(const String &p_code) {
+	// Compiles p_code and rebuilds every pipeline variant; `valid` is only set once all of that has succeeded.
+
+	code = p_code;
+	valid = false;
+	ubo_size = 0;
+	uniforms.clear();
+	uses_screen_texture = false;
+
+	if (code == String()) {
+		return; //just invalid, but no error
+	}
+
+	ShaderCompilerRD::GeneratedCode gen_code;
+
+	// Render mode defaults; compile() overwrites them through the pointers registered in `actions` below.
+	int blend_mode = BLEND_MODE_MIX;
+	int depth_testi = DEPTH_TEST_ENABLED;
+	int alpha_antialiasing_mode = ALPHA_ANTIALIASING_OFF;
+	int cull = CULL_BACK;
+
+	uses_point_size = false;
+	uses_alpha = false;
+	uses_blend_alpha = false;
+	uses_depth_pre_pass = false;
+	uses_discard = false;
+	uses_roughness = false;
+	uses_normal = false;
+	bool wireframe = false;
+
+	unshaded = false;
+	uses_vertex = false;
+	uses_sss = false;
+	uses_transmittance = false;
+	uses_depth_texture = false;
+	uses_normal_texture = false;
+	uses_time = false;
+	writes_modelview_or_projection = false;
+	uses_world_coordinates = false;
+	uses_particle_trails = false;
+
+	int depth_drawi = DEPTH_DRAW_OPAQUE;
+
+	ShaderCompilerRD::IdentifierActions actions;
+	actions.entry_point_stages["vertex"] = ShaderCompilerRD::STAGE_VERTEX;
+	actions.entry_point_stages["fragment"] = ShaderCompilerRD::STAGE_FRAGMENT;
+	actions.entry_point_stages["light"] = ShaderCompilerRD::STAGE_FRAGMENT;
+
+	actions.render_mode_values["blend_add"] = Pair<int *, int>(&blend_mode, BLEND_MODE_ADD);
+	actions.render_mode_values["blend_mix"] = Pair<int *, int>(&blend_mode, BLEND_MODE_MIX);
+	actions.render_mode_values["blend_sub"] = Pair<int *, int>(&blend_mode, BLEND_MODE_SUB);
+	actions.render_mode_values["blend_mul"] = Pair<int *, int>(&blend_mode, BLEND_MODE_MUL);
+
+	actions.render_mode_values["alpha_to_coverage"] = Pair<int *, int>(&alpha_antialiasing_mode, ALPHA_ANTIALIASING_ALPHA_TO_COVERAGE);
+	actions.render_mode_values["alpha_to_coverage_and_one"] = Pair<int *, int>(&alpha_antialiasing_mode, ALPHA_ANTIALIASING_ALPHA_TO_COVERAGE_AND_TO_ONE);
+
+	actions.render_mode_values["depth_draw_never"] = Pair<int *, int>(&depth_drawi, DEPTH_DRAW_DISABLED);
+	actions.render_mode_values["depth_draw_opaque"] = Pair<int *, int>(&depth_drawi, DEPTH_DRAW_OPAQUE);
+	actions.render_mode_values["depth_draw_always"] = Pair<int *, int>(&depth_drawi, DEPTH_DRAW_ALWAYS);
+
+	actions.render_mode_values["depth_test_disabled"] = Pair<int *, int>(&depth_testi, DEPTH_TEST_DISABLED);
+
+	actions.render_mode_values["cull_disabled"] = Pair<int *, int>(&cull, CULL_DISABLED);
+	actions.render_mode_values["cull_front"] = Pair<int *, int>(&cull, CULL_FRONT);
+	actions.render_mode_values["cull_back"] = Pair<int *, int>(&cull, CULL_BACK);
+
+	actions.render_mode_flags["unshaded"] = &unshaded;
+	actions.render_mode_flags["wireframe"] = &wireframe;
+	actions.render_mode_flags["particle_trails"] = &uses_particle_trails;
+
+	actions.usage_flag_pointers["ALPHA"] = &uses_alpha;
+	actions.render_mode_flags["depth_prepass_alpha"] = &uses_depth_pre_pass;
+
+	// actions.usage_flag_pointers["SSS_STRENGTH"] = &uses_sss;
+	// actions.usage_flag_pointers["SSS_TRANSMITTANCE_DEPTH"] = &uses_transmittance;
+
+	actions.usage_flag_pointers["SCREEN_TEXTURE"] = &uses_screen_texture;
+	actions.usage_flag_pointers["DEPTH_TEXTURE"] = &uses_depth_texture;
+	actions.usage_flag_pointers["NORMAL_TEXTURE"] = &uses_normal_texture;
+	actions.usage_flag_pointers["DISCARD"] = &uses_discard;
+	actions.usage_flag_pointers["TIME"] = &uses_time;
+	actions.usage_flag_pointers["ROUGHNESS"] = &uses_roughness;
+	actions.usage_flag_pointers["NORMAL"] = &uses_normal;
+	actions.usage_flag_pointers["NORMAL_MAP"] = &uses_normal;
+
+	actions.usage_flag_pointers["POINT_SIZE"] = &uses_point_size;
+	actions.usage_flag_pointers["POINT_COORD"] = &uses_point_size;
+
+	actions.write_flag_pointers["MODELVIEW_MATRIX"] = &writes_modelview_or_projection;
+	actions.write_flag_pointers["PROJECTION_MATRIX"] = &writes_modelview_or_projection;
+	actions.write_flag_pointers["VERTEX"] = &uses_vertex;
+
+	actions.uniforms = &uniforms;
+
+	SceneShaderForwardMobile *shader_singleton = (SceneShaderForwardMobile *)SceneShaderForwardMobile::singleton;
+
+	Error err = shader_singleton->compiler.compile(RS::SHADER_SPATIAL, code, &actions, path, gen_code);
+
+	ERR_FAIL_COND(err != OK);
+
+	if (version.is_null()) {
+		version = shader_singleton->shader.version_create();
+	}
+
+	depth_draw = DepthDraw(depth_drawi);
+	depth_test = DepthTest(depth_testi);
+
+#if 0
+	print_line("**compiling shader:");
+	print_line("**defines:\n");
+	for (int i = 0; i < gen_code.defines.size(); i++) {
+		print_line(gen_code.defines[i]);
+	}
+
+	Map<String, String>::Element * el = gen_code.code.front();
+	while (el) {
+		print_line("\n**code " + el->key() + ":\n" + el->value());
+
+		el = el->next();
+	}
+
+	print_line("\n**uniforms:\n" + gen_code.uniforms);
+	print_line("\n**vertex_globals:\n" + gen_code.stage_globals[ShaderCompilerRD::STAGE_VERTEX]);
+	print_line("\n**fragment_globals:\n" + gen_code.stage_globals[ShaderCompilerRD::STAGE_FRAGMENT]);
+#endif
+
+	shader_singleton->shader.version_set_code(version, gen_code.code, gen_code.uniforms, gen_code.stage_globals[ShaderCompilerRD::STAGE_VERTEX], gen_code.stage_globals[ShaderCompilerRD::STAGE_FRAGMENT], gen_code.defines);
+	ERR_FAIL_COND(!shader_singleton->shader.version_is_valid(version));
+
+	ubo_size = gen_code.uniform_total_size;
+	ubo_offsets = gen_code.uniform_offsets;
+	texture_uniforms = gen_code.texture_uniforms;
+
+	//blend modes
+
+	// if any form of Alpha Antialiasing is enabled, set the blend mode to alpha to coverage
+	if (alpha_antialiasing_mode != ALPHA_ANTIALIASING_OFF) {
+		blend_mode = BLEND_MODE_ALPHA_TO_COVERAGE;
+	}
+
+	RD::PipelineColorBlendState::Attachment blend_attachment;
+
+	switch (blend_mode) {
+		case BLEND_MODE_MIX: {
+			blend_attachment.enable_blend = true;
+			blend_attachment.alpha_blend_op = RD::BLEND_OP_ADD;
+			blend_attachment.color_blend_op = RD::BLEND_OP_ADD;
+			blend_attachment.src_color_blend_factor = RD::BLEND_FACTOR_SRC_ALPHA;
+			blend_attachment.dst_color_blend_factor = RD::BLEND_FACTOR_ONE_MINUS_SRC_ALPHA;
+			blend_attachment.src_alpha_blend_factor = RD::BLEND_FACTOR_ONE;
+			blend_attachment.dst_alpha_blend_factor = RD::BLEND_FACTOR_ONE_MINUS_SRC_ALPHA;
+
+		} break;
+		case BLEND_MODE_ADD: {
+			blend_attachment.enable_blend = true;
+			blend_attachment.alpha_blend_op = RD::BLEND_OP_ADD;
+			blend_attachment.color_blend_op = RD::BLEND_OP_ADD;
+			blend_attachment.src_color_blend_factor = RD::BLEND_FACTOR_SRC_ALPHA;
+			blend_attachment.dst_color_blend_factor = RD::BLEND_FACTOR_ONE;
+			blend_attachment.src_alpha_blend_factor = RD::BLEND_FACTOR_SRC_ALPHA;
+			blend_attachment.dst_alpha_blend_factor = RD::BLEND_FACTOR_ONE;
+			uses_blend_alpha = true; //force alpha used because of blend
+
+		} break;
+		case BLEND_MODE_SUB: {
+			blend_attachment.enable_blend = true;
+			blend_attachment.alpha_blend_op = RD::BLEND_OP_SUBTRACT;
+			blend_attachment.color_blend_op = RD::BLEND_OP_SUBTRACT;
+			blend_attachment.src_color_blend_factor = RD::BLEND_FACTOR_SRC_ALPHA;
+			blend_attachment.dst_color_blend_factor = RD::BLEND_FACTOR_ONE;
+			blend_attachment.src_alpha_blend_factor = RD::BLEND_FACTOR_SRC_ALPHA;
+			blend_attachment.dst_alpha_blend_factor = RD::BLEND_FACTOR_ONE;
+			uses_blend_alpha = true; //force alpha used because of blend
+
+		} break;
+		case BLEND_MODE_MUL: {
+			blend_attachment.enable_blend = true;
+			blend_attachment.alpha_blend_op = RD::BLEND_OP_ADD;
+			blend_attachment.color_blend_op = RD::BLEND_OP_ADD;
+			blend_attachment.src_color_blend_factor = RD::BLEND_FACTOR_DST_COLOR;
+			blend_attachment.dst_color_blend_factor = RD::BLEND_FACTOR_ZERO;
+			blend_attachment.src_alpha_blend_factor = RD::BLEND_FACTOR_DST_ALPHA;
+			blend_attachment.dst_alpha_blend_factor = RD::BLEND_FACTOR_ZERO;
+			uses_blend_alpha = true; //force alpha used because of blend
+		} break;
+		case BLEND_MODE_ALPHA_TO_COVERAGE: {
+			blend_attachment.enable_blend = true;
+			blend_attachment.alpha_blend_op = RD::BLEND_OP_ADD;
+			blend_attachment.color_blend_op = RD::BLEND_OP_ADD;
+			blend_attachment.src_color_blend_factor = RD::BLEND_FACTOR_SRC_ALPHA;
+			blend_attachment.dst_color_blend_factor = RD::BLEND_FACTOR_ONE_MINUS_SRC_ALPHA;
+			blend_attachment.src_alpha_blend_factor = RD::BLEND_FACTOR_ONE;
+			blend_attachment.dst_alpha_blend_factor = RD::BLEND_FACTOR_ZERO;
+		}
+	}
+
+	RD::PipelineColorBlendState blend_state_blend;
+	blend_state_blend.attachments.push_back(blend_attachment);
+	RD::PipelineColorBlendState blend_state_opaque = RD::PipelineColorBlendState::create_disabled(1);
+	// Blend states needed only by the commented-out variant handling further down are deliberately not created.
+
+	//update pipelines
+
+	RD::PipelineDepthStencilState depth_stencil_state;
+
+	if (depth_test != DEPTH_TEST_DISABLED) {
+		depth_stencil_state.enable_depth_test = true;
+		depth_stencil_state.depth_compare_operator = RD::COMPARE_OP_LESS_OR_EQUAL;
+		depth_stencil_state.enable_depth_write = depth_draw != DEPTH_DRAW_DISABLED;
+	}
+
+	// Loop-invariant lookup tables: row = cull variant being built, column = the shader's `cull` render mode value.
+	static const RD::PolygonCullMode cull_mode_rd_table[CULL_VARIANT_MAX][3] = {
+		{ RD::POLYGON_CULL_DISABLED, RD::POLYGON_CULL_FRONT, RD::POLYGON_CULL_BACK },
+		{ RD::POLYGON_CULL_DISABLED, RD::POLYGON_CULL_BACK, RD::POLYGON_CULL_FRONT }, // front and back swapped
+		{ RD::POLYGON_CULL_DISABLED, RD::POLYGON_CULL_DISABLED, RD::POLYGON_CULL_DISABLED } // never culls
+	};
+
+	static const RD::RenderPrimitive primitive_rd_table[RS::PRIMITIVE_MAX] = {
+		RD::RENDER_PRIMITIVE_POINTS,
+		RD::RENDER_PRIMITIVE_LINES,
+		RD::RENDER_PRIMITIVE_LINESTRIPS,
+		RD::RENDER_PRIMITIVE_TRIANGLES,
+		RD::RENDER_PRIMITIVE_TRIANGLE_STRIPS,
+	};
+
+	for (int i = 0; i < CULL_VARIANT_MAX; i++) {
+		RD::PolygonCullMode cull_mode_rd = cull_mode_rd_table[i][cull];
+
+		for (int j = 0; j < RS::PRIMITIVE_MAX; j++) {
+			RD::RenderPrimitive primitive_rd = uses_point_size ? RD::RENDER_PRIMITIVE_POINTS : primitive_rd_table[j];
+
+			for (int k = 0; k < SHADER_VERSION_MAX; k++) {
+				if (!shader_singleton->shader.is_variant_enabled(k)) {
+					continue;
+				}
+				RD::PipelineRasterizationState raster_state;
+				raster_state.cull_mode = cull_mode_rd;
+				raster_state.wireframe = wireframe;
+
+				RD::PipelineColorBlendState blend_state;
+				RD::PipelineDepthStencilState depth_stencil = depth_stencil_state;
+				RD::PipelineMultisampleState multisample_state;
+
+				if (uses_alpha || uses_blend_alpha) {
+					// only allow these flags to go through if we have some form of msaa
+					if (alpha_antialiasing_mode == ALPHA_ANTIALIASING_ALPHA_TO_COVERAGE) {
+						multisample_state.enable_alpha_to_coverage = true;
+					} else if (alpha_antialiasing_mode == ALPHA_ANTIALIASING_ALPHA_TO_COVERAGE_AND_TO_ONE) {
+						multisample_state.enable_alpha_to_coverage = true;
+						multisample_state.enable_alpha_to_one = true;
+					}
+
+					if (k == SHADER_VERSION_COLOR_PASS || k == SHADER_VERSION_LIGHTMAP_COLOR_PASS) {
+						blend_state = blend_state_blend;
+						if (depth_draw == DEPTH_DRAW_OPAQUE) {
+							depth_stencil.enable_depth_write = false; //alpha does not draw depth
+						}
+					} else if (k == SHADER_VERSION_SHADOW_PASS || k == SHADER_VERSION_DEPTH_PASS_DP) {
+						//none, blend state contains nothing
+					} else if (k == SHADER_VERSION_DEPTH_PASS_WITH_MATERIAL) {
+						blend_state = RD::PipelineColorBlendState::create_disabled(5); //writes to normal and roughness in opaque way
+					} else {
+						pipelines[i][j][k].clear();
+						continue; // do not use this version (will error if using it is attempted)
+					}
+
+					/*
+					if (k == SHADER_VERSION_COLOR_PASS || k == SHADER_VERSION_COLOR_PASS_WITH_FORWARD_GI || k == SHADER_VERSION_LIGHTMAP_COLOR_PASS) {
+						blend_state = blend_state_blend;
+						if (depth_draw == DEPTH_DRAW_OPAQUE) {
+							depth_stencil.enable_depth_write = false; //alpha does not draw depth
+						}
+					} else if (uses_depth_pre_pass && (k == SHADER_VERSION_DEPTH_PASS || k == SHADER_VERSION_DEPTH_PASS_DP || k == SHADER_VERSION_DEPTH_PASS_WITH_NORMAL_AND_ROUGHNESS || k == SHADER_VERSION_DEPTH_PASS_WITH_MATERIAL)) {
+						if (k == SHADER_VERSION_DEPTH_PASS || k == SHADER_VERSION_DEPTH_PASS_DP) {
+							//none, blend state contains nothing
+						} else if (k == SHADER_VERSION_DEPTH_PASS_WITH_MATERIAL) {
+							blend_state = RD::PipelineColorBlendState::create_disabled(5); //writes to normal and roughness in opaque way
+						} else {
+							blend_state = blend_state_opaque; //writes to normal and roughness in opaque way
+						}
+					} else {
+						pipelines[i][j][k].clear();
+						continue; // do not use this version (will error if using it is attempted)
+					}
+					*/
+				} else {
+					if (k == SHADER_VERSION_COLOR_PASS || k == SHADER_VERSION_LIGHTMAP_COLOR_PASS) {
+						blend_state = blend_state_opaque;
+					} else if (k == SHADER_VERSION_SHADOW_PASS || k == SHADER_VERSION_DEPTH_PASS_DP) {
+						//none, leave empty
+					} else if (k == SHADER_VERSION_DEPTH_PASS_WITH_MATERIAL) {
+						blend_state = RD::PipelineColorBlendState::create_disabled(5); //writes to normal and roughness in opaque way
+					} else {
+						// ???
+					}
+
+					/*
+					Reference only: restoring this also requires re-creating the blend states it names.
+					if (k == SHADER_VERSION_COLOR_PASS || k == SHADER_VERSION_COLOR_PASS_WITH_FORWARD_GI || k == SHADER_VERSION_LIGHTMAP_COLOR_PASS) {
+						blend_state = blend_state_opaque;
+					} else if (k == SHADER_VERSION_DEPTH_PASS || k == SHADER_VERSION_DEPTH_PASS_DP) {
+						//none, leave empty
+					} else if (k == SHADER_VERSION_DEPTH_PASS_WITH_NORMAL_AND_ROUGHNESS) {
+						blend_state = blend_state_depth_normal_roughness;
+					} else if (k == SHADER_VERSION_DEPTH_PASS_WITH_NORMAL_AND_ROUGHNESS_AND_GIPROBE) {
+						blend_state = blend_state_depth_normal_roughness_giprobe;
+					} else if (k == SHADER_VERSION_DEPTH_PASS_WITH_MATERIAL) {
+						blend_state = RD::PipelineColorBlendState::create_disabled(5); //writes to normal and roughness in opaque way
+					} else if (k == SHADER_VERSION_DEPTH_PASS_WITH_SDF) {
+						blend_state = RD::PipelineColorBlendState(); //no color targets for SDF
+					} else {
+						//specular write
+						blend_state = blend_state_opaque_specular;
+						depth_stencil.enable_depth_test = false;
+						depth_stencil.enable_depth_write = false;
+					}
+					*/
+				}
+
+				RID shader_variant = shader_singleton->shader.version_get_shader(version, k);
+				pipelines[i][j][k].setup(shader_variant, primitive_rd, raster_state, multisample_state, depth_stencil, blend_state, 0);
+			}
+		}
+	}
+
+	valid = true;
+}
+
+void SceneShaderForwardMobile::ShaderData::set_default_texture_param(const StringName &p_name, RID p_texture) {
+	// A valid RID is stored as the default texture for p_name; an invalid one removes the entry.
+	if (p_texture.is_valid()) {
+		default_texture_params[p_name] = p_texture;
+		return;
+	}
+	default_texture_params.erase(p_name);
+}
+
+void SceneShaderForwardMobile::ShaderData::get_param_list(List<PropertyInfo> *p_param_list) const {
+	// Keyed by sort position: texture uniforms use `texture_order + 100000`, all other uniforms use `order`.
+	Map<int, StringName> sorted_names;
+	for (Map<StringName, ShaderLanguage::ShaderNode::Uniform>::Element *U = uniforms.front(); U; U = U->next()) {
+		const ShaderLanguage::ShaderNode::Uniform &uniform = U->get();
+		if (uniform.scope != ShaderLanguage::ShaderNode::Uniform::SCOPE_LOCAL) {
+			continue; // only SCOPE_LOCAL uniforms are listed here
+		}
+
+		const int sort_key = uniform.texture_order >= 0 ? uniform.texture_order + 100000 : uniform.order;
+		sorted_names[sort_key] = U->key();
+	}
+
+	// Append one PropertyInfo per collected uniform, walking the map from front to back.
+	for (Map<int, StringName>::Element *N = sorted_names.front(); N; N = N->next()) {
+		const StringName &uniform_name = N->get();
+		PropertyInfo pi = ShaderLanguage::uniform_to_property_info(uniforms[uniform_name]);
+		pi.name = uniform_name;
+		p_param_list->push_back(pi);
+	}
+}
+
+void SceneShaderForwardMobile::ShaderData::get_instance_param_list(List<RendererStorage::InstanceShaderParam> *p_param_list) const {
+	// Appends one entry per SCOPE_INSTANCE uniform: property info, instance index and default value.
+	for (Map<StringName, ShaderLanguage::ShaderNode::Uniform>::Element *U = uniforms.front(); U; U = U->next()) {
+		const ShaderLanguage::ShaderNode::Uniform &uniform = U->get();
+		if (uniform.scope == ShaderLanguage::ShaderNode::Uniform::SCOPE_INSTANCE) {
+			RendererStorage::InstanceShaderParam param;
+			param.info = ShaderLanguage::uniform_to_property_info(uniform);
+			param.info.name = U->key(); // supply the uniform's name from the map key
+			param.index = uniform.instance_index;
+			param.default_value = ShaderLanguage::constant_value_to_variant(uniform.default_value, uniform.type, uniform.hint);
+			p_param_list->push_back(param);
+		}
+	}
+}
+
+bool SceneShaderForwardMobile::ShaderData::is_param_texture(const StringName &p_param) const {
+	// True only for a known uniform whose texture_order is non-negative; unknown names yield false.
+	if (uniforms.has(p_param)) {
+		return uniforms[p_param].texture_order >= 0;
+	}
+	return false;
+}
+
+bool SceneShaderForwardMobile::ShaderData::is_animated() const {
+	return false; // unconditional: this shader data never reports itself as animated
+}
+
+bool SceneShaderForwardMobile::ShaderData::casts_shadows() const {
+	return false; // unconditional, ignores the alpha/depth flags; NOTE(review): looks like a placeholder - confirm
+}
+
+Variant SceneShaderForwardMobile::ShaderData::get_default_parameter(const StringName &p_parameter) const {
+	// Unknown parameters yield an empty Variant; known ones have their default_value converted to a Variant.
+	if (!uniforms.has(p_parameter)) {
+		return Variant();
+	}
+	const ShaderLanguage::ShaderNode::Uniform &uniform = uniforms[p_parameter];
+	return ShaderLanguage::constant_value_to_variant(uniform.default_value, uniform.type, uniform.hint);
+}
+
+RS::ShaderNativeSourceCode SceneShaderForwardMobile::ShaderData::get_native_source_code() const {
+	// Forwards the query for this shader's `version` to the scene shader singleton.
+	SceneShaderForwardMobile *scene_shader = static_cast<SceneShaderForwardMobile *>(SceneShaderForwardMobile::singleton);
+	return scene_shader->shader.version_get_native_source_code(version);
+}
+
+SceneShaderForwardMobile::ShaderData::ShaderData() {
+	uses_screen_texture = false; // set_code() resets this again before compiling
+	valid = false; // only set_code() sets this to true, after a successful compile
+}
+
+SceneShaderForwardMobile::ShaderData::~ShaderData() {
+	SceneShaderForwardMobile *scene_shader = static_cast<SceneShaderForwardMobile *>(SceneShaderForwardMobile::singleton);
+	ERR_FAIL_COND(!scene_shader);
+	if (version.is_null()) {
+		return; // no shader version was created, nothing to release
+	}
+	scene_shader->shader.version_free(version); // pipeline variants will clear themselves if shader is gone
+}
+
+RendererStorageRD::ShaderData *SceneShaderForwardMobile::_create_shader_func() {
+	// Allocates a fresh ShaderData; it stays invalid until set_code() succeeds.
+	return memnew(ShaderData);
+}
+
+void SceneShaderForwardMobile::MaterialData::set_render_priority(int p_priority) {
+	priority = p_priority - RS::MATERIAL_RENDER_PRIORITY_MIN; // stored relative to the minimum priority; meant to fit in 8 bits
+}
+
+void SceneShaderForwardMobile::MaterialData::set_next_pass(RID p_pass) {
+	next_pass = p_pass; // only records the RID; nothing else is updated here
+}
+
+void SceneShaderForwardMobile::MaterialData::update_parameters(const Map<StringName, Variant> &p_parameters, bool p_uniform_dirty, bool p_textures_dirty) {
+	// Brings the uniform buffer, texture cache and uniform set in line with p_parameters and the shader's current layout.
+	SceneShaderForwardMobile *shader_singleton = (SceneShaderForwardMobile *)SceneShaderForwardMobile::singleton;
+	// Frees the current uniform set if it is still alive, then forgets its RID.
+	auto free_uniform_set = [this]() {
+		if (uniform_set.is_valid() && RD::get_singleton()->uniform_set_is_valid(uniform_set)) {
+			RD::get_singleton()->free(uniform_set);
+		}
+		uniform_set = RID();
+	};
+
+	if ((uint32_t)ubo_data.size() != shader_data->ubo_size) {
+		p_uniform_dirty = true;
+		if (uniform_buffer.is_valid()) {
+			RD::get_singleton()->free(uniform_buffer);
+			uniform_buffer = RID();
+		}
+
+		ubo_data.resize(shader_data->ubo_size);
+		if (ubo_data.size()) {
+			uniform_buffer = RD::get_singleton()->uniform_buffer_create(ubo_data.size());
+			memset(ubo_data.ptrw(), 0, ubo_data.size()); //clear
+		}
+
+		free_uniform_set(); // any existing set was built for the previous buffer
+	}
+
+	//check whether buffer changed
+	if (p_uniform_dirty && ubo_data.size()) {
+		update_uniform_buffer(shader_data->uniforms, shader_data->ubo_offsets.ptr(), p_parameters, ubo_data.ptrw(), ubo_data.size(), false);
+		RD::get_singleton()->buffer_update(uniform_buffer, 0, ubo_data.size(), ubo_data.ptrw(), RD::BARRIER_MASK_RASTER);
+	}
+
+	uint32_t tex_uniform_count = shader_data->texture_uniforms.size();
+
+	if ((uint32_t)texture_cache.size() != tex_uniform_count) {
+		texture_cache.resize(tex_uniform_count);
+		p_textures_dirty = true;
+		free_uniform_set(); // the number of texture bindings changed, the old set no longer matches
+	}
+
+	if (p_textures_dirty && tex_uniform_count) {
+		update_textures(p_parameters, shader_data->default_texture_params, shader_data->texture_uniforms, texture_cache.ptrw(), true);
+	}
+
+	if (shader_data->ubo_size == 0 && shader_data->texture_uniforms.size() == 0) {
+		// This material does not require an uniform set, so don't create it.
+		return;
+	}
+
+	if (!p_textures_dirty && uniform_set.is_valid() && RD::get_singleton()->uniform_set_is_valid(uniform_set)) {
+		//no reason to update uniform set, only UBO (or nothing) was needed to update
+		return;
+	}
+
+	Vector<RD::Uniform> uniforms;
+
+	{
+		if (shader_data->ubo_size) {
+			RD::Uniform u;
+			u.uniform_type = RD::UNIFORM_TYPE_UNIFORM_BUFFER;
+			u.binding = 0;
+			u.ids.push_back(uniform_buffer);
+			uniforms.push_back(u);
+		}
+
+		const RID *textures = texture_cache.ptrw();
+		for (uint32_t i = 0; i < tex_uniform_count; i++) {
+			RD::Uniform u;
+			u.uniform_type = RD::UNIFORM_TYPE_TEXTURE;
+			u.binding = 1 + i;
+			u.ids.push_back(textures[i]);
+			uniforms.push_back(u);
+		}
+	}
+
+	free_uniform_set(); // textures were dirty with unchanged counts: release the old set instead of overwriting (leaking) its RID
+	uniform_set = RD::get_singleton()->uniform_set_create(uniforms, shader_singleton->shader.version_get_shader(shader_data->version, 0), RenderForwardMobile::MATERIAL_UNIFORM_SET);
+}
+
+SceneShaderForwardMobile::MaterialData::~MaterialData() {
+	RD *rd = RD::get_singleton(); // the uniform set is released first, then the uniform buffer
+	if (uniform_set.is_valid() && rd->uniform_set_is_valid(uniform_set)) {
+		rd->free(uniform_set);
+	}
+	if (uniform_buffer.is_valid()) {
+		rd->free(uniform_buffer);
+	}
+}
+
+RendererStorageRD::MaterialData *SceneShaderForwardMobile::_create_material_func(ShaderData *p_shader) {
+	// Only links the new material to its shader; no parameters are uploaded here since an update will happen later anyway.
+	MaterialData *created = memnew(MaterialData);
+	created->last_frame = false;
+	created->shader_data = p_shader;
+	return created;
+}
+
+/* Scene Shader */
+
+SceneShaderForwardMobile *SceneShaderForwardMobile::singleton = nullptr; // assigned by the constructor below
+
+SceneShaderForwardMobile::SceneShaderForwardMobile() {
+	// Registers this instance globally; there should be only one of these, contained within our RenderForwardMobile singleton.
+	singleton = this;
+}
+
+void SceneShaderForwardMobile::init(RendererStorageRD *p_storage, const String p_defines) {
+	storage = p_storage;
+
+	/* SCENE SHADER */
+
+	{
+		Vector<String> shader_versions;
+		shader_versions.push_back(""); // SHADER_VERSION_COLOR_PASS
+		shader_versions.push_back("\n#define USE_LIGHTMAP\n"); // SHADER_VERSION_LIGHTMAP_COLOR_PASS
+		shader_versions.push_back("\n#define MODE_RENDER_DEPTH\n"); // !BAS! SHADER_VERSION_SHADOW_PASS, should probably change this to MODE_RENDER_SHADOW because we don't have a depth pass here...
+		shader_versions.push_back("\n#define MODE_RENDER_DEPTH\n#define MODE_DUAL_PARABOLOID\n"); // SHADER_VERSION_DEPTH_PASS_DP (maybe rename to SHADER_VERSION_SHADOW_PASS_DP?)
+		shader_versions.push_back("\n#define MODE_RENDER_DEPTH\n#define MODE_RENDER_MATERIAL\n"); // SHADER_VERSION_DEPTH_PASS_WITH_MATERIAL
+		shader.initialize(shader_versions, p_defines);
+	}
+
+	storage->shader_set_data_request_function(RendererStorageRD::SHADER_TYPE_3D, _create_shader_funcs);
+	storage->material_set_data_request_function(RendererStorageRD::SHADER_TYPE_3D, _create_material_funcs);
+
+	{
+		//shader compiler
+		ShaderCompilerRD::DefaultIdentifierActions actions;
+
+		actions.renames["WORLD_MATRIX"] = "world_matrix";
+		actions.renames["WORLD_NORMAL_MATRIX"] = "world_normal_matrix";
+		actions.renames["INV_CAMERA_MATRIX"] = "scene_data.inv_camera_matrix";
+		actions.renames["CAMERA_MATRIX"] = "scene_data.camera_matrix";
+		actions.renames["PROJECTION_MATRIX"] = "projection_matrix";
+		actions.renames["INV_PROJECTION_MATRIX"] = "scene_data.inv_projection_matrix";
+		actions.renames["MODELVIEW_MATRIX"] = "modelview";
+		actions.renames["MODELVIEW_NORMAL_MATRIX"] = "modelview_normal";
+
+		actions.renames["VERTEX"] = "vertex";
+		actions.renames["NORMAL"] = "normal";
+		actions.renames["TANGENT"] = "tangent";
+		actions.renames["BINORMAL"] = "binormal";
+		actions.renames["POSITION"] = "position";
+		actions.renames["UV"] = "uv_interp";
+		actions.renames["UV2"] = "uv2_interp";
+		actions.renames["COLOR"] = "color_interp";
+		actions.renames["POINT_SIZE"] = "gl_PointSize";
+		actions.renames["INSTANCE_ID"] = "gl_InstanceIndex";
+
+		actions.renames["ALPHA_SCISSOR_THRESHOLD"] = "alpha_scissor_threshold";
+		actions.renames["ALPHA_HASH_SCALE"] = "alpha_hash_scale";
+		actions.renames["ALPHA_ANTIALIASING_EDGE"] = "alpha_antialiasing_edge";
+		actions.renames["ALPHA_TEXTURE_COORDINATE"] = "alpha_texture_coordinate";
+
+		//builtins
+
+		actions.renames["TIME"] = "scene_data.time";
+		actions.renames["VIEWPORT_SIZE"] = "scene_data.viewport_size";
+
+		actions.renames["FRAGCOORD"] = "gl_FragCoord";
+		actions.renames["FRONT_FACING"] = "gl_FrontFacing";
+		actions.renames["NORMAL_MAP"] = "normal_map";
+		actions.renames["NORMAL_MAP_DEPTH"] = "normal_map_depth";
+		actions.renames["ALBEDO"] = "albedo";
+		actions.renames["ALPHA"] = "alpha";
+		actions.renames["METALLIC"] = "metallic";
+		actions.renames["SPECULAR"] = "specular";
+		actions.renames["ROUGHNESS"] = "roughness";
+		actions.renames["RIM"] = "rim";
+		actions.renames["RIM_TINT"] = "rim_tint";
+		actions.renames["CLEARCOAT"] = "clearcoat";
+		actions.renames["CLEARCOAT_GLOSS"] = "clearcoat_gloss";
+		actions.renames["ANISOTROPY"] = "anisotropy";
+		actions.renames["ANISOTROPY_FLOW"] = "anisotropy_flow";
+		actions.renames["SSS_STRENGTH"] = "sss_strength";
+		actions.renames["SSS_TRANSMITTANCE_COLOR"] = "transmittance_color";
+		actions.renames["SSS_TRANSMITTANCE_DEPTH"] = "transmittance_depth";
+		actions.renames["SSS_TRANSMITTANCE_CURVE"] = "transmittance_curve";
+		actions.renames["SSS_TRANSMITTANCE_BOOST"] = "transmittance_boost";
+		actions.renames["BACKLIGHT"] = "backlight";
+		actions.renames["AO"] = "ao";
+		actions.renames["AO_LIGHT_AFFECT"] = "ao_light_affect";
+		actions.renames["EMISSION"] = "emission";
+		actions.renames["POINT_COORD"] = "gl_PointCoord";
+		actions.renames["INSTANCE_CUSTOM"] = "instance_custom";
+		actions.renames["SCREEN_UV"] = "screen_uv";
+		actions.renames["SCREEN_TEXTURE"] = "color_buffer";
+		actions.renames["DEPTH_TEXTURE"] = "depth_buffer";
+		actions.renames["NORMAL_ROUGHNESS_TEXTURE"] = "normal_roughness_buffer";
+		actions.renames["DEPTH"] = "gl_FragDepth";
+		actions.renames["OUTPUT_IS_SRGB"] = "true";
+		actions.renames["FOG"] = "custom_fog";
+		actions.renames["RADIANCE"] = "custom_radiance";
+		actions.renames["IRRADIANCE"] = "custom_irradiance";
+		actions.renames["BONE_INDICES"] = "bone_attrib";
+		actions.renames["BONE_WEIGHTS"] = "weight_attrib";
+		actions.renames["CUSTOM0"] = "custom0_attrib";
+		actions.renames["CUSTOM1"] = "custom1_attrib";
+		actions.renames["CUSTOM2"] = "custom2_attrib";
+		actions.renames["CUSTOM3"] = "custom3_attrib";
+
+		//for light
+		actions.renames["VIEW"] = "view";
+		actions.renames["LIGHT_COLOR"] = "light_color";
+		actions.renames["LIGHT"] = "light";
+		actions.renames["ATTENUATION"] = "attenuation";
+		actions.renames["SHADOW_ATTENUATION"] = "shadow_attenuation";
+		actions.renames["DIFFUSE_LIGHT"] = "diffuse_light";
+		actions.renames["SPECULAR_LIGHT"] = "specular_light";
+
+		actions.usage_defines["NORMAL"] = "#define NORMAL_USED\n";
+		actions.usage_defines["TANGENT"] = "#define TANGENT_USED\n";
+		actions.usage_defines["BINORMAL"] = "@TANGENT";
+		actions.usage_defines["RIM"] = "#define LIGHT_RIM_USED\n";
+		actions.usage_defines["RIM_TINT"] = "@RIM";
+		actions.usage_defines["CLEARCOAT"] = "#define LIGHT_CLEARCOAT_USED\n";
+		actions.usage_defines["CLEARCOAT_GLOSS"] = "@CLEARCOAT";
+		actions.usage_defines["ANISOTROPY"] = "#define LIGHT_ANISOTROPY_USED\n";
+		actions.usage_defines["ANISOTROPY_FLOW"] = "@ANISOTROPY";
+		actions.usage_defines["AO"] = "#define AO_USED\n";
+		actions.usage_defines["AO_LIGHT_AFFECT"] = "#define AO_USED\n";
+		actions.usage_defines["UV"] = "#define UV_USED\n";
+		actions.usage_defines["UV2"] = "#define UV2_USED\n";
+		actions.usage_defines["BONE_INDICES"] = "#define BONES_USED\n";
+		actions.usage_defines["BONE_WEIGHTS"] = "#define WEIGHTS_USED\n";
+		actions.usage_defines["CUSTOM0"] = "#define CUSTOM0\n";
+		actions.usage_defines["CUSTOM1"] = "#define CUSTOM1\n";
+		actions.usage_defines["CUSTOM2"] = "#define CUSTOM2\n";
+		actions.usage_defines["CUSTOM3"] = "#define CUSTOM3\n";
+		actions.usage_defines["NORMAL_MAP"] = "#define NORMAL_MAP_USED\n";
+		actions.usage_defines["NORMAL_MAP_DEPTH"] = "@NORMAL_MAP";
+		actions.usage_defines["COLOR"] = "#define COLOR_USED\n";
+		actions.usage_defines["INSTANCE_CUSTOM"] = "#define ENABLE_INSTANCE_CUSTOM\n";
+		actions.usage_defines["POSITION"] = "#define OVERRIDE_POSITION\n";
+
+		actions.usage_defines["ALPHA_SCISSOR_THRESHOLD"] = "#define ALPHA_SCISSOR_USED\n";
+		actions.usage_defines["ALPHA_HASH_SCALE"] = "#define ALPHA_HASH_USED\n";
+		actions.usage_defines["ALPHA_ANTIALIASING_EDGE"] = "#define ALPHA_ANTIALIASING_EDGE_USED\n";
+		actions.usage_defines["ALPHA_TEXTURE_COORDINATE"] = "@ALPHA_ANTIALIASING_EDGE";
+
+		actions.usage_defines["SSS_STRENGTH"] = "#define ENABLE_SSS\n";
+		actions.usage_defines["SSS_TRANSMITTANCE_DEPTH"] = "#define ENABLE_TRANSMITTANCE\n";
+		actions.usage_defines["BACKLIGHT"] = "#define LIGHT_BACKLIGHT_USED\n";
+		actions.usage_defines["SCREEN_TEXTURE"] = "#define SCREEN_TEXTURE_USED\n";
+		actions.usage_defines["SCREEN_UV"] = "#define SCREEN_UV_USED\n";
+
+		actions.usage_defines["DIFFUSE_LIGHT"] = "#define USE_LIGHT_SHADER_CODE\n";
+		actions.usage_defines["SPECULAR_LIGHT"] = "#define USE_LIGHT_SHADER_CODE\n";
+
+		actions.usage_defines["FOG"] = "#define CUSTOM_FOG_USED\n";
+		actions.usage_defines["RADIANCE"] = "#define CUSTOM_RADIANCE_USED\n";
+		actions.usage_defines["IRRADIANCE"] = "#define CUSTOM_IRRADIANCE_USED\n";
+
+		actions.render_mode_defines["skip_vertex_transform"] = "#define SKIP_TRANSFORM_USED\n";
+		actions.render_mode_defines["world_vertex_coords"] = "#define VERTEX_WORLD_COORDS_USED\n";
+		actions.render_mode_defines["ensure_correct_normals"] = "#define ENSURE_CORRECT_NORMALS\n";
+		actions.render_mode_defines["cull_front"] = "#define DO_SIDE_CHECK\n";
+		actions.render_mode_defines["cull_disabled"] = "#define DO_SIDE_CHECK\n";
+		actions.render_mode_defines["particle_trails"] = "#define USE_PARTICLE_TRAILS\n";
+
+		bool force_lambert = GLOBAL_GET("rendering/shading/overrides/force_lambert_over_burley");
+		if (!force_lambert) {
+			actions.render_mode_defines["diffuse_burley"] = "#define DIFFUSE_BURLEY\n";
+		}
+
+		actions.render_mode_defines["diffuse_oren_nayar"] = "#define DIFFUSE_OREN_NAYAR\n";
+		actions.render_mode_defines["diffuse_lambert_wrap"] = "#define DIFFUSE_LAMBERT_WRAP\n";
+		actions.render_mode_defines["diffuse_toon"] = "#define DIFFUSE_TOON\n";
+
+		actions.render_mode_defines["sss_mode_skin"] = "#define SSS_MODE_SKIN\n";
+
+		bool force_blinn = GLOBAL_GET("rendering/shading/overrides/force_blinn_over_ggx");
+		if (!force_blinn) {
+			actions.render_mode_defines["specular_schlick_ggx"] = "#define SPECULAR_SCHLICK_GGX\n";
+		} else {
+			actions.render_mode_defines["specular_schlick_ggx"] = "#define SPECULAR_BLINN\n";
+		}
+
+		actions.render_mode_defines["specular_blinn"] = "#define SPECULAR_BLINN\n";
+		actions.render_mode_defines["specular_phong"] = "#define SPECULAR_PHONG\n";
+		actions.render_mode_defines["specular_toon"] = "#define SPECULAR_TOON\n";
+		actions.render_mode_defines["specular_disabled"] = "#define SPECULAR_DISABLED\n";
+		actions.render_mode_defines["shadows_disabled"] = "#define SHADOWS_DISABLED\n";
+		actions.render_mode_defines["ambient_light_disabled"] = "#define AMBIENT_LIGHT_DISABLED\n";
+		actions.render_mode_defines["shadow_to_opacity"] = "#define USE_SHADOW_TO_OPACITY\n";
+		actions.render_mode_defines["unshaded"] = "#define MODE_UNSHADED\n";
+
+		actions.sampler_array_name = "material_samplers";
+		actions.base_texture_binding_index = 1;
+		actions.texture_layout_set = RenderForwardMobile::MATERIAL_UNIFORM_SET;
+		actions.base_uniform_string = "material.";
+		actions.base_varying_index = 10;
+
+		actions.default_filter = ShaderLanguage::FILTER_LINEAR_MIPMAP;
+		actions.default_repeat = ShaderLanguage::REPEAT_ENABLE;
+		actions.global_buffer_array_variable = "global_variables.data";
+		actions.instance_uniform_index_variable = "draw_call.instance_uniforms_ofs";
+
+		compiler.initialize(actions);
+	}
+
+	{
+		//default material and shader
+		default_shader = storage->shader_allocate();
+		storage->shader_initialize(default_shader);
+		storage->shader_set_code(default_shader, "shader_type spatial; void vertex() { ROUGHNESS = 0.8; } void fragment() { ALBEDO=vec3(0.6); ROUGHNESS=0.8; METALLIC=0.2; } \n");
+		default_material = storage->material_allocate();
+		storage->material_initialize(default_material);
+		storage->material_set_shader(default_material, default_shader);
+
+		MaterialData *md = (MaterialData *)storage->material_get_data(default_material, RendererStorageRD::SHADER_TYPE_3D);
+		default_shader_rd = shader.version_get_shader(md->shader_data->version, SHADER_VERSION_COLOR_PASS);
+	}
+
+	{
+		overdraw_material_shader = storage->shader_allocate();
+		storage->shader_initialize(overdraw_material_shader);
+		storage->shader_set_code(overdraw_material_shader, "shader_type spatial;\nrender_mode blend_add,unshaded;\n void fragment() { ALBEDO=vec3(0.4,0.8,0.8); ALPHA=0.2; }");
+		overdraw_material = storage->material_allocate();
+		storage->material_initialize(overdraw_material);
+		storage->material_set_shader(overdraw_material, overdraw_material_shader);
+
+		wireframe_material_shader = storage->shader_allocate();
+		storage->shader_initialize(wireframe_material_shader);
+		storage->shader_set_code(wireframe_material_shader, "shader_type spatial;\nrender_mode wireframe,unshaded;\n void fragment() { ALBEDO=vec3(0.0,0.0,0.0); }");
+		wireframe_material = storage->material_allocate();
+		storage->material_initialize(wireframe_material);
+		storage->material_set_shader(wireframe_material, wireframe_material_shader);
+	}
+
+	{
+		default_vec4_xform_buffer = RD::get_singleton()->storage_buffer_create(256);
+		Vector<RD::Uniform> uniforms;
+		RD::Uniform u;
+		u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
+		u.ids.push_back(default_vec4_xform_buffer);
+		u.binding = 0;
+		uniforms.push_back(u);
+
+		default_vec4_xform_uniform_set = RD::get_singleton()->uniform_set_create(uniforms, default_shader_rd, RenderForwardMobile::TRANSFORMS_UNIFORM_SET);
+	}
+	{
+		RD::SamplerState sampler;
+		sampler.mag_filter = RD::SAMPLER_FILTER_LINEAR;
+		sampler.min_filter = RD::SAMPLER_FILTER_LINEAR;
+		sampler.enable_compare = true;
+		sampler.compare_op = RD::COMPARE_OP_LESS;
+		shadow_sampler = RD::get_singleton()->sampler_create(sampler);
+	}
+}
+
+SceneShaderForwardMobile::~SceneShaderForwardMobile() {
+	// Objects created directly on the rendering device are released first.
+	RD *rd = RD::get_singleton();
+	rd->free(default_vec4_xform_buffer);
+	rd->free(shadow_sampler);
+
+	// Storage-owned RIDs follow in the original order: the three shaders, then the three materials.
+	const RID storage_owned[] = { wireframe_material_shader, overdraw_material_shader, default_shader, wireframe_material, overdraw_material, default_material };
+	for (const RID &rid : storage_owned) {
+		storage->free(rid);
+	}
+}
diff --git a/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.h b/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.h
new file mode 100644
index 0000000000..1517197d25
--- /dev/null
+++ b/servers/rendering/renderer_rd/forward_mobile/scene_shader_forward_mobile.h
@@ -0,0 +1,203 @@
+/*************************************************************************/
+/*  scene_shader_forward_mobile.h                                        */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef RSSR_SCENE_SHADER_FM_H
+#define RSSR_SCENE_SHADER_FM_H
+
+#include "servers/rendering/renderer_rd/renderer_scene_render_rd.h"
+#include "servers/rendering/renderer_rd/renderer_storage_rd.h"
+#include "servers/rendering/renderer_rd/shaders/scene_forward_mobile.glsl.gen.h"
+
+namespace RendererSceneRenderImplementation {
+
+// Owns the forward mobile scene shader, its compiler and the built-in default, overdraw and wireframe materials.
+class SceneShaderForwardMobile {
+private:
+	static SceneShaderForwardMobile *singleton; // Instance the static _create_*_funcs trampolines forward to.
+	RendererStorageRD *storage = nullptr; // Storage used to free the shader and material RIDs in the destructor.
+
+public:
+	enum ShaderVersion {
+		SHADER_VERSION_COLOR_PASS,
+		SHADER_VERSION_LIGHTMAP_COLOR_PASS,
+		SHADER_VERSION_SHADOW_PASS,
+		SHADER_VERSION_DEPTH_PASS_DP,
+		SHADER_VERSION_DEPTH_PASS_WITH_MATERIAL,
+		SHADER_VERSION_MAX // Count of versions; sizes the last dimension of ShaderData::pipelines.
+	};
+
+	struct ShaderData : public RendererStorageRD::ShaderData {
+		enum BlendMode { // Used internally.
+			BLEND_MODE_MIX,
+			BLEND_MODE_ADD,
+			BLEND_MODE_SUB,
+			BLEND_MODE_MUL,
+			BLEND_MODE_ALPHA_TO_COVERAGE
+		};
+
+		enum DepthDraw {
+			DEPTH_DRAW_DISABLED,
+			DEPTH_DRAW_OPAQUE,
+			DEPTH_DRAW_ALWAYS
+		};
+
+		enum DepthTest {
+			DEPTH_TEST_DISABLED,
+			DEPTH_TEST_ENABLED
+		};
+
+		enum Cull {
+			CULL_DISABLED,
+			CULL_FRONT,
+			CULL_BACK
+		};
+
+		enum CullVariant {
+			CULL_VARIANT_NORMAL,
+			CULL_VARIANT_REVERSED,
+			CULL_VARIANT_DOUBLE_SIDED,
+			CULL_VARIANT_MAX // Count of variants; sizes the first dimension of pipelines.
+		};
+
+		enum AlphaAntiAliasing {
+			ALPHA_ANTIALIASING_OFF,
+			ALPHA_ANTIALIASING_ALPHA_TO_COVERAGE,
+			ALPHA_ANTIALIASING_ALPHA_TO_COVERAGE_AND_TO_ONE
+		};
+
+		bool valid = false;
+		RID version;
+		uint32_t vertex_input_mask = 0;
+		PipelineCacheRD pipelines[CULL_VARIANT_MAX][RS::PRIMITIVE_MAX][SHADER_VERSION_MAX]; // Indexed [cull variant][primitive][shader version].
+
+		String path;
+
+		Map<StringName, ShaderLanguage::ShaderNode::Uniform> uniforms;
+		Vector<ShaderCompilerRD::GeneratedCode::Texture> texture_uniforms;
+
+		Vector<uint32_t> ubo_offsets;
+		uint32_t ubo_size = 0;
+
+		String code;
+		Map<StringName, RID> default_texture_params;
+
+		DepthDraw depth_draw; // NOTE(review): no default given, the right initial enumerator is not visible here; confirm set_code() assigns both.
+		DepthTest depth_test;
+
+		bool uses_point_size = false;
+		bool uses_alpha = false;
+		bool uses_blend_alpha = false;
+		bool uses_alpha_clip = false;
+		bool uses_depth_pre_pass = false;
+		bool uses_discard = false;
+		bool uses_roughness = false;
+		bool uses_normal = false;
+		bool uses_particle_trails = false;
+
+		bool unshaded = false;
+		bool uses_vertex = false;
+		bool uses_sss = false;
+		bool uses_transmittance = false;
+		bool uses_screen_texture = false;
+		bool uses_depth_texture = false;
+		bool uses_normal_texture = false;
+		bool uses_time = false;
+		bool writes_modelview_or_projection = false;
+		bool uses_world_coordinates = false;
+
+		uint64_t last_pass = 0;
+		uint32_t index = 0;
+
+		virtual void set_code(const String &p_code);
+		virtual void set_default_texture_param(const StringName &p_name, RID p_texture);
+		virtual void get_param_list(List<PropertyInfo> *p_param_list) const;
+		void get_instance_param_list(List<RendererStorage::InstanceShaderParam> *p_param_list) const;
+
+		virtual bool is_param_texture(const StringName &p_param) const;
+		virtual bool is_animated() const;
+		virtual bool casts_shadows() const;
+		virtual Variant get_default_parameter(const StringName &p_parameter) const;
+		virtual RS::ShaderNativeSourceCode get_native_source_code() const;
+
+		ShaderData();
+		virtual ~ShaderData();
+	};
+
+	RendererStorageRD::ShaderData *_create_shader_func();
+	static RendererStorageRD::ShaderData *_create_shader_funcs() { // Static entry point; forwards to the singleton instance.
+		return singleton->_create_shader_func();
+	}
+
+	struct MaterialData : public RendererStorageRD::MaterialData {
+		uint64_t last_frame = 0;
+		ShaderData *shader_data = nullptr;
+		RID uniform_buffer;
+		RID uniform_set;
+		Vector<RID> texture_cache;
+		Vector<uint8_t> ubo_data;
+		uint64_t last_pass = 0;
+		uint32_t index = 0;
+		RID next_pass;
+		uint8_t priority = 0;
+		virtual void set_render_priority(int p_priority);
+		virtual void set_next_pass(RID p_pass);
+		virtual void update_parameters(const Map<StringName, Variant> &p_parameters, bool p_uniform_dirty, bool p_textures_dirty);
+		virtual ~MaterialData();
+	};
+
+	RendererStorageRD::MaterialData *_create_material_func(ShaderData *p_shader);
+	static RendererStorageRD::MaterialData *_create_material_funcs(RendererStorageRD::ShaderData *p_shader) { // Static entry point; downcasts p_shader and forwards to the singleton.
+		return singleton->_create_material_func(static_cast<ShaderData *>(p_shader));
+	}
+
+	SceneForwardMobileShaderRD shader;
+	ShaderCompilerRD compiler;
+
+	RID default_shader; // The shader and material RIDs below are released through storage in the destructor.
+	RID default_material;
+	RID overdraw_material_shader;
+	RID overdraw_material;
+	RID wireframe_material_shader;
+	RID wireframe_material;
+	RID default_shader_rd; // SHADER_VERSION_COLOR_PASS variant of the default material's shader.
+
+	RID default_vec4_xform_buffer; // 256-byte storage buffer, released on the rendering device in the destructor.
+	RID default_vec4_xform_uniform_set; // Uniform set exposing default_vec4_xform_buffer at binding 0.
+
+	RID shadow_sampler; // Linear-filtered comparison sampler using COMPARE_OP_LESS.
+
+	SceneShaderForwardMobile();
+	~SceneShaderForwardMobile();
+
+	void init(RendererStorageRD *p_storage, const String p_defines);
+};
+
+} // namespace RendererSceneRenderImplementation
+#endif // !RSSR_SCENE_SHADER_FM_H
diff --git a/servers/rendering/renderer_rd/renderer_canvas_render_rd.cpp b/servers/rendering/renderer_rd/renderer_canvas_render_rd.cpp
index 7d6e2fa8e4..377b0fd72d 100644
--- a/servers/rendering/renderer_rd/renderer_canvas_render_rd.cpp
+++ b/servers/rendering/renderer_rd/renderer_canvas_render_rd.cpp
@@ -304,7 +304,7 @@ RendererCanvasRender::PolygonID RendererCanvasRenderRD::request_polygon(const Ve
 		index_buffer.resize(p_indices.size() * sizeof(int32_t));
 		{
 			uint8_t *w = index_buffer.ptrw();
-			copymem(w, p_indices.ptr(), sizeof(int32_t) * p_indices.size());
+			memcpy(w, p_indices.ptr(), sizeof(int32_t) * p_indices.size());
 		}
 		pb.index_buffer = RD::get_singleton()->index_buffer_create(p_indices.size(), RD::INDEX_BUFFER_FORMAT_UINT32, index_buffer);
 		pb.indices = RD::get_singleton()->index_array_create(pb.index_buffer, 0, p_indices.size());
@@ -2012,6 +2012,9 @@ void RendererCanvasRenderRD::ShaderData::set_code(const String &p_code) {
 	uses_screen_texture = false;
 
 	ShaderCompilerRD::IdentifierActions actions;
+	actions.entry_point_stages["vertex"] = ShaderCompilerRD::STAGE_VERTEX;
+	actions.entry_point_stages["fragment"] = ShaderCompilerRD::STAGE_FRAGMENT;
+	actions.entry_point_stages["light"] = ShaderCompilerRD::STAGE_FRAGMENT;
 
 	actions.render_mode_values["blend_add"] = Pair<int *, int>(&blend_mode, BLEND_MODE_ADD);
 	actions.render_mode_values["blend_mix"] = Pair<int *, int>(&blend_mode, BLEND_MODE_MIX);
@@ -2048,7 +2051,7 @@ void RendererCanvasRenderRD::ShaderData::set_code(const String &p_code) {
 	print_line("\n**fragment_code:\n" + gen_code.fragment);
 	print_line("\n**light_code:\n" + gen_code.light);
 #endif
-	canvas_singleton->shader.canvas_shader.version_set_code(version, gen_code.uniforms, gen_code.vertex_global, gen_code.vertex, gen_code.fragment_global, gen_code.light, gen_code.fragment, gen_code.defines);
+	canvas_singleton->shader.canvas_shader.version_set_code(version, gen_code.code, gen_code.uniforms, gen_code.stage_globals[ShaderCompilerRD::STAGE_VERTEX], gen_code.stage_globals[ShaderCompilerRD::STAGE_FRAGMENT], gen_code.defines);
 	ERR_FAIL_COND(!canvas_singleton->shader.canvas_shader.version_is_valid(version));
 
 	ubo_size = gen_code.uniform_total_size;
diff --git a/servers/rendering/renderer_rd/renderer_compositor_rd.cpp b/servers/rendering/renderer_rd/renderer_compositor_rd.cpp
index 2247b841c9..cb3e67e990 100644
--- a/servers/rendering/renderer_rd/renderer_compositor_rd.cpp
+++ b/servers/rendering/renderer_rd/renderer_compositor_rd.cpp
@@ -175,5 +175,14 @@ RendererCompositorRD::RendererCompositorRD() {
 
 	storage = memnew(RendererStorageRD);
 	canvas = memnew(RendererCanvasRenderRD(storage));
-	scene = memnew(RendererSceneRenderImplementation::RenderForwardClustered(storage));
+
+	uint32_t back_end = GLOBAL_GET("rendering/vulkan/rendering/back_end");
+	uint32_t textures_per_stage = RD::get_singleton()->limit_get(RD::LIMIT_MAX_TEXTURES_PER_SHADER_STAGE);
+
+	if (back_end == 1 || textures_per_stage < 48) {
+		scene = memnew(RendererSceneRenderImplementation::RenderForwardMobile(storage));
+	} else { // back_end == 0
+		// default to our high end renderer
+		scene = memnew(RendererSceneRenderImplementation::RenderForwardClustered(storage));
+	}
 }
diff --git a/servers/rendering/renderer_rd/renderer_compositor_rd.h b/servers/rendering/renderer_rd/renderer_compositor_rd.h
index 5b5f3ad0cb..b3865de2bf 100644
--- a/servers/rendering/renderer_rd/renderer_compositor_rd.h
+++ b/servers/rendering/renderer_rd/renderer_compositor_rd.h
@@ -35,6 +35,7 @@
 #include "core/templates/thread_work_pool.h"
 #include "servers/rendering/renderer_compositor.h"
 #include "servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h"
+#include "servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h"
 #include "servers/rendering/renderer_rd/renderer_canvas_render_rd.h"
 #include "servers/rendering/renderer_rd/renderer_storage_rd.h"
 
diff --git a/servers/rendering/renderer_rd/renderer_scene_gi_rd.cpp b/servers/rendering/renderer_rd/renderer_scene_gi_rd.cpp
index 3856f38457..bc92e0b1ad 100644
--- a/servers/rendering/renderer_rd/renderer_scene_gi_rd.cpp
+++ b/servers/rendering/renderer_rd/renderer_scene_gi_rd.cpp
@@ -1564,7 +1564,7 @@ void RendererSceneGIRD::SDFGI::render_region(RID p_render_buffers, int p_region,
 		//clear dispatch indirect data
 
 		SDFGIShader::PreprocessPushConstant push_constant;
-		zeromem(&push_constant, sizeof(SDFGIShader::PreprocessPushConstant));
+		memset(&push_constant, 0, sizeof(SDFGIShader::PreprocessPushConstant));
 
 		RENDER_TIMESTAMP("Scroll SDF");
 
@@ -2602,7 +2602,7 @@ void RendererSceneGIRD::GIProbeInstance::update(bool p_update_light_instances, c
 				p_scene_render->_render_material(to_world_xform * xform, cm, true, p_scene_render->cull_argument, dynamic_maps[0].fb, Rect2i(Vector2i(), rect.size));
 
 				GIProbeDynamicPushConstant push_constant;
-				zeromem(&push_constant, sizeof(GIProbeDynamicPushConstant));
+				memset(&push_constant, 0, sizeof(GIProbeDynamicPushConstant));
 				push_constant.limits[0] = octree_size.x;
 				push_constant.limits[1] = octree_size.y;
 				push_constant.limits[2] = octree_size.z;
@@ -3144,8 +3144,6 @@ void RendererSceneGIRD::process_gi(RID p_render_buffers, RID p_normal_roughness_
 		rb->reflection_buffer = RD::get_singleton()->texture_create(tf, RD::TextureView());
 		rb->ambient_buffer = RD::get_singleton()->texture_create(tf, RD::TextureView());
 		rb->gi.using_half_size_gi = half_resolution;
-
-		p_scene_render->_render_buffers_uniform_set_changed(p_render_buffers);
 	}
 
 	PushConstant push_constant;
diff --git a/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp b/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp
index 4c5bded2ff..d7a5d1211c 100644
--- a/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp
+++ b/servers/rendering/renderer_rd/renderer_scene_render_rd.cpp
@@ -58,8 +58,6 @@ void RendererSceneRenderRD::sdfgi_update(RID p_render_buffers, RID p_environment
 			rb->sdfgi->erase();
 			memdelete(rb->sdfgi);
 			rb->sdfgi = nullptr;
-
-			_render_buffers_uniform_set_changed(p_render_buffers);
 		}
 		return;
 	}
@@ -78,8 +76,6 @@ void RendererSceneRenderRD::sdfgi_update(RID p_render_buffers, RID p_environment
 	if (sdfgi == nullptr) {
 		// re-create
 		rb->sdfgi = gi.create_sdfgi(env, p_world_position, requested_history_size);
-
-		_render_buffers_uniform_set_changed(p_render_buffers);
 	} else {
 		//check for updates
 		rb->sdfgi->update(env, p_world_position);
@@ -317,7 +313,7 @@ void RendererSceneRenderRD::environment_set_sdfgi(RID p_env, bool p_enable, RS::
 	RendererSceneEnvironmentRD *env = environment_owner.getornull(p_env);
 	ERR_FAIL_COND(!env);
 
-	if (low_end) {
+	if (!is_dynamic_gi_supported()) {
 		return;
 	}
 
@@ -379,7 +375,7 @@ void RendererSceneRenderRD::environment_set_volumetric_fog(RID p_env, bool p_ena
 	RendererSceneEnvironmentRD *env = environment_owner.getornull(p_env);
 	ERR_FAIL_COND(!env);
 
-	if (low_end) {
+	if (!is_volumetric_supported()) {
 		return;
 	}
 
@@ -410,10 +406,6 @@ void RendererSceneRenderRD::environment_set_ssr(RID p_env, bool p_enable, int p_
 	RendererSceneEnvironmentRD *env = environment_owner.getornull(p_env);
 	ERR_FAIL_COND(!env);
 
-	if (low_end) {
-		return;
-	}
-
 	env->set_ssr(p_enable, p_max_steps, p_fade_int, p_fade_out, p_depth_tolerance);
 }
 
@@ -429,10 +421,6 @@ void RendererSceneRenderRD::environment_set_ssao(RID p_env, bool p_enable, float
 	RendererSceneEnvironmentRD *env = environment_owner.getornull(p_env);
 	ERR_FAIL_COND(!env);
 
-	if (low_end) {
-		return;
-	}
-
 	env->set_ssao(p_enable, p_radius, p_intensity, p_power, p_detail, p_horizon, p_sharpness, p_light_affect, p_ao_channel_affect);
 }
 
@@ -1347,7 +1335,7 @@ void RendererSceneRenderRD::gi_probe_instance_set_transform_to_data(RID p_probe,
 }
 
 bool RendererSceneRenderRD::gi_probe_needs_update(RID p_probe) const {
-	if (low_end) {
+	if (!is_dynamic_gi_supported()) {
 		return false;
 	}
 
@@ -1355,7 +1343,7 @@ bool RendererSceneRenderRD::gi_probe_needs_update(RID p_probe) const {
 }
 
 void RendererSceneRenderRD::gi_probe_update(RID p_probe, bool p_update_light_instances, const Vector<RID> &p_light_instances, const PagedArray<GeometryInstance *> &p_dynamic_objects) {
-	if (low_end) {
+	if (!is_dynamic_gi_supported()) {
 		return;
 	}
 
@@ -1541,7 +1529,6 @@ void RendererSceneRenderRD::_process_sss(RID p_render_buffers, const CameraMatri
 
 	if (rb->blur[0].texture.is_null()) {
 		_allocate_blur_textures(rb);
-		_render_buffers_uniform_set_changed(p_render_buffers);
 	}
 
 	storage->get_effects()->sub_surface_scattering(rb->texture, rb->blur[0].mipmaps[0].texture, rb->depth_texture, p_camera, Size2i(rb->width, rb->height), sss_scale, sss_depth_scale, sss_quality);
@@ -1593,7 +1580,6 @@ void RendererSceneRenderRD::_process_ssr(RID p_render_buffers, RID p_dest_frameb
 
 	if (rb->blur[0].texture.is_null()) {
 		_allocate_blur_textures(rb);
-		_render_buffers_uniform_set_changed(p_render_buffers);
 	}
 
 	storage->get_effects()->screen_space_reflection(rb->texture, p_normal_buffer, ssr_roughness_quality, rb->ssr.blur_radius[0], rb->ssr.blur_radius[1], p_metallic, p_metallic_mask, rb->depth_texture, rb->ssr.depth_scaled, rb->ssr.normal_scaled, rb->blur[0].mipmaps[1].texture, rb->blur[1].mipmaps[0].texture, Size2i(rb->width / 2, rb->height / 2), env->ssr_max_steps, env->ssr_fade_in, env->ssr_fade_out, env->ssr_depth_tolerance, p_projection);
@@ -1719,7 +1705,6 @@ void RendererSceneRenderRD::_process_ssao(RID p_render_buffers, RID p_environmen
 			tf.usage_bits = RD::TEXTURE_USAGE_SAMPLING_BIT | RD::TEXTURE_USAGE_STORAGE_BIT;
 			rb->ssao.ao_final = RD::get_singleton()->texture_create(tf, RD::TextureView());
 			RD::get_singleton()->set_resource_name(rb->ssao.ao_final, "SSAO Final");
-			_render_buffers_uniform_set_changed(p_render_buffers);
 		}
 		ssao_using_half_size = ssao_half_size;
 		uniform_sets_are_invalid = true;
@@ -1759,7 +1744,6 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(RID p_rende
 	if (can_use_effects && camfx && (camfx->dof_blur_near_enabled || camfx->dof_blur_far_enabled) && camfx->dof_blur_amount > 0.0) {
 		if (rb->blur[0].texture.is_null()) {
 			_allocate_blur_textures(rb);
-			_render_buffers_uniform_set_changed(p_render_buffers);
 		}
 
 		float bokeh_size = camfx->dof_blur_amount * 64.0;
@@ -1769,7 +1753,6 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(RID p_rende
 	if (can_use_effects && env && env->auto_exposure) {
 		if (rb->luminance.current.is_null()) {
 			_allocate_luminance_textures(rb);
-			_render_buffers_uniform_set_changed(p_render_buffers);
 		}
 
 		bool set_immediate = env->auto_exposure_version != rb->auto_exposure_version;
@@ -1790,7 +1773,6 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(RID p_rende
 
 		if (rb->blur[1].texture.is_null()) {
 			_allocate_blur_textures(rb);
-			_render_buffers_uniform_set_changed(p_render_buffers);
 		}
 
 		for (int i = 0; i < RS::MAX_GLOW_LEVELS; i++) {
@@ -1881,7 +1863,7 @@ void RendererSceneRenderRD::_render_buffers_post_process_and_tonemap(RID p_rende
 	storage->render_target_disable_clear_request(rb->render_target);
 }
 
-void RendererSceneRenderRD::_render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas) {
+void RendererSceneRenderRD::_render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas, RID p_occlusion_buffer) {
 	EffectsRD *effects = storage->get_effects();
 
 	RenderBuffers *rb = render_buffers_owner.getornull(p_render_buffers);
@@ -1940,6 +1922,13 @@ void RendererSceneRenderRD::_render_buffers_debug_draw(RID p_render_buffers, RID
 		RID reflection_texture = rb->reflection_buffer;
 		effects->copy_to_fb_rect(ambient_texture, storage->render_target_get_rd_framebuffer(rb->render_target), Rect2(Vector2(), rtsize), false, false, false, true, reflection_texture);
 	}
+
+	if (debug_draw == RS::VIEWPORT_DEBUG_DRAW_OCCLUDERS) {
+		if (p_occlusion_buffer.is_valid()) {
+			Size2 rtsize = storage->render_target_get_size(rb->render_target);
+			effects->copy_to_fb_rect(storage->texture_get_rd_texture(p_occlusion_buffer), storage->render_target_get_rd_framebuffer(rb->render_target), Rect2i(Vector2(), rtsize), true, false);
+		}
+	}
 }
 
 void RendererSceneRenderRD::environment_set_adjustment(RID p_env, bool p_enable, float p_brightness, float p_contrast, float p_saturation, bool p_use_1d_color_correction, RID p_color_correction) {
@@ -2178,7 +2167,6 @@ void RendererSceneRenderRD::render_buffers_configure(RID p_render_buffers, RID p
 	}
 
 	rb->data->configure(rb->texture, rb->depth_texture, p_width, p_height, p_msaa);
-	_render_buffers_uniform_set_changed(p_render_buffers);
 
 	if (is_clustered_enabled()) {
 		rb->cluster_builder->setup(Size2i(p_width, p_height), max_cluster_elements, rb->depth_texture, storage->sampler_rd_get_default(RS::CANVAS_ITEM_TEXTURE_FILTER_NEAREST, RS::CANVAS_ITEM_TEXTURE_REPEAT_DISABLED), rb->texture);
@@ -2330,6 +2318,8 @@ void RendererSceneRenderRD::_setup_reflections(const PagedArray<RID> &p_reflecti
 
 		Vector3 extents = storage->reflection_probe_get_extents(base_probe);
 
+		rpi->cull_mask = storage->reflection_probe_get_cull_mask(base_probe);
+
 		reflection_ubo.box_extents[0] = extents.x;
 		reflection_ubo.box_extents[1] = extents.y;
 		reflection_ubo.box_extents[2] = extents.z;
@@ -2358,13 +2348,15 @@ void RendererSceneRenderRD::_setup_reflections(const PagedArray<RID> &p_reflecti
 		Transform proj = (p_camera_inverse_transform * transform).inverse();
 		RendererStorageRD::store_transform(proj, reflection_ubo.local_matrix);
 
-		current_cluster_builder->add_box(ClusterBuilderRD::BOX_TYPE_REFLECTION_PROBE, transform, extents);
+		if (current_cluster_builder != nullptr) {
+			current_cluster_builder->add_box(ClusterBuilderRD::BOX_TYPE_REFLECTION_PROBE, transform, extents);
+		}
 
 		rpi->last_pass = RSG::rasterizer->get_frame_number();
 	}
 
 	if (cluster.reflection_count) {
-		RD::get_singleton()->buffer_update(cluster.reflection_buffer, 0, cluster.reflection_count * sizeof(RendererSceneSkyRD::ReflectionData), cluster.reflections, RD::BARRIER_MASK_RASTER | RD::BARRIER_MASK_COMPUTE);
+		RD::get_singleton()->buffer_update(cluster.reflection_buffer, 0, cluster.reflection_count * sizeof(Cluster::ReflectionData), cluster.reflections, RD::BARRIER_MASK_RASTER | RD::BARRIER_MASK_COMPUTE);
 	}
 }
 
@@ -2562,6 +2554,7 @@ void RendererSceneRenderRD::_setup_lights(const PagedArray<RID> &p_lights, const
 
 					light_data.soft_shadow_scale = storage->light_get_param(base, RS::LIGHT_PARAM_SHADOW_BLUR);
 					light_data.softshadow_angle = angular_diameter;
+					light_data.bake_mode = storage->light_get_bake_mode(base);
 
 					if (angular_diameter <= 0.0) {
 						light_data.soft_shadow_scale *= directional_shadow_quality_radius_get(); // Only use quality radius for PCF
@@ -2629,6 +2622,7 @@ void RendererSceneRenderRD::_setup_lights(const PagedArray<RID> &p_lights, const
 		light_data.color[1] = linear_col.g * energy;
 		light_data.color[2] = linear_col.b * energy;
 		light_data.specular_amount = storage->light_get_param(base, RS::LIGHT_PARAM_SPECULAR) * 2.0;
+		light_data.bake_mode = storage->light_get_bake_mode(base);
 
 		float radius = MAX(0.001, storage->light_get_param(base, RS::LIGHT_PARAM_RANGE));
 		light_data.inv_radius = 1.0 / radius;
@@ -2748,8 +2742,11 @@ void RendererSceneRenderRD::_setup_lights(const PagedArray<RID> &p_lights, const
 		}
 
 		li->light_index = index;
+		li->cull_mask = storage->light_get_cull_mask(base);
 
-		current_cluster_builder->add_light(type == RS::LIGHT_SPOT ? ClusterBuilderRD::LIGHT_TYPE_SPOT : ClusterBuilderRD::LIGHT_TYPE_OMNI, light_transform, radius, spot_angle);
+		if (current_cluster_builder != nullptr) {
+			current_cluster_builder->add_light(type == RS::LIGHT_SPOT ? ClusterBuilderRD::LIGHT_TYPE_SPOT : ClusterBuilderRD::LIGHT_TYPE_OMNI, light_transform, radius, spot_angle);
+		}
 
 		r_positional_light_count++;
 	}
@@ -2817,6 +2814,9 @@ void RendererSceneRenderRD::_setup_decals(const PagedArray<RID> &p_decals, const
 		DecalInstance *di = cluster.decal_sort[i].instance;
 		RID decal = di->decal;
 
+		di->render_index = i;
+		di->cull_mask = storage->decal_get_cull_mask(decal);
+
 		Transform xform = di->transform;
 		float fade = 1.0;
 
@@ -2921,7 +2921,9 @@ void RendererSceneRenderRD::_setup_decals(const PagedArray<RID> &p_decals, const
 		dd.upper_fade = storage->decal_get_upper_fade(decal);
 		dd.lower_fade = storage->decal_get_lower_fade(decal);
 
-		current_cluster_builder->add_box(ClusterBuilderRD::BOX_TYPE_DECAL, xform, decal_extents);
+		if (current_cluster_builder != nullptr) {
+			current_cluster_builder->add_box(ClusterBuilderRD::BOX_TYPE_DECAL, xform, decal_extents);
+		}
 	}
 
 	if (cluster.decal_count > 0) {
@@ -2929,6 +2931,116 @@ void RendererSceneRenderRD::_setup_decals(const PagedArray<RID> &p_decals, const
 	}
 }
 
+// Packs the indices of the omni lights, spot lights, reflection probes and decals that affect an instance
+// into four arrays of p_max_dst_words 32-bit words each, one 8-bit slot per index and four slots per word.
+// An entry is skipped when its RID does not resolve, its cull mask shares no bit with p_layer_mask or its
+// index is 255 or above; 0xFF is written after the last entry unless every slot of the array is in use.
+void RendererSceneRenderRD::_fill_instance_indices(const RID *p_omni_light_instances, uint32_t p_omni_light_instance_count, uint32_t *p_omni_light_indices, const RID *p_spot_light_instances, uint32_t p_spot_light_instance_count, uint32_t *p_spot_light_indices, const RID *p_reflection_probe_instances, uint32_t p_reflection_probe_instance_count, uint32_t *p_reflection_probe_indices, const RID *p_decal_instances, uint32_t p_decal_instance_count, uint32_t *p_decal_instance_indices, uint32_t p_layer_mask, uint32_t p_max_dst_words) {
+	// Clear every destination word first; the slots are then filled by OR-ing shifted bytes in.
+	for (uint32_t i = 0; i < p_max_dst_words; i++) {
+		p_omni_light_indices[i] = 0;
+		p_spot_light_indices[i] = 0;
+		p_reflection_probe_indices[i] = 0;
+		p_decal_instance_indices[i] = 0;
+	}
+
+	{
+		// Omni lights: dword/shift track the next free 8-bit slot; stop once all words are full.
+		uint32_t dword = 0;
+		uint32_t shift = 0;
+
+		for (uint32_t i = 0; i < p_omni_light_instance_count && dword < p_max_dst_words; i++) {
+			LightInstance *li = light_instance_owner.getornull(p_omni_light_instances[i]);
+			if (li && (li->cull_mask & p_layer_mask) && (li->light_index < 255)) {
+				p_omni_light_indices[dword] |= li->light_index << shift;
+				if (shift == 24) {
+					dword++;
+					shift = 0;
+				} else {
+					shift += 8;
+				}
+			}
+		}
+
+		if (dword < p_max_dst_words) {
+			// A slot is left: put in the ending mark (unsigned so the shift by 24 cannot overflow int).
+			p_omni_light_indices[dword] |= 0xFFu << shift;
+		}
+	}
+
+	{
+		// Spot lights, packed the same way into their own array.
+		uint32_t dword = 0;
+		uint32_t shift = 0;
+
+		for (uint32_t i = 0; i < p_spot_light_instance_count && dword < p_max_dst_words; i++) {
+			LightInstance *li = light_instance_owner.getornull(p_spot_light_instances[i]);
+			if (li && (li->cull_mask & p_layer_mask) && (li->light_index < 255)) {
+				p_spot_light_indices[dword] |= li->light_index << shift;
+				if (shift == 24) {
+					dword++;
+					shift = 0;
+				} else {
+					shift += 8;
+				}
+			}
+		}
+
+		if (dword < p_max_dst_words) {
+			// A slot is left: put in the ending mark.
+			p_spot_light_indices[dword] |= 0xFFu << shift;
+		}
+	}
+
+	{
+		// Reflection probes, identified by their render_index.
+		uint32_t dword = 0;
+		uint32_t shift = 0;
+
+		for (uint32_t i = 0; i < p_reflection_probe_instance_count && dword < p_max_dst_words; i++) {
+			ReflectionProbeInstance *rpi = reflection_probe_instance_owner.getornull(p_reflection_probe_instances[i]);
+			if (rpi && (rpi->cull_mask & p_layer_mask) && (rpi->render_index < 255)) {
+				p_reflection_probe_indices[dword] |= rpi->render_index << shift;
+				if (shift == 24) {
+					dword++;
+					shift = 0;
+				} else {
+					shift += 8;
+				}
+			}
+		}
+
+		if (dword < p_max_dst_words) {
+			// A slot is left: put in the ending mark.
+			p_reflection_probe_indices[dword] |= 0xFFu << shift;
+		}
+	}
+
+	{
+		// Decals, identified by their render_index.
+		uint32_t dword = 0;
+		uint32_t shift = 0;
+
+		for (uint32_t i = 0; i < p_decal_instance_count && dword < p_max_dst_words; i++) {
+			DecalInstance *decal = decal_instance_owner.getornull(p_decal_instances[i]);
+			if (decal && (decal->cull_mask & p_layer_mask) && (decal->render_index < 255)) {
+				p_decal_instance_indices[dword] |= decal->render_index << shift;
+				if (shift == 24) {
+					dword++;
+					shift = 0;
+				} else {
+					shift += 8;
+				}
+			}
+		}
+
+		if (dword < p_max_dst_words) {
+			// A slot is left: put in the ending mark.
+			p_decal_instance_indices[dword] |= 0xFFu << shift;
+		}
+	}
+}
+
 void RendererSceneRenderRD::_volumetric_fog_erase(RenderBuffers *rb) {
 	ERR_FAIL_COND(!rb->volumetric_fog);
 
@@ -2968,7 +3080,6 @@ void RendererSceneRenderRD::_update_volumetric_fog(RID p_render_buffers, RID p_e
 		//validate
 		if (!env || !env->volumetric_fog_enabled || rb->volumetric_fog->width != target_width || rb->volumetric_fog->height != target_height || rb->volumetric_fog->depth != volumetric_fog_depth) {
 			_volumetric_fog_erase(rb);
-			_render_buffers_uniform_set_changed(p_render_buffers);
 		}
 	}
 
@@ -3004,7 +3115,6 @@ void RendererSceneRenderRD::_update_volumetric_fog(RID p_render_buffers, RID p_e
 		tf.usage_bits = RD::TEXTURE_USAGE_STORAGE_BIT | RD::TEXTURE_USAGE_SAMPLING_BIT;
 
 		rb->volumetric_fog->fog_map = RD::get_singleton()->texture_create(tf, RD::TextureView());
-		_render_buffers_uniform_set_changed(p_render_buffers);
 
 		Vector<RD::Uniform> uniforms;
 		{
@@ -3524,12 +3634,12 @@ void RendererSceneRenderRD::_pre_opaque_render(bool p_use_ssao, bool p_use_gi, R
 	}
 }
 
-void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data) {
+void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_occluder_debug_tex, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data) {
 	// getting this here now so we can direct call a bunch of things more easily
 	RenderBuffers *rb = nullptr;
 	if (p_render_buffers.is_valid()) {
 		rb = render_buffers_owner.getornull(p_render_buffers);
-		ERR_FAIL_COND(!rb); // !BAS! Do we fail here or skip the parts that won't work. can't really see a case why we would be rendering without buffers....
+		ERR_FAIL_COND(!rb);
 	}
 
 	//assign render data
@@ -3585,10 +3695,12 @@ void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform &
 	}
 
 	//assign render indices to giprobes
-	for (uint32_t i = 0; i < (uint32_t)p_gi_probes.size(); i++) {
-		RendererSceneGIRD::GIProbeInstance *giprobe_inst = gi.gi_probe_instance_owner.getornull(p_gi_probes[i]);
-		if (giprobe_inst) {
-			giprobe_inst->render_index = i;
+	if (is_dynamic_gi_supported()) {
+		for (uint32_t i = 0; i < (uint32_t)p_gi_probes.size(); i++) {
+			RendererSceneGIRD::GIProbeInstance *giprobe_inst = gi.gi_probe_instance_owner.getornull(p_gi_probes[i]);
+			if (giprobe_inst) {
+				giprobe_inst->render_index = i;
+			}
 		}
 	}
 
@@ -3624,7 +3736,11 @@ void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform &
 
 	render_state.depth_prepass_used = false;
 	//calls _pre_opaque_render between depth pre-pass and opaque pass
-	_render_scene(p_render_buffers, p_cam_transform, p_cam_projection, p_cam_ortogonal, p_instances, *render_state.gi_probes, p_lightmaps, p_environment, current_cluster_builder->get_cluster_buffer(), current_cluster_builder->get_cluster_size(), current_cluster_builder->get_max_cluster_elements(), p_camera_effects, p_shadow_atlas, p_reflection_atlas, p_reflection_probe, p_reflection_probe_pass, clear_color, p_screen_lod_threshold);
+	if (current_cluster_builder != nullptr) {
+		_render_scene(p_render_buffers, p_cam_transform, p_cam_projection, p_cam_ortogonal, p_instances, *render_state.gi_probes, p_lightmaps, p_environment, current_cluster_builder->get_cluster_buffer(), current_cluster_builder->get_cluster_size(), current_cluster_builder->get_max_cluster_elements(), p_camera_effects, p_shadow_atlas, p_reflection_atlas, p_reflection_probe, p_reflection_probe_pass, clear_color, p_screen_lod_threshold);
+	} else {
+		_render_scene(p_render_buffers, p_cam_transform, p_cam_projection, p_cam_ortogonal, p_instances, *render_state.gi_probes, p_lightmaps, p_environment, RID(), 0, 0, p_camera_effects, p_shadow_atlas, p_reflection_atlas, p_reflection_probe, p_reflection_probe_pass, clear_color, p_screen_lod_threshold);
+	}
 
 	if (p_render_buffers.is_valid()) {
 		if (debug_draw == RS::VIEWPORT_DEBUG_DRAW_CLUSTER_OMNI_LIGHTS || debug_draw == RS::VIEWPORT_DEBUG_DRAW_CLUSTER_SPOT_LIGHTS || debug_draw == RS::VIEWPORT_DEBUG_DRAW_CLUSTER_DECALS || debug_draw == RS::VIEWPORT_DEBUG_DRAW_CLUSTER_REFLECTION_PROBES) {
@@ -3645,13 +3761,15 @@ void RendererSceneRenderRD::render_scene(RID p_render_buffers, const Transform &
 				default: {
 				}
 			}
-			current_cluster_builder->debug(elem_type);
+			if (current_cluster_builder != nullptr) {
+				current_cluster_builder->debug(elem_type);
+			}
 		}
 
 		RENDER_TIMESTAMP("Tonemap");
 
 		_render_buffers_post_process_and_tonemap(p_render_buffers, p_environment, p_camera_effects, p_cam_projection);
-		_render_buffers_debug_draw(p_render_buffers, p_shadow_atlas);
+		_render_buffers_debug_draw(p_render_buffers, p_shadow_atlas, p_occluder_debug_tex);
 		if (debug_draw == RS::VIEWPORT_DEBUG_DRAW_SDFGI && rb != nullptr && rb->sdfgi != nullptr) {
 			rb->sdfgi->debug_draw(p_cam_projection, p_cam_transform, rb->width, rb->height, rb->render_target, rb->texture);
 		}
@@ -4091,11 +4209,6 @@ int RendererSceneRenderRD::get_max_directional_lights() const {
 	return cluster.max_directional_lights;
 }
 
-bool RendererSceneRenderRD::is_low_end() const {
-	// by default we switch this on this (may be ignored in some implementations)
-	return GLOBAL_GET("rendering/driver/rd_renderer/use_low_end_renderer");
-}
-
 bool RendererSceneRenderRD::is_dynamic_gi_supported() const {
 	// usable by default (unless low end = true)
 	return true;
@@ -4111,8 +4224,12 @@ bool RendererSceneRenderRD::is_volumetric_supported() const {
 	return true;
 }
 
+uint32_t RendererSceneRenderRD::get_max_elements() const { // Returns the max_clustered_elements setting; declared virtual in the header.
+	return GLOBAL_GET("rendering/limits/cluster_builder/max_clustered_elements"); // NOTE(review): the constructor also calls this; C++ does not dispatch to subclass overrides during base construction.
+}
+
 RendererSceneRenderRD::RendererSceneRenderRD(RendererStorageRD *p_storage) {
-	max_cluster_elements = GLOBAL_GET("rendering/limits/cluster_builder/max_clustered_elements");
+	max_cluster_elements = get_max_elements();
 
 	storage = p_storage;
 	singleton = this;
@@ -4120,21 +4237,13 @@ RendererSceneRenderRD::RendererSceneRenderRD(RendererStorageRD *p_storage) {
 	directional_shadow.size = GLOBAL_GET("rendering/shadows/directional_shadow/size");
 	directional_shadow.use_16_bits = GLOBAL_GET("rendering/shadows/directional_shadow/16_bits");
 
-	uint32_t textures_per_stage = RD::get_singleton()->limit_get(RD::LIMIT_MAX_TEXTURES_PER_SHADER_STAGE);
-
-	low_end = is_low_end();
-
-	if (textures_per_stage < 48) {
-		low_end = true;
-	}
-
 	/* SKY SHADER */
 
 	sky.init(storage);
 
 	/* GI */
 
-	if (!low_end && is_dynamic_gi_supported()) {
+	if (is_dynamic_gi_supported()) {
 		gi.init(storage, &sky);
 	}
 
@@ -4172,7 +4281,7 @@ RendererSceneRenderRD::RendererSceneRenderRD(RendererStorageRD *p_storage) {
 		cluster.directional_light_buffer = RD::get_singleton()->uniform_buffer_create(directional_light_buffer_size);
 	}
 
-	if (!low_end && is_volumetric_supported()) {
+	if (is_volumetric_supported()) {
 		String defines = "\n#define MAX_DIRECTIONAL_LIGHT_DATA_STRUCTS " + itos(cluster.max_directional_lights) + "\n";
 		Vector<String> volumetric_fog_modes;
 		volumetric_fog_modes.push_back("\n#define MODE_DENSITY\n");
@@ -4230,7 +4339,7 @@ RendererSceneRenderRD::~RendererSceneRenderRD() {
 		RD::get_singleton()->free(sky.sky_scene_state.uniform_set);
 	}
 
-	if (!low_end) {
+	if (is_dynamic_gi_supported()) {
 		gi.free();
 
 		volumetric_fog.shader.version_free(volumetric_fog.shader_version);
diff --git a/servers/rendering/renderer_rd/renderer_scene_render_rd.h b/servers/rendering/renderer_rd/renderer_scene_render_rd.h
index 264c0e4276..7600d6823e 100644
--- a/servers/rendering/renderer_rd/renderer_scene_render_rd.h
+++ b/servers/rendering/renderer_rd/renderer_scene_render_rd.h
@@ -51,7 +51,6 @@ protected:
 	RendererStorageRD *storage;
 	double time;
 	double time_step = 0;
-	bool low_end = false; // If true GI and Volumetric fog are disabled
 
 	struct RenderBufferData {
 		virtual void configure(RID p_color_buffer, RID p_depth_buffer, int p_width, int p_height, RS::ViewportMSAA p_msaa) = 0;
@@ -80,7 +79,6 @@ protected:
 	RenderBufferData *render_buffers_get_data(RID p_render_buffers);
 
 	virtual void _base_uniforms_changed() = 0;
-	virtual void _render_buffers_uniform_set_changed(RID p_render_buffers) = 0;
 	virtual RID _render_buffers_get_normal_texture(RID p_render_buffers) = 0;
 
 	void _process_ssao(RID p_render_buffers, RID p_environment, RID p_normal_buffer, const CameraMatrix &p_projection);
@@ -151,6 +149,7 @@ private:
 		uint32_t render_step = 0;
 		uint64_t last_pass = 0;
 		uint32_t render_index = 0;
+		uint32_t cull_mask = 0;
 
 		Transform transform;
 	};
@@ -162,6 +161,8 @@ private:
 	struct DecalInstance {
 		RID decal;
 		Transform transform;
+		uint32_t render_index;
+		uint32_t cull_mask;
 	};
 
 	mutable RID_Owner<DecalInstance> decal_instance_owner;
@@ -306,6 +307,7 @@ private:
 		uint64_t last_scene_shadow_pass = 0;
 		uint64_t last_pass = 0;
 		uint32_t light_index = 0;
+		uint32_t cull_mask = 0;
 		uint32_t light_directional_index = 0;
 
 		uint32_t current_shadow_atlas_key = 0;
@@ -442,7 +444,7 @@ private:
 	void _allocate_blur_textures(RenderBuffers *rb);
 	void _allocate_luminance_textures(RenderBuffers *rb);
 
-	void _render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas);
+	void _render_buffers_debug_draw(RID p_render_buffers, RID p_shadow_atlas, RID p_occlusion_buffer);
 	void _render_buffers_post_process_and_tonemap(RID p_render_buffers, RID p_environment, RID p_camera_effects, const CameraMatrix &p_projection);
 
 	/* Cluster */
@@ -450,6 +452,8 @@ private:
 	struct Cluster {
 		/* Scene State UBO */
 
+		// !BAS! Most data here is not just used by our clustering logic but also by other lighting implementations. Maybe rename this struct to something more appropriate
+
 		enum {
 			REFLECTION_AMBIENT_DISABLED = 0,
 			REFLECTION_AMBIENT_ENVIRONMENT = 1,
@@ -463,8 +467,8 @@ private:
 			uint32_t mask;
 			float ambient[3]; // ambient color,
 			float intensity;
-			bool exterior;
-			bool box_project;
+			uint32_t exterior;
+			uint32_t box_project;
 			uint32_t ambient_mode;
 			uint32_t pad;
 			float local_matrix[16]; // up to here for spot and omni, rest is for directional
@@ -493,7 +497,7 @@ private:
 			float soft_shadow_scale;
 			uint32_t mask;
 			float shadow_volumetric_fog_fade;
-			uint32_t pad;
+			uint32_t bake_mode;
 			float projector_rect[4];
 		};
 
@@ -510,7 +514,8 @@ private:
 			uint32_t shadow_enabled;
 			float fade_from;
 			float fade_to;
-			uint32_t pad[3];
+			uint32_t pad[2];
+			uint32_t bake_mode;
 			float shadow_volumetric_fog_fade;
 			float shadow_bias[4];
 			float shadow_normal_bias[4];
@@ -1086,6 +1091,8 @@ public:
 		return li->transform;
 	}
 
+	void _fill_instance_indices(const RID *p_omni_light_instances, uint32_t p_omni_light_instance_count, uint32_t *p_omni_light_indices, const RID *p_spot_light_instances, uint32_t p_spot_light_instance_count, uint32_t *p_spot_light_indices, const RID *p_reflection_probe_instances, uint32_t p_reflection_probe_instance_count, uint32_t *p_reflection_probe_indices, const RID *p_decal_instances, uint32_t p_decal_instance_count, uint32_t *p_decal_instance_indices, uint32_t p_layer_mask, uint32_t p_max_dst_words = 2);
+
 	/* gi light probes */
 
 	RID gi_probe_instance_create(RID p_base);
@@ -1126,7 +1133,7 @@ public:
 	float render_buffers_get_volumetric_fog_end(RID p_render_buffers);
 	float render_buffers_get_volumetric_fog_detail_spread(RID p_render_buffers);
 
-	void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr);
+	void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_occluder_debug_tex, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr);
 
 	void render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region);
 
@@ -1190,11 +1197,10 @@ public:
 
 	void sdfgi_set_debug_probe_select(const Vector3 &p_position, const Vector3 &p_dir);
 
-	virtual bool is_low_end() const;
-
 	virtual bool is_dynamic_gi_supported() const;
 	virtual bool is_clustered_enabled() const;
 	virtual bool is_volumetric_supported() const;
+	virtual uint32_t get_max_elements() const;
 
 	RendererSceneRenderRD(RendererStorageRD *p_storage);
 	~RendererSceneRenderRD();
diff --git a/servers/rendering/renderer_rd/renderer_scene_sky_rd.cpp b/servers/rendering/renderer_rd/renderer_scene_sky_rd.cpp
index 769335ac16..54c6e81110 100644
--- a/servers/rendering/renderer_rd/renderer_scene_sky_rd.cpp
+++ b/servers/rendering/renderer_rd/renderer_scene_sky_rd.cpp
@@ -50,6 +50,7 @@ void RendererSceneSkyRD::SkyShaderData::set_code(const String &p_code) {
 
 	ShaderCompilerRD::GeneratedCode gen_code;
 	ShaderCompilerRD::IdentifierActions actions;
+	actions.entry_point_stages["sky"] = ShaderCompilerRD::STAGE_FRAGMENT;
 
 	uses_time = false;
 	uses_half_res = false;
@@ -110,7 +111,7 @@ void RendererSceneSkyRD::SkyShaderData::set_code(const String &p_code) {
 	print_line("\n**light_code:\n" + gen_code.light);
 #endif
 
-	scene_singleton->sky.sky_shader.shader.version_set_code(version, gen_code.uniforms, gen_code.vertex_global, gen_code.vertex, gen_code.fragment_global, gen_code.light, gen_code.fragment, gen_code.defines);
+	scene_singleton->sky.sky_shader.shader.version_set_code(version, gen_code.code, gen_code.uniforms, gen_code.stage_globals[ShaderCompilerRD::STAGE_VERTEX], gen_code.stage_globals[ShaderCompilerRD::STAGE_FRAGMENT], gen_code.defines);
 	ERR_FAIL_COND(!scene_singleton->sky.sky_shader.shader.version_is_valid(version));
 
 	ubo_size = gen_code.uniform_total_size;
@@ -759,7 +760,7 @@ void RendererSceneSkyRD::init(RendererStorageRD *p_storage) {
 		sky_shader.default_shader = storage->shader_allocate();
 		storage->shader_initialize(sky_shader.default_shader);
 
-		storage->shader_set_code(sky_shader.default_shader, "shader_type sky; void fragment() { COLOR = vec3(0.0); } \n");
+		storage->shader_set_code(sky_shader.default_shader, "shader_type sky; void sky() { COLOR = vec3(0.0); } \n");
 
 		sky_shader.default_material = storage->material_allocate();
 		storage->material_initialize(sky_shader.default_material);
@@ -840,7 +841,7 @@ void RendererSceneSkyRD::init(RendererStorageRD *p_storage) {
 		sky_scene_state.fog_shader = storage->shader_allocate();
 		storage->shader_initialize(sky_scene_state.fog_shader);
 
-		storage->shader_set_code(sky_scene_state.fog_shader, "shader_type sky; uniform vec4 clear_color; void fragment() { COLOR = clear_color.rgb; } \n");
+		storage->shader_set_code(sky_scene_state.fog_shader, "shader_type sky; uniform vec4 clear_color; void sky() { COLOR = clear_color.rgb; } \n");
 		sky_scene_state.fog_material = storage->material_allocate();
 		storage->material_initialize(sky_scene_state.fog_material);
 
diff --git a/servers/rendering/renderer_rd/renderer_storage_rd.cpp b/servers/rendering/renderer_rd/renderer_storage_rd.cpp
index b984f850a0..47b9e33ca6 100644
--- a/servers/rendering/renderer_rd/renderer_storage_rd.cpp
+++ b/servers/rendering/renderer_rd/renderer_storage_rd.cpp
@@ -756,7 +756,7 @@ void RendererStorageRD::texture_3d_initialize(RID p_texture, Image::Format p_for
 		for (int i = 0; i < p_data.size(); i++) {
 			uint32_t s = images[i]->get_data().size();
 
-			copymem(&all_data.write[offset], images[i]->get_data().ptr(), s);
+			memcpy(&all_data.write[offset], images[i]->get_data().ptr(), s);
 			{
 				Texture::BufferSlice3D slice;
 				slice.size.width = images[i]->get_width();
@@ -919,7 +919,7 @@ void RendererStorageRD::texture_3d_update(RID p_texture, const Vector<Ref<Image>
 
 		for (int i = 0; i < p_data.size(); i++) {
 			uint32_t s = images[i]->get_data().size();
-			copymem(&all_data.write[offset], images[i]->get_data().ptr(), s);
+			memcpy(&all_data.write[offset], images[i]->get_data().ptr(), s);
 			offset += s;
 		}
 	}
@@ -2108,13 +2108,13 @@ _FORCE_INLINE_ static void _fill_std140_ubo_empty(ShaderLanguage::DataType type,
 		case ShaderLanguage::TYPE_INT:
 		case ShaderLanguage::TYPE_UINT:
 		case ShaderLanguage::TYPE_FLOAT: {
-			zeromem(data, 4);
+			memset(data, 0, 4);
 		} break;
 		case ShaderLanguage::TYPE_BVEC2:
 		case ShaderLanguage::TYPE_IVEC2:
 		case ShaderLanguage::TYPE_UVEC2:
 		case ShaderLanguage::TYPE_VEC2: {
-			zeromem(data, 8);
+			memset(data, 0, 8);
 		} break;
 		case ShaderLanguage::TYPE_BVEC3:
 		case ShaderLanguage::TYPE_IVEC3:
@@ -2124,16 +2124,16 @@ _FORCE_INLINE_ static void _fill_std140_ubo_empty(ShaderLanguage::DataType type,
 		case ShaderLanguage::TYPE_IVEC4:
 		case ShaderLanguage::TYPE_UVEC4:
 		case ShaderLanguage::TYPE_VEC4: {
-			zeromem(data, 16);
+			memset(data, 0, 16);
 		} break;
 		case ShaderLanguage::TYPE_MAT2: {
-			zeromem(data, 32);
+			memset(data, 0, 32);
 		} break;
 		case ShaderLanguage::TYPE_MAT3: {
-			zeromem(data, 48);
+			memset(data, 0, 48);
 		} break;
 		case ShaderLanguage::TYPE_MAT4: {
-			zeromem(data, 64);
+			memset(data, 0, 64);
 		} break;
 
 		default: {
@@ -3412,10 +3412,10 @@ void RendererStorageRD::_multimesh_make_local(MultiMesh *multimesh) const {
 			Vector<uint8_t> buffer = RD::get_singleton()->buffer_get_data(multimesh->buffer);
 			{
 				const uint8_t *r = buffer.ptr();
-				copymem(w, r, buffer.size());
+				memcpy(w, r, buffer.size());
 			}
 		} else {
-			zeromem(w, multimesh->instances * multimesh->stride_cache * sizeof(float));
+			memset(w, 0, multimesh->instances * multimesh->stride_cache * sizeof(float));
 		}
 	}
 	uint32_t data_cache_dirty_region_count = (multimesh->instances - 1) / MULTIMESH_DIRTY_REGION_SIZE + 1;
@@ -3771,7 +3771,7 @@ Vector<float> RendererStorageRD::multimesh_get_buffer(RID p_multimesh) const {
 		{
 			float *w = ret.ptrw();
 			const uint8_t *r = buffer.ptr();
-			copymem(w, r, buffer.size());
+			memcpy(w, r, buffer.size());
 		}
 
 		return ret;
@@ -3888,22 +3888,37 @@ bool RendererStorageRD::particles_get_emitting(RID p_particles) {
 }
 
 void RendererStorageRD::_particles_free_data(Particles *particles) {
-	if (!particles->particle_buffer.is_valid()) {
-		return;
+	if (particles->particle_buffer.is_valid()) {
+		RD::get_singleton()->free(particles->particle_buffer);
+		particles->particle_buffer = RID();
+		RD::get_singleton()->free(particles->particle_instance_buffer);
+		particles->particle_instance_buffer = RID();
+	}
+
+	if (particles->frame_params_buffer.is_valid()) {
+		RD::get_singleton()->free(particles->frame_params_buffer);
+		particles->frame_params_buffer = RID();
 	}
-	RD::get_singleton()->free(particles->particle_buffer);
-	RD::get_singleton()->free(particles->frame_params_buffer);
-	RD::get_singleton()->free(particles->particle_instance_buffer);
 	particles->particles_transforms_buffer_uniform_set = RID();
-	particles->particle_buffer = RID();
 
+	if (RD::get_singleton()->uniform_set_is_valid(particles->trail_bind_pose_uniform_set)) {
+		RD::get_singleton()->free(particles->trail_bind_pose_uniform_set);
+	}
+	particles->trail_bind_pose_uniform_set = RID();
+
+	if (particles->trail_bind_pose_buffer.is_valid()) {
+		RD::get_singleton()->free(particles->trail_bind_pose_buffer);
+		particles->trail_bind_pose_buffer = RID();
+	}
 	if (RD::get_singleton()->uniform_set_is_valid(particles->collision_textures_uniform_set)) {
 		RD::get_singleton()->free(particles->collision_textures_uniform_set);
 	}
+	particles->collision_textures_uniform_set = RID();
 
 	if (particles->particles_sort_buffer.is_valid()) {
 		RD::get_singleton()->free(particles->particles_sort_buffer);
 		particles->particles_sort_buffer = RID();
+		particles->particles_sort_uniform_set = RID();
 	}
 
 	if (particles->emission_buffer != nullptr) {
@@ -3912,6 +3927,12 @@ void RendererStorageRD::_particles_free_data(Particles *particles) {
 		RD::get_singleton()->free(particles->emission_storage_buffer);
 		particles->emission_storage_buffer = RID();
 	}
+
+	if (RD::get_singleton()->uniform_set_is_valid(particles->particles_material_uniform_set)) {
+		//will need to be re-created
+		RD::get_singleton()->free(particles->particles_material_uniform_set);
+	}
+	particles->particles_material_uniform_set = RID();
 }
 
 void RendererStorageRD::particles_set_amount(RID p_particles, int p_amount) {
@@ -3926,38 +3947,12 @@ void RendererStorageRD::particles_set_amount(RID p_particles, int p_amount) {
 
 	particles->amount = p_amount;
 
-	if (particles->amount > 0) {
-		particles->particle_buffer = RD::get_singleton()->storage_buffer_create(sizeof(ParticleData) * p_amount);
-		particles->frame_params_buffer = RD::get_singleton()->storage_buffer_create(sizeof(ParticlesFrameParams) * 1);
-		particles->particle_instance_buffer = RD::get_singleton()->storage_buffer_create(sizeof(float) * 4 * (3 + 1 + 1) * p_amount);
-		//needs to clear it
-
-		{
-			Vector<RD::Uniform> uniforms;
-
-			{
-				RD::Uniform u;
-				u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
-				u.binding = 1;
-				u.ids.push_back(particles->particle_buffer);
-				uniforms.push_back(u);
-			}
-			{
-				RD::Uniform u;
-				u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
-				u.binding = 2;
-				u.ids.push_back(particles->particle_instance_buffer);
-				uniforms.push_back(u);
-			}
-
-			particles->particles_copy_uniform_set = RD::get_singleton()->uniform_set_create(uniforms, particles_shader.copy_shader.version_get_shader(particles_shader.copy_shader_version, 0), 0);
-		}
-	}
-
 	particles->prev_ticks = 0;
 	particles->phase = 0;
 	particles->prev_phase = 0;
 	particles->clear = true;
+
+	particles->dependency.changed_notify(DEPENDENCY_CHANGED_PARTICLES);
 }
 
 void RendererStorageRD::particles_set_lifetime(RID p_particles, float p_lifetime) {
@@ -4013,6 +4008,22 @@ void RendererStorageRD::particles_set_fixed_fps(RID p_particles, int p_fps) {
 	ERR_FAIL_COND(!particles);
 
 	particles->fixed_fps = p_fps;
+
+	_particles_free_data(particles);
+
+	particles->prev_ticks = 0;
+	particles->phase = 0;
+	particles->prev_phase = 0;
+	particles->clear = true;
+
+	particles->dependency.changed_notify(DEPENDENCY_CHANGED_PARTICLES);
+}
+
+void RendererStorageRD::particles_set_interpolate(RID p_particles, bool p_enable) { // Stores the interpolate flag on the particles entry.
+	Particles *particles = particles_owner.getornull(p_particles);
+	ERR_FAIL_COND(!particles); // Nothing to do if the RID does not resolve to a Particles entry.
+	// Only the flag is recorded; no buffers are freed and no dependency notification is sent from here.
+	particles->interpolate = p_enable;
 }
 
 void RendererStorageRD::particles_set_fractional_delta(RID p_particles, bool p_enable) {
@@ -4022,6 +4033,42 @@ void RendererStorageRD::particles_set_fractional_delta(RID p_particles, bool p_e
 	particles->fractional_delta = p_enable;
 }
 
+void RendererStorageRD::particles_set_trails(RID p_particles, bool p_enable, float p_length) { // Sets the trail enable flag and trail length, then restarts the particles.
+	Particles *particles = particles_owner.getornull(p_particles);
+	ERR_FAIL_COND(!particles);
+	ERR_FAIL_COND(p_length < 0.1); // Lengths below 0.1 are rejected regardless of p_enable.
+	p_length = MIN(10.0, p_length); // Upper bound is capped at 10.0.
+	// Record the (validated) settings.
+	particles->trails_enabled = p_enable;
+	particles->trail_length = p_length;
+	// Release the existing buffers and uniform sets; presumably they are re-created later for the new trail settings — TODO confirm.
+	_particles_free_data(particles);
+	// Reset timing/phase state and request a clear on the next _particles_process() pass.
+	particles->prev_ticks = 0;
+	particles->phase = 0;
+	particles->prev_phase = 0;
+	particles->clear = true;
+	// Tell dependents that the particles changed.
+	particles->dependency.changed_notify(DEPENDENCY_CHANGED_PARTICLES);
+}
+
+void RendererStorageRD::particles_set_trail_bind_poses(RID p_particles, const Vector<Transform> &p_bind_poses) { // Replaces the stored trail bind poses.
+	Particles *particles = particles_owner.getornull(p_particles);
+	ERR_FAIL_COND(!particles);
+	if (particles->trail_bind_pose_buffer.is_valid() && particles->trail_bind_poses.size() != p_bind_poses.size()) { // A bind-pose buffer exists and the pose count differs.
+		_particles_free_data(particles); // Release the existing buffers and uniform sets.
+		// Reset timing/phase state and request a clear on the next _particles_process() pass.
+		particles->prev_ticks = 0;
+		particles->phase = 0;
+		particles->prev_phase = 0;
+		particles->clear = true;
+	}
+	particles->trail_bind_poses = p_bind_poses;
+	particles->trail_bind_poses_dirty = true; // Set on every call; presumably triggers a re-upload of the poses elsewhere — TODO confirm.
+	// Dependents are notified whether or not the pose count changed.
+	particles->dependency.changed_notify(DEPENDENCY_CHANGED_PARTICLES);
+}
+
 void RendererStorageRD::particles_set_collision_base_size(RID p_particles, float p_size) {
 	Particles *particles = particles_owner.getornull(p_particles);
 	ERR_FAIL_COND(!particles);
@@ -4029,6 +4076,13 @@ void RendererStorageRD::particles_set_collision_base_size(RID p_particles, float
 	particles->collision_base_size = p_size;
 }
 
+void RendererStorageRD::particles_set_transform_align(RID p_particles, RS::ParticlesTransformAlign p_transform_align) { // Stores the transform alignment mode.
+	Particles *particles = particles_owner.getornull(p_particles);
+	ERR_FAIL_COND(!particles);
+	// Read by particles_set_view_axis(), which does not return early for the two Z-billboard alignment modes.
+	particles->transform_align = p_transform_align;
+}
+
 void RendererStorageRD::particles_set_process_material(RID p_particles, RID p_material) {
 	Particles *particles = particles_owner.getornull(p_particles);
 	ERR_FAIL_COND(!particles);
@@ -4068,7 +4122,7 @@ void RendererStorageRD::_particles_allocate_emission_buffer(Particles *particles
 	ERR_FAIL_COND(particles->emission_buffer != nullptr);
 
 	particles->emission_buffer_data.resize(sizeof(ParticleEmissionBuffer::Data) * particles->amount + sizeof(uint32_t) * 4);
-	zeromem(particles->emission_buffer_data.ptrw(), particles->emission_buffer_data.size());
+	memset(particles->emission_buffer_data.ptrw(), 0, particles->emission_buffer_data.size());
 	particles->emission_buffer = (ParticleEmissionBuffer *)particles->emission_buffer_data.ptrw();
 	particles->emission_buffer->particle_max = particles->amount;
 
@@ -4152,8 +4206,13 @@ AABB RendererStorageRD::particles_get_current_aabb(RID p_particles) {
 	const Particles *particles = particles_owner.getornull(p_particles);
 	ERR_FAIL_COND_V(!particles, AABB());
 
+	int total_amount = particles->amount;
+	if (particles->trails_enabled && particles->trail_bind_poses.size() > 1) {
+		total_amount *= particles->trail_bind_poses.size();
+	}
+
 	Vector<ParticleData> data;
-	data.resize(particles->amount);
+	data.resize(total_amount);
 
 	Vector<uint8_t> buffer = RD::get_singleton()->buffer_get_data(particles->particle_buffer);
 
@@ -4162,8 +4221,9 @@ AABB RendererStorageRD::particles_get_current_aabb(RID p_particles) {
 	AABB aabb;
 	if (buffer.size()) {
 		bool first = true;
+
 		const ParticleData *particle_data = (const ParticleData *)data.ptr();
-		for (int i = 0; i < particles->amount; i++) {
+		for (int i = 0; i < total_amount; i++) {
 			if (particle_data[i].active) {
 				Vector3 pos = Vector3(particle_data[i].xform[12], particle_data[i].xform[13], particle_data[i].xform[14]);
 				if (!particles->use_local_coords) {
@@ -4224,14 +4284,12 @@ RID RendererStorageRD::particles_get_draw_pass_mesh(RID p_particles, int p_pass)
 void RendererStorageRD::particles_add_collision(RID p_particles, RID p_particles_collision_instance) {
 	Particles *particles = particles_owner.getornull(p_particles);
 	ERR_FAIL_COND(!particles);
-
 	particles->collisions.insert(p_particles_collision_instance);
 }
 
 void RendererStorageRD::particles_remove_collision(RID p_particles, RID p_particles_collision_instance) {
 	Particles *particles = particles_owner.getornull(p_particles);
 	ERR_FAIL_COND(!particles);
-
 	particles->collisions.erase(p_particles_collision_instance);
 }
 
@@ -4286,7 +4344,12 @@ void RendererStorageRD::_particles_process(Particles *p_particles, float p_delta
 
 	float new_phase = Math::fmod((float)p_particles->phase + (p_delta / p_particles->lifetime) * p_particles->speed_scale, (float)1.0);
 
-	ParticlesFrameParams &frame_params = p_particles->frame_params;
+	//move back history (if there is any)
+	for (uint32_t i = p_particles->frame_history.size() - 1; i > 0; i--) {
+		p_particles->frame_history[i] = p_particles->frame_history[i - 1];
+	}
+	//update current frame
+	ParticlesFrameParams &frame_params = p_particles->frame_history[0];
 
 	if (p_particles->clear) {
 		p_particles->cycle_number = 0;
@@ -4317,6 +4380,10 @@ void RendererStorageRD::_particles_process(Particles *p_particles, float p_delta
 	}
 
 	frame_params.cycle = p_particles->cycle_number;
+	frame_params.frame = p_particles->frame_counter++;
+	frame_params.pad0 = 0;
+	frame_params.pad1 = 0;
+	frame_params.pad2 = 0;
 
 	{ //collision and attractors
 
@@ -4515,12 +4582,18 @@ void RendererStorageRD::_particles_process(Particles *p_particles, float p_delta
 
 	ParticlesShader::PushConstant push_constant;
 
+	int process_amount = p_particles->amount;
+
+	if (p_particles->trails_enabled && p_particles->trail_bind_poses.size() > 1) {
+		process_amount *= p_particles->trail_bind_poses.size();
+	}
 	push_constant.clear = p_particles->clear;
 	push_constant.total_particles = p_particles->amount;
 	push_constant.lifetime = p_particles->lifetime;
-	push_constant.trail_size = 1;
+	push_constant.trail_size = p_particles->trail_params.size();
 	push_constant.use_fractional_delta = p_particles->fractional_delta;
 	push_constant.sub_emitter_mode = !p_particles->emitting && p_particles->emission_buffer && (p_particles->emission_buffer->particle_count > 0 || p_particles->force_sub_emit);
+	push_constant.trail_pass = false;
 
 	p_particles->force_sub_emit = false; //reset
 
@@ -4553,7 +4626,17 @@ void RendererStorageRD::_particles_process(Particles *p_particles, float p_delta
 
 	p_particles->clear = false;
 
-	RD::get_singleton()->buffer_update(p_particles->frame_params_buffer, 0, sizeof(ParticlesFrameParams), &frame_params);
+	if (p_particles->trail_params.size() > 1) {
+		//fill the trail params
+		for (uint32_t i = 0; i < p_particles->trail_params.size(); i++) {
+			uint32_t src_idx = i * p_particles->frame_history.size() / p_particles->trail_params.size();
+			p_particles->trail_params[i] = p_particles->frame_history[src_idx];
+		}
+	} else {
+		p_particles->trail_params[0] = p_particles->frame_history[0];
+	}
+
+	RD::get_singleton()->buffer_update(p_particles->frame_params_buffer, 0, sizeof(ParticlesFrameParams) * p_particles->trail_params.size(), p_particles->trail_params.ptr());
 
 	ParticlesMaterialData *m = (ParticlesMaterialData *)material_get_data(p_particles->process_material, SHADER_TYPE_PARTICLES);
 	if (!m) {
@@ -4575,27 +4658,45 @@ void RendererStorageRD::_particles_process(Particles *p_particles, float p_delta
 
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &push_constant, sizeof(ParticlesShader::PushConstant));
 
-	RD::get_singleton()->compute_list_dispatch_threads(compute_list, p_particles->amount, 1, 1);
+	if (p_particles->trails_enabled && p_particles->trail_bind_poses.size() > 1) {
+		//trails requires two passes in order to catch particle starts
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, process_amount / p_particles->trail_bind_poses.size(), 1, 1);
+
+		RD::get_singleton()->compute_list_add_barrier(compute_list);
+
+		push_constant.trail_pass = true;
+		RD::get_singleton()->compute_list_set_push_constant(compute_list, &push_constant, sizeof(ParticlesShader::PushConstant));
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, process_amount - p_particles->amount, 1, 1);
+	} else {
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, process_amount, 1, 1);
+	}
 
 	RD::get_singleton()->compute_list_end();
 }
 
-void RendererStorageRD::particles_set_view_axis(RID p_particles, const Vector3 &p_axis) {
+void RendererStorageRD::particles_set_view_axis(RID p_particles, const Vector3 &p_axis, const Vector3 &p_up_axis) {
 	Particles *particles = particles_owner.getornull(p_particles);
 	ERR_FAIL_COND(!particles);
 
-	if (particles->draw_order != RS::PARTICLES_DRAW_ORDER_VIEW_DEPTH) {
-		return; //uninteresting for other modes
+	if (particles->draw_order != RS::PARTICLES_DRAW_ORDER_VIEW_DEPTH && particles->transform_align != RS::PARTICLES_TRANSFORM_ALIGN_Z_BILLBOARD && particles->transform_align != RS::PARTICLES_TRANSFORM_ALIGN_Z_BILLBOARD_Y_TO_VELOCITY) {
+		return;
+	}
+
+	if (particles->particle_buffer.is_null()) {
+		return; //particles have not processed yet
 	}
 
+	bool do_sort = particles->draw_order == RS::PARTICLES_DRAW_ORDER_VIEW_DEPTH;
+
 	//copy to sort buffer
-	if (particles->particles_sort_buffer == RID()) {
+	if (do_sort && particles->particles_sort_buffer == RID()) {
 		uint32_t size = particles->amount;
 		if (size & 1) {
 			size++; //make multiple of 16
 		}
 		size *= sizeof(float) * 2;
 		particles->particles_sort_buffer = RD::get_singleton()->storage_buffer_create(size);
+
 		{
 			Vector<RD::Uniform> uniforms;
 
@@ -4611,41 +4712,105 @@ void RendererStorageRD::particles_set_view_axis(RID p_particles, const Vector3 &
 		}
 	}
 
+	ParticlesShader::CopyPushConstant copy_push_constant;
+
+	if (particles->trails_enabled && particles->trail_bind_poses.size() > 1) {
+		int fixed_fps = 60; // fallback rate used for trails when the particles have no fixed FPS set
+		if (particles->fixed_fps > 0) {
+			fixed_fps = particles->fixed_fps;
+		}
+
+		copy_push_constant.trail_size = particles->trail_bind_poses.size();
+		copy_push_constant.trail_total = particles->frame_history.size();
+		copy_push_constant.frame_delta = 1.0 / fixed_fps;
+	} else {
+		copy_push_constant.trail_size = 1;
+		copy_push_constant.trail_total = 1;
+		copy_push_constant.frame_delta = 0.0;
+	}
+	copy_push_constant.frame_remainder = particles->interpolate ? particles->frame_remainder : 0.0;
+	copy_push_constant.total_particles = particles->amount;
+
 	Vector3 axis = -p_axis; // cameras look to z negative
 
 	if (particles->use_local_coords) {
 		axis = particles->emission_transform.basis.xform_inv(axis).normalized();
 	}
 
-	ParticlesShader::CopyPushConstant copy_push_constant;
-	copy_push_constant.total_particles = particles->amount;
 	copy_push_constant.sort_direction[0] = axis.x;
 	copy_push_constant.sort_direction[1] = axis.y;
 	copy_push_constant.sort_direction[2] = axis.z;
 
-	RD::ComputeListID compute_list = RD::get_singleton()->compute_list_begin();
-	RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, particles_shader.copy_pipelines[ParticlesShader::COPY_MODE_FILL_SORT_BUFFER]);
-	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->particles_copy_uniform_set, 0);
-	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->particles_sort_uniform_set, 1);
-	RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy_push_constant, sizeof(ParticlesShader::CopyPushConstant));
+	copy_push_constant.align_up[0] = p_up_axis.x;
+	copy_push_constant.align_up[1] = p_up_axis.y;
+	copy_push_constant.align_up[2] = p_up_axis.z;
 
-	RD::get_singleton()->compute_list_dispatch_threads(compute_list, particles->amount, 1, 1);
+	copy_push_constant.align_mode = particles->transform_align;
 
-	RD::get_singleton()->compute_list_end();
+	if (do_sort) {
+		RD::ComputeListID compute_list = RD::get_singleton()->compute_list_begin();
+
+		RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, particles_shader.copy_pipelines[ParticlesShader::COPY_MODE_FILL_SORT_BUFFER]);
+		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->particles_copy_uniform_set, 0);
+		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->particles_sort_uniform_set, 1);
+		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->trail_bind_pose_uniform_set, 2);
+		RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy_push_constant, sizeof(ParticlesShader::CopyPushConstant));
+
+		RD::get_singleton()->compute_list_dispatch_threads(compute_list, particles->amount, 1, 1);
+
+		RD::get_singleton()->compute_list_end();
+		effects.sort_buffer(particles->particles_sort_uniform_set, particles->amount);
+	}
 
-	effects.sort_buffer(particles->particles_sort_uniform_set, particles->amount);
+	copy_push_constant.total_particles *= copy_push_constant.trail_size; // instance copy covers every trail section; trail_size is 1 when trails are off
 
-	compute_list = RD::get_singleton()->compute_list_begin();
-	RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, particles_shader.copy_pipelines[ParticlesShader::COPY_MODE_FILL_INSTANCES_WITH_SORT_BUFFER]);
+	RD::ComputeListID compute_list = RD::get_singleton()->compute_list_begin();
+	RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, particles_shader.copy_pipelines[do_sort ? ParticlesShader::COPY_MODE_FILL_INSTANCES_WITH_SORT_BUFFER : ParticlesShader::COPY_MODE_FILL_INSTANCES]);
 	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->particles_copy_uniform_set, 0);
-	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->particles_sort_uniform_set, 1);
+	if (do_sort) {
+		RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->particles_sort_uniform_set, 1);
+	}
+	RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->trail_bind_pose_uniform_set, 2);
+
 	RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy_push_constant, sizeof(ParticlesShader::CopyPushConstant));
 
-	RD::get_singleton()->compute_list_dispatch_threads(compute_list, particles->amount, 1, 1);
+	RD::get_singleton()->compute_list_dispatch_threads(compute_list, copy_push_constant.total_particles, 1, 1);
 
 	RD::get_singleton()->compute_list_end();
 }
 
+void RendererStorageRD::_particles_update_buffers(Particles *particles) { // Creates the particle/instance storage buffers and the copy uniform set on first use.
+	if (particles->amount > 0 && particles->particle_buffer.is_null()) { // Nothing to do for empty systems or when the buffers already exist.
+		int total_amount = particles->amount;
+		if (particles->trails_enabled && particles->trail_bind_poses.size() > 1) {
+			total_amount *= particles->trail_bind_poses.size(); // With trails, one entry per bind pose for every particle.
+		}
+		particles->particle_buffer = RD::get_singleton()->storage_buffer_create(sizeof(ParticleData) * total_amount);
+		particles->particle_instance_buffer = RD::get_singleton()->storage_buffer_create(sizeof(float) * 4 * (3 + 1 + 1) * total_amount); // 5 vec4s per entry; presumably 3 transform rows + color + custom -- confirm against the copy shader.
+		//needs to clear it
+
+		{
+			Vector<RD::Uniform> uniforms;
+
+			{
+				RD::Uniform u;
+				u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
+				u.binding = 1; // Binding 1: particle_buffer (ParticleData entries).
+				u.ids.push_back(particles->particle_buffer);
+				uniforms.push_back(u);
+			}
+			{
+				RD::Uniform u;
+				u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
+				u.binding = 2; // Binding 2: particle_instance_buffer.
+				u.ids.push_back(particles->particle_instance_buffer);
+				uniforms.push_back(u);
+			}
+
+			particles->particles_copy_uniform_set = RD::get_singleton()->uniform_set_create(uniforms, particles_shader.copy_shader.version_get_shader(particles_shader.copy_shader_version, 0), 0); // Set index 0 of the copy shader.
+		}
+	}
+}
 void RendererStorageRD::update_particles() {
 	while (particle_update_list) {
 		//use transform feedback to process particles
@@ -4657,6 +4822,8 @@ void RendererStorageRD::update_particles() {
 		particles->update_list = nullptr;
 		particles->dirty = false;
 
+		_particles_update_buffers(particles);
+
 		if (particles->restart_request) {
 			particles->prev_ticks = 0;
 			particles->phase = 0;
@@ -4688,12 +4855,81 @@ void RendererStorageRD::update_particles() {
 			}
 		}
 
+#ifndef _MSC_VER
+#warning Should use display refresh rate for all this
+#endif
+
+		float screen_hz = 60;
+
+		int fixed_fps = 0;
+		if (particles->fixed_fps > 0) {
+			fixed_fps = particles->fixed_fps;
+		} else if (particles->trails_enabled && particles->trail_bind_poses.size() > 1) {
+			fixed_fps = screen_hz;
+		}
+		{
+			//update trails
+			int history_size = 1;
+			int trail_steps = 1;
+			if (particles->trails_enabled && particles->trail_bind_poses.size() > 1) {
+				history_size = MAX(1, int(particles->trail_length * fixed_fps));
+				trail_steps = particles->trail_bind_poses.size();
+			}
+
+			if (uint32_t(history_size) != particles->frame_history.size()) {
+				particles->frame_history.resize(history_size);
+				memset(particles->frame_history.ptr(), 0, sizeof(ParticlesFrameParams) * history_size);
+			}
+
+			if (uint32_t(trail_steps) != particles->trail_params.size() || particles->frame_params_buffer.is_null()) {
+				particles->trail_params.resize(trail_steps);
+				if (particles->frame_params_buffer.is_valid()) {
+					RD::get_singleton()->free(particles->frame_params_buffer);
+				}
+				particles->frame_params_buffer = RD::get_singleton()->storage_buffer_create(sizeof(ParticlesFrameParams) * trail_steps);
+			}
+
+			if (particles->trail_bind_poses.size() > 1 && particles->trail_bind_pose_buffer.is_null()) {
+				particles->trail_bind_pose_buffer = RD::get_singleton()->storage_buffer_create(sizeof(float) * 16 * particles->trail_bind_poses.size());
+				particles->trail_bind_poses_dirty = true;
+			}
+
+			if (particles->trail_bind_pose_uniform_set.is_null()) {
+				Vector<RD::Uniform> uniforms;
+				{
+					RD::Uniform u;
+					u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
+					u.binding = 0;
+					if (particles->trail_bind_pose_buffer.is_valid()) {
+						u.ids.push_back(particles->trail_bind_pose_buffer);
+					} else {
+						u.ids.push_back(default_rd_storage_buffer);
+					}
+					uniforms.push_back(u);
+				}
+
+				particles->trail_bind_pose_uniform_set = RD::get_singleton()->uniform_set_create(uniforms, particles_shader.copy_shader.version_get_shader(particles_shader.copy_shader_version, 0), 2);
+			}
+
+			if (particles->trail_bind_pose_buffer.is_valid() && particles->trail_bind_poses_dirty) {
+				if (particles_shader.pose_update_buffer.size() < uint32_t(particles->trail_bind_poses.size()) * 16) {
+					particles_shader.pose_update_buffer.resize(particles->trail_bind_poses.size() * 16);
+				}
+
+				for (int i = 0; i < particles->trail_bind_poses.size(); i++) {
+					store_transform(particles->trail_bind_poses[i], &particles_shader.pose_update_buffer[i * 16]);
+				}
+
+				RD::get_singleton()->buffer_update(particles->trail_bind_pose_buffer, 0, particles->trail_bind_poses.size() * 16 * sizeof(float), particles_shader.pose_update_buffer.ptr());
+			}
+		}
+
 		bool zero_time_scale = Engine::get_singleton()->get_time_scale() <= 0.0;
 
 		if (particles->clear && particles->pre_process_time > 0.0) {
 			float frame_time;
-			if (particles->fixed_fps > 0) {
-				frame_time = 1.0 / particles->fixed_fps;
+			if (fixed_fps > 0) {
+				frame_time = 1.0 / fixed_fps;
 			} else {
 				frame_time = 1.0 / 30.0;
 			}
@@ -4706,14 +4942,14 @@ void RendererStorageRD::update_particles() {
 			}
 		}
 
-		if (particles->fixed_fps > 0) {
+		if (fixed_fps > 0) {
 			float frame_time;
 			float decr;
 			if (zero_time_scale) {
 				frame_time = 0.0;
-				decr = 1.0 / particles->fixed_fps;
+				decr = 1.0 / fixed_fps;
 			} else {
-				frame_time = 1.0 / particles->fixed_fps;
+				frame_time = 1.0 / fixed_fps;
 				decr = frame_time;
 			}
 			float delta = RendererCompositorRD::singleton->get_frame_delta_time();
@@ -4741,16 +4977,39 @@ void RendererStorageRD::update_particles() {
 
 		//copy particles to instance buffer
 
-		if (particles->draw_order != RS::PARTICLES_DRAW_ORDER_VIEW_DEPTH) {
+		if (particles->draw_order != RS::PARTICLES_DRAW_ORDER_VIEW_DEPTH && particles->transform_align != RS::PARTICLES_TRANSFORM_ALIGN_Z_BILLBOARD && particles->transform_align != RS::PARTICLES_TRANSFORM_ALIGN_Z_BILLBOARD_Y_TO_VELOCITY) {
+			//does not need view dependent operation, do copy here
 			ParticlesShader::CopyPushConstant copy_push_constant;
-			copy_push_constant.total_particles = particles->amount;
+
+			int total_amount = particles->amount;
+			if (particles->trails_enabled && particles->trail_bind_poses.size() > 1) {
+				total_amount *= particles->trail_bind_poses.size();
+			}
+
+			copy_push_constant.total_particles = total_amount;
+			copy_push_constant.frame_remainder = particles->interpolate ? particles->frame_remainder : 0.0;
+			copy_push_constant.align_mode = particles->transform_align;
+			copy_push_constant.align_up[0] = 0;
+			copy_push_constant.align_up[1] = 0;
+			copy_push_constant.align_up[2] = 0;
+
+			if (particles->trails_enabled && particles->trail_bind_poses.size() > 1) {
+				copy_push_constant.trail_size = particles->trail_bind_poses.size();
+				copy_push_constant.trail_total = particles->frame_history.size();
+				copy_push_constant.frame_delta = 1.0 / fixed_fps;
+			} else {
+				copy_push_constant.trail_size = 1;
+				copy_push_constant.trail_total = 1;
+				copy_push_constant.frame_delta = 0.0;
+			}
 
 			RD::ComputeListID compute_list = RD::get_singleton()->compute_list_begin();
 			RD::get_singleton()->compute_list_bind_compute_pipeline(compute_list, particles_shader.copy_pipelines[ParticlesShader::COPY_MODE_FILL_INSTANCES]);
 			RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->particles_copy_uniform_set, 0);
+			RD::get_singleton()->compute_list_bind_uniform_set(compute_list, particles->trail_bind_pose_uniform_set, 2);
 			RD::get_singleton()->compute_list_set_push_constant(compute_list, &copy_push_constant, sizeof(ParticlesShader::CopyPushConstant));
 
-			RD::get_singleton()->compute_list_dispatch_threads(compute_list, particles->amount, 1, 1);
+			RD::get_singleton()->compute_list_dispatch_threads(compute_list, total_amount, 1, 1);
 
 			RD::get_singleton()->compute_list_end();
 		}
@@ -4781,6 +5040,8 @@ void RendererStorageRD::ParticlesShaderData::set_code(const String &p_code) {
 
 	ShaderCompilerRD::GeneratedCode gen_code;
 	ShaderCompilerRD::IdentifierActions actions;
+	actions.entry_point_stages["start"] = ShaderCompilerRD::STAGE_COMPUTE;
+	actions.entry_point_stages["process"] = ShaderCompilerRD::STAGE_COMPUTE;
 
 	/*
 	uses_time = false;
@@ -4801,7 +5062,7 @@ void RendererStorageRD::ParticlesShaderData::set_code(const String &p_code) {
 		version = base_singleton->particles_shader.shader.version_create();
 	}
 
-	base_singleton->particles_shader.shader.version_set_compute_code(version, gen_code.uniforms, gen_code.compute_global, gen_code.compute, gen_code.defines);
+	base_singleton->particles_shader.shader.version_set_compute_code(version, gen_code.code, gen_code.uniforms, gen_code.stage_globals[ShaderCompilerRD::STAGE_COMPUTE], gen_code.defines);
 	ERR_FAIL_COND(!base_singleton->particles_shader.shader.version_is_valid(version));
 
 	ubo_size = gen_code.uniform_total_size;
@@ -5228,7 +5489,7 @@ void RendererStorageRD::skeleton_allocate_data(RID p_skeleton, int p_bones, bool
 	if (skeleton->size) {
 		skeleton->data.resize(skeleton->size * (skeleton->use_2d ? 8 : 12));
 		skeleton->buffer = RD::get_singleton()->storage_buffer_create(skeleton->data.size() * sizeof(float));
-		zeromem(skeleton->data.ptrw(), skeleton->data.size() * sizeof(float));
+		memset(skeleton->data.ptrw(), 0, skeleton->data.size() * sizeof(float));
 
 		_skeleton_make_dirty(skeleton);
 
@@ -6870,7 +7131,7 @@ RID RendererStorageRD::render_target_get_sdf_texture(RID p_render_target) {
 
 		Vector<uint8_t> pv;
 		pv.resize(16 * 4);
-		zeromem(pv.ptrw(), 16 * 4);
+		memset(pv.ptrw(), 0, 16 * 4);
 		Vector<Vector<uint8_t>> vpv;
 
 		rt->sdf_buffer_read = RD::get_singleton()->texture_create(tformat, RD::TextureView(), vpv);
@@ -7357,7 +7618,7 @@ void RendererStorageRD::_update_decal_atlas() {
 			v_offsetsv.resize(base_size);
 
 			int *v_offsets = v_offsetsv.ptrw();
-			zeromem(v_offsets, sizeof(int) * base_size);
+			memset(v_offsets, 0, sizeof(int) * base_size);
 
 			int max_height = 0;
 
@@ -7921,7 +8182,6 @@ void RendererStorageRD::global_variable_set_override(const StringName &p_name, c
 		_global_variable_mark_buffer_dirty(gv.buffer_index, gv.buffer_elements);
 	} else {
 		//texture
-		//texture
 		for (Set<RID>::Element *E = gv.texture_materials.front(); E; E = E->next()) {
 			Material *material = material_owner.getornull(E->get());
 			ERR_CONTINUE(!material);
@@ -8114,7 +8374,7 @@ void RendererStorageRD::_update_global_variables() {
 		if (total_regions / global_variables.buffer_dirty_region_count <= 4) {
 			// 25% of regions dirty, just update all buffer
 			RD::get_singleton()->buffer_update(global_variables.buffer, 0, sizeof(GlobalVariables::Value) * global_variables.buffer_size, global_variables.buffer_values);
-			zeromem(global_variables.buffer_dirty_regions, sizeof(bool) * total_regions);
+			memset(global_variables.buffer_dirty_regions, 0, sizeof(bool) * total_regions);
 		} else {
 			uint32_t region_byte_size = sizeof(GlobalVariables::Value) * GlobalVariables::BUFFER_DIRTY_REGION_SIZE;
 
@@ -8323,9 +8583,10 @@ bool RendererStorageRD::free(RID p_rid) {
 		light_owner.free(p_rid);
 
 	} else if (particles_owner.owns(p_rid)) {
+		update_particles();
 		Particles *particles = particles_owner.getornull(p_rid);
-		_particles_free_data(particles);
 		particles->dependency.deleted_notify(p_rid);
+		_particles_free_data(particles);
 		particles_owner.free(p_rid);
 	} else if (particles_collision_owner.owns(p_rid)) {
 		ParticlesCollision *particles_collision = particles_collision_owner.getornull(p_rid);
@@ -8402,10 +8663,10 @@ RendererStorageRD::RendererStorageRD() {
 	global_variables.buffer_size = GLOBAL_GET("rendering/limits/global_shader_variables/buffer_size");
 	global_variables.buffer_size = MAX(4096, global_variables.buffer_size);
 	global_variables.buffer_values = memnew_arr(GlobalVariables::Value, global_variables.buffer_size);
-	zeromem(global_variables.buffer_values, sizeof(GlobalVariables::Value) * global_variables.buffer_size);
+	memset(global_variables.buffer_values, 0, sizeof(GlobalVariables::Value) * global_variables.buffer_size);
 	global_variables.buffer_usage = memnew_arr(GlobalVariables::ValueUsage, global_variables.buffer_size);
 	global_variables.buffer_dirty_regions = memnew_arr(bool, global_variables.buffer_size / GlobalVariables::BUFFER_DIRTY_REGION_SIZE);
-	zeromem(global_variables.buffer_dirty_regions, sizeof(bool) * global_variables.buffer_size / GlobalVariables::BUFFER_DIRTY_REGION_SIZE);
+	memset(global_variables.buffer_dirty_regions, 0, sizeof(bool) * global_variables.buffer_size / GlobalVariables::BUFFER_DIRTY_REGION_SIZE);
 	global_variables.buffer = RD::get_singleton()->storage_buffer_create(sizeof(GlobalVariables::Value) * global_variables.buffer_size);
 
 	material_update_list = nullptr;
@@ -8824,7 +9085,6 @@ RendererStorageRD::RendererStorageRD() {
 		sdf_versions.push_back(""); //one only
 		giprobe_sdf_shader.initialize(sdf_versions);
 		giprobe_sdf_shader_version = giprobe_sdf_shader.version_create();
-		giprobe_sdf_shader.version_set_compute_code(giprobe_sdf_shader_version, "", "", "", Vector<String>());
 		giprobe_sdf_shader_version_shader = giprobe_sdf_shader.version_get_shader(giprobe_sdf_shader_version, 0);
 		giprobe_sdf_shader_pipeline = RD::get_singleton()->compute_pipeline_create(giprobe_sdf_shader_version_shader);
 	}
@@ -8863,14 +9123,14 @@ RendererStorageRD::RendererStorageRD() {
 		actions.renames["COLOR"] = "PARTICLE.color";
 		actions.renames["VELOCITY"] = "PARTICLE.velocity";
 		//actions.renames["MASS"] = "mass"; ?
-		actions.renames["ACTIVE"] = "PARTICLE.is_active";
+		actions.renames["ACTIVE"] = "particle_active";
 		actions.renames["RESTART"] = "restart";
 		actions.renames["CUSTOM"] = "PARTICLE.custom";
 		actions.renames["TRANSFORM"] = "PARTICLE.xform";
 		actions.renames["TIME"] = "FRAME.time";
 		actions.renames["LIFETIME"] = "params.lifetime";
 		actions.renames["DELTA"] = "local_delta";
-		actions.renames["NUMBER"] = "particle";
+		actions.renames["NUMBER"] = "particle_number";
 		actions.renames["INDEX"] = "index";
 		//actions.renames["GRAVITY"] = "current_gravity";
 		actions.renames["EMISSION_TRANSFORM"] = "FRAME.emission_transform";
@@ -8913,7 +9173,7 @@ RendererStorageRD::RendererStorageRD() {
 		// default material and shader for particles shader
 		particles_shader.default_shader = shader_allocate();
 		shader_initialize(particles_shader.default_shader);
-		shader_set_code(particles_shader.default_shader, "shader_type particles; void compute() { COLOR = vec4(1.0); } \n");
+		shader_set_code(particles_shader.default_shader, "shader_type particles; void process() { COLOR = vec4(1.0); } \n");
 		particles_shader.default_material = material_allocate();
 		material_initialize(particles_shader.default_material);
 		material_set_shader(particles_shader.default_material, particles_shader.default_shader);
diff --git a/servers/rendering/renderer_rd/renderer_storage_rd.h b/servers/rendering/renderer_rd/renderer_storage_rd.h
index 6405bb75b0..961bdfb178 100644
--- a/servers/rendering/renderer_rd/renderer_storage_rd.h
+++ b/servers/rendering/renderer_rd/renderer_storage_rd.h
@@ -660,6 +660,11 @@ private:
 		float time;
 		float delta;
 
+		uint32_t frame;
+		uint32_t pad0;
+		uint32_t pad1;
+		uint32_t pad2;
+
 		uint32_t random_seed;
 		uint32_t attractor_count;
 		uint32_t collider_count;
@@ -704,10 +709,16 @@ private:
 		AABB custom_aabb = AABB(Vector3(-4, -4, -4), Vector3(8, 8, 8));
 		bool use_local_coords = true;
 		RID process_material;
+		uint32_t frame_counter = 0;
+		RS::ParticlesTransformAlign transform_align = RS::PARTICLES_TRANSFORM_ALIGN_DISABLED;
 
 		RS::ParticlesDrawOrder draw_order = RS::PARTICLES_DRAW_ORDER_INDEX;
 
 		Vector<RID> draw_passes;
+		Vector<Transform> trail_bind_poses;
+		bool trail_bind_poses_dirty = false;
+		RID trail_bind_pose_buffer;
+		RID trail_bind_pose_uniform_set;
 
 		RID particle_buffer;
 		RID particle_instance_buffer;
@@ -739,7 +750,8 @@ private:
 
 		float speed_scale = 1.0;
 
-		int fixed_fps = 0;
+		int fixed_fps = 30;
+		bool interpolate = true;
 		bool fractional_delta = false;
 		float frame_remainder = 0;
 		float collision_base_size = 0.01;
@@ -759,12 +771,19 @@ private:
 
 		Dependency dependency;
 
-		ParticlesFrameParams frame_params;
+		float trail_length = 1.0;
+		bool trails_enabled = false;
+		LocalVector<ParticlesFrameParams> frame_history;
+		LocalVector<ParticlesFrameParams> trail_params;
+
+		Particles() {
+		}
 	};
 
 	void _particles_process(Particles *p_particles, float p_delta);
 	void _particles_allocate_emission_buffer(Particles *particles);
 	void _particles_free_data(Particles *particles);
+	void _particles_update_buffers(Particles *particles);
 
 	struct ParticlesShader {
 		struct PushConstant {
@@ -776,7 +795,7 @@ private:
 			uint32_t use_fractional_delta;
 			uint32_t sub_emitter_mode;
 			uint32_t can_emit;
-			uint32_t pad;
+			uint32_t trail_pass;
 		};
 
 		ParticlesShaderRD shader;
@@ -791,6 +810,14 @@ private:
 		struct CopyPushConstant {
 			float sort_direction[3];
 			uint32_t total_particles;
+
+			uint32_t trail_size;
+			uint32_t trail_total;
+			float frame_delta;
+			float frame_remainder;
+
+			float align_up[3];
+			uint32_t align_mode;
 		};
 
 		enum {
@@ -804,6 +831,8 @@ private:
 		RID copy_shader_version;
 		RID copy_pipelines[COPY_MODE_MAX];
 
+		LocalVector<float> pose_update_buffer;
+
 	} particles_shader;
 
 	Particles *particle_update_list = nullptr;
@@ -2076,10 +2105,17 @@ public:
 	void particles_set_use_local_coordinates(RID p_particles, bool p_enable);
 	void particles_set_process_material(RID p_particles, RID p_material);
 	void particles_set_fixed_fps(RID p_particles, int p_fps);
+	void particles_set_interpolate(RID p_particles, bool p_enable);
 	void particles_set_fractional_delta(RID p_particles, bool p_enable);
 	void particles_set_collision_base_size(RID p_particles, float p_size);
+	void particles_set_transform_align(RID p_particles, RS::ParticlesTransformAlign p_transform_align);
+
+	void particles_set_trails(RID p_particles, bool p_enable, float p_length);
+	void particles_set_trail_bind_poses(RID p_particles, const Vector<Transform> &p_bind_poses);
+
 	void particles_restart(RID p_particles);
 	void particles_emit(RID p_particles, const Transform &p_transform, const Vector3 &p_velocity, const Color &p_color, const Color &p_custom, uint32_t p_emit_flags);
+
 	void particles_set_subemitter(RID p_particles, RID p_subemitter_particles);
 
 	void particles_set_draw_order(RID p_particles, RS::ParticlesDrawOrder p_order);
@@ -2097,15 +2133,21 @@ public:
 	int particles_get_draw_passes(RID p_particles) const;
 	RID particles_get_draw_pass_mesh(RID p_particles, int p_pass) const;
 
-	void particles_set_view_axis(RID p_particles, const Vector3 &p_axis);
+	void particles_set_view_axis(RID p_particles, const Vector3 &p_axis, const Vector3 &p_up_axis);
 
 	virtual bool particles_is_inactive(RID p_particles) const;
 
-	_FORCE_INLINE_ uint32_t particles_get_amount(RID p_particles) {
+	_FORCE_INLINE_ uint32_t particles_get_amount(RID p_particles, uint32_t &r_trail_divisor) {
 		Particles *particles = particles_owner.getornull(p_particles);
 		ERR_FAIL_COND_V(!particles, 0);
 
-		return particles->amount;
+		if (particles->trails_enabled && particles->trail_bind_poses.size() > 1) {
+			r_trail_divisor = particles->trail_bind_poses.size();
+		} else {
+			r_trail_divisor = 1;
+		}
+
+		return particles->amount * r_trail_divisor;
 	}
 
 	_FORCE_INLINE_ uint32_t particles_is_using_local_coords(RID p_particles) {
@@ -2119,6 +2161,8 @@ public:
 		Particles *particles = particles_owner.getornull(p_particles);
 		ERR_FAIL_COND_V(!particles, RID());
 		if (particles->particles_transforms_buffer_uniform_set.is_null()) {
+			_particles_update_buffers(particles);
+
 			Vector<RD::Uniform> uniforms;
 
 			{
diff --git a/servers/rendering/renderer_rd/shader_compiler_rd.cpp b/servers/rendering/renderer_rd/shader_compiler_rd.cpp
index 8135d388e1..24ac85bb35 100644
--- a/servers/rendering/renderer_rd/shader_compiler_rd.cpp
+++ b/servers/rendering/renderer_rd/shader_compiler_rd.cpp
@@ -535,9 +535,9 @@ String ShaderCompilerRD::_dump_node_code(const SL::Node *p_node, int p_level, Ge
 				struct_code += "}";
 				struct_code += ";\n";
 
-				r_gen_code.vertex_global += struct_code;
-				r_gen_code.fragment_global += struct_code;
-				r_gen_code.compute_global += struct_code;
+				for (int j = 0; j < STAGE_MAX; j++) {
+					r_gen_code.stage_globals[j] += struct_code;
+				}
 			}
 
 			int max_texture_uniforms = 0;
@@ -590,9 +590,9 @@ String ShaderCompilerRD::_dump_node_code(const SL::Node *p_node, int p_level, Ge
 				ucode += " " + _mkid(E->key());
 				ucode += ";\n";
 				if (SL::is_sampler_type(E->get().type)) {
-					r_gen_code.vertex_global += ucode;
-					r_gen_code.fragment_global += ucode;
-					r_gen_code.compute_global += ucode;
+					for (int j = 0; j < STAGE_MAX; j++) {
+						r_gen_code.stage_globals[j] += ucode;
+					}
 
 					GeneratedCode::Texture texture;
 					texture.name = E->key();
@@ -608,7 +608,6 @@ String ShaderCompilerRD::_dump_node_code(const SL::Node *p_node, int p_level, Ge
 					r_gen_code.texture_uniforms.write[E->get().texture_order] = texture;
 				} else {
 					if (!uses_uniforms) {
-						r_gen_code.defines.push_back(String("#define USE_MATERIAL_UNIFORMS\n"));
 						uses_uniforms = true;
 					}
 					uniform_defines.write[E->get().order] = ucode;
@@ -707,9 +706,10 @@ String ShaderCompilerRD::_dump_node_code(const SL::Node *p_node, int p_level, Ge
 					vcode += "]";
 				}
 				vcode += ";\n";
-				r_gen_code.vertex_global += "layout(location=" + itos(index) + ") " + interp_mode + "out " + vcode;
-				r_gen_code.fragment_global += "layout(location=" + itos(index) + ") " + interp_mode + "in " + vcode;
-				r_gen_code.compute_global += "layout(location=" + itos(index) + ") " + interp_mode + "out " + vcode;
+
+				r_gen_code.stage_globals[STAGE_VERTEX] += "layout(location=" + itos(index) + ") " + interp_mode + "out " + vcode;
+				r_gen_code.stage_globals[STAGE_FRAGMENT] += "layout(location=" + itos(index) + ") " + interp_mode + "in " + vcode;
+
 				index++;
 			}
 
@@ -725,7 +725,7 @@ String ShaderCompilerRD::_dump_node_code(const SL::Node *p_node, int p_level, Ge
 					gcode += ";\n";
 				}
 				gcode += "} frag_to_light;\n";
-				r_gen_code.fragment_global += gcode;
+				r_gen_code.stage_globals[STAGE_FRAGMENT] += gcode;
 			}
 
 			for (int i = 0; i < pnode->vconstants.size(); i++) {
@@ -747,9 +747,9 @@ String ShaderCompilerRD::_dump_node_code(const SL::Node *p_node, int p_level, Ge
 				gcode += "=";
 				gcode += _dump_node_code(cnode.initializer, p_level, r_gen_code, p_actions, p_default_actions, p_assigning);
 				gcode += ";\n";
-				r_gen_code.vertex_global += gcode;
-				r_gen_code.fragment_global += gcode;
-				r_gen_code.compute_global += gcode;
+				for (int j = 0; j < STAGE_MAX; j++) {
+					r_gen_code.stage_globals[j] += gcode;
+				}
 			}
 
 			Map<StringName, String> function_code;
@@ -765,9 +765,7 @@ String ShaderCompilerRD::_dump_node_code(const SL::Node *p_node, int p_level, Ge
 
 			//place functions in actual code
 
-			Set<StringName> added_vtx;
-			Set<StringName> added_fragment; //share for light
-			Set<StringName> added_compute; //share for light
+			Set<StringName> added_funcs_per_stage[STAGE_MAX];
 
 			for (int i = 0; i < pnode->functions.size(); i++) {
 				SL::FunctionNode *fnode = pnode->functions[i].function;
@@ -776,24 +774,10 @@ String ShaderCompilerRD::_dump_node_code(const SL::Node *p_node, int p_level, Ge
 
 				current_func_name = fnode->name;
 
-				if (fnode->name == vertex_name) {
-					_dump_function_deps(pnode, fnode->name, function_code, r_gen_code.vertex_global, added_vtx);
-					r_gen_code.vertex = function_code[vertex_name];
-				}
-
-				if (fnode->name == fragment_name) {
-					_dump_function_deps(pnode, fnode->name, function_code, r_gen_code.fragment_global, added_fragment);
-					r_gen_code.fragment = function_code[fragment_name];
-				}
-
-				if (fnode->name == light_name) {
-					_dump_function_deps(pnode, fnode->name, function_code, r_gen_code.fragment_global, added_fragment);
-					r_gen_code.light = function_code[light_name];
-				}
-
-				if (fnode->name == compute_name) {
-					_dump_function_deps(pnode, fnode->name, function_code, r_gen_code.compute_global, added_compute);
-					r_gen_code.compute = function_code[compute_name];
+				if (p_actions.entry_point_stages.has(fnode->name)) {
+					Stage stage = p_actions.entry_point_stages[fnode->name];
+					_dump_function_deps(pnode, fnode->name, function_code, r_gen_code.stage_globals[stage], added_funcs_per_stage[stage]);
+					r_gen_code.code[fnode->name] = function_code[fnode->name];
 				}
 
 				function = nullptr;
@@ -858,7 +842,7 @@ String ShaderCompilerRD::_dump_node_code(const SL::Node *p_node, int p_level, Ge
 			SL::VariableNode *vnode = (SL::VariableNode *)p_node;
 			bool use_fragment_varying = false;
 
-			if (current_func_name != vertex_name) {
+			if (!(p_actions.entry_point_stages.has(current_func_name) && p_actions.entry_point_stages[current_func_name] == STAGE_VERTEX)) {
 				if (p_assigning) {
 					if (shader->varyings.has(vnode->name)) {
 						use_fragment_varying = true;
@@ -921,10 +905,10 @@ String ShaderCompilerRD::_dump_node_code(const SL::Node *p_node, int p_level, Ge
 			}
 
 			if (vnode->name == time_name) {
-				if (current_func_name == vertex_name) {
+				if (p_actions.entry_point_stages.has(current_func_name) && p_actions.entry_point_stages[current_func_name] == STAGE_VERTEX) {
 					r_gen_code.uses_vertex_time = true;
 				}
-				if (current_func_name == fragment_name || current_func_name == light_name) {
+				if (p_actions.entry_point_stages.has(current_func_name) && p_actions.entry_point_stages[current_func_name] == STAGE_FRAGMENT) {
 					r_gen_code.uses_fragment_time = true;
 				}
 			}
@@ -1003,7 +987,7 @@ String ShaderCompilerRD::_dump_node_code(const SL::Node *p_node, int p_level, Ge
 			SL::ArrayNode *anode = (SL::ArrayNode *)p_node;
 			bool use_fragment_varying = false;
 
-			if (current_func_name != vertex_name) {
+			if (!(p_actions.entry_point_stages.has(current_func_name) && p_actions.entry_point_stages[current_func_name] == STAGE_VERTEX)) {
 				if (anode->assign_expression != nullptr) {
 					use_fragment_varying = true;
 				} else {
@@ -1059,10 +1043,10 @@ String ShaderCompilerRD::_dump_node_code(const SL::Node *p_node, int p_level, Ge
 			}
 
 			if (anode->name == time_name) {
-				if (current_func_name == vertex_name) {
+				if (p_actions.entry_point_stages.has(current_func_name) && p_actions.entry_point_stages[current_func_name] == STAGE_VERTEX) {
 					r_gen_code.uses_vertex_time = true;
 				}
-				if (current_func_name == fragment_name || current_func_name == light_name) {
+				if (p_actions.entry_point_stages.has(current_func_name) && p_actions.entry_point_stages[current_func_name] == STAGE_FRAGMENT) {
 					r_gen_code.uses_fragment_time = true;
 				}
 			}
@@ -1309,7 +1293,7 @@ ShaderLanguage::DataType ShaderCompilerRD::_get_variable_type(const StringName &
 }
 
 Error ShaderCompilerRD::compile(RS::ShaderMode p_mode, const String &p_code, IdentifierActions *p_actions, const String &p_path, GeneratedCode &r_gen_code) {
-	Error err = parser.compile(p_code, ShaderTypes::get_singleton()->get_functions(p_mode), ShaderTypes::get_singleton()->get_modes(p_mode), ShaderTypes::get_singleton()->get_types(), _get_variable_type);
+	Error err = parser.compile(p_code, ShaderTypes::get_singleton()->get_functions(p_mode), ShaderTypes::get_singleton()->get_modes(p_mode), ShaderLanguage::VaryingFunctionNames(), ShaderTypes::get_singleton()->get_types(), _get_variable_type);
 
 	if (err != OK) {
 		Vector<String> shader = p_code.split("\n");
@@ -1322,13 +1306,10 @@ Error ShaderCompilerRD::compile(RS::ShaderMode p_mode, const String &p_code, Ide
 	}
 
 	r_gen_code.defines.clear();
-	r_gen_code.vertex = String();
-	r_gen_code.vertex_global = String();
-	r_gen_code.fragment = String();
-	r_gen_code.fragment_global = String();
-	r_gen_code.compute = String();
-	r_gen_code.compute_global = String();
-	r_gen_code.light = String();
+	r_gen_code.code.clear();
+	for (int i = 0; i < STAGE_MAX; i++) {
+		r_gen_code.stage_globals[i] = String();
+	}
 	r_gen_code.uses_fragment_time = false;
 	r_gen_code.uses_vertex_time = false;
 	r_gen_code.uses_global_textures = false;
@@ -1348,10 +1329,6 @@ Error ShaderCompilerRD::compile(RS::ShaderMode p_mode, const String &p_code, Ide
 void ShaderCompilerRD::initialize(DefaultIdentifierActions p_actions) {
 	actions = p_actions;
 
-	vertex_name = "vertex";
-	fragment_name = "fragment";
-	compute_name = "compute";
-	light_name = "light";
 	time_name = "TIME";
 
 	List<String> func_list;
diff --git a/servers/rendering/renderer_rd/shader_compiler_rd.h b/servers/rendering/renderer_rd/shader_compiler_rd.h
index 6575829e73..2da127ffa3 100644
--- a/servers/rendering/renderer_rd/shader_compiler_rd.h
+++ b/servers/rendering/renderer_rd/shader_compiler_rd.h
@@ -38,7 +38,16 @@
 
 class ShaderCompilerRD {
 public:
+	enum Stage { // Stage an entry-point function belongs to; values of IdentifierActions::entry_point_stages.
+		STAGE_VERTEX,
+		STAGE_FRAGMENT,
+		STAGE_COMPUTE,
+		STAGE_MAX // Count sentinel: sizes GeneratedCode::stage_globals[].
+	};
+
 	struct IdentifierActions {
+		Map<StringName, Stage> entry_point_stages;
+
 		Map<StringName, Pair<int *, int>> render_mode_values;
 		Map<StringName, bool *> render_mode_flags;
 		Map<StringName, bool *> usage_flag_pointers;
@@ -63,13 +72,9 @@ public:
 		Vector<uint32_t> uniform_offsets;
 		uint32_t uniform_total_size;
 		String uniforms;
-		String vertex_global;
-		String vertex;
-		String fragment_global;
-		String fragment;
-		String light;
-		String compute_global;
-		String compute;
+		String stage_globals[STAGE_MAX];
+
+		Map<String, String> code;
 
 		bool uses_global_textures;
 		bool uses_fragment_time;
@@ -103,10 +108,6 @@ private:
 	const ShaderLanguage::ShaderNode *shader;
 	const ShaderLanguage::FunctionNode *function;
 	StringName current_func_name;
-	StringName vertex_name;
-	StringName fragment_name;
-	StringName light_name;
-	StringName compute_name;
 	StringName time_name;
 	Set<StringName> texture_functions;
 
diff --git a/servers/rendering/renderer_rd/shader_rd.cpp b/servers/rendering/renderer_rd/shader_rd.cpp
index e4a39ff813..f7242a2b17 100644
--- a/servers/rendering/renderer_rd/shader_rd.cpp
+++ b/servers/rendering/renderer_rd/shader_rd.cpp
@@ -30,146 +30,83 @@
 
 #include "shader_rd.h"
 
-#include "core/string/string_builder.h"
 #include "renderer_compositor_rd.h"
 #include "servers/rendering/rendering_device.h"
 
-void ShaderRD::setup(const char *p_vertex_code, const char *p_fragment_code, const char *p_compute_code, const char *p_name) {
-	name = p_name;
-	//split vertex and shader code (thank you, shader compiler programmers from you know what company).
-	if (p_vertex_code) {
-		String defines_tag = "\nVERSION_DEFINES";
-		String globals_tag = "\nVERTEX_SHADER_GLOBALS";
-		String material_tag = "\nMATERIAL_UNIFORMS";
-		String code_tag = "\nVERTEX_SHADER_CODE";
-		String code = p_vertex_code;
-
-		int cpos = code.find(defines_tag);
-		if (cpos != -1) {
-			vertex_codev = code.substr(0, cpos).ascii();
-			code = code.substr(cpos + defines_tag.length(), code.length());
-		}
-
-		cpos = code.find(material_tag);
-
-		if (cpos == -1) {
-			vertex_code0 = code.ascii();
-		} else {
-			vertex_code0 = code.substr(0, cpos).ascii();
-			code = code.substr(cpos + material_tag.length(), code.length());
-
-			cpos = code.find(globals_tag);
-
-			if (cpos == -1) {
-				vertex_code1 = code.ascii();
-			} else {
-				vertex_code1 = code.substr(0, cpos).ascii();
-				String code2 = code.substr(cpos + globals_tag.length(), code.length());
-
-				cpos = code2.find(code_tag);
-				if (cpos == -1) {
-					vertex_code2 = code2.ascii();
-				} else {
-					vertex_code2 = code2.substr(0, cpos).ascii();
-					vertex_code3 = code2.substr(cpos + code_tag.length(), code2.length()).ascii();
+void ShaderRD::_add_stage(const char *p_code, StageType p_stage_type) { // Splits one stage's template source into literal-text and placeholder chunks, appended to stage_templates[p_stage_type].
+	Vector<String> lines = String(p_code).split("\n");
+
+	String text; // Literal lines accumulated since the last placeholder directive.
+
+	for (int i = 0; i < lines.size(); i++) {
+		String l = lines[i];
+		bool push_chunk = false;
+
+		StageTemplate::Chunk chunk; // Only pushed when this line turns out to be a placeholder directive.
+
+		if (l.begins_with("#VERSION_DEFINES")) { // Directives are matched with begins_with(), so they are only recognized at the very start of a line.
+			chunk.type = StageTemplate::Chunk::TYPE_VERSION_DEFINES;
+			push_chunk = true;
+		} else if (l.begins_with("#GLOBALS")) { // Same directive in every template; the stage being parsed selects the globals chunk type.
+			switch (p_stage_type) {
+				case STAGE_TYPE_VERTEX:
+					chunk.type = StageTemplate::Chunk::TYPE_VERTEX_GLOBALS;
+					break;
+				case STAGE_TYPE_FRAGMENT:
+					chunk.type = StageTemplate::Chunk::TYPE_FRAGMENT_GLOBALS;
+					break;
+				case STAGE_TYPE_COMPUTE:
+					chunk.type = StageTemplate::Chunk::TYPE_COMPUTE_GLOBALS;
+					break;
+				default: { // NOTE(review): chunk.type stays unset for any other stage value, yet the chunk is still pushed below.
 				}
 			}
-		}
-	}
 
-	if (p_fragment_code) {
-		String defines_tag = "\nVERSION_DEFINES";
-		String globals_tag = "\nFRAGMENT_SHADER_GLOBALS";
-		String material_tag = "\nMATERIAL_UNIFORMS";
-		String code_tag = "\nFRAGMENT_SHADER_CODE";
-		String light_code_tag = "\nLIGHT_SHADER_CODE";
-		String code = p_fragment_code;
-
-		int cpos = code.find(defines_tag);
-		if (cpos != -1) {
-			fragment_codev = code.substr(0, cpos).ascii();
-			code = code.substr(cpos + defines_tag.length(), code.length());
+			push_chunk = true;
+		} else if (l.begins_with("#MATERIAL_UNIFORMS")) {
+			chunk.type = StageTemplate::Chunk::TYPE_MATERIAL_UNIFORMS;
+			push_chunk = true;
+		} else if (l.begins_with("#CODE")) {
+			chunk.type = StageTemplate::Chunk::TYPE_CODE;
+			push_chunk = true;
+			chunk.code = l.replace_first("#CODE", String()).replace(":", "").strip_edges().to_upper(); // e.g. "#CODE : VERTEX" becomes "VERTEX".
+		} else {
+			text += l + "\n"; // Not a directive: keep the line verbatim, re-adding the newline removed by split().
 		}
 
-		cpos = code.find(material_tag);
-		if (cpos == -1) {
-			fragment_code0 = code.ascii();
-		} else {
-			fragment_code0 = code.substr(0, cpos).ascii();
-			//print_line("CODE0:\n"+String(fragment_code0.get_data()));
-			code = code.substr(cpos + material_tag.length(), code.length());
-			cpos = code.find(globals_tag);
-
-			if (cpos == -1) {
-				fragment_code1 = code.ascii();
-			} else {
-				fragment_code1 = code.substr(0, cpos).ascii();
-				//print_line("CODE1:\n"+String(fragment_code1.get_data()));
-
-				String code2 = code.substr(cpos + globals_tag.length(), code.length());
-				cpos = code2.find(light_code_tag);
-
-				if (cpos == -1) {
-					fragment_code2 = code2.ascii();
-				} else {
-					fragment_code2 = code2.substr(0, cpos).ascii();
-					//print_line("CODE2:\n"+String(fragment_code2.get_data()));
-
-					String code3 = code2.substr(cpos + light_code_tag.length(), code2.length());
-
-					cpos = code3.find(code_tag);
-					if (cpos == -1) {
-						fragment_code3 = code3.ascii();
-					} else {
-						fragment_code3 = code3.substr(0, cpos).ascii();
-						//print_line("CODE3:\n"+String(fragment_code3.get_data()));
-						fragment_code4 = code3.substr(cpos + code_tag.length(), code3.length()).ascii();
-						//print_line("CODE4:\n"+String(fragment_code4.get_data()));
-					}
-				}
+		if (push_chunk) {
+			if (text != String()) { // Flush pending literal text first so chunk order follows the template.
+				StageTemplate::Chunk text_chunk;
+				text_chunk.type = StageTemplate::Chunk::TYPE_TEXT;
+				text_chunk.text = text.utf8();
+				stage_templates[p_stage_type].chunks.push_back(text_chunk);
+				text = String();
 			}
+			stage_templates[p_stage_type].chunks.push_back(chunk);
 		}
 	}
 
+	if (text != String()) { // Literal text remaining after the last directive.
+		StageTemplate::Chunk text_chunk;
+		text_chunk.type = StageTemplate::Chunk::TYPE_TEXT;
+		text_chunk.text = text.utf8();
+		stage_templates[p_stage_type].chunks.push_back(text_chunk);
+		text = String();
+	}
+}
+
+void ShaderRD::setup(const char *p_vertex_code, const char *p_fragment_code, const char *p_compute_code, const char *p_name) { // Parses the supplied stage templates; null sources are skipped.
+	name = p_name; // Stores the pointer itself (name is a const char *), no copy is made.
 	if (p_compute_code) {
+		_add_stage(p_compute_code, STAGE_TYPE_COMPUTE); // Compute code takes precedence: vertex/fragment sources are not parsed when it is given.
 		is_compute = true;
-
-		String defines_tag = "\nVERSION_DEFINES";
-		String globals_tag = "\nCOMPUTE_SHADER_GLOBALS";
-		String material_tag = "\nMATERIAL_UNIFORMS";
-		String code_tag = "\nCOMPUTE_SHADER_CODE";
-		String code = p_compute_code;
-
-		int cpos = code.find(defines_tag);
-		if (cpos != -1) {
-			compute_codev = code.substr(0, cpos).ascii();
-			code = code.substr(cpos + defines_tag.length(), code.length());
+	} else {
+		is_compute = false;
+		if (p_vertex_code) {
+			_add_stage(p_vertex_code, STAGE_TYPE_VERTEX);
 		}
-
-		cpos = code.find(material_tag);
-
-		if (cpos == -1) {
-			compute_code0 = code.ascii();
-		} else {
-			compute_code0 = code.substr(0, cpos).ascii();
-			code = code.substr(cpos + material_tag.length(), code.length());
-
-			cpos = code.find(globals_tag);
-
-			if (cpos == -1) {
-				compute_code1 = code.ascii();
-			} else {
-				compute_code1 = code.substr(0, cpos).ascii();
-				String code2 = code.substr(cpos + globals_tag.length(), code.length());
-
-				cpos = code2.find(code_tag);
-				if (cpos == -1) {
-					compute_code2 = code2.ascii();
-				} else {
-					compute_code2 = code2.substr(0, cpos).ascii();
-					compute_code3 = code2.substr(cpos + code_tag.length(), code2.length()).ascii();
-				}
-			}
+		if (p_fragment_code) {
+			_add_stage(p_fragment_code, STAGE_TYPE_FRAGMENT);
 		}
 	}
 }
@@ -198,6 +135,49 @@ void ShaderRD::_clear_version(Version *p_version) {
 	}
 }
 
+void ShaderRD::_build_variant_code(StringBuilder &p_builder, uint32_t p_variant, const Version *p_version, const StageTemplate &p_template) { // Appends the full source for one variant to p_builder by walking the template chunks in order.
+	for (uint32_t i = 0; i < p_template.chunks.size(); i++) {
+		const StageTemplate::Chunk &chunk = p_template.chunks[i];
+		switch (chunk.type) {
+			case StageTemplate::Chunk::TYPE_VERSION_DEFINES: {
+				p_builder.append("\n"); //make sure defines begin at newline
+				p_builder.append(general_defines.get_data()); // NOTE(review): the pre-refactor compute path appended base_compute_defines before this; confirm dropping it is intended.
+				p_builder.append(variant_defines[p_variant].get_data());
+				for (int j = 0; j < p_version->custom_defines.size(); j++) {
+					p_builder.append(p_version->custom_defines[j].get_data());
+				}
+				p_builder.append("\n"); //make sure the generated defines below begin at newline
+				if (p_version->uniforms.size()) {
+					p_builder.append("#define MATERIAL_UNIFORMS_USED\n"); // Templates test this with #ifdef around the material uniform block.
+				}
+				for (Map<StringName, CharString>::Element *E = p_version->code_sections.front(); E; E = E->next()) {
+					p_builder.append(String("#define ") + String(E->key()) + "_CODE_USED\n"); // One define per supplied section, e.g. LIGHT_CODE_USED.
+				}
+			} break;
+			case StageTemplate::Chunk::TYPE_MATERIAL_UNIFORMS: {
+				p_builder.append(p_version->uniforms.get_data()); //uniforms (same for vertex and fragment)
+			} break;
+			case StageTemplate::Chunk::TYPE_VERTEX_GLOBALS: {
+				p_builder.append(p_version->vertex_globals.get_data()); // vertex globals
+			} break;
+			case StageTemplate::Chunk::TYPE_FRAGMENT_GLOBALS: {
+				p_builder.append(p_version->fragment_globals.get_data()); // fragment globals
+			} break;
+			case StageTemplate::Chunk::TYPE_COMPUTE_GLOBALS: {
+				p_builder.append(p_version->compute_globals.get_data()); // compute globals
+			} break;
+			case StageTemplate::Chunk::TYPE_CODE: {
+				if (p_version->code_sections.has(chunk.code)) { // A section the version did not supply emits nothing.
+					p_builder.append(p_version->code_sections[chunk.code].get_data());
+				}
+			} break;
+			case StageTemplate::Chunk::TYPE_TEXT: {
+				p_builder.append(chunk.text.get_data()); // Literal template text, copied through unchanged.
+			} break;
+		}
+	}
+}
+
 void ShaderRD::_compile_variant(uint32_t p_variant, Version *p_version) {
 	if (!variants_enabled[p_variant]) {
 		return; //variant is disabled, return
@@ -214,29 +194,7 @@ void ShaderRD::_compile_variant(uint32_t p_variant, Version *p_version) {
 		//vertex stage
 
 		StringBuilder builder;
-
-		builder.append(vertex_codev.get_data()); // version info (if exists)
-		builder.append("\n"); //make sure defines begin at newline
-		builder.append(general_defines.get_data());
-		builder.append(variant_defines[p_variant].get_data());
-
-		for (int j = 0; j < p_version->custom_defines.size(); j++) {
-			builder.append(p_version->custom_defines[j].get_data());
-		}
-
-		builder.append(vertex_code0.get_data()); //first part of vertex
-
-		builder.append(p_version->uniforms.get_data()); //uniforms (same for vertex and fragment)
-
-		builder.append(vertex_code1.get_data()); //second part of vertex
-
-		builder.append(p_version->vertex_globals.get_data()); // vertex globals
-
-		builder.append(vertex_code2.get_data()); //third part of vertex
-
-		builder.append(p_version->vertex_code.get_data()); // code
-
-		builder.append(vertex_code3.get_data()); //fourth of vertex
+		_build_variant_code(builder, p_variant, p_version, stage_templates[STAGE_TYPE_VERTEX]);
 
 		current_source = builder.as_string();
 		RD::ShaderStageData stage;
@@ -254,33 +212,7 @@ void ShaderRD::_compile_variant(uint32_t p_variant, Version *p_version) {
 		current_stage = RD::SHADER_STAGE_FRAGMENT;
 
 		StringBuilder builder;
-
-		builder.append(fragment_codev.get_data()); // version info (if exists)
-		builder.append("\n"); //make sure defines begin at newline
-
-		builder.append(general_defines.get_data());
-		builder.append(variant_defines[p_variant].get_data());
-		for (int j = 0; j < p_version->custom_defines.size(); j++) {
-			builder.append(p_version->custom_defines[j].get_data());
-		}
-
-		builder.append(fragment_code0.get_data()); //first part of fragment
-
-		builder.append(p_version->uniforms.get_data()); //uniforms (same for fragment and fragment)
-
-		builder.append(fragment_code1.get_data()); //first part of fragment
-
-		builder.append(p_version->fragment_globals.get_data()); // fragment globals
-
-		builder.append(fragment_code2.get_data()); //third part of fragment
-
-		builder.append(p_version->fragment_light.get_data()); // fragment light
-
-		builder.append(fragment_code3.get_data()); //fourth part of fragment
-
-		builder.append(p_version->fragment_code.get_data()); // fragment code
-
-		builder.append(fragment_code4.get_data()); //fourth part of fragment
+		_build_variant_code(builder, p_variant, p_version, stage_templates[STAGE_TYPE_FRAGMENT]);
 
 		current_source = builder.as_string();
 		RD::ShaderStageData stage;
@@ -298,32 +230,10 @@ void ShaderRD::_compile_variant(uint32_t p_variant, Version *p_version) {
 		current_stage = RD::SHADER_STAGE_COMPUTE;
 
 		StringBuilder builder;
-
-		builder.append(compute_codev.get_data()); // version info (if exists)
-		builder.append("\n"); //make sure defines begin at newline
-		builder.append(base_compute_defines.get_data());
-		builder.append(general_defines.get_data());
-		builder.append(variant_defines[p_variant].get_data());
-
-		for (int j = 0; j < p_version->custom_defines.size(); j++) {
-			builder.append(p_version->custom_defines[j].get_data());
-		}
-
-		builder.append(compute_code0.get_data()); //first part of compute
-
-		builder.append(p_version->uniforms.get_data()); //uniforms (same for compute and fragment)
-
-		builder.append(compute_code1.get_data()); //second part of compute
-
-		builder.append(p_version->compute_globals.get_data()); // compute globals
-
-		builder.append(compute_code2.get_data()); //third part of compute
-
-		builder.append(p_version->compute_code.get_data()); // code
-
-		builder.append(compute_code3.get_data()); //fourth of compute
+		_build_variant_code(builder, p_variant, p_version, stage_templates[STAGE_TYPE_COMPUTE]);
 
 		current_source = builder.as_string();
+
 		RD::ShaderStageData stage;
 		stage.spir_v = RD::get_singleton()->shader_compile_from_source(RD::SHADER_STAGE_COMPUTE, current_source, RD::SHADER_LANGUAGE_GLSL, &error);
 		if (stage.spir_v.size() == 0) {
@@ -364,29 +274,7 @@ RS::ShaderNativeSourceCode ShaderRD::version_get_native_source_code(RID p_versio
 			//vertex stage
 
 			StringBuilder builder;
-
-			builder.append(vertex_codev.get_data()); // version info (if exists)
-			builder.append("\n"); //make sure defines begin at newline
-			builder.append(general_defines.get_data());
-			builder.append(variant_defines[i].get_data());
-
-			for (int j = 0; j < version->custom_defines.size(); j++) {
-				builder.append(version->custom_defines[j].get_data());
-			}
-
-			builder.append(vertex_code0.get_data()); //first part of vertex
-
-			builder.append(version->uniforms.get_data()); //uniforms (same for vertex and fragment)
-
-			builder.append(vertex_code1.get_data()); //second part of vertex
-
-			builder.append(version->vertex_globals.get_data()); // vertex globals
-
-			builder.append(vertex_code2.get_data()); //third part of vertex
-
-			builder.append(version->vertex_code.get_data()); // code
-
-			builder.append(vertex_code3.get_data()); //fourth of vertex
+			_build_variant_code(builder, i, version, stage_templates[STAGE_TYPE_VERTEX]);
 
 			RS::ShaderNativeSourceCode::Version::Stage stage;
 			stage.name = "vertex";
@@ -399,32 +287,7 @@ RS::ShaderNativeSourceCode ShaderRD::version_get_native_source_code(RID p_versio
 			//fragment stage
 
 			StringBuilder builder;
-
-			builder.append(fragment_codev.get_data()); // version info (if exists)
-			builder.append("\n"); //make sure defines begin at newline
-			builder.append(general_defines.get_data());
-			builder.append(variant_defines[i].get_data());
-			for (int j = 0; j < version->custom_defines.size(); j++) {
-				builder.append(version->custom_defines[j].get_data());
-			}
-
-			builder.append(fragment_code0.get_data()); //first part of fragment
-
-			builder.append(version->uniforms.get_data()); //uniforms (same for fragment and fragment)
-
-			builder.append(fragment_code1.get_data()); //first part of fragment
-
-			builder.append(version->fragment_globals.get_data()); // fragment globals
-
-			builder.append(fragment_code2.get_data()); //third part of fragment
-
-			builder.append(version->fragment_light.get_data()); // fragment light
-
-			builder.append(fragment_code3.get_data()); //fourth part of fragment
-
-			builder.append(version->fragment_code.get_data()); // fragment code
-
-			builder.append(fragment_code4.get_data()); //fourth part of fragment
+			_build_variant_code(builder, i, version, stage_templates[STAGE_TYPE_FRAGMENT]);
 
 			RS::ShaderNativeSourceCode::Version::Stage stage;
 			stage.name = "fragment";
@@ -437,30 +300,7 @@ RS::ShaderNativeSourceCode ShaderRD::version_get_native_source_code(RID p_versio
 			//compute stage
 
 			StringBuilder builder;
-
-			builder.append(compute_codev.get_data()); // version info (if exists)
-			builder.append("\n"); //make sure defines begin at newline
-			builder.append(base_compute_defines.get_data());
-			builder.append(general_defines.get_data());
-			builder.append(variant_defines[i].get_data());
-
-			for (int j = 0; j < version->custom_defines.size(); j++) {
-				builder.append(version->custom_defines[j].get_data());
-			}
-
-			builder.append(compute_code0.get_data()); //first part of compute
-
-			builder.append(version->uniforms.get_data()); //uniforms (same for compute and fragment)
-
-			builder.append(compute_code1.get_data()); //second part of compute
-
-			builder.append(version->compute_globals.get_data()); // compute globals
-
-			builder.append(compute_code2.get_data()); //third part of compute
-
-			builder.append(version->compute_code.get_data()); // code
-
-			builder.append(compute_code3.get_data()); //fourth of compute
+			_build_variant_code(builder, i, version, stage_templates[STAGE_TYPE_COMPUTE]);
 
 			RS::ShaderNativeSourceCode::Version::Stage stage;
 			stage.name = "compute";
@@ -518,17 +358,18 @@ void ShaderRD::_compile_version(Version *p_version) {
 	p_version->valid = true;
 }
 
-void ShaderRD::version_set_code(RID p_version, const String &p_uniforms, const String &p_vertex_globals, const String &p_vertex_code, const String &p_fragment_globals, const String &p_fragment_light, const String &p_fragment_code, const Vector<String> &p_custom_defines) {
+void ShaderRD::version_set_code(RID p_version, const Map<String, String> &p_code, const String &p_uniforms, const String &p_vertex_globals, const String &p_fragment_globals, const Vector<String> &p_custom_defines) {
 	ERR_FAIL_COND(is_compute);
 
 	Version *version = version_owner.getornull(p_version);
 	ERR_FAIL_COND(!version);
 	version->vertex_globals = p_vertex_globals.utf8();
-	version->vertex_code = p_vertex_code.utf8();
-	version->fragment_light = p_fragment_light.utf8();
 	version->fragment_globals = p_fragment_globals.utf8();
-	version->fragment_code = p_fragment_code.utf8();
 	version->uniforms = p_uniforms.utf8();
+	version->code_sections.clear();
+	for (Map<String, String>::Element *E = p_code.front(); E; E = E->next()) {
+		version->code_sections[StringName(E->key().to_upper())] = E->get().utf8();
+	}
 
 	version->custom_defines.clear();
 	for (int i = 0; i < p_custom_defines.size(); i++) {
@@ -542,15 +383,20 @@ void ShaderRD::version_set_code(RID p_version, const String &p_uniforms, const S
 	}
 }
 
-void ShaderRD::version_set_compute_code(RID p_version, const String &p_uniforms, const String &p_compute_globals, const String &p_compute_code, const Vector<String> &p_custom_defines) {
+void ShaderRD::version_set_compute_code(RID p_version, const Map<String, String> &p_code, const String &p_uniforms, const String &p_compute_globals, const Vector<String> &p_custom_defines) {
 	ERR_FAIL_COND(!is_compute);
 
 	Version *version = version_owner.getornull(p_version);
 	ERR_FAIL_COND(!version);
+
 	version->compute_globals = p_compute_globals.utf8();
-	version->compute_code = p_compute_code.utf8();
 	version->uniforms = p_uniforms.utf8();
 
+	version->code_sections.clear();
+	for (Map<String, String>::Element *E = p_code.front(); E; E = E->next()) {
+		version->code_sections[StringName(E->key().to_upper())] = E->get().utf8();
+	}
+
 	version->custom_defines.clear();
 	for (int i = 0; i < p_custom_defines.size(); i++) {
 		version->custom_defines.push_back(p_custom_defines[i].utf8());
diff --git a/servers/rendering/renderer_rd/shader_rd.h b/servers/rendering/renderer_rd/shader_rd.h
index e0f4dcf2d0..f20d539621 100644
--- a/servers/rendering/renderer_rd/shader_rd.h
+++ b/servers/rendering/renderer_rd/shader_rd.h
@@ -32,7 +32,9 @@
 #define SHADER_RD_H
 
 #include "core/os/mutex.h"
+#include "core/string/string_builder.h"
 #include "core/templates/hash_map.h"
+#include "core/templates/local_vector.h"
 #include "core/templates/map.h"
 #include "core/templates/rid_owner.h"
 #include "core/variant/variant.h"
@@ -52,12 +54,9 @@ class ShaderRD {
 	struct Version {
 		CharString uniforms;
 		CharString vertex_globals;
-		CharString vertex_code;
 		CharString compute_globals;
-		CharString compute_code;
-		CharString fragment_light;
 		CharString fragment_globals;
-		CharString fragment_code;
+		Map<StringName, CharString> code_sections;
 		Vector<CharString> custom_defines;
 
 		RID *variants; //same size as version defines
@@ -76,31 +75,44 @@ class ShaderRD {
 
 	RID_Owner<Version> version_owner;
 
-	CharString fragment_codev; //for version and extensions
-	CharString fragment_code0;
-	CharString fragment_code1;
-	CharString fragment_code2;
-	CharString fragment_code3;
-	CharString fragment_code4;
-
-	CharString vertex_codev; //for version and extensions
-	CharString vertex_code0;
-	CharString vertex_code1;
-	CharString vertex_code2;
-	CharString vertex_code3;
+	struct StageTemplate { // Parsed form of one stage's template source: an ordered list of chunks.
+		struct Chunk {
+			enum Type {
+				TYPE_VERSION_DEFINES, // "#VERSION_DEFINES" line.
+				TYPE_MATERIAL_UNIFORMS, // "#MATERIAL_UNIFORMS" line.
+				TYPE_VERTEX_GLOBALS, // "#GLOBALS" line in a vertex template.
+				TYPE_FRAGMENT_GLOBALS, // "#GLOBALS" line in a fragment template.
+				TYPE_COMPUTE_GLOBALS, // "#GLOBALS" line in a compute template.
+				TYPE_CODE, // "#CODE : NAME" line.
+				TYPE_TEXT // Literal source between directives.
+			};
+
+			Type type; // No default initializer; the parser must assign it before pushing the chunk.
+			StringName code; // TYPE_CODE only: upper-cased section name, looked up in Version::code_sections.
+			CharString text; // TYPE_TEXT only: literal source emitted as-is.
+		};
+		LocalVector<Chunk> chunks; // In template order.
+	};
 
 	bool is_compute = false;
 
-	CharString compute_codev; //for version and extensions
-	CharString compute_code0;
-	CharString compute_code1;
-	CharString compute_code2;
-	CharString compute_code3;
-
 	const char *name;
 
 	CharString base_compute_defines;
 
+	enum StageType { // Index into stage_templates[].
+		STAGE_TYPE_VERTEX,
+		STAGE_TYPE_FRAGMENT,
+		STAGE_TYPE_COMPUTE,
+		STAGE_TYPE_MAX, // Count sentinel: sizes stage_templates[].
+	};
+
+	StageTemplate stage_templates[STAGE_TYPE_MAX];
+
+	void _build_variant_code(StringBuilder &p_builder, uint32_t p_variant, const Version *p_version, const StageTemplate &p_template);
+
+	void _add_stage(const char *p_code, StageType p_stage_type);
+
 protected:
 	ShaderRD();
 	void setup(const char *p_vertex_code, const char *p_fragment_code, const char *p_compute_code, const char *p_name);
@@ -108,8 +120,8 @@ protected:
 public:
 	RID version_create();
 
-	void version_set_code(RID p_version, const String &p_uniforms, const String &p_vertex_globals, const String &p_vertex_code, const String &p_fragment_globals, const String &p_fragment_light, const String &p_fragment_code, const Vector<String> &p_custom_defines);
-	void version_set_compute_code(RID p_version, const String &p_uniforms, const String &p_compute_globals, const String &p_compute_code, const Vector<String> &p_custom_defines);
+	void version_set_code(RID p_version, const Map<String, String> &p_code, const String &p_uniforms, const String &p_vertex_globals, const String &p_fragment_globals, const Vector<String> &p_custom_defines);
+	void version_set_compute_code(RID p_version, const Map<String, String> &p_code, const String &p_uniforms, const String &p_compute_globals, const Vector<String> &p_custom_defines);
 
 	_FORCE_INLINE_ RID version_get_shader(RID p_version, int p_variant) {
 		ERR_FAIL_INDEX_V(p_variant, variant_defines.size(), RID());
diff --git a/servers/rendering/renderer_rd/shaders/bokeh_dof.glsl b/servers/rendering/renderer_rd/shaders/bokeh_dof.glsl
index 63f086a83d..b70e0b6bd5 100644
--- a/servers/rendering/renderer_rd/shaders/bokeh_dof.glsl
+++ b/servers/rendering/renderer_rd/shaders/bokeh_dof.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #define BLOCK_SIZE 8
 
diff --git a/servers/rendering/renderer_rd/shaders/canvas.glsl b/servers/rendering/renderer_rd/shaders/canvas.glsl
index 3b39edc70e..8b97ec119f 100644
--- a/servers/rendering/renderer_rd/shaders/canvas.glsl
+++ b/servers/rendering/renderer_rd/shaders/canvas.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #ifdef USE_ATTRIBUTES
 layout(location = 0) in vec2 vertex_attrib;
@@ -26,17 +26,15 @@ layout(location = 3) out vec2 pixel_size_interp;
 
 #endif
 
-#ifdef USE_MATERIAL_UNIFORMS
+#ifdef MATERIAL_UNIFORMS_USED
 layout(set = 1, binding = 0, std140) uniform MaterialUniforms{
-	/* clang-format off */
-MATERIAL_UNIFORMS
-	/* clang-format on */
+
+#MATERIAL_UNIFORMS
+
 } material;
 #endif
 
-/* clang-format off */
-VERTEX_SHADER_GLOBALS
-/* clang-format on */
+#GLOBALS
 
 void main() {
 	vec4 instance_custom = vec4(0.0);
@@ -132,9 +130,7 @@ void main() {
 	float point_size = 1.0;
 #endif
 	{
-		/* clang-format off */
-VERTEX_SHADER_CODE
-		/* clang-format on */
+#CODE : VERTEX
 	}
 
 #ifdef USE_NINEPATCH
@@ -212,7 +208,7 @@ VERTEX_SHADER_CODE
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #include "canvas_uniforms_inc.glsl"
 
@@ -228,11 +224,11 @@ layout(location = 3) in vec2 pixel_size_interp;
 
 layout(location = 0) out vec4 frag_color;
 
-#ifdef USE_MATERIAL_UNIFORMS
+#ifdef MATERIAL_UNIFORMS_USED
 layout(set = 1, binding = 0, std140) uniform MaterialUniforms{
-	/* clang-format off */
-MATERIAL_UNIFORMS
-	/* clang-format on */
+
+#MATERIAL_UNIFORMS
+
 } material;
 #endif
 
@@ -260,11 +256,9 @@ vec2 sdf_to_screen_uv(vec2 p_sdf) {
 	return p_sdf * canvas_data.sdf_to_screen;
 }
 
-/* clang-format off */
-FRAGMENT_SHADER_GLOBALS
-/* clang-format on */
+#GLOBALS
 
-#ifdef LIGHT_SHADER_CODE_USED
+#ifdef LIGHT_CODE_USED
 
 vec4 light_compute(
 		vec3 light_vertex,
@@ -278,9 +272,9 @@ vec4 light_compute(
 		vec2 uv,
 		vec4 color, bool is_directional) {
 	vec4 light = vec4(0.0);
-	/* clang-format off */
-LIGHT_SHADER_CODE
-	/* clang-format on */
+
+#CODE : LIGHT
+
 	return light;
 }
 
@@ -356,7 +350,7 @@ vec3 light_normal_compute(vec3 light_vec, vec3 normal, vec3 base_color, vec3 lig
 
 //float distance = length(shadow_pos);
 vec4 light_shadow_compute(uint light_base, vec4 light_color, vec4 shadow_uv
-#ifdef LIGHT_SHADER_CODE_USED
+#ifdef LIGHT_CODE_USED
 		,
 		vec3 shadow_modulate
 #endif
@@ -395,7 +389,7 @@ vec4 light_shadow_compute(uint light_base, vec4 light_color, vec4 shadow_uv
 	}
 
 	vec4 shadow_color = unpackUnorm4x8(light_array.data[light_base].shadow_color);
-#ifdef LIGHT_SHADER_CODE_USED
+#ifdef LIGHT_CODE_USED
 	shadow_color.rgb *= shadow_modulate;
 #endif
 
@@ -504,11 +498,7 @@ void main() {
 		normal_used = true;
 #endif
 
-		/* clang-format off */
-
-FRAGMENT_SHADER_CODE
-
-		/* clang-format on */
+#CODE : FRAGMENT
 
 #if defined(NORMAL_MAP_USED)
 		normal = mix(vec3(0.0, 0.0, 1.0), normal_map * vec3(2.0, -2.0, 1.0) - vec3(1.0, -1.0, 0.0), normal_map_depth);
@@ -543,7 +533,7 @@ FRAGMENT_SHADER_CODE
 		vec2 direction = light_array.data[light_base].position;
 		vec4 light_color = light_array.data[light_base].color;
 
-#ifdef LIGHT_SHADER_CODE_USED
+#ifdef LIGHT_CODE_USED
 
 		vec4 shadow_modulate = vec4(1.0);
 		light_color = light_compute(light_vertex, vec3(direction, light_array.data[light_base].height), normal, light_color, light_color.a, specular_shininess, shadow_modulate, screen_uv, uv, color, true);
@@ -561,7 +551,7 @@ FRAGMENT_SHADER_CODE
 			vec4 shadow_uv = vec4(shadow_pos.x, light_array.data[light_base].shadow_y_ofs, shadow_pos.y * light_array.data[light_base].shadow_zfar_inv, 1.0);
 
 			light_color = light_shadow_compute(light_base, light_color, shadow_uv
-#ifdef LIGHT_SHADER_CODE_USED
+#ifdef LIGHT_CODE_USED
 					,
 					shadow_modulate.rgb
 #endif
@@ -599,7 +589,7 @@ FRAGMENT_SHADER_CODE
 		vec4 light_color = textureLod(sampler2D(atlas_texture, texture_sampler), tex_uv_atlas, 0.0);
 		vec4 light_base_color = light_array.data[light_base].color;
 
-#ifdef LIGHT_SHADER_CODE_USED
+#ifdef LIGHT_CODE_USED
 
 		vec4 shadow_modulate = vec4(1.0);
 		vec3 light_position = vec3(light_array.data[light_base].position, light_array.data[light_base].height);
@@ -657,7 +647,7 @@ FRAGMENT_SHADER_CODE
 			vec4 shadow_uv = vec4(tex_ofs, light_array.data[light_base].shadow_y_ofs, distance, 1.0);
 
 			light_color = light_shadow_compute(light_base, light_color, shadow_uv
-#ifdef LIGHT_SHADER_CODE_USED
+#ifdef LIGHT_CODE_USED
 					,
 					shadow_modulate.rgb
 #endif
diff --git a/servers/rendering/renderer_rd/shaders/canvas_occlusion.glsl b/servers/rendering/renderer_rd/shaders/canvas_occlusion.glsl
index 5c25235c58..9f89f4b3b7 100644
--- a/servers/rendering/renderer_rd/shaders/canvas_occlusion.glsl
+++ b/servers/rendering/renderer_rd/shaders/canvas_occlusion.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(location = 0) in highp vec3 vertex;
 
@@ -32,7 +32,7 @@ void main() {
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(push_constant, binding = 0, std430) uniform Constants {
 	mat4 projection;
diff --git a/servers/rendering/renderer_rd/shaders/canvas_sdf.glsl b/servers/rendering/renderer_rd/shaders/canvas_sdf.glsl
index 302ad03b41..65a554e839 100644
--- a/servers/rendering/renderer_rd/shaders/canvas_sdf.glsl
+++ b/servers/rendering/renderer_rd/shaders/canvas_sdf.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/cluster_data_inc.glsl b/servers/rendering/renderer_rd/shaders/cluster_data_inc.glsl
index 3a4bf4da07..8e616ebe1f 100644
--- a/servers/rendering/renderer_rd/shaders/cluster_data_inc.glsl
+++ b/servers/rendering/renderer_rd/shaders/cluster_data_inc.glsl
@@ -1,105 +1,3 @@
-
 #define CLUSTER_COUNTER_SHIFT 20
 #define CLUSTER_POINTER_MASK ((1 << CLUSTER_COUNTER_SHIFT) - 1)
 #define CLUSTER_COUNTER_MASK 0xfff
-
-struct LightData { //this structure needs to be as packed as possible
-	vec3 position;
-	float inv_radius;
-
-	vec3 direction;
-	float size;
-
-	vec3 color;
-	float attenuation;
-
-	float cone_attenuation;
-	float cone_angle;
-	float specular_amount;
-	bool shadow_enabled;
-
-	vec4 atlas_rect; // rect in the shadow atlas
-	mat4 shadow_matrix;
-	float shadow_bias;
-	float shadow_normal_bias;
-	float transmittance_bias;
-	float soft_shadow_size; // for spot, it's the size in uv coordinates of the light, for omni it's the span angle
-	float soft_shadow_scale; // scales the shadow kernel for blurrier shadows
-	uint mask;
-	float shadow_volumetric_fog_fade;
-	uint pad;
-	vec4 projector_rect; //projector rect in srgb decal atlas
-};
-
-#define REFLECTION_AMBIENT_DISABLED 0
-#define REFLECTION_AMBIENT_ENVIRONMENT 1
-#define REFLECTION_AMBIENT_COLOR 2
-
-struct ReflectionData {
-	vec3 box_extents;
-	float index;
-	vec3 box_offset;
-	uint mask;
-	vec3 ambient; // ambient color
-	float intensity;
-	bool exterior;
-	bool box_project;
-	uint ambient_mode;
-	uint pad;
-	//0-8 is intensity,8-9 is ambient, mode
-	mat4 local_matrix; // up to here for spot and omni, rest is for directional
-	// notes: for ambientblend, use distance to edge to blend between already existing global environment
-};
-
-struct DirectionalLightData {
-	vec3 direction;
-	float energy;
-	vec3 color;
-	float size;
-	float specular;
-	uint mask;
-	float softshadow_angle;
-	float soft_shadow_scale;
-	bool blend_splits;
-	bool shadow_enabled;
-	float fade_from;
-	float fade_to;
-	uvec3 pad;
-	float shadow_volumetric_fog_fade;
-	vec4 shadow_bias;
-	vec4 shadow_normal_bias;
-	vec4 shadow_transmittance_bias;
-	vec4 shadow_z_range;
-	vec4 shadow_range_begin;
-	vec4 shadow_split_offsets;
-	mat4 shadow_matrix1;
-	mat4 shadow_matrix2;
-	mat4 shadow_matrix3;
-	mat4 shadow_matrix4;
-	vec4 shadow_color1;
-	vec4 shadow_color2;
-	vec4 shadow_color3;
-	vec4 shadow_color4;
-	vec2 uv_scale1;
-	vec2 uv_scale2;
-	vec2 uv_scale3;
-	vec2 uv_scale4;
-};
-
-struct DecalData {
-	mat4 xform; //to decal transform
-	vec3 inv_extents;
-	float albedo_mix;
-	vec4 albedo_rect;
-	vec4 normal_rect;
-	vec4 orm_rect;
-	vec4 emission_rect;
-	vec4 modulate;
-	float emission_energy;
-	uint mask;
-	float upper_fade;
-	float lower_fade;
-	mat3x4 normal_xform;
-	vec3 normal;
-	float normal_fade;
-};
diff --git a/servers/rendering/renderer_rd/shaders/cluster_debug.glsl b/servers/rendering/renderer_rd/shaders/cluster_debug.glsl
index 70a875192c..40da2c6e5c 100644
--- a/servers/rendering/renderer_rd/shaders/cluster_debug.glsl
+++ b/servers/rendering/renderer_rd/shaders/cluster_debug.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/cluster_render.glsl b/servers/rendering/renderer_rd/shaders/cluster_render.glsl
index ca92d2104e..da7d189281 100644
--- a/servers/rendering/renderer_rd/shaders/cluster_render.glsl
+++ b/servers/rendering/renderer_rd/shaders/cluster_render.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(location = 0) in vec3 vertex_attrib;
 
@@ -63,7 +63,7 @@ void main() {
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #if defined(has_GL_KHR_shader_subgroup_ballot) && defined(has_GL_KHR_shader_subgroup_arithmetic) && defined(has_GL_KHR_shader_subgroup_vote)
 
diff --git a/servers/rendering/renderer_rd/shaders/cluster_store.glsl b/servers/rendering/renderer_rd/shaders/cluster_store.glsl
index 5be0893c4f..b0606efa94 100644
--- a/servers/rendering/renderer_rd/shaders/cluster_store.glsl
+++ b/servers/rendering/renderer_rd/shaders/cluster_store.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/copy.glsl b/servers/rendering/renderer_rd/shaders/copy.glsl
index cdd35dfb3f..4110a95ddb 100644
--- a/servers/rendering/renderer_rd/shaders/copy.glsl
+++ b/servers/rendering/renderer_rd/shaders/copy.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/copy_to_fb.glsl b/servers/rendering/renderer_rd/shaders/copy_to_fb.glsl
index 9751e13b4e..8c68e2dc2f 100644
--- a/servers/rendering/renderer_rd/shaders/copy_to_fb.glsl
+++ b/servers/rendering/renderer_rd/shaders/copy_to_fb.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(location = 0) out vec2 uv_interp;
 
@@ -37,7 +37,7 @@ void main() {
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(push_constant, binding = 1, std430) uniform Params {
 	vec4 section;
diff --git a/servers/rendering/renderer_rd/shaders/cube_to_dp.glsl b/servers/rendering/renderer_rd/shaders/cube_to_dp.glsl
index c3ac0bee57..dfbce29119 100644
--- a/servers/rendering/renderer_rd/shaders/cube_to_dp.glsl
+++ b/servers/rendering/renderer_rd/shaders/cube_to_dp.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(push_constant, binding = 1, std430) uniform Params {
 	float z_far;
@@ -26,7 +26,7 @@ void main() {
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(location = 0) in vec2 uv_interp;
 
diff --git a/servers/rendering/renderer_rd/shaders/cubemap_downsampler.glsl b/servers/rendering/renderer_rd/shaders/cubemap_downsampler.glsl
index 7f269b7af3..9fa84657d1 100644
--- a/servers/rendering/renderer_rd/shaders/cubemap_downsampler.glsl
+++ b/servers/rendering/renderer_rd/shaders/cubemap_downsampler.glsl
@@ -22,7 +22,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #define BLOCK_SIZE 8
 
diff --git a/servers/rendering/renderer_rd/shaders/cubemap_filter.glsl b/servers/rendering/renderer_rd/shaders/cubemap_filter.glsl
index 987545fb76..2a774b0eb4 100644
--- a/servers/rendering/renderer_rd/shaders/cubemap_filter.glsl
+++ b/servers/rendering/renderer_rd/shaders/cubemap_filter.glsl
@@ -22,7 +22,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #define GROUP_SIZE 64
 
diff --git a/servers/rendering/renderer_rd/shaders/cubemap_roughness.glsl b/servers/rendering/renderer_rd/shaders/cubemap_roughness.glsl
index 5cbb00baa4..ce7c03c1d4 100644
--- a/servers/rendering/renderer_rd/shaders/cubemap_roughness.glsl
+++ b/servers/rendering/renderer_rd/shaders/cubemap_roughness.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #define GROUP_SIZE 8
 
diff --git a/servers/rendering/renderer_rd/shaders/decal_data_inc.glsl b/servers/rendering/renderer_rd/shaders/decal_data_inc.glsl
new file mode 100644
index 0000000000..ccaad13311
--- /dev/null
+++ b/servers/rendering/renderer_rd/shaders/decal_data_inc.glsl
@@ -0,0 +1,18 @@
+
+struct DecalData { // per-decal parameters; NOTE(review): field order presumably mirrors a CPU-side struct — confirm before reordering
+	mat4 xform; // to decal transform
+	vec3 inv_extents;
+	float albedo_mix;
+	vec4 albedo_rect; // NOTE(review): the *_rect fields are presumably regions in a decal atlas — confirm
+	vec4 normal_rect;
+	vec4 orm_rect;
+	vec4 emission_rect;
+	vec4 modulate;
+	float emission_energy;
+	uint mask;
+	float upper_fade;
+	float lower_fade;
+	mat3x4 normal_xform;
+	vec3 normal;
+	float normal_fade;
+};
diff --git a/servers/rendering/renderer_rd/shaders/gi.glsl b/servers/rendering/renderer_rd/shaders/gi.glsl
index 92a5682572..bfd5c4c88d 100644
--- a/servers/rendering/renderer_rd/shaders/gi.glsl
+++ b/servers/rendering/renderer_rd/shaders/gi.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/giprobe.glsl b/servers/rendering/renderer_rd/shaders/giprobe.glsl
index b931461b31..49a493cdc7 100644
--- a/servers/rendering/renderer_rd/shaders/giprobe.glsl
+++ b/servers/rendering/renderer_rd/shaders/giprobe.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #ifdef MODE_DYNAMIC
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
diff --git a/servers/rendering/renderer_rd/shaders/giprobe_debug.glsl b/servers/rendering/renderer_rd/shaders/giprobe_debug.glsl
index 515cc35507..7d4d72967a 100644
--- a/servers/rendering/renderer_rd/shaders/giprobe_debug.glsl
+++ b/servers/rendering/renderer_rd/shaders/giprobe_debug.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 struct CellData {
 	uint position; // xyz 10 bits
@@ -172,7 +172,7 @@ void main() {
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(location = 0) in vec4 color_interp;
 layout(location = 0) out vec4 frag_color;
diff --git a/servers/rendering/renderer_rd/shaders/giprobe_sdf.glsl b/servers/rendering/renderer_rd/shaders/giprobe_sdf.glsl
index 5b3dec0ee7..e20b3f680d 100644
--- a/servers/rendering/renderer_rd/shaders/giprobe_sdf.glsl
+++ b/servers/rendering/renderer_rd/shaders/giprobe_sdf.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 4, local_size_y = 4, local_size_z = 4) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/giprobe_write.glsl b/servers/rendering/renderer_rd/shaders/giprobe_write.glsl
index 56b3b7ccb4..5dc2d08a3b 100644
--- a/servers/rendering/renderer_rd/shaders/giprobe_write.glsl
+++ b/servers/rendering/renderer_rd/shaders/giprobe_write.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/light_data_inc.glsl b/servers/rendering/renderer_rd/shaders/light_data_inc.glsl
new file mode 100644
index 0000000000..2fce258cff
--- /dev/null
+++ b/servers/rendering/renderer_rd/shaders/light_data_inc.glsl
@@ -0,0 +1,87 @@
+#define LIGHT_BAKE_DISABLED 0
+#define LIGHT_BAKE_DYNAMIC 1
+#define LIGHT_BAKE_STATIC 2
+
+struct LightData { // this structure needs to be as packed as possible
+	vec3 position;
+	float inv_radius;
+
+	vec3 direction;
+	float size;
+
+	vec3 color;
+	float attenuation;
+
+	float cone_attenuation;
+	float cone_angle;
+	float specular_amount;
+	bool shadow_enabled;
+
+	vec4 atlas_rect; // rect in the shadow atlas
+	mat4 shadow_matrix;
+	float shadow_bias;
+	float shadow_normal_bias;
+	float transmittance_bias;
+	float soft_shadow_size; // for spot, it's the size in uv coordinates of the light, for omni it's the span angle
+	float soft_shadow_scale; // scales the shadow kernel for blurrier shadows
+	uint mask;
+	float shadow_volumetric_fog_fade;
+	uint bake_mode; // presumably one of the LIGHT_BAKE_* values defined at the top of this file — confirm
+	vec4 projector_rect; // projector rect in srgb decal atlas
+};
+
+#define REFLECTION_AMBIENT_DISABLED 0
+#define REFLECTION_AMBIENT_ENVIRONMENT 1
+#define REFLECTION_AMBIENT_COLOR 2
+
+struct ReflectionData {
+	vec3 box_extents;
+	float index;
+	vec3 box_offset;
+	uint mask;
+	vec3 ambient; // ambient color
+	float intensity;
+	bool exterior;
+	bool box_project;
+	uint ambient_mode; // presumably one of the REFLECTION_AMBIENT_* values defined just above this struct — confirm
+	uint pad; // NOTE(review): looks like explicit padding, judging by the name — confirm it is unused
+	//0-8 is intensity,8-9 is ambient, mode
+	mat4 local_matrix; // up to here for spot and omni, rest is for directional. NOTE(review): this remark mentions light types inside a reflection struct — confirm it still applies
+	// notes: for ambientblend, use distance to edge to blend between already existing global environment
+};
+
+struct DirectionalLightData { // NOTE(review): field order presumably mirrors a CPU-side struct — confirm before reordering
+	vec3 direction;
+	float energy;
+	vec3 color;
+	float size;
+	float specular;
+	uint mask;
+	float softshadow_angle;
+	float soft_shadow_scale;
+	bool blend_splits;
+	bool shadow_enabled;
+	float fade_from;
+	float fade_to;
+	uvec2 pad; // NOTE(review): looks like explicit padding, judging by the name — confirm it is unused
+	uint bake_mode; // presumably one of the LIGHT_BAKE_* values defined at the top of this file — confirm
+	float shadow_volumetric_fog_fade;
+	vec4 shadow_bias; // NOTE(review): the vec4 shadow_* fields presumably hold one component per shadow split — confirm
+	vec4 shadow_normal_bias;
+	vec4 shadow_transmittance_bias;
+	vec4 shadow_z_range;
+	vec4 shadow_range_begin;
+	vec4 shadow_split_offsets;
+	mat4 shadow_matrix1; // NOTE(review): the numbered 1..4 fields below presumably correspond to the four shadow splits — confirm
+	mat4 shadow_matrix2;
+	mat4 shadow_matrix3;
+	mat4 shadow_matrix4;
+	vec4 shadow_color1;
+	vec4 shadow_color2;
+	vec4 shadow_color3;
+	vec4 shadow_color4;
+	vec2 uv_scale1;
+	vec2 uv_scale2;
+	vec2 uv_scale3;
+	vec2 uv_scale4;
+};
diff --git a/servers/rendering/renderer_rd/shaders/luminance_reduce.glsl b/servers/rendering/renderer_rd/shaders/luminance_reduce.glsl
index 8a11c35b78..466442b67a 100644
--- a/servers/rendering/renderer_rd/shaders/luminance_reduce.glsl
+++ b/servers/rendering/renderer_rd/shaders/luminance_reduce.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #define BLOCK_SIZE 8
 
diff --git a/servers/rendering/renderer_rd/shaders/particles.glsl b/servers/rendering/renderer_rd/shaders/particles.glsl
index cb6d8dc7f6..beaff10793 100644
--- a/servers/rendering/renderer_rd/shaders/particles.glsl
+++ b/servers/rendering/renderer_rd/shaders/particles.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 
@@ -76,6 +76,11 @@ struct FrameParams {
 	float time;
 	float delta;
 
+	uint frame;
+	uint pad0;
+	uint pad1;
+	uint pad2;
+
 	uint random_seed;
 	uint attractor_count;
 	uint collider_count;
@@ -92,10 +97,16 @@ layout(set = 1, binding = 0, std430) restrict buffer FrameHistory {
 }
 frame_history;
 
+#define PARTICLE_FLAG_ACTIVE uint(1)
+#define PARTICLE_FLAG_STARTED uint(2)
+#define PARTICLE_FLAG_TRAILED uint(4)
+#define PARTICLE_FRAME_MASK uint(0xFFFF)
+#define PARTICLE_FRAME_SHIFT uint(16)
+
 struct ParticleData {
 	mat4 xform;
 	vec3 velocity;
-	bool is_active;
+	uint flags;
 	vec4 color;
 	vec4 custom;
 };
@@ -146,11 +157,11 @@ layout(set = 2, binding = 1) uniform texture2D height_field_texture;
 
 /* SET 3: MATERIAL */
 
-#ifdef USE_MATERIAL_UNIFORMS
+#ifdef MATERIAL_UNIFORMS_USED
 layout(set = 3, binding = 0, std140) uniform MaterialUniforms{
-	/* clang-format off */
-MATERIAL_UNIFORMS
-	/* clang-format on */
+
+#MATERIAL_UNIFORMS
+
 } material;
 #endif
 
@@ -162,7 +173,7 @@ layout(push_constant, binding = 0, std430) uniform Params {
 	bool use_fractional_delta;
 	bool sub_emitter_mode;
 	bool can_emit;
-	uint pad;
+	bool trail_pass;
 }
 params;
 
@@ -196,15 +207,19 @@ bool emit_subparticle(mat4 p_xform, vec3 p_velocity, vec4 p_color, vec4 p_custom
 	return true;
 }
 
-/* clang-format off */
-
-COMPUTE_SHADER_GLOBALS
-
-/* clang-format on */
+#GLOBALS
 
 void main() {
 	uint particle = gl_GlobalInvocationID.x;
 
+	if (params.trail_size > 1) {
+		if (params.trail_pass) {
+			particle += (particle / (params.trail_size - 1)) + 1;
+		} else {
+			particle *= params.trail_size;
+		}
+	}
+
 	if (particle >= params.total_particles * params.trail_size) {
 		return; //discard
 	}
@@ -233,7 +248,7 @@ void main() {
 		PARTICLE.color = vec4(1.0);
 		PARTICLE.custom = vec4(0.0);
 		PARTICLE.velocity = vec3(0.0);
-		PARTICLE.is_active = false;
+		PARTICLE.flags = 0;
 		PARTICLE.xform = mat4(
 				vec4(1.0, 0.0, 0.0, 0.0),
 				vec4(0.0, 1.0, 0.0, 0.0),
@@ -241,6 +256,29 @@ void main() {
 				vec4(0.0, 0.0, 0.0, 1.0));
 	}
 
+	//clear started flag if set
+
+	if (params.trail_pass) {
+		//trail started
+		uint src_idx = index * params.trail_size;
+		if (bool(particles.data[src_idx].flags & PARTICLE_FLAG_STARTED)) {
+			//save start conditions for trails
+			PARTICLE.color = particles.data[src_idx].color;
+			PARTICLE.custom = particles.data[src_idx].custom;
+			PARTICLE.velocity = particles.data[src_idx].velocity;
+			PARTICLE.flags = PARTICLE_FLAG_TRAILED | ((frame_history.data[0].frame & PARTICLE_FRAME_MASK) << PARTICLE_FRAME_SHIFT); //mark it as trailed, save in which frame it will start
+			PARTICLE.xform = particles.data[src_idx].xform;
+		}
+
+		if (bool(PARTICLE.flags & PARTICLE_FLAG_TRAILED) && ((PARTICLE.flags >> PARTICLE_FRAME_SHIFT) == (FRAME.frame & PARTICLE_FRAME_MASK))) { //check this is trailed and see if it should start now
+			// we just assume that this is the first frame of the particle, the rest is deterministic
+			PARTICLE.flags = PARTICLE_FLAG_ACTIVE | (particles.data[src_idx].flags & (PARTICLE_FRAME_MASK << PARTICLE_FRAME_SHIFT));
+			return; //- this appears like it should be correct, but it seems not to be.. wonder why.
+		}
+	} else {
+		PARTICLE.flags &= ~PARTICLE_FLAG_STARTED;
+	}
+
 	bool collided = false;
 	vec3 collision_normal = vec3(0.0);
 	float collision_depth = 0.0;
@@ -249,14 +287,121 @@ void main() {
 
 #if !defined(DISABLE_VELOCITY)
 
-	if (PARTICLE.is_active) {
+	if (bool(PARTICLE.flags & PARTICLE_FLAG_ACTIVE)) {
 		PARTICLE.xform[3].xyz += PARTICLE.velocity * local_delta;
 	}
 #endif
 
-	/* Process physics if active */
+	if (!params.trail_pass && params.sub_emitter_mode) {
+		if (!bool(PARTICLE.flags & PARTICLE_FLAG_ACTIVE)) {
+			int src_index = atomicAdd(src_particles.particle_count, -1) - 1;
+
+			if (src_index >= 0) {
+				PARTICLE.flags = (PARTICLE_FLAG_ACTIVE | PARTICLE_FLAG_STARTED | (FRAME.cycle << PARTICLE_FRAME_SHIFT));
+				restart = true;
+
+				if (bool(src_particles.data[src_index].flags & EMISSION_FLAG_HAS_POSITION)) {
+					PARTICLE.xform[3] = src_particles.data[src_index].xform[3];
+				} else {
+					PARTICLE.xform[3] = vec4(0, 0, 0, 1);
+					restart_position = true;
+				}
+				if (bool(src_particles.data[src_index].flags & EMISSION_FLAG_HAS_ROTATION_SCALE)) {
+					PARTICLE.xform[0] = src_particles.data[src_index].xform[0];
+					PARTICLE.xform[1] = src_particles.data[src_index].xform[1];
+					PARTICLE.xform[2] = src_particles.data[src_index].xform[2];
+				} else {
+					PARTICLE.xform[0] = vec4(1, 0, 0, 0);
+					PARTICLE.xform[1] = vec4(0, 1, 0, 0);
+					PARTICLE.xform[2] = vec4(0, 0, 1, 0);
+					restart_rotation_scale = true;
+				}
+				if (bool(src_particles.data[src_index].flags & EMISSION_FLAG_HAS_VELOCITY)) {
+					PARTICLE.velocity = src_particles.data[src_index].velocity;
+				} else {
+					PARTICLE.velocity = vec3(0);
+					restart_velocity = true;
+				}
+				if (bool(src_particles.data[src_index].flags & EMISSION_FLAG_HAS_COLOR)) {
+					PARTICLE.color = src_particles.data[src_index].color;
+				} else {
+					PARTICLE.color = vec4(1);
+					restart_color = true;
+				}
 
-	if (PARTICLE.is_active) {
+				if (bool(src_particles.data[src_index].flags & EMISSION_FLAG_HAS_CUSTOM)) {
+					PARTICLE.custom = src_particles.data[src_index].custom;
+				} else {
+					PARTICLE.custom = vec4(0);
+					restart_custom = true;
+				}
+			}
+		}
+
+	} else if (FRAME.emitting) {
+		float restart_phase = float(index) / float(params.total_particles);
+
+		if (FRAME.randomness > 0.0) {
+			uint seed = FRAME.cycle;
+			if (restart_phase >= FRAME.system_phase) {
+				seed -= uint(1);
+			}
+			seed *= uint(params.total_particles);
+			seed += uint(index);
+			float random = float(hash(seed) % uint(65536)) / 65536.0;
+			restart_phase += FRAME.randomness * random * 1.0 / float(params.total_particles);
+		}
+
+		restart_phase *= (1.0 - FRAME.explosiveness);
+
+		if (FRAME.system_phase > FRAME.prev_system_phase) {
+			// restart_phase >= prev_system_phase is used so particles emit in the first frame they are processed
+
+			if (restart_phase >= FRAME.prev_system_phase && restart_phase < FRAME.system_phase) {
+				restart = true;
+				if (params.use_fractional_delta) {
+					local_delta = (FRAME.system_phase - restart_phase) * params.lifetime;
+				}
+			}
+
+		} else if (FRAME.delta > 0.0) {
+			if (restart_phase >= FRAME.prev_system_phase) {
+				restart = true;
+				if (params.use_fractional_delta) {
+					local_delta = (1.0 - restart_phase + FRAME.system_phase) * params.lifetime;
+				}
+
+			} else if (restart_phase < FRAME.system_phase) {
+				restart = true;
+				if (params.use_fractional_delta) {
+					local_delta = (FRAME.system_phase - restart_phase) * params.lifetime;
+				}
+			}
+		}
+
+		if (params.trail_pass) {
+			restart = false;
+		}
+
+		if (restart) {
+			PARTICLE.flags = FRAME.emitting ? (PARTICLE_FLAG_ACTIVE | PARTICLE_FLAG_STARTED | (FRAME.cycle << PARTICLE_FRAME_SHIFT)) : 0;
+			restart_position = true;
+			restart_rotation_scale = true;
+			restart_velocity = true;
+			restart_color = true;
+			restart_custom = true;
+		}
+	}
+
+	bool particle_active = bool(PARTICLE.flags & PARTICLE_FLAG_ACTIVE);
+
+	uint particle_number = (PARTICLE.flags >> PARTICLE_FRAME_SHIFT) * uint(params.total_particles) + index;
+
+	if (restart && particle_active) {
+#CODE : START
+	}
+
+	if (particle_active) {
 		for (uint i = 0; i < FRAME.attractor_count; i++) {
 			vec3 dir;
 			float amount;
@@ -434,116 +579,12 @@ void main() {
 		}
 	}
 
-	if (params.sub_emitter_mode) {
-		if (!PARTICLE.is_active) {
-			int src_index = atomicAdd(src_particles.particle_count, -1) - 1;
-
-			if (src_index >= 0) {
-				PARTICLE.is_active = true;
-				restart = true;
-
-				if (bool(src_particles.data[src_index].flags & EMISSION_FLAG_HAS_POSITION)) {
-					PARTICLE.xform[3] = src_particles.data[src_index].xform[3];
-				} else {
-					PARTICLE.xform[3] = vec4(0, 0, 0, 1);
-					restart_position = true;
-				}
-				if (bool(src_particles.data[src_index].flags & EMISSION_FLAG_HAS_ROTATION_SCALE)) {
-					PARTICLE.xform[0] = src_particles.data[src_index].xform[0];
-					PARTICLE.xform[1] = src_particles.data[src_index].xform[1];
-					PARTICLE.xform[2] = src_particles.data[src_index].xform[2];
-				} else {
-					PARTICLE.xform[0] = vec4(1, 0, 0, 0);
-					PARTICLE.xform[1] = vec4(0, 1, 0, 0);
-					PARTICLE.xform[2] = vec4(0, 0, 1, 0);
-					restart_rotation_scale = true;
-				}
-				if (bool(src_particles.data[src_index].flags & EMISSION_FLAG_HAS_VELOCITY)) {
-					PARTICLE.velocity = src_particles.data[src_index].velocity;
-				} else {
-					PARTICLE.velocity = vec3(0);
-					restart_velocity = true;
-				}
-				if (bool(src_particles.data[src_index].flags & EMISSION_FLAG_HAS_COLOR)) {
-					PARTICLE.color = src_particles.data[src_index].color;
-				} else {
-					PARTICLE.color = vec4(1);
-					restart_color = true;
-				}
-
-				if (bool(src_particles.data[src_index].flags & EMISSION_FLAG_HAS_CUSTOM)) {
-					PARTICLE.custom = src_particles.data[src_index].custom;
-				} else {
-					PARTICLE.custom = vec4(0);
-					restart_custom = true;
-				}
-			}
-		}
-
-	} else if (FRAME.emitting) {
-		float restart_phase = float(index) / float(params.total_particles);
-
-		if (FRAME.randomness > 0.0) {
-			uint seed = FRAME.cycle;
-			if (restart_phase >= FRAME.system_phase) {
-				seed -= uint(1);
-			}
-			seed *= uint(params.total_particles);
-			seed += uint(index);
-			float random = float(hash(seed) % uint(65536)) / 65536.0;
-			restart_phase += FRAME.randomness * random * 1.0 / float(params.total_particles);
-		}
-
-		restart_phase *= (1.0 - FRAME.explosiveness);
-
-		if (FRAME.system_phase > FRAME.prev_system_phase) {
-			// restart_phase >= prev_system_phase is used so particles emit in the first frame they are processed
-
-			if (restart_phase >= FRAME.prev_system_phase && restart_phase < FRAME.system_phase) {
-				restart = true;
-				if (params.use_fractional_delta) {
-					local_delta = (FRAME.system_phase - restart_phase) * params.lifetime;
-				}
-			}
-
-		} else if (FRAME.delta > 0.0) {
-			if (restart_phase >= FRAME.prev_system_phase) {
-				restart = true;
-				if (params.use_fractional_delta) {
-					local_delta = (1.0 - restart_phase + FRAME.system_phase) * params.lifetime;
-				}
-
-			} else if (restart_phase < FRAME.system_phase) {
-				restart = true;
-				if (params.use_fractional_delta) {
-					local_delta = (FRAME.system_phase - restart_phase) * params.lifetime;
-				}
-			}
-		}
-
-		uint current_cycle = FRAME.cycle;
-
-		if (FRAME.system_phase < restart_phase) {
-			current_cycle -= uint(1);
-		}
-
-		uint particle_number = current_cycle * uint(params.total_particles) + particle;
-
-		if (restart) {
-			PARTICLE.is_active = FRAME.emitting;
-			restart_position = true;
-			restart_rotation_scale = true;
-			restart_velocity = true;
-			restart_color = true;
-			restart_custom = true;
-		}
+	if (particle_active) {
+#CODE : PROCESS
 	}
 
-	if (PARTICLE.is_active) {
-		/* clang-format off */
-
-COMPUTE_SHADER_CODE
-
-		/* clang-format on */
+	PARTICLE.flags &= ~PARTICLE_FLAG_ACTIVE;
+	if (particle_active) {
+		PARTICLE.flags |= PARTICLE_FLAG_ACTIVE;
 	}
 }
diff --git a/servers/rendering/renderer_rd/shaders/particles_copy.glsl b/servers/rendering/renderer_rd/shaders/particles_copy.glsl
index 6c782b6045..7804d66d1c 100644
--- a/servers/rendering/renderer_rd/shaders/particles_copy.glsl
+++ b/servers/rendering/renderer_rd/shaders/particles_copy.glsl
@@ -2,14 +2,18 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 
+#define PARTICLE_FLAG_ACTIVE uint(1)
+#define PARTICLE_FLAG_STARTED uint(2)
+#define PARTICLE_FLAG_TRAILED uint(4)
+
 struct ParticleData {
 	mat4 xform;
 	vec3 velocity;
-	bool is_active;
+	uint flags;
 	vec4 color;
 	vec4 custom;
 };
@@ -33,12 +37,30 @@ sort_buffer;
 
 #endif // USE_SORT_BUFFER
 
+layout(set = 2, binding = 0, std430) restrict readonly buffer TrailBindPoses { // read-only array of matrices applied to trail particles
+	mat4 data[]; // main() indexes this with (particle % params.trail_size) and right-multiplies the particle transform by it
+}
+trail_bind_poses;
+
 layout(push_constant, binding = 0, std430) uniform Params {
 	vec3 sort_direction;
 	uint total_particles;
+
+	uint trail_size;
+	uint trail_total;
+	float frame_delta;
+	float frame_remainder;
+
+	vec3 align_up;
+	uint align_mode;
 }
 params;
 
+#define TRANSFORM_ALIGN_DISABLED 0
+#define TRANSFORM_ALIGN_Z_BILLBOARD 1
+#define TRANSFORM_ALIGN_Y_TO_VELOCITY 2
+#define TRANSFORM_ALIGN_Z_BILLBOARD_Y_TO_VELOCITY 3
+
 void main() {
 #ifdef MODE_FILL_SORT_BUFFER
 
@@ -47,7 +69,11 @@ void main() {
 		return; //discard
 	}
 
-	sort_buffer.data[particle].x = dot(params.sort_direction, particles.data[particle].xform[3].xyz);
+	uint src_particle = particle;
+	if (params.trail_size > 1) {
+		src_particle = src_particle * params.trail_size + params.trail_size / 2; //use trail center for sorting
+	}
+	sort_buffer.data[particle].x = dot(params.sort_direction, particles.data[src_particle].xform[3].xyz);
 	sort_buffer.data[particle].y = float(particle);
 #endif
 
@@ -61,13 +87,78 @@ void main() {
 	}
 
 #ifdef USE_SORT_BUFFER
-	particle = uint(sort_buffer.data[particle].y); //use index from sort buffer
+
+	if (params.trail_size > 1) {
+		particle = uint(sort_buffer.data[particle / params.trail_size].y) + (particle % params.trail_size);
+	} else {
+		particle = uint(sort_buffer.data[particle].y); //use index from sort buffer
+	}
 #endif
 
 	mat4 txform;
 
-	if (particles.data[particle].is_active) {
-		txform = transpose(particles.data[particle].xform);
+	if (bool(particles.data[particle].flags & PARTICLE_FLAG_ACTIVE) || bool(particles.data[particle].flags & PARTICLE_FLAG_TRAILED)) {
+		txform = particles.data[particle].xform;
+		if (params.trail_size > 1) {
+			// since the steps don't fit precisely in the history frames, must do a tiny bit of
+			// interpolation to get them close to their intended location.
+			uint part_ofs = particle % params.trail_size;
+			float natural_ofs = fract((float(part_ofs) / float(params.trail_size)) * float(params.trail_total)) * params.frame_delta;
+
+			txform[3].xyz -= particles.data[particle].velocity * natural_ofs;
+		}
+
+		switch (params.align_mode) {
+			case TRANSFORM_ALIGN_DISABLED: {
+			} break; //nothing
+			case TRANSFORM_ALIGN_Z_BILLBOARD: {
+				mat3 local = mat3(normalize(cross(params.align_up, params.sort_direction)), params.align_up, params.sort_direction);
+				local = local * mat3(txform);
+				txform[0].xyz = local[0];
+				txform[1].xyz = local[1];
+				txform[2].xyz = local[2];
+
+			} break;
+			case TRANSFORM_ALIGN_Y_TO_VELOCITY: {
+				vec3 v = particles.data[particle].velocity;
+				float s = (length(txform[0]) + length(txform[1]) + length(txform[2])) / 3.0;
+				if (length(v) > 0.0) {
+					txform[1].xyz = normalize(v);
+				} else {
+					txform[1].xyz = normalize(txform[1].xyz);
+				}
+
+				txform[0].xyz = normalize(cross(txform[1].xyz, txform[2].xyz));
+				txform[2].xyz = vec3(0.0, 0.0, 1.0) * s;
+				txform[0].xyz *= s;
+				txform[1].xyz *= s;
+			} break;
+			case TRANSFORM_ALIGN_Z_BILLBOARD_Y_TO_VELOCITY: {
+				vec3 v = particles.data[particle].velocity;
+				vec3 sv = v - params.sort_direction * dot(params.sort_direction, v); //screen velocity
+				float s = (length(txform[0]) + length(txform[1]) + length(txform[2])) / 3.0;
+
+				if (length(sv) == 0) {
+					sv = params.align_up;
+				}
+
+				sv = normalize(sv);
+
+				txform[0].xyz = normalize(cross(sv, params.sort_direction)) * s;
+				txform[1].xyz = sv * s;
+				txform[2].xyz = params.sort_direction * s;
+
+			} break;
+		}
+
+		txform[3].xyz += particles.data[particle].velocity * params.frame_remainder;
+
+		if (params.trail_size > 1) {
+			uint part_ofs = particle % params.trail_size;
+			txform = txform * trail_bind_poses.data[part_ofs];
+		}
+
+		txform = transpose(txform);
 	} else {
 		txform = mat4(vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0)); //zero scale, becomes invisible
 	}
diff --git a/servers/rendering/renderer_rd/shaders/resolve.glsl b/servers/rendering/renderer_rd/shaders/resolve.glsl
index e83c4ca93b..2286a26485 100644
--- a/servers/rendering/renderer_rd/shaders/resolve.glsl
+++ b/servers/rendering/renderer_rd/shaders/resolve.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/roughness_limiter.glsl b/servers/rendering/renderer_rd/shaders/roughness_limiter.glsl
index 464895928a..7b964675ca 100644
--- a/servers/rendering/renderer_rd/shaders/roughness_limiter.glsl
+++ b/servers/rendering/renderer_rd/shaders/roughness_limiter.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/scene_forward_aa_inc.glsl b/servers/rendering/renderer_rd/shaders/scene_forward_aa_inc.glsl
new file mode 100644
index 0000000000..99714b4504
--- /dev/null
+++ b/servers/rendering/renderer_rd/shaders/scene_forward_aa_inc.glsl
@@ -0,0 +1,58 @@
+#ifdef ALPHA_HASH_USED
+// Cheap 2D hash: fractional part of a large multiple of two mixed sine terms.
+float hash_2d(vec2 p) {
+	float envelope = 0.1 + abs(sin(13.0 * p.y + p.x));
+	return fract(1.0e4 * sin(17.0 * p.x + 0.1 * p.y) * envelope);
+}
+// 3D hash: hashes p.xy, then hashes that result paired with p.z.
+float hash_3d(vec3 p) {
+	return hash_2d(vec2(hash_2d(p.xy), p.z));
+}
+// Computes the alpha-test threshold for hashed alpha: a hash of pos, blended across two screen-space scales.
+float compute_alpha_hash_threshold(vec3 pos, float hash_scale) {
+	vec3 dx = dFdx(pos);
+	vec3 dy = dFdy(pos); // dFdy, not dFdx: both screen axes must contribute to the scale.
+	float delta_max = max(length(dx), length(dy)); // Largest per-pixel change of pos (a length, not squared).
+	float pix_scale = 1.0 / (hash_scale * delta_max);
+	// Bracket pix_scale by the powers of two directly below and above it.
+	vec2 pix_scales =
+			vec2(exp2(floor(log2(pix_scale))), exp2(ceil(log2(pix_scale))));
+	// Hash pos quantized at each of the two bracketing scales.
+	vec2 a_thresh = vec2(hash_3d(floor(pix_scales.x * pos.xyz)),
+			hash_3d(floor(pix_scales.y * pos.xyz)));
+	// Fractional part of log2(pix_scale): where pix_scale sits between the two scales.
+	float lerp_factor = fract(log2(pix_scale));
+	// Blend the two hashes by that factor.
+	float a_interp = (1.0 - lerp_factor) * a_thresh.x + lerp_factor * a_thresh.y;
+	// Smaller of the two blend weights; it delimits the three pieces below.
+	float min_lerp = min(lerp_factor, 1.0 - lerp_factor);
+	// Three-piece remap of a_interp (low / middle / high range), as in Wyman & McGuire's hashed alpha testing.
+	vec3 cases = vec3(a_interp * a_interp / (2.0 * min_lerp * (1.0 - min_lerp)),
+			(a_interp - 0.5 * min_lerp) / (1.0 - min_lerp),
+			1.0 - ((1.0 - a_interp) * (1.0 - a_interp) /
+						  (2.0 * min_lerp * (1.0 - min_lerp))));
+	// Select the piece from a_interp; selecting on lerp_factor could never reach cases.x.
+	float alpha_hash_threshold =
+			(a_interp < (1.0 - min_lerp)) ? ((a_interp < min_lerp) ? cases.x : cases.y) : cases.z;
+
+	return clamp(alpha_hash_threshold, 0.0, 1.0);
+}
+
+#endif // ALPHA_HASH_USED
+
+#ifdef ALPHA_ANTIALIASING_EDGE_USED
+// Mip level estimate: half the log2 of the larger squared derivative of texture_coord, floored at 0.
+float calc_mip_level(vec2 texture_coord) {
+	vec2 ddx_uv = dFdx(texture_coord);
+	vec2 ddy_uv = dFdy(texture_coord);
+	float max_sq_len = max(dot(ddx_uv, ddx_uv), dot(ddy_uv, ddy_uv));
+	return max(0.0, 0.5 * log2(max_sq_len));
+}
+// Boosts input_alpha with the mip level, then remaps it around alpha_edge using its screen-space rate of change (fwidth).
+float compute_alpha_antialiasing_edge(float input_alpha, vec2 texture_coord, float alpha_edge) {
+	input_alpha *= 1.0 + max(0.0, calc_mip_level(texture_coord)) * 0.25; // 0.25 mip scale, magic number
+	input_alpha = (input_alpha - alpha_edge) / max(fwidth(input_alpha), 0.0001) + 0.5; // 0.0001 floor avoids dividing by zero
+	return clamp(input_alpha, 0.0, 1.0);
+}
+
+#endif // ALPHA_ANTIALIASING_EDGE_USED
diff --git a/servers/rendering/renderer_rd/shaders/scene_forward_clustered.glsl b/servers/rendering/renderer_rd/shaders/scene_forward_clustered.glsl
index 7b86dac143..1d67a3f1df 100644
--- a/servers/rendering/renderer_rd/shaders/scene_forward_clustered.glsl
+++ b/servers/rendering/renderer_rd/shaders/scene_forward_clustered.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #include "scene_forward_clustered_inc.glsl"
 
@@ -48,11 +48,11 @@ layout(location = 8) in vec4 custom2_attrib;
 layout(location = 9) in vec4 custom3_attrib;
 #endif
 
-#if defined(BONES_USED)
+#if defined(BONES_USED) || defined(USE_PARTICLE_TRAILS)
 layout(location = 10) in uvec4 bone_attrib;
 #endif
 
-#if defined(WEIGHTS_USED)
+#if defined(WEIGHTS_USED) || defined(USE_PARTICLE_TRAILS)
 layout(location = 11) in vec4 weight_attrib;
 #endif
 
@@ -81,16 +81,14 @@ layout(location = 5) out vec3 tangent_interp;
 layout(location = 6) out vec3 binormal_interp;
 #endif
 
-#ifdef USE_MATERIAL_UNIFORMS
+#ifdef MATERIAL_UNIFORMS_USED
 layout(set = MATERIAL_UNIFORM_SET, binding = 0, std140) uniform MaterialUniforms{
-	/* clang-format off */
-MATERIAL_UNIFORMS
-	/* clang-format on */
+
+#MATERIAL_UNIFORMS
+
 } material;
 #endif
 
-invariant gl_Position;
-
 #ifdef MODE_DUAL_PARABOLOID
 
 layout(location = 8) out float dp_clip;
@@ -99,11 +97,9 @@ layout(location = 8) out float dp_clip;
 
 layout(location = 9) out flat uint instance_index;
 
-/* clang-format off */
-
-VERTEX_SHADER_GLOBALS
+invariant gl_Position;
 
-/* clang-format on */
+#GLOBALS
 
 void main() {
 	vec4 instance_custom = vec4(0.0);
@@ -129,10 +125,72 @@ void main() {
 
 	if (is_multimesh) {
 		//multimesh, instances are for it
-		uint offset = (instances.data[instance_index].flags >> INSTANCE_FLAGS_MULTIMESH_STRIDE_SHIFT) & INSTANCE_FLAGS_MULTIMESH_STRIDE_MASK;
-		offset *= gl_InstanceIndex;
 
 		mat4 matrix;
+
+#ifdef USE_PARTICLE_TRAILS
+		uint trail_size = (instances.data[instance_index].flags >> INSTANCE_FLAGS_PARTICLE_TRAIL_SHIFT) & INSTANCE_FLAGS_PARTICLE_TRAIL_MASK;
+		uint stride = 3 + 1 + 1; //particles always uses this format
+
+		uint offset = trail_size * stride * gl_InstanceIndex;
+
+#ifdef COLOR_USED
+		vec4 pcolor;
+#endif
+		{
+			uint boffset = offset + bone_attrib.x * stride;
+			matrix = mat4(transforms.data[boffset + 0], transforms.data[boffset + 1], transforms.data[boffset + 2], vec4(0.0, 0.0, 0.0, 1.0)) * weight_attrib.x;
+#ifdef COLOR_USED
+			pcolor = transforms.data[boffset + 3] * weight_attrib.x;
+#endif
+		}
+		if (weight_attrib.y > 0.001) {
+			uint boffset = offset + bone_attrib.y * stride;
+			matrix += mat4(transforms.data[boffset + 0], transforms.data[boffset + 1], transforms.data[boffset + 2], vec4(0.0, 0.0, 0.0, 1.0)) * weight_attrib.y;
+#ifdef COLOR_USED
+			pcolor += transforms.data[boffset + 3] * weight_attrib.y;
+#endif
+		}
+		if (weight_attrib.z > 0.001) {
+			uint boffset = offset + bone_attrib.z * stride;
+			matrix += mat4(transforms.data[boffset + 0], transforms.data[boffset + 1], transforms.data[boffset + 2], vec4(0.0, 0.0, 0.0, 1.0)) * weight_attrib.z;
+#ifdef COLOR_USED
+			pcolor += transforms.data[boffset + 3] * weight_attrib.z;
+#endif
+		}
+		if (weight_attrib.w > 0.001) {
+			uint boffset = offset + bone_attrib.w * stride;
+			matrix += mat4(transforms.data[boffset + 0], transforms.data[boffset + 1], transforms.data[boffset + 2], vec4(0.0, 0.0, 0.0, 1.0)) * weight_attrib.w;
+#ifdef COLOR_USED
+			pcolor += transforms.data[boffset + 3] * weight_attrib.w;
+#endif
+		}
+
+		instance_custom = transforms.data[offset + 4];
+
+#ifdef COLOR_USED
+		color_interp *= pcolor;
+#endif
+
+#else
+		uint stride = 0;
+		{
+			//TODO implement a small lookup table for the stride
+			if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_MULTIMESH_FORMAT_2D)) {
+				stride += 2;
+			} else {
+				stride += 3;
+			}
+			if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_MULTIMESH_HAS_COLOR)) {
+				stride += 1;
+			}
+			if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_MULTIMESH_HAS_CUSTOM_DATA)) {
+				stride += 1;
+			}
+		}
+
+		uint offset = stride * gl_InstanceIndex;
+
 		if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_MULTIMESH_FORMAT_2D)) {
 			matrix = mat4(transforms.data[offset + 0], transforms.data[offset + 1], vec4(0.0, 0.0, 1.0, 0.0), vec4(0.0, 0.0, 0.0, 1.0));
 			offset += 2;
@@ -152,6 +210,7 @@ void main() {
 			instance_custom = transforms.data[offset];
 		}
 
+#endif
 		//transpose
 		matrix = transpose(matrix);
 		world_matrix = world_matrix * matrix;
@@ -169,32 +228,6 @@ void main() {
 	vec3 binormal = normalize(cross(normal, tangent) * binormalf);
 #endif
 
-#if 0
-	if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_SKELETON)) {
-		//multimesh, instances are for it
-
-		uvec2 bones_01 = uvec2(bone_attrib.x & 0xFFFF, bone_attrib.x >> 16) * 3;
-		uvec2 bones_23 = uvec2(bone_attrib.y & 0xFFFF, bone_attrib.y >> 16) * 3;
-		vec2 weights_01 = unpackUnorm2x16(bone_attrib.z);
-		vec2 weights_23 = unpackUnorm2x16(bone_attrib.w);
-
-		mat4 m = mat4(transforms.data[bones_01.x], transforms.data[bones_01.x + 1], transforms.data[bones_01.x + 2], vec4(0.0, 0.0, 0.0, 1.0)) * weights_01.x;
-		m += mat4(transforms.data[bones_01.y], transforms.data[bones_01.y + 1], transforms.data[bones_01.y + 2], vec4(0.0, 0.0, 0.0, 1.0)) * weights_01.y;
-		m += mat4(transforms.data[bones_23.x], transforms.data[bones_23.x + 1], transforms.data[bones_23.x + 2], vec4(0.0, 0.0, 0.0, 1.0)) * weights_23.x;
-		m += mat4(transforms.data[bones_23.y], transforms.data[bones_23.y + 1], transforms.data[bones_23.y + 2], vec4(0.0, 0.0, 0.0, 1.0)) * weights_23.y;
-
-		//reverse order because its transposed
-		vertex = (vec4(vertex, 1.0) * m).xyz;
-		normal = (vec4(normal, 0.0) * m).xyz;
-
-#if defined(TANGENT_USED) || defined(NORMAL_MAP_USED) || defined(LIGHT_ANISOTROPY_USED)
-
-		tangent = (vec4(tangent, 0.0) * m).xyz;
-		binormal = (vec4(binormal, 0.0) * m).xyz;
-#endif
-	}
-#endif
-
 #ifdef UV_USED
 	uv_interp = uv_attrib;
 #endif
@@ -230,11 +263,7 @@ void main() {
 	mat3 modelview_normal = mat3(scene_data.inv_camera_matrix) * world_normal_matrix;
 
 	{
-		/* clang-format off */
-
-VERTEX_SHADER_CODE
-
-		/* clang-format on */
+#CODE : VERTEX
 	}
 
 // using local coordinates (default)
@@ -325,7 +354,7 @@ VERTEX_SHADER_CODE
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #include "scene_forward_clustered_inc.glsl"
 
@@ -372,19 +401,15 @@ layout(location = 9) in flat uint instance_index;
 #define LIGHT_TRANSMITTANCE_USED
 #endif
 
-#ifdef USE_MATERIAL_UNIFORMS
+#ifdef MATERIAL_UNIFORMS_USED
 layout(set = MATERIAL_UNIFORM_SET, binding = 0, std140) uniform MaterialUniforms{
-	/* clang-format off */
-MATERIAL_UNIFORMS
-	/* clang-format on */
-} material;
-#endif
 
-/* clang-format off */
+#MATERIAL_UNIFORMS
 
-FRAGMENT_SHADER_GLOBALS
+} material;
+#endif
 
-/* clang-format on */
+#GLOBALS
 
 #ifdef MODE_RENDER_DEPTH
 
@@ -396,7 +421,7 @@ layout(location = 2) out vec4 orm_output_buffer;
 layout(location = 3) out vec4 emission_output_buffer;
 layout(location = 4) out float depth_output_buffer;
 
-#endif
+#endif // MODE_RENDER_MATERIAL
 
 #ifdef MODE_RENDER_NORMAL_ROUGHNESS
 layout(location = 0) out vec4 normal_roughness_output_buffer;
@@ -415,1319 +440,19 @@ layout(location = 1) out vec4 specular_buffer; //specular and SSS (subsurface sc
 #else
 
 layout(location = 0) out vec4 frag_color;
-#endif
+#endif // MODE_MULTIPLE_RENDER_TARGETS
 
 #endif // RENDER DEPTH
 
-#ifdef ALPHA_HASH_USED
-
-float hash_2d(vec2 p) {
-	return fract(1.0e4 * sin(17.0 * p.x + 0.1 * p.y) *
-				 (0.1 + abs(sin(13.0 * p.y + p.x))));
-}
-
-float hash_3d(vec3 p) {
-	return hash_2d(vec2(hash_2d(p.xy), p.z));
-}
-
-float compute_alpha_hash_threshold(vec3 pos, float hash_scale) {
-	vec3 dx = dFdx(pos);
-	vec3 dy = dFdx(pos);
-	float delta_max_sqr = max(length(dx), length(dy));
-	float pix_scale = 1.0 / (hash_scale * delta_max_sqr);
-
-	vec2 pix_scales =
-			vec2(exp2(floor(log2(pix_scale))), exp2(ceil(log2(pix_scale))));
-
-	vec2 a_thresh = vec2(hash_3d(floor(pix_scales.x * pos.xyz)),
-			hash_3d(floor(pix_scales.y * pos.xyz)));
-
-	float lerp_factor = fract(log2(pix_scale));
-
-	float a_interp = (1.0 - lerp_factor) * a_thresh.x + lerp_factor * a_thresh.y;
-
-	float min_lerp = min(lerp_factor, 1.0 - lerp_factor);
-
-	vec3 cases = vec3(a_interp * a_interp / (2.0 * min_lerp * (1.0 - min_lerp)),
-			(a_interp - 0.5 * min_lerp) / (1.0 - min_lerp),
-			1.0 - ((1.0 - a_interp) * (1.0 - a_interp) /
-						  (2.0 * min_lerp * (1.0 - min_lerp))));
-
-	float alpha_hash_threshold =
-			(lerp_factor < (1.0 - min_lerp)) ? ((lerp_factor < min_lerp) ? cases.x : cases.y) : cases.z;
-
-	return clamp(alpha_hash_threshold, 0.0, 1.0);
-}
-
-#endif // ALPHA_HASH_USED
-
-#ifdef ALPHA_ANTIALIASING_EDGE_USED
-
-float calc_mip_level(vec2 texture_coord) {
-	vec2 dx = dFdx(texture_coord);
-	vec2 dy = dFdy(texture_coord);
-	float delta_max_sqr = max(dot(dx, dx), dot(dy, dy));
-	return max(0.0, 0.5 * log2(delta_max_sqr));
-}
-
-float compute_alpha_antialiasing_edge(float input_alpha, vec2 texture_coord, float alpha_edge) {
-	input_alpha *= 1.0 + max(0, calc_mip_level(texture_coord)) * 0.25; // 0.25 mip scale, magic number
-	input_alpha = (input_alpha - alpha_edge) / max(fwidth(input_alpha), 0.0001) + 0.5;
-	return clamp(input_alpha, 0.0, 1.0);
-}
-
-#endif // ALPHA_ANTIALIASING_USED
-
-// This returns the G_GGX function divided by 2 cos_theta_m, where in practice cos_theta_m is either N.L or N.V.
-// We're dividing this factor off because the overall term we'll end up looks like
-// (see, for example, the first unnumbered equation in B. Burley, "Physically Based Shading at Disney", SIGGRAPH 2012):
-//
-//   F(L.V) D(N.H) G(N.L) G(N.V) / (4 N.L N.V)
-//
-// We're basically regouping this as
-//
-//   F(L.V) D(N.H) [G(N.L)/(2 N.L)] [G(N.V) / (2 N.V)]
-//
-// and thus, this function implements the [G(N.m)/(2 N.m)] part with m = L or V.
-//
-// The contents of the D and G (G1) functions (GGX) are taken from
-// E. Heitz, "Understanding the Masking-Shadowing Function in Microfacet-Based BRDFs", J. Comp. Graph. Tech. 3 (2) (2014).
-// Eqns 71-72 and 85-86 (see also Eqns 43 and 80).
+#include "scene_forward_aa_inc.glsl"
 
 #if !defined(MODE_RENDER_DEPTH) && !defined(MODE_UNSHADED)
 
-float G_GGX_2cos(float cos_theta_m, float alpha) {
-	// Schlick's approximation
-	// C. Schlick, "An Inexpensive BRDF Model for Physically-based Rendering", Computer Graphics Forum. 13 (3): 233 (1994)
-	// Eq. (19), although see Heitz (2014) the about the problems with his derivation.
-	// It nevertheless approximates GGX well with k = alpha/2.
-	float k = 0.5 * alpha;
-	return 0.5 / (cos_theta_m * (1.0 - k) + k);
-
-	// float cos2 = cos_theta_m * cos_theta_m;
-	// float sin2 = (1.0 - cos2);
-	// return 1.0 / (cos_theta_m + sqrt(cos2 + alpha * alpha * sin2));
-}
-
-float D_GGX(float cos_theta_m, float alpha) {
-	float alpha2 = alpha * alpha;
-	float d = 1.0 + (alpha2 - 1.0) * cos_theta_m * cos_theta_m;
-	return alpha2 / (M_PI * d * d);
-}
-
-float G_GGX_anisotropic_2cos(float cos_theta_m, float alpha_x, float alpha_y, float cos_phi, float sin_phi) {
-	float cos2 = cos_theta_m * cos_theta_m;
-	float sin2 = (1.0 - cos2);
-	float s_x = alpha_x * cos_phi;
-	float s_y = alpha_y * sin_phi;
-	return 1.0 / max(cos_theta_m + sqrt(cos2 + (s_x * s_x + s_y * s_y) * sin2), 0.001);
-}
-
-float D_GGX_anisotropic(float cos_theta_m, float alpha_x, float alpha_y, float cos_phi, float sin_phi) {
-	float cos2 = cos_theta_m * cos_theta_m;
-	float sin2 = (1.0 - cos2);
-	float r_x = cos_phi / alpha_x;
-	float r_y = sin_phi / alpha_y;
-	float d = cos2 + sin2 * (r_x * r_x + r_y * r_y);
-	return 1.0 / max(M_PI * alpha_x * alpha_y * d * d, 0.001);
-}
-
-float SchlickFresnel(float u) {
-	float m = 1.0 - u;
-	float m2 = m * m;
-	return m2 * m2 * m; // pow(m,5)
-}
-
-float GTR1(float NdotH, float a) {
-	if (a >= 1.0)
-		return 1.0 / M_PI;
-	float a2 = a * a;
-	float t = 1.0 + (a2 - 1.0) * NdotH * NdotH;
-	return (a2 - 1.0) / (M_PI * log(a2) * t);
-}
-
-vec3 F0(float metallic, float specular, vec3 albedo) {
-	float dielectric = 0.16 * specular * specular;
-	// use albedo * metallic as colored specular reflectance at 0 angle for metallic materials;
-	// see https://google.github.io/filament/Filament.md.html
-	return mix(vec3(dielectric), albedo, vec3(metallic));
-}
-
-void light_compute(vec3 N, vec3 L, vec3 V, vec3 light_color, float attenuation, vec3 f0, uint orms, float specular_amount,
-#ifdef LIGHT_BACKLIGHT_USED
-		vec3 backlight,
-#endif
-#ifdef LIGHT_TRANSMITTANCE_USED
-		vec4 transmittance_color,
-		float transmittance_depth,
-		float transmittance_curve,
-		float transmittance_boost,
-		float transmittance_z,
-#endif
-#ifdef LIGHT_RIM_USED
-		float rim, float rim_tint, vec3 rim_color,
-#endif
-#ifdef LIGHT_CLEARCOAT_USED
-		float clearcoat, float clearcoat_gloss,
-#endif
-#ifdef LIGHT_ANISOTROPY_USED
-		vec3 B, vec3 T, float anisotropy,
-#endif
-#ifdef USE_SOFT_SHADOWS
-		float A,
-#endif
-#ifdef USE_SHADOW_TO_OPACITY
-		inout float alpha,
-#endif
-		inout vec3 diffuse_light, inout vec3 specular_light) {
-
-#if defined(USE_LIGHT_SHADER_CODE)
-	// light is written by the light shader
-
-	vec3 normal = N;
-	vec3 light = L;
-	vec3 view = V;
-
-	/* clang-format off */
-
-LIGHT_SHADER_CODE
-
-	/* clang-format on */
-
-#else
-
-#ifdef USE_SOFT_SHADOWS
-	float NdotL = min(A + dot(N, L), 1.0);
-#else
-	float NdotL = dot(N, L);
-#endif
-	float cNdotL = max(NdotL, 0.0); // clamped NdotL
-	float NdotV = dot(N, V);
-	float cNdotV = max(NdotV, 0.0);
-
-#if defined(DIFFUSE_BURLEY) || defined(SPECULAR_BLINN) || defined(SPECULAR_SCHLICK_GGX) || defined(LIGHT_CLEARCOAT_USED)
-	vec3 H = normalize(V + L);
-#endif
-
-#if defined(SPECULAR_BLINN) || defined(SPECULAR_SCHLICK_GGX) || defined(LIGHT_CLEARCOAT_USED)
-#ifdef USE_SOFT_SHADOWS
-	float cNdotH = clamp(A + dot(N, H), 0.0, 1.0);
-#else
-	float cNdotH = clamp(dot(N, H), 0.0, 1.0);
-#endif
-#endif
-
-#if defined(DIFFUSE_BURLEY) || defined(SPECULAR_SCHLICK_GGX) || defined(LIGHT_CLEARCOAT_USED)
-#ifdef USE_SOFT_SHADOWS
-	float cLdotH = clamp(A + dot(L, H), 0.0, 1.0);
-#else
-	float cLdotH = clamp(dot(L, H), 0.0, 1.0);
-#endif
-#endif
-
-	float metallic = unpackUnorm4x8(orms).z;
-	if (metallic < 1.0) {
-		float roughness = unpackUnorm4x8(orms).y;
-
-#if defined(DIFFUSE_OREN_NAYAR)
-		vec3 diffuse_brdf_NL;
-#else
-		float diffuse_brdf_NL; // BRDF times N.L for calculating diffuse radiance
-#endif
-
-#if defined(DIFFUSE_LAMBERT_WRAP)
-		// energy conserving lambert wrap shader
-		diffuse_brdf_NL = max(0.0, (NdotL + roughness) / ((1.0 + roughness) * (1.0 + roughness)));
-#elif defined(DIFFUSE_TOON)
-
-		diffuse_brdf_NL = smoothstep(-roughness, max(roughness, 0.01), NdotL);
-
-#elif defined(DIFFUSE_BURLEY)
-
-		{
-			float FD90_minus_1 = 2.0 * cLdotH * cLdotH * roughness - 0.5;
-			float FdV = 1.0 + FD90_minus_1 * SchlickFresnel(cNdotV);
-			float FdL = 1.0 + FD90_minus_1 * SchlickFresnel(cNdotL);
-			diffuse_brdf_NL = (1.0 / M_PI) * FdV * FdL * cNdotL;
-			/*
-			float energyBias = mix(roughness, 0.0, 0.5);
-			float energyFactor = mix(roughness, 1.0, 1.0 / 1.51);
-			float fd90 = energyBias + 2.0 * VoH * VoH * roughness;
-			float f0 = 1.0;
-			float lightScatter = f0 + (fd90 - f0) * pow(1.0 - cNdotL, 5.0);
-			float viewScatter = f0 + (fd90 - f0) * pow(1.0 - cNdotV, 5.0);
-
-			diffuse_brdf_NL = lightScatter * viewScatter * energyFactor;
-			*/
-		}
-#else
-		// lambert
-		diffuse_brdf_NL = cNdotL * (1.0 / M_PI);
-#endif
-
-		diffuse_light += light_color * diffuse_brdf_NL * attenuation;
-
-#if defined(LIGHT_BACKLIGHT_USED)
-		diffuse_light += light_color * (vec3(1.0 / M_PI) - diffuse_brdf_NL) * backlight * attenuation;
-#endif
-
-#if defined(LIGHT_RIM_USED)
-		float rim_light = pow(max(0.0, 1.0 - cNdotV), max(0.0, (1.0 - roughness) * 16.0));
-		diffuse_light += rim_light * rim * mix(vec3(1.0), rim_color, rim_tint) * light_color;
-#endif
-
-#ifdef LIGHT_TRANSMITTANCE_USED
-
-#ifdef SSS_MODE_SKIN
-
-		{
-			float scale = 8.25 / transmittance_depth;
-			float d = scale * abs(transmittance_z);
-			float dd = -d * d;
-			vec3 profile = vec3(0.233, 0.455, 0.649) * exp(dd / 0.0064) +
-						   vec3(0.1, 0.336, 0.344) * exp(dd / 0.0484) +
-						   vec3(0.118, 0.198, 0.0) * exp(dd / 0.187) +
-						   vec3(0.113, 0.007, 0.007) * exp(dd / 0.567) +
-						   vec3(0.358, 0.004, 0.0) * exp(dd / 1.99) +
-						   vec3(0.078, 0.0, 0.0) * exp(dd / 7.41);
-
-			diffuse_light += profile * transmittance_color.a * light_color * clamp(transmittance_boost - NdotL, 0.0, 1.0) * (1.0 / M_PI);
-		}
-#else
-
-		if (transmittance_depth > 0.0) {
-			float fade = clamp(abs(transmittance_z / transmittance_depth), 0.0, 1.0);
-
-			fade = pow(max(0.0, 1.0 - fade), transmittance_curve);
-			fade *= clamp(transmittance_boost - NdotL, 0.0, 1.0);
-
-			diffuse_light += transmittance_color.rgb * light_color * (1.0 / M_PI) * transmittance_color.a * fade;
-		}
-
-#endif //SSS_MODE_SKIN
-
-#endif //LIGHT_TRANSMITTANCE_USED
-	}
-
-	float roughness = unpackUnorm4x8(orms).y;
-	if (roughness > 0.0) { // FIXME: roughness == 0 should not disable specular light entirely
-
-		// D
-
-#if defined(SPECULAR_BLINN)
-
-		//normalized blinn
-		float shininess = exp2(15.0 * (1.0 - roughness) + 1.0) * 0.25;
-		float blinn = pow(cNdotH, shininess) * cNdotL;
-		blinn *= (shininess + 8.0) * (1.0 / (8.0 * M_PI));
-		float intensity = blinn;
-
-		specular_light += light_color * intensity * attenuation * specular_amount;
-
-#elif defined(SPECULAR_PHONG)
-
-		vec3 R = normalize(-reflect(L, N));
-		float cRdotV = clamp(A + dot(R, V), 0.0, 1.0);
-		float shininess = exp2(15.0 * (1.0 - roughness) + 1.0) * 0.25;
-		float phong = pow(cRdotV, shininess);
-		phong *= (shininess + 8.0) * (1.0 / (8.0 * M_PI));
-		float intensity = (phong) / max(4.0 * cNdotV * cNdotL, 0.75);
-
-		specular_light += light_color * intensity * attenuation * specular_amount;
-
-#elif defined(SPECULAR_TOON)
-
-		vec3 R = normalize(-reflect(L, N));
-		float RdotV = dot(R, V);
-		float mid = 1.0 - roughness;
-		mid *= mid;
-		float intensity = smoothstep(mid - roughness * 0.5, mid + roughness * 0.5, RdotV) * mid;
-		diffuse_light += light_color * intensity * attenuation * specular_amount; // write to diffuse_light, as in toon shading you generally want no reflection
-
-#elif defined(SPECULAR_DISABLED)
-		// none..
-
-#elif defined(SPECULAR_SCHLICK_GGX)
-		// shlick+ggx as default
-
-#if defined(LIGHT_ANISOTROPY_USED)
-
-		float alpha_ggx = roughness * roughness;
-		float aspect = sqrt(1.0 - anisotropy * 0.9);
-		float ax = alpha_ggx / aspect;
-		float ay = alpha_ggx * aspect;
-		float XdotH = dot(T, H);
-		float YdotH = dot(B, H);
-		float D = D_GGX_anisotropic(cNdotH, ax, ay, XdotH, YdotH);
-		float G = G_GGX_anisotropic_2cos(cNdotL, ax, ay, XdotH, YdotH) * G_GGX_anisotropic_2cos(cNdotV, ax, ay, XdotH, YdotH);
-
-#else
-		float alpha_ggx = roughness * roughness;
-		float D = D_GGX(cNdotH, alpha_ggx);
-		float G = G_GGX_2cos(cNdotL, alpha_ggx) * G_GGX_2cos(cNdotV, alpha_ggx);
-#endif
-		// F
-		float cLdotH5 = SchlickFresnel(cLdotH);
-		vec3 F = mix(vec3(cLdotH5), vec3(1.0), f0);
-
-		vec3 specular_brdf_NL = cNdotL * D * F * G;
-
-		specular_light += specular_brdf_NL * light_color * attenuation * specular_amount;
-#endif
-
-#if defined(LIGHT_CLEARCOAT_USED)
-
-#if !defined(SPECULAR_SCHLICK_GGX)
-		float cLdotH5 = SchlickFresnel(cLdotH);
-#endif
-		float Dr = GTR1(cNdotH, mix(.1, .001, clearcoat_gloss));
-		float Fr = mix(.04, 1.0, cLdotH5);
-		float Gr = G_GGX_2cos(cNdotL, .25) * G_GGX_2cos(cNdotV, .25);
-
-		float clearcoat_specular_brdf_NL = 0.25 * clearcoat * Gr * Fr * Dr * cNdotL;
-
-		specular_light += clearcoat_specular_brdf_NL * light_color * attenuation * specular_amount;
-#endif
-	}
-
-#ifdef USE_SHADOW_TO_OPACITY
-	alpha = min(alpha, clamp(1.0 - attenuation), 0.0, 1.0));
-#endif
-
-#endif //defined(USE_LIGHT_SHADER_CODE)
-}
-
-#ifndef USE_NO_SHADOWS
-
-// Interleaved Gradient Noise
-// http://www.iryoku.com/next-generation-post-processing-in-call-of-duty-advanced-warfare
-float quick_hash(vec2 pos) {
-	const vec3 magic = vec3(0.06711056f, 0.00583715f, 52.9829189f);
-	return fract(magic.z * fract(dot(pos, magic.xy)));
-}
-
-float sample_directional_pcf_shadow(texture2D shadow, vec2 shadow_pixel_size, vec4 coord) {
-	vec2 pos = coord.xy;
-	float depth = coord.z;
-
-	//if only one sample is taken, take it from the center
-	if (scene_data.directional_soft_shadow_samples == 1) {
-		return textureProj(sampler2DShadow(shadow, shadow_sampler), vec4(pos, depth, 1.0));
-	}
-
-	mat2 disk_rotation;
-	{
-		float r = quick_hash(gl_FragCoord.xy) * 2.0 * M_PI;
-		float sr = sin(r);
-		float cr = cos(r);
-		disk_rotation = mat2(vec2(cr, -sr), vec2(sr, cr));
-	}
-
-	float avg = 0.0;
-
-	for (uint i = 0; i < scene_data.directional_soft_shadow_samples; i++) {
-		avg += textureProj(sampler2DShadow(shadow, shadow_sampler), vec4(pos + shadow_pixel_size * (disk_rotation * scene_data.directional_soft_shadow_kernel[i].xy), depth, 1.0));
-	}
-
-	return avg * (1.0 / float(scene_data.directional_soft_shadow_samples));
-}
-
-float sample_pcf_shadow(texture2D shadow, vec2 shadow_pixel_size, vec4 coord) {
-	vec2 pos = coord.xy;
-	float depth = coord.z;
-
-	//if only one sample is taken, take it from the center
-	if (scene_data.soft_shadow_samples == 1) {
-		return textureProj(sampler2DShadow(shadow, shadow_sampler), vec4(pos, depth, 1.0));
-	}
-
-	mat2 disk_rotation;
-	{
-		float r = quick_hash(gl_FragCoord.xy) * 2.0 * M_PI;
-		float sr = sin(r);
-		float cr = cos(r);
-		disk_rotation = mat2(vec2(cr, -sr), vec2(sr, cr));
-	}
-
-	float avg = 0.0;
-
-	for (uint i = 0; i < scene_data.soft_shadow_samples; i++) {
-		avg += textureProj(sampler2DShadow(shadow, shadow_sampler), vec4(pos + shadow_pixel_size * (disk_rotation * scene_data.soft_shadow_kernel[i].xy), depth, 1.0));
-	}
-
-	return avg * (1.0 / float(scene_data.soft_shadow_samples));
-}
-
-float sample_directional_soft_shadow(texture2D shadow, vec3 pssm_coord, vec2 tex_scale) {
-	//find blocker
-	float blocker_count = 0.0;
-	float blocker_average = 0.0;
-
-	mat2 disk_rotation;
-	{
-		float r = quick_hash(gl_FragCoord.xy) * 2.0 * M_PI;
-		float sr = sin(r);
-		float cr = cos(r);
-		disk_rotation = mat2(vec2(cr, -sr), vec2(sr, cr));
-	}
-
-	for (uint i = 0; i < scene_data.directional_penumbra_shadow_samples; i++) {
-		vec2 suv = pssm_coord.xy + (disk_rotation * scene_data.directional_penumbra_shadow_kernel[i].xy) * tex_scale;
-		float d = textureLod(sampler2D(shadow, material_samplers[SAMPLER_LINEAR_CLAMP]), suv, 0.0).r;
-		if (d < pssm_coord.z) {
-			blocker_average += d;
-			blocker_count += 1.0;
-		}
-	}
-
-	if (blocker_count > 0.0) {
-		//blockers found, do soft shadow
-		blocker_average /= blocker_count;
-		float penumbra = (pssm_coord.z - blocker_average) / blocker_average;
-		tex_scale *= penumbra;
-
-		float s = 0.0;
-		for (uint i = 0; i < scene_data.directional_penumbra_shadow_samples; i++) {
-			vec2 suv = pssm_coord.xy + (disk_rotation * scene_data.directional_penumbra_shadow_kernel[i].xy) * tex_scale;
-			s += textureProj(sampler2DShadow(shadow, shadow_sampler), vec4(suv, pssm_coord.z, 1.0));
-		}
-
-		return s / float(scene_data.directional_penumbra_shadow_samples);
-
-	} else {
-		//no blockers found, so no shadow
-		return 1.0;
-	}
-}
-
-#endif //USE_NO_SHADOWS
-
-float get_omni_attenuation(float distance, float inv_range, float decay) {
-	float nd = distance * inv_range;
-	nd *= nd;
-	nd *= nd; // nd^4
-	nd = max(1.0 - nd, 0.0);
-	nd *= nd; // nd^2
-	return nd * pow(max(distance, 0.0001), -decay);
-}
-
-float light_process_omni_shadow(uint idx, vec3 vertex, vec3 normal) {
-#ifndef USE_NO_SHADOWS
-	if (omni_lights.data[idx].shadow_enabled) {
-		// there is a shadowmap
-
-		vec3 light_rel_vec = omni_lights.data[idx].position - vertex;
-		float light_length = length(light_rel_vec);
-
-		vec4 v = vec4(vertex, 1.0);
-
-		vec4 splane = (omni_lights.data[idx].shadow_matrix * v);
-		float shadow_len = length(splane.xyz); //need to remember shadow len from here
-
-		{
-			vec3 nofs = normal_interp * omni_lights.data[idx].shadow_normal_bias / omni_lights.data[idx].inv_radius;
-			nofs *= (1.0 - max(0.0, dot(normalize(light_rel_vec), normalize(normal_interp))));
-			v.xyz += nofs;
-			splane = (omni_lights.data[idx].shadow_matrix * v);
-		}
-
-		float shadow;
-
-#ifdef USE_SOFT_SHADOWS
-		if (omni_lights.data[idx].soft_shadow_size > 0.0) {
-			//soft shadow
-
-			//find blocker
-
-			float blocker_count = 0.0;
-			float blocker_average = 0.0;
-
-			mat2 disk_rotation;
-			{
-				float r = quick_hash(gl_FragCoord.xy) * 2.0 * M_PI;
-				float sr = sin(r);
-				float cr = cos(r);
-				disk_rotation = mat2(vec2(cr, -sr), vec2(sr, cr));
-			}
-
-			vec3 normal = normalize(splane.xyz);
-			vec3 v0 = abs(normal.z) < 0.999 ? vec3(0.0, 0.0, 1.0) : vec3(0.0, 1.0, 0.0);
-			vec3 tangent = normalize(cross(v0, normal));
-			vec3 bitangent = normalize(cross(tangent, normal));
-			float z_norm = shadow_len * omni_lights.data[idx].inv_radius;
-
-			tangent *= omni_lights.data[idx].soft_shadow_size * omni_lights.data[idx].soft_shadow_scale;
-			bitangent *= omni_lights.data[idx].soft_shadow_size * omni_lights.data[idx].soft_shadow_scale;
-
-			for (uint i = 0; i < scene_data.penumbra_shadow_samples; i++) {
-				vec2 disk = disk_rotation * scene_data.penumbra_shadow_kernel[i].xy;
-
-				vec3 pos = splane.xyz + tangent * disk.x + bitangent * disk.y;
-
-				pos = normalize(pos);
-				vec4 uv_rect = omni_lights.data[idx].atlas_rect;
-
-				if (pos.z >= 0.0) {
-					pos.z += 1.0;
-					uv_rect.y += uv_rect.w;
-				} else {
-					pos.z = 1.0 - pos.z;
-				}
-
-				pos.xy /= pos.z;
-
-				pos.xy = pos.xy * 0.5 + 0.5;
-				pos.xy = uv_rect.xy + pos.xy * uv_rect.zw;
-
-				float d = textureLod(sampler2D(shadow_atlas, material_samplers[SAMPLER_LINEAR_CLAMP]), pos.xy, 0.0).r;
-				if (d < z_norm) {
-					blocker_average += d;
-					blocker_count += 1.0;
-				}
-			}
-
-			if (blocker_count > 0.0) {
-				//blockers found, do soft shadow
-				blocker_average /= blocker_count;
-				float penumbra = (z_norm - blocker_average) / blocker_average;
-				tangent *= penumbra;
-				bitangent *= penumbra;
-
-				z_norm -= omni_lights.data[idx].inv_radius * omni_lights.data[idx].shadow_bias;
-
-				shadow = 0.0;
-				for (uint i = 0; i < scene_data.penumbra_shadow_samples; i++) {
-					vec2 disk = disk_rotation * scene_data.penumbra_shadow_kernel[i].xy;
-					vec3 pos = splane.xyz + tangent * disk.x + bitangent * disk.y;
-
-					pos = normalize(pos);
-					vec4 uv_rect = omni_lights.data[idx].atlas_rect;
-
-					if (pos.z >= 0.0) {
-						pos.z += 1.0;
-						uv_rect.y += uv_rect.w;
-					} else {
-						pos.z = 1.0 - pos.z;
-					}
-
-					pos.xy /= pos.z;
-
-					pos.xy = pos.xy * 0.5 + 0.5;
-					pos.xy = uv_rect.xy + pos.xy * uv_rect.zw;
-					shadow += textureProj(sampler2DShadow(shadow_atlas, shadow_sampler), vec4(pos.xy, z_norm, 1.0));
-				}
-
-				shadow /= float(scene_data.penumbra_shadow_samples);
-
-			} else {
-				//no blockers found, so no shadow
-				shadow = 1.0;
-			}
-		} else {
-#endif
-			splane.xyz = normalize(splane.xyz);
-			vec4 clamp_rect = omni_lights.data[idx].atlas_rect;
-
-			if (splane.z >= 0.0) {
-				splane.z += 1.0;
-
-				clamp_rect.y += clamp_rect.w;
-
-			} else {
-				splane.z = 1.0 - splane.z;
-			}
-
-			splane.xy /= splane.z;
-
-			splane.xy = splane.xy * 0.5 + 0.5;
-			splane.z = (shadow_len - omni_lights.data[idx].shadow_bias) * omni_lights.data[idx].inv_radius;
-			splane.xy = clamp_rect.xy + splane.xy * clamp_rect.zw;
-			splane.w = 1.0; //needed? i think it should be 1 already
-			shadow = sample_pcf_shadow(shadow_atlas, omni_lights.data[idx].soft_shadow_scale * scene_data.shadow_atlas_pixel_size, splane);
-#ifdef USE_SOFT_SHADOWS
-		}
-#endif
-
-		return shadow;
-	}
-#endif
-
-	return 1.0;
-}
-
-void light_process_omni(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 vertex_ddx, vec3 vertex_ddy, vec3 f0, uint orms, float shadow,
-#ifdef LIGHT_BACKLIGHT_USED
-		vec3 backlight,
-#endif
-#ifdef LIGHT_TRANSMITTANCE_USED
-		vec4 transmittance_color,
-		float transmittance_depth,
-		float transmittance_curve,
-		float transmittance_boost,
-#endif
-#ifdef LIGHT_RIM_USED
-		float rim, float rim_tint, vec3 rim_color,
-#endif
-#ifdef LIGHT_CLEARCOAT_USED
-		float clearcoat, float clearcoat_gloss,
-#endif
-#ifdef LIGHT_ANISOTROPY_USED
-		vec3 binormal, vec3 tangent, float anisotropy,
-#endif
-#ifdef USE_SHADOW_TO_OPACITY
-		inout float alpha,
-#endif
-		inout vec3 diffuse_light, inout vec3 specular_light) {
-	vec3 light_rel_vec = omni_lights.data[idx].position - vertex;
-	float light_length = length(light_rel_vec);
-	float omni_attenuation = get_omni_attenuation(light_length, omni_lights.data[idx].inv_radius, omni_lights.data[idx].attenuation);
-	float light_attenuation = omni_attenuation;
-	vec3 color = omni_lights.data[idx].color;
-
-#ifdef USE_SOFT_SHADOWS
-	float size_A = 0.0;
-
-	if (omni_lights.data[idx].size > 0.0) {
-		float t = omni_lights.data[idx].size / max(0.001, light_length);
-		size_A = max(0.0, 1.0 - 1 / sqrt(1 + t * t));
-	}
-#endif
-
-#ifdef LIGHT_TRANSMITTANCE_USED
-	float transmittance_z = transmittance_depth; //no transmittance by default
-	transmittance_color.a *= light_attenuation;
-	{
-		vec4 clamp_rect = omni_lights.data[idx].atlas_rect;
-
-		//redo shadowmapping, but shrink the model a bit to avoid arctifacts
-		vec4 splane = (omni_lights.data[idx].shadow_matrix * vec4(vertex - normalize(normal_interp) * omni_lights.data[idx].transmittance_bias, 1.0));
-
-		shadow_len = length(splane.xyz);
-		splane = normalize(splane.xyz);
-
-		if (splane.z >= 0.0) {
-			splane.z += 1.0;
-
-		} else {
-			splane.z = 1.0 - splane.z;
-		}
-
-		splane.xy /= splane.z;
-		splane.xy = splane.xy * 0.5 + 0.5;
-		splane.z = shadow_len * omni_lights.data[idx].inv_radius;
-		splane.xy = clamp_rect.xy + splane.xy * clamp_rect.zw;
-		splane.w = 1.0; //needed? i think it should be 1 already
-
-		float shadow_z = textureLod(sampler2D(shadow_atlas, material_samplers[SAMPLER_LINEAR_CLAMP]), splane.xy, 0.0).r;
-		transmittance_z = (splane.z - shadow_z) / omni_lights.data[idx].inv_radius;
-	}
-#endif
-
-#if 0
-
-	if (omni_lights.data[idx].projector_rect != vec4(0.0)) {
-		vec3 local_v = (omni_lights.data[idx].shadow_matrix * vec4(vertex, 1.0)).xyz;
-		local_v = normalize(local_v);
-
-		vec4 atlas_rect = omni_lights.data[idx].projector_rect;
-
-		if (local_v.z >= 0.0) {
-			local_v.z += 1.0;
-			atlas_rect.y += atlas_rect.w;
-
-		} else {
-			local_v.z = 1.0 - local_v.z;
-		}
-
-		local_v.xy /= local_v.z;
-		local_v.xy = local_v.xy * 0.5 + 0.5;
-		vec2 proj_uv = local_v.xy * atlas_rect.zw;
-
-		vec2 proj_uv_ddx;
-		vec2 proj_uv_ddy;
-		{
-			vec3 local_v_ddx = (omni_lights.data[idx].shadow_matrix * vec4(vertex + vertex_ddx, 1.0)).xyz;
-			local_v_ddx = normalize(local_v_ddx);
-
-			if (local_v_ddx.z >= 0.0) {
-				local_v_ddx.z += 1.0;
-			} else {
-				local_v_ddx.z = 1.0 - local_v_ddx.z;
-			}
-
-			local_v_ddx.xy /= local_v_ddx.z;
-			local_v_ddx.xy = local_v_ddx.xy * 0.5 + 0.5;
-
-			proj_uv_ddx = local_v_ddx.xy * atlas_rect.zw - proj_uv;
-
-			vec3 local_v_ddy = (omni_lights.data[idx].shadow_matrix * vec4(vertex + vertex_ddy, 1.0)).xyz;
-			local_v_ddy = normalize(local_v_ddy);
-
-			if (local_v_ddy.z >= 0.0) {
-				local_v_ddy.z += 1.0;
-			} else {
-				local_v_ddy.z = 1.0 - local_v_ddy.z;
-			}
-
-			local_v_ddy.xy /= local_v_ddy.z;
-			local_v_ddy.xy = local_v_ddy.xy * 0.5 + 0.5;
-
-			proj_uv_ddy = local_v_ddy.xy * atlas_rect.zw - proj_uv;
-		}
-
-		vec4 proj = textureGrad(sampler2D(decal_atlas_srgb, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), proj_uv + atlas_rect.xy, proj_uv_ddx, proj_uv_ddy);
-		no_shadow = mix(no_shadow, proj.rgb, proj.a);
-	}
-#endif
-
-	light_attenuation *= shadow;
-
-	light_compute(normal, normalize(light_rel_vec), eye_vec, color, light_attenuation, f0, orms, omni_lights.data[idx].specular_amount,
-#ifdef LIGHT_BACKLIGHT_USED
-			backlight,
-#endif
-#ifdef LIGHT_TRANSMITTANCE_USED
-			transmittance_color,
-			transmittance_depth,
-			transmittance_curve,
-			transmittance_boost,
-			transmittance_z,
-#endif
-#ifdef LIGHT_RIM_USED
-			rim * omni_attenuation, rim_tint, rim_color,
-#endif
-#ifdef LIGHT_CLEARCOAT_USED
-			clearcoat, clearcoat_gloss,
-#endif
-#ifdef LIGHT_ANISOTROPY_USED
-			binormal, tangent, anisotropy,
-#endif
-#ifdef USE_SOFT_SHADOWS
-			size_A,
-#endif
-#ifdef USE_SHADOW_TO_OPACITY
-			alpha,
-#endif
-			diffuse_light,
-			specular_light);
-}
-
-float light_process_spot_shadow(uint idx, vec3 vertex, vec3 normal) {
-#ifndef USE_NO_SHADOWS
-	if (spot_lights.data[idx].shadow_enabled) {
-		vec3 light_rel_vec = spot_lights.data[idx].position - vertex;
-		float light_length = length(light_rel_vec);
-		vec3 spot_dir = spot_lights.data[idx].direction;
-		//there is a shadowmap
-		vec4 v = vec4(vertex, 1.0);
-
-		v.xyz -= spot_dir * spot_lights.data[idx].shadow_bias;
-
-		float z_norm = dot(spot_dir, -light_rel_vec) * spot_lights.data[idx].inv_radius;
-
-		float depth_bias_scale = 1.0 / (max(0.0001, z_norm)); //the closer to the light origin, the more you have to offset to reach 1px in the map
-		vec3 normal_bias = normalize(normal_interp) * (1.0 - max(0.0, dot(spot_dir, -normalize(normal_interp)))) * spot_lights.data[idx].shadow_normal_bias * depth_bias_scale;
-		normal_bias -= spot_dir * dot(spot_dir, normal_bias); //only XY, no Z
-		v.xyz += normal_bias;
-
-		//adjust with bias
-		z_norm = dot(spot_dir, v.xyz - spot_lights.data[idx].position) * spot_lights.data[idx].inv_radius;
-
-		float shadow;
-
-		vec4 splane = (spot_lights.data[idx].shadow_matrix * v);
-		splane /= splane.w;
-
-#ifdef USE_SOFT_SHADOWS
-		if (spot_lights.data[idx].soft_shadow_size > 0.0) {
-			//soft shadow
-
-			//find blocker
-
-			vec2 shadow_uv = splane.xy * spot_lights.data[idx].atlas_rect.zw + spot_lights.data[idx].atlas_rect.xy;
-
-			float blocker_count = 0.0;
-			float blocker_average = 0.0;
-
-			mat2 disk_rotation;
-			{
-				float r = quick_hash(gl_FragCoord.xy) * 2.0 * M_PI;
-				float sr = sin(r);
-				float cr = cos(r);
-				disk_rotation = mat2(vec2(cr, -sr), vec2(sr, cr));
-			}
-
-			float uv_size = spot_lights.data[idx].soft_shadow_size * z_norm * spot_lights.data[idx].soft_shadow_scale;
-			vec2 clamp_max = spot_lights.data[idx].atlas_rect.xy + spot_lights.data[idx].atlas_rect.zw;
-			for (uint i = 0; i < scene_data.penumbra_shadow_samples; i++) {
-				vec2 suv = shadow_uv + (disk_rotation * scene_data.penumbra_shadow_kernel[i].xy) * uv_size;
-				suv = clamp(suv, spot_lights.data[idx].atlas_rect.xy, clamp_max);
-				float d = textureLod(sampler2D(shadow_atlas, material_samplers[SAMPLER_LINEAR_CLAMP]), suv, 0.0).r;
-				if (d < z_norm) {
-					blocker_average += d;
-					blocker_count += 1.0;
-				}
-			}
-
-			if (blocker_count > 0.0) {
-				//blockers found, do soft shadow
-				blocker_average /= blocker_count;
-				float penumbra = (z_norm - blocker_average) / blocker_average;
-				uv_size *= penumbra;
-
-				shadow = 0.0;
-				for (uint i = 0; i < scene_data.penumbra_shadow_samples; i++) {
-					vec2 suv = shadow_uv + (disk_rotation * scene_data.penumbra_shadow_kernel[i].xy) * uv_size;
-					suv = clamp(suv, spot_lights.data[idx].atlas_rect.xy, clamp_max);
-					shadow += textureProj(sampler2DShadow(shadow_atlas, shadow_sampler), vec4(suv, z_norm, 1.0));
-				}
-
-				shadow /= float(scene_data.penumbra_shadow_samples);
-
-			} else {
-				//no blockers found, so no shadow
-				shadow = 1.0;
-			}
-
-		} else {
-#endif
-			//hard shadow
-			vec4 shadow_uv = vec4(splane.xy * spot_lights.data[idx].atlas_rect.zw + spot_lights.data[idx].atlas_rect.xy, splane.z, 1.0);
-
-			shadow = sample_pcf_shadow(shadow_atlas, spot_lights.data[idx].soft_shadow_scale * scene_data.shadow_atlas_pixel_size, shadow_uv);
-#ifdef USE_SOFT_SHADOWS
-		}
-#endif
-
-		return shadow;
-	}
-
-#endif //USE_NO_SHADOWS
-
-	return 1.0;
-}
-
-void light_process_spot(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 vertex_ddx, vec3 vertex_ddy, vec3 f0, uint orms, float shadow,
-#ifdef LIGHT_BACKLIGHT_USED
-		vec3 backlight,
-#endif
-#ifdef LIGHT_TRANSMITTANCE_USED
-		vec4 transmittance_color,
-		float transmittance_depth,
-		float transmittance_curve,
-		float transmittance_boost,
-#endif
-#ifdef LIGHT_RIM_USED
-		float rim, float rim_tint, vec3 rim_color,
-#endif
-#ifdef LIGHT_CLEARCOAT_USED
-		float clearcoat, float clearcoat_gloss,
-#endif
-#ifdef LIGHT_ANISOTROPY_USED
-		vec3 binormal, vec3 tangent, float anisotropy,
-#endif
-#ifdef USE_SHADOW_TO_OPACITY
-		inout float alpha,
-#endif
-		inout vec3 diffuse_light,
-		inout vec3 specular_light) {
-	vec3 light_rel_vec = spot_lights.data[idx].position - vertex;
-	float light_length = length(light_rel_vec);
-	float spot_attenuation = get_omni_attenuation(light_length, spot_lights.data[idx].inv_radius, spot_lights.data[idx].attenuation);
-	vec3 spot_dir = spot_lights.data[idx].direction;
-	float scos = max(dot(-normalize(light_rel_vec), spot_dir), spot_lights.data[idx].cone_angle);
-	float spot_rim = max(0.0001, (1.0 - scos) / (1.0 - spot_lights.data[idx].cone_angle));
-	spot_attenuation *= 1.0 - pow(spot_rim, spot_lights.data[idx].cone_attenuation);
-	float light_attenuation = spot_attenuation;
-	vec3 color = spot_lights.data[idx].color;
-	float specular_amount = spot_lights.data[idx].specular_amount;
-
-#ifdef USE_SOFT_SHADOWS
-	float size_A = 0.0;
-
-	if (spot_lights.data[idx].size > 0.0) {
-		float t = spot_lights.data[idx].size / max(0.001, light_length);
-		size_A = max(0.0, 1.0 - 1 / sqrt(1 + t * t));
-	}
-#endif
-
-	/*
-	if (spot_lights.data[idx].atlas_rect!=vec4(0.0)) {
-		//use projector texture
-	}
-	*/
-
-#ifdef LIGHT_TRANSMITTANCE_USED
-	float transmittance_z = transmittance_depth;
-	transmittance_color.a *= light_attenuation;
-	{
-		splane = (spot_lights.data[idx].shadow_matrix * vec4(vertex - normalize(normal_interp) * spot_lights.data[idx].transmittance_bias, 1.0));
-		splane /= splane.w;
-		splane.xy = splane.xy * spot_lights.data[idx].atlas_rect.zw + spot_lights.data[idx].atlas_rect.xy;
-
-		float shadow_z = textureLod(sampler2D(shadow_atlas, material_samplers[SAMPLER_LINEAR_CLAMP]), splane.xy, 0.0).r;
-		//reconstruct depth
-		shadow_z /= spot_lights.data[idx].inv_radius;
-		//distance to light plane
-		float z = dot(spot_dir, -light_rel_vec);
-		transmittance_z = z - shadow_z;
-	}
-#endif //LIGHT_TRANSMITTANCE_USED
-
-	light_attenuation *= shadow;
-
-	light_compute(normal, normalize(light_rel_vec), eye_vec, color, light_attenuation, f0, orms, spot_lights.data[idx].specular_amount,
-#ifdef LIGHT_BACKLIGHT_USED
-			backlight,
-#endif
-#ifdef LIGHT_TRANSMITTANCE_USED
-			transmittance_color,
-			transmittance_depth,
-			transmittance_curve,
-			transmittance_boost,
-			transmittance_z,
-#endif
-#ifdef LIGHT_RIM_USED
-			rim * spot_attenuation, rim_tint, rim_color,
-#endif
-#ifdef LIGHT_CLEARCOAT_USED
-			clearcoat, clearcoat_gloss,
-#endif
-#ifdef LIGHT_ANISOTROPY_USED
-			binormal, tangent, anisotropy,
-#endif
-#ifdef USE_SOFT_SHADOW
-			size_A,
-#endif
-#ifdef USE_SHADOW_TO_OPACITY
-			alpha,
-#endif
-			diffuse_light, specular_light);
-}
-
-void reflection_process(uint ref_index, vec3 vertex, vec3 normal, float roughness, vec3 ambient_light, vec3 specular_light, inout vec4 ambient_accum, inout vec4 reflection_accum) {
-	vec3 box_extents = reflections.data[ref_index].box_extents;
-	vec3 local_pos = (reflections.data[ref_index].local_matrix * vec4(vertex, 1.0)).xyz;
-
-	if (any(greaterThan(abs(local_pos), box_extents))) { //out of the reflection box
-		return;
-	}
-
-	vec3 ref_vec = normalize(reflect(vertex, normal));
-
-	vec3 inner_pos = abs(local_pos / box_extents);
-	float blend = max(inner_pos.x, max(inner_pos.y, inner_pos.z));
-	//make blend more rounded
-	blend = mix(length(inner_pos), blend, blend);
-	blend *= blend;
-	blend = max(0.0, 1.0 - blend);
-
-	if (reflections.data[ref_index].intensity > 0.0) { // compute reflection
-
-		vec3 local_ref_vec = (reflections.data[ref_index].local_matrix * vec4(ref_vec, 0.0)).xyz;
-
-		if (reflections.data[ref_index].box_project) { //box project
-
-			vec3 nrdir = normalize(local_ref_vec);
-			vec3 rbmax = (box_extents - local_pos) / nrdir;
-			vec3 rbmin = (-box_extents - local_pos) / nrdir;
-
-			vec3 rbminmax = mix(rbmin, rbmax, greaterThan(nrdir, vec3(0.0, 0.0, 0.0)));
-
-			float fa = min(min(rbminmax.x, rbminmax.y), rbminmax.z);
-			vec3 posonbox = local_pos + nrdir * fa;
-			local_ref_vec = posonbox - reflections.data[ref_index].box_offset;
-		}
-
-		vec4 reflection;
-
-		reflection.rgb = textureLod(samplerCubeArray(reflection_atlas, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), vec4(local_ref_vec, reflections.data[ref_index].index), roughness * MAX_ROUGHNESS_LOD).rgb;
-
-		if (reflections.data[ref_index].exterior) {
-			reflection.rgb = mix(specular_light, reflection.rgb, blend);
-		}
-
-		reflection.rgb *= reflections.data[ref_index].intensity; //intensity
-		reflection.a = blend;
-		reflection.rgb *= reflection.a;
-
-		reflection_accum += reflection;
-	}
-
-	switch (reflections.data[ref_index].ambient_mode) {
-		case REFLECTION_AMBIENT_DISABLED: {
-			//do nothing
-		} break;
-		case REFLECTION_AMBIENT_ENVIRONMENT: {
-			//do nothing
-			vec3 local_amb_vec = (reflections.data[ref_index].local_matrix * vec4(normal, 0.0)).xyz;
-
-			vec4 ambient_out;
-
-			ambient_out.rgb = textureLod(samplerCubeArray(reflection_atlas, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), vec4(local_amb_vec, reflections.data[ref_index].index), MAX_ROUGHNESS_LOD).rgb;
-			ambient_out.a = blend;
-			if (reflections.data[ref_index].exterior) {
-				ambient_out.rgb = mix(ambient_light, ambient_out.rgb, blend);
-			}
-
-			ambient_out.rgb *= ambient_out.a;
-			ambient_accum += ambient_out;
-		} break;
-		case REFLECTION_AMBIENT_COLOR: {
-			vec4 ambient_out;
-			ambient_out.a = blend;
-			ambient_out.rgb = reflections.data[ref_index].ambient;
-			if (reflections.data[ref_index].exterior) {
-				ambient_out.rgb = mix(ambient_light, ambient_out.rgb, blend);
-			}
-			ambient_out.rgb *= ambient_out.a;
-			ambient_accum += ambient_out;
-		} break;
-	}
-}
+#include "scene_forward_lights_inc.glsl"
 
 #ifdef USE_FORWARD_GI
 
-//standard voxel cone trace
-vec4 voxel_cone_trace(texture3D probe, vec3 cell_size, vec3 pos, vec3 direction, float tan_half_angle, float max_distance, float p_bias) {
-	float dist = p_bias;
-	vec4 color = vec4(0.0);
-
-	while (dist < max_distance && color.a < 0.95) {
-		float diameter = max(1.0, 2.0 * tan_half_angle * dist);
-		vec3 uvw_pos = (pos + dist * direction) * cell_size;
-		float half_diameter = diameter * 0.5;
-		//check if outside, then break
-		if (any(greaterThan(abs(uvw_pos - 0.5), vec3(0.5f + half_diameter * cell_size)))) {
-			break;
-		}
-		vec4 scolor = textureLod(sampler3D(probe, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uvw_pos, log2(diameter));
-		float a = (1.0 - color.a);
-		color += a * scolor;
-		dist += half_diameter;
-	}
-
-	return color;
-}
-
-vec4 voxel_cone_trace_45_degrees(texture3D probe, vec3 cell_size, vec3 pos, vec3 direction, float tan_half_angle, float max_distance, float p_bias) {
-	float dist = p_bias;
-	vec4 color = vec4(0.0);
-	float radius = max(0.5, tan_half_angle * dist);
-	float lod_level = log2(radius * 2.0);
-
-	while (dist < max_distance && color.a < 0.95) {
-		vec3 uvw_pos = (pos + dist * direction) * cell_size;
-
-		//check if outside, then break
-		if (any(greaterThan(abs(uvw_pos - 0.5), vec3(0.5f + radius * cell_size)))) {
-			break;
-		}
-		vec4 scolor = textureLod(sampler3D(probe, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uvw_pos, lod_level);
-		lod_level += 1.0;
-
-		float a = (1.0 - color.a);
-		scolor *= a;
-		color += scolor;
-		dist += radius;
-		radius = max(0.5, tan_half_angle * dist);
-	}
-
-	return color;
-}
-
-void gi_probe_compute(uint index, vec3 position, vec3 normal, vec3 ref_vec, mat3 normal_xform, float roughness, vec3 ambient, vec3 environment, inout vec4 out_spec, inout vec4 out_diff) {
-	position = (gi_probes.data[index].xform * vec4(position, 1.0)).xyz;
-	ref_vec = normalize((gi_probes.data[index].xform * vec4(ref_vec, 0.0)).xyz);
-	normal = normalize((gi_probes.data[index].xform * vec4(normal, 0.0)).xyz);
-
-	position += normal * gi_probes.data[index].normal_bias;
-
-	//this causes corrupted pixels, i have no idea why..
-	if (any(bvec2(any(lessThan(position, vec3(0.0))), any(greaterThan(position, gi_probes.data[index].bounds))))) {
-		return;
-	}
-
-	vec3 blendv = abs(position / gi_probes.data[index].bounds * 2.0 - 1.0);
-	float blend = clamp(1.0 - max(blendv.x, max(blendv.y, blendv.z)), 0.0, 1.0);
-	//float blend=1.0;
-
-	float max_distance = length(gi_probes.data[index].bounds);
-	vec3 cell_size = 1.0 / gi_probes.data[index].bounds;
-
-	//radiance
-
-#define MAX_CONE_DIRS 4
-
-	vec3 cone_dirs[MAX_CONE_DIRS] = vec3[](
-			vec3(0.707107, 0.0, 0.707107),
-			vec3(0.0, 0.707107, 0.707107),
-			vec3(-0.707107, 0.0, 0.707107),
-			vec3(0.0, -0.707107, 0.707107));
-
-	float cone_weights[MAX_CONE_DIRS] = float[](0.25, 0.25, 0.25, 0.25);
-	float cone_angle_tan = 0.98269;
-
-	vec3 light = vec3(0.0);
-
-	for (int i = 0; i < MAX_CONE_DIRS; i++) {
-		vec3 dir = normalize((gi_probes.data[index].xform * vec4(normal_xform * cone_dirs[i], 0.0)).xyz);
-
-		vec4 cone_light = voxel_cone_trace_45_degrees(gi_probe_textures[index], cell_size, position, dir, cone_angle_tan, max_distance, gi_probes.data[index].bias);
-
-		if (gi_probes.data[index].blend_ambient) {
-			cone_light.rgb = mix(ambient, cone_light.rgb, min(1.0, cone_light.a / 0.95));
-		}
-
-		light += cone_weights[i] * cone_light.rgb;
-	}
-
-	light *= gi_probes.data[index].dynamic_range;
-	out_diff += vec4(light * blend, blend);
-
-	//irradiance
-	vec4 irr_light = voxel_cone_trace(gi_probe_textures[index], cell_size, position, ref_vec, tan(roughness * 0.5 * M_PI * 0.99), max_distance, gi_probes.data[index].bias);
-	if (gi_probes.data[index].blend_ambient) {
-		irr_light.rgb = mix(environment, irr_light.rgb, min(1.0, irr_light.a / 0.95));
-	}
-	irr_light.rgb *= gi_probes.data[index].dynamic_range;
-	//irr_light=vec3(0.0);
-
-	out_spec += vec4(irr_light.rgb * blend, blend);
-}
-
-vec2 octahedron_wrap(vec2 v) {
-	vec2 signVal;
-	signVal.x = v.x >= 0.0 ? 1.0 : -1.0;
-	signVal.y = v.y >= 0.0 ? 1.0 : -1.0;
-	return (1.0 - abs(v.yx)) * signVal;
-}
-
-vec2 octahedron_encode(vec3 n) {
-	// https://twitter.com/Stubbesaurus/status/937994790553227264
-	n /= (abs(n.x) + abs(n.y) + abs(n.z));
-	n.xy = n.z >= 0.0 ? n.xy : octahedron_wrap(n.xy);
-	n.xy = n.xy * 0.5 + 0.5;
-	return n.xy;
-}
-
-void sdfgi_process(uint cascade, vec3 cascade_pos, vec3 cam_pos, vec3 cam_normal, vec3 cam_specular_normal, bool use_specular, float roughness, out vec3 diffuse_light, out vec3 specular_light, out float blend) {
-	cascade_pos += cam_normal * sdfgi.normal_bias;
-
-	vec3 base_pos = floor(cascade_pos);
-	//cascade_pos += mix(vec3(0.0),vec3(0.01),lessThan(abs(cascade_pos-base_pos),vec3(0.01))) * cam_normal;
-	ivec3 probe_base_pos = ivec3(base_pos);
-
-	vec4 diffuse_accum = vec4(0.0);
-	vec3 specular_accum;
-
-	ivec3 tex_pos = ivec3(probe_base_pos.xy, int(cascade));
-	tex_pos.x += probe_base_pos.z * sdfgi.probe_axis_size;
-	tex_pos.xy = tex_pos.xy * (SDFGI_OCT_SIZE + 2) + ivec2(1);
-
-	vec3 diffuse_posf = (vec3(tex_pos) + vec3(octahedron_encode(cam_normal) * float(SDFGI_OCT_SIZE), 0.0)) * sdfgi.lightprobe_tex_pixel_size;
-
-	vec3 specular_posf;
-
-	if (use_specular) {
-		specular_accum = vec3(0.0);
-		specular_posf = (vec3(tex_pos) + vec3(octahedron_encode(cam_specular_normal) * float(SDFGI_OCT_SIZE), 0.0)) * sdfgi.lightprobe_tex_pixel_size;
-	}
-
-	vec4 light_accum = vec4(0.0);
-	float weight_accum = 0.0;
-
-	for (uint j = 0; j < 8; j++) {
-		ivec3 offset = (ivec3(j) >> ivec3(0, 1, 2)) & ivec3(1, 1, 1);
-		ivec3 probe_posi = probe_base_pos;
-		probe_posi += offset;
-
-		// Compute weight
-
-		vec3 probe_pos = vec3(probe_posi);
-		vec3 probe_to_pos = cascade_pos - probe_pos;
-		vec3 probe_dir = normalize(-probe_to_pos);
-
-		vec3 trilinear = vec3(1.0) - abs(probe_to_pos);
-		float weight = trilinear.x * trilinear.y * trilinear.z * max(0.005, dot(cam_normal, probe_dir));
-
-		// Compute lightprobe occlusion
-
-		if (sdfgi.use_occlusion) {
-			ivec3 occ_indexv = abs((sdfgi.cascades[cascade].probe_world_offset + probe_posi) & ivec3(1, 1, 1)) * ivec3(1, 2, 4);
-			vec4 occ_mask = mix(vec4(0.0), vec4(1.0), equal(ivec4(occ_indexv.x | occ_indexv.y), ivec4(0, 1, 2, 3)));
-
-			vec3 occ_pos = clamp(cascade_pos, probe_pos - sdfgi.occlusion_clamp, probe_pos + sdfgi.occlusion_clamp) * sdfgi.probe_to_uvw;
-			occ_pos.z += float(cascade);
-			if (occ_indexv.z != 0) { //z bit is on, means index is >=4, so make it switch to the other half of textures
-				occ_pos.x += 1.0;
-			}
-
-			occ_pos *= sdfgi.occlusion_renormalize;
-			float occlusion = dot(textureLod(sampler3D(sdfgi_occlusion_cascades, material_samplers[SAMPLER_LINEAR_CLAMP]), occ_pos, 0.0), occ_mask);
-
-			weight *= max(occlusion, 0.01);
-		}
-
-		// Compute lightprobe texture position
-
-		vec3 diffuse;
-		vec3 pos_uvw = diffuse_posf;
-		pos_uvw.xy += vec2(offset.xy) * sdfgi.lightprobe_uv_offset.xy;
-		pos_uvw.x += float(offset.z) * sdfgi.lightprobe_uv_offset.z;
-		diffuse = textureLod(sampler2DArray(sdfgi_lightprobe_texture, material_samplers[SAMPLER_LINEAR_CLAMP]), pos_uvw, 0.0).rgb;
-
-		diffuse_accum += vec4(diffuse * weight, weight);
-
-		if (use_specular) {
-			vec3 specular = vec3(0.0);
-			vec3 pos_uvw = specular_posf;
-			pos_uvw.xy += vec2(offset.xy) * sdfgi.lightprobe_uv_offset.xy;
-			pos_uvw.x += float(offset.z) * sdfgi.lightprobe_uv_offset.z;
-			if (roughness < 0.99) {
-				specular = textureLod(sampler2DArray(sdfgi_lightprobe_texture, material_samplers[SAMPLER_LINEAR_CLAMP]), pos_uvw + vec3(0, 0, float(sdfgi.max_cascades)), 0.0).rgb;
-			}
-			if (roughness > 0.5) {
-				specular = mix(specular, textureLod(sampler2DArray(sdfgi_lightprobe_texture, material_samplers[SAMPLER_LINEAR_CLAMP]), pos_uvw, 0.0).rgb, (roughness - 0.5) * 2.0);
-			}
-
-			specular_accum += specular * weight;
-		}
-	}
-
-	if (diffuse_accum.a > 0.0) {
-		diffuse_accum.rgb /= diffuse_accum.a;
-	}
-
-	diffuse_light = diffuse_accum.rgb;
-
-	if (use_specular) {
-		if (diffuse_accum.a > 0.0) {
-			specular_accum /= diffuse_accum.a;
-		}
-
-		specular_light = specular_accum;
-	}
-
-	{
-		//process blend
-		float blend_from = (float(sdfgi.probe_axis_size - 1) / 2.0) - 2.5;
-		float blend_to = blend_from + 2.0;
-
-		vec3 inner_pos = cam_pos * sdfgi.cascades[cascade].to_probe;
-
-		float len = length(inner_pos);
-
-		inner_pos = abs(normalize(inner_pos));
-		len *= max(inner_pos.x, max(inner_pos.y, inner_pos.z));
-
-		if (len >= blend_from) {
-			blend = smoothstep(blend_from, blend_to, len);
-		} else {
-			blend = 0.0;
-		}
-	}
-}
+#include "scene_forward_gi_inc.glsl"
 
 #endif //USE_FORWARD_GI
 
@@ -1735,8 +460,6 @@ void sdfgi_process(uint cascade, vec3 cascade_pos, vec3 cam_pos, vec3 cam_normal
 
 #ifndef MODE_RENDER_DEPTH
 
-#ifndef LOW_END_MODE
-
 vec4 volumetric_fog_process(vec2 screen_uv, float z) {
 	vec3 fog_pos = vec3(screen_uv, z * scene_data.volumetric_fog_inv_length);
 	if (fog_pos.z < 0.0) {
@@ -1747,7 +470,6 @@ vec4 volumetric_fog_process(vec2 screen_uv, float z) {
 
 	return texture(sampler3D(volumetric_fog_texture, material_samplers[SAMPLER_LINEAR_CLAMP]), fog_pos);
 }
-#endif
 
 vec4 fog_process(vec3 vertex) {
 	vec3 fog_color = scene_data.fog_light_color;
@@ -1811,26 +533,6 @@ uint cluster_get_range_clip_mask(uint i, uint z_min, uint z_max) {
 	return bitfieldInsert(uint(0), uint(0xFFFFFFFF), local_min, mask_width);
 }
 
-float blur_shadow(float shadow) {
-	return shadow;
-#if 0
-	//disabling for now, will investigate later
-	float interp_shadow = shadow;
-	if (gl_HelperInvocation) {
-		interp_shadow = -4.0; // technically anything below -4 will do but just to make sure
-	}
-
-	uvec2 fc2 = uvec2(gl_FragCoord.xy);
-	interp_shadow -= dFdx(interp_shadow) * (float(fc2.x & 1) - 0.5);
-	interp_shadow -= dFdy(interp_shadow) * (float(fc2.y & 1) - 0.5);
-
-	if (interp_shadow >= 0.0) {
-		shadow = interp_shadow;
-	}
-	return shadow;
-#endif
-}
-
 #endif //!MODE_RENDER DEPTH
 
 void main() {
@@ -1928,11 +630,7 @@ void main() {
 #endif // ALPHA_ANTIALIASING_EDGE_USED
 
 	{
-		/* clang-format off */
-
-FRAGMENT_SHADER_CODE
-
-		/* clang-format on */
+#CODE : FRAGMENT
 	}
 
 #ifdef LIGHT_TRANSMITTANCE_USED
@@ -2019,7 +717,6 @@ FRAGMENT_SHADER_CODE
 		fog = fog_process(vertex);
 	}
 
-#ifndef LOW_END_MODE
 	if (scene_data.volumetric_fog_enabled) {
 		vec4 volumetric_fog = volumetric_fog_process(screen_uv, -vertex.z);
 		if (scene_data.fog_enabled) {
@@ -2037,7 +734,6 @@ FRAGMENT_SHADER_CODE
 			fog = volumetric_fog;
 		}
 	}
-#endif //!LOW_END_MODE
 #endif //!CUSTOM_FOG_USED
 
 	uint fog_rg = packHalf2x16(fog.rg);
@@ -2377,7 +1073,7 @@ FRAGMENT_SHADER_CODE
 		specular_light = spec_accum.rgb;
 		ambient_light = amb_accum.rgb;
 	}
-#elif !defined(LOW_END_MODE)
+#else
 
 	if (bool(instances.data[instance_index].flags & INSTANCE_FLAGS_USE_GI_BUFFERS)) { //use GI buffers
 
@@ -2412,13 +1108,11 @@ FRAGMENT_SHADER_CODE
 	}
 #endif
 
-#ifndef LOW_END_MODE
 	if (scene_data.ssao_enabled) {
 		float ssao = texture(sampler2D(ao_buffer, material_samplers[SAMPLER_LINEAR_CLAMP]), screen_uv).r;
 		ao = min(ao, ssao);
 		ao_light_affect = mix(ao_light_affect, max(ao_light_affect, scene_data.ssao_light_affect), scene_data.ssao_ao_affect);
 	}
-#endif //LOW_END_MODE
 
 	{ // process reflections
 
@@ -2533,6 +1227,10 @@ FRAGMENT_SHADER_CODE
 				continue; //not masked
 			}
 
+			if (directional_lights.data[i].bake_mode == LIGHT_BAKE_STATIC && bool(instances.data[instance_index].flags & INSTANCE_FLAGS_USE_LIGHTMAP)) {
+				continue; // The light is statically baked and this object uses a lightmap, so skip it.
+			}
+
 			float shadow = 1.0;
 
 #ifdef USE_SOFT_SHADOWS
@@ -2982,6 +1680,10 @@ FRAGMENT_SHADER_CODE
 						continue; //not masked
 					}
 
+					if (omni_lights.data[light_index].bake_mode == LIGHT_BAKE_STATIC && bool(instances.data[instance_index].flags & INSTANCE_FLAGS_USE_LIGHTMAP)) {
+						continue; // The light is statically baked and this object uses a lightmap, so skip it.
+					}
+
 					float shadow = light_process_omni_shadow(light_index, vertex, view);
 
 					shadow = blur_shadow(shadow);
@@ -3055,6 +1757,10 @@ FRAGMENT_SHADER_CODE
 						continue; //not masked
 					}
 
+					if (spot_lights.data[light_index].bake_mode == LIGHT_BAKE_STATIC && bool(instances.data[instance_index].flags & INSTANCE_FLAGS_USE_LIGHTMAP)) {
+						continue; // The light is statically baked and this object uses a lightmap, so skip it.
+					}
+
 					float shadow = light_process_spot_shadow(light_index, vertex, view);
 
 					shadow = blur_shadow(shadow);
diff --git a/servers/rendering/renderer_rd/shaders/scene_forward_clustered_inc.glsl b/servers/rendering/renderer_rd/shaders/scene_forward_clustered_inc.glsl
index 4ea05c9ccc..ca75d6300e 100644
--- a/servers/rendering/renderer_rd/shaders/scene_forward_clustered_inc.glsl
+++ b/servers/rendering/renderer_rd/shaders/scene_forward_clustered_inc.glsl
@@ -13,6 +13,7 @@
 #endif
 
 #include "cluster_data_inc.glsl"
+#include "decal_data_inc.glsl"
 
 #if !defined(MODE_RENDER_DEPTH) || defined(MODE_RENDER_MATERIAL) || defined(MODE_RENDER_SDF) || defined(MODE_RENDER_NORMAL_ROUGHNESS) || defined(MODE_RENDER_GIPROBE) || defined(TANGENT_USED) || defined(NORMAL_MAP_USED)
 #ifndef NORMAL_USED
@@ -28,7 +29,11 @@ layout(push_constant, binding = 0, std430) uniform DrawCall {
 }
 draw_call;
 
-/* Set 0 Scene data that never changes, ever */
+#define SDFGI_MAX_CASCADES 8
+
+/* Set 0: Base Pass (never changes) */
+
+#include "light_data_inc.glsl"
 
 #define SAMPLER_NEAREST_CLAMP 0
 #define SAMPLER_LINEAR_CLAMP 1
@@ -43,10 +48,6 @@ draw_call;
 #define SAMPLER_NEAREST_WITH_MIPMAPS_ANISOTROPIC_REPEAT 10
 #define SAMPLER_LINEAR_WITH_MIPMAPS_ANISOTROPIC_REPEAT 11
 
-#define SDFGI_MAX_CASCADES 8
-
-/* Set 1: Base Pass (never changes) */
-
 layout(set = 0, binding = 1) uniform sampler material_samplers[12];
 
 layout(set = 0, binding = 2) uniform sampler shadow_sampler;
@@ -61,12 +62,11 @@ layout(set = 0, binding = 2) uniform sampler shadow_sampler;
 #define INSTANCE_FLAGS_MULTIMESH_FORMAT_2D (1 << 13)
 #define INSTANCE_FLAGS_MULTIMESH_HAS_COLOR (1 << 14)
 #define INSTANCE_FLAGS_MULTIMESH_HAS_CUSTOM_DATA (1 << 15)
-#define INSTANCE_FLAGS_MULTIMESH_STRIDE_SHIFT 16
+#define INSTANCE_FLAGS_PARTICLE_TRAIL_SHIFT 16
 //3 bits of stride
-#define INSTANCE_FLAGS_MULTIMESH_STRIDE_MASK 0x7
+#define INSTANCE_FLAGS_PARTICLE_TRAIL_MASK 0xFF
 
-#define INSTANCE_FLAGS_SKELETON (1 << 19)
-#define INSTANCE_FLAGS_NON_UNIFORM_SCALE (1 << 20)
+#define INSTANCE_FLAGS_NON_UNIFORM_SCALE (1 << 24)
 
 layout(set = 0, binding = 3, std430) restrict readonly buffer OmniLights {
 	LightData data[];
@@ -78,7 +78,7 @@ layout(set = 0, binding = 4, std430) restrict readonly buffer SpotLights {
 }
 spot_lights;
 
-layout(set = 0, binding = 5) buffer restrict readonly ReflectionProbeData {
+layout(set = 0, binding = 5, std430) restrict readonly buffer ReflectionProbeData {
 	ReflectionData data[];
 }
 reflections;
@@ -122,8 +122,6 @@ layout(set = 0, binding = 12, std430) restrict readonly buffer GlobalVariableDat
 }
 global_variables;
 
-#ifndef LOW_END_MODE
-
 struct SDFGIProbeCascadeData {
 	vec3 position;
 	float to_probe;
@@ -159,9 +157,7 @@ layout(set = 0, binding = 13, std140) uniform SDFGI {
 }
 sdfgi;
 
-#endif //LOW_END_MODE
-
-/* Set 2: Render Pass (changes per render pass) */
+/* Set 1: Render Pass (changes per render pass) */
 
 layout(set = 1, binding = 0, std140) uniform SceneData {
 	mat4 projection_matrix;
@@ -245,7 +241,6 @@ layout(set = 1, binding = 0, std140) uniform SceneData {
 
 	bool pancake_shadows;
 }
-
 scene_data;
 
 struct InstanceData {
@@ -280,9 +275,7 @@ layout(set = 1, binding = 5) uniform texture2D directional_shadow_atlas;
 
 layout(set = 1, binding = 6) uniform texture2DArray lightmap_textures[MAX_LIGHTMAP_TEXTURES];
 
-#ifndef LOW_END_MOD
 layout(set = 1, binding = 7) uniform texture3D gi_probe_textures[MAX_GI_PROBES];
-#endif
 
 layout(set = 1, binding = 8, std430) buffer restrict readonly ClusterBuffer {
 	uint data[];
@@ -306,8 +299,6 @@ layout(r32ui, set = 1, binding = 12) uniform restrict uimage3D geom_facing_grid;
 layout(set = 1, binding = 9) uniform texture2D depth_buffer;
 layout(set = 1, binding = 10) uniform texture2D color_buffer;
 
-#ifndef LOW_END_MODE
-
 layout(set = 1, binding = 11) uniform texture2D normal_roughness_buffer;
 layout(set = 1, binding = 12) uniform texture2D ao_buffer;
 layout(set = 1, binding = 13) uniform texture2D ambient_buffer;
@@ -338,8 +329,6 @@ gi_probes;
 
 layout(set = 1, binding = 18) uniform texture3D volumetric_fog_texture;
 
-#endif // LOW_END_MODE
-
 #endif
 
 /* Set 2 Skeleton & Instancing (can change per item) */
diff --git a/servers/rendering/renderer_rd/shaders/scene_forward_gi_inc.glsl b/servers/rendering/renderer_rd/shaders/scene_forward_gi_inc.glsl
new file mode 100644
index 0000000000..b41f16cbe7
--- /dev/null
+++ b/servers/rendering/renderer_rd/shaders/scene_forward_gi_inc.glsl
@@ -0,0 +1,242 @@
+// Functions related to gi/sdfgi for our forward renderer
+
+// Standard voxel cone trace: marches along `direction`, compositing probe samples front to back.
+vec4 voxel_cone_trace(texture3D probe, vec3 cell_size, vec3 pos, vec3 direction, float tan_half_angle, float max_distance, float p_bias) {
+	vec4 accum = vec4(0.0);
+	float travelled = p_bias;
+
+	// Stop once the ray leaves the trace range or the accumulated alpha is nearly opaque.
+	while (travelled < max_distance && accum.a < 0.95) {
+		// Cone footprint at this distance, never below 1.0; its log2 is passed as the mip level.
+		float footprint = max(1.0, 2.0 * tan_half_angle * travelled);
+		float half_footprint = footprint * 0.5;
+		vec3 uvw = (pos + travelled * direction) * cell_size;
+		if (any(greaterThan(abs(uvw - 0.5), vec3(0.5f + half_footprint * cell_size)))) {
+			break; // The sample footprint lies outside the probe volume.
+		}
+		vec4 texel = textureLod(sampler3D(probe, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uvw, log2(footprint));
+		accum += (1.0 - accum.a) * texel;
+		travelled += half_footprint;
+	}
+
+	return accum;
+}
+
+vec4 voxel_cone_trace_45_degrees(texture3D probe, vec3 cell_size, vec3 pos, vec3 direction, float tan_half_angle, float max_distance, float p_bias) {
+	// Cone trace variant that advances one mip level per step instead of recomputing log2 for every sample.
+	vec4 accum = vec4(0.0);
+	float travelled = p_bias;
+	float cone_radius = max(0.5, tan_half_angle * travelled);
+	float mip = log2(cone_radius * 2.0);
+
+	// Stop once the ray leaves the trace range or the accumulated alpha is nearly opaque.
+	while (travelled < max_distance && accum.a < 0.95) {
+		vec3 uvw = (pos + travelled * direction) * cell_size;
+		if (any(greaterThan(abs(uvw - 0.5), vec3(0.5f + cone_radius * cell_size)))) {
+			break; // The sample footprint lies outside the probe volume.
+		}
+
+		vec4 texel = textureLod(sampler3D(probe, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uvw, mip);
+		mip += 1.0;
+
+		// Front-to-back compositing: weight the sample by the remaining transparency.
+		accum += texel * (1.0 - accum.a);
+		travelled += cone_radius;
+		cone_radius = max(0.5, tan_half_angle * travelled);
+	}
+
+	return accum;
+}
+
+void gi_probe_compute(uint index, vec3 position, vec3 normal, vec3 ref_vec, mat3 normal_xform, float roughness, vec3 ambient, vec3 environment, inout vec4 out_spec, inout vec4 out_diff) {
+	// Adds this GI probe's cone-traced diffuse and specular light, weighted by an edge fade, to out_diff / out_spec.
+
+	// Bring the shading point and its directions into the probe's local space.
+	position = (gi_probes.data[index].xform * vec4(position, 1.0)).xyz;
+	ref_vec = normalize((gi_probes.data[index].xform * vec4(ref_vec, 0.0)).xyz);
+	normal = normalize((gi_probes.data[index].xform * vec4(normal, 0.0)).xyz);
+	position += normal * gi_probes.data[index].normal_bias;
+
+	// Points outside the [0, bounds] box get nothing (original author's note: this early-out causes corrupted pixels, cause unknown).
+	bool below_min = any(lessThan(position, vec3(0.0)));
+	bool above_max = any(greaterThan(position, gi_probes.data[index].bounds));
+	if (below_min || above_max) {
+		return;
+	}
+
+	// Fade weight: 1 at the center of the box, falling to 0 at its faces.
+	vec3 center_dist = abs(position / gi_probes.data[index].bounds * 2.0 - 1.0);
+	float fade = clamp(1.0 - max(center_dist.x, max(center_dist.y, center_dist.z)), 0.0, 1.0);
+
+	float trace_range = length(gi_probes.data[index].bounds);
+	vec3 inv_bounds = 1.0 / gi_probes.data[index].bounds;
+
+	// Diffuse part: four equally weighted cones, each 45 degrees off the +Z axis of normal_xform.
+#define MAX_CONE_DIRS 4
+
+	vec3 cone_dirs[MAX_CONE_DIRS] = vec3[](
+			vec3(0.707107, 0.0, 0.707107),
+			vec3(0.0, 0.707107, 0.707107),
+			vec3(-0.707107, 0.0, 0.707107),
+			vec3(0.0, -0.707107, 0.707107));
+	float cone_weights[MAX_CONE_DIRS] = float[](0.25, 0.25, 0.25, 0.25);
+	float cone_angle_tan = 0.98269;
+
+	vec3 diffuse_sum = vec3(0.0);
+	for (int i = 0; i < MAX_CONE_DIRS; i++) {
+		vec3 cone_dir = normalize((gi_probes.data[index].xform * vec4(normal_xform * cone_dirs[i], 0.0)).xyz);
+		vec4 traced = voxel_cone_trace_45_degrees(gi_probe_textures[index], inv_bounds, position, cone_dir, cone_angle_tan, trace_range, gi_probes.data[index].bias);
+
+		if (gi_probes.data[index].blend_ambient) {
+			// The lower the traced coverage (alpha), the more of the ambient color is used.
+			traced.rgb = mix(ambient, traced.rgb, min(1.0, traced.a / 0.95));
+		}
+
+		diffuse_sum += cone_weights[i] * traced.rgb;
+	}
+
+	diffuse_sum *= gi_probes.data[index].dynamic_range;
+	out_diff += vec4(diffuse_sum * fade, fade);
+
+	// Specular part: a single cone along ref_vec whose aperture grows with roughness.
+	vec4 reflected = voxel_cone_trace(gi_probe_textures[index], inv_bounds, position, ref_vec, tan(roughness * 0.5 * M_PI * 0.99), trace_range, gi_probes.data[index].bias);
+	if (gi_probes.data[index].blend_ambient) {
+		reflected.rgb = mix(environment, reflected.rgb, min(1.0, reflected.a / 0.95));
+	}
+	reflected.rgb *= gi_probes.data[index].dynamic_range;
+
+	out_spec += vec4(reflected.rgb * fade, fade);
+}
+
+vec2 octahedron_wrap(vec2 v) {
+	// Swap and fold the components, then restore the sign of each original component (zero counts as positive).
+	vec2 folded = 1.0 - abs(v.yx);
+	vec2 sign_of_v = vec2(v.x >= 0.0 ? 1.0 : -1.0, v.y >= 0.0 ? 1.0 : -1.0);
+	return folded * sign_of_v;
+}
+
+vec2 octahedron_encode(vec3 n) {
+	// Octahedral mapping of a direction to [0, 1]^2, see https://twitter.com/Stubbesaurus/status/937994790553227264
+	float l1_norm = abs(n.x) + abs(n.y) + abs(n.z);
+	vec3 projected = n / l1_norm;
+	vec2 oct = projected.z >= 0.0 ? projected.xy : octahedron_wrap(projected.xy);
+	return oct * 0.5 + 0.5;
+}
+
+void sdfgi_process(uint cascade, vec3 cascade_pos, vec3 cam_pos, vec3 cam_normal, vec3 cam_specular_normal, bool use_specular, float roughness, out vec3 diffuse_light, out vec3 specular_light, out float blend) {
+	// Blends the 8 SDFGI light probes surrounding cascade_pos (in probe-grid units, floor() gives the probe index) into
+	// diffuse_light and specular_light. `blend` rises from 0 to 1 near the cascade edge (presumably for cross-fading cascades).
+	cascade_pos += cam_normal * sdfgi.normal_bias;
+	vec3 base_pos = floor(cascade_pos);
+	ivec3 probe_base_pos = ivec3(base_pos);
+
+	vec4 diffuse_accum = vec4(0.0);
+	vec3 specular_accum = vec3(0.0); // zero-initialized so specular_light is defined even when use_specular is false
+
+	ivec3 tex_pos = ivec3(probe_base_pos.xy, int(cascade));
+	tex_pos.x += probe_base_pos.z * sdfgi.probe_axis_size;
+	tex_pos.xy = tex_pos.xy * (SDFGI_OCT_SIZE + 2) + ivec2(1);
+
+	vec3 diffuse_posf = (vec3(tex_pos) + vec3(octahedron_encode(cam_normal) * float(SDFGI_OCT_SIZE), 0.0)) * sdfgi.lightprobe_tex_pixel_size;
+
+	vec3 specular_posf;
+
+	if (use_specular) {
+		// Only computed (and only read below) when specular is requested.
+		specular_posf = (vec3(tex_pos) + vec3(octahedron_encode(cam_specular_normal) * float(SDFGI_OCT_SIZE), 0.0)) * sdfgi.lightprobe_tex_pixel_size;
+	}
+
+	// Accumulate over the 2x2x2 probes around the sample point;
+	// bits 0, 1 and 2 of j select the +x, +y and +z neighbour respectively.
+
+	for (uint j = 0; j < 8; j++) {
+		ivec3 offset = (ivec3(j) >> ivec3(0, 1, 2)) & ivec3(1, 1, 1);
+		ivec3 probe_posi = probe_base_pos;
+		probe_posi += offset;
+
+		// Compute weight: trilinear factor times a floored cosine between the normal and the direction to the probe
+
+		vec3 probe_pos = vec3(probe_posi);
+		vec3 probe_to_pos = cascade_pos - probe_pos;
+		vec3 probe_dir = normalize(-probe_to_pos);
+
+		vec3 trilinear = vec3(1.0) - abs(probe_to_pos);
+		float weight = trilinear.x * trilinear.y * trilinear.z * max(0.005, dot(cam_normal, probe_dir));
+
+		// Compute lightprobe occlusion
+
+		if (sdfgi.use_occlusion) {
+			ivec3 occ_indexv = abs((sdfgi.cascades[cascade].probe_world_offset + probe_posi) & ivec3(1, 1, 1)) * ivec3(1, 2, 4);
+			vec4 occ_mask = mix(vec4(0.0), vec4(1.0), equal(ivec4(occ_indexv.x | occ_indexv.y), ivec4(0, 1, 2, 3)));
+
+			vec3 occ_pos = clamp(cascade_pos, probe_pos - sdfgi.occlusion_clamp, probe_pos + sdfgi.occlusion_clamp) * sdfgi.probe_to_uvw;
+			occ_pos.z += float(cascade);
+			if (occ_indexv.z != 0) { //z bit is on, means index is >=4, so make it switch to the other half of textures
+				occ_pos.x += 1.0;
+			}
+
+			occ_pos *= sdfgi.occlusion_renormalize;
+			float occlusion = dot(textureLod(sampler3D(sdfgi_occlusion_cascades, material_samplers[SAMPLER_LINEAR_CLAMP]), occ_pos, 0.0), occ_mask);
+
+			weight *= max(occlusion, 0.01);
+		}
+
+		// Compute lightprobe texture position
+
+		vec3 diffuse;
+		vec3 pos_uvw = diffuse_posf;
+		pos_uvw.xy += vec2(offset.xy) * sdfgi.lightprobe_uv_offset.xy;
+		pos_uvw.x += float(offset.z) * sdfgi.lightprobe_uv_offset.z;
+		diffuse = textureLod(sampler2DArray(sdfgi_lightprobe_texture, material_samplers[SAMPLER_LINEAR_CLAMP]), pos_uvw, 0.0).rgb;
+
+		diffuse_accum += vec4(diffuse * weight, weight);
+
+		if (use_specular) {
+			vec3 specular = vec3(0.0);
+			vec3 spec_uvw = specular_posf;
+			spec_uvw.xy += vec2(offset.xy) * sdfgi.lightprobe_uv_offset.xy;
+			spec_uvw.x += float(offset.z) * sdfgi.lightprobe_uv_offset.z;
+			if (roughness < 0.99) {
+				specular = textureLod(sampler2DArray(sdfgi_lightprobe_texture, material_samplers[SAMPLER_LINEAR_CLAMP]), spec_uvw + vec3(0, 0, float(sdfgi.max_cascades)), 0.0).rgb;
+			}
+			if (roughness > 0.5) {
+				specular = mix(specular, textureLod(sampler2DArray(sdfgi_lightprobe_texture, material_samplers[SAMPLER_LINEAR_CLAMP]), spec_uvw, 0.0).rgb, (roughness - 0.5) * 2.0);
+			}
+
+			specular_accum += specular * weight;
+		}
+	}
+
+	if (diffuse_accum.a > 0.0) {
+		diffuse_accum.rgb /= diffuse_accum.a;
+	}
+
+	diffuse_light = diffuse_accum.rgb;
+
+	// specular_light is an out parameter, so it is written on every path (it stays zero when use_specular is false).
+	if (use_specular && diffuse_accum.a > 0.0) {
+		// Normalize by the same total weight as the diffuse term.
+		specular_accum /= diffuse_accum.a;
+	}
+
+	specular_light = specular_accum;
+
+	{
+		//process blend
+		float blend_from = (float(sdfgi.probe_axis_size - 1) / 2.0) - 2.5;
+		float blend_to = blend_from + 2.0;
+
+		vec3 inner_pos = cam_pos * sdfgi.cascades[cascade].to_probe;
+
+		float len = length(inner_pos);
+
+		inner_pos = abs(normalize(inner_pos));
+		len *= max(inner_pos.x, max(inner_pos.y, inner_pos.z));
+
+		if (len >= blend_from) {
+			blend = smoothstep(blend_from, blend_to, len);
+		} else {
+			blend = 0.0;
+		}
+	}
+}
diff --git a/servers/rendering/renderer_rd/shaders/scene_forward_lights_inc.glsl b/servers/rendering/renderer_rd/shaders/scene_forward_lights_inc.glsl
new file mode 100644
index 0000000000..32a86cb166
--- /dev/null
+++ b/servers/rendering/renderer_rd/shaders/scene_forward_lights_inc.glsl
@@ -0,0 +1,1023 @@
+// Functions related to lighting
+
+// This returns the G_GGX function divided by 2 cos_theta_m, where in practice cos_theta_m is either N.L or N.V.
+// We're dividing this factor off because the overall term we'll end up with looks like
+// (see, for example, the first unnumbered equation in B. Burley, "Physically Based Shading at Disney", SIGGRAPH 2012):
+//
+//   F(L.V) D(N.H) G(N.L) G(N.V) / (4 N.L N.V)
+//
+// We're basically regrouping this as
+//
+//   F(L.V) D(N.H) [G(N.L)/(2 N.L)] [G(N.V) / (2 N.V)]
+//
+// and thus, this function implements the [G(N.m)/(2 N.m)] part with m = L or V.
+//
+// The contents of the D and G (G1) functions (GGX) are taken from
+// E. Heitz, "Understanding the Masking-Shadowing Function in Microfacet-Based BRDFs", J. Comp. Graph. Tech. 3 (2) (2014).
+// Eqns 71-72 and 85-86 (see also Eqns 43 and 80).
+
+float G_GGX_2cos(float cos_theta_m, float alpha) {
+	// Uses Schlick's approximation instead of the exact GGX expression:
+	// C. Schlick, "An Inexpensive BRDF Model for Physically-based Rendering", Computer Graphics Forum. 13 (3): 233 (1994),
+	// Eq. (19); see Heitz (2014) about the problems with that derivation.
+	// With k = alpha / 2 it nevertheless approximates GGX well.
+	float k = 0.5 * alpha;
+	float denom = cos_theta_m * (1.0 - k) + k;
+	return 0.5 / denom;
+	// float cos2 = cos_theta_m * cos_theta_m;
+	// float sin2 = (1.0 - cos2);
+	// return 1.0 / (cos_theta_m + sqrt(cos2 + alpha * alpha * sin2));
+}
+
+float D_GGX(float cos_theta_m, float alpha) {
+	float a2 = alpha * alpha;
+	float denom_root = 1.0 + (a2 - 1.0) * cos_theta_m * cos_theta_m;
+	return a2 / (M_PI * denom_root * denom_root); // isotropic GGX distribution term
+}
+
+float G_GGX_anisotropic_2cos(float cos_theta_m, float alpha_x, float alpha_y, float cos_phi, float sin_phi) {
+	// Anisotropic GGX geometry term in the "divided by 2 cos" form (per the naming); the 0.001 floor keeps the reciprocal finite.
+	float cos2 = cos_theta_m * cos_theta_m;
+	float sx = alpha_x * cos_phi;
+	float sy = alpha_y * sin_phi;
+	return 1.0 / max(cos_theta_m + sqrt(cos2 + (sx * sx + sy * sy) * (1.0 - cos2)), 0.001);
+}
+
+float D_GGX_anisotropic(float cos_theta_m, float alpha_x, float alpha_y, float cos_phi, float sin_phi) {
+	// Anisotropic GGX distribution term; the 0.001 floor keeps the reciprocal finite.
+	float cos2 = cos_theta_m * cos_theta_m;
+	float rx = cos_phi / alpha_x;
+	float ry = sin_phi / alpha_y;
+	float denom_root = cos2 + (1.0 - cos2) * (rx * rx + ry * ry);
+	return 1.0 / max(M_PI * alpha_x * alpha_y * denom_root * denom_root, 0.001);
+}
+
+float SchlickFresnel(float u) {
+	float one_minus_u = 1.0 - u;
+	float squared = one_minus_u * one_minus_u;
+	return squared * squared * one_minus_u; // (1 - u)^5 without calling pow()
+}
+
+float GTR1(float NdotH, float a) {
+	if (a >= 1.0) { // at a == 1 the log(a2) factor below is 0, so the general formula would divide by zero
+		return 1.0 / M_PI;
+	}
+	float a2 = a * a;
+	return (a2 - 1.0) / (M_PI * log(a2) * (1.0 + (a2 - 1.0) * NdotH * NdotH));
+}
+
+vec3 F0(float metallic, float specular, vec3 albedo) {
+	// Reflectance at normal incidence: grey 0.16 * specular^2 for dielectrics, albedo for metals,
+	// blended linearly by metallic; see https://google.github.io/filament/Filament.md.html
+	float dielectric_f0 = 0.16 * specular * specular;
+	return mix(vec3(dielectric_f0), albedo, vec3(metallic));
+}
+
+void light_compute(vec3 N, vec3 L, vec3 V, vec3 light_color, float attenuation, vec3 f0, uint orms, float specular_amount,
+#ifdef LIGHT_BACKLIGHT_USED
+		vec3 backlight,
+#endif
+#ifdef LIGHT_TRANSMITTANCE_USED
+		vec4 transmittance_color,
+		float transmittance_depth,
+		float transmittance_curve,
+		float transmittance_boost,
+		float transmittance_z,
+#endif
+#ifdef LIGHT_RIM_USED
+		float rim, float rim_tint, vec3 rim_color,
+#endif
+#ifdef LIGHT_CLEARCOAT_USED
+		float clearcoat, float clearcoat_gloss,
+#endif
+#ifdef LIGHT_ANISOTROPY_USED
+		vec3 B, vec3 T, float anisotropy,
+#endif
+#ifdef USE_SOFT_SHADOWS
+		float A,
+#endif
+#ifdef USE_SHADOW_TO_OPACITY
+		inout float alpha,
+#endif
+		inout vec3 diffuse_light, inout vec3 specular_light) {
+	// Adds one light's contribution to diffuse_light / specular_light; roughness and metallic are unpacked from the y and z bytes of orms.
+#if defined(LIGHT_CODE_USED)
+	// light is written by the light shader
+
+	vec3 normal = N;
+	vec3 light = L;
+	vec3 view = V;
+
+#CODE : LIGHT
+
+#else
+
+#ifdef USE_SOFT_SHADOWS
+	float NdotL = min(A + dot(N, L), 1.0);
+#else
+	float NdotL = dot(N, L);
+#endif
+	float cNdotL = max(NdotL, 0.0); // clamped NdotL
+	float NdotV = dot(N, V);
+	float cNdotV = max(NdotV, 0.0);
+
+#if defined(DIFFUSE_BURLEY) || defined(SPECULAR_BLINN) || defined(SPECULAR_SCHLICK_GGX) || defined(LIGHT_CLEARCOAT_USED)
+	vec3 H = normalize(V + L);
+#endif
+
+#if defined(SPECULAR_BLINN) || defined(SPECULAR_SCHLICK_GGX) || defined(LIGHT_CLEARCOAT_USED)
+#ifdef USE_SOFT_SHADOWS
+	float cNdotH = clamp(A + dot(N, H), 0.0, 1.0);
+#else
+	float cNdotH = clamp(dot(N, H), 0.0, 1.0);
+#endif
+#endif
+
+#if defined(DIFFUSE_BURLEY) || defined(SPECULAR_SCHLICK_GGX) || defined(LIGHT_CLEARCOAT_USED)
+#ifdef USE_SOFT_SHADOWS
+	float cLdotH = clamp(A + dot(L, H), 0.0, 1.0);
+#else
+	float cLdotH = clamp(dot(L, H), 0.0, 1.0);
+#endif
+#endif
+
+	float metallic = unpackUnorm4x8(orms).z;
+	if (metallic < 1.0) {
+		float roughness = unpackUnorm4x8(orms).y;
+
+#if defined(DIFFUSE_OREN_NAYAR)
+		vec3 diffuse_brdf_NL;
+#else
+		float diffuse_brdf_NL; // BRDF times N.L for calculating diffuse radiance
+#endif
+
+#if defined(DIFFUSE_LAMBERT_WRAP)
+		// energy conserving lambert wrap shader
+		diffuse_brdf_NL = max(0.0, (NdotL + roughness) / ((1.0 + roughness) * (1.0 + roughness)));
+#elif defined(DIFFUSE_TOON)
+
+		diffuse_brdf_NL = smoothstep(-roughness, max(roughness, 0.01), NdotL);
+
+#elif defined(DIFFUSE_BURLEY)
+
+		{
+			float FD90_minus_1 = 2.0 * cLdotH * cLdotH * roughness - 0.5;
+			float FdV = 1.0 + FD90_minus_1 * SchlickFresnel(cNdotV);
+			float FdL = 1.0 + FD90_minus_1 * SchlickFresnel(cNdotL);
+			diffuse_brdf_NL = (1.0 / M_PI) * FdV * FdL * cNdotL;
+			/*
+			float energyBias = mix(roughness, 0.0, 0.5);
+			float energyFactor = mix(roughness, 1.0, 1.0 / 1.51);
+			float fd90 = energyBias + 2.0 * VoH * VoH * roughness;
+			float f0 = 1.0;
+			float lightScatter = f0 + (fd90 - f0) * pow(1.0 - cNdotL, 5.0);
+			float viewScatter = f0 + (fd90 - f0) * pow(1.0 - cNdotV, 5.0);
+
+			diffuse_brdf_NL = lightScatter * viewScatter * energyFactor;
+			*/
+		}
+#else
+		// lambert
+		diffuse_brdf_NL = cNdotL * (1.0 / M_PI);
+#endif
+
+		diffuse_light += light_color * diffuse_brdf_NL * attenuation;
+
+#if defined(LIGHT_BACKLIGHT_USED)
+		diffuse_light += light_color * (vec3(1.0 / M_PI) - diffuse_brdf_NL) * backlight * attenuation;
+#endif
+
+#if defined(LIGHT_RIM_USED)
+		float rim_light = pow(max(0.0, 1.0 - cNdotV), max(0.0, (1.0 - roughness) * 16.0));
+		diffuse_light += rim_light * rim * mix(vec3(1.0), rim_color, rim_tint) * light_color;
+#endif
+
+#ifdef LIGHT_TRANSMITTANCE_USED
+
+#ifdef SSS_MODE_SKIN
+
+		{
+			float scale = 8.25 / transmittance_depth;
+			float d = scale * abs(transmittance_z);
+			float dd = -d * d;
+			vec3 profile = vec3(0.233, 0.455, 0.649) * exp(dd / 0.0064) +
+						   vec3(0.1, 0.336, 0.344) * exp(dd / 0.0484) +
+						   vec3(0.118, 0.198, 0.0) * exp(dd / 0.187) +
+						   vec3(0.113, 0.007, 0.007) * exp(dd / 0.567) +
+						   vec3(0.358, 0.004, 0.0) * exp(dd / 1.99) +
+						   vec3(0.078, 0.0, 0.0) * exp(dd / 7.41);
+
+			diffuse_light += profile * transmittance_color.a * light_color * clamp(transmittance_boost - NdotL, 0.0, 1.0) * (1.0 / M_PI);
+		}
+#else
+
+		if (transmittance_depth > 0.0) {
+			float fade = clamp(abs(transmittance_z / transmittance_depth), 0.0, 1.0);
+
+			fade = pow(max(0.0, 1.0 - fade), transmittance_curve);
+			fade *= clamp(transmittance_boost - NdotL, 0.0, 1.0);
+
+			diffuse_light += transmittance_color.rgb * light_color * (1.0 / M_PI) * transmittance_color.a * fade;
+		}
+
+#endif //SSS_MODE_SKIN
+
+#endif //LIGHT_TRANSMITTANCE_USED
+	}
+
+	float roughness = unpackUnorm4x8(orms).y;
+	if (roughness > 0.0) { // FIXME: roughness == 0 should not disable specular light entirely
+
+		// D
+
+#if defined(SPECULAR_BLINN)
+
+		//normalized blinn
+		float shininess = exp2(15.0 * (1.0 - roughness) + 1.0) * 0.25;
+		float blinn = pow(cNdotH, shininess) * cNdotL;
+		blinn *= (shininess + 8.0) * (1.0 / (8.0 * M_PI));
+		float intensity = blinn;
+
+		specular_light += light_color * intensity * attenuation * specular_amount;
+
+#elif defined(SPECULAR_PHONG)
+		vec3 R = normalize(-reflect(L, N));
+#ifdef USE_SOFT_SHADOWS // A is only a parameter in this configuration, same guard as cNdotH / cLdotH above
+		float cRdotV = clamp(A + dot(R, V), 0.0, 1.0);
+#else
+		float cRdotV = clamp(dot(R, V), 0.0, 1.0);
+#endif
+		float shininess = exp2(15.0 * (1.0 - roughness) + 1.0) * 0.25;
+		float phong = pow(cRdotV, shininess);
+		phong *= (shininess + 8.0) * (1.0 / (8.0 * M_PI));
+		float intensity = (phong) / max(4.0 * cNdotV * cNdotL, 0.75);
+		specular_light += light_color * intensity * attenuation * specular_amount;
+
+#elif defined(SPECULAR_TOON)
+		vec3 R = normalize(-reflect(L, N));
+		float RdotV = dot(R, V);
+		float mid = 1.0 - roughness;
+		mid *= mid;
+		float intensity = smoothstep(mid - roughness * 0.5, mid + roughness * 0.5, RdotV) * mid;
+		diffuse_light += light_color * intensity * attenuation * specular_amount; // write to diffuse_light, as in toon shading you generally want no reflection
+#elif defined(SPECULAR_DISABLED)
+		// none..
+
+#elif defined(SPECULAR_SCHLICK_GGX)
+		// Schlick + GGX
+
+#if defined(LIGHT_ANISOTROPY_USED)
+
+		float alpha_ggx = roughness * roughness;
+		float aspect = sqrt(1.0 - anisotropy * 0.9);
+		float ax = alpha_ggx / aspect;
+		float ay = alpha_ggx * aspect;
+		float XdotH = dot(T, H);
+		float YdotH = dot(B, H);
+		float D = D_GGX_anisotropic(cNdotH, ax, ay, XdotH, YdotH);
+		float G = G_GGX_anisotropic_2cos(cNdotL, ax, ay, XdotH, YdotH) * G_GGX_anisotropic_2cos(cNdotV, ax, ay, XdotH, YdotH);
+
+#else
+		float alpha_ggx = roughness * roughness;
+		float D = D_GGX(cNdotH, alpha_ggx);
+		float G = G_GGX_2cos(cNdotL, alpha_ggx) * G_GGX_2cos(cNdotV, alpha_ggx);
+#endif
+		// F
+		float cLdotH5 = SchlickFresnel(cLdotH);
+		vec3 F = mix(vec3(cLdotH5), vec3(1.0), f0);
+
+		vec3 specular_brdf_NL = cNdotL * D * F * G;
+
+		specular_light += specular_brdf_NL * light_color * attenuation * specular_amount;
+#endif
+
+#if defined(LIGHT_CLEARCOAT_USED)
+
+#if !defined(SPECULAR_SCHLICK_GGX)
+		float cLdotH5 = SchlickFresnel(cLdotH);
+#endif
+		float Dr = GTR1(cNdotH, mix(.1, .001, clearcoat_gloss));
+		float Fr = mix(.04, 1.0, cLdotH5);
+		float Gr = G_GGX_2cos(cNdotL, .25) * G_GGX_2cos(cNdotV, .25);
+
+		float clearcoat_specular_brdf_NL = 0.25 * clearcoat * Gr * Fr * Dr * cNdotL;
+
+		specular_light += clearcoat_specular_brdf_NL * light_color * attenuation * specular_amount;
+#endif
+	}
+
+#ifdef USE_SHADOW_TO_OPACITY
+	alpha = min(alpha, clamp(1.0 - attenuation, 0.0, 1.0));
+#endif
+
+#endif //defined(LIGHT_CODE_USED)
+}
+
+#ifndef USE_NO_SHADOWS
+
+// Interleaved Gradient Noise: maps a 2D position to a pseudo-random fraction, see
+// http://www.iryoku.com/next-generation-post-processing-in-call-of-duty-advanced-warfare
+float quick_hash(vec2 pos) {
+	float inner = fract(dot(pos, vec2(0.06711056f, 0.00583715f)));
+	return fract(52.9829189f * inner);
+}
+
+float sample_directional_pcf_shadow(texture2D shadow, vec2 shadow_pixel_size, vec4 coord) {
+	// Averages shadow comparisons over the directional soft shadow kernel, rotated per fragment.
+	vec2 center = coord.xy;
+	float compare_depth = coord.z;
+
+	if (scene_data.directional_soft_shadow_samples == 1) {
+		// A single sample is taken straight from the center; no rotation is needed.
+		return textureProj(sampler2DShadow(shadow, shadow_sampler), vec4(center, compare_depth, 1.0));
+	}
+
+	// Rotate the kernel by an angle hashed from the fragment position.
+	float angle = quick_hash(gl_FragCoord.xy) * 2.0 * M_PI;
+	float sin_a = sin(angle);
+	float cos_a = cos(angle);
+	mat2 disk_rotation = mat2(vec2(cos_a, -sin_a), vec2(sin_a, cos_a));
+
+	float lit_sum = 0.0;
+
+	for (uint i = 0; i < scene_data.directional_soft_shadow_samples; i++) {
+		vec2 tap = center + shadow_pixel_size * (disk_rotation * scene_data.directional_soft_shadow_kernel[i].xy);
+		lit_sum += textureProj(sampler2DShadow(shadow, shadow_sampler), vec4(tap, compare_depth, 1.0));
+	}
+
+	return lit_sum * (1.0 / float(scene_data.directional_soft_shadow_samples));
+}
+
+float sample_pcf_shadow(texture2D shadow, vec2 shadow_pixel_size, vec4 coord) {
+	// Averages shadow comparisons over the soft shadow kernel, rotated per fragment.
+	vec2 center = coord.xy;
+	float compare_depth = coord.z;
+
+	if (scene_data.soft_shadow_samples == 1) {
+		// A single sample is taken straight from the center; no rotation is needed.
+		return textureProj(sampler2DShadow(shadow, shadow_sampler), vec4(center, compare_depth, 1.0));
+	}
+
+	// Rotate the kernel by an angle hashed from the fragment position.
+	float angle = quick_hash(gl_FragCoord.xy) * 2.0 * M_PI;
+	float sin_a = sin(angle);
+	float cos_a = cos(angle);
+	mat2 disk_rotation = mat2(vec2(cos_a, -sin_a), vec2(sin_a, cos_a));
+
+	float lit_sum = 0.0;
+
+	for (uint i = 0; i < scene_data.soft_shadow_samples; i++) {
+		vec2 tap = center + shadow_pixel_size * (disk_rotation * scene_data.soft_shadow_kernel[i].xy);
+		lit_sum += textureProj(sampler2DShadow(shadow, shadow_sampler), vec4(tap, compare_depth, 1.0));
+	}
+
+	return lit_sum * (1.0 / float(scene_data.soft_shadow_samples));
+}
+
+float sample_directional_soft_shadow(texture2D shadow, vec3 pssm_coord, vec2 tex_scale) {
+	// Two-pass soft shadow: search for blockers, then filter with a disk scaled by the estimated penumbra.
+	// Returns 1.0 when no blocker is found.
+
+	// Rotate the kernel by an angle hashed from the fragment position.
+	float angle = quick_hash(gl_FragCoord.xy) * 2.0 * M_PI;
+	float sin_a = sin(angle);
+	float cos_a = cos(angle);
+	mat2 disk_rotation = mat2(vec2(cos_a, -sin_a), vec2(sin_a, cos_a));
+
+	// Pass 1: average the stored depth of every sample that is less than the receiver depth (pssm_coord.z).
+	float blocker_depth_sum = 0.0;
+	float blockers = 0.0;
+
+	for (uint i = 0; i < scene_data.directional_penumbra_shadow_samples; i++) {
+		vec2 suv = pssm_coord.xy + (disk_rotation * scene_data.directional_penumbra_shadow_kernel[i].xy) * tex_scale;
+		float d = textureLod(sampler2D(shadow, material_samplers[SAMPLER_LINEAR_CLAMP]), suv, 0.0).r;
+		if (d < pssm_coord.z) {
+			blocker_depth_sum += d;
+			blockers += 1.0;
+		}
+	}
+
+	if (!(blockers > 0.0)) {
+		//no blockers found, so no shadow
+		return 1.0;
+	}
+
+	// Pass 2: widen the disk in proportion to the receiver's distance behind the average blocker.
+	float blocker_mean = blocker_depth_sum / blockers;
+	float penumbra = (pssm_coord.z - blocker_mean) / blocker_mean;
+	tex_scale *= penumbra;
+
+	float lit_sum = 0.0;
+	for (uint i = 0; i < scene_data.directional_penumbra_shadow_samples; i++) {
+		vec2 suv = pssm_coord.xy + (disk_rotation * scene_data.directional_penumbra_shadow_kernel[i].xy) * tex_scale;
+		lit_sum += textureProj(sampler2DShadow(shadow, shadow_sampler), vec4(suv, pssm_coord.z, 1.0));
+	}
+
+	return lit_sum / float(scene_data.directional_penumbra_shadow_samples);
+}
+
+#endif //USE_NO_SHADOWS
+
+float get_omni_attenuation(float distance, float inv_range, float decay) {
+	// Range window max(1 - (distance * inv_range)^4, 0)^2 multiplied by the falloff distance^(-decay).
+	float ratio = distance * inv_range;
+	float ratio2 = ratio * ratio;
+	float window = max(1.0 - ratio2 * ratio2, 0.0);
+	float falloff = pow(max(distance, 0.0001), -decay); // the floor keeps the pow() base above zero
+	return window * window * falloff;
+}
+
+float light_process_omni_shadow(uint idx, vec3 vertex, vec3 normal) {
+#ifndef USE_NO_SHADOWS
+	if (omni_lights.data[idx].shadow_enabled) {
+		// Shadow-mapped light: returns the sampled shadow factor (every other path, including USE_NO_SHADOWS builds, returns 1.0).
+		// NOTE(review): the `normal` parameter is never read; the bias below uses `normal_interp`, which is not declared in this function.
+		vec3 light_rel_vec = omni_lights.data[idx].position - vertex;
+		float light_length = length(light_rel_vec); // NOTE(review): not read again in this function
+
+		vec4 v = vec4(vertex, 1.0);
+
+		vec4 splane = (omni_lights.data[idx].shadow_matrix * v);
+		float shadow_len = length(splane.xyz); //need to remember shadow len from here
+		// Re-project with the vertex pushed along the normal; the push shrinks to zero as the normal lines up with the light direction.
+		{
+			vec3 nofs = normal_interp * omni_lights.data[idx].shadow_normal_bias / omni_lights.data[idx].inv_radius;
+			nofs *= (1.0 - max(0.0, dot(normalize(light_rel_vec), normalize(normal_interp))));
+			v.xyz += nofs;
+			splane = (omni_lights.data[idx].shadow_matrix * v);
+		}
+
+		float shadow;
+
+#ifdef USE_SOFT_SHADOWS
+		if (omni_lights.data[idx].soft_shadow_size > 0.0) {
+			//soft shadow
+
+			//find blocker
+
+			float blocker_count = 0.0;
+			float blocker_average = 0.0;
+
+			mat2 disk_rotation;
+			{
+				float r = quick_hash(gl_FragCoord.xy) * 2.0 * M_PI;
+				float sr = sin(r);
+				float cr = cos(r);
+				disk_rotation = mat2(vec2(cr, -sr), vec2(sr, cr));
+			}
+			// Two vectors perpendicular to the projected direction span the sampling disk.
+			vec3 normal = normalize(splane.xyz); // shadows the `normal` parameter from here on
+			vec3 v0 = abs(normal.z) < 0.999 ? vec3(0.0, 0.0, 1.0) : vec3(0.0, 1.0, 0.0);
+			vec3 tangent = normalize(cross(v0, normal));
+			vec3 bitangent = normalize(cross(tangent, normal));
+			float z_norm = shadow_len * omni_lights.data[idx].inv_radius;
+			// Scale the disk by the light's soft shadow size.
+			tangent *= omni_lights.data[idx].soft_shadow_size * omni_lights.data[idx].soft_shadow_scale;
+			bitangent *= omni_lights.data[idx].soft_shadow_size * omni_lights.data[idx].soft_shadow_scale;
+
+			for (uint i = 0; i < scene_data.penumbra_shadow_samples; i++) {
+				vec2 disk = disk_rotation * scene_data.penumbra_shadow_kernel[i].xy;
+
+				vec3 pos = splane.xyz + tangent * disk.x + bitangent * disk.y;
+
+				pos = normalize(pos);
+				vec4 uv_rect = omni_lights.data[idx].atlas_rect;
+				// Pick a half of the atlas rect by the sign of z, then project (presumably a dual-paraboloid layout - confirm).
+				if (pos.z >= 0.0) {
+					pos.z += 1.0;
+					uv_rect.y += uv_rect.w;
+				} else {
+					pos.z = 1.0 - pos.z;
+				}
+
+				pos.xy /= pos.z;
+
+				pos.xy = pos.xy * 0.5 + 0.5;
+				pos.xy = uv_rect.xy + pos.xy * uv_rect.zw;
+
+				float d = textureLod(sampler2D(shadow_atlas, material_samplers[SAMPLER_LINEAR_CLAMP]), pos.xy, 0.0).r;
+				if (d < z_norm) {
+					blocker_average += d;
+					blocker_count += 1.0;
+				}
+			}
+
+			if (blocker_count > 0.0) {
+				//blockers found, do soft shadow
+				blocker_average /= blocker_count;
+				float penumbra = (z_norm - blocker_average) / blocker_average;
+				tangent *= penumbra;
+				bitangent *= penumbra;
+				// The depth bias is applied only now, after the penumbra estimate, for the comparison pass.
+				z_norm -= omni_lights.data[idx].inv_radius * omni_lights.data[idx].shadow_bias;
+
+				shadow = 0.0;
+				for (uint i = 0; i < scene_data.penumbra_shadow_samples; i++) {
+					vec2 disk = disk_rotation * scene_data.penumbra_shadow_kernel[i].xy;
+					vec3 pos = splane.xyz + tangent * disk.x + bitangent * disk.y;
+
+					pos = normalize(pos);
+					vec4 uv_rect = omni_lights.data[idx].atlas_rect;
+					// Same half selection and projection as in the blocker search above.
+					if (pos.z >= 0.0) {
+						pos.z += 1.0;
+						uv_rect.y += uv_rect.w;
+					} else {
+						pos.z = 1.0 - pos.z;
+					}
+
+					pos.xy /= pos.z;
+
+					pos.xy = pos.xy * 0.5 + 0.5;
+					pos.xy = uv_rect.xy + pos.xy * uv_rect.zw;
+					shadow += textureProj(sampler2DShadow(shadow_atlas, shadow_sampler), vec4(pos.xy, z_norm, 1.0));
+				}
+
+				shadow /= float(scene_data.penumbra_shadow_samples);
+
+			} else {
+				//no blockers found, so no shadow
+				shadow = 1.0;
+			}
+		} else {
+#endif
+			splane.xyz = normalize(splane.xyz);
+			vec4 clamp_rect = omni_lights.data[idx].atlas_rect;
+			// Same half selection and projection as the soft shadow path, applied in place to splane.
+			if (splane.z >= 0.0) {
+				splane.z += 1.0;
+
+				clamp_rect.y += clamp_rect.w;
+
+			} else {
+				splane.z = 1.0 - splane.z;
+			}
+
+			splane.xy /= splane.z;
+
+			splane.xy = splane.xy * 0.5 + 0.5;
+			splane.z = (shadow_len - omni_lights.data[idx].shadow_bias) * omni_lights.data[idx].inv_radius;
+			splane.xy = clamp_rect.xy + splane.xy * clamp_rect.zw;
+			splane.w = 1.0; //needed? i think it should be 1 already
+			shadow = sample_pcf_shadow(shadow_atlas, omni_lights.data[idx].soft_shadow_scale * scene_data.shadow_atlas_pixel_size, splane);
+#ifdef USE_SOFT_SHADOWS
+		}
+#endif
+
+		return shadow;
+	}
+#endif
+
+	return 1.0;
+}
+
+void light_process_omni(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 vertex_ddx, vec3 vertex_ddy, vec3 f0, uint orms, float shadow,
+#ifdef LIGHT_BACKLIGHT_USED
+		vec3 backlight,
+#endif
+#ifdef LIGHT_TRANSMITTANCE_USED
+		vec4 transmittance_color,
+		float transmittance_depth,
+		float transmittance_curve,
+		float transmittance_boost,
+#endif
+#ifdef LIGHT_RIM_USED
+		float rim, float rim_tint, vec3 rim_color,
+#endif
+#ifdef LIGHT_CLEARCOAT_USED
+		float clearcoat, float clearcoat_gloss,
+#endif
+#ifdef LIGHT_ANISOTROPY_USED
+		vec3 binormal, vec3 tangent, float anisotropy,
+#endif
+#ifdef USE_SHADOW_TO_OPACITY
+		inout float alpha,
+#endif
+		inout vec3 diffuse_light, inout vec3 specular_light) {
+	// Adds omni light idx's contribution at `vertex` via light_compute(), using distance attenuation multiplied by the given `shadow` factor.
+	vec3 light_rel_vec = omni_lights.data[idx].position - vertex;
+	float light_length = length(light_rel_vec);
+	float omni_attenuation = get_omni_attenuation(light_length, omni_lights.data[idx].inv_radius, omni_lights.data[idx].attenuation);
+	float light_attenuation = omni_attenuation;
+	vec3 color = omni_lights.data[idx].color;
+
+#ifdef USE_SOFT_SHADOWS
+	float size_A = 0.0;
+
+	if (omni_lights.data[idx].size > 0.0) {
+		float t = omni_lights.data[idx].size / max(0.001, light_length);
+		size_A = max(0.0, 1.0 - 1.0 / sqrt(1.0 + t * t));
+	}
+#endif
+
+#ifdef LIGHT_TRANSMITTANCE_USED
+	float transmittance_z = transmittance_depth; //no transmittance by default
+	transmittance_color.a *= light_attenuation;
+	{
+		vec4 clamp_rect = omni_lights.data[idx].atlas_rect;
+
+		//redo shadowmapping, but shrink the model a bit to avoid artifacts
+		vec4 splane = (omni_lights.data[idx].shadow_matrix * vec4(vertex - normalize(normal_interp) * omni_lights.data[idx].transmittance_bias, 1.0));
+
+		float shadow_len = length(splane.xyz); // declared here; no shadow_len exists in this function's scope otherwise
+		splane.xyz = normalize(splane.xyz); // only xyz is normalized; w is overwritten below
+
+		if (splane.z >= 0.0) {
+			splane.z += 1.0;
+		} else {
+			splane.z = 1.0 - splane.z;
+		}
+
+		splane.xy /= splane.z;
+		splane.xy = splane.xy * 0.5 + 0.5;
+		splane.z = shadow_len * omni_lights.data[idx].inv_radius;
+		splane.xy = clamp_rect.xy + splane.xy * clamp_rect.zw;
+		splane.w = 1.0; //needed? i think it should be 1 already
+
+		float shadow_z = textureLod(sampler2D(shadow_atlas, material_samplers[SAMPLER_LINEAR_CLAMP]), splane.xy, 0.0).r;
+		transmittance_z = (splane.z - shadow_z) / omni_lights.data[idx].inv_radius;
+	}
+#endif
+
+#if 0
+
+	if (omni_lights.data[idx].projector_rect != vec4(0.0)) {
+		vec3 local_v = (omni_lights.data[idx].shadow_matrix * vec4(vertex, 1.0)).xyz;
+		local_v = normalize(local_v);
+
+		vec4 atlas_rect = omni_lights.data[idx].projector_rect;
+
+		if (local_v.z >= 0.0) {
+			local_v.z += 1.0;
+			atlas_rect.y += atlas_rect.w;
+
+		} else {
+			local_v.z = 1.0 - local_v.z;
+		}
+
+		local_v.xy /= local_v.z;
+		local_v.xy = local_v.xy * 0.5 + 0.5;
+		vec2 proj_uv = local_v.xy * atlas_rect.zw;
+
+		vec2 proj_uv_ddx;
+		vec2 proj_uv_ddy;
+		{
+			vec3 local_v_ddx = (omni_lights.data[idx].shadow_matrix * vec4(vertex + vertex_ddx, 1.0)).xyz;
+			local_v_ddx = normalize(local_v_ddx);
+
+			if (local_v_ddx.z >= 0.0) {
+				local_v_ddx.z += 1.0;
+			} else {
+				local_v_ddx.z = 1.0 - local_v_ddx.z;
+			}
+
+			local_v_ddx.xy /= local_v_ddx.z;
+			local_v_ddx.xy = local_v_ddx.xy * 0.5 + 0.5;
+
+			proj_uv_ddx = local_v_ddx.xy * atlas_rect.zw - proj_uv;
+
+			vec3 local_v_ddy = (omni_lights.data[idx].shadow_matrix * vec4(vertex + vertex_ddy, 1.0)).xyz;
+			local_v_ddy = normalize(local_v_ddy);
+
+			if (local_v_ddy.z >= 0.0) {
+				local_v_ddy.z += 1.0;
+			} else {
+				local_v_ddy.z = 1.0 - local_v_ddy.z;
+			}
+
+			local_v_ddy.xy /= local_v_ddy.z;
+			local_v_ddy.xy = local_v_ddy.xy * 0.5 + 0.5;
+
+			proj_uv_ddy = local_v_ddy.xy * atlas_rect.zw - proj_uv;
+		}
+
+		vec4 proj = textureGrad(sampler2D(decal_atlas_srgb, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), proj_uv + atlas_rect.xy, proj_uv_ddx, proj_uv_ddy);
+		no_shadow = mix(no_shadow, proj.rgb, proj.a);
+	}
+#endif
+
+	light_attenuation *= shadow;
+
+	light_compute(normal, normalize(light_rel_vec), eye_vec, color, light_attenuation, f0, orms, omni_lights.data[idx].specular_amount,
+#ifdef LIGHT_BACKLIGHT_USED
+			backlight,
+#endif
+#ifdef LIGHT_TRANSMITTANCE_USED
+			transmittance_color,
+			transmittance_depth,
+			transmittance_curve,
+			transmittance_boost,
+			transmittance_z,
+#endif
+#ifdef LIGHT_RIM_USED
+			rim * omni_attenuation, rim_tint, rim_color,
+#endif
+#ifdef LIGHT_CLEARCOAT_USED
+			clearcoat, clearcoat_gloss,
+#endif
+#ifdef LIGHT_ANISOTROPY_USED
+			binormal, tangent, anisotropy,
+#endif
+#ifdef USE_SOFT_SHADOWS
+			size_A,
+#endif
+#ifdef USE_SHADOW_TO_OPACITY
+			alpha,
+#endif
+			diffuse_light,
+			specular_light);
+}
+
+float light_process_spot_shadow(uint idx, vec3 vertex, vec3 normal) { // Returns the shadow term for spot light idx at vertex; 1.0 when USE_NO_SHADOWS is defined or the light has shadow_enabled unset.
+#ifndef USE_NO_SHADOWS
+	if (spot_lights.data[idx].shadow_enabled) {
+		vec3 light_rel_vec = spot_lights.data[idx].position - vertex;
+		float light_length = length(light_rel_vec); // NOTE(review): never read in this function.
+		vec3 spot_dir = spot_lights.data[idx].direction;
+		// There is a shadow map: v is the lookup position, offset by the biases below.
+		vec4 v = vec4(vertex, 1.0);
+		// Constant bias: shift the lookup position against the spot direction.
+		v.xyz -= spot_dir * spot_lights.data[idx].shadow_bias;
+		// Distance from the light along the spot axis, scaled by inv_radius.
+		float z_norm = dot(spot_dir, -light_rel_vec) * spot_lights.data[idx].inv_radius;
+		// Normal bias. NOTE(review): it reads the global normal_interp; the normal parameter is unused.
+		float depth_bias_scale = 1.0 / (max(0.0001, z_norm)); // The closer to the light origin, the more you have to offset to reach 1px in the map.
+		vec3 normal_bias = normalize(normal_interp) * (1.0 - max(0.0, dot(spot_dir, -normalize(normal_interp)))) * spot_lights.data[idx].shadow_normal_bias * depth_bias_scale;
+		normal_bias -= spot_dir * dot(spot_dir, normal_bias); // Drop the component along spot_dir (only XY, no Z).
+		v.xyz += normal_bias;
+
+		// Recompute the normalized depth from the biased position.
+		z_norm = dot(spot_dir, v.xyz - spot_lights.data[idx].position) * spot_lights.data[idx].inv_radius;
+
+		float shadow;
+		// Project the biased position with the light's shadow matrix and apply the perspective divide.
+		vec4 splane = (spot_lights.data[idx].shadow_matrix * v);
+		splane /= splane.w;
+
+#ifdef USE_SOFT_SHADOWS
+		if (spot_lights.data[idx].soft_shadow_size > 0.0) {
+			// Soft shadow: two passes over the penumbra sample kernel.
+
+			// Pass 1: find blockers (stored depth < z_norm) and average their depth.
+
+			vec2 shadow_uv = splane.xy * spot_lights.data[idx].atlas_rect.zw + spot_lights.data[idx].atlas_rect.xy;
+
+			float blocker_count = 0.0;
+			float blocker_average = 0.0;
+
+			mat2 disk_rotation;
+			{ // Rotate the kernel by a per-fragment angle derived from quick_hash(gl_FragCoord.xy).
+				float r = quick_hash(gl_FragCoord.xy) * 2.0 * M_PI;
+				float sr = sin(r);
+				float cr = cos(r);
+				disk_rotation = mat2(vec2(cr, -sr), vec2(sr, cr));
+			}
+
+			float uv_size = spot_lights.data[idx].soft_shadow_size * z_norm * spot_lights.data[idx].soft_shadow_scale;
+			vec2 clamp_max = spot_lights.data[idx].atlas_rect.xy + spot_lights.data[idx].atlas_rect.zw;
+			for (uint i = 0; i < scene_data.penumbra_shadow_samples; i++) {
+				vec2 suv = shadow_uv + (disk_rotation * scene_data.penumbra_shadow_kernel[i].xy) * uv_size;
+				suv = clamp(suv, spot_lights.data[idx].atlas_rect.xy, clamp_max); // Keep the tap inside this light's atlas rect.
+				float d = textureLod(sampler2D(shadow_atlas, material_samplers[SAMPLER_LINEAR_CLAMP]), suv, 0.0).r;
+				if (d < z_norm) {
+					blocker_average += d;
+					blocker_count += 1.0;
+				}
+			}
+
+			if (blocker_count > 0.0) {
+				// Pass 2: blockers found; scale the kernel by the penumbra estimate and average the depth comparisons.
+				blocker_average /= blocker_count;
+				float penumbra = (z_norm - blocker_average) / blocker_average;
+				uv_size *= penumbra;
+
+				shadow = 0.0;
+				for (uint i = 0; i < scene_data.penumbra_shadow_samples; i++) {
+					vec2 suv = shadow_uv + (disk_rotation * scene_data.penumbra_shadow_kernel[i].xy) * uv_size;
+					suv = clamp(suv, spot_lights.data[idx].atlas_rect.xy, clamp_max);
+					shadow += textureProj(sampler2DShadow(shadow_atlas, shadow_sampler), vec4(suv, z_norm, 1.0));
+				}
+
+				shadow /= float(scene_data.penumbra_shadow_samples);
+
+			} else {
+				// No blockers found, so no shadow.
+				shadow = 1.0;
+			}
+
+		} else {
+#endif
+			// Hard shadow: delegate to sample_pcf_shadow() at the projected coordinate inside the light's atlas rect.
+			vec4 shadow_uv = vec4(splane.xy * spot_lights.data[idx].atlas_rect.zw + spot_lights.data[idx].atlas_rect.xy, splane.z, 1.0);
+
+			shadow = sample_pcf_shadow(shadow_atlas, spot_lights.data[idx].soft_shadow_scale * scene_data.shadow_atlas_pixel_size, shadow_uv);
+#ifdef USE_SOFT_SHADOWS
+		}
+#endif
+
+		return shadow;
+	}
+
+#endif //USE_NO_SHADOWS
+
+	return 1.0;
+}
+
+void light_process_spot(uint idx, vec3 vertex, vec3 eye_vec, vec3 normal, vec3 vertex_ddx, vec3 vertex_ddy, vec3 f0, uint orms, float shadow, // Adds spot light idx, attenuated by shadow, to diffuse_light/specular_light through light_compute().
+#ifdef LIGHT_BACKLIGHT_USED
+		vec3 backlight,
+#endif
+#ifdef LIGHT_TRANSMITTANCE_USED
+		vec4 transmittance_color,
+		float transmittance_depth,
+		float transmittance_curve,
+		float transmittance_boost,
+#endif
+#ifdef LIGHT_RIM_USED
+		float rim, float rim_tint, vec3 rim_color,
+#endif
+#ifdef LIGHT_CLEARCOAT_USED
+		float clearcoat, float clearcoat_gloss,
+#endif
+#ifdef LIGHT_ANISOTROPY_USED
+		vec3 binormal, vec3 tangent, float anisotropy,
+#endif
+#ifdef USE_SHADOW_TO_OPACITY
+		inout float alpha,
+#endif
+		inout vec3 diffuse_light,
+		inout vec3 specular_light) { // The optional parameters above must stay in the same order as the light_compute() call at the end.
+	vec3 light_rel_vec = spot_lights.data[idx].position - vertex;
+	float light_length = length(light_rel_vec);
+	float spot_attenuation = get_omni_attenuation(light_length, spot_lights.data[idx].inv_radius, spot_lights.data[idx].attenuation);
+	vec3 spot_dir = spot_lights.data[idx].direction;
+	float scos = max(dot(-normalize(light_rel_vec), spot_dir), spot_lights.data[idx].cone_angle); // Clamped so fragments outside the cone yield spot_rim == 1, i.e. zero attenuation factor.
+	float spot_rim = max(0.0001, (1.0 - scos) / (1.0 - spot_lights.data[idx].cone_angle));
+	spot_attenuation *= 1.0 - pow(spot_rim, spot_lights.data[idx].cone_attenuation);
+	float light_attenuation = spot_attenuation;
+	vec3 color = spot_lights.data[idx].color;
+	float specular_amount = spot_lights.data[idx].specular_amount;
+
+#ifdef USE_SOFT_SHADOWS
+	float size_A = 0.0;
+	// size_A is forwarded to light_compute(); it stays 0.0 for lights whose size is not positive.
+	if (spot_lights.data[idx].size > 0.0) {
+		float t = spot_lights.data[idx].size / max(0.001, light_length);
+		size_A = max(0.0, 1.0 - 1.0 / sqrt(1.0 + t * t));
+	}
+#endif
+
+	/*
+	if (spot_lights.data[idx].atlas_rect!=vec4(0.0)) {
+		//use projector texture
+	}
+	*/
+
+#ifdef LIGHT_TRANSMITTANCE_USED
+	float transmittance_z = transmittance_depth;
+	transmittance_color.a *= light_attenuation;
+	{
+		vec4 splane = (spot_lights.data[idx].shadow_matrix * vec4(vertex - normalize(normal_interp) * spot_lights.data[idx].transmittance_bias, 1.0)); // Shadow-map lookup at a position pushed inwards by transmittance_bias.
+		splane /= splane.w;
+		splane.xy = splane.xy * spot_lights.data[idx].atlas_rect.zw + spot_lights.data[idx].atlas_rect.xy;
+
+		float shadow_z = textureLod(sampler2D(shadow_atlas, material_samplers[SAMPLER_LINEAR_CLAMP]), splane.xy, 0.0).r;
+		//reconstruct depth
+		shadow_z /= spot_lights.data[idx].inv_radius;
+		//distance to light plane
+		float z = dot(spot_dir, -light_rel_vec);
+		transmittance_z = z - shadow_z;
+	}
+#endif //LIGHT_TRANSMITTANCE_USED
+
+	light_attenuation *= shadow;
+	// Rim lighting below is scaled by spot_attenuation, i.e. the attenuation before the shadow term was applied.
+	light_compute(normal, normalize(light_rel_vec), eye_vec, color, light_attenuation, f0, orms, specular_amount,
+#ifdef LIGHT_BACKLIGHT_USED
+			backlight,
+#endif
+#ifdef LIGHT_TRANSMITTANCE_USED
+			transmittance_color,
+			transmittance_depth,
+			transmittance_curve,
+			transmittance_boost,
+			transmittance_z,
+#endif
+#ifdef LIGHT_RIM_USED
+			rim * spot_attenuation, rim_tint, rim_color,
+#endif
+#ifdef LIGHT_CLEARCOAT_USED
+			clearcoat, clearcoat_gloss,
+#endif
+#ifdef LIGHT_ANISOTROPY_USED
+			binormal, tangent, anisotropy,
+#endif
+#ifdef USE_SOFT_SHADOWS
+			size_A,
+#endif
+#ifdef USE_SHADOW_TO_OPACITY
+			alpha,
+#endif
+			diffuse_light, specular_light);
+}
+
+void reflection_process(uint ref_index, vec3 vertex, vec3 normal, float roughness, vec3 ambient_light, vec3 specular_light, inout vec4 ambient_accum, inout vec4 reflection_accum) { // Adds probe ref_index to both accumulators; rgb is pre-multiplied by the blend weight stored in .a.
+	vec3 box_extents = reflections.data[ref_index].box_extents;
+	vec3 local_pos = (reflections.data[ref_index].local_matrix * vec4(vertex, 1.0)).xyz;
+	// Fragments outside the probe's box contribute nothing.
+	if (any(greaterThan(abs(local_pos), box_extents))) { //out of the reflection box
+		return;
+	}
+
+	vec3 ref_vec = normalize(reflect(vertex, normal));
+	// blend weight: 1.0 at the box center, falling to 0.0 on the box faces.
+	vec3 inner_pos = abs(local_pos / box_extents);
+	float blend = max(inner_pos.x, max(inner_pos.y, inner_pos.z));
+	//make blend more rounded
+	blend = mix(length(inner_pos), blend, blend);
+	blend *= blend;
+	blend = max(0.0, 1.0 - blend);
+
+	if (reflections.data[ref_index].intensity > 0.0) { // compute reflection
+
+		vec3 local_ref_vec = (reflections.data[ref_index].local_matrix * vec4(ref_vec, 0.0)).xyz;
+
+		if (reflections.data[ref_index].box_project) { //box project
+			// Intersect the reflection ray with the box faces and re-aim the lookup at the hit point, relative to box_offset.
+			vec3 nrdir = normalize(local_ref_vec);
+			vec3 rbmax = (box_extents - local_pos) / nrdir;
+			vec3 rbmin = (-box_extents - local_pos) / nrdir;
+
+			vec3 rbminmax = mix(rbmin, rbmax, greaterThan(nrdir, vec3(0.0, 0.0, 0.0)));
+
+			float fa = min(min(rbminmax.x, rbminmax.y), rbminmax.z);
+			vec3 posonbox = local_pos + nrdir * fa;
+			local_ref_vec = posonbox - reflections.data[ref_index].box_offset;
+		}
+
+		vec4 reflection;
+		// Rougher surfaces read a higher LOD of the probe's layer in the reflection atlas.
+		reflection.rgb = textureLod(samplerCubeArray(reflection_atlas, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), vec4(local_ref_vec, reflections.data[ref_index].index), roughness * MAX_ROUGHNESS_LOD).rgb;
+		// Exterior probes fade towards the incoming specular_light as blend drops.
+		if (reflections.data[ref_index].exterior) {
+			reflection.rgb = mix(specular_light, reflection.rgb, blend);
+		}
+
+		reflection.rgb *= reflections.data[ref_index].intensity; //intensity
+		reflection.a = blend;
+		reflection.rgb *= reflection.a;
+
+		reflection_accum += reflection;
+	}
+	// Ambient contribution, selected by the probe's ambient_mode.
+	switch (reflections.data[ref_index].ambient_mode) {
+		case REFLECTION_AMBIENT_DISABLED: {
+			//do nothing
+		} break;
+		case REFLECTION_AMBIENT_ENVIRONMENT: {
+			// Ambient comes from the probe itself, sampled along the normal at the highest roughness LOD.
+			vec3 local_amb_vec = (reflections.data[ref_index].local_matrix * vec4(normal, 0.0)).xyz;
+
+			vec4 ambient_out;
+
+			ambient_out.rgb = textureLod(samplerCubeArray(reflection_atlas, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), vec4(local_amb_vec, reflections.data[ref_index].index), MAX_ROUGHNESS_LOD).rgb;
+			ambient_out.a = blend;
+			if (reflections.data[ref_index].exterior) {
+				ambient_out.rgb = mix(ambient_light, ambient_out.rgb, blend);
+			}
+
+			ambient_out.rgb *= ambient_out.a;
+			ambient_accum += ambient_out;
+		} break;
+		case REFLECTION_AMBIENT_COLOR: { // Ambient is the probe's constant ambient color.
+			vec4 ambient_out;
+			ambient_out.a = blend;
+			ambient_out.rgb = reflections.data[ref_index].ambient;
+			if (reflections.data[ref_index].exterior) {
+				ambient_out.rgb = mix(ambient_light, ambient_out.rgb, blend);
+			}
+			ambient_out.rgb *= ambient_out.a;
+			ambient_accum += ambient_out;
+		} break;
+	}
+}
+
+float blur_shadow(float shadow) {
+	// Currently a pass-through: the derivative-based blur below is compiled out.
+#if 0
+	// Disabled for now, will investigate later.
+	float quad_shadow = gl_HelperInvocation ? -4.0 : shadow; // technically anything below -4 will do but just to make sure
+
+	uvec2 pixel = uvec2(gl_FragCoord.xy);
+	float odd_x = float(pixel.x & 1) - 0.5;
+	float odd_y = float(pixel.y & 1) - 0.5;
+	quad_shadow -= dFdx(quad_shadow) * odd_x;
+	quad_shadow -= dFdy(quad_shadow) * odd_y;
+
+	if (quad_shadow >= 0.0) {
+		shadow = quad_shadow;
+	}
+#endif
+
+	return shadow;
+}
diff --git a/servers/rendering/renderer_rd/shaders/scene_forward_mobile.glsl b/servers/rendering/renderer_rd/shaders/scene_forward_mobile.glsl
new file mode 100644
index 0000000000..b38b8d803d
--- /dev/null
+++ b/servers/rendering/renderer_rd/shaders/scene_forward_mobile.glsl
@@ -0,0 +1,1476 @@
+#[vertex]
+
+#version 450
+
+#VERSION_DEFINES
+
+/* Include our forward mobile UBOs definitions etc. */
+#include "scene_forward_mobile_inc.glsl"
+
+/* INPUT ATTRIBS */
+
+layout(location = 0) in vec3 vertex_attrib;
+
+// Position is the only unconditional attribute (enough for a pure depth pass); each attribute below is compiled in only when its define is set.
+
+#ifdef NORMAL_USED
+layout(location = 1) in vec3 normal_attrib;
+#endif
+
+#if defined(TANGENT_USED) || defined(NORMAL_MAP_USED) || defined(LIGHT_ANISOTROPY_USED)
+layout(location = 2) in vec4 tangent_attrib;
+#endif
+
+#if defined(COLOR_USED)
+layout(location = 3) in vec4 color_attrib;
+#endif
+
+#ifdef UV_USED
+layout(location = 4) in vec2 uv_attrib;
+#endif
+
+#if defined(UV2_USED) || defined(USE_LIGHTMAP) || defined(MODE_RENDER_MATERIAL)
+layout(location = 5) in vec2 uv2_attrib;
+#endif // UV2_USED || USE_LIGHTMAP || MODE_RENDER_MATERIAL
+
+#if defined(CUSTOM0_USED)
+layout(location = 6) in vec4 custom0_attrib;
+#endif
+
+#if defined(CUSTOM1_USED)
+layout(location = 7) in vec4 custom1_attrib;
+#endif
+
+#if defined(CUSTOM2_USED)
+layout(location = 8) in vec4 custom2_attrib;
+#endif
+
+#if defined(CUSTOM3_USED)
+layout(location = 9) in vec4 custom3_attrib;
+#endif
+
+#if defined(BONES_USED) || defined(USE_PARTICLE_TRAILS)
+layout(location = 10) in uvec4 bone_attrib;
+#endif
+
+#if defined(WEIGHTS_USED) || defined(USE_PARTICLE_TRAILS)
+layout(location = 11) in vec4 weight_attrib;
+#endif
+
+/* Varyings */
+
+layout(location = 0) out vec3 vertex_interp;
+
+#ifdef NORMAL_USED
+layout(location = 1) out vec3 normal_interp;
+#endif
+
+#if defined(COLOR_USED)
+layout(location = 2) out vec4 color_interp;
+#endif
+
+#ifdef UV_USED
+layout(location = 3) out vec2 uv_interp;
+#endif
+
+#if defined(UV2_USED) || defined(USE_LIGHTMAP)
+layout(location = 4) out vec2 uv2_interp;
+#endif
+
+#if defined(TANGENT_USED) || defined(NORMAL_MAP_USED) || defined(LIGHT_ANISOTROPY_USED)
+layout(location = 5) out vec3 tangent_interp;
+layout(location = 6) out vec3 binormal_interp;
+#endif
+
+#ifdef MATERIAL_UNIFORMS_USED
+layout(set = MATERIAL_UNIFORM_SET, binding = 0, std140) uniform MaterialUniforms{
+
+#MATERIAL_UNIFORMS
+
+} material;
+#endif
+
+#ifdef MODE_DUAL_PARABOLOID
+
+layout(location = 8) out float dp_clip;
+
+#endif
+
+invariant gl_Position;
+
+#GLOBALS
+
+void main() { // Vertex stage: builds the world/normal matrices (multimesh instances included), runs the user vertex code, then writes the varyings and gl_Position.
+	vec4 instance_custom = vec4(0.0);
+#if defined(COLOR_USED)
+	color_interp = color_attrib;
+#endif
+
+	bool is_multimesh = bool(draw_call.flags & INSTANCE_FLAGS_MULTIMESH);
+
+	mat4 world_matrix = draw_call.transform;
+	// NOTE(review): the non-uniform-scale branch uses inverse() without transpose(); a normal matrix is conventionally transpose(inverse(M)) - confirm.
+	mat3 world_normal_matrix;
+	if (bool(draw_call.flags & INSTANCE_FLAGS_NON_UNIFORM_SCALE)) {
+		world_normal_matrix = inverse(mat3(world_matrix));
+	} else {
+		world_normal_matrix = mat3(world_matrix);
+	}
+
+	if (is_multimesh) {
+		// Multimesh: fetch the per-instance transform (plus optional color and custom data) from transforms.data.
+
+		mat4 matrix;
+
+#ifdef USE_PARTICLE_TRAILS
+		uint trail_size = (draw_call.flags >> INSTANCE_FLAGS_PARTICLE_TRAIL_SHIFT) & INSTANCE_FLAGS_PARTICLE_TRAIL_MASK;
+		uint stride = 3 + 1 + 1; //particles always uses this format
+
+		uint offset = trail_size * stride * gl_InstanceIndex;
+		// Blend up to four trail transforms (and colors) using bone_attrib as indices and weight_attrib as weights.
+#ifdef COLOR_USED
+		vec4 pcolor;
+#endif
+		{
+			uint boffset = offset + bone_attrib.x * stride;
+			matrix = mat4(transforms.data[boffset + 0], transforms.data[boffset + 1], transforms.data[boffset + 2], vec4(0.0, 0.0, 0.0, 1.0)) * weight_attrib.x;
+#ifdef COLOR_USED
+			pcolor = transforms.data[boffset + 3] * weight_attrib.x;
+#endif
+		}
+		if (weight_attrib.y > 0.001) {
+			uint boffset = offset + bone_attrib.y * stride;
+			matrix += mat4(transforms.data[boffset + 0], transforms.data[boffset + 1], transforms.data[boffset + 2], vec4(0.0, 0.0, 0.0, 1.0)) * weight_attrib.y;
+#ifdef COLOR_USED
+			pcolor += transforms.data[boffset + 3] * weight_attrib.y;
+#endif
+		}
+		if (weight_attrib.z > 0.001) {
+			uint boffset = offset + bone_attrib.z * stride;
+			matrix += mat4(transforms.data[boffset + 0], transforms.data[boffset + 1], transforms.data[boffset + 2], vec4(0.0, 0.0, 0.0, 1.0)) * weight_attrib.z;
+#ifdef COLOR_USED
+			pcolor += transforms.data[boffset + 3] * weight_attrib.z;
+#endif
+		}
+		if (weight_attrib.w > 0.001) {
+			uint boffset = offset + bone_attrib.w * stride;
+			matrix += mat4(transforms.data[boffset + 0], transforms.data[boffset + 1], transforms.data[boffset + 2], vec4(0.0, 0.0, 0.0, 1.0)) * weight_attrib.w;
+#ifdef COLOR_USED
+			pcolor += transforms.data[boffset + 3] * weight_attrib.w;
+#endif
+		}
+
+		instance_custom = transforms.data[offset + 4];
+
+#ifdef COLOR_USED
+		color_interp *= pcolor;
+#endif
+
+#else
+		uint stride = 0;
+		{
+			//TODO implement a small lookup table for the stride
+			if (bool(draw_call.flags & INSTANCE_FLAGS_MULTIMESH_FORMAT_2D)) {
+				stride += 2;
+			} else {
+				stride += 3;
+			}
+			if (bool(draw_call.flags & INSTANCE_FLAGS_MULTIMESH_HAS_COLOR)) {
+				stride += 1;
+			}
+			if (bool(draw_call.flags & INSTANCE_FLAGS_MULTIMESH_HAS_CUSTOM_DATA)) {
+				stride += 1;
+			}
+		}
+
+		uint offset = stride * gl_InstanceIndex;
+		// Read the instance record in the same order the stride was accumulated: transform rows, color, custom data.
+		if (bool(draw_call.flags & INSTANCE_FLAGS_MULTIMESH_FORMAT_2D)) {
+			matrix = mat4(transforms.data[offset + 0], transforms.data[offset + 1], vec4(0.0, 0.0, 1.0, 0.0), vec4(0.0, 0.0, 0.0, 1.0));
+			offset += 2;
+		} else {
+			matrix = mat4(transforms.data[offset + 0], transforms.data[offset + 1], transforms.data[offset + 2], vec4(0.0, 0.0, 0.0, 1.0));
+			offset += 3;
+		}
+
+		if (bool(draw_call.flags & INSTANCE_FLAGS_MULTIMESH_HAS_COLOR)) {
+#ifdef COLOR_USED
+			color_interp *= transforms.data[offset];
+#endif
+			offset += 1;
+		}
+
+		if (bool(draw_call.flags & INSTANCE_FLAGS_MULTIMESH_HAS_CUSTOM_DATA)) {
+			instance_custom = transforms.data[offset];
+		}
+
+#endif
+		// The vec4s fetched above were placed as columns; transpose before composing with the draw-call transform.
+		matrix = transpose(matrix);
+		world_matrix = world_matrix * matrix;
+		world_normal_matrix = world_normal_matrix * mat3(matrix);
+	}
+
+	vec3 vertex = vertex_attrib;
+#ifdef NORMAL_USED
+	vec3 normal = normal_attrib * 2.0 - 1.0;
+#endif
+
+#if defined(TANGENT_USED) || defined(NORMAL_MAP_USED) || defined(LIGHT_ANISOTROPY_USED)
+	vec3 tangent = tangent_attrib.xyz * 2.0 - 1.0;
+	float binormalf = tangent_attrib.a * 2.0 - 1.0;
+	vec3 binormal = normalize(cross(normal, tangent) * binormalf);
+#endif
+
+#ifdef UV_USED
+	uv_interp = uv_attrib;
+#endif
+
+#if defined(UV2_USED) || defined(USE_LIGHTMAP)
+	uv2_interp = uv2_attrib;
+#endif
+
+#ifdef OVERRIDE_POSITION
+	vec4 position;
+#endif
+
+	mat4 projection_matrix = scene_data.projection_matrix;
+
+// Using world coordinates: move the inputs to world space before the user vertex code runs.
+#if !defined(SKIP_TRANSFORM_USED) && defined(VERTEX_WORLD_COORDS_USED)
+
+	vertex = (world_matrix * vec4(vertex, 1.0)).xyz;
+#ifdef NORMAL_USED
+	normal = world_normal_matrix * normal;
+#endif
+#if defined(TANGENT_USED) || defined(NORMAL_MAP_USED) || defined(LIGHT_ANISOTROPY_USED)
+
+	tangent = world_normal_matrix * tangent;
+	binormal = world_normal_matrix * binormal;
+
+#endif
+#endif
+
+	float roughness = 1.0;
+
+	mat4 modelview = scene_data.inv_camera_matrix * world_matrix;
+	mat3 modelview_normal = mat3(scene_data.inv_camera_matrix) * world_normal_matrix;
+
+	{
+#CODE : VERTEX
+	}
+
+	/* output */
+
+// using local coordinates (default)
+#if !defined(SKIP_TRANSFORM_USED) && !defined(VERTEX_WORLD_COORDS_USED)
+
+	vertex = (modelview * vec4(vertex, 1.0)).xyz;
+#ifdef NORMAL_USED
+	normal = modelview_normal * normal;
+#endif
+
+#endif
+
+#if defined(TANGENT_USED) || defined(NORMAL_MAP_USED) || defined(LIGHT_ANISOTROPY_USED)
+	// NOTE(review): this runs in every transform mode, including the world-coordinates path that transforms both vectors again below - confirm intended.
+	binormal = modelview_normal * binormal;
+	tangent = modelview_normal * tangent;
+#endif
+
+// Using world coordinates: bring the world-space results into view space.
+#if !defined(SKIP_TRANSFORM_USED) && defined(VERTEX_WORLD_COORDS_USED)
+	vertex = (scene_data.inv_camera_matrix * vec4(vertex, 1.0)).xyz;
+#ifdef NORMAL_USED
+	normal = mat3(scene_data.inverse_normal_matrix) * normal;
+#endif
+#if defined(TANGENT_USED) || defined(NORMAL_MAP_USED) || defined(LIGHT_ANISOTROPY_USED)
+
+	binormal = mat3(scene_data.camera_inverse_binormal_matrix) * binormal;
+	tangent = mat3(scene_data.camera_inverse_tangent_matrix) * tangent;
+#endif
+#endif
+
+	vertex_interp = vertex;
+#ifdef NORMAL_USED
+	normal_interp = normal;
+#endif
+
+#if defined(TANGENT_USED) || defined(NORMAL_MAP_USED) || defined(LIGHT_ANISOTROPY_USED)
+	tangent_interp = tangent;
+	binormal_interp = binormal;
+#endif
+
+#ifdef MODE_RENDER_DEPTH
+
+#ifdef MODE_DUAL_PARABOLOID
+
+	vertex_interp.z *= scene_data.dual_paraboloid_side;
+
+	dp_clip = vertex_interp.z; //this attempts to avoid noise caused by objects sent to the other paraboloid side due to bias
+
+	//for dual paraboloid shadow mapping, this is the fastest but least correct way, as it curves straight edges
+
+	vec3 vtx = vertex_interp;
+	float distance = length(vtx);
+	vtx = normalize(vtx);
+	vtx.xy /= 1.0 - vtx.z;
+	vtx.z = (distance / scene_data.z_far);
+	vtx.z = vtx.z * 2.0 - 1.0;
+	vertex_interp = vtx;
+
+#endif
+
+#endif //MODE_RENDER_DEPTH
+
+#ifdef OVERRIDE_POSITION
+	gl_Position = position;
+#else
+	gl_Position = projection_matrix * vec4(vertex_interp, 1.0);
+#endif // OVERRIDE_POSITION
+
+#ifdef MODE_RENDER_DEPTH
+	if (scene_data.pancake_shadows) {
+		if (gl_Position.z <= 0.00001) {
+			gl_Position.z = 0.00001;
+		}
+	}
+#endif // MODE_RENDER_DEPTH
+#ifdef MODE_RENDER_MATERIAL
+	if (scene_data.material_uv2_mode) {
+		vec2 uv_offset = draw_call.lightmap_uv_scale.xy; // we are abusing lightmap_uv_scale here, we shouldn't have a lightmap during a depth pass...
+		gl_Position.xy = (uv2_attrib.xy + uv_offset) * 2.0 - 1.0;
+		gl_Position.z = 0.00001;
+		gl_Position.w = 1.0;
+	}
+#endif // MODE_RENDER_MATERIAL
+}
+
+#[fragment]
+
+#version 450
+
+#VERSION_DEFINES
+
+/* Include our forward mobile UBOs definitions etc. */
+#include "scene_forward_mobile_inc.glsl"
+
+/* Varyings */
+
+layout(location = 0) in vec3 vertex_interp;
+
+#ifdef NORMAL_USED
+layout(location = 1) in vec3 normal_interp;
+#endif
+
+#if defined(COLOR_USED)
+layout(location = 2) in vec4 color_interp;
+#endif
+
+#ifdef UV_USED
+layout(location = 3) in vec2 uv_interp;
+#endif
+
+#if defined(UV2_USED) || defined(USE_LIGHTMAP)
+layout(location = 4) in vec2 uv2_interp;
+#endif
+
+#if defined(TANGENT_USED) || defined(NORMAL_MAP_USED) || defined(LIGHT_ANISOTROPY_USED)
+layout(location = 5) in vec3 tangent_interp;
+layout(location = 6) in vec3 binormal_interp;
+#endif
+
+#ifdef MODE_DUAL_PARABOLOID
+
+layout(location = 8) in float dp_clip;
+
+#endif
+
+//defines to keep compatibility with vertex
+
+#define world_matrix draw_call.transform
+#define projection_matrix scene_data.projection_matrix
+
+#if defined(ENABLE_SSS) && defined(ENABLE_TRANSMITTANCE)
+//both required for transmittance to be enabled
+#define LIGHT_TRANSMITTANCE_USED
+#endif
+
+#ifdef MATERIAL_UNIFORMS_USED
+layout(set = MATERIAL_UNIFORM_SET, binding = 0, std140) uniform MaterialUniforms{
+
+#MATERIAL_UNIFORMS
+
+} material;
+#endif
+
+#GLOBALS
+
+/* clang-format on */
+
+#ifdef MODE_RENDER_DEPTH
+
+#ifdef MODE_RENDER_MATERIAL
+
+layout(location = 0) out vec4 albedo_output_buffer;
+layout(location = 1) out vec4 normal_output_buffer;
+layout(location = 2) out vec4 orm_output_buffer;
+layout(location = 3) out vec4 emission_output_buffer;
+layout(location = 4) out float depth_output_buffer;
+
+#endif // MODE_RENDER_MATERIAL
+
+#else // !MODE_RENDER_DEPTH
+
+#ifdef MODE_MULTIPLE_RENDER_TARGETS
+
+layout(location = 0) out vec4 diffuse_buffer; //diffuse (rgb) and roughness
+layout(location = 1) out vec4 specular_buffer; //specular and SSS (subsurface scatter)
+#else
+
+layout(location = 0) out vec4 frag_color;
+#endif // MODE_MULTIPLE_RENDER_TARGETS
+
+#endif // MODE_RENDER_DEPTH
+
+#include "scene_forward_aa_inc.glsl"
+
+#if !defined(MODE_RENDER_DEPTH) && !defined(MODE_UNSHADED)
+
+#include "scene_forward_lights_inc.glsl"
+
+#endif //!defined(MODE_RENDER_DEPTH) && !defined(MODE_UNSHADED)
+
+#ifndef MODE_RENDER_DEPTH
+
+/*
+	Only supporting normal fog here.
+*/
+
+vec4 fog_process(vec3 vertex) { // Returns vec4(fog color, fog amount) for the position `vertex`, using the fog settings in scene_data.
+	vec3 fog_color = scene_data.fog_light_color;
+	// Aerial perspective: blend the fog color towards the radiance cubemap sampled along the view vector.
+	if (scene_data.fog_aerial_perspective > 0.0) {
+		vec3 sky_fog_color = vec3(0.0);
+		vec3 cube_view = scene_data.radiance_inverse_xform * vertex;
+		// mip_level always reads from the second mipmap and higher so the fog is always slightly blurred
+		float mip_level = mix(1.0 / MAX_ROUGHNESS_LOD, 1.0, 1.0 - (abs(vertex.z) - scene_data.z_near) / (scene_data.z_far - scene_data.z_near));
+#ifdef USE_RADIANCE_CUBEMAP_ARRAY
+		float lod, blend;
+		blend = modf(mip_level * MAX_ROUGHNESS_LOD, lod);
+		sky_fog_color = texture(samplerCubeArray(radiance_cubemap, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), vec4(cube_view, lod)).rgb;
+		sky_fog_color = mix(sky_fog_color, texture(samplerCubeArray(radiance_cubemap, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), vec4(cube_view, lod + 1)).rgb, blend);
+#else
+		sky_fog_color = textureLod(samplerCube(radiance_cubemap, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), cube_view, mip_level * MAX_ROUGHNESS_LOD).rgb;
+#endif //USE_RADIANCE_CUBEMAP_ARRAY
+		fog_color = mix(fog_color, sky_fog_color, scene_data.fog_aerial_perspective);
+	}
+
+	if (scene_data.fog_sun_scatter > 0.001) {
+		// Sun scatter: every directional light adds its color * energy, weighted by
+		// max(dot(view, light direction), 0)^8 and by fog_sun_scatter.
+		vec3 view = normalize(vertex);
+
+		for (uint i = 0; i < scene_data.directional_light_count; i++) {
+			vec3 light_color = directional_lights.data[i].color * directional_lights.data[i].energy;
+			float light_amount = pow(max(dot(view, directional_lights.data[i].direction), 0.0), 8.0);
+			fog_color += light_color * light_amount * scene_data.fog_sun_scatter;
+		}
+	}
+	// Depth fog: exponential in vertex.z * fog_density; min() keeps the exponent non-positive so the amount stays in [0, 1).
+	float fog_amount = 1.0 - exp(min(0.0, vertex.z * scene_data.fog_density));
+
+	if (abs(scene_data.fog_height_density) > 0.001) {
+		float y = (scene_data.camera_matrix * vec4(vertex, 1.0)).y;
+
+		float y_dist = scene_data.fog_height - y;
+
+		float vfog_amount = clamp(exp(y_dist * scene_data.fog_height_density), 0.0, 1.0);
+		// Height fog and depth fog are combined by taking the larger of the two amounts.
+		fog_amount = max(vfog_amount, fog_amount);
+	}
+
+	return vec4(fog_color, fog_amount);
+}
+
+#endif //!MODE_RENDER_DEPTH
+
+void main() {
+#ifdef MODE_DUAL_PARABOLOID
+
+	if (dp_clip > 0.0)
+		discard;
+#endif
+
+	//lay out everything, whatever is unused is optimized away anyway
+	vec3 vertex = vertex_interp;
+	vec3 view = -normalize(vertex_interp);
+	vec3 albedo = vec3(1.0);
+	vec3 backlight = vec3(0.0);
+	vec4 transmittance_color = vec4(0.0);
+	float transmittance_depth = 0.0;
+	float transmittance_curve = 1.0;
+	float transmittance_boost = 0.0;
+	float metallic = 0.0;
+	float specular = 0.5;
+	vec3 emission = vec3(0.0);
+	float roughness = 1.0;
+	float rim = 0.0;
+	float rim_tint = 0.0;
+	float clearcoat = 0.0;
+	float clearcoat_gloss = 0.0;
+	float anisotropy = 0.0;
+	vec2 anisotropy_flow = vec2(1.0, 0.0);
+	vec4 fog = vec4(0.0);
+#if defined(CUSTOM_RADIANCE_USED)
+	vec4 custom_radiance = vec4(0.0);
+#endif
+#if defined(CUSTOM_IRRADIANCE_USED)
+	vec4 custom_irradiance = vec4(0.0);
+#endif
+
+	float ao = 1.0;
+	float ao_light_affect = 0.0;
+
+	float alpha = 1.0;
+
+#if defined(TANGENT_USED) || defined(NORMAL_MAP_USED) || defined(LIGHT_ANISOTROPY_USED)
+	vec3 binormal = normalize(binormal_interp);
+	vec3 tangent = normalize(tangent_interp);
+#else
+	vec3 binormal = vec3(0.0);
+	vec3 tangent = vec3(0.0);
+#endif
+
+#ifdef NORMAL_USED
+	vec3 normal = normalize(normal_interp);
+
+#if defined(DO_SIDE_CHECK)
+	if (!gl_FrontFacing) {
+		normal = -normal;
+	}
+#endif
+
+#endif //NORMAL_USED
+
+#ifdef UV_USED
+	vec2 uv = uv_interp;
+#endif
+
+#if defined(UV2_USED) || defined(USE_LIGHTMAP)
+	vec2 uv2 = uv2_interp;
+#endif
+
+#if defined(COLOR_USED)
+	vec4 color = color_interp;
+#endif
+
+#if defined(NORMAL_MAP_USED)
+
+	vec3 normal_map = vec3(0.5);
+#endif
+
+	float normal_map_depth = 1.0;
+
+	vec2 screen_uv = gl_FragCoord.xy * scene_data.screen_pixel_size + scene_data.screen_pixel_size * 0.5; //account for center
+
+	float sss_strength = 0.0;
+
+#ifdef ALPHA_SCISSOR_USED
+	float alpha_scissor_threshold = 1.0;
+#endif // ALPHA_SCISSOR_USED
+
+#ifdef ALPHA_HASH_USED
+	float alpha_hash_scale = 1.0;
+#endif // ALPHA_HASH_USED
+
+#ifdef ALPHA_ANTIALIASING_EDGE_USED
+	float alpha_antialiasing_edge = 0.0;
+	vec2 alpha_texture_coordinate = vec2(0.0, 0.0);
+#endif // ALPHA_ANTIALIASING_EDGE_USED
+
+	{
+#CODE : FRAGMENT
+	}
+
+#ifdef LIGHT_TRANSMITTANCE_USED
+#ifdef SSS_MODE_SKIN
+	transmittance_color.a = sss_strength;
+#else
+	transmittance_color.a *= sss_strength;
+#endif
+#endif
+
+#ifndef USE_SHADOW_TO_OPACITY
+
+#ifdef ALPHA_SCISSOR_USED
+	if (alpha < alpha_scissor_threshold) {
+		discard;
+	}
+#endif // ALPHA_SCISSOR_USED
+
+// alpha hash can be used in unison with alpha antialiasing
+#ifdef ALPHA_HASH_USED
+	if (alpha < compute_alpha_hash_threshold(vertex, alpha_hash_scale)) {
+		discard;
+	}
+#endif // ALPHA_HASH_USED
+
+// If we are not edge antialiasing, we need to remove the output alpha channel from scissor and hash
+#if (defined(ALPHA_SCISSOR_USED) || defined(ALPHA_HASH_USED)) && !defined(ALPHA_ANTIALIASING_EDGE_USED)
+	alpha = 1.0;
+#endif
+
+#ifdef ALPHA_ANTIALIASING_EDGE_USED
+// If alpha scissor is used, we must further the edge threshold, otherwise we wont get any edge feather
+#ifdef ALPHA_SCISSOR_USED
+	alpha_antialiasing_edge = clamp(alpha_scissor_threshold + alpha_antialiasing_edge, 0.0, 1.0);
+#endif
+	alpha = compute_alpha_antialiasing_edge(alpha, alpha_texture_coordinate, alpha_antialiasing_edge);
+#endif // ALPHA_ANTIALIASING_EDGE_USED
+
+#ifdef USE_OPAQUE_PREPASS
+	if (alpha < opaque_prepass_threshold) {
+		discard;
+	}
+#endif // USE_OPAQUE_PREPASS
+
+#endif // !USE_SHADOW_TO_OPACITY
+
+#ifdef NORMAL_MAP_USED
+
+	normal_map.xy = normal_map.xy * 2.0 - 1.0;
+	normal_map.z = sqrt(max(0.0, 1.0 - dot(normal_map.xy, normal_map.xy))); //always ignore Z, as it can be RG packed, Z may be pos/neg, etc.
+
+	normal = normalize(mix(normal, tangent * normal_map.x + binormal * normal_map.y + normal * normal_map.z, normal_map_depth));
+
+#endif
+
+#ifdef LIGHT_ANISOTROPY_USED
+
+	if (anisotropy > 0.01) {
+		//rotation matrix
+		mat3 rot = mat3(tangent, binormal, normal);
+		//make local to space
+		tangent = normalize(rot * vec3(anisotropy_flow.x, anisotropy_flow.y, 0.0));
+		binormal = normalize(rot * vec3(-anisotropy_flow.y, anisotropy_flow.x, 0.0));
+	}
+
+#endif
+
+#ifdef ENABLE_CLIP_ALPHA
+	if (albedo.a < 0.99) {
+		//used for doublepass and shadowmapping
+		discard;
+	}
+#endif
+
+	/////////////////////// FOG //////////////////////
+#ifndef MODE_RENDER_DEPTH
+
+#ifndef CUSTOM_FOG_USED
+	// fog must be processed as early as possible and then packed.
+	// to maximize VGPR usage
+	// Draw "fixed" fog before volumetric fog to ensure volumetric fog can appear in front of the sky.
+
+	if (scene_data.fog_enabled) {
+		fog = fog_process(vertex);
+	}
+
+#endif //!CUSTOM_FOG_USED
+
+	uint fog_rg = packHalf2x16(fog.rg);
+	uint fog_ba = packHalf2x16(fog.ba);
+
+#endif //!MODE_RENDER_DEPTH
+
+	/////////////////////// DECALS ////////////////////////////////
+
+#ifndef MODE_RENDER_DEPTH
+
+	vec3 vertex_ddx = dFdx(vertex);
+	vec3 vertex_ddy = dFdy(vertex);
+
+	{ //Decals
+		// must implement
+
+		uint decal_indices = draw_call.decals.x;
+		for (uint i = 0; i < 8; i++) {
+			uint decal_index = decal_indices & 0xFF;
+			if (i == 4) {
+				decal_indices = draw_call.decals.y;
+			} else {
+				decal_indices = decal_indices >> 8;
+			}
+
+			if (decal_index == 0xFF) {
+				break;
+			}
+
+			vec3 uv_local = (decals.data[decal_index].xform * vec4(vertex, 1.0)).xyz;
+			if (any(lessThan(uv_local, vec3(0.0, -1.0, 0.0))) || any(greaterThan(uv_local, vec3(1.0)))) {
+				continue; //out of decal
+			}
+
+			//we need ddx/ddy for mipmaps, so simulate them
+			vec2 ddx = (decals.data[decal_index].xform * vec4(vertex_ddx, 0.0)).xz;
+			vec2 ddy = (decals.data[decal_index].xform * vec4(vertex_ddy, 0.0)).xz;
+
+			float fade = pow(1.0 - (uv_local.y > 0.0 ? uv_local.y : -uv_local.y), uv_local.y > 0.0 ? decals.data[decal_index].upper_fade : decals.data[decal_index].lower_fade);
+
+			if (decals.data[decal_index].normal_fade > 0.0) {
+				fade *= smoothstep(decals.data[decal_index].normal_fade, 1.0, dot(normal_interp, decals.data[decal_index].normal) * 0.5 + 0.5);
+			}
+
+			if (decals.data[decal_index].albedo_rect != vec4(0.0)) {
+				//has albedo
+				vec4 decal_albedo = textureGrad(sampler2D(decal_atlas_srgb, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uv_local.xz * decals.data[decal_index].albedo_rect.zw + decals.data[decal_index].albedo_rect.xy, ddx * decals.data[decal_index].albedo_rect.zw, ddy * decals.data[decal_index].albedo_rect.zw);
+				decal_albedo *= decals.data[decal_index].modulate;
+				decal_albedo.a *= fade;
+				albedo = mix(albedo, decal_albedo.rgb, decal_albedo.a * decals.data[decal_index].albedo_mix);
+
+				if (decals.data[decal_index].normal_rect != vec4(0.0)) {
+					vec3 decal_normal = textureGrad(sampler2D(decal_atlas, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uv_local.xz * decals.data[decal_index].normal_rect.zw + decals.data[decal_index].normal_rect.xy, ddx * decals.data[decal_index].normal_rect.zw, ddy * decals.data[decal_index].normal_rect.zw).xyz;
+					decal_normal.xy = decal_normal.xy * vec2(2.0, -2.0) - vec2(1.0, -1.0); //users prefer flipped y normal maps in most authoring software
+					decal_normal.z = sqrt(max(0.0, 1.0 - dot(decal_normal.xy, decal_normal.xy)));
+					//convert to view space, use xzy because y is up
+					decal_normal = (decals.data[decal_index].normal_xform * decal_normal.xzy).xyz;
+
+					normal = normalize(mix(normal, decal_normal, decal_albedo.a));
+				}
+
+				if (decals.data[decal_index].orm_rect != vec4(0.0)) {
+					vec3 decal_orm = textureGrad(sampler2D(decal_atlas, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uv_local.xz * decals.data[decal_index].orm_rect.zw + decals.data[decal_index].orm_rect.xy, ddx * decals.data[decal_index].orm_rect.zw, ddy * decals.data[decal_index].orm_rect.zw).xyz;
+					ao = mix(ao, decal_orm.r, decal_albedo.a);
+					roughness = mix(roughness, decal_orm.g, decal_albedo.a);
+					metallic = mix(metallic, decal_orm.b, decal_albedo.a);
+				}
+			}
+
+			if (decals.data[decal_index].emission_rect != vec4(0.0)) {
+				//emission is additive, so it's independent of albedo
+				emission += textureGrad(sampler2D(decal_atlas_srgb, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), uv_local.xz * decals.data[decal_index].emission_rect.zw + decals.data[decal_index].emission_rect.xy, ddx * decals.data[decal_index].emission_rect.zw, ddy * decals.data[decal_index].emission_rect.zw).xyz * decals.data[decal_index].emission_energy * fade;
+			}
+		}
+	} //Decals
+#endif //!MODE_RENDER_DEPTH
+
+	/////////////////////// LIGHTING //////////////////////////////
+
+#ifdef NORMAL_USED
+	if (scene_data.roughness_limiter_enabled) {
+		// Roughness limiter (geometric specular AA): http://www.jp.square-enix.com/tech/library/pdf/ImprovedGeometricSpecularAA.pdf
+		float roughness2 = roughness * roughness;
+		vec3 dndu = dFdx(normal), dndv = dFdy(normal); // normal derivatives along both screen axes (x and y)
+		float variance = scene_data.roughness_limiter_amount * (dot(dndu, dndu) + dot(dndv, dndv));
+		float kernelRoughness2 = min(2.0 * variance, scene_data.roughness_limiter_limit); //limit effect
+		float filteredRoughness2 = min(1.0, roughness2 + kernelRoughness2);
+		roughness = sqrt(filteredRoughness2); // back from squared roughness
+	}
+#endif // NORMAL_USED
+	//apply energy conservation
+
+	vec3 specular_light = vec3(0.0, 0.0, 0.0);
+	vec3 diffuse_light = vec3(0.0, 0.0, 0.0);
+	vec3 ambient_light = vec3(0.0, 0.0, 0.0);
+
+#if !defined(MODE_RENDER_DEPTH) && !defined(MODE_UNSHADED)
+
+	if (scene_data.use_reflection_cubemap) {
+		vec3 ref_vec = reflect(-view, normal);
+		ref_vec = scene_data.radiance_inverse_xform * ref_vec;
+#ifdef USE_RADIANCE_CUBEMAP_ARRAY
+
+		float lod, blend;
+		blend = modf(roughness * MAX_ROUGHNESS_LOD, lod);
+		specular_light = texture(samplerCubeArray(radiance_cubemap, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), vec4(ref_vec, lod)).rgb;
+		specular_light = mix(specular_light, texture(samplerCubeArray(radiance_cubemap, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), vec4(ref_vec, lod + 1)).rgb, blend);
+
+#else // USE_RADIANCE_CUBEMAP_ARRAY
+		specular_light = textureLod(samplerCube(radiance_cubemap, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), ref_vec, roughness * MAX_ROUGHNESS_LOD).rgb;
+
+#endif //USE_RADIANCE_CUBEMAP_ARRAY
+		specular_light *= scene_data.ambient_light_color_energy.a;
+	}
+
+#if defined(CUSTOM_RADIANCE_USED)
+	specular_light = mix(specular_light, custom_radiance.rgb, custom_radiance.a);
+#endif // CUSTOM_RADIANCE_USED
+
+#ifndef USE_LIGHTMAP
+	//lightmap overrides everything
+	if (scene_data.use_ambient_light) {
+		ambient_light = scene_data.ambient_light_color_energy.rgb;
+
+		if (scene_data.use_ambient_cubemap) {
+			vec3 ambient_dir = scene_data.radiance_inverse_xform * normal;
+#ifdef USE_RADIANCE_CUBEMAP_ARRAY
+			vec3 cubemap_ambient = texture(samplerCubeArray(radiance_cubemap, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), vec4(ambient_dir, MAX_ROUGHNESS_LOD)).rgb;
+#else
+			vec3 cubemap_ambient = textureLod(samplerCube(radiance_cubemap, material_samplers[SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP]), ambient_dir, MAX_ROUGHNESS_LOD).rgb;
+#endif //USE_RADIANCE_CUBEMAP_ARRAY
+
+			ambient_light = mix(ambient_light, cubemap_ambient * scene_data.ambient_light_color_energy.a, scene_data.ambient_color_sky_mix);
+		}
+	}
+#endif // !USE_LIGHTMAP
+
+#if defined(CUSTOM_IRRADIANCE_USED)
+	ambient_light = mix(ambient_light, custom_irradiance.rgb, custom_irradiance.a); // blend onto the ambient term, mirroring the custom_radiance blend onto specular_light
+#endif // CUSTOM_IRRADIANCE_USED
+
+#endif //!defined(MODE_RENDER_DEPTH) && !defined(MODE_UNSHADED)
+
+	//radiance
+
+#if !defined(MODE_RENDER_DEPTH) && !defined(MODE_UNSHADED)
+
+#ifdef USE_LIGHTMAP
+
+	//lightmap
+	if (bool(draw_call.flags & INSTANCE_FLAGS_USE_LIGHTMAP_CAPTURE)) { //has lightmap capture
+		uint index = draw_call.gi_offset;
+
+		vec3 wnormal = mat3(scene_data.camera_matrix) * normal;
+		const float c1 = 0.429043;
+		const float c2 = 0.511664;
+		const float c3 = 0.743125;
+		const float c4 = 0.886227;
+		const float c5 = 0.247708;
+		ambient_light += (c1 * lightmap_captures.data[index].sh[8].rgb * (wnormal.x * wnormal.x - wnormal.y * wnormal.y) +
+						  c3 * lightmap_captures.data[index].sh[6].rgb * wnormal.z * wnormal.z +
+						  c4 * lightmap_captures.data[index].sh[0].rgb -
+						  c5 * lightmap_captures.data[index].sh[6].rgb +
+						  2.0 * c1 * lightmap_captures.data[index].sh[4].rgb * wnormal.x * wnormal.y +
+						  2.0 * c1 * lightmap_captures.data[index].sh[7].rgb * wnormal.x * wnormal.z +
+						  2.0 * c1 * lightmap_captures.data[index].sh[5].rgb * wnormal.y * wnormal.z +
+						  2.0 * c2 * lightmap_captures.data[index].sh[3].rgb * wnormal.x +
+						  2.0 * c2 * lightmap_captures.data[index].sh[1].rgb * wnormal.y +
+						  2.0 * c2 * lightmap_captures.data[index].sh[2].rgb * wnormal.z);
+
+	} else if (bool(draw_call.flags & INSTANCE_FLAGS_USE_LIGHTMAP)) { // has actual lightmap
+		bool uses_sh = bool(draw_call.flags & INSTANCE_FLAGS_USE_SH_LIGHTMAP);
+		uint ofs = draw_call.gi_offset & 0xFFFF;
+		vec3 uvw;
+		uvw.xy = uv2 * draw_call.lightmap_uv_scale.zw + draw_call.lightmap_uv_scale.xy;
+		uvw.z = float((draw_call.gi_offset >> 16) & 0xFFFF);
+
+		if (uses_sh) {
+			uvw.z *= 4.0; //SH textures use 4 times more data
+			vec3 lm_light_l0 = textureLod(sampler2DArray(lightmap_textures[ofs], material_samplers[SAMPLER_LINEAR_CLAMP]), uvw + vec3(0.0, 0.0, 0.0), 0.0).rgb;
+			vec3 lm_light_l1n1 = textureLod(sampler2DArray(lightmap_textures[ofs], material_samplers[SAMPLER_LINEAR_CLAMP]), uvw + vec3(0.0, 0.0, 1.0), 0.0).rgb;
+			vec3 lm_light_l1_0 = textureLod(sampler2DArray(lightmap_textures[ofs], material_samplers[SAMPLER_LINEAR_CLAMP]), uvw + vec3(0.0, 0.0, 2.0), 0.0).rgb;
+			vec3 lm_light_l1p1 = textureLod(sampler2DArray(lightmap_textures[ofs], material_samplers[SAMPLER_LINEAR_CLAMP]), uvw + vec3(0.0, 0.0, 3.0), 0.0).rgb;
+
+			uint idx = draw_call.gi_offset >> 20;
+			vec3 n = normalize(lightmaps.data[idx].normal_xform * normal);
+
+			ambient_light += lm_light_l0 * 0.282095f;
+			ambient_light += lm_light_l1n1 * 0.32573 * n.y;
+			ambient_light += lm_light_l1_0 * 0.32573 * n.z;
+			ambient_light += lm_light_l1p1 * 0.32573 * n.x;
+			if (metallic > 0.01) { // since the more direct bounced light is lost, we can kind of fake it with this trick
+				vec3 r = reflect(normalize(-vertex), normal);
+				specular_light += lm_light_l1n1 * 0.32573 * r.y;
+				specular_light += lm_light_l1_0 * 0.32573 * r.z;
+				specular_light += lm_light_l1p1 * 0.32573 * r.x;
+			}
+
+		} else {
+			ambient_light += textureLod(sampler2DArray(lightmap_textures[ofs], material_samplers[SAMPLER_LINEAR_CLAMP]), uvw, 0.0).rgb;
+		}
+	}
+
+	// No GI nor non low end mode...
+
+#endif // USE_LIGHTMAP
+
+	// skipping ssao, do we remove ssao totally?
+
+	{ //Reflection probes
+		vec4 reflection_accum = vec4(0.0, 0.0, 0.0, 0.0);
+		vec4 ambient_accum = vec4(0.0, 0.0, 0.0, 0.0);
+		// Up to 8 probe indices, one byte each: four packed in .x, then four in .y. 0xFF ends the list.
+		uint reflection_indices = draw_call.reflection_probes.x;
+		for (uint i = 0; i < 8; i++) {
+			uint reflection_index = reflection_indices & 0xFF;
+			if (i == 3) { // last byte of .x was just read, continue with .y
+				reflection_indices = draw_call.reflection_probes.y;
+			} else {
+				reflection_indices = reflection_indices >> 8;
+			}
+
+			if (reflection_index == 0xFF) {
+				break;
+			}
+
+			reflection_process(reflection_index, vertex, normal, roughness, ambient_light, specular_light, ambient_accum, reflection_accum);
+		}
+		// NOTE(review): ambient_accum is passed to reflection_process() but never applied in this block - confirm intended.
+		if (reflection_accum.a > 0.0) {
+			specular_light = reflection_accum.rgb / reflection_accum.a;
+		}
+	} //Reflection probes
+
+	// finalize ambient light here
+	ambient_light *= albedo.rgb;
+	ambient_light *= ao;
+
+	// convert ao to direct light ao
+	ao = mix(1.0, ao, ao_light_affect);
+
+	//this saves some VGPRs
+	vec3 f0 = F0(metallic, specular, albedo);
+
+	{
+#if defined(DIFFUSE_TOON)
+		//simplify for toon, as
+		specular_light *= specular * metallic * albedo * 2.0;
+#else
+
+		// scales the specular reflections, needs to be computed before lighting happens,
+		// but after environment, GI, and reflection probes are added
+		// Environment brdf approximation (Lazarov 2013)
+		// see https://www.unrealengine.com/en-US/blog/physically-based-shading-on-mobile
+		const vec4 c0 = vec4(-1.0, -0.0275, -0.572, 0.022);
+		const vec4 c1 = vec4(1.0, 0.0425, 1.04, -0.04);
+		vec4 r = roughness * c0 + c1;
+		float ndotv = clamp(dot(normal, view), 0.0, 1.0);
+		float a004 = min(r.x * r.x, exp2(-9.28 * ndotv)) * r.x + r.y;
+		vec2 env = vec2(-1.04, 1.04) * a004 + r.zw;
+
+		specular_light *= env.x * f0 + env.y;
+#endif
+	}
+
+#endif // !defined(MODE_RENDER_DEPTH) && !defined(MODE_UNSHADED)
+
+#if !defined(MODE_RENDER_DEPTH)
+	//this saves some VGPRs
+	uint orms = packUnorm4x8(vec4(ao, roughness, metallic, specular));
+#endif
+
+// LIGHTING
+#if !defined(MODE_RENDER_DEPTH) && !defined(MODE_UNSHADED)
+
+	{ //directional light
+
+		// Do shadow and lighting in two passes to reduce register pressure
+		uint shadow0 = 0;
+		uint shadow1 = 0;
+
+		for (uint i = 0; i < 8; i++) {
+			if (i >= scene_data.directional_light_count) {
+				break;
+			}
+
+			if (!bool(directional_lights.data[i].mask & draw_call.layer_mask)) {
+				continue; //not masked
+			}
+
+			float shadow = 1.0;
+
+			// Directional light shadow code is basically the same as forward clustered at this point in time minus `LIGHT_TRANSMITTANCE_USED` support.
+			// Not sure if there is a reason to change this seeing directional lights are part of our global data
+			// Should think about whether we may want to move this code into an include file or function??
+
+#ifdef USE_SOFT_SHADOWS
+			//version with soft shadows, more expensive
+			if (directional_lights.data[i].shadow_enabled) {
+				float depth_z = -vertex.z;
+
+				vec4 pssm_coord;
+				vec3 shadow_color = vec3(0.0);
+				vec3 light_dir = directional_lights.data[i].direction;
+
+#define BIAS_FUNC(m_var, m_idx)                                                                                                                                       \
+	m_var.xyz += light_dir * directional_lights.data[i].shadow_bias[m_idx];                                                                                           \
+	vec3 normal_bias = normalize(normal_interp) * (1.0 - max(0.0, dot(light_dir, -normalize(normal_interp)))) * directional_lights.data[i].shadow_normal_bias[m_idx]; \
+	normal_bias -= light_dir * dot(light_dir, normal_bias);                                                                                                           \
+	m_var.xyz += normal_bias;
+
+				if (depth_z < directional_lights.data[i].shadow_split_offsets.x) {
+					vec4 v = vec4(vertex, 1.0);
+
+					BIAS_FUNC(v, 0)
+
+					pssm_coord = (directional_lights.data[i].shadow_matrix1 * v);
+					pssm_coord /= pssm_coord.w;
+
+					if (directional_lights.data[i].softshadow_angle > 0) {
+						float range_pos = dot(directional_lights.data[i].direction, v.xyz);
+						float range_begin = directional_lights.data[i].shadow_range_begin.x;
+						float test_radius = (range_pos - range_begin) * directional_lights.data[i].softshadow_angle;
+						vec2 tex_scale = directional_lights.data[i].uv_scale1 * test_radius;
+						shadow = sample_directional_soft_shadow(directional_shadow_atlas, pssm_coord.xyz, tex_scale * directional_lights.data[i].soft_shadow_scale);
+					} else {
+						shadow = sample_directional_pcf_shadow(directional_shadow_atlas, scene_data.directional_shadow_pixel_size * directional_lights.data[i].soft_shadow_scale, pssm_coord);
+					}
+
+					shadow_color = directional_lights.data[i].shadow_color1.rgb;
+
+				} else if (depth_z < directional_lights.data[i].shadow_split_offsets.y) {
+					vec4 v = vec4(vertex, 1.0);
+
+					BIAS_FUNC(v, 1)
+
+					pssm_coord = (directional_lights.data[i].shadow_matrix2 * v);
+					pssm_coord /= pssm_coord.w;
+
+					if (directional_lights.data[i].softshadow_angle > 0) {
+						float range_pos = dot(directional_lights.data[i].direction, v.xyz);
+						float range_begin = directional_lights.data[i].shadow_range_begin.y;
+						float test_radius = (range_pos - range_begin) * directional_lights.data[i].softshadow_angle;
+						vec2 tex_scale = directional_lights.data[i].uv_scale2 * test_radius;
+						shadow = sample_directional_soft_shadow(directional_shadow_atlas, pssm_coord.xyz, tex_scale * directional_lights.data[i].soft_shadow_scale);
+					} else {
+						shadow = sample_directional_pcf_shadow(directional_shadow_atlas, scene_data.directional_shadow_pixel_size * directional_lights.data[i].soft_shadow_scale, pssm_coord);
+					}
+
+					shadow_color = directional_lights.data[i].shadow_color2.rgb;
+				} else if (depth_z < directional_lights.data[i].shadow_split_offsets.z) {
+					vec4 v = vec4(vertex, 1.0);
+
+					BIAS_FUNC(v, 2)
+
+					pssm_coord = (directional_lights.data[i].shadow_matrix3 * v);
+					pssm_coord /= pssm_coord.w;
+
+					if (directional_lights.data[i].softshadow_angle > 0) {
+						float range_pos = dot(directional_lights.data[i].direction, v.xyz);
+						float range_begin = directional_lights.data[i].shadow_range_begin.z;
+						float test_radius = (range_pos - range_begin) * directional_lights.data[i].softshadow_angle;
+						vec2 tex_scale = directional_lights.data[i].uv_scale3 * test_radius;
+						shadow = sample_directional_soft_shadow(directional_shadow_atlas, pssm_coord.xyz, tex_scale * directional_lights.data[i].soft_shadow_scale);
+					} else {
+						shadow = sample_directional_pcf_shadow(directional_shadow_atlas, scene_data.directional_shadow_pixel_size * directional_lights.data[i].soft_shadow_scale, pssm_coord);
+					}
+
+					shadow_color = directional_lights.data[i].shadow_color3.rgb;
+
+				} else {
+					vec4 v = vec4(vertex, 1.0);
+
+					BIAS_FUNC(v, 3)
+
+					pssm_coord = (directional_lights.data[i].shadow_matrix4 * v);
+					pssm_coord /= pssm_coord.w;
+
+					if (directional_lights.data[i].softshadow_angle > 0) {
+						float range_pos = dot(directional_lights.data[i].direction, v.xyz);
+						float range_begin = directional_lights.data[i].shadow_range_begin.w;
+						float test_radius = (range_pos - range_begin) * directional_lights.data[i].softshadow_angle;
+						vec2 tex_scale = directional_lights.data[i].uv_scale4 * test_radius;
+						shadow = sample_directional_soft_shadow(directional_shadow_atlas, pssm_coord.xyz, tex_scale * directional_lights.data[i].soft_shadow_scale);
+					} else {
+						shadow = sample_directional_pcf_shadow(directional_shadow_atlas, scene_data.directional_shadow_pixel_size * directional_lights.data[i].soft_shadow_scale, pssm_coord);
+					}
+
+					shadow_color = directional_lights.data[i].shadow_color4.rgb;
+				}
+
+				if (directional_lights.data[i].blend_splits) {
+					vec3 shadow_color_blend = vec3(0.0);
+					float pssm_blend;
+					float shadow2;
+
+					if (depth_z < directional_lights.data[i].shadow_split_offsets.x) {
+						vec4 v = vec4(vertex, 1.0);
+						BIAS_FUNC(v, 1)
+						pssm_coord = (directional_lights.data[i].shadow_matrix2 * v);
+						pssm_coord /= pssm_coord.w;
+
+						if (directional_lights.data[i].softshadow_angle > 0) {
+							float range_pos = dot(directional_lights.data[i].direction, v.xyz);
+							float range_begin = directional_lights.data[i].shadow_range_begin.y;
+							float test_radius = (range_pos - range_begin) * directional_lights.data[i].softshadow_angle;
+							vec2 tex_scale = directional_lights.data[i].uv_scale2 * test_radius;
+							shadow2 = sample_directional_soft_shadow(directional_shadow_atlas, pssm_coord.xyz, tex_scale * directional_lights.data[i].soft_shadow_scale);
+						} else {
+							shadow2 = sample_directional_pcf_shadow(directional_shadow_atlas, scene_data.directional_shadow_pixel_size * directional_lights.data[i].soft_shadow_scale, pssm_coord);
+						}
+
+						pssm_blend = smoothstep(0.0, directional_lights.data[i].shadow_split_offsets.x, depth_z);
+						shadow_color_blend = directional_lights.data[i].shadow_color2.rgb;
+					} else if (depth_z < directional_lights.data[i].shadow_split_offsets.y) {
+						vec4 v = vec4(vertex, 1.0);
+						BIAS_FUNC(v, 2)
+						pssm_coord = (directional_lights.data[i].shadow_matrix3 * v);
+						pssm_coord /= pssm_coord.w;
+
+						if (directional_lights.data[i].softshadow_angle > 0) {
+							float range_pos = dot(directional_lights.data[i].direction, v.xyz);
+							float range_begin = directional_lights.data[i].shadow_range_begin.z;
+							float test_radius = (range_pos - range_begin) * directional_lights.data[i].softshadow_angle;
+							vec2 tex_scale = directional_lights.data[i].uv_scale3 * test_radius;
+							shadow2 = sample_directional_soft_shadow(directional_shadow_atlas, pssm_coord.xyz, tex_scale * directional_lights.data[i].soft_shadow_scale);
+						} else {
+							shadow2 = sample_directional_pcf_shadow(directional_shadow_atlas, scene_data.directional_shadow_pixel_size * directional_lights.data[i].soft_shadow_scale, pssm_coord);
+						}
+
+						pssm_blend = smoothstep(directional_lights.data[i].shadow_split_offsets.x, directional_lights.data[i].shadow_split_offsets.y, depth_z);
+
+						shadow_color_blend = directional_lights.data[i].shadow_color3.rgb;
+					} else if (depth_z < directional_lights.data[i].shadow_split_offsets.z) {
+						vec4 v = vec4(vertex, 1.0);
+						BIAS_FUNC(v, 3)
+						pssm_coord = (directional_lights.data[i].shadow_matrix4 * v);
+						pssm_coord /= pssm_coord.w;
+						if (directional_lights.data[i].softshadow_angle > 0) {
+							float range_pos = dot(directional_lights.data[i].direction, v.xyz);
+							float range_begin = directional_lights.data[i].shadow_range_begin.w;
+							float test_radius = (range_pos - range_begin) * directional_lights.data[i].softshadow_angle;
+							vec2 tex_scale = directional_lights.data[i].uv_scale4 * test_radius;
+							shadow2 = sample_directional_soft_shadow(directional_shadow_atlas, pssm_coord.xyz, tex_scale * directional_lights.data[i].soft_shadow_scale);
+						} else {
+							shadow2 = sample_directional_pcf_shadow(directional_shadow_atlas, scene_data.directional_shadow_pixel_size * directional_lights.data[i].soft_shadow_scale, pssm_coord);
+						}
+
+						pssm_blend = smoothstep(directional_lights.data[i].shadow_split_offsets.y, directional_lights.data[i].shadow_split_offsets.z, depth_z);
+						shadow_color_blend = directional_lights.data[i].shadow_color4.rgb;
+					} else {
+						pssm_blend = 0.0; //if no blend, same coord will be used (divide by z will result in same value, and already cached)
+					}
+
+					pssm_blend = sqrt(pssm_blend);
+
+					shadow = mix(shadow, shadow2, pssm_blend);
+					shadow_color = mix(shadow_color, shadow_color_blend, pssm_blend);
+				}
+
+				shadow = mix(shadow, 1.0, smoothstep(directional_lights.data[i].fade_from, directional_lights.data[i].fade_to, vertex.z)); //done with negative values for performance
+
+#undef BIAS_FUNC
+			}
+#else
+			// Soft shadow disabled version
+
+			if (directional_lights.data[i].shadow_enabled) {
+				float depth_z = -vertex.z;
+
+				vec4 pssm_coord;
+				vec3 light_dir = directional_lights.data[i].direction;
+				vec3 base_normal_bias = normalize(normal_interp) * (1.0 - max(0.0, dot(light_dir, -normalize(normal_interp))));
+
+#define BIAS_FUNC(m_var, m_idx)                                                                 \
+	m_var.xyz += light_dir * directional_lights.data[i].shadow_bias[m_idx];                     \
+	vec3 normal_bias = base_normal_bias * directional_lights.data[i].shadow_normal_bias[m_idx]; \
+	normal_bias -= light_dir * dot(light_dir, normal_bias);                                     \
+	m_var.xyz += normal_bias;
+
+				if (depth_z < directional_lights.data[i].shadow_split_offsets.x) {
+					vec4 v = vec4(vertex, 1.0);
+
+					BIAS_FUNC(v, 0)
+
+					pssm_coord = (directional_lights.data[i].shadow_matrix1 * v);
+				} else if (depth_z < directional_lights.data[i].shadow_split_offsets.y) {
+					vec4 v = vec4(vertex, 1.0);
+
+					BIAS_FUNC(v, 1)
+
+					pssm_coord = (directional_lights.data[i].shadow_matrix2 * v);
+				} else if (depth_z < directional_lights.data[i].shadow_split_offsets.z) {
+					vec4 v = vec4(vertex, 1.0);
+
+					BIAS_FUNC(v, 2)
+
+					pssm_coord = (directional_lights.data[i].shadow_matrix3 * v);
+
+				} else {
+					vec4 v = vec4(vertex, 1.0);
+
+					BIAS_FUNC(v, 3)
+
+					pssm_coord = (directional_lights.data[i].shadow_matrix4 * v);
+				}
+
+				pssm_coord /= pssm_coord.w;
+
+				shadow = sample_directional_pcf_shadow(directional_shadow_atlas, scene_data.directional_shadow_pixel_size * directional_lights.data[i].soft_shadow_scale, pssm_coord);
+
+				if (directional_lights.data[i].blend_splits) {
+					float pssm_blend;
+
+					if (depth_z < directional_lights.data[i].shadow_split_offsets.x) {
+						vec4 v = vec4(vertex, 1.0);
+						BIAS_FUNC(v, 1)
+						pssm_coord = (directional_lights.data[i].shadow_matrix2 * v);
+						pssm_blend = smoothstep(0.0, directional_lights.data[i].shadow_split_offsets.x, depth_z);
+					} else if (depth_z < directional_lights.data[i].shadow_split_offsets.y) {
+						vec4 v = vec4(vertex, 1.0);
+						BIAS_FUNC(v, 2)
+						pssm_coord = (directional_lights.data[i].shadow_matrix3 * v);
+						pssm_blend = smoothstep(directional_lights.data[i].shadow_split_offsets.x, directional_lights.data[i].shadow_split_offsets.y, depth_z);
+					} else if (depth_z < directional_lights.data[i].shadow_split_offsets.z) {
+						vec4 v = vec4(vertex, 1.0);
+						BIAS_FUNC(v, 3)
+						pssm_coord = (directional_lights.data[i].shadow_matrix4 * v);
+						pssm_blend = smoothstep(directional_lights.data[i].shadow_split_offsets.y, directional_lights.data[i].shadow_split_offsets.z, depth_z);
+					} else {
+						pssm_blend = 0.0; //if no blend, same coord will be used (divide by z will result in same value, and already cached)
+					}
+
+					pssm_coord /= pssm_coord.w;
+
+					float shadow2 = sample_directional_pcf_shadow(directional_shadow_atlas, scene_data.directional_shadow_pixel_size * directional_lights.data[i].soft_shadow_scale, pssm_coord);
+					shadow = mix(shadow, shadow2, pssm_blend);
+				}
+
+				shadow = mix(shadow, 1.0, smoothstep(directional_lights.data[i].fade_from, directional_lights.data[i].fade_to, vertex.z)); //done with negative values for performance
+
+#undef BIAS_FUNC
+			}
+#endif
+
+			if (i < 4) {
+				shadow0 |= uint(clamp(shadow * 255.0, 0.0, 255.0)) << (i * 8);
+			} else {
+				shadow1 |= uint(clamp(shadow * 255.0, 0.0, 255.0)) << ((i - 4) * 8);
+			}
+		}
+
+		for (uint i = 0; i < 8; i++) {
+			if (i >= scene_data.directional_light_count) {
+				break;
+			}
+
+			if (!bool(directional_lights.data[i].mask & draw_call.layer_mask)) {
+				continue; //light mask shares no bit with this draw's layer_mask
+			}
+
+			// We're not doing light transmittance
+
+			float shadow = 1.0;
+			// Unpack the 8-bit shadow value that the shadow pass stored for light i (lights 0-3 in shadow0, 4-7 in shadow1).
+			if (i < 4) {
+				shadow = float(shadow0 >> (i * 8) & 0xFF) / 255.0;
+			} else {
+				shadow = float(shadow1 >> ((i - 4) * 8) & 0xFF) / 255.0;
+			}
+
+			shadow = blur_shadow(shadow); // keep the returned value, as the omni and spot paths do
+
+			light_compute(normal, directional_lights.data[i].direction, normalize(view), directional_lights.data[i].color * directional_lights.data[i].energy, shadow, f0, orms, 1.0,
+#ifdef LIGHT_BACKLIGHT_USED
+					backlight,
+#endif
+/* not supported here
+#ifdef LIGHT_TRANSMITTANCE_USED
+					transmittance_color,
+					transmittance_depth,
+					transmittance_curve,
+					transmittance_boost,
+					transmittance_z,
+#endif
+*/
+#ifdef LIGHT_RIM_USED
+					rim, rim_tint, albedo,
+#endif
+#ifdef LIGHT_CLEARCOAT_USED
+					clearcoat, clearcoat_gloss,
+#endif
+#ifdef LIGHT_ANISOTROPY_USED
+					binormal, tangent, anisotropy, // NOTE(review): omni/spot calls pass tangent before binormal - confirm against light_compute()
+#endif
+#ifdef USE_SOFT_SHADOW // NOTE(review): the shadow pass tests USE_SOFT_SHADOWS (plural) - confirm which spelling light_compute() expects
+					directional_lights.data[i].size,
+#endif
+#ifdef USE_SHADOW_TO_OPACITY
+					alpha,
+#endif
+					diffuse_light,
+					specular_light);
+		}
+	} //directional light
+
+	{ //omni lights
+		uint light_indices = draw_call.omni_lights.x; // up to 8 light indices, one byte each (.x then .y); 0xFF ends the list
+		for (uint i = 0; i < 8; i++) {
+			uint light_index = light_indices & 0xFF;
+			if (i == 3) { // last byte of .x was just read, continue with .y
+				light_indices = draw_call.omni_lights.y;
+			} else {
+				light_indices = light_indices >> 8;
+			}
+
+			if (light_index == 0xFF) {
+				break;
+			}
+
+			float shadow = light_process_omni_shadow(light_index, vertex, view);
+
+			shadow = blur_shadow(shadow);
+
+			light_process_omni(light_index, vertex, view, normal, vertex_ddx, vertex_ddy, f0, orms, shadow,
+#ifdef LIGHT_BACKLIGHT_USED
+					backlight,
+#endif
+/*
+#ifdef LIGHT_TRANSMITTANCE_USED
+					transmittance_color,
+					transmittance_depth,
+					transmittance_curve,
+					transmittance_boost,
+#endif
+*/
+#ifdef LIGHT_RIM_USED
+					rim,
+					rim_tint,
+					albedo,
+#endif
+#ifdef LIGHT_CLEARCOAT_USED
+					clearcoat, clearcoat_gloss,
+#endif
+#ifdef LIGHT_ANISOTROPY_USED
+					tangent, binormal, anisotropy,
+#endif
+#ifdef USE_SHADOW_TO_OPACITY
+					alpha,
+#endif
+					diffuse_light, specular_light);
+		}
+	} //omni lights
+
+	{ //spot lights
+		// Up to 8 light indices, one byte each: four packed in .x, then four in .y. 0xFF ends the list.
+		uint light_indices = draw_call.spot_lights.x;
+		for (uint i = 0; i < 8; i++) {
+			uint light_index = light_indices & 0xFF;
+			if (i == 3) { // last byte of .x was just read, continue with .y
+				light_indices = draw_call.spot_lights.y;
+			} else {
+				light_indices = light_indices >> 8;
+			}
+
+			if (light_index == 0xFF) {
+				break;
+			}
+
+			float shadow = light_process_spot_shadow(light_index, vertex, view);
+
+			shadow = blur_shadow(shadow);
+
+			light_process_spot(light_index, vertex, view, normal, vertex_ddx, vertex_ddy, f0, orms, shadow,
+#ifdef LIGHT_BACKLIGHT_USED
+					backlight,
+#endif
+/*
+#ifdef LIGHT_TRANSMITTANCE_USED
+					transmittance_color,
+					transmittance_depth,
+					transmittance_curve,
+					transmittance_boost,
+#endif
+*/
+#ifdef LIGHT_RIM_USED
+					rim,
+					rim_tint,
+					albedo,
+#endif
+#ifdef LIGHT_CLEARCOAT_USED
+					clearcoat, clearcoat_gloss,
+#endif
+#ifdef LIGHT_ANISOTROPY_USED
+					tangent, binormal, anisotropy,
+#endif
+#ifdef USE_SHADOW_TO_OPACITY
+					alpha,
+#endif
+					diffuse_light, specular_light);
+		}
+	} //spot lights
+
+#ifdef USE_SHADOW_TO_OPACITY
+	alpha = min(alpha, clamp(length(ambient_light), 0.0, 1.0));
+
+#if defined(ALPHA_SCISSOR_USED)
+	if (alpha < alpha_scissor) {
+		discard;
+	}
+#endif // ALPHA_SCISSOR_USED
+
+#ifdef USE_OPAQUE_PREPASS
+
+	if (alpha < opaque_prepass_threshold) {
+		discard;
+	}
+
+#endif // USE_OPAQUE_PREPASS
+
+#endif // USE_SHADOW_TO_OPACITY
+
+#endif //!defined(MODE_RENDER_DEPTH) && !defined(MODE_UNSHADED)
+
+#ifdef MODE_RENDER_DEPTH
+
+#ifdef MODE_RENDER_MATERIAL
+
+	albedo_output_buffer.rgb = albedo;
+	albedo_output_buffer.a = alpha;
+
+	normal_output_buffer.rgb = normal * 0.5 + 0.5;
+	normal_output_buffer.a = 0.0;
+	depth_output_buffer.r = -vertex.z;
+
+	orm_output_buffer.r = ao;
+	orm_output_buffer.g = roughness;
+	orm_output_buffer.b = metallic;
+	orm_output_buffer.a = sss_strength;
+
+	emission_output_buffer.rgb = emission;
+	emission_output_buffer.a = 0.0;
+#endif // MODE_RENDER_MATERIAL
+
+#else // MODE_RENDER_DEPTH
+
+	// multiply by albedo
+	diffuse_light *= albedo; // ambient must be multiplied by albedo at the end
+
+	// apply direct light AO
+	ao = unpackUnorm4x8(orms).x;
+	specular_light *= ao;
+	diffuse_light *= ao;
+
+	// apply metallic
+	metallic = unpackUnorm4x8(orms).z;
+	diffuse_light *= 1.0 - metallic;
+	ambient_light *= 1.0 - metallic;
+
+	//restore fog
+	fog = vec4(unpackHalf2x16(fog_rg), unpackHalf2x16(fog_ba));
+
+#ifdef MODE_MULTIPLE_RENDER_TARGETS
+
+#ifdef MODE_UNSHADED
+	diffuse_buffer = vec4(albedo.rgb, 0.0);
+	specular_buffer = vec4(0.0);
+
+#else // MODE_UNSHADED
+
+#ifdef SSS_MODE_SKIN
+	sss_strength = -sss_strength;
+#endif // SSS_MODE_SKIN
+	diffuse_buffer = vec4(emission + diffuse_light + ambient_light, sss_strength);
+	specular_buffer = vec4(specular_light, metallic);
+#endif // MODE_UNSHADED
+
+	diffuse_buffer.rgb = mix(diffuse_buffer.rgb, fog.rgb, fog.a);
+	specular_buffer.rgb = mix(specular_buffer.rgb, vec3(0.0), fog.a);
+
+#else //MODE_MULTIPLE_RENDER_TARGETS
+
+#ifdef MODE_UNSHADED
+	frag_color = vec4(albedo, alpha);
+#else // MODE_UNSHADED
+	frag_color = vec4(emission + ambient_light + diffuse_light + specular_light, alpha);
+	//frag_color = vec4(1.0);
+#endif // MODE_UNSHADED
+
+	// Draw "fixed" fog before volumetric fog to ensure volumetric fog can appear in front of the sky.
+	frag_color.rgb = mix(frag_color.rgb, fog.rgb, fog.a);
+
+#endif //MODE_MULTIPLE_RENDER_TARGETS
+
+#endif //MODE_RENDER_DEPTH
+}
diff --git a/servers/rendering/renderer_rd/shaders/scene_forward_mobile_inc.glsl b/servers/rendering/renderer_rd/shaders/scene_forward_mobile_inc.glsl
new file mode 100644
index 0000000000..0156b58574
--- /dev/null
+++ b/servers/rendering/renderer_rd/shaders/scene_forward_mobile_inc.glsl
@@ -0,0 +1,220 @@
+#define M_PI 3.14159265359
+
+#include "decal_data_inc.glsl"
+
+#if !defined(MODE_RENDER_DEPTH) || defined(MODE_RENDER_MATERIAL) || defined(TANGENT_USED) || defined(NORMAL_MAP_USED)
+#ifndef NORMAL_USED
+#define NORMAL_USED
+#endif
+#endif
+
+/* Per-draw push constant block: don't exceed 128 bytes!! */
+/* Instance data goes directly into the push constant, not into an array. */
+layout(push_constant, binding = 0, std430) uniform DrawCall {
+	mat4 transform; // 64 bytes - running total 64
+	uint flags; // 04 bytes - running total 68
+	uint instance_uniforms_ofs; // base offset in global buffer for instance variables // 04 bytes - running total 72
+	uint gi_offset; // GI information when using lightmapping (VCT or lightmap index) // 04 bytes - running total 76
+	uint layer_mask; // 04 bytes - running total 80
+	vec4 lightmap_uv_scale; // 16 bytes - running total 96; doubles as uv_offset when needed
+
+	uvec2 reflection_probes; // 08 bytes - running total 104
+	uvec2 omni_lights; // 08 bytes - running total 112
+	uvec2 spot_lights; // 08 bytes - running total 120
+	uvec2 decals; // 08 bytes - running total 128
+}
+draw_call;
+
+/* Set 0: Base Pass (never changes) */
+
+#include "light_data_inc.glsl"
+
+#define SAMPLER_NEAREST_CLAMP 0
+#define SAMPLER_LINEAR_CLAMP 1
+#define SAMPLER_NEAREST_WITH_MIPMAPS_CLAMP 2
+#define SAMPLER_LINEAR_WITH_MIPMAPS_CLAMP 3
+#define SAMPLER_NEAREST_WITH_MIPMAPS_ANISOTROPIC_CLAMP 4
+#define SAMPLER_LINEAR_WITH_MIPMAPS_ANISOTROPIC_CLAMP 5
+#define SAMPLER_NEAREST_REPEAT 6
+#define SAMPLER_LINEAR_REPEAT 7
+#define SAMPLER_NEAREST_WITH_MIPMAPS_REPEAT 8
+#define SAMPLER_LINEAR_WITH_MIPMAPS_REPEAT 9
+#define SAMPLER_NEAREST_WITH_MIPMAPS_ANISOTROPIC_REPEAT 10
+#define SAMPLER_LINEAR_WITH_MIPMAPS_ANISOTROPIC_REPEAT 11
+
+layout(set = 0, binding = 1) uniform sampler material_samplers[12];
+
+layout(set = 0, binding = 2) uniform sampler shadow_sampler;
+
+#define INSTANCE_FLAGS_USE_GI_BUFFERS (1 << 6)
+#define INSTANCE_FLAGS_USE_SDFGI (1 << 7)
+#define INSTANCE_FLAGS_USE_LIGHTMAP_CAPTURE (1 << 8)
+#define INSTANCE_FLAGS_USE_LIGHTMAP (1 << 9)
+#define INSTANCE_FLAGS_USE_SH_LIGHTMAP (1 << 10)
+#define INSTANCE_FLAGS_USE_GIPROBE (1 << 11)
+#define INSTANCE_FLAGS_MULTIMESH (1 << 12)
+#define INSTANCE_FLAGS_MULTIMESH_FORMAT_2D (1 << 13)
+#define INSTANCE_FLAGS_MULTIMESH_HAS_COLOR (1 << 14)
+#define INSTANCE_FLAGS_MULTIMESH_HAS_CUSTOM_DATA (1 << 15)
+#define INSTANCE_FLAGS_PARTICLE_TRAIL_SHIFT 16
+// NOTE(review): stride field; the mask 0xFF covers 8 bits (this note previously said 3 bits) - confirm the intended width
+#define INSTANCE_FLAGS_PARTICLE_TRAIL_MASK 0xFF
+
+#define INSTANCE_FLAGS_NON_UNIFORM_SCALE (1 << 24)
+
+layout(set = 0, binding = 3, std430) restrict readonly buffer OmniLights {
+	LightData data[];
+}
+omni_lights;
+
+layout(set = 0, binding = 4, std430) restrict readonly buffer SpotLights {
+	LightData data[];
+}
+spot_lights;
+
+layout(set = 0, binding = 5, std430) restrict readonly buffer ReflectionProbeData {
+	ReflectionData data[];
+}
+reflections;
+
+layout(set = 0, binding = 6, std140) uniform DirectionalLights {
+	DirectionalLightData data[MAX_DIRECTIONAL_LIGHT_DATA_STRUCTS];
+}
+directional_lights;
+
+#define LIGHTMAP_FLAG_USE_DIRECTION 1
+#define LIGHTMAP_FLAG_USE_SPECULAR_DIRECTION 2
+
+struct Lightmap {
+	mat3 normal_xform;
+};
+
+layout(set = 0, binding = 7, std140) restrict readonly buffer Lightmaps {
+	Lightmap data[];
+}
+lightmaps;
+
+struct LightmapCapture {
+	vec4 sh[9];
+};
+
+layout(set = 0, binding = 8, std140) restrict readonly buffer LightmapCaptures {
+	LightmapCapture data[];
+}
+lightmap_captures;
+
+layout(set = 0, binding = 9) uniform texture2D decal_atlas;
+layout(set = 0, binding = 10) uniform texture2D decal_atlas_srgb;
+
+layout(set = 0, binding = 11, std430) restrict readonly buffer Decals {
+	DecalData data[];
+}
+decals;
+
+layout(set = 0, binding = 12, std430) restrict readonly buffer GlobalVariableData {
+	vec4 data[];
+}
+global_variables;
+
+/* Set 1: Render Pass (changes per render pass) */
+
+layout(set = 1, binding = 0, std140) uniform SceneData { // set 1: data that changes per render pass
+	mat4 projection_matrix;
+	mat4 inv_projection_matrix;
+
+	mat4 camera_matrix;
+	mat4 inv_camera_matrix;
+
+	vec2 viewport_size;
+	vec2 screen_pixel_size;
+
+	// Use vec4s because std140 doesn't play nice with vec2 arrays; z and w are wasted.
+	vec4 directional_penumbra_shadow_kernel[32];
+	vec4 directional_soft_shadow_kernel[32];
+	vec4 penumbra_shadow_kernel[32];
+	vec4 soft_shadow_kernel[32];
+
+	uint directional_penumbra_shadow_samples;
+	uint directional_soft_shadow_samples;
+	uint penumbra_shadow_samples;
+	uint soft_shadow_samples;
+
+	vec4 ambient_light_color_energy;
+
+	float ambient_color_sky_mix;
+	bool use_ambient_light;
+	bool use_ambient_cubemap;
+	bool use_reflection_cubemap;
+
+	mat3 radiance_inverse_xform;
+
+	vec2 shadow_atlas_pixel_size;
+	vec2 directional_shadow_pixel_size;
+
+	uint directional_light_count;
+	float dual_paraboloid_side;
+	float z_far;
+	float z_near;
+
+	bool ssao_enabled;
+	float ssao_light_affect;
+	float ssao_ao_affect;
+	bool roughness_limiter_enabled;
+
+	float roughness_limiter_amount;
+	float roughness_limiter_limit;
+	uvec2 roughness_limiter_pad;
+
+	vec4 ao_color;
+
+	bool fog_enabled;
+	float fog_density;
+	float fog_height;
+	float fog_height_density;
+
+	vec3 fog_light_color;
+	float fog_sun_scatter;
+
+	float fog_aerial_perspective;
+	bool material_uv2_mode;
+
+	float time;
+	float reflection_multiplier; // one normally, zero when rendering reflections
+
+	bool pancake_shadows;
+	uint pad1;
+	uint pad2;
+	uint pad3;
+}
+scene_data;
+
+#ifdef USE_RADIANCE_CUBEMAP_ARRAY
+
+layout(set = 1, binding = 2) uniform textureCubeArray radiance_cubemap;
+
+#else
+
+layout(set = 1, binding = 2) uniform textureCube radiance_cubemap;
+
+#endif
+
+layout(set = 1, binding = 3) uniform textureCubeArray reflection_atlas;
+
+layout(set = 1, binding = 4) uniform texture2D shadow_atlas;
+
+layout(set = 1, binding = 5) uniform texture2D directional_shadow_atlas;
+
+// TODO: change this to provide just the lightmap we're using.
+layout(set = 1, binding = 6) uniform texture2DArray lightmap_textures[MAX_LIGHTMAP_TEXTURES];
+
+layout(set = 1, binding = 9) uniform texture2D depth_buffer;
+layout(set = 1, binding = 10) uniform texture2D color_buffer;
+
+/* Set 2: Skeleton & Instancing (can change per item) */
+
+layout(set = 2, binding = 0, std430) restrict readonly buffer Transforms {
+	vec4 data[];
+}
+transforms;
+
+/* Set 3: User Material */
diff --git a/servers/rendering/renderer_rd/shaders/screen_space_reflection.glsl b/servers/rendering/renderer_rd/shaders/screen_space_reflection.glsl
index 06dc4b13de..78e0a85341 100644
--- a/servers/rendering/renderer_rd/shaders/screen_space_reflection.glsl
+++ b/servers/rendering/renderer_rd/shaders/screen_space_reflection.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/screen_space_reflection_filter.glsl b/servers/rendering/renderer_rd/shaders/screen_space_reflection_filter.glsl
index a5afe74cb2..62d1cffb0a 100644
--- a/servers/rendering/renderer_rd/shaders/screen_space_reflection_filter.glsl
+++ b/servers/rendering/renderer_rd/shaders/screen_space_reflection_filter.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/screen_space_reflection_scale.glsl b/servers/rendering/renderer_rd/shaders/screen_space_reflection_scale.glsl
index 218605a962..7e06516d90 100644
--- a/servers/rendering/renderer_rd/shaders/screen_space_reflection_scale.glsl
+++ b/servers/rendering/renderer_rd/shaders/screen_space_reflection_scale.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/sdfgi_debug.glsl b/servers/rendering/renderer_rd/shaders/sdfgi_debug.glsl
index e4c3f3a84b..8b58796962 100644
--- a/servers/rendering/renderer_rd/shaders/sdfgi_debug.glsl
+++ b/servers/rendering/renderer_rd/shaders/sdfgi_debug.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/sdfgi_debug_probes.glsl b/servers/rendering/renderer_rd/shaders/sdfgi_debug_probes.glsl
index 08da283dad..0eacbc5363 100644
--- a/servers/rendering/renderer_rd/shaders/sdfgi_debug_probes.glsl
+++ b/servers/rendering/renderer_rd/shaders/sdfgi_debug_probes.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #define MAX_CASCADES 8
 
@@ -153,7 +153,7 @@ void main() {
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(location = 0) out vec4 frag_color;
 
diff --git a/servers/rendering/renderer_rd/shaders/sdfgi_direct_light.glsl b/servers/rendering/renderer_rd/shaders/sdfgi_direct_light.glsl
index dc7238abed..99db35bb34 100644
--- a/servers/rendering/renderer_rd/shaders/sdfgi_direct_light.glsl
+++ b/servers/rendering/renderer_rd/shaders/sdfgi_direct_light.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/sdfgi_fields.glsl b/servers/rendering/renderer_rd/shaders/sdfgi_fields.glsl
deleted file mode 100644
index 69d8824d8a..0000000000
--- a/servers/rendering/renderer_rd/shaders/sdfgi_fields.glsl
+++ /dev/null
@@ -1,182 +0,0 @@
-/* clang-format off */
-[compute]
-
-#version 450
-
-VERSION_DEFINES
-
-layout(local_size_x = OCT_RES, local_size_y = OCT_RES, local_size_z = 1) in;
-
-/* clang-format on */
-
-#define MAX_CASCADES 8
-
-layout(rgba16f, set = 0, binding = 1) uniform restrict image2DArray irradiance_texture;
-layout(rg16f, set = 0, binding = 2) uniform restrict image2DArray depth_texture;
-
-layout(rgba32ui, set = 0, binding = 3) uniform restrict uimage2DArray irradiance_history_texture;
-layout(rg32ui, set = 0, binding = 4) uniform restrict uimage2DArray depth_history_texture;
-
-struct CascadeData {
-	vec3 offset; //offset of (0,0,0) in world coordinates
-	float to_cell; // 1/bounds * grid_size
-};
-
-layout(set = 0, binding = 5, std140) uniform Cascades {
-	CascadeData data[MAX_CASCADES];
-}
-cascades;
-
-#define DEPTH_HISTORY_BITS 24
-#define IRRADIANCE_HISTORY_BITS 16
-
-layout(push_constant, binding = 0, std430) uniform Params {
-	vec3 grid_size;
-	uint max_cascades;
-
-	uint probe_axis_size;
-	uint cascade;
-	uint history_size;
-	uint pad0;
-
-	ivec3 scroll; //scroll in probes
-	uint pad1;
-}
-params;
-
-void main() {
-	ivec2 local = ivec2(gl_LocalInvocationID.xy);
-	ivec2 probe = ivec2(gl_WorkGroupID.xy);
-
-	ivec3 probe_cell;
-	probe_cell.x = probe.x % int(params.probe_axis_size);
-	probe_cell.y = probe.y;
-	probe_cell.z = probe.x / int(params.probe_axis_size);
-
-#ifdef MODE_SCROLL_BEGIN
-
-	ivec3 read_cell = probe_cell - params.scroll;
-
-	uint src_layer = (params.history_size + 1) * params.cascade;
-	uint dst_layer = (params.history_size + 1) * params.max_cascades;
-
-	for (uint i = 0; i <= params.history_size; i++) {
-		ivec3 write_pos = ivec3(probe * OCT_RES + local, int(i));
-
-		if (any(lessThan(read_pos, ivec3(0))) || any(greaterThanEqual(read_pos, ivec3(params.probe_axis_size)))) {
-			// nowhere to read from for scrolling, try finding the value from upper probes
-
-#ifdef MODE_IRRADIANCE
-			imageStore(irradiance_history_texture, write_pos, uvec4(0));
-#endif
-#ifdef MODE_DEPTH
-			imageStore(depth_history_texture, write_pos, uvec4(0));
-#endif
-		} else {
-			ivec3 read_pos;
-			read_pos.xy = read_cell.xy;
-			read_pos.x += read_cell.z * params.probe_axis_size;
-			read_pos.xy = read_pos.xy * OCT_RES + local;
-			read_pos.z = int(i);
-
-#ifdef MODE_IRRADIANCE
-			uvec4 value = imageLoad(irradiance_history_texture, read_pos);
-			imageStore(irradiance_history_texture, write_pos, value);
-#endif
-#ifdef MODE_DEPTH
-			uvec2 value = imageLoad(depth_history_texture, read_pos);
-			imageStore(depth_history_texture, write_pos, value);
-#endif
-		}
-	}
-
-#endif // MODE_SCROLL_BEGIN
-
-#ifdef MODE_SCROLL_END
-
-	uint src_layer = (params.history_size + 1) * params.max_cascades;
-	uint dst_layer = (params.history_size + 1) * params.cascade;
-
-	for (uint i = 0; i <= params.history_size; i++) {
-		ivec3 pos = ivec3(probe * OCT_RES + local, int(i));
-
-#ifdef MODE_IRRADIANCE
-		uvec4 value = imageLoad(irradiance_history_texture, read_pos);
-		imageStore(irradiance_history_texture, write_pos, value);
-#endif
-#ifdef MODE_DEPTH
-		uvec2 value = imageLoad(depth_history_texture, read_pos);
-		imageStore(depth_history_texture, write_pos, value);
-#endif
-	}
-
-#endif //MODE_SCROLL_END
-
-#ifdef MODE_STORE
-
-	uint src_layer = (params.history_size + 1) * params.cascade + params.history_size;
-	ivec3 read_pos = ivec3(probe * OCT_RES + local, int(src_layer));
-
-	ivec3 write_pos = ivec3(probe * (OCT_RES + 2) + ivec2(1), int(params.cascade));
-
-	ivec3 copy_to[4] = ivec3[](write_pos, ivec3(-2, -2, -2), ivec3(-2, -2, -2), ivec3(-2, -2, -2));
-
-#ifdef MODE_IRRADIANCE
-	uvec4 average = imageLoad(irradiance_history_texture, read_pos);
-	vec4 light_accum = vec4(average / params.history_size) / float(1 << IRRADIANCE_HISTORY_BITS);
-
-#endif
-#ifdef MODE_DEPTH
-	uvec2 value = imageLoad(depth_history_texture, read_pos);
-	vec2 depth_accum = vec4(average / params.history_size) / float(1 << IRRADIANCE_HISTORY_BITS);
-
-	float probe_cell_size = float(params.grid_size / float(params.probe_axis_size - 1)) / cascades.data[params.cascade].to_cell;
-	float max_depth = length(params.grid_size / cascades.data[params.max_cascades - 1].to_cell);
-	max_depth /= probe_cell_size;
-
-	depth_value = (vec2(average / params.history_size) / float(1 << DEPTH_HISTORY_BITS)) * vec2(max_depth, max_depth * max_depth);
-
-#endif
-
-	/* Fill the border if required */
-
-	if (local == ivec2(0, 0)) {
-		copy_to[1] = texture_pos + ivec3(OCT_RES - 1, -1, 0);
-		copy_to[2] = texture_pos + ivec3(-1, OCT_RES - 1, 0);
-		copy_to[3] = texture_pos + ivec3(OCT_RES, OCT_RES, 0);
-	} else if (local == ivec2(OCT_RES - 1, 0)) {
-		copy_to[1] = texture_pos + ivec3(0, -1, 0);
-		copy_to[2] = texture_pos + ivec3(OCT_RES, OCT_RES - 1, 0);
-		copy_to[3] = texture_pos + ivec3(-1, OCT_RES, 0);
-	} else if (local == ivec2(0, OCT_RES - 1)) {
-		copy_to[1] = texture_pos + ivec3(-1, 0, 0);
-		copy_to[2] = texture_pos + ivec3(OCT_RES - 1, OCT_RES, 0);
-		copy_to[3] = texture_pos + ivec3(OCT_RES, -1, 0);
-	} else if (local == ivec2(OCT_RES - 1, OCT_RES - 1)) {
-		copy_to[1] = texture_pos + ivec3(0, OCT_RES, 0);
-		copy_to[2] = texture_pos + ivec3(OCT_RES, 0, 0);
-		copy_to[3] = texture_pos + ivec3(-1, -1, 0);
-	} else if (local.y == 0) {
-		copy_to[1] = texture_pos + ivec3(OCT_RES - local.x - 1, local.y - 1, 0);
-	} else if (local.x == 0) {
-		copy_to[1] = texture_pos + ivec3(local.x - 1, OCT_RES - local.y - 1, 0);
-	} else if (local.y == OCT_RES - 1) {
-		copy_to[1] = texture_pos + ivec3(OCT_RES - local.x - 1, local.y + 1, 0);
-	} else if (local.x == OCT_RES - 1) {
-		copy_to[1] = texture_pos + ivec3(local.x + 1, OCT_RES - local.y - 1, 0);
-	}
-
-	for (int i = 0; i < 4; i++) {
-		if (copy_to[i] == ivec3(-2, -2, -2)) {
-			continue;
-		}
-#ifdef MODE_IRRADIANCE
-		imageStore(irradiance_texture, copy_to[i], light_accum);
-#endif
-#ifdef MODE_DEPTH
-		imageStore(depth_texture, copy_to[i], vec4(depth_value, 0.0, 0.0));
-#endif
-	}
-
-#endif // MODE_STORE
-}
diff --git a/servers/rendering/renderer_rd/shaders/sdfgi_integrate.glsl b/servers/rendering/renderer_rd/shaders/sdfgi_integrate.glsl
index 007e4c113a..bc376e9522 100644
--- a/servers/rendering/renderer_rd/shaders/sdfgi_integrate.glsl
+++ b/servers/rendering/renderer_rd/shaders/sdfgi_integrate.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/sdfgi_preprocess.glsl b/servers/rendering/renderer_rd/shaders/sdfgi_preprocess.glsl
index 916c60ac89..aa4ded146f 100644
--- a/servers/rendering/renderer_rd/shaders/sdfgi_preprocess.glsl
+++ b/servers/rendering/renderer_rd/shaders/sdfgi_preprocess.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #ifdef MODE_JUMPFLOOD_OPTIMIZED
 #define GROUP_SIZE 8
diff --git a/servers/rendering/renderer_rd/shaders/skeleton.glsl b/servers/rendering/renderer_rd/shaders/skeleton.glsl
index 680d1045cd..669ffc961d 100644
--- a/servers/rendering/renderer_rd/shaders/skeleton.glsl
+++ b/servers/rendering/renderer_rd/shaders/skeleton.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/sky.glsl b/servers/rendering/renderer_rd/shaders/sky.glsl
index 6c985e1f5c..9924da37d5 100644
--- a/servers/rendering/renderer_rd/shaders/sky.glsl
+++ b/servers/rendering/renderer_rd/shaders/sky.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(location = 0) out vec2 uv_interp;
 
@@ -24,7 +24,7 @@ void main() {
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #define M_PI 3.14159265359
 
@@ -88,13 +88,9 @@ layout(set = 0, binding = 3, std140) uniform DirectionalLights {
 
 directional_lights;
 
-#ifdef USE_MATERIAL_UNIFORMS
+#ifdef MATERIAL_UNIFORMS_USED
 layout(set = 1, binding = 0, std140) uniform MaterialUniforms{
-	/* clang-format off */
-
-MATERIAL_UNIFORMS
-
-	/* clang-format on */
+#MATERIAL_UNIFORMS
 } material;
 #endif
 
@@ -127,11 +123,7 @@ layout(set = 3, binding = 0) uniform texture3D volumetric_fog_texture;
 #define AT_QUARTER_RES_PASS false
 #endif
 
-/* clang-format off */
-
-FRAGMENT_SHADER_GLOBALS
-
-/* clang-format on */
+#GLOBALS
 
 layout(location = 0) out vec4 frag_color;
 
@@ -202,22 +194,10 @@ void main() {
 #endif
 #endif
 
-// unused, just here to make our compiler happy, make sure we don't execute any light code the user adds in..
-#ifndef REALLYINCLUDETHIS
-	{
-		/* clang-format off */
-
-LIGHT_SHADER_CODE
-
-		/* clang-format on */
-	}
-#endif
 	{
-		/* clang-format off */
 
-FRAGMENT_SHADER_CODE
+#CODE : SKY
 
-		/* clang-format on */
 	}
 
 	frag_color.rgb = color * params.position_multiplier.w;
diff --git a/servers/rendering/renderer_rd/shaders/sort.glsl b/servers/rendering/renderer_rd/shaders/sort.glsl
index e5ebb9c64b..307e60dc21 100644
--- a/servers/rendering/renderer_rd/shaders/sort.glsl
+++ b/servers/rendering/renderer_rd/shaders/sort.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 // Original version here:
 // https://github.com/GPUOpen-LibrariesAndSDKs/GPUParticles11/blob/master/gpuparticles11/src/Shaders
diff --git a/servers/rendering/renderer_rd/shaders/specular_merge.glsl b/servers/rendering/renderer_rd/shaders/specular_merge.glsl
index 0b8f406213..3579c35cce 100644
--- a/servers/rendering/renderer_rd/shaders/specular_merge.glsl
+++ b/servers/rendering/renderer_rd/shaders/specular_merge.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(location = 0) out vec2 uv_interp;
 
@@ -17,7 +17,7 @@ void main() {
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(location = 0) in vec2 uv_interp;
 
diff --git a/servers/rendering/renderer_rd/shaders/ssao.glsl b/servers/rendering/renderer_rd/shaders/ssao.glsl
index 231f8f91ec..6e945edfcd 100644
--- a/servers/rendering/renderer_rd/shaders/ssao.glsl
+++ b/servers/rendering/renderer_rd/shaders/ssao.glsl
@@ -21,7 +21,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 #define SSAO_ADAPTIVE_TAP_BASE_COUNT 5
 
diff --git a/servers/rendering/renderer_rd/shaders/ssao_blur.glsl b/servers/rendering/renderer_rd/shaders/ssao_blur.glsl
index 510a777048..d9cd2b4e85 100644
--- a/servers/rendering/renderer_rd/shaders/ssao_blur.glsl
+++ b/servers/rendering/renderer_rd/shaders/ssao_blur.glsl
@@ -21,7 +21,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/ssao_downsample.glsl b/servers/rendering/renderer_rd/shaders/ssao_downsample.glsl
index cb2d31f70d..ee0db6a6f0 100644
--- a/servers/rendering/renderer_rd/shaders/ssao_downsample.glsl
+++ b/servers/rendering/renderer_rd/shaders/ssao_downsample.glsl
@@ -21,7 +21,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/ssao_importance_map.glsl b/servers/rendering/renderer_rd/shaders/ssao_importance_map.glsl
index 6aa7624261..687fe1e6e2 100644
--- a/servers/rendering/renderer_rd/shaders/ssao_importance_map.glsl
+++ b/servers/rendering/renderer_rd/shaders/ssao_importance_map.glsl
@@ -21,7 +21,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/ssao_interleave.glsl b/servers/rendering/renderer_rd/shaders/ssao_interleave.glsl
index 4fdf334aa5..0907423d5d 100644
--- a/servers/rendering/renderer_rd/shaders/ssao_interleave.glsl
+++ b/servers/rendering/renderer_rd/shaders/ssao_interleave.glsl
@@ -20,7 +20,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/subsurface_scattering.glsl b/servers/rendering/renderer_rd/shaders/subsurface_scattering.glsl
index 88a953562f..9367b641c2 100644
--- a/servers/rendering/renderer_rd/shaders/subsurface_scattering.glsl
+++ b/servers/rendering/renderer_rd/shaders/subsurface_scattering.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 
diff --git a/servers/rendering/renderer_rd/shaders/tonemap.glsl b/servers/rendering/renderer_rd/shaders/tonemap.glsl
index 7de91fd541..86b4da6b08 100644
--- a/servers/rendering/renderer_rd/shaders/tonemap.glsl
+++ b/servers/rendering/renderer_rd/shaders/tonemap.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(location = 0) out vec2 uv_interp;
 
@@ -16,7 +16,7 @@ void main() {
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 layout(location = 0) in vec2 uv_interp;
 
diff --git a/servers/rendering/renderer_rd/shaders/volumetric_fog.glsl b/servers/rendering/renderer_rd/shaders/volumetric_fog.glsl
index ce8a459b24..c793b6ebe1 100644
--- a/servers/rendering/renderer_rd/shaders/volumetric_fog.glsl
+++ b/servers/rendering/renderer_rd/shaders/volumetric_fog.glsl
@@ -2,7 +2,7 @@
 
 #version 450
 
-VERSION_DEFINES
+#VERSION_DEFINES
 
 /* Do not use subgroups here, seems there is not much advantage and causes glitches
 #if defined(has_GL_KHR_shader_subgroup_ballot) && defined(has_GL_KHR_shader_subgroup_arithmetic)
@@ -26,6 +26,7 @@ layout(local_size_x = 4, local_size_y = 4, local_size_z = 4) in;
 #endif
 
 #include "cluster_data_inc.glsl"
+#include "light_data_inc.glsl"
 
 #define M_PI 3.14159265359
 
diff --git a/servers/rendering/renderer_scene.h b/servers/rendering/renderer_scene.h
index b546001843..db1e3d1377 100644
--- a/servers/rendering/renderer_scene.h
+++ b/servers/rendering/renderer_scene.h
@@ -49,6 +49,10 @@ public:
 	virtual void camera_set_use_vertical_aspect(RID p_camera, bool p_enable) = 0;
 	virtual bool is_camera(RID p_camera) const = 0;
 
+	virtual RID occluder_allocate() = 0;
+	virtual void occluder_initialize(RID p_occluder) = 0;
+	virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) = 0;
+
 	virtual RID scenario_allocate() = 0;
 	virtual void scenario_initialize(RID p_rid) = 0;
 
@@ -69,7 +73,7 @@ public:
 	virtual void instance_set_transform(RID p_instance, const Transform &p_transform) = 0;
 	virtual void instance_attach_object_instance_id(RID p_instance, ObjectID p_id) = 0;
 	virtual void instance_set_blend_shape_weight(RID p_instance, int p_shape, float p_weight) = 0;
-	virtual void instance_set_surface_material(RID p_instance, int p_surface, RID p_material) = 0;
+	virtual void instance_set_surface_override_material(RID p_instance, int p_surface, RID p_material) = 0;
 	virtual void instance_set_visible(RID p_instance, bool p_visible) = 0;
 
 	virtual void instance_set_custom_aabb(RID p_instance, AABB p_aabb) = 0;
@@ -197,8 +201,8 @@ public:
 	virtual void sdfgi_set_debug_probe_select(const Vector3 &p_position, const Vector3 &p_dir) = 0;
 
 	virtual void render_empty_scene(RID p_render_buffers, RID p_scenario, RID p_shadow_atlas) = 0;
-	virtual void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0;
-	virtual void render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0;
+	virtual void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0;
+	virtual void render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_lod_threshold, RID p_shadow_atlas) = 0;
 
 	virtual void update() = 0;
 	virtual void render_probes() = 0;
diff --git a/servers/rendering/renderer_scene_cull.cpp b/servers/rendering/renderer_scene_cull.cpp
index e8155e4025..fcea8e4ffc 100644
--- a/servers/rendering/renderer_scene_cull.cpp
+++ b/servers/rendering/renderer_scene_cull.cpp
@@ -109,6 +109,20 @@ bool RendererSceneCull::is_camera(RID p_camera) const {
 	return camera_owner.owns(p_camera);
 }
 
+/* OCCLUDER API */
+
+RID RendererSceneCull::occluder_allocate() {
+	return RendererSceneOcclusionCull::get_singleton()->occluder_allocate();
+}
+
+void RendererSceneCull::occluder_initialize(RID p_rid) {
+	RendererSceneOcclusionCull::get_singleton()->occluder_initialize(p_rid);
+}
+
+void RendererSceneCull::occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) {
+	RendererSceneOcclusionCull::get_singleton()->occluder_set_mesh(p_occluder, p_vertices, p_indices);
+}
+
 /* SCENARIO API */
 
 void RendererSceneCull::_instance_pair(Instance *p_A, Instance *p_B) {
@@ -310,6 +324,8 @@ void RendererSceneCull::scenario_initialize(RID p_rid) {
 	scenario->instance_aabbs.set_page_pool(&instance_aabb_page_pool);
 	scenario->instance_data.set_page_pool(&instance_data_page_pool);
 
+	RendererSceneOcclusionCull::get_singleton()->add_scenario(p_rid);
+
 	scenario_owner.initialize_rid(p_rid, scenario);
 }
 
@@ -497,6 +513,11 @@ void RendererSceneCull::instance_set_base(RID p_instance, RID p_base) {
 				scene_render->free(gi_probe->probe_instance);
 
 			} break;
+			case RS::INSTANCE_OCCLUDER: {
+				if (scenario && instance->visible) {
+					RendererSceneOcclusionCull::get_singleton()->scenario_remove_instance(instance->scenario->self, p_instance);
+				}
+			} break;
 			default: {
 			}
 		}
@@ -514,6 +535,11 @@ void RendererSceneCull::instance_set_base(RID p_instance, RID p_base) {
 
 	if (p_base.is_valid()) {
 		instance->base_type = RSG::storage->get_base_type(p_base);
+
+		if (instance->base_type == RS::INSTANCE_NONE && RendererSceneOcclusionCull::get_singleton()->is_occluder(p_base)) {
+			instance->base_type = RS::INSTANCE_OCCLUDER;
+		}
+
 		ERR_FAIL_COND(instance->base_type == RS::INSTANCE_NONE);
 
 		switch (instance->base_type) {
@@ -588,6 +614,11 @@ void RendererSceneCull::instance_set_base(RID p_instance, RID p_base) {
 				gi_probe->probe_instance = scene_render->gi_probe_instance_create(p_base);
 
 			} break;
+			case RS::INSTANCE_OCCLUDER: {
+				if (scenario) {
+					RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(scenario->self, p_instance, p_base, instance->transform, instance->visible);
+				}
+			} break;
 			default: {
 			}
 		}
@@ -655,6 +686,11 @@ void RendererSceneCull::instance_set_scenario(RID p_instance, RID p_scenario) {
 					gi_probe_update_list.remove(&gi_probe->update_element);
 				}
 			} break;
+			case RS::INSTANCE_OCCLUDER: {
+				if (instance->visible) {
+					RendererSceneOcclusionCull::get_singleton()->scenario_remove_instance(instance->scenario->self, p_instance);
+				}
+			} break;
 			default: {
 			}
 		}
@@ -684,6 +720,9 @@ void RendererSceneCull::instance_set_scenario(RID p_instance, RID p_scenario) {
 					gi_probe_update_list.add(&gi_probe->update_element);
 				}
 			} break;
+			case RS::INSTANCE_OCCLUDER: {
+				RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(scenario->self, p_instance, instance->base, instance->transform, instance->visible);
+			} break;
 			default: {
 			}
 		}
@@ -752,7 +791,7 @@ void RendererSceneCull::instance_set_blend_shape_weight(RID p_instance, int p_sh
 	}
 }
 
-void RendererSceneCull::instance_set_surface_material(RID p_instance, int p_surface, RID p_material) {
+void RendererSceneCull::instance_set_surface_override_material(RID p_instance, int p_surface, RID p_material) {
 	Instance *instance = instance_owner.getornull(p_instance);
 	ERR_FAIL_COND(!instance);
 
@@ -801,6 +840,12 @@ void RendererSceneCull::instance_set_visible(RID p_instance, bool p_visible) {
 		InstanceParticlesCollisionData *collision = static_cast<InstanceParticlesCollisionData *>(instance->base_data);
 		RSG::storage->particles_collision_instance_set_active(collision->instance, p_visible);
 	}
+
+	if (instance->base_type == RS::INSTANCE_OCCLUDER) {
+		if (instance->scenario) {
+			RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(instance->scenario->self, p_instance, instance->base, instance->transform, p_visible);
+		}
+	}
 }
 
 inline bool is_geometry_instance(RenderingServer::InstanceType p_type) {
@@ -998,6 +1043,18 @@ void RendererSceneCull::instance_geometry_set_flag(RID p_instance, RS::InstanceF
 			}
 
 		} break;
+		case RS::INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING: {
+			instance->ignore_occlusion_culling = p_enabled;
+
+			if (instance->scenario && instance->array_index >= 0) {
+				InstanceData &idata = instance->scenario->instance_data[instance->array_index];
+				if (instance->ignore_occlusion_culling) {
+					idata.flags |= InstanceData::FLAG_IGNORE_OCCLUSION_CULLING;
+				} else {
+					idata.flags &= ~uint32_t(InstanceData::FLAG_IGNORE_OCCLUSION_CULLING);
+				}
+			}
+		} break;
 		default: {
 		}
 	}
@@ -1210,6 +1267,10 @@ void RendererSceneCull::_update_instance(Instance *p_instance) {
 			heightfield_particle_colliders_update_list.insert(p_instance);
 		}
 		RSG::storage->particles_collision_instance_set_transform(collision->instance, p_instance->transform);
+	} else if (p_instance->base_type == RS::INSTANCE_OCCLUDER) {
+		if (p_instance->scenario) {
+			RendererSceneOcclusionCull::get_singleton()->scenario_set_instance(p_instance->scenario->self, p_instance->self, p_instance->base, p_instance->transform, p_instance->visible);
+		}
 	}
 
 	if (p_instance->aabb.has_no_surface()) {
@@ -1337,6 +1398,9 @@ void RendererSceneCull::_update_instance(Instance *p_instance) {
 		if (p_instance->mesh_instance.is_valid()) {
 			idata.flags |= InstanceData::FLAG_USES_MESH_INSTANCE;
 		}
+		if (p_instance->ignore_occlusion_culling) {
+			idata.flags |= InstanceData::FLAG_IGNORE_OCCLUSION_CULLING;
+		}
 
 		p_instance->scenario->instance_data.push_back(idata);
 		p_instance->scenario->instance_aabbs.push_back(InstanceBounds(p_instance->transformed_aabb));
@@ -1363,6 +1427,9 @@ void RendererSceneCull::_update_instance(Instance *p_instance) {
 		pair.pair_mask |= 1 << RS::INSTANCE_LIGHT;
 		pair.pair_mask |= 1 << RS::INSTANCE_GI_PROBE;
 		pair.pair_mask |= 1 << RS::INSTANCE_LIGHTMAP;
+		if (p_instance->base_type == RS::INSTANCE_PARTICLES) {
+			pair.pair_mask |= 1 << RS::INSTANCE_PARTICLES_COLLISION;
+		}
 
 		pair.pair_mask |= geometry_instance_pair_mask;
 
@@ -2119,7 +2186,7 @@ bool RendererSceneCull::_light_instance_update_shadow(Instance *p_instance, cons
 	return animated_material_found;
 }
 
-void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) {
+void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) {
 // render to mono camera
 #ifndef _3D_DISABLED
 
@@ -2164,11 +2231,14 @@ void RendererSceneCull::render_camera(RID p_render_buffers, RID p_camera, RID p_
 
 	RID environment = _render_get_environment(p_camera, p_scenario);
 
-	_render_scene(camera->transform, camera_matrix, ortho, camera->vaspect, p_render_buffers, environment, camera->effects, camera->visible_layers, p_scenario, p_shadow_atlas, RID(), -1, p_screen_lod_threshold);
+	RENDER_TIMESTAMP("Update occlusion buffer")
+	RendererSceneOcclusionCull::get_singleton()->buffer_update(p_viewport, camera->transform, camera_matrix, ortho, RendererThreadPool::singleton->thread_work_pool);
+
+	_render_scene(camera->transform, camera_matrix, ortho, camera->vaspect, p_render_buffers, environment, camera->effects, camera->visible_layers, p_scenario, p_viewport, p_shadow_atlas, RID(), -1, p_screen_lod_threshold);
 #endif
 }
 
-void RendererSceneCull::render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) {
+void RendererSceneCull::render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas) {
 	// render for AR/VR interface
 #if 0
 	Camera *camera = camera_owner.getornull(p_camera);
@@ -2253,7 +2323,7 @@ void RendererSceneCull::render_camera(RID p_render_buffers, Ref<XRInterface> &p_
 #endif
 };
 
-void RendererSceneCull::_frustum_cull_threaded(uint32_t p_thread, FrustumCullData *cull_data) {
+void RendererSceneCull::_frustum_cull_threaded(uint32_t p_thread, CullData *cull_data) {
 	uint32_t cull_total = cull_data->scenario->instance_data.size();
 	uint32_t total_threads = RendererThreadPool::singleton->thread_work_pool.get_thread_count();
 	uint32_t cull_from = p_thread * cull_total / total_threads;
@@ -2262,7 +2332,7 @@ void RendererSceneCull::_frustum_cull_threaded(uint32_t p_thread, FrustumCullDat
 	_frustum_cull(*cull_data, frustum_cull_result_threads[p_thread], cull_from, cull_to);
 }
 
-void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to) {
+void RendererSceneCull::_frustum_cull(CullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to) {
 	uint64_t frame_number = RSG::rasterizer->get_frame_number();
 	float lightmap_probe_update_speed = RSG::storage->lightmap_get_probe_capture_update_speed() * RSG::rasterizer->get_frame_delta_time();
 
@@ -2271,10 +2341,14 @@ void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullRes
 
 	RID instance_pair_buffer[MAX_INSTANCE_PAIRS];
 
+	Transform inv_cam_transform = cull_data.cam_transform.inverse();
+	float z_near = cull_data.camera_matrix->get_z_near();
+
 	for (uint64_t i = p_from; i < p_to; i++) {
 		bool mesh_visible = false;
 
-		if (cull_data.scenario->instance_aabbs[i].in_frustum(cull_data.cull->frustum)) {
+		if (cull_data.scenario->instance_aabbs[i].in_frustum(cull_data.cull->frustum) && (cull_data.occlusion_buffer == nullptr || cull_data.scenario->instance_data[i].flags & InstanceData::FLAG_IGNORE_OCCLUSION_CULLING ||
+																								 !cull_data.occlusion_buffer->is_occluded(cull_data.scenario->instance_aabbs[i].bounds, cull_data.cam_transform.origin, inv_cam_transform, *cull_data.camera_matrix, z_near))) {
 			InstanceData &idata = cull_data.scenario->instance_data[i];
 			uint32_t base_type = idata.flags & InstanceData::FLAG_BASE_TYPE_MASK;
 
@@ -2320,7 +2394,7 @@ void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullRes
 				cull_result.gi_probes.push_back(RID::from_uint64(idata.instance_data_rid));
 
 			} else if (base_type == RS::INSTANCE_LIGHTMAP) {
-				cull_result.gi_probes.push_back(RID::from_uint64(idata.instance_data_rid));
+				cull_result.lightmaps.push_back(RID::from_uint64(idata.instance_data_rid));
 			} else if (((1 << base_type) & RS::INSTANCE_GEOMETRY_MASK) && !(idata.flags & InstanceData::FLAG_CAST_SHADOWS_ONLY)) {
 				bool keep = true;
 
@@ -2339,7 +2413,7 @@ void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullRes
 						cull_data.cull->lock.lock();
 						RSG::storage->particles_request_process(idata.base_rid);
 						cull_data.cull->lock.unlock();
-						RSG::storage->particles_set_view_axis(idata.base_rid, -cull_data.cam_transform.basis.get_axis(2).normalized());
+						RSG::storage->particles_set_view_axis(idata.base_rid, -cull_data.cam_transform.basis.get_axis(2).normalized(), cull_data.cam_transform.basis.get_axis(1).normalized());
 						//particles visible? request redraw
 						RenderingServerDefault::redraw_request();
 					}
@@ -2379,18 +2453,19 @@ void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullRes
 				}
 
 				if (geometry_instance_pair_mask & (1 << RS::INSTANCE_DECAL) && (idata.flags & InstanceData::FLAG_GEOM_DECAL_DIRTY)) {
-					//InstanceGeometryData *geom = static_cast<InstanceGeometryData *>(idata.instance->base_data);
-					//todo for GLES3
-					idata.flags &= ~uint32_t(InstanceData::FLAG_GEOM_DECAL_DIRTY);
-					/*for (Set<Instance *>::Element *E = geom->dec.front(); E; E = E->next()) {
-					InstanceReflectionProbeData *reflection_probe = static_cast<InstanceReflectionProbeData *>(E->get()->base_data);
+					InstanceGeometryData *geom = static_cast<InstanceGeometryData *>(idata.instance->base_data);
+					uint32_t idx = 0;
 
-					instance_pair_buffer[idx++] = reflection_probe->instance;
-					if (idx==MAX_INSTANCE_PAIRS) {
-						break;
+					for (Set<Instance *>::Element *E = geom->decals.front(); E; E = E->next()) {
+						InstanceDecalData *decal = static_cast<InstanceDecalData *>(E->get()->base_data);
+
+						instance_pair_buffer[idx++] = decal->instance;
+						if (idx == MAX_INSTANCE_PAIRS) {
+							break;
+						}
 					}
-				}*/
-					//scene_render->geometry_instance_pair_decal_instances(geom->geometry_instance, light_instances, idx);
+					scene_render->geometry_instance_pair_decal_instances(geom->geometry_instance, instance_pair_buffer, idx);
+					idata.flags &= ~uint32_t(InstanceData::FLAG_GEOM_DECAL_DIRTY);
 				}
 
 				if (idata.flags & InstanceData::FLAG_GEOM_GI_PROBE_DIRTY) {
@@ -2469,7 +2544,7 @@ void RendererSceneCull::_frustum_cull(FrustumCullData &cull_data, FrustumCullRes
 	}
 }
 
-void RendererSceneCull::_render_scene(const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows) {
+void RendererSceneCull::_render_scene(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_viewport, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows) {
 	// Note, in stereo rendering:
 	// - p_cam_transform will be a transform in the middle of our two eyes
 	// - p_cam_projection is a wider frustrum that encompasses both eyes
@@ -2566,7 +2641,7 @@ void RendererSceneCull::_render_scene(const Transform p_cam_transform, const Cam
 		uint64_t cull_from = 0;
 		uint64_t cull_to = scenario->instance_data.size();
 
-		FrustumCullData cull_data;
+		CullData cull_data;
 
 		//prepare for eventual thread usage
 		cull_data.cull = &cull;
@@ -2575,6 +2650,8 @@ void RendererSceneCull::_render_scene(const Transform p_cam_transform, const Cam
 		cull_data.cam_transform = p_cam_transform;
 		cull_data.visible_layers = p_visible_layers;
 		cull_data.render_reflection_probe = render_reflection_probe;
+		cull_data.occlusion_buffer = RendererSceneOcclusionCull::get_singleton()->buffer_get_ptr(p_viewport);
+		cull_data.camera_matrix = &p_cam_projection;
 //#define DEBUG_CULL_TIME
 #ifdef DEBUG_CULL_TIME
 		uint64_t time_from = OS::get_singleton()->get_ticks_usec();
@@ -2781,8 +2858,13 @@ void RendererSceneCull::_render_scene(const Transform p_cam_transform, const Cam
 	}
 	/* PROCESS GEOMETRY AND DRAW SCENE */
 
+	RID occluders_tex;
+	if (p_viewport.is_valid()) {
+		occluders_tex = RSG::viewport->viewport_get_occluder_debug_texture(p_viewport);
+	}
+
 	RENDER_TIMESTAMP("Render Scene ");
-	scene_render->render_scene(p_render_buffers, p_cam_transform, p_cam_projection, p_cam_orthogonal, frustum_cull_result.geometry_instances, frustum_cull_result.light_instances, frustum_cull_result.reflections, frustum_cull_result.gi_probes, frustum_cull_result.decals, frustum_cull_result.lightmaps, p_environment, camera_effects, p_shadow_atlas, p_reflection_probe.is_valid() ? RID() : scenario->reflection_atlas, p_reflection_probe, p_reflection_probe_pass, p_screen_lod_threshold, render_shadow_data, max_shadows_used, render_sdfgi_data, cull.sdfgi.region_count, &sdfgi_update_data);
+	scene_render->render_scene(p_render_buffers, p_cam_transform, p_cam_projection, p_cam_orthogonal, frustum_cull_result.geometry_instances, frustum_cull_result.light_instances, frustum_cull_result.reflections, frustum_cull_result.gi_probes, frustum_cull_result.decals, frustum_cull_result.lightmaps, p_environment, camera_effects, p_shadow_atlas, occluders_tex, p_reflection_probe.is_valid() ? RID() : scenario->reflection_atlas, p_reflection_probe, p_reflection_probe_pass, p_screen_lod_threshold, render_shadow_data, max_shadows_used, render_sdfgi_data, cull.sdfgi.region_count, &sdfgi_update_data);
 
 	for (uint32_t i = 0; i < max_shadows_used; i++) {
 		render_shadow_data[i].instances.clear();
@@ -2829,7 +2911,7 @@ void RendererSceneCull::render_empty_scene(RID p_render_buffers, RID p_scenario,
 		environment = scenario->fallback_environment;
 	}
 	RENDER_TIMESTAMP("Render Empty Scene ");
-	scene_render->render_scene(p_render_buffers, Transform(), CameraMatrix(), true, PagedArray<RendererSceneRender::GeometryInstance *>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), RID(), RID(), p_shadow_atlas, scenario->reflection_atlas, RID(), 0, 0, nullptr, 0, nullptr, 0, nullptr);
+	scene_render->render_scene(p_render_buffers, Transform(), CameraMatrix(), true, PagedArray<RendererSceneRender::GeometryInstance *>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), PagedArray<RID>(), RID(), RID(), p_shadow_atlas, RID(), scenario->reflection_atlas, RID(), 0, 0, nullptr, 0, nullptr, 0, nullptr);
 #endif
 }
 
@@ -2891,8 +2973,15 @@ bool RendererSceneCull::_render_reflection_probe_step(Instance *p_instance, int
 			shadow_atlas = scenario->reflection_probe_shadow_atlas;
 		}
 
+		RID environment;
+		if (scenario->environment.is_valid()) {
+			environment = scenario->environment;
+		} else {
+			environment = scenario->fallback_environment;
+		}
+
 		RENDER_TIMESTAMP("Render Reflection Probe, Step " + itos(p_step));
-		_render_scene(xform, cm, false, false, RID(), RID(), RID(), RSG::storage->reflection_probe_get_cull_mask(p_instance->base), p_instance->scenario->self, shadow_atlas, reflection_probe->instance, p_step, lod_threshold, use_shadows);
+		_render_scene(xform, cm, false, false, RID(), environment, RID(), RSG::storage->reflection_probe_get_cull_mask(p_instance->base), p_instance->scenario->self, RID(), shadow_atlas, reflection_probe->instance, p_step, lod_threshold, use_shadows);
 
 	} else {
 		//do roughness postprocess step until it believes it's done
@@ -3466,8 +3555,11 @@ bool RendererSceneCull::free(RID p_rid) {
 		scene_render->free(scenario->reflection_probe_shadow_atlas);
 		scene_render->free(scenario->reflection_atlas);
 		scenario_owner.free(p_rid);
+		RendererSceneOcclusionCull::get_singleton()->remove_scenario(p_rid);
 		memdelete(scenario);
 
+	} else if (RendererSceneOcclusionCull::get_singleton()->is_occluder(p_rid)) {
+		RendererSceneOcclusionCull::get_singleton()->free_occluder(p_rid);
 	} else if (instance_owner.owns(p_rid)) {
 		// delete the instance
 
@@ -3536,6 +3628,8 @@ RendererSceneCull::RendererSceneCull() {
 	indexer_update_iterations = GLOBAL_GET("rendering/limits/spatial_indexer/update_iterations_per_frame");
 	thread_cull_threshold = GLOBAL_GET("rendering/limits/spatial_indexer/threaded_cull_minimum_instances");
 	thread_cull_threshold = MAX(thread_cull_threshold, (uint32_t)RendererThreadPool::singleton->thread_work_pool.get_thread_count()); //make sure there is at least one thread per CPU
+
+	dummy_occlusion_culling = memnew(RendererSceneOcclusionCull);
 }
 
 RendererSceneCull::~RendererSceneCull() {
@@ -3554,4 +3648,8 @@ RendererSceneCull::~RendererSceneCull() {
 		frustum_cull_result_threads[i].reset();
 	}
 	frustum_cull_result_threads.clear();
+
+	if (dummy_occlusion_culling) {
+		memdelete(dummy_occlusion_culling);
+	}
 }
diff --git a/servers/rendering/renderer_scene_cull.h b/servers/rendering/renderer_scene_cull.h
index 32f4334288..a61b04afc8 100644
--- a/servers/rendering/renderer_scene_cull.h
+++ b/servers/rendering/renderer_scene_cull.h
@@ -45,8 +45,10 @@
 #include "core/templates/rid_owner.h"
 #include "core/templates/self_list.h"
 #include "servers/rendering/renderer_scene.h"
+#include "servers/rendering/renderer_scene_occlusion_cull.h"
 #include "servers/rendering/renderer_scene_render.h"
 #include "servers/xr/xr_interface.h"
+
 class RendererSceneCull : public RendererScene {
 public:
 	RendererSceneRender *scene_render;
@@ -109,6 +111,14 @@ public:
 	virtual void camera_set_use_vertical_aspect(RID p_camera, bool p_enable);
 	virtual bool is_camera(RID p_camera) const;
 
+	/* OCCLUDER API */
+
+	virtual RID occluder_allocate();
+	virtual void occluder_initialize(RID p_occluder);
+	virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices);
+
+	RendererSceneOcclusionCull *dummy_occlusion_culling;
+
 	/* SCENARIO API */
 
 	struct Instance;
@@ -248,6 +258,7 @@ public:
 			FLAG_USES_BAKED_LIGHT = (1 << 16),
 			FLAG_USES_MESH_INSTANCE = (1 << 17),
 			FLAG_REFLECTION_PROBE_DIRTY = (1 << 18),
+			FLAG_IGNORE_OCCLUSION_CULLING = (1 << 19),
 		};
 
 		uint32_t flags = 0;
@@ -346,6 +357,8 @@ public:
 
 		float lod_bias;
 
+		bool ignore_occlusion_culling;
+
 		Vector<RID> materials;
 
 		RS::ShadowCastingSetting cast_shadows;
@@ -430,6 +443,7 @@ public:
 					singleton->_instance_queue_update(instance, false, true);
 				} break;
 				case RendererStorage::DEPENDENCY_CHANGED_MESH:
+				case RendererStorage::DEPENDENCY_CHANGED_PARTICLES:
 				case RendererStorage::DEPENDENCY_CHANGED_MULTIMESH:
 				case RendererStorage::DEPENDENCY_CHANGED_DECAL:
 				case RendererStorage::DEPENDENCY_CHANGED_LIGHT:
@@ -470,6 +484,7 @@ public:
 			lightmap = nullptr;
 			lightmap_cull_index = 0;
 			lod_bias = 1.0;
+			ignore_occlusion_culling = false;
 
 			scenario = nullptr;
 
@@ -647,6 +662,7 @@ public:
 
 		_FORCE_INLINE_ bool operator()(void *p_data) {
 			Instance *p_instance = (Instance *)p_data;
+
 			if (instance != p_instance && instance->transformed_aabb.intersects(p_instance->transformed_aabb) && (pair_mask & (1 << p_instance->base_type))) {
 				//test is more coarse in indexer
 				p_instance->pair_check = pair_pass;
@@ -840,7 +856,7 @@ public:
 	virtual void instance_set_transform(RID p_instance, const Transform &p_transform);
 	virtual void instance_attach_object_instance_id(RID p_instance, ObjectID p_id);
 	virtual void instance_set_blend_shape_weight(RID p_instance, int p_shape, float p_weight);
-	virtual void instance_set_surface_material(RID p_instance, int p_surface, RID p_material);
+	virtual void instance_set_surface_override_material(RID p_instance, int p_surface, RID p_material);
 	virtual void instance_set_visible(RID p_instance, bool p_visible);
 
 	virtual void instance_set_custom_aabb(RID p_instance, AABB p_aabb);
@@ -921,24 +937,26 @@ public:
 		Frustum frustum;
 	} cull;
 
-	struct FrustumCullData {
+	struct CullData {
 		Cull *cull;
 		Scenario *scenario;
 		RID shadow_atlas;
 		Transform cam_transform;
 		uint32_t visible_layers;
 		Instance *render_reflection_probe;
+		const RendererSceneOcclusionCull::HZBuffer *occlusion_buffer;
+		const CameraMatrix *camera_matrix;
 	};
 
-	void _frustum_cull_threaded(uint32_t p_thread, FrustumCullData *cull_data);
-	void _frustum_cull(FrustumCullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to);
+	void _frustum_cull_threaded(uint32_t p_thread, CullData *cull_data);
+	void _frustum_cull(CullData &cull_data, FrustumCullResult &cull_result, uint64_t p_from, uint64_t p_to);
 
 	bool _render_reflection_probe_step(Instance *p_instance, int p_step);
-	void _render_scene(const Transform p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows = true);
+	void _render_scene(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, bool p_cam_vaspect, RID p_render_buffers, RID p_environment, RID p_force_camera_effects, uint32_t p_visible_layers, RID p_scenario, RID p_viewport, RID p_shadow_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, bool p_using_shadows = true);
 	void render_empty_scene(RID p_render_buffers, RID p_scenario, RID p_shadow_atlas);
 
-	void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas);
-	void render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas);
+	void render_camera(RID p_render_buffers, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas);
+	void render_camera(RID p_render_buffers, Ref<XRInterface> &p_interface, XRInterface::Eyes p_eye, RID p_camera, RID p_scenario, RID p_viewport, Size2 p_viewport_size, float p_screen_lod_threshold, RID p_shadow_atlas);
 	void update_dirty_instances();
 
 	void render_particle_colliders();
diff --git a/servers/rendering/renderer_scene_occlusion_cull.cpp b/servers/rendering/renderer_scene_occlusion_cull.cpp
new file mode 100644
index 0000000000..c491ccbe7a
--- /dev/null
+++ b/servers/rendering/renderer_scene_occlusion_cull.cpp
@@ -0,0 +1,192 @@
+/*************************************************************************/
+/*  renderer_scene_occlusion_cull.cpp                                    */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#include "renderer_scene_occlusion_cull.h"
+
+RendererSceneOcclusionCull *RendererSceneOcclusionCull::singleton = nullptr;
+
+const Vector3 RendererSceneOcclusionCull::HZBuffer::corners[8] = { // Per-axis 0/1 weights; is_occluded() blends the AABB bounds with them to enumerate the box's eight corners.
+	Vector3(0, 0, 0),
+	Vector3(0, 0, 1),
+	Vector3(0, 1, 0),
+	Vector3(0, 1, 1),
+	Vector3(1, 0, 0),
+	Vector3(1, 0, 1),
+	Vector3(1, 1, 0),
+	Vector3(1, 1, 1)
+};
+
+bool RendererSceneOcclusionCull::HZBuffer::is_empty() const {
+	return sizes.is_empty(); // No recorded mip sizes means the buffer was never sized or has been cleared.
+}
+
+void RendererSceneOcclusionCull::HZBuffer::clear() { // Releases the depth data and every debug resource; safe to call repeatedly.
+	if (sizes.is_empty()) {
+		return; // Already cleared
+	}
+
+	data.clear();
+	sizes.clear();
+	mips.clear();
+	debug_data.clear();
+	debug_image.unref(); // No-op when the image was never created.
+	if (debug_texture.is_valid()) { // Never hand a null RID to the server.
+		RS::get_singleton()->free(debug_texture);
+		debug_texture = RID(); // Forget the freed handle so resize() cannot free it a second time.
+	}
+}
+
+void RendererSceneOcclusionCull::HZBuffer::resize(const Size2i &p_size) { // Rebuilds the mip chain for a base level of p_size; a zero size clears the buffer.
+	if (p_size == Size2i()) {
+		clear();
+		return;
+	}
+
+	if (!sizes.is_empty() && p_size == sizes[0]) {
+		return; // Size didn't change
+	}
+	// First pass: count the mip levels and the total number of floats they need.
+	int mip_count = 0;
+	int data_size = 0;
+	int w = p_size.x;
+	int h = p_size.y;
+
+	while (true) {
+		data_size += h * w;
+
+		w = MAX(1, w >> 1);
+		h = MAX(1, h >> 1);
+
+		mip_count++;
+		// Once the next level is 1x1, account for it here and stop halving.
+		if (w == 1U && h == 1U) {
+			data_size += 1U;
+			mip_count++;
+			break;
+		}
+	}
+
+	data.resize(data_size);
+	mips.resize(mip_count);
+	sizes.resize(mip_count);
+	// Second pass: record each level's size and its start pointer inside the single `data` allocation.
+	w = p_size.x;
+	h = p_size.y;
+	float *ptr = data.ptr();
+
+	for (int i = 0; i < mip_count; i++) {
+		sizes[i] = Size2i(w, h);
+		mips[i] = ptr;
+
+		ptr = &ptr[w * h];
+		w = MAX(1, w >> 1);
+		h = MAX(1, h >> 1);
+	}
+	// FLT_MAX in every texel: is_occluded() only culls when all sampled depths are below the box's depth.
+	for (int i = 0; i < data_size; i++) {
+		data[i] = FLT_MAX;
+	}
+	// The debug byte buffer matches mip 0; the old texture is freed and get_debug_texture() creates a new one on demand.
+	debug_data.resize(sizes[0].x * sizes[0].y);
+	if (debug_texture.is_valid()) {
+		RS::get_singleton()->free(debug_texture);
+		debug_texture = RID();
+	}
+}
+
+void RendererSceneOcclusionCull::HZBuffer::update_mips() { // Rebuilds mips 1..N from mip 0; each texel stores the maximum depth of the parent texels it covers.
+	if (sizes.is_empty()) {
+		return;
+	}
+
+	for (uint32_t mip = 1; mip < mips.size(); mip++) {
+		for (int y = 0; y < sizes[mip].y; y++) {
+			for (int x = 0; x < sizes[mip].x; x++) {
+				int prev_x = x * 2;
+				int prev_y = y * 2;
+
+				int prev_w = sizes[mip - 1].width;
+				int prev_h = sizes[mip - 1].height;
+
+				bool odd_w = (prev_w % 2) != 0;
+				bool odd_h = (prev_h % 2) != 0;
+				// Folds in the parent texel at offset (xx, yy), with the coordinates clamped to the parent's bounds.
+#define CHECK_OFFSET(xx, yy) max_depth = MAX(max_depth, mips[mip - 1][MIN(prev_h - 1, prev_y + (yy)) * prev_w + MIN(prev_w - 1, prev_x + (xx))])
+
+				float max_depth = mips[mip - 1][prev_y * sizes[mip - 1].x + prev_x];
+				CHECK_OFFSET(0, 1);
+				CHECK_OFFSET(1, 0);
+				CHECK_OFFSET(1, 1);
+				// An odd parent dimension does not halve evenly, so a third column and/or row is folded in as well.
+				if (odd_w) {
+					CHECK_OFFSET(2, 0);
+					CHECK_OFFSET(2, 1);
+				}
+
+				if (odd_h) {
+					CHECK_OFFSET(0, 2);
+					CHECK_OFFSET(1, 2);
+				}
+
+				if (odd_w && odd_h) {
+					CHECK_OFFSET(2, 2);
+				}
+
+				mips[mip][y * sizes[mip].x + x] = max_depth;
+#undef CHECK_OFFSET
+			}
+		}
+	}
+}
+
+RID RendererSceneOcclusionCull::HZBuffer::get_debug_texture() { // Uploads mip 0 as an 8-bit luminance image and returns the texture; returns a null RID when the buffer is empty.
+	if (sizes.is_empty() || sizes[0] == Size2i()) {
+		return RID();
+	}
+
+	if (debug_image.is_null()) {
+		debug_image.instance();
+	}
+	// Scale each mip-0 depth to 0..255, saturating at debug_tex_range. NOTE(review): the range is a divisor here; confirm it is always set to a non-zero value.
+	unsigned char *ptrw = debug_data.ptrw();
+	for (int i = 0; i < debug_data.size(); i++) {
+		ptrw[i] = MIN(mips[0][i] / debug_tex_range, 1.0) * 255;
+	}
+
+	debug_image->create(sizes[0].x, sizes[0].y, false, Image::FORMAT_L8, debug_data);
+	// Create the texture on first use; afterwards update the existing one in place.
+	if (debug_texture.is_null()) {
+		debug_texture = RS::get_singleton()->texture_2d_create(debug_image);
+	} else {
+		RenderingServer::get_singleton()->texture_2d_update_immediate(debug_texture, debug_image);
+	}
+
+	return debug_texture;
+}
diff --git a/servers/rendering/renderer_scene_occlusion_cull.h b/servers/rendering/renderer_scene_occlusion_cull.h
new file mode 100644
index 0000000000..390bbaa64b
--- /dev/null
+++ b/servers/rendering/renderer_scene_occlusion_cull.h
@@ -0,0 +1,201 @@
+/*************************************************************************/
+/*  renderer_scene_occlusion_cull.h                                      */
+/*************************************************************************/
+/*                       This file is part of:                           */
+/*                           GODOT ENGINE                                */
+/*                      https://godotengine.org                          */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/*                                                                       */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the       */
+/* "Software"), to deal in the Software without restriction, including   */
+/* without limitation the rights to use, copy, modify, merge, publish,   */
+/* distribute, sublicense, and/or sell copies of the Software, and to    */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions:                                             */
+/*                                                                       */
+/* The above copyright notice and this permission notice shall be        */
+/* included in all copies or substantial portions of the Software.       */
+/*                                                                       */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,       */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF    */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY  */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,  */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE     */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                */
+/*************************************************************************/
+
+#ifndef RENDERER_SCENE_OCCLUSION_CULL_H
+#define RENDERER_SCENE_OCCLUSION_CULL_H
+
+#include "core/math/camera_matrix.h"
+#include "core/templates/local_vector.h"
+#include "servers/rendering_server.h"
+
+class RendererSceneOcclusionCull { // Base occlusion-culling interface; the default implementations do nothing beyond warning that the feature is disabled at build time.
+protected:
+	static RendererSceneOcclusionCull *singleton;
+
+public:
+	class HZBuffer { // Depth buffer plus its mip chain, queried through is_occluded().
+	protected:
+		static const Vector3 corners[8];
+		// `data` stores every mip level back to back; `sizes[i]` and `mips[i]` give level i's size and start pointer.
+		LocalVector<float> data;
+		LocalVector<Size2i> sizes;
+		LocalVector<float *> mips;
+		// Resources backing the visualization returned by get_debug_texture().
+		RID debug_texture;
+		Ref<Image> debug_image;
+		PackedByteArray debug_data;
+		float debug_tex_range = 0.0f;
+
+	public:
+		bool is_empty() const;
+		virtual void clear();
+		virtual void resize(const Size2i &p_size);
+
+		void update_mips();
+		// Tests a box (p_bounds = min xyz then max xyz, same space as p_cam_position) against the buffer; true only when every sampled texel is nearer than the box.
+		_FORCE_INLINE_ bool is_occluded(const float p_bounds[6], const Vector3 &p_cam_position, const Transform &p_cam_inv_transform, const CameraMatrix &p_cam_projection, float p_near) const {
+			if (is_empty()) {
+				return false;
+			}
+			// The point of the box nearest to the camera yields the smallest depth the box can have.
+			Vector3 closest_point = Vector3(CLAMP(p_cam_position.x, p_bounds[0], p_bounds[3]), CLAMP(p_cam_position.y, p_bounds[1], p_bounds[4]), CLAMP(p_cam_position.z, p_bounds[2], p_bounds[5]));
+			// Camera inside (or touching) the box: never report it as occluded.
+			if (closest_point == p_cam_position) {
+				return false;
+			}
+			// Boxes whose nearest point is closer than the near distance (or behind the camera) are kept visible.
+			Vector3 closest_point_view = p_cam_inv_transform.xform(closest_point);
+			if (closest_point_view.z > -p_near) {
+				return false;
+			}
+			// Depth is measured from the near plane: along -Z for orthogonal cameras, along the view ray otherwise.
+			float min_depth;
+			if (p_cam_projection.is_orthogonal()) {
+				min_depth = (-closest_point_view.z) - p_near;
+			} else {
+				float r = -p_near / closest_point_view.z;
+				Vector3 closest_point_proj = Vector3(closest_point_view.x * r, closest_point_view.y * r, -p_near);
+				min_depth = closest_point_proj.distance_to(closest_point_view);
+			}
+			// Bounding rectangle of the eight projected corners; FLT_MIN is a tiny positive value, so the maximum starts at -FLT_MAX.
+			Vector2 rect_min = Vector2(FLT_MAX, FLT_MAX);
+			Vector2 rect_max = Vector2(-FLT_MAX, -FLT_MAX);
+
+			for (int j = 0; j < 8; j++) {
+				Vector3 c = RendererSceneOcclusionCull::HZBuffer::corners[j];
+				Vector3 nc = Vector3(1, 1, 1) - c;
+				Vector3 corner = Vector3(p_bounds[0] * c.x + p_bounds[3] * nc.x, p_bounds[1] * c.y + p_bounds[4] * nc.y, p_bounds[2] * c.z + p_bounds[5] * nc.z);
+				Vector3 view = p_cam_inv_transform.xform(corner);
+
+				Vector3 projected = p_cam_projection.xform(view);
+				Vector2 normalized = Vector2(projected.x * 0.5f + 0.5f, projected.y * 0.5f + 0.5f);
+				rect_min = rect_min.min(normalized);
+				rect_max = rect_max.max(normalized);
+			}
+			// Clamp both corners to the 0..1 screen range so the rectangle can never be inverted (a negative size would reach log2() below).
+			rect_max = rect_max.max(Vector2(0, 0)).min(Vector2(1, 1));
+			rect_min = rect_min.max(Vector2(0, 0)).min(Vector2(1, 1));
+
+			int mip_count = mips.size();
+			// Start at the mip level where the rectangle spans about one texel, then refine towards mip 0.
+			Vector2 screen_diagonal = (rect_max - rect_min) * sizes[0];
+			float size = MAX(screen_diagonal.x, screen_diagonal.y);
+			float l = Math::ceil(Math::log2(size));
+			int lod = CLAMP(l, 0, mip_count - 1);
+			// Give up and report "visible" once the refinement has read more than max_samples texels in total.
+			const int max_samples = 512;
+			int sample_count = 0;
+			bool visible = true;
+
+			for (; lod >= 0; lod--) {
+				int w = sizes[lod].x;
+				int h = sizes[lod].y;
+
+				int minx = CLAMP(rect_min.x * w - 1, 0, w - 1);
+				int maxx = CLAMP(rect_max.x * w + 1, 0, w - 1);
+
+				int miny = CLAMP(rect_min.y * h - 1, 0, h - 1);
+				int maxy = CLAMP(rect_max.y * h + 1, 0, h - 1);
+
+				sample_count += (maxx - minx + 1) * (maxy - miny + 1);
+
+				if (sample_count > max_samples) {
+					return false;
+				}
+				// A texel farther than the box means it may be visible at this level, so a finer level is checked next.
+				visible = false;
+				for (int y = miny; y <= maxy; y++) {
+					for (int x = minx; x <= maxx; x++) {
+						float depth = mips[lod][y * w + x];
+						if (depth > min_depth) {
+							visible = true;
+							break;
+						}
+					}
+					if (visible) {
+						break;
+					}
+				}
+				// Every sampled texel is nearer than the box: it is occluded.
+				if (!visible) {
+					return true;
+				}
+			}
+
+			return !visible;
+		}
+
+		RID get_debug_texture();
+
+		virtual ~HZBuffer(){};
+	};
+
+	static RendererSceneOcclusionCull *get_singleton() { return singleton; }
+	// NOTE: the helper's name is misspelled ("warining"); it is kept as is because it is part of the public interface.
+	void _print_warining() {
+		WARN_PRINT_ONCE("Occlusion culling is disabled at build time.");
+	}
+
+	virtual bool is_occluder(RID p_rid) { return false; }
+	virtual RID occluder_allocate() { return RID(); }
+	virtual void occluder_initialize(RID p_occluder) {}
+	virtual void free_occluder(RID p_occluder) { _print_warining(); }
+	virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) { _print_warining(); }
+
+	virtual void add_scenario(RID p_scenario) {}
+	virtual void remove_scenario(RID p_scenario) {}
+	virtual void scenario_set_instance(RID p_scenario, RID p_instance, RID p_occluder, const Transform &p_xform, bool p_enabled) { _print_warining(); }
+	virtual void scenario_remove_instance(RID p_scenario, RID p_instance) { _print_warining(); }
+	// Occlusion buffers are addressed by RID; this base class never provides one (buffer_get_ptr() returns nullptr).
+	virtual void add_buffer(RID p_buffer) { _print_warining(); }
+	virtual void remove_buffer(RID p_buffer) { _print_warining(); }
+	virtual HZBuffer *buffer_get_ptr(RID p_buffer) {
+		return nullptr;
+	}
+	virtual void buffer_set_scenario(RID p_buffer, RID p_scenario) { _print_warining(); }
+	virtual void buffer_set_size(RID p_buffer, const Vector2i &p_size) { _print_warining(); }
+	virtual void buffer_update(RID p_buffer, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_orthogonal, ThreadWorkPool &p_thread_pool) {}
+	virtual RID buffer_get_debug_texture(RID p_buffer) {
+		_print_warining();
+		return RID();
+	}
+
+	virtual void set_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) {}
+	// Each constructed instance registers itself as the singleton, replacing any previous one.
+	RendererSceneOcclusionCull() {
+		singleton = this;
+	};
+	// NOTE(review): clears the singleton unconditionally, even if another instance registered itself afterwards.
+	virtual ~RendererSceneOcclusionCull() {
+		singleton = nullptr;
+	};
+};
+
+#endif //RENDERER_SCENE_OCCLUSION_CULL_H
diff --git a/servers/rendering/renderer_scene_render.h b/servers/rendering/renderer_scene_render.h
index 1dea3580b6..3f28fac549 100644
--- a/servers/rendering/renderer_scene_render.h
+++ b/servers/rendering/renderer_scene_render.h
@@ -216,7 +216,7 @@ public:
 		uint32_t positional_light_count;
 	};
 
-	virtual void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr) = 0;
+	virtual void render_scene(RID p_render_buffers, const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, const PagedArray<RID> &p_lights, const PagedArray<RID> &p_reflection_probes, const PagedArray<RID> &p_gi_probes, const PagedArray<RID> &p_decals, const PagedArray<RID> &p_lightmaps, RID p_environment, RID p_camera_effects, RID p_shadow_atlas, RID p_occluder_debug_tex, RID p_reflection_atlas, RID p_reflection_probe, int p_reflection_probe_pass, float p_screen_lod_threshold, const RenderShadowData *p_render_shadows, int p_render_shadow_count, const RenderSDFGIData *p_render_sdfgi_regions, int p_render_sdfgi_region_count, const RenderSDFGIUpdateData *p_sdfgi_update_data = nullptr) = 0;
 
 	virtual void render_material(const Transform &p_cam_transform, const CameraMatrix &p_cam_projection, bool p_cam_ortogonal, const PagedArray<GeometryInstance *> &p_instances, RID p_framebuffer, const Rect2i &p_region) = 0;
 	virtual void render_particle_collider_heightfield(RID p_collider, const Transform &p_transform, const PagedArray<GeometryInstance *> &p_instances) = 0;
@@ -241,8 +241,6 @@ public:
 
 	virtual void sdfgi_set_debug_probe_select(const Vector3 &p_position, const Vector3 &p_dir) = 0;
 
-	virtual bool is_low_end() const = 0;
-
 	virtual void update() = 0;
 	virtual ~RendererSceneRender() {}
 };
diff --git a/servers/rendering/renderer_storage.h b/servers/rendering/renderer_storage.h
index 22cf6acb19..15d99c038e 100644
--- a/servers/rendering/renderer_storage.h
+++ b/servers/rendering/renderer_storage.h
@@ -43,6 +43,7 @@ public:
 		DEPENDENCY_CHANGED_MESH,
 		DEPENDENCY_CHANGED_MULTIMESH,
 		DEPENDENCY_CHANGED_MULTIMESH_VISIBLE_INSTANCES,
+		DEPENDENCY_CHANGED_PARTICLES,
 		DEPENDENCY_CHANGED_DECAL,
 		DEPENDENCY_CHANGED_SKELETON_DATA,
 		DEPENDENCY_CHANGED_SKELETON_BONES,
@@ -498,8 +499,15 @@ public:
 	virtual void particles_set_use_local_coordinates(RID p_particles, bool p_enable) = 0;
 	virtual void particles_set_process_material(RID p_particles, RID p_material) = 0;
 	virtual void particles_set_fixed_fps(RID p_particles, int p_fps) = 0;
+	virtual void particles_set_interpolate(RID p_particles, bool p_enable) = 0;
 	virtual void particles_set_fractional_delta(RID p_particles, bool p_enable) = 0;
 	virtual void particles_set_collision_base_size(RID p_particles, float p_size) = 0;
+
+	virtual void particles_set_transform_align(RID p_particles, RS::ParticlesTransformAlign p_transform_align) = 0;
+
+	virtual void particles_set_trails(RID p_particles, bool p_enable, float p_length) = 0;
+	virtual void particles_set_trail_bind_poses(RID p_particles, const Vector<Transform> &p_bind_poses) = 0;
+
 	virtual void particles_restart(RID p_particles) = 0;
 	virtual void particles_emit(RID p_particles, const Transform &p_transform, const Vector3 &p_velocity, const Color &p_color, const Color &p_custom, uint32_t p_emit_flags) = 0;
 	virtual void particles_set_subemitter(RID p_particles, RID p_subemitter_particles) = 0;
@@ -520,7 +528,7 @@ public:
 	virtual int particles_get_draw_passes(RID p_particles) const = 0;
 	virtual RID particles_get_draw_pass_mesh(RID p_particles, int p_pass) const = 0;
 
-	virtual void particles_set_view_axis(RID p_particles, const Vector3 &p_axis) = 0;
+	virtual void particles_set_view_axis(RID p_particles, const Vector3 &p_axis, const Vector3 &p_up_axis) = 0;
 
 	virtual void particles_add_collision(RID p_particles, RID p_particles_collision_instance) = 0;
 	virtual void particles_remove_collision(RID p_particles, RID p_particles_collision_instance) = 0;
diff --git a/servers/rendering/renderer_viewport.cpp b/servers/rendering/renderer_viewport.cpp
index a5d5033c18..f7be6c6c60 100644
--- a/servers/rendering/renderer_viewport.cpp
+++ b/servers/rendering/renderer_viewport.cpp
@@ -79,11 +79,26 @@ void RendererViewport::_draw_3d(Viewport *p_viewport, XRInterface::Eyes p_eye) {
 		xr_interface = XRServer::get_singleton()->get_primary_interface();
 	}
 
+	if (p_viewport->use_occlusion_culling) {
+		if (p_viewport->occlusion_buffer_dirty) {
+			float aspect = p_viewport->size.aspect();
+			int max_size = occlusion_rays_per_thread * RendererThreadPool::singleton->thread_work_pool.get_thread_count();
+
+			int viewport_size = p_viewport->size.width * p_viewport->size.height;
+			max_size = CLAMP(max_size, viewport_size / (32 * 32), viewport_size / (2 * 2)); // At least one depth pixel for every 32x32 region. At most one depth pixel for every 2x2 region.
+
+			float height = Math::sqrt(max_size / aspect);
+			Size2i new_size = Size2i(height * aspect, height);
+			RendererSceneOcclusionCull::get_singleton()->buffer_set_size(p_viewport->self, new_size);
+			p_viewport->occlusion_buffer_dirty = false;
+		}
+	}
+
 	float screen_lod_threshold = p_viewport->lod_threshold / float(p_viewport->size.width);
 	if (p_viewport->use_xr && xr_interface.is_valid()) {
-		RSG::scene->render_camera(p_viewport->render_buffers, xr_interface, p_eye, p_viewport->camera, p_viewport->scenario, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas);
+		RSG::scene->render_camera(p_viewport->render_buffers, xr_interface, p_eye, p_viewport->camera, p_viewport->scenario, p_viewport->self, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas);
 	} else {
-		RSG::scene->render_camera(p_viewport->render_buffers, p_viewport->camera, p_viewport->scenario, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas);
+		RSG::scene->render_camera(p_viewport->render_buffers, p_viewport->camera, p_viewport->scenario, p_viewport->self, p_viewport->size, screen_lod_threshold, p_viewport->shadow_atlas);
 	}
 	RENDER_TIMESTAMP("<End Rendering 3D Scene");
 }
@@ -647,6 +662,8 @@ void RendererViewport::viewport_set_size(RID p_viewport, int p_width, int p_heig
 			RSG::scene->render_buffers_configure(viewport->render_buffers, viewport->render_target, viewport->size.width, viewport->size.height, viewport->msaa, viewport->screen_space_aa, viewport->use_debanding);
 		}
 	}
+
+	viewport->occlusion_buffer_dirty = true;
 }
 
 void RendererViewport::viewport_set_active(RID p_viewport, bool p_active) {
@@ -655,6 +672,7 @@ void RendererViewport::viewport_set_active(RID p_viewport, bool p_active) {
 
 	if (p_active) {
 		ERR_FAIL_COND(active_viewports.find(viewport) != -1); //already active
+		viewport->occlusion_buffer_dirty = true;
 		active_viewports.push_back(viewport);
 	} else {
 		active_viewports.erase(viewport);
@@ -739,6 +757,16 @@ RID RendererViewport::viewport_get_texture(RID p_viewport) const {
 	return RSG::storage->render_target_get_texture(viewport->render_target);
 }
 
+// Returns the occlusion buffer's debug texture for `p_viewport`, or an empty RID unless occluder debug drawing is active.
+RID RendererViewport::viewport_get_occluder_debug_texture(RID p_viewport) const {
+	const Viewport *vp = viewport_owner.getornull(p_viewport);
+	ERR_FAIL_COND_V(!vp, RID());
+
+	const bool debug_occluders = vp->use_occlusion_culling && vp->debug_draw == RenderingServer::VIEWPORT_DEBUG_DRAW_OCCLUDERS;
+	// The occlusion culler is only queried when culling is enabled and the OCCLUDERS debug draw mode is selected.
+	return debug_occluders ? RendererSceneOcclusionCull::get_singleton()->buffer_get_debug_texture(p_viewport) : RID();
+}
+
 void RendererViewport::viewport_set_hide_scenario(RID p_viewport, bool p_hide) {
 	Viewport *viewport = viewport_owner.getornull(p_viewport);
 	ERR_FAIL_COND(!viewport);
@@ -772,6 +800,9 @@ void RendererViewport::viewport_set_scenario(RID p_viewport, RID p_scenario) {
 	ERR_FAIL_COND(!viewport);
 
 	viewport->scenario = p_scenario;
+	if (viewport->use_occlusion_culling) {
+		RendererSceneOcclusionCull::get_singleton()->buffer_set_scenario(p_viewport, p_scenario);
+	}
 }
 
 void RendererViewport::viewport_attach_canvas(RID p_viewport, RID p_canvas) {
@@ -888,6 +919,41 @@ void RendererViewport::viewport_set_use_debanding(RID p_viewport, bool p_use_deb
 	}
 }
 
+// Toggles occlusion culling for a viewport, registering or unregistering its occlusion buffer accordingly.
+void RendererViewport::viewport_set_use_occlusion_culling(RID p_viewport, bool p_use_occlusion_culling) {
+	Viewport *vp = viewport_owner.getornull(p_viewport);
+	ERR_FAIL_COND(!vp);
+
+	// Nothing happens when the requested state is already in effect.
+	if (vp->use_occlusion_culling != p_use_occlusion_culling) {
+		vp->use_occlusion_culling = p_use_occlusion_culling;
+		RendererSceneOcclusionCull *occlusion_cull = RendererSceneOcclusionCull::get_singleton();
+		if (p_use_occlusion_culling) {
+			occlusion_cull->add_buffer(p_viewport);
+			occlusion_cull->buffer_set_scenario(p_viewport, vp->scenario);
+		} else {
+			occlusion_cull->remove_buffer(p_viewport);
+		}
+		vp->occlusion_buffer_dirty = true; // Makes _draw_3d() recompute the buffer size.
+	}
+}
+
+// Updates the shared rays-per-thread budget and flags every active viewport so its occlusion buffer gets resized.
+void RendererViewport::viewport_set_occlusion_rays_per_thread(int p_rays_per_thread) {
+	if (occlusion_rays_per_thread != p_rays_per_thread) {
+		occlusion_rays_per_thread = p_rays_per_thread;
+
+		// Inactive viewports are not touched here; viewport_set_active() marks them dirty on activation.
+		for (int idx = 0; idx < active_viewports.size(); idx++) {
+			active_viewports[idx]->occlusion_buffer_dirty = true;
+		}
+	}
+}
+
+void RendererViewport::viewport_set_occlusion_culling_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality) {
+	RendererSceneOcclusionCull::get_singleton()->set_build_quality(p_quality); // Not per-viewport: the quality is forwarded to the occlusion culling singleton.
+}
+
 void RendererViewport::viewport_set_lod_threshold(RID p_viewport, float p_pixels) {
 	Viewport *viewport = viewport_owner.getornull(p_viewport);
 	ERR_FAIL_COND(!viewport);
@@ -985,6 +1051,10 @@ bool RendererViewport::free(RID p_rid) {
 		viewport_set_scenario(p_rid, RID());
 		active_viewports.erase(viewport);
 
+		if (viewport->use_occlusion_culling) {
+			RendererSceneOcclusionCull::get_singleton()->remove_buffer(p_rid);
+		}
+
 		viewport_owner.free(p_rid);
 		memdelete(viewport);
 
@@ -1026,4 +1096,5 @@ void RendererViewport::call_set_use_vsync(bool p_enable) {
 }
 
 RendererViewport::RendererViewport() {
+	occlusion_rays_per_thread = GLOBAL_GET("rendering/occlusion_culling/occlusion_rays_per_thread"); // Replaces the in-class default with the value fetched through GLOBAL_GET.
 }
diff --git a/servers/rendering/renderer_viewport.h b/servers/rendering/renderer_viewport.h
index f5ed543e8d..5c372e8c9a 100644
--- a/servers/rendering/renderer_viewport.h
+++ b/servers/rendering/renderer_viewport.h
@@ -31,9 +31,9 @@
 #ifndef VISUALSERVERVIEWPORT_H
 #define VISUALSERVERVIEWPORT_H
 
+#include "core/templates/local_vector.h"
 #include "core/templates/rid_owner.h"
 #include "core/templates/self_list.h"
-#include "renderer_compositor.h"
 #include "servers/rendering_server.h"
 #include "servers/xr/xr_interface.h"
 
@@ -61,6 +61,9 @@ public:
 		RS::ViewportScreenSpaceAA screen_space_aa;
 		bool use_debanding;
 
+		bool use_occlusion_culling;
+		bool occlusion_buffer_dirty;
+
 		DisplayServer::WindowID viewport_to_screen;
 		Rect2 viewport_to_screen_rect;
 		bool viewport_render_direct_to_screen;
@@ -143,6 +146,8 @@ public:
 			msaa = RS::VIEWPORT_MSAA_DISABLED;
 			screen_space_aa = RS::VIEWPORT_SCREEN_SPACE_AA_DISABLED;
 			use_debanding = false;
+			use_occlusion_culling = false;
+			occlusion_buffer_dirty = true;
 
 			snap_2d_transforms_to_pixel = false;
 			snap_2d_vertices_to_pixel = false;
@@ -185,6 +190,10 @@ private:
 	void _draw_3d(Viewport *p_viewport, XRInterface::Eyes p_eye);
 	void _draw_viewport(Viewport *p_viewport, XRInterface::Eyes p_eye = XRInterface::EYE_MONO);
 
+	int occlusion_rays_per_thread = 512;
+
+	void _resize_occlusion_culling_buffer(const Size2i &p_size);
+
 public:
 	RID viewport_allocate();
 	void viewport_initialize(RID p_rid);
@@ -204,6 +213,7 @@ public:
 	void viewport_set_clear_mode(RID p_viewport, RS::ViewportClearMode p_clear_mode);
 
 	RID viewport_get_texture(RID p_viewport) const;
+	RID viewport_get_occluder_debug_texture(RID p_viewport) const;
 
 	void viewport_set_hide_scenario(RID p_viewport, bool p_hide);
 	void viewport_set_hide_canvas(RID p_viewport, bool p_hide);
@@ -225,7 +235,9 @@ public:
 	void viewport_set_msaa(RID p_viewport, RS::ViewportMSAA p_msaa);
 	void viewport_set_screen_space_aa(RID p_viewport, RS::ViewportScreenSpaceAA p_mode);
 	void viewport_set_use_debanding(RID p_viewport, bool p_use_debanding);
-
+	void viewport_set_use_occlusion_culling(RID p_viewport, bool p_use_occlusion_culling);
+	void viewport_set_occlusion_rays_per_thread(int p_rays_per_thread);
+	void viewport_set_occlusion_culling_build_quality(RS::ViewportOcclusionCullingBuildQuality p_quality);
 	void viewport_set_lod_threshold(RID p_viewport, float p_pixels);
 
 	virtual int viewport_get_render_info(RID p_viewport, RS::ViewportRenderInfo p_info);
diff --git a/servers/rendering/rendering_server_default.h b/servers/rendering/rendering_server_default.h
index e82d5cc3f8..c76ae1bb34 100644
--- a/servers/rendering/rendering_server_default.h
+++ b/servers/rendering/rendering_server_default.h
@@ -491,14 +491,20 @@ public:
 	FUNC2(particles_set_use_local_coordinates, RID, bool)
 	FUNC2(particles_set_process_material, RID, RID)
 	FUNC2(particles_set_fixed_fps, RID, int)
+	FUNC2(particles_set_interpolate, RID, bool)
 	FUNC2(particles_set_fractional_delta, RID, bool)
 	FUNC1R(bool, particles_is_inactive, RID)
+	FUNC3(particles_set_trails, RID, bool, float)
+	FUNC2(particles_set_trail_bind_poses, RID, const Vector<Transform> &)
+
 	FUNC1(particles_request_process, RID)
 	FUNC1(particles_restart, RID)
 	FUNC6(particles_emit, RID, const Transform &, const Vector3 &, const Color &, const Color &, uint32_t)
 	FUNC2(particles_set_subemitter, RID, RID)
 	FUNC2(particles_set_collision_base_size, RID, float)
 
+	FUNC2(particles_set_transform_align, RID, RS::ParticlesTransformAlign)
+
 	FUNC2(particles_set_draw_order, RID, RS::ParticlesDrawOrder)
 
 	FUNC2(particles_set_draw_passes, RID, int)
@@ -540,6 +546,10 @@ public:
 	FUNC2(camera_set_camera_effects, RID, RID)
 	FUNC2(camera_set_use_vertical_aspect, RID, bool)
 
+	/* OCCLUDER */
+	FUNCRIDSPLIT(occluder)
+	FUNC3(occluder_set_mesh, RID, const PackedVector3Array &, const PackedInt32Array &);
+
 #undef server_name
 #undef ServerName
 //from now on, calls forwarded to this singleton
@@ -590,6 +600,9 @@ public:
 	FUNC2(viewport_set_msaa, RID, ViewportMSAA)
 	FUNC2(viewport_set_screen_space_aa, RID, ViewportScreenSpaceAA)
 	FUNC2(viewport_set_use_debanding, RID, bool)
+	FUNC2(viewport_set_use_occlusion_culling, RID, bool)
+	FUNC1(viewport_set_occlusion_rays_per_thread, int)
+	FUNC1(viewport_set_occlusion_culling_build_quality, ViewportOcclusionCullingBuildQuality)
 	FUNC2(viewport_set_lod_threshold, RID, float)
 
 	FUNC2R(int, viewport_get_render_info, RID, ViewportRenderInfo)
@@ -703,7 +716,7 @@ public:
 	FUNC2(instance_set_transform, RID, const Transform &)
 	FUNC2(instance_attach_object_instance_id, RID, ObjectID)
 	FUNC3(instance_set_blend_shape_weight, RID, int, float)
-	FUNC3(instance_set_surface_material, RID, int, RID)
+	FUNC3(instance_set_surface_override_material, RID, int, RID)
 	FUNC2(instance_set_visible, RID, bool)
 
 	FUNC2(instance_set_custom_aabb, RID, AABB)
diff --git a/servers/rendering/shader_language.cpp b/servers/rendering/shader_language.cpp
index 4ae0eda232..0d6d3f5e13 100644
--- a/servers/rendering/shader_language.cpp
+++ b/servers/rendering/shader_language.cpp
@@ -3109,20 +3109,20 @@ bool ShaderLanguage::_validate_varying_assign(ShaderNode::Varying &p_varying, St
 	}
 	switch (p_varying.stage) {
 		case ShaderNode::Varying::STAGE_UNKNOWN: // first assign
-			if (current_function == String("vertex")) {
+			if (current_function == varying_function_names.vertex) {
 				p_varying.stage = ShaderNode::Varying::STAGE_VERTEX;
-			} else if (current_function == String("fragment")) {
+			} else if (current_function == varying_function_names.fragment) {
 				p_varying.stage = ShaderNode::Varying::STAGE_FRAGMENT;
 			}
 			break;
 		case ShaderNode::Varying::STAGE_VERTEX:
-			if (current_function == String("fragment")) {
+			if (current_function == varying_function_names.fragment) {
 				*r_message = RTR("Varyings which assigned in 'vertex' function may not be reassigned in 'fragment' or 'light'.");
 				return false;
 			}
 			break;
 		case ShaderNode::Varying::STAGE_FRAGMENT:
-			if (current_function == String("vertex")) {
+			if (current_function == varying_function_names.vertex) {
 				*r_message = RTR("Varyings which assigned in 'fragment' function may not be reassigned in 'vertex' or 'light'.");
 				return false;
 			}
@@ -3139,25 +3139,25 @@ bool ShaderLanguage::_validate_varying_using(ShaderNode::Varying &p_varying, Str
 			*r_message = RTR("Varying must be assigned before using!");
 			return false;
 		case ShaderNode::Varying::STAGE_VERTEX:
-			if (current_function == String("fragment")) {
+			if (current_function == varying_function_names.fragment) {
 				p_varying.stage = ShaderNode::Varying::STAGE_VERTEX_TO_FRAGMENT;
-			} else if (current_function == String("light")) {
+			} else if (current_function == varying_function_names.light) {
 				p_varying.stage = ShaderNode::Varying::STAGE_VERTEX_TO_LIGHT;
 			}
 			break;
 		case ShaderNode::Varying::STAGE_FRAGMENT:
-			if (current_function == String("light")) {
+			if (current_function == varying_function_names.light) {
 				p_varying.stage = ShaderNode::Varying::STAGE_FRAGMENT_TO_LIGHT;
 			}
 			break;
 		case ShaderNode::Varying::STAGE_VERTEX_TO_FRAGMENT:
-			if (current_function == String("light")) {
+			if (current_function == varying_function_names.light) {
 				*r_message = RTR("Varying must only be used in two different stages, which can be 'vertex' 'fragment' and 'light'");
 				return false;
 			}
 			break;
 		case ShaderNode::Varying::STAGE_VERTEX_TO_LIGHT:
-			if (current_function == String("fragment")) {
+			if (current_function == varying_function_names.fragment) {
 				*r_message = RTR("Varying must only be used in two different stages, which can be 'vertex' 'fragment' and 'light'");
 				return false;
 			}
@@ -3168,6 +3168,36 @@ bool ShaderLanguage::_validate_varying_using(ShaderNode::Varying &p_varying, Str
 	return true;
 }
 
+// Recursively checks whether `p_node` can be treated as a constant expression.
+// Constants qualify as-is, variables and arrays qualify when flagged `is_const`,
+// operators qualify when all of their checked arguments do, and every other
+// node type is rejected.
+bool ShaderLanguage::_check_node_constness(const Node *p_node) const {
+	switch (p_node->type) {
+		case Node::TYPE_OPERATOR: {
+			const OperatorNode *op_node = static_cast<const OperatorNode *>(p_node);
+			// Calls skip arguments[0]: call sites report argument `i - 1`, so the
+			// first slot is presumably the callee name rather than a real argument.
+			const int first_arg = (op_node->op == OP_CALL) ? 1 : 0;
+			for (int i = first_arg; i < op_node->arguments.size(); i++) {
+				if (!_check_node_constness(op_node->arguments[i])) {
+					return false;
+				}
+			}
+			return true;
+		}
+		case Node::TYPE_CONSTANT:
+			return true;
+		case Node::TYPE_VARIABLE:
+			return static_cast<const VariableNode *>(p_node)->is_const;
+		case Node::TYPE_ARRAY:
+			return static_cast<const ArrayNode *>(p_node)->is_const;
+		default:
+			// Any other node type is conservatively treated as non-constant.
+			return false;
+	}
+}
+
 bool ShaderLanguage::_validate_assign(Node *p_node, const FunctionInfo &p_function_info, String *r_message) {
 	if (p_node->type == Node::TYPE_OPERATOR) {
 		OperatorNode *op = static_cast<OperatorNode *>(p_node);
@@ -3956,8 +3986,6 @@ ShaderLanguage::Node *ShaderLanguage::_parse_expression(BlockNode *p_block, cons
 		ERR_FAIL_COND_V(!expr, nullptr);
 
 		/* OK now see what's NEXT to the operator.. */
-		/* OK now see what's NEXT to the operator.. */
-		/* OK now see what's NEXT to the operator.. */
 
 		while (true) {
 			TkPos pos2 = _get_tkpos();
@@ -4735,7 +4763,6 @@ ShaderLanguage::Node *ShaderLanguage::_parse_expression(BlockNode *p_block, cons
 		ERR_FAIL_COND_V(next_op == -1, nullptr);
 
 		// OK! create operator..
-		// OK! create operator..
 		if (is_unary) {
 			int expr_pos = next_op;
 			while (expression[expr_pos].is_op) {
@@ -5387,8 +5414,13 @@ Error ShaderLanguage::_parse_block(BlockNode *p_block, const FunctionInfo &p_fun
 						return ERR_PARSE_ERROR;
 					}
 					if (node->is_const && n->type == Node::TYPE_OPERATOR && ((OperatorNode *)n)->op == OP_CALL) {
-						_set_error("Expected constant expression after '='");
-						return ERR_PARSE_ERROR;
+						OperatorNode *op = ((OperatorNode *)n);
+						for (int i = 1; i < op->arguments.size(); i++) {
+							if (!_check_node_constness(op->arguments[i])) {
+								_set_error("Expected constant expression for argument '" + itos(i - 1) + "' of function call after '='");
+								return ERR_PARSE_ERROR;
+							}
+						}
 					}
 					decl.initializer = n;
 
@@ -5847,7 +5879,7 @@ Error ShaderLanguage::_parse_block(BlockNode *p_block, const FunctionInfo &p_fun
 			//check return type
 			BlockNode *b = p_block;
 
-			if (b && b->parent_function && (b->parent_function->name == "vertex" || b->parent_function->name == "fragment" || b->parent_function->name == "light")) {
+			if (b && b->parent_function && p_function_info.main_function) {
 				_set_error(vformat("Using 'return' in '%s' processor function results in undefined behavior!", b->parent_function->name));
 				return ERR_PARSE_ERROR;
 			}
@@ -6967,8 +6999,13 @@ Error ShaderLanguage::_parse_shader(const Map<StringName, FunctionInfo> &p_funct
 									return ERR_PARSE_ERROR;
 								}
 								if (expr->type == Node::TYPE_OPERATOR && ((OperatorNode *)expr)->op == OP_CALL) {
-									_set_error("Expected constant expression after '='");
-									return ERR_PARSE_ERROR;
+									OperatorNode *op = ((OperatorNode *)expr);
+									for (int i = 1; i < op->arguments.size(); i++) {
+										if (!_check_node_constness(op->arguments[i])) {
+											_set_error("Expected constant expression for argument '" + itos(i - 1) + "' of function call after '='");
+											return ERR_PARSE_ERROR;
+										}
+									}
 								}
 
 								constant.initializer = static_cast<ConstantNode *>(expr);
@@ -7246,26 +7283,12 @@ Error ShaderLanguage::_parse_shader(const Map<StringName, FunctionInfo> &p_funct
 }
 
 bool ShaderLanguage::has_builtin(const Map<StringName, ShaderLanguage::FunctionInfo> &p_functions, const StringName &p_name) {
-	if (p_functions.has("vertex")) {
-		if (p_functions["vertex"].built_ins.has(p_name)) {
-			return true;
-		}
-	}
-	if (p_functions.has("fragment")) {
-		if (p_functions["fragment"].built_ins.has(p_name)) {
-			return true;
-		}
-	}
-	if (p_functions.has("light")) {
-		if (p_functions["light"].built_ins.has(p_name)) {
-			return true;
-		}
-	}
-	if (p_functions.has("compute")) {
-		if (p_functions["compute"].built_ins.has(p_name)) {
+	for (Map<StringName, ShaderLanguage::FunctionInfo>::Element *E = p_functions.front(); E; E = E->next()) { // Scans every registered function rather than a fixed set of names.
+		if (E->get().built_ins.has(p_name)) {
 			return true;
 		}
 	}
+
 	return false;
 }
 
@@ -7399,11 +7422,12 @@ String ShaderLanguage::get_shader_type(const String &p_code) {
 	return String();
 }
 
-Error ShaderLanguage::compile(const String &p_code, const Map<StringName, FunctionInfo> &p_functions, const Vector<StringName> &p_render_modes, const Set<String> &p_shader_types, GlobalVariableGetTypeFunc p_global_variable_type_func) {
+Error ShaderLanguage::compile(const String &p_code, const Map<StringName, FunctionInfo> &p_functions, const Vector<StringName> &p_render_modes, const VaryingFunctionNames &p_varying_function_names, const Set<String> &p_shader_types, GlobalVariableGetTypeFunc p_global_variable_type_func) {
 	clear();
 
 	code = p_code;
 	global_var_get_type_func = p_global_variable_type_func;
+	varying_function_names = p_varying_function_names;
 
 	nodes = nullptr;
 
@@ -7416,10 +7440,11 @@ Error ShaderLanguage::compile(const String &p_code, const Map<StringName, Functi
 	return OK;
 }
 
-Error ShaderLanguage::complete(const String &p_code, const Map<StringName, FunctionInfo> &p_functions, const Vector<StringName> &p_render_modes, const Set<String> &p_shader_types, GlobalVariableGetTypeFunc p_global_variable_type_func, List<ScriptCodeCompletionOption> *r_options, String &r_call_hint) {
+Error ShaderLanguage::complete(const String &p_code, const Map<StringName, FunctionInfo> &p_functions, const Vector<StringName> &p_render_modes, const VaryingFunctionNames &p_varying_function_names, const Set<String> &p_shader_types, GlobalVariableGetTypeFunc p_global_variable_type_func, List<ScriptCodeCompletionOption> *r_options, String &r_call_hint) {
 	clear();
 
 	code = p_code;
+	varying_function_names = p_varying_function_names;
 
 	nodes = nullptr;
 	global_var_get_type_func = p_global_variable_type_func;
diff --git a/servers/rendering/shader_language.h b/servers/rendering/shader_language.h
index 14594b039c..470f3d38d5 100644
--- a/servers/rendering/shader_language.h
+++ b/servers/rendering/shader_language.h
@@ -331,6 +331,17 @@ public:
 		MAX_INSTANCE_UNIFORM_INDICES = 16
 	};
 
+	// Names of the shader entry points that _validate_varying_assign() and
+	// _validate_varying_using() compare `current_function` against.
+	// A default-constructed instance uses "fragment", "vertex" and "light";
+	// compile() and complete() receive the instance to use from their caller
+	// and copy it before parsing.
+	struct VaryingFunctionNames {
+		StringName fragment = "fragment";
+		StringName vertex = "vertex";
+		StringName light = "light";
+	};
+
 	struct Node {
 		Node *next = nullptr;
 
@@ -769,7 +780,8 @@ public:
 		Map<StringName, BuiltInInfo> built_ins;
 		Map<StringName, StageFunctionInfo> stage_functions;
 
-		bool can_discard;
+		bool can_discard = false;
+		bool main_function = false;
 	};
 	static bool has_builtin(const Map<StringName, ShaderLanguage::FunctionInfo> &p_functions, const StringName &p_name);
 
@@ -796,6 +808,8 @@ private:
 	StringName current_function;
 	bool last_const = false;
 
+	VaryingFunctionNames varying_function_names;
+
 	TkPos _get_tkpos() {
 		TkPos tkp;
 		tkp.char_idx = char_idx;
@@ -877,6 +891,7 @@ private:
 	bool _propagate_function_call_sampler_builtin_reference(StringName p_name, int p_argument, const StringName &p_builtin);
 	bool _validate_varying_assign(ShaderNode::Varying &p_varying, String *r_message);
 	bool _validate_varying_using(ShaderNode::Varying &p_varying, String *r_message);
+	bool _check_node_constness(const Node *p_node) const;
 
 	Node *_parse_expression(BlockNode *p_block, const FunctionInfo &p_function_info);
 	Node *_parse_array_constructor(BlockNode *p_block, const FunctionInfo &p_function_info, DataType p_type, const StringName &p_struct_name, int p_array_size);
@@ -898,8 +913,8 @@ public:
 	void clear();
 
 	static String get_shader_type(const String &p_code);
-	Error compile(const String &p_code, const Map<StringName, FunctionInfo> &p_functions, const Vector<StringName> &p_render_modes, const Set<String> &p_shader_types, GlobalVariableGetTypeFunc p_global_variable_type_func);
-	Error complete(const String &p_code, const Map<StringName, FunctionInfo> &p_functions, const Vector<StringName> &p_render_modes, const Set<String> &p_shader_types, GlobalVariableGetTypeFunc p_global_variable_type_func, List<ScriptCodeCompletionOption> *r_options, String &r_call_hint);
+	Error compile(const String &p_code, const Map<StringName, FunctionInfo> &p_functions, const Vector<StringName> &p_render_modes, const VaryingFunctionNames &p_varying_function_names, const Set<String> &p_shader_types, GlobalVariableGetTypeFunc p_global_variable_type_func);
+	Error complete(const String &p_code, const Map<StringName, FunctionInfo> &p_functions, const Vector<StringName> &p_render_modes, const VaryingFunctionNames &p_varying_function_names, const Set<String> &p_shader_types, GlobalVariableGetTypeFunc p_global_variable_type_func, List<ScriptCodeCompletionOption> *r_options, String &r_call_hint);
 
 	String get_error_text();
 	int get_error_line();
diff --git a/servers/rendering/shader_types.cpp b/servers/rendering/shader_types.cpp
index e99b8504bb..0bf68b9e0f 100644
--- a/servers/rendering/shader_types.cpp
+++ b/servers/rendering/shader_types.cpp
@@ -74,6 +74,7 @@ ShaderTypes::ShaderTypes() {
 	shader_modes[RS::SHADER_SPATIAL].functions["vertex"].built_ins["CUSTOM2"] = ShaderLanguage::TYPE_VEC4;
 	shader_modes[RS::SHADER_SPATIAL].functions["vertex"].built_ins["CUSTOM3"] = ShaderLanguage::TYPE_VEC4;
 	shader_modes[RS::SHADER_SPATIAL].functions["vertex"].can_discard = false;
+	shader_modes[RS::SHADER_SPATIAL].functions["vertex"].main_function = true;
 
 	//builtins
 	shader_modes[RS::SHADER_SPATIAL].functions["vertex"].built_ins["WORLD_MATRIX"] = ShaderLanguage::TYPE_MAT4;
@@ -139,6 +140,7 @@ ShaderTypes::ShaderTypes() {
 	shader_modes[RS::SHADER_SPATIAL].functions["fragment"].built_ins["RADIANCE"] = ShaderLanguage::TYPE_VEC4;
 	shader_modes[RS::SHADER_SPATIAL].functions["fragment"].built_ins["IRRADIANCE"] = ShaderLanguage::TYPE_VEC4;
 	shader_modes[RS::SHADER_SPATIAL].functions["fragment"].can_discard = true;
+	shader_modes[RS::SHADER_SPATIAL].functions["fragment"].main_function = true;
 
 	shader_modes[RS::SHADER_SPATIAL].functions["fragment"].built_ins["ALPHA_SCISSOR_THRESHOLD"] = ShaderLanguage::TYPE_FLOAT;
 	shader_modes[RS::SHADER_SPATIAL].functions["fragment"].built_ins["ALPHA_HASH_SCALE"] = ShaderLanguage::TYPE_FLOAT;
@@ -171,6 +173,7 @@ ShaderTypes::ShaderTypes() {
 	shader_modes[RS::SHADER_SPATIAL].functions["light"].built_ins["ALPHA"] = ShaderLanguage::TYPE_FLOAT;
 
 	shader_modes[RS::SHADER_SPATIAL].functions["light"].can_discard = true;
+	shader_modes[RS::SHADER_SPATIAL].functions["light"].main_function = true;
 
 	//order used puts first enum mode (default) first
 	shader_modes[RS::SHADER_SPATIAL].modes.push_back("blend_mix");
@@ -216,6 +219,7 @@ ShaderTypes::ShaderTypes() {
 	shader_modes[RS::SHADER_SPATIAL].modes.push_back("shadow_to_opacity");
 
 	shader_modes[RS::SHADER_SPATIAL].modes.push_back("vertex_lighting");
+	shader_modes[RS::SHADER_SPATIAL].modes.push_back("particle_trails");
 
 	shader_modes[RS::SHADER_SPATIAL].modes.push_back("alpha_to_coverage");
 	shader_modes[RS::SHADER_SPATIAL].modes.push_back("alpha_to_coverage_and_one");
@@ -236,6 +240,7 @@ ShaderTypes::ShaderTypes() {
 	shader_modes[RS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["AT_LIGHT_PASS"] = constt(ShaderLanguage::TYPE_BOOL);
 	shader_modes[RS::SHADER_CANVAS_ITEM].functions["vertex"].built_ins["TEXTURE_PIXEL_SIZE"] = constt(ShaderLanguage::TYPE_VEC2);
 	shader_modes[RS::SHADER_CANVAS_ITEM].functions["vertex"].can_discard = false;
+	shader_modes[RS::SHADER_CANVAS_ITEM].functions["vertex"].main_function = true;
 
 	shader_modes[RS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["VERTEX"] = ShaderLanguage::TYPE_VEC2;
 	shader_modes[RS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["SHADOW_VERTEX"] = ShaderLanguage::TYPE_VEC2;
@@ -257,6 +262,7 @@ ShaderTypes::ShaderTypes() {
 	shader_modes[RS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["AT_LIGHT_PASS"] = constt(ShaderLanguage::TYPE_BOOL);
 	shader_modes[RS::SHADER_CANVAS_ITEM].functions["fragment"].built_ins["SCREEN_TEXTURE"] = constt(ShaderLanguage::TYPE_SAMPLER2D);
 	shader_modes[RS::SHADER_CANVAS_ITEM].functions["fragment"].can_discard = true;
+	shader_modes[RS::SHADER_CANVAS_ITEM].functions["fragment"].main_function = true;
 
 	{
 		ShaderLanguage::StageFunctionInfo func;
@@ -294,6 +300,7 @@ ShaderTypes::ShaderTypes() {
 	shader_modes[RS::SHADER_CANVAS_ITEM].functions["light"].built_ins["TEXTURE_PIXEL_SIZE"] = constt(ShaderLanguage::TYPE_VEC2);
 	shader_modes[RS::SHADER_CANVAS_ITEM].functions["light"].built_ins["POINT_COORD"] = constt(ShaderLanguage::TYPE_VEC2);
 	shader_modes[RS::SHADER_CANVAS_ITEM].functions["light"].can_discard = true;
+	shader_modes[RS::SHADER_CANVAS_ITEM].functions["light"].main_function = true;
 
 	shader_modes[RS::SHADER_CANVAS_ITEM].modes.push_back("skip_vertex_transform");
 
@@ -310,34 +317,50 @@ ShaderTypes::ShaderTypes() {
 	/************ PARTICLES **************************/
 
 	shader_modes[RS::SHADER_PARTICLES].functions["global"].built_ins["TIME"] = constt(ShaderLanguage::TYPE_FLOAT);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["COLOR"] = ShaderLanguage::TYPE_VEC4;
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["VELOCITY"] = ShaderLanguage::TYPE_VEC3;
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["MASS"] = ShaderLanguage::TYPE_FLOAT;
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["ACTIVE"] = ShaderLanguage::TYPE_BOOL;
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["RESTART"] = constt(ShaderLanguage::TYPE_BOOL);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["CUSTOM"] = ShaderLanguage::TYPE_VEC4;
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["TRANSFORM"] = ShaderLanguage::TYPE_MAT4;
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["LIFETIME"] = constt(ShaderLanguage::TYPE_FLOAT);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["DELTA"] = constt(ShaderLanguage::TYPE_FLOAT);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["NUMBER"] = constt(ShaderLanguage::TYPE_UINT);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["INDEX"] = constt(ShaderLanguage::TYPE_INT);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["EMISSION_TRANSFORM"] = constt(ShaderLanguage::TYPE_MAT4);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["RANDOM_SEED"] = constt(ShaderLanguage::TYPE_UINT);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["FLAG_EMIT_POSITION"] = constt(ShaderLanguage::TYPE_UINT);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["FLAG_EMIT_ROT_SCALE"] = constt(ShaderLanguage::TYPE_UINT);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["FLAG_EMIT_VELOCITY"] = constt(ShaderLanguage::TYPE_UINT);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["FLAG_EMIT_COLOR"] = constt(ShaderLanguage::TYPE_UINT);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["FLAG_EMIT_CUSTOM"] = constt(ShaderLanguage::TYPE_UINT);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["RESTART_POSITION"] = constt(ShaderLanguage::TYPE_BOOL);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["RESTART_ROT_SCALE"] = constt(ShaderLanguage::TYPE_BOOL);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["RESTART_VELOCITY"] = constt(ShaderLanguage::TYPE_BOOL);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["RESTART_COLOR"] = constt(ShaderLanguage::TYPE_BOOL);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["RESTART_CUSTOM"] = constt(ShaderLanguage::TYPE_BOOL);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["COLLIDED"] = constt(ShaderLanguage::TYPE_BOOL);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["COLLISION_NORMAL"] = constt(ShaderLanguage::TYPE_VEC3);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["COLLISION_DEPTH"] = constt(ShaderLanguage::TYPE_FLOAT);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].built_ins["ATTRACTOR_FORCE"] = constt(ShaderLanguage::TYPE_VEC3);
-	shader_modes[RS::SHADER_PARTICLES].functions["compute"].can_discard = false;
+
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["COLOR"] = ShaderLanguage::TYPE_VEC4;
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["VELOCITY"] = ShaderLanguage::TYPE_VEC3;
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["MASS"] = ShaderLanguage::TYPE_FLOAT;
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["ACTIVE"] = ShaderLanguage::TYPE_BOOL;
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["RESTART"] = constt(ShaderLanguage::TYPE_BOOL);
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["CUSTOM"] = ShaderLanguage::TYPE_VEC4;
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["TRANSFORM"] = ShaderLanguage::TYPE_MAT4;
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["LIFETIME"] = constt(ShaderLanguage::TYPE_FLOAT);
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["DELTA"] = constt(ShaderLanguage::TYPE_FLOAT);
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["NUMBER"] = constt(ShaderLanguage::TYPE_UINT);
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["INDEX"] = constt(ShaderLanguage::TYPE_INT); // Must use the same type as INDEX in process().
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["EMISSION_TRANSFORM"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["RANDOM_SEED"] = constt(ShaderLanguage::TYPE_UINT);
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["RESTART_POSITION"] = constt(ShaderLanguage::TYPE_BOOL);
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["RESTART_ROT_SCALE"] = constt(ShaderLanguage::TYPE_BOOL);
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["RESTART_VELOCITY"] = constt(ShaderLanguage::TYPE_BOOL);
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["RESTART_COLOR"] = constt(ShaderLanguage::TYPE_BOOL);
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].built_ins["RESTART_CUSTOM"] = constt(ShaderLanguage::TYPE_BOOL);
+	shader_modes[RS::SHADER_PARTICLES].functions["start"].main_function = true;
+
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["COLOR"] = ShaderLanguage::TYPE_VEC4;
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["VELOCITY"] = ShaderLanguage::TYPE_VEC3;
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["MASS"] = ShaderLanguage::TYPE_FLOAT;
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["ACTIVE"] = ShaderLanguage::TYPE_BOOL;
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["RESTART"] = constt(ShaderLanguage::TYPE_BOOL);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["CUSTOM"] = ShaderLanguage::TYPE_VEC4;
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["TRANSFORM"] = ShaderLanguage::TYPE_MAT4;
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["LIFETIME"] = constt(ShaderLanguage::TYPE_FLOAT);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["DELTA"] = constt(ShaderLanguage::TYPE_FLOAT);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["NUMBER"] = constt(ShaderLanguage::TYPE_UINT);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["INDEX"] = constt(ShaderLanguage::TYPE_INT);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["EMISSION_TRANSFORM"] = constt(ShaderLanguage::TYPE_MAT4);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["RANDOM_SEED"] = constt(ShaderLanguage::TYPE_UINT);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["FLAG_EMIT_POSITION"] = constt(ShaderLanguage::TYPE_UINT);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["FLAG_EMIT_ROT_SCALE"] = constt(ShaderLanguage::TYPE_UINT);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["FLAG_EMIT_VELOCITY"] = constt(ShaderLanguage::TYPE_UINT);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["FLAG_EMIT_COLOR"] = constt(ShaderLanguage::TYPE_UINT);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["FLAG_EMIT_CUSTOM"] = constt(ShaderLanguage::TYPE_UINT);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["COLLIDED"] = constt(ShaderLanguage::TYPE_BOOL);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["COLLISION_NORMAL"] = constt(ShaderLanguage::TYPE_VEC3);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["COLLISION_DEPTH"] = constt(ShaderLanguage::TYPE_FLOAT);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].built_ins["ATTRACTOR_FORCE"] = constt(ShaderLanguage::TYPE_VEC3);
+	shader_modes[RS::SHADER_PARTICLES].functions["process"].main_function = true;
 
 	{
 		ShaderLanguage::StageFunctionInfo emit_vertex_func;
@@ -347,7 +370,7 @@ ShaderTypes::ShaderTypes() {
 		emit_vertex_func.arguments.push_back(ShaderLanguage::StageFunctionInfo::Argument("custom", ShaderLanguage::TYPE_VEC4));
 		emit_vertex_func.arguments.push_back(ShaderLanguage::StageFunctionInfo::Argument("flags", ShaderLanguage::TYPE_UINT));
 		emit_vertex_func.return_type = ShaderLanguage::TYPE_BOOL; //whether it could emit
-		shader_modes[RS::SHADER_PARTICLES].functions["compute"].stage_functions["emit_subparticle"] = emit_vertex_func;
+		shader_modes[RS::SHADER_PARTICLES].functions["process"].stage_functions["emit_subparticle"] = emit_vertex_func;
 	}
 
 	shader_modes[RS::SHADER_PARTICLES].modes.push_back("collision_use_scale");
@@ -384,14 +407,15 @@ ShaderTypes::ShaderTypes() {
 	shader_modes[RS::SHADER_SKY].functions["global"].built_ins["LIGHT3_COLOR"] = constt(ShaderLanguage::TYPE_VEC3);
 	shader_modes[RS::SHADER_SKY].functions["global"].built_ins["LIGHT3_SIZE"] = constt(ShaderLanguage::TYPE_FLOAT);
 
-	shader_modes[RS::SHADER_SKY].functions["fragment"].built_ins["COLOR"] = ShaderLanguage::TYPE_VEC3;
-	shader_modes[RS::SHADER_SKY].functions["fragment"].built_ins["ALPHA"] = ShaderLanguage::TYPE_FLOAT;
-	shader_modes[RS::SHADER_SKY].functions["fragment"].built_ins["EYEDIR"] = constt(ShaderLanguage::TYPE_VEC3);
-	shader_modes[RS::SHADER_SKY].functions["fragment"].built_ins["SCREEN_UV"] = constt(ShaderLanguage::TYPE_VEC2);
-	shader_modes[RS::SHADER_SKY].functions["fragment"].built_ins["SKY_COORDS"] = constt(ShaderLanguage::TYPE_VEC2);
-	shader_modes[RS::SHADER_SKY].functions["fragment"].built_ins["HALF_RES_COLOR"] = constt(ShaderLanguage::TYPE_VEC4);
-	shader_modes[RS::SHADER_SKY].functions["fragment"].built_ins["QUARTER_RES_COLOR"] = constt(ShaderLanguage::TYPE_VEC4);
-	shader_modes[RS::SHADER_SKY].functions["fragment"].built_ins["FOG"] = ShaderLanguage::TYPE_VEC4;
+	shader_modes[RS::SHADER_SKY].functions["sky"].built_ins["COLOR"] = ShaderLanguage::TYPE_VEC3;
+	shader_modes[RS::SHADER_SKY].functions["sky"].built_ins["ALPHA"] = ShaderLanguage::TYPE_FLOAT;
+	shader_modes[RS::SHADER_SKY].functions["sky"].built_ins["EYEDIR"] = constt(ShaderLanguage::TYPE_VEC3);
+	shader_modes[RS::SHADER_SKY].functions["sky"].built_ins["SCREEN_UV"] = constt(ShaderLanguage::TYPE_VEC2);
+	shader_modes[RS::SHADER_SKY].functions["sky"].built_ins["SKY_COORDS"] = constt(ShaderLanguage::TYPE_VEC2);
+	shader_modes[RS::SHADER_SKY].functions["sky"].built_ins["HALF_RES_COLOR"] = constt(ShaderLanguage::TYPE_VEC4);
+	shader_modes[RS::SHADER_SKY].functions["sky"].built_ins["QUARTER_RES_COLOR"] = constt(ShaderLanguage::TYPE_VEC4);
+	shader_modes[RS::SHADER_SKY].functions["sky"].built_ins["FOG"] = ShaderLanguage::TYPE_VEC4;
+	shader_modes[RS::SHADER_SKY].functions["sky"].main_function = true;
 
 	shader_modes[RS::SHADER_SKY].modes.push_back("use_half_res_pass");
 	shader_modes[RS::SHADER_SKY].modes.push_back("use_quarter_res_pass");
diff --git a/servers/rendering_server.cpp b/servers/rendering_server.cpp
index 809343114c..a9154603ee 100644
--- a/servers/rendering_server.cpp
+++ b/servers/rendering_server.cpp
@@ -349,7 +349,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 						for (int i = 0; i < p_vertex_array_len; i++) {
 							float vector[2] = { src[i].x, src[i].y };
 
-							copymem(&vw[p_offsets[ai] + i * p_vertex_stride], vector, sizeof(float) * 2);
+							memcpy(&vw[p_offsets[ai] + i * p_vertex_stride], vector, sizeof(float) * 2);
 
 							if (i == 0) {
 								aabb = Rect2(src[i], SMALL_VEC2); //must have a bit of size
@@ -374,7 +374,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 						for (int i = 0; i < p_vertex_array_len; i++) {
 							float vector[3] = { src[i].x, src[i].y, src[i].z };
 
-							copymem(&vw[p_offsets[ai] + i * p_vertex_stride], vector, sizeof(float) * 3);
+							memcpy(&vw[p_offsets[ai] + i * p_vertex_stride], vector, sizeof(float) * 3);
 
 							if (i == 0) {
 								aabb = AABB(src[i], SMALL_VEC3);
@@ -403,7 +403,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 					value |= CLAMP(int(n.y * 1023.0), 0, 1023) << 10;
 					value |= CLAMP(int(n.z * 1023.0), 0, 1023) << 20;
 
-					copymem(&vw[p_offsets[ai] + i * p_vertex_stride], &value, 4);
+					memcpy(&vw[p_offsets[ai] + i * p_vertex_stride], &value, 4);
 				}
 
 			} break;
@@ -422,9 +422,9 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 					value |= CLAMP(int((src[i * 4 + 0] * 0.5 + 0.5) * 1023.0), 0, 1023);
 					value |= CLAMP(int((src[i * 4 + 1] * 0.5 + 0.5) * 1023.0), 0, 1023) << 10;
 					value |= CLAMP(int((src[i * 4 + 2] * 0.5 + 0.5) * 1023.0), 0, 1023) << 20;
-					value |= CLAMP(int((src[i * 4 + 3] * 0.5 + 0.5) * 3.0), 0, 3) << 30;
+					value |= CLAMP(int((src[i * 4 + 3] * 0.5 + 0.5) * 3.0), 0, 3) << 30; // Only bits 30-31 of the 4-byte value remain for w, so it is quantized to 0..3.
 
-					copymem(&vw[p_offsets[ai] + i * p_vertex_stride], &value, 4);
+					memcpy(&vw[p_offsets[ai] + i * p_vertex_stride], &value, 4);
 				}
 
 			} break;
@@ -442,7 +442,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 					color16[1] = Math::make_half_float(src[i].g);
 					color16[2] = Math::make_half_float(src[i].b);
 					color16[3] = Math::make_half_float(src[i].a);
-					copymem(&aw[p_offsets[ai] + i * p_attrib_stride], color16, 8);
+					memcpy(&aw[p_offsets[ai] + i * p_attrib_stride], color16, 8);
 				}
 			} break;
 			case RS::ARRAY_TEX_UV: {
@@ -457,7 +457,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 				for (int i = 0; i < p_vertex_array_len; i++) {
 					float uv[2] = { src[i].x, src[i].y };
 
-					copymem(&aw[p_offsets[ai] + i * p_attrib_stride], uv, 2 * 4);
+					memcpy(&aw[p_offsets[ai] + i * p_attrib_stride], uv, 2 * 4);
 				}
 
 			} break;
@@ -472,8 +472,8 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 				const Vector2 *src = array.ptr();
 
 				for (int i = 0; i < p_vertex_array_len; i++) {
-					uint16_t uv[2] = { Math::make_half_float(src[i].x), Math::make_half_float(src[i].y) };
-					copymem(&aw[p_offsets[ai] + i * p_attrib_stride], uv, 2 * 2);
+					float uv[2] = { src[i].x, src[i].y };
+					memcpy(&aw[p_offsets[ai] + i * p_attrib_stride], uv, 2 * 4);
 				}
 			} break;
 			case RS::ARRAY_CUSTOM0:
@@ -495,7 +495,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 						const uint8_t *src = array.ptr();
 
 						for (int i = 0; i < p_vertex_array_len; i++) {
-							copymem(&aw[p_offsets[ai] + i * p_attrib_stride], &src[i * 4], 4);
+							memcpy(&aw[p_offsets[ai] + i * p_attrib_stride], &src[i * 4], 4);
 						}
 
 					} break;
@@ -510,7 +510,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 						const uint8_t *src = array.ptr();
 
 						for (int i = 0; i < p_vertex_array_len; i++) {
-							copymem(&aw[p_offsets[ai] + i * p_attrib_stride], &src[i * 8], 8);
+							memcpy(&aw[p_offsets[ai] + i * p_attrib_stride], &src[i * 8], 8);
 						}
 					} break;
 					case ARRAY_CUSTOM_R_FLOAT:
@@ -528,7 +528,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 						const float *src = array.ptr();
 
 						for (int i = 0; i < p_vertex_array_len; i++) {
-							copymem(&aw[p_offsets[ai] + i * p_attrib_stride], &src[i * s], 4 * s);
+							memcpy(&aw[p_offsets[ai] + i * p_attrib_stride], &src[i * s], 4 * s);
 						}
 					} break;
 					default: {
@@ -554,7 +554,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 							data[j] = CLAMP(src[i * bone_count + j] * 65535, 0, 65535);
 						}
 
-						copymem(&sw[p_offsets[ai] + i * p_skin_stride], data, 2 * bone_count);
+						memcpy(&sw[p_offsets[ai] + i * p_skin_stride], data, 2 * bone_count);
 					}
 				}
 
@@ -578,7 +578,7 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 						max_bone = MAX(data[j], max_bone);
 					}
 
-					copymem(&sw[p_offsets[ai] + i * p_skin_stride], data, 2 * bone_count);
+					memcpy(&sw[p_offsets[ai] + i * p_skin_stride], data, 2 * bone_count);
 				}
 
 			} break;
@@ -600,11 +600,11 @@ Error RenderingServer::_surface_set_data(Array p_arrays, uint32_t p_format, uint
 					if (p_vertex_array_len < (1 << 16)) {
 						uint16_t v = src[i];
 
-						copymem(&iw[i * 2], &v, 2);
+						memcpy(&iw[i * 2], &v, 2);
 					} else {
 						uint32_t v = src[i];
 
-						copymem(&iw[i * 4], &v, 4);
+						memcpy(&iw[i * 4], &v, 4);
 					}
 				}
 			} break;
@@ -1172,7 +1172,7 @@ Array RenderingServer::_get_array_from_surface(uint32_t p_format, Vector<uint8_t
 
 						for (int j = 0; j < p_vertex_len; j++) {
 							const uint8_t *v = (const uint8_t *)&ar[j * attrib_elem_size + offsets[i]];
-							copymem(&w[j * s], v, s);
+							memcpy(&w[j * s], v, s);
 						}
 
 						ret[i] = arr;
@@ -1189,7 +1189,7 @@ Array RenderingServer::_get_array_from_surface(uint32_t p_format, Vector<uint8_t
 
 						for (int j = 0; j < p_vertex_len; j++) {
 							const float *v = (const float *)&ar[j * attrib_elem_size + offsets[i]];
-							copymem(&w[j * s], v, s * sizeof(float));
+							memcpy(&w[j * s], v, s * sizeof(float));
 						}
 						ret[i] = arr;
 
@@ -1594,7 +1594,7 @@ void RenderingServer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("gi_probe_set_compress", "probe", "enable"), &RenderingServer::gi_probe_set_compress);
 	ClassDB::bind_method(D_METHOD("gi_probe_is_compressed", "probe"), &RenderingServer::gi_probe_is_compressed);
 #endif
-/*
+	/*
 	ClassDB::bind_method(D_METHOD("lightmap_create()"), &RenderingServer::lightmap_capture_create);
 	ClassDB::bind_method(D_METHOD("lightmap_capture_set_bounds", "capture", "bounds"), &RenderingServer::lightmap_capture_set_bounds);
 	ClassDB::bind_method(D_METHOD("lightmap_capture_get_bounds", "capture"), &RenderingServer::lightmap_capture_get_bounds);
@@ -1607,6 +1607,10 @@ void RenderingServer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("lightmap_capture_set_energy", "capture", "energy"), &RenderingServer::lightmap_capture_set_energy);
 	ClassDB::bind_method(D_METHOD("lightmap_capture_get_energy", "capture"), &RenderingServer::lightmap_capture_get_energy);
 */
+
+	ClassDB::bind_method(D_METHOD("occluder_create"), &RenderingServer::occluder_create);
+	ClassDB::bind_method(D_METHOD("occluder_set_mesh", "occluder", "vertices", "indices"), &RenderingServer::occluder_set_mesh); // Argument names mirror the occluder_set_mesh() declaration.
+
 #endif
 	ClassDB::bind_method(D_METHOD("particles_create"), &RenderingServer::particles_create);
 	ClassDB::bind_method(D_METHOD("particles_set_emitting", "particles", "emitting"), &RenderingServer::particles_set_emitting);
@@ -1667,6 +1671,9 @@ void RenderingServer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("viewport_set_shadow_atlas_quadrant_subdivision", "viewport", "quadrant", "subdivision"), &RenderingServer::viewport_set_shadow_atlas_quadrant_subdivision);
 	ClassDB::bind_method(D_METHOD("viewport_set_msaa", "viewport", "msaa"), &RenderingServer::viewport_set_msaa);
 	ClassDB::bind_method(D_METHOD("viewport_set_use_debanding", "viewport", "enable"), &RenderingServer::viewport_set_use_debanding);
+	ClassDB::bind_method(D_METHOD("viewport_set_use_occlusion_culling", "viewport", "enable"), &RenderingServer::viewport_set_use_occlusion_culling);
+	ClassDB::bind_method(D_METHOD("viewport_set_occlusion_rays_per_thread", "rays_per_thread"), &RenderingServer::viewport_set_occlusion_rays_per_thread);
+	ClassDB::bind_method(D_METHOD("viewport_set_occlusion_culling_build_quality", "quality"), &RenderingServer::viewport_set_occlusion_culling_build_quality);
 
 	ClassDB::bind_method(D_METHOD("viewport_get_render_info", "viewport", "info"), &RenderingServer::viewport_get_render_info);
 	ClassDB::bind_method(D_METHOD("viewport_set_debug_draw", "viewport", "draw"), &RenderingServer::viewport_set_debug_draw);
@@ -1694,6 +1701,7 @@ void RenderingServer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("scenario_create"), &RenderingServer::scenario_create);
 	ClassDB::bind_method(D_METHOD("scenario_set_debug", "scenario", "debug_mode"), &RenderingServer::scenario_set_debug);
 	ClassDB::bind_method(D_METHOD("scenario_set_environment", "scenario", "environment"), &RenderingServer::scenario_set_environment);
+	ClassDB::bind_method(D_METHOD("scenario_set_camera_effects", "scenario", "effects"), &RenderingServer::scenario_set_camera_effects);
 	ClassDB::bind_method(D_METHOD("scenario_set_fallback_environment", "scenario", "environment"), &RenderingServer::scenario_set_fallback_environment);
 
 #ifndef _3D_DISABLED
@@ -1706,7 +1714,7 @@ void RenderingServer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("instance_set_transform", "instance", "transform"), &RenderingServer::instance_set_transform);
 	ClassDB::bind_method(D_METHOD("instance_attach_object_instance_id", "instance", "id"), &RenderingServer::instance_attach_object_instance_id);
 	ClassDB::bind_method(D_METHOD("instance_set_blend_shape_weight", "instance", "shape", "weight"), &RenderingServer::instance_set_blend_shape_weight);
-	ClassDB::bind_method(D_METHOD("instance_set_surface_material", "instance", "surface", "material"), &RenderingServer::instance_set_surface_material);
+	ClassDB::bind_method(D_METHOD("instance_set_surface_override_material", "instance", "surface", "material"), &RenderingServer::instance_set_surface_override_material);
 	ClassDB::bind_method(D_METHOD("instance_set_visible", "instance", "visible"), &RenderingServer::instance_set_visible);
 	//	ClassDB::bind_method(D_METHOD("instance_set_use_lightmap", "instance", "lightmap_instance", "lightmap"), &RenderingServer::instance_set_use_lightmap);
 	ClassDB::bind_method(D_METHOD("instance_set_custom_aabb", "instance", "aabb"), &RenderingServer::instance_set_custom_aabb);
@@ -2024,6 +2032,7 @@ void RenderingServer::_bind_methods() {
 	BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_SDFGI);
 	BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_SDFGI_PROBES);
 	BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_GI_BUFFER);
+	BIND_ENUM_CONSTANT(VIEWPORT_DEBUG_DRAW_OCCLUDERS);
 
 	BIND_ENUM_CONSTANT(SKY_MODE_QUALITY);
 	BIND_ENUM_CONSTANT(SKY_MODE_REALTIME);
@@ -2093,6 +2102,10 @@ void RenderingServer::_bind_methods() {
 	BIND_ENUM_CONSTANT(SCENARIO_DEBUG_OVERDRAW);
 	BIND_ENUM_CONSTANT(SCENARIO_DEBUG_SHADELESS);
 
+	BIND_ENUM_CONSTANT(VIEWPORT_OCCLUSION_BUILD_QUALITY_LOW);
+	BIND_ENUM_CONSTANT(VIEWPORT_OCCLUSION_BUILD_QUALITY_MEDIUM);
+	BIND_ENUM_CONSTANT(VIEWPORT_OCCLUSION_BUILD_QUALITY_HIGH);
+
 	BIND_ENUM_CONSTANT(INSTANCE_NONE);
 	BIND_ENUM_CONSTANT(INSTANCE_MESH);
 	BIND_ENUM_CONSTANT(INSTANCE_MULTIMESH);
@@ -2104,12 +2117,14 @@ void RenderingServer::_bind_methods() {
 	BIND_ENUM_CONSTANT(INSTANCE_DECAL);
 	BIND_ENUM_CONSTANT(INSTANCE_GI_PROBE);
 	BIND_ENUM_CONSTANT(INSTANCE_LIGHTMAP);
+	BIND_ENUM_CONSTANT(INSTANCE_OCCLUDER);
 	BIND_ENUM_CONSTANT(INSTANCE_MAX);
 	BIND_ENUM_CONSTANT(INSTANCE_GEOMETRY_MASK);
 
 	BIND_ENUM_CONSTANT(INSTANCE_FLAG_USE_BAKED_LIGHT);
 	BIND_ENUM_CONSTANT(INSTANCE_FLAG_USE_DYNAMIC_GI);
 	BIND_ENUM_CONSTANT(INSTANCE_FLAG_DRAW_NEXT_FRAME_IF_VISIBLE);
+	BIND_ENUM_CONSTANT(INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING);
 	BIND_ENUM_CONSTANT(INSTANCE_FLAG_MAX);
 
 	BIND_ENUM_CONSTANT(SHADOW_CASTING_SETTING_OFF);
@@ -2282,8 +2297,12 @@ RenderingServer::RenderingServer() {
 
 	GLOBAL_DEF("rendering/2d/shadow_atlas/size", 2048);
 
-	GLOBAL_DEF("rendering/driver/rd_renderer/use_low_end_renderer", false);
-	GLOBAL_DEF("rendering/driver/rd_renderer/use_low_end_renderer.mobile", true);
+	GLOBAL_DEF_RST("rendering/vulkan/rendering/back_end", 0);
+	GLOBAL_DEF_RST("rendering/vulkan/rendering/back_end.mobile", 1);
+	ProjectSettings::get_singleton()->set_custom_property_info("rendering/vulkan/rendering/back_end",
+			PropertyInfo(Variant::INT,
+					"rendering/vulkan/rendering/back_end",
+					PROPERTY_HINT_ENUM, "ForwardClustered,ForwardMobile"));
 
 	GLOBAL_DEF("rendering/reflections/sky_reflections/roughness_layers", 8);
 	GLOBAL_DEF("rendering/reflections/sky_reflections/texture_array_reflections", true);
@@ -2340,6 +2359,10 @@ RenderingServer::RenderingServer() {
 	ProjectSettings::get_singleton()->set_custom_property_info("rendering/anti_aliasing/screen_space_roughness_limiter/amount", PropertyInfo(Variant::FLOAT, "rendering/anti_aliasing/screen_space_roughness_limiter/amount", PROPERTY_HINT_RANGE, "0.01,4.0,0.01"));
 	ProjectSettings::get_singleton()->set_custom_property_info("rendering/anti_aliasing/screen_space_roughness_limiter/limit", PropertyInfo(Variant::FLOAT, "rendering/anti_aliasing/screen_space_roughness_limiter/limit", PROPERTY_HINT_RANGE, "0.01,1.0,0.01"));
 
+	GLOBAL_DEF_RST("rendering/occlusion_culling/occlusion_rays_per_thread", 512);
+	GLOBAL_DEF_RST("rendering/occlusion_culling/bvh_build_quality", 2);
+	ProjectSettings::get_singleton()->set_custom_property_info("rendering/occlusion_culling/bvh_build_quality", PropertyInfo(Variant::INT, "rendering/occlusion_culling/bvh_build_quality", PROPERTY_HINT_ENUM, "Low,Medium,High"));
+
 	GLOBAL_DEF("rendering/environment/glow/upscale_mode", 1);
 	ProjectSettings::get_singleton()->set_custom_property_info("rendering/environment/glow/upscale_mode", PropertyInfo(Variant::INT, "rendering/environment/glow/upscale_mode", PROPERTY_HINT_ENUM, "Linear (Fast),Bicubic (Slow)"));
 	GLOBAL_DEF("rendering/environment/glow/upscale_mode.mobile", 0);
diff --git a/servers/rendering_server.h b/servers/rendering_server.h
index 6a8bb83ec1..ad965e9690 100644
--- a/servers/rendering_server.h
+++ b/servers/rendering_server.h
@@ -631,8 +631,22 @@ public:
 	virtual void particles_set_use_local_coordinates(RID p_particles, bool p_enable) = 0;
 	virtual void particles_set_process_material(RID p_particles, RID p_material) = 0;
 	virtual void particles_set_fixed_fps(RID p_particles, int p_fps) = 0;
+	virtual void particles_set_interpolate(RID p_particles, bool p_enable) = 0;
 	virtual void particles_set_fractional_delta(RID p_particles, bool p_enable) = 0;
 	virtual void particles_set_collision_base_size(RID p_particles, float p_size) = 0;
+
+	enum ParticlesTransformAlign {
+		PARTICLES_TRANSFORM_ALIGN_DISABLED,
+		PARTICLES_TRANSFORM_ALIGN_Z_BILLBOARD,
+		PARTICLES_TRANSFORM_ALIGN_Y_TO_VELOCITY,
+		PARTICLES_TRANSFORM_ALIGN_Z_BILLBOARD_Y_TO_VELOCITY,
+	};
+
+	virtual void particles_set_transform_align(RID p_particles, ParticlesTransformAlign p_transform_align) = 0;
+
+	virtual void particles_set_trails(RID p_particles, bool p_enable, float p_length_sec) = 0;
+	virtual void particles_set_trail_bind_poses(RID p_particles, const Vector<Transform> &p_bind_poses) = 0;
+
 	virtual bool particles_is_inactive(RID p_particles) = 0;
 	virtual void particles_request_process(RID p_particles) = 0;
 	virtual void particles_restart(RID p_particles) = 0;
@@ -713,6 +727,11 @@ public:
 	virtual void camera_set_camera_effects(RID p_camera, RID p_camera_effects) = 0;
 	virtual void camera_set_use_vertical_aspect(RID p_camera, bool p_enable) = 0;
 
+	/* OCCLUDER API */
+
+	virtual RID occluder_create() = 0;
+	virtual void occluder_set_mesh(RID p_occluder, const PackedVector3Array &p_vertices, const PackedInt32Array &p_indices) = 0;
+
 	/* VIEWPORT TARGET API */
 
 	enum CanvasItemTextureFilter {
@@ -826,6 +845,17 @@ public:
 
 	virtual void viewport_set_lod_threshold(RID p_viewport, float p_pixels) = 0;
 
+	virtual void viewport_set_use_occlusion_culling(RID p_viewport, bool p_use_occlusion_culling) = 0;
+	virtual void viewport_set_occlusion_rays_per_thread(int p_rays_per_thread) = 0;
+
+	enum ViewportOcclusionCullingBuildQuality {
+		VIEWPORT_OCCLUSION_BUILD_QUALITY_LOW = 0,
+		VIEWPORT_OCCLUSION_BUILD_QUALITY_MEDIUM = 1,
+		VIEWPORT_OCCLUSION_BUILD_QUALITY_HIGH = 2,
+	};
+
+	virtual void viewport_set_occlusion_culling_build_quality(ViewportOcclusionCullingBuildQuality p_quality) = 0;
+
 	enum ViewportRenderInfo {
 		VIEWPORT_RENDER_INFO_OBJECTS_IN_FRAME,
 		VIEWPORT_RENDER_INFO_VERTICES_IN_FRAME,
@@ -862,6 +892,7 @@ public:
 		VIEWPORT_DEBUG_DRAW_CLUSTER_SPOT_LIGHTS,
 		VIEWPORT_DEBUG_DRAW_CLUSTER_DECALS,
 		VIEWPORT_DEBUG_DRAW_CLUSTER_REFLECTION_PROBES,
+		VIEWPORT_DEBUG_DRAW_OCCLUDERS,
 	};
 
 	virtual void viewport_set_debug_draw(RID p_viewport, ViewportDebugDraw p_draw) = 0;
@@ -1109,6 +1140,7 @@ public:
 		INSTANCE_DECAL,
 		INSTANCE_GI_PROBE,
 		INSTANCE_LIGHTMAP,
+		INSTANCE_OCCLUDER,
 		INSTANCE_MAX,
 
 		INSTANCE_GEOMETRY_MASK = (1 << INSTANCE_MESH) | (1 << INSTANCE_MULTIMESH) | (1 << INSTANCE_IMMEDIATE) | (1 << INSTANCE_PARTICLES)
@@ -1124,7 +1156,7 @@ public:
 	virtual void instance_set_transform(RID p_instance, const Transform &p_transform) = 0;
 	virtual void instance_attach_object_instance_id(RID p_instance, ObjectID p_id) = 0;
 	virtual void instance_set_blend_shape_weight(RID p_instance, int p_shape, float p_weight) = 0;
-	virtual void instance_set_surface_material(RID p_instance, int p_surface, RID p_material) = 0;
+	virtual void instance_set_surface_override_material(RID p_instance, int p_surface, RID p_material) = 0;
 	virtual void instance_set_visible(RID p_instance, bool p_visible) = 0;
 
 	virtual void instance_set_custom_aabb(RID p_instance, AABB aabb) = 0;
@@ -1147,6 +1179,7 @@ public:
 		INSTANCE_FLAG_USE_BAKED_LIGHT,
 		INSTANCE_FLAG_USE_DYNAMIC_GI,
 		INSTANCE_FLAG_DRAW_NEXT_FRAME_IF_VISIBLE,
+		INSTANCE_FLAG_IGNORE_OCCLUSION_CULLING,
 		INSTANCE_FLAG_MAX
 	};
 
@@ -1505,6 +1538,7 @@ VARIANT_ENUM_CAST(RenderingServer::ViewportMSAA);
 VARIANT_ENUM_CAST(RenderingServer::ViewportScreenSpaceAA);
 VARIANT_ENUM_CAST(RenderingServer::ViewportRenderInfo);
 VARIANT_ENUM_CAST(RenderingServer::ViewportDebugDraw);
+VARIANT_ENUM_CAST(RenderingServer::ViewportOcclusionCullingBuildQuality);
 VARIANT_ENUM_CAST(RenderingServer::SkyMode);
 VARIANT_ENUM_CAST(RenderingServer::EnvironmentBG);
 VARIANT_ENUM_CAST(RenderingServer::EnvironmentAmbientSource);
diff --git a/tests/test_command_queue.h b/tests/test_command_queue.h
index b4fa63ad2b..2f0f62f5c8 100644
--- a/tests/test_command_queue.h
+++ b/tests/test_command_queue.h
@@ -31,14 +31,14 @@
 #ifndef TEST_COMMAND_QUEUE_H
 #define TEST_COMMAND_QUEUE_H
 
-#include "test_command_queue.h"
-
 #include "core/config/project_settings.h"
+#include "core/math/random_number_generator.h"
 #include "core/os/mutex.h"
 #include "core/os/os.h"
 #include "core/os/semaphore.h"
 #include "core/os/thread.h"
 #include "core/templates/command_queue_mt.h"
+#include "test_macros.h"
 
 #if !defined(NO_THREADS)
 
diff --git a/tests/test_physics_2d.cpp b/tests/test_physics_2d.cpp
index 047697e314..25b2871890 100644
--- a/tests/test_physics_2d.cpp
+++ b/tests/test_physics_2d.cpp
@@ -243,9 +243,7 @@ protected:
 		Size2 imgsize(5, 5); //vs->texture_get_width(body_shape_data[p_shape].image), vs->texture_get_height(body_shape_data[p_shape].image));
 		vs->canvas_item_add_texture_rect(sprite, Rect2(-imgsize / 2.0, imgsize), body_shape_data[p_shape].image);
 
-		ps->body_set_force_integration_callback(body, this, "_body_moved", sprite);
-		//RID q = ps->query_create(this,"_body_moved",sprite);
-		//ps->query_body_state(q,body);
+		ps->body_set_force_integration_callback(body, callable_mp(this, &TestPhysics2DMainLoop::_body_moved), sprite);
 
 		return body;
 	}
@@ -310,7 +308,6 @@ protected:
 	}
 
 	static void _bind_methods() {
-		ClassDB::bind_method(D_METHOD("_body_moved"), &TestPhysics2DMainLoop::_body_moved);
 		ClassDB::bind_method(D_METHOD("_ray_query_callback"), &TestPhysics2DMainLoop::_ray_query_callback);
 	}
 
diff --git a/tests/test_physics_3d.cpp b/tests/test_physics_3d.cpp
index bb324d8ffe..ac8078a0a8 100644
--- a/tests/test_physics_3d.cpp
+++ b/tests/test_physics_3d.cpp
@@ -77,10 +77,6 @@ class TestPhysics3DMainLoop : public MainLoop {
 	bool quit;
 
 protected:
-	static void _bind_methods() {
-		ClassDB::bind_method("body_changed_transform", &TestPhysics3DMainLoop::body_changed_transform);
-	}
-
 	RID create_body(PhysicsServer3D::ShapeType p_shape, PhysicsServer3D::BodyMode p_body, const Transform p_location, bool p_active_default = true, const Transform &p_shape_xform = Transform()) {
 		RenderingServer *vs = RenderingServer::get_singleton();
 		PhysicsServer3D *ps = PhysicsServer3D::get_singleton();
@@ -93,7 +89,7 @@ protected:
 		ps->body_set_param(body, PhysicsServer3D::BODY_PARAM_BOUNCE, 0.0);
 		//todo set space
 		ps->body_add_shape(body, type_shape_map[p_shape]);
-		ps->body_set_force_integration_callback(body, this, "body_changed_transform", mesh_instance);
+		ps->body_set_force_integration_callback(body, callable_mp(this, &TestPhysics3DMainLoop::body_changed_transform), mesh_instance);
 
 		ps->body_set_state(body, PhysicsServer3D::BODY_STATE_TRANSFORM, p_location);
 		bodies.push_back(body);
@@ -370,8 +366,7 @@ public:
 		ps->body_set_space(character, space);
 		//todo add space
 		ps->body_add_shape(character, capsule_shape);
-
-		ps->body_set_force_integration_callback(character, this, "body_changed_transform", mesh_instance);
+		ps->body_set_force_integration_callback(character, callable_mp(this, &TestPhysics3DMainLoop::body_changed_transform), mesh_instance);
 
 		ps->body_set_state(character, PhysicsServer3D::BODY_STATE_TRANSFORM, Transform(Basis(), Vector3(-2, 5, -2)));
 		bodies.push_back(character);
diff --git a/tests/test_shader_lang.cpp b/tests/test_shader_lang.cpp
index a023f35506..2169350c02 100644
--- a/tests/test_shader_lang.cpp
+++ b/tests/test_shader_lang.cpp
@@ -344,7 +344,7 @@ MainLoop *test() {
 	Set<String> types;
 	types.insert("spatial");
 
-	Error err = sl.compile(code, dt, rm, types, nullptr);
+	Error err = sl.compile(code, dt, rm, ShaderLanguage::VaryingFunctionNames(), types, nullptr);
 
 	if (err) {
 		print_line("Error at line: " + rtos(sl.get_error_line()) + ": " + sl.get_error_text());
diff --git a/tests/test_string.h b/tests/test_string.h
index 6febf22765..94d14517ae 100644
--- a/tests/test_string.h
+++ b/tests/test_string.h
@@ -1045,7 +1045,7 @@ TEST_CASE("[String] lstrip and rstrip") {
 
 TEST_CASE("[String] ensuring empty string into parse_utf8 passes empty string") {
 	String empty;
-	CHECK(empty.parse_utf8(NULL, -1));
+	CHECK(empty.parse_utf8(nullptr, -1));
 }
 
 TEST_CASE("[String] Cyrillic to_lower()") {
@@ -1156,6 +1156,17 @@ TEST_CASE("[String] uri_encode/unescape") {
 	String s = "Godot Engine:'docs'";
 	String t = "Godot%20Engine%3A%27docs%27";
 
+	String x1 = "T%C4%93%C5%A1t";
+	static const uint8_t u8str[] = { 0x54, 0xC4, 0x93, 0xC5, 0xA1, 0x74, 0x00 };
+	String x2 = String::utf8((const char *)u8str);
+	String x3 = U"Tēšt";
+
+	CHECK(x1.uri_decode() == x2);
+	CHECK(x1.uri_decode() == x3);
+	CHECK((x1 + x3).uri_decode() == (x2 + x3)); // Mixed unicode and URL encoded string, e.g. GTK+ bookmark.
+	CHECK(x2.uri_encode() == x1);
+	CHECK(x3.uri_encode() == x1);
+
 	CHECK(s.uri_encode() == t);
 	CHECK(t.uri_decode() == s);
 }
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 6dca29e856..605b298ac1 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -62,6 +62,26 @@ Files extracted from upstream source:
 Extracted from .zip provided. Extracted license and header only.
 
 
+## embree-aarch64
+
+- Upstream: https://github.com/lighttransport/embree-aarch64
+- Version: 3.12.1 (6ef362f99af80c9dfe8dd2bfc582d9067897edc6, 2020)
+- License: Apache 2.0
+
+Files extracted from upstream:
+
+- All cpp files listed in `modules/raycast/godot_update_embree.py`
+- All header files in the directories listed in `modules/raycast/godot_update_embree.py`
+
+The `modules/raycast/godot_update_embree.py` script can be used to pull the
+relevant files from the latest Embree-aarch64 release and apply some automatic changes.
+
+Some changes have been made in order to remove exceptions and fix minor build errors.
+They are marked with `// -- GODOT start --` and `// -- GODOT end --`
+comments. Apply the patches in the `patches/` folder when syncing on newer upstream
+commits.
+
+
 ## enet
 
 - Upstream: http://enet.bespin.org
@@ -86,20 +106,20 @@ It is still possible to build against a system wide ENet but doing so
 will limit its functionality to IPv4 only.
 
 
-## etc2comp
+## etcpak
 
-- Upstream: https://github.com/google/etc2comp
-- Version: git (9cd0f9cae0f32338943699bb418107db61bb66f2, 2017)
-- License: Apache 2.0
+- Upstream: https://github.com/wolfpld/etcpak
+- Version: git (f27daea656ff77671580f838a889e33049430ebd, 2021)
+- License: BSD-3-Clause
 
 Files extracted from upstream source:
 
-- all .cpp and .h files in EtcLib/
-- README.md, LICENSE, AUTHORS
-
-Important: Some files have Godot-made changes.
-They are marked with `// -- GODOT start --` and `// -- GODOT end --`
-comments.
+- Only the files relevant for compression (i.e. `Process*.cpp` and their deps):
+  ```
+  Dither.{cpp,hpp} ForceInline.hpp Math.hpp ProcessCommon.hpp ProcessRGB.{cpp,hpp}
+  ProcessDxtc.{cpp,hpp} Tables.{cpp,hpp} Vector.hpp
+  ```
+- `AUTHORS.txt` and `LICENSE.txt`
 
 
 ## fonts
@@ -186,17 +206,17 @@ Files extracted from upstream source:
 ## icu4c
 
 - Upstream: https://github.com/unicode-org/icu
-- Version: 68.2 (84e1f26ea77152936e70d53178a816dbfbf69989, 2020)
+- Version: 69.1 (0e7b4428866f3133b4abba2d932ee3faa708db1d, 2021)
 - License: Unicode
 
 Files extracted from upstream source:
 
 - the `common` folder
-- `APIChangeReport.md`, `LICENSE`
+- `LICENSE`
 
 Files generated from upstream source:
 
-- the `icudt68l.dat` built with the provided `godot_data.json` config file (see
+- the `icudt69l.dat` built with the provided `godot_data.json` config file (see
   https://github.com/unicode-org/icu/blob/master/docs/userguide/icu_data/buildtool.md
   for instructions)
 
@@ -344,7 +364,7 @@ File extracted from upstream release tarball:
 ## meshoptimizer
 
 - Upstream: https://github.com/zeux/meshoptimizer
-- Version: git (e3f53f66e7a35b9b8764bee478589d79e34fa698, 2021)
+- Version: 0.16 (95893c0566646434dd675b708d293fcb2d526d08, 2021)
 - License: MIT
 
 Files extracted from upstream repository:
diff --git a/thirdparty/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp b/thirdparty/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp
index 6873a95d90..c79623bd57 100644
--- a/thirdparty/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp
+++ b/thirdparty/bullet/BulletCollision/CollisionDispatch/btBox2dBox2dCollisionAlgorithm.cpp
@@ -80,7 +80,6 @@ struct ClipVertex
 	btVector3 v;
 	int id;
 	//b2ContactID id;
-	//b2ContactID id;
 };
 
 #define b2Dot(a, b) (a).dot(b)
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp
index fec9b03213..4372489fa1 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodyJointMotor.cpp
@@ -43,7 +43,6 @@ void btMultiBodyJointMotor::finalizeMultiDof()
 	unsigned int offset = 6 + (m_bodyA->getLink(m_linkA).m_dofOffset + linkDoF);
 
 	// row 0: the lower bound
-	// row 0: the lower bound
 	jacobianA(0)[offset] = 1;
 
 	m_numDofsFinalized = m_jacSizeBoth;
diff --git a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySphericalJointMotor.cpp b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySphericalJointMotor.cpp
index 25ddd539bf..5c20d2a0d4 100644
--- a/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySphericalJointMotor.cpp
+++ b/thirdparty/bullet/BulletDynamics/Featherstone/btMultiBodySphericalJointMotor.cpp
@@ -45,7 +45,6 @@ void btMultiBodySphericalJointMotor::finalizeMultiDof()
 	unsigned int offset = 6 + (m_bodyA->getLink(m_linkA).m_dofOffset + linkDoF);
 
 	// row 0: the lower bound
-	// row 0: the lower bound
 	jacobianA(0)[offset] = 1;
 
 	m_numDofsFinalized = m_jacSizeBoth;
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h b/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h
new file mode 100644
index 0000000000..01f1f80f6c
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_any_of.h
@@ -0,0 +1,55 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <functional>
+#include "parallel_reduce.h"
+
+namespace embree
+{
+  
+  template<typename Index, class UnaryPredicate>
+    __forceinline bool parallel_any_of (Index first, Index last, UnaryPredicate pred)
+  {
+    bool ret = false;
+    
+#if defined(TASKING_TBB)
+#if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred,&context](const tbb::blocked_range<size_t>& r) {
+        if (context.is_group_execution_cancelled()) return;
+        for (size_t i = r.begin(); i != r.end(); ++i) {
+          if (pred(i)) {
+            ret = true;
+            context.cancel_group_execution();
+          }
+        }
+      });
+#else
+    tbb::parallel_for(tbb::blocked_range<size_t>{first, last}, [&ret,pred](const tbb::blocked_range<size_t>& r) {
+        if (tbb::task::self().is_cancelled()) return;
+        for (size_t i = r.begin(); i != r.end(); ++i) {
+          if (pred(i)) {
+            ret = true;
+            tbb::task::self().cancel_group_execution();
+          }
+        }
+      });
+#endif
+#else
+    ret = parallel_reduce (first, last, false, [pred](const range<size_t>& r)->bool {
+        bool localret = false;
+        for (auto i=r.begin(); i<r.end(); ++i) {
+          localret |= pred(i);
+        }
+        return localret;
+      },
+      std::bit_or<bool>()
+      );
+#endif
+    
+    return ret;
+  }
+  
+} // end namespace
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp
new file mode 100644
index 0000000000..acddc0ff81
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.cpp
@@ -0,0 +1,56 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_filter.h"
+#include "../sys/regression.h"
+#include <map>
+
+namespace embree
+{
+  struct parallel_filter_regression_test : public RegressionTest
+  {
+    parallel_filter_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+    
+    bool run ()
+    {
+      bool passed = true;
+      auto pred = [&]( uint32_t v ) { return (v & 0x3) == 0; };
+      
+      for (size_t N=10; N<1000000; N=size_t(2.1*N))
+      {
+        size_t N0 = rand() % N;
+        
+	/* initialize array with random numbers */
+	std::vector<uint32_t> src(N);
+        std::map<uint32_t,int> m;
+	for (size_t i=0; i<N; i++) src[i] = rand();
+
+        /* count elements up */
+	for (size_t i=N0; i<N; i++)
+          if (pred(src[i]))
+            m[src[i]] = 0;
+        for (size_t i=N0; i<N; i++)
+          if (pred(src[i]))
+            m[src[i]]++;
+
+        /* filter array */
+        //size_t M = sequential_filter(src.data(),N0,N,pred);
+        size_t M = parallel_filter(src.data(),N0,N,size_t(1024),pred);
+        
+	/* check if filtered data is correct */
+	for (size_t i=N0; i<M; i++) {
+          passed &= pred(src[i]);
+          m[src[i]]--;
+        }
+	for (size_t i=N0; i<M; i++)
+          passed &= (m[src[i]] == 0);
+      }
+
+      return passed;
+    }
+  };
+
+  parallel_filter_regression_test parallel_filter_regression("parallel_filter_regression");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h
new file mode 100644
index 0000000000..5823fc631f
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_filter.h
@@ -0,0 +1,93 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename Ty, typename Index, typename Predicate>
+    inline Index sequential_filter( Ty* data, const Index first, const Index last, const Predicate& predicate)
+  {
+    Index j = first;
+    for (Index i=first; i<last; i++)
+      if (predicate(data[i]))
+        data[j++] = data[i];
+
+    return j;
+  }
+
+  template<typename Ty, typename Index, typename Predicate>
+    inline Index parallel_filter( Ty* data, const Index begin, const Index end, const Index minStepSize, const Predicate& predicate)
+  {
+    /* sequential fallback */
+    if (end-begin <= minStepSize)
+      return sequential_filter(data,begin,end,predicate);
+
+    /* calculate number of tasks to use */
+    enum { MAX_TASKS = 64 };
+    const Index numThreads = TaskScheduler::threadCount();
+    const Index numBlocks  = (end-begin+minStepSize-1)/minStepSize;
+    const Index taskCount  = min(numThreads,numBlocks,(Index)MAX_TASKS);
+
+    /* filter blocks */
+    Index nused[MAX_TASKS];
+    Index nfree[MAX_TASKS];
+    parallel_for(taskCount, [&](const Index taskIndex)
+    {
+      const Index i0 = begin+(taskIndex+0)*(end-begin)/taskCount;
+      const Index i1 = begin+(taskIndex+1)*(end-begin)/taskCount;
+      const Index i2 = sequential_filter(data,i0,i1,predicate);
+      nused[taskIndex] = i2-i0;
+      nfree[taskIndex] = i1-i2;
+    });
+
+    /* calculate offsets */
+    Index sused=0;
+    Index sfree=0;
+    Index pfree[MAX_TASKS];
+    for (Index i=0; i<taskCount; i++) 
+    {
+      sused+=nused[i];
+      Index cfree = nfree[i]; pfree[i] = sfree; sfree+=cfree;
+    }
+
+    /* return if we did not filter out any element */
+    assert(sfree <= end-begin);
+    assert(sused <= end-begin);
+    if (sused == end-begin)
+      return end;
+
+    /* otherwise we have to copy misplaced elements around */
+    parallel_for(taskCount, [&](const Index taskIndex)
+    {
+      /* destination to write elements to */
+      Index dst = begin+(taskIndex+0)*(end-begin)/taskCount+nused[taskIndex];
+      Index dst_end = min(dst+nfree[taskIndex],begin+sused);
+      if (dst_end <= dst) return;
+
+      /* range of misplaced elements to copy to destination */
+      Index r0 = pfree[taskIndex];
+      Index r1 = r0+dst_end-dst;
+
+      /* find range in misplaced elements in back to front order */
+      Index k0=0;
+      for (Index i=taskCount-1; i>0; i--)
+      {
+        if (k0 > r1) break;
+        Index k1 = k0+nused[i];
+        Index src = begin+(i+0)*(end-begin)/taskCount+nused[i];
+        for (Index i=max(r0,k0); i<min(r1,k1); i++) {
+          Index isrc = src-i+k0-1;
+          assert(dst >= begin && dst < end);
+          assert(isrc >= begin && isrc < end);
+          data[dst++] = data[isrc];
+        }
+        k0 = k1;
+      }
+    });
+
+    return begin+sused;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp
new file mode 100644
index 0000000000..ef070ebc4d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for.cpp
@@ -0,0 +1,48 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_for.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  struct parallel_for_regression_test : public RegressionTest
+  {
+    parallel_for_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+    
+    bool run ()
+    {
+      bool passed = true;
+      /* Checks parallel_for against a sequential sum of squares for growing N; true only if every run matches. */
+      const size_t M = 10;
+      for (size_t N=10; N<10000000; N=size_t(2.1*N))
+      {
+        /* sequentially calculate sum of squares (reference value) */
+        size_t sum0 = 0;
+        for (size_t i=0; i<N; i++) {
+          sum0 += i*i;
+        }
+
+        /* parallel calculation of sum of squares, repeated M times per N */
+        for (size_t m=0; m<M; m++)
+        {
+          std::atomic<size_t> sum1(0);
+          parallel_for( size_t(0), size_t(N), size_t(1024), [&](const range<size_t>& r) 
+          {
+            size_t s = 0;
+            for (size_t i=r.begin(); i<r.end(); i++) 
+              s += i*i;
+            sum1 += s;
+          });
+          passed &= (sum0 == sum1); /* accumulate: a plain assignment would let a later success hide an earlier failure */
+        }
+      }
+      
+      return passed;
+    }
+  };
+
+  parallel_for_regression_test parallel_for_regression("parallel_for_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
new file mode 100644
index 0000000000..51d296fb16
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
@@ -0,0 +1,229 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../tasking/taskscheduler.h"
+#include "../sys/array.h"
+#include "../math/math.h"
+#include "../math/range.h"
+
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+#include <dispatch/dispatch.h>
+#include <algorithm>
+#include <type_traits>
+#endif
+
+namespace embree
+{
+  /* parallel_for without range: invokes func(i) for every index i in [0,N) using the enabled tasking backend */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index N, const Func& func)
+  {
+#if defined(TASKING_INTERNAL)
+    if (N) {
+      TaskScheduler::spawn(Index(0),N,Index(1),[&] (const range<Index>& r) {
+          assert(r.size() == 1);
+          func(r.begin());
+        });
+      if (!TaskScheduler::wait())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort(); 
+        // -- GODOT end --
+    }
+#elif defined(TASKING_GCD) && defined(BUILD_IOS)
+    /* N == 0 would give blockSize == 0 and a division by zero below; clamping to 1 makes numBlocks 0 instead. */
+    const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? TaskScheduler::threadCount() : 1;
+    const size_t length = N;
+    const size_t blockSize = std::max<size_t>(1, (length + baselineNumBlocks-1) / baselineNumBlocks);
+    const size_t numBlocks = (length + blockSize-1) / blockSize;
+      
+    dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
+          
+        const size_t start = (currentBlock * blockSize);
+        const size_t blockLength = std::min(length - start, blockSize);
+        const size_t end = start + blockLength;
+          
+        for(size_t i=start; i < end; i++)
+        {
+            func(i);
+        }
+    });
+      
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+        func(i);
+      },context);
+    if (context.is_group_execution_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort(); 
+      // -- GODOT end --
+  #else
+    tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+        func(i);
+      });
+    if (tbb::task::self().is_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort(); 
+      // -- GODOT end --
+  #endif
+
+#elif defined(TASKING_PPL)
+    concurrency::parallel_for(Index(0),N,Index(1),[&](Index i) { 
+        func(i);
+      });
+#else
+#  error "no tasking system enabled"
+#endif
+  }
+  
+  /* parallel for with range and granularity */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index first, const Index last, const Index minStepSize, const Func& func)
+  {
+    assert(first <= last);
+#if defined(TASKING_INTERNAL)
+    TaskScheduler::spawn(first,last,minStepSize,func);
+    if (!TaskScheduler::wait())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort(); 
+      // -- GODOT end --
+
+#elif defined(TASKING_GCD) && defined(BUILD_IOS)
+      
+    const size_t baselineNumBlocks = (TaskScheduler::threadCount() > 1)? 4*TaskScheduler::threadCount() : 1;
+    const size_t length = last - first;
+    const size_t blockSizeByThreads = (length + baselineNumBlocks-1) / baselineNumBlocks;
+    size_t blockSize = std::max<size_t>(minStepSize,blockSizeByThreads);
+    blockSize += blockSize % 4;
+      
+    const size_t numBlocks = (length + blockSize-1) / blockSize;
+      
+    dispatch_apply(numBlocks, DISPATCH_APPLY_AUTO, ^(size_t currentBlock) {
+          
+        const size_t start = first + (currentBlock * blockSize);
+        const size_t end = std::min<size_t>(last, start + blockSize);
+          
+        func( embree::range<Index>(start,end) );
+    });
+      
+
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context;
+    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+        func(range<Index>(r.begin(),r.end()));
+      },context);
+    if (context.is_group_execution_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort(); 
+      // -- GODOT end --
+  #else
+    tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+        func(range<Index>(r.begin(),r.end()));
+      });
+    if (tbb::task::self().is_cancelled())
+      // -- GODOT start --
+      // throw std::runtime_error("task cancelled");
+      abort(); 
+      // -- GODOT end --
+  #endif
+
+#elif defined(TASKING_PPL)
+    concurrency::parallel_for(first, last, Index(1) /*minStepSize*/, [&](Index i) { 
+        func(range<Index>(i,i+1)); 
+      });
+
+#else
+#  error "no tasking system enabled"
+#endif
+  }
+  
+  /* parallel for with range */
+  template<typename Index, typename Func>
+    __forceinline void parallel_for( const Index first, const Index last, const Func& func)
+  {
+    assert(first <= last);
+    parallel_for(first,last,(Index)1,func);
+  }
+
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION > 4001)
+
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_static( const Index N, const Func& func)
+  {
+    #if TBB_INTERFACE_VERSION >= 12002
+      tbb::task_group_context context;
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },tbb::simple_partitioner(),context);
+      if (context.is_group_execution_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort(); 
+        // -- GODOT end --
+    #else
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },tbb::simple_partitioner());
+      if (tbb::task::self().is_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort(); 
+        // -- GODOT end --
+    #endif
+  }
+
+  typedef tbb::affinity_partitioner affinity_partitioner;
+
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_affinity( const Index N, const Func& func, tbb::affinity_partitioner& ap)
+  {
+    #if TBB_INTERFACE_VERSION >= 12002
+      tbb::task_group_context context;
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },ap,context);
+      if (context.is_group_execution_cancelled())
+       // -- GODOT start --
+       // throw std::runtime_error("task cancelled");
+       abort(); 
+       // -- GODOT end --
+    #else
+      tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+          func(i);
+        },ap);
+      if (tbb::task::self().is_cancelled())
+        // -- GODOT start --
+        // throw std::runtime_error("task cancelled");
+        abort(); 
+        // -- GODOT end --
+    #endif
+  }
+
+#else
+
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_static( const Index N, const Func& func) 
+  {
+    parallel_for(N,func);
+  }
+
+  struct affinity_partitioner {
+  };
+
+  template<typename Index, typename Func>
+    __forceinline void parallel_for_affinity( const Index N, const Func& func, affinity_partitioner& ap) 
+  {
+    parallel_for(N,func);
+  }
+
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp
new file mode 100644
index 0000000000..0337611b35
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.cpp
@@ -0,0 +1,63 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_for_for.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  struct parallel_for_for_regression_test : public RegressionTest
+  {
+    parallel_for_for_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+    
+    bool run ()
+    {
+      bool passed = true;
+
+      /* create vector with random numbers */
+      size_t sum0 = 0;
+      size_t K = 0;
+      const size_t M = 1000;
+      std::vector<std::vector<size_t>* > array2(M);
+      for (size_t i=0; i<M; i++) {
+        const size_t N = rand() % 1024;
+        K+=N;
+        array2[i] = new std::vector<size_t>(N);
+        for (size_t j=0; j<N; j++) 
+          sum0 += (*array2[i])[j] = rand();
+      }
+
+      /* array to test global index */
+      std::vector<atomic<size_t>> verify_k(K);
+      for (size_t i=0; i<K; i++) verify_k[i].store(0);
+
+      /* add all numbers using parallel_for_for */
+      std::atomic<size_t> sum1(0);
+      parallel_for_for( array2, size_t(1), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k) -> size_t
+      {
+        size_t s = 0;
+	for (size_t i=r.begin(); i<r.end(); i++) {
+	  s += (*v)[i];
+          verify_k[k++]++;
+        }
+        sum1 += s;
+	return sum1;
+      });
+      passed &= (sum0 == sum1);
+
+      /* check global index */
+      for (size_t i=0; i<K; i++) 
+        passed &= (verify_k[i] == 1);
+
+      /* delete vectors again */
+      for (size_t i=0; i<array2.size(); i++)
+	delete array2[i];
+      
+      return passed;
+    }
+  };
+
+  parallel_for_for_regression_test parallel_for_for_regression("parallel_for_for_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h
new file mode 100644
index 0000000000..852b8a0900
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for.h
@@ -0,0 +1,149 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename ArrayArray, typename Func>
+    __forceinline void sequential_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func ) 
+  {
+    size_t k=0;
+    for (size_t i=0; i!=array2.size(); ++i) {
+      const size_t N = array2[i]->size();
+      if (N) func(array2[i],range<size_t>(0,N),k);
+      k+=N;
+    }
+  }
+
+  class ParallelForForState
+  {
+  public:
+
+    enum { MAX_TASKS = 64 };
+
+    __forceinline ParallelForForState () 
+      : taskCount(0) {}
+
+    template<typename ArrayArray>
+      __forceinline ParallelForForState (ArrayArray& array2, const size_t minStepSize) {
+      init(array2,minStepSize);
+    } 
+
+    template<typename ArrayArray>
+      __forceinline void init ( ArrayArray& array2, const size_t minStepSize )
+    {
+      /* first calculate total number of elements */
+      size_t N = 0;
+      for (size_t i=0; i<array2.size(); i++) {
+	N += array2[i] ? array2[i]->size() : 0;
+      }
+      this->N = N;
+
+      /* calculate number of tasks to use */
+      const size_t numThreads = TaskScheduler::threadCount();
+      const size_t numBlocks  = (N+minStepSize-1)/minStepSize;
+      taskCount = max(size_t(1),min(numThreads,numBlocks,size_t(ParallelForForState::MAX_TASKS)));
+      
+      /* calculate start (i,j) for each task */
+      size_t taskIndex = 0;
+      i0[taskIndex] = 0;
+      j0[taskIndex] = 0;
+      size_t k0 = (++taskIndex)*N/taskCount;
+      for (size_t i=0, k=0; taskIndex < taskCount; i++) 
+      {
+	assert(i<array2.size());
+	size_t j=0, M = array2[i] ? array2[i]->size() : 0;
+	while (j<M && k+M-j >= k0 && taskIndex < taskCount) {
+	  assert(taskIndex<taskCount);
+	  i0[taskIndex] = i;
+	  j0[taskIndex] = j += k0-k;
+	  k=k0;
+	  k0 = (++taskIndex)*N/taskCount;
+	}
+	k+=M-j;
+      }
+    }
+
+    __forceinline size_t size() const {
+      return N;
+    }
+    
+  public:
+    size_t i0[MAX_TASKS];
+    size_t j0[MAX_TASKS];
+    size_t taskCount;
+    size_t N;
+  };
+
+  template<typename ArrayArray, typename Func>
+    __forceinline void parallel_for_for( ArrayArray& array2, const size_t minStepSize, const Func& func )
+  {
+    ParallelForForState state(array2,minStepSize);
+    
+    parallel_for(state.taskCount, [&](const size_t taskIndex) 
+    {
+      /* calculate range */
+      const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
+      const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
+      size_t i0 = state.i0[taskIndex];
+      size_t j0 = state.j0[taskIndex];
+
+      /* iterate over arrays */
+      size_t k=k0;
+      for (size_t i=i0; k<k1; i++) {
+        const size_t N =  array2[i] ? array2[i]->size() : 0;
+        const size_t r0 = j0, r1 = min(N,r0+k1-k);
+        if (r1 > r0) func(array2[i],range<size_t>(r0,r1),k);
+        k+=r1-r0; j0 = 0;
+      }
+    });
+  }
+
+  template<typename ArrayArray, typename Func>
+    __forceinline void parallel_for_for( ArrayArray& array2, const Func& func )
+  {
+    parallel_for_for(array2,1,func);
+  }
+
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const size_t minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    ParallelForForState state(array2,minStepSize);
+    Value temp[ParallelForForState::MAX_TASKS];
+
+    for (size_t i=0; i<state.taskCount; i++)
+      temp[i] = identity;
+    
+    parallel_for(state.taskCount, [&](const size_t taskIndex) 
+    {
+      /* calculate range */
+      const size_t k0 = (taskIndex+0)*state.size()/state.taskCount;
+      const size_t k1 = (taskIndex+1)*state.size()/state.taskCount;
+      size_t i0 = state.i0[taskIndex];
+      size_t j0 = state.j0[taskIndex];
+
+      /* iterate over arrays */
+      size_t k=k0;
+      for (size_t i=i0; k<k1; i++) {
+        const size_t N =  array2[i] ? array2[i]->size() : 0;
+        const size_t r0 = j0, r1 = min(N,r0+k1-k);
+        if (r1 > r0) temp[taskIndex] = reduction(temp[taskIndex],func(array2[i],range<size_t>(r0,r1),k));
+        k+=r1-r0; j0 = 0;
+      }
+    });
+
+    Value ret = identity;
+    for (size_t i=0; i<state.taskCount; i++)
+      ret = reduction(ret,temp[i]);
+    return ret;
+  }
+
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_reduce( ArrayArray& array2, const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    return parallel_for_for_reduce(array2,1,identity,func,reduction);
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp
new file mode 100644
index 0000000000..0169d8e481
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.cpp
@@ -0,0 +1,85 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_for_for_prefix_sum.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  struct parallel_for_for_prefix_sum_regression_test : public RegressionTest
+  {
+    parallel_for_for_prefix_sum_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+
+    bool run ()
+    {
+      bool passed = true;
+      const auto add = [](size_t v0, size_t v1) { return v0+v1; };
+      /* build M inner arrays of random length (0..9) filled with random counts (0..9) */
+      const size_t M = 10;
+      std::vector<atomic<size_t>> flattened;
+      typedef std::vector<std::vector<size_t>* > ArrayArray;
+      ArrayArray array2(M);
+      size_t K = 0; // total number of elements over all inner arrays
+      for (size_t a=0; a<M; a++) {
+        const size_t N = rand() % 10;
+        K += N;
+        array2[a] = new std::vector<size_t>(N);
+        for (size_t j=0; j<N; j++)
+          (*array2[a])[j] = rand() % 10;
+      }
+
+      /* verify_k[g] counts how often the global element index g gets visited */
+      std::vector<atomic<size_t>> verify_k(K);
+      for (size_t g=0; g<K; g++) verify_k[g].store(0);
+
+      ParallelForForPrefixSumState<size_t> state(array2,size_t(1));
+
+      /* first pass only sums the stored counts, which yields the required output size S */
+      size_t S = parallel_for_for_prefix_sum0( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t /*arrayIndex*/) -> size_t
+      {
+        size_t s = 0;
+        for (size_t e=r.begin(); e<r.end(); e++) {
+          s += (*v)[e];
+          verify_k[k++]++;
+        }
+        return s;
+      }, add);
+
+      /* create properly sized, zeroed output array */
+      flattened.resize(S);
+      for (auto& slot : flattened) slot.store(0);
+
+      /* second pass: element e marks (*v)[e] consecutive output slots, starting at base plus the local running sum */
+      parallel_for_for_prefix_sum1( state, array2, size_t(0), [&](std::vector<size_t>* v, const range<size_t>& r, size_t k, size_t /*arrayIndex*/, const size_t base) -> size_t
+      {
+        size_t s = 0;
+        for (size_t e=r.begin(); e<r.end(); e++) {
+          const size_t count = (*v)[e];
+          for (size_t j=0; j<count; j++)
+            flattened[base+s+j]++;
+          s += count;
+          verify_k[k++]++;
+        }
+        return s;
+      }, add);
+
+      /* each of the two passes must have visited every global index exactly once */
+      for (size_t g=0; g<K; g++)
+        passed &= (verify_k[g] == 2);
+
+      /* every output slot must have been marked exactly once */
+      for (size_t f=0; f<flattened.size(); f++)
+        passed &= (flattened[f] == 1);
+
+      /* release the inner arrays again */
+      for (size_t a=0; a<array2.size(); a++)
+        delete array2[a];
+
+      return passed;
+    }
+  };
+
+  parallel_for_for_prefix_sum_regression_test parallel_for_for_prefix_sum_regression("parallel_for_for_prefix_sum_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h
new file mode 100644
index 0000000000..d2671d8a6a
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for_for_prefix_sum.h
@@ -0,0 +1,112 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for_for.h"
+#include "parallel_prefix_sum.h"
+
+namespace embree
+{
+  template<typename Value>
+    struct ParallelForForPrefixSumState : public ParallelForForState
+  { // ParallelForForState extended with the storage needed by the two prefix sum passes
+    __forceinline ParallelForForPrefixSumState () {}
+    /*! forwards array2 and minStepSize to the ParallelForForState base constructor */
+    template<typename ArrayArray>
+      __forceinline ParallelForForPrefixSumState (ArrayArray& array2, const size_t minStepSize)
+      : ParallelForForState(array2,minStepSize) {}
+    /*! per-task counts and sums; written by parallel_for_for_prefix_sum0 and read/rewritten by parallel_for_for_prefix_sum1 */
+    ParallelPrefixSumState<Value> prefix_state;
+  };
+  
+  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, 
+                                                      const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    /* First pass of the two-pass prefix sum: reduces func(array,range,k,i) per task into
+       state.prefix_state.counts, then stores the exclusive scan of those totals in
+       state.prefix_state.sums and returns the overall total. minStepSize is not read here. */
+    const size_t numTasks = state.taskCount;
+    parallel_for(numTasks, [&](const size_t taskIndex)
+    {
+      /* global element interval [kBegin,kEnd) owned by this task */
+      const size_t kBegin = (taskIndex+0)*state.size()/numTasks;
+      const size_t kEnd   = (taskIndex+1)*state.size()/numTasks;
+      /* the first inner array is entered at offset state.j0[taskIndex], all later ones at 0 */
+      size_t outer  = state.i0[taskIndex];
+      size_t offset = state.j0[taskIndex];
+      size_t k = kBegin;
+      Value total = identity;
+      while (k < kEnd) {
+        const size_t size = array2[outer] ? array2[outer]->size() : 0; // null entries count as empty
+        const size_t r1 = min(size,offset+kEnd-k);
+        if (r1 > offset)
+          total = reduction(total, func(array2[outer],range<Index>((Index)offset,(Index)r1),(Index)k,(Index)outer));
+        k += r1-offset; offset = 0; outer++;
+      }
+      state.prefix_state.counts[taskIndex] = total;
+    });
+
+    /* exclusive scan over the per-task totals */
+    Value sum = identity;
+    for (size_t t=0; t<numTasks; t++) {
+      const Value taskTotal = state.prefix_state.counts[t];
+      state.prefix_state.sums[t] = sum;
+      sum = reduction(sum,taskTotal);
+    }
+    return sum;
+  }
+
+  template<typename ArrayArray, typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, Index minStepSize, 
+                                                      const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    /* second pass: reads state.prefix_state.sums (presumably filled by an earlier pass over the same state); minStepSize is not read here */
+    const size_t taskCount = state.taskCount;
+    /* each task walks its slice [k0,k1) of the flattened index space of array2 */
+    parallel_for(taskCount, [&](const size_t taskIndex)
+    {
+      const size_t k0 = (taskIndex+0)*state.size()/taskCount;
+      const size_t k1 = (taskIndex+1)*state.size()/taskCount;
+      size_t i0 = state.i0[taskIndex];
+      size_t j0 = state.j0[taskIndex];
+      /* func gets reduction(sums[taskIndex],N) as last argument: the task's stored sum combined with what this task accumulated so far */
+      /* iterate over arrays; only the first array is entered at offset j0, later ones at 0 */
+      size_t k=k0;
+      Value N=identity;
+      for (size_t i=i0; k<k1; i++) {
+	const size_t size = array2[i] ? array2[i]->size() : 0;
+        const size_t r0 = j0, r1 = min(size,r0+k1-k);
+        if (r1 > r0) N = reduction(N, func(array2[i],range<Index>((Index)r0,(Index)r1),(Index)k,(Index)i,reduction(state.prefix_state.sums[taskIndex],N)));
+        k+=r1-r0; j0 = 0;
+      }
+      state.prefix_state.counts[taskIndex] = N;
+    });
+
+    /* recompute the exclusive scan from the counts written by this pass and return the overall total */
+    Value sum=identity;
+    for (size_t i=0; i<taskCount; i++)
+    {
+      const Value c = state.prefix_state.counts[i];
+      state.prefix_state.sums[i] = sum;
+      sum=reduction(sum,c);
+    }
+
+    return sum;
+  }
+
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum0( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, 
+						     const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    return parallel_for_for_prefix_sum0(state,array2,size_t(1),identity,func,reduction); // convenience overload: forwards with a minStepSize of 1
+  }
+
+  template<typename ArrayArray, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_for_for_prefix_sum1( ParallelForForPrefixSumState<Value>& state, ArrayArray& array2, 
+						     const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    return parallel_for_for_prefix_sum1(state,array2,size_t(1),identity,func,reduction); // convenience overload: forwards with a minStepSize of 1
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp
new file mode 100644
index 0000000000..09dc303f81
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_map.cpp
@@ -0,0 +1,47 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_map.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  struct parallel_map_regression_test : public RegressionTest
+  {
+    parallel_map_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+    /* Builds a parallel_map from N shuffled keys and checks hits for all keys and misses for keys+1. */
+    bool run ()
+    {
+      bool passed = true;
+
+      /* create key/value vectors with random numbers; every key is even (factor 2) */
+      const size_t N = 10000;
+      std::vector<uint32_t> keys(N);
+      std::vector<uint32_t> vals(N);
+      for (size_t i=0; i<N; i++) keys[i] = 2*unsigned(i)*647382649;
+      for (size_t i=0; i<N; i++) std::swap(keys[i],keys[rand()%N]);
+      for (size_t i=0; i<N; i++) vals[i] = 2*uint32_t(rand()); // multiply unsigned: 2*rand() overflows int when RAND_MAX > INT_MAX/2
+      
+      /* create map */
+      parallel_map<uint32_t,uint32_t> map;
+      map.init(keys,vals);
+
+      /* check that all keys are properly mapped */
+      for (size_t i=0; i<N; i++) {
+        const uint32_t* val = map.lookup(keys[i]);
+        passed &= val && (*val == vals[i]);
+      }
+
+      /* check that these keys are not in the map (keys[i]+1 is odd, all stored keys are even) */
+      for (size_t i=0; i<N; i++) {
+        passed &= !map.lookup(keys[i]+1);
+      }
+
+      return passed;
+    }
+  };
+
+  parallel_map_regression_test parallel_map_regression("parallel_map_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_map.h b/thirdparty/embree-aarch64/common/algorithms/parallel_map.h
new file mode 100644
index 0000000000..02e1a8f8d0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_map.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_sort.h"
+
+namespace embree
+{
+  /*! key/value map that is built in parallel from two vectors and queried by binary search */
+  template<typename Key, typename Val>
+  class parallel_map
+  {
+    /* key/value pair stored in the sorted vector */
+    struct KeyValue
+    {
+      __forceinline KeyValue () {}
+
+      __forceinline KeyValue (const Key key, const Val val)
+        : key(key), val(val) {}
+
+      /* conversion to the key, which lets a pair be compared against a plain key
+         (std::lower_bound in lookup relies on this) */
+      __forceinline operator Key() const { return key; }
+
+    public:
+      Key key;
+      Val val;
+    };
+
+  public:
+
+    /*! creates an empty map */
+    parallel_map () {}
+
+    /*! creates the map from a vector of keys and a vector of values */
+    template<typename KeyVector, typename ValVector>
+      parallel_map (const KeyVector& keys, const ValVector& values) { init(keys,values); }
+
+    /*! initializes the parallel map from a vector with keys and a vector with values: entry i maps keys[i] to values[i] */
+    template<typename KeyVector, typename ValVector>
+      void init(const KeyVector& keys, const ValVector& values)
+    {
+      /* both vectors have to describe the same number of entries */
+      assert(keys.size() == values.size());
+      const size_t count = keys.size();
+      vec.resize(count);
+
+      /* generate the key/value pairs in parallel */
+      parallel_for( size_t(0), count, size_t(4*4096), [&](const range<size_t>& r) {
+        for (size_t e=r.begin(); e<r.end(); e++)
+          vec[e] = KeyValue((Key)keys[e],values[e]);
+      });
+      /* perform parallel radix sort of the key/value pairs, using a scratch buffer of equal size */
+      std::vector<KeyValue> scratch(count);
+      radix_sort<KeyValue,Key>(vec.data(),scratch.data(),count);
+    }
+
+    /*! Returns a pointer to the value associated with the specified key. The pointer will be nullptr if the key is not contained in the map. */
+    __forceinline const Val* lookup(const Key& key) const
+    {
+      /* lower_bound yields the first entry that does not compare less than the requested key */
+      const auto it = std::lower_bound(vec.begin(), vec.end(), key);
+      const bool found = (it != vec.end()) && !(it->key != key);
+      return found ? &it->val : nullptr;
+    }
+
+    /*! If the key is in the map, the function returns the value associated with the key, otherwise it returns the default value. */
+    __forceinline Val lookup(const Key& key, const Val& def) const
+    {
+      /* reuse the pointer returning overload and substitute the default on a miss */
+      const Val* const hit = lookup(key);
+      if (hit == nullptr) return def;
+      return *hit;
+    }
+
+    /*! removes all entries */
+    void clear() {
+      vec.clear();
+    }
+
+  private:
+    std::vector<KeyValue> vec;    //!< key/value pairs, sorted by init()
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp
new file mode 100644
index 0000000000..eb20c4465d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.cpp
@@ -0,0 +1,53 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_partition.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  struct parallel_partition_regression_test : public RegressionTest
+  {
+    parallel_partition_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+
+    bool run ()
+    {
+      bool passed = true;
+
+      for (size_t iteration=0; iteration<100; iteration++)
+      {
+        /* shuffled array holding 0..N-1 and a random split position in [0,N] */
+        size_t N = std::rand() % 1000000;
+        std::vector<unsigned> array(N);
+        for (unsigned value=0; value<N; value++) array[value] = value;
+        for (auto& item : array) std::swap(item,array[std::rand()%array.size()]);
+        size_t split = std::rand() % (N+1);
+
+        /* move all values below split to the front while summing up both sides */
+        size_t left_sum = 0, right_sum = 0;
+        size_t mid = parallel_partitioning(array.data(),0,array.size(),0,left_sum,right_sum,
+                                           [&] ( size_t value ) { return value < split; },
+                                           []  ( size_t& sum, unsigned value) { sum += value; },
+                                           []  ( size_t& sum, size_t partial) { sum += partial; },
+                                           128);
+
+        /*serial_partitioning(array.data(),0,array.size(),left_sum,right_sum,
+                            [&] ( size_t i ) { return i < split; },
+                            []  ( size_t& left_sum, int v) { left_sum += v; });*/
+
+        /* the split position, both sums and the side of every item have to match */
+        passed &= mid == split;
+        passed &= left_sum == split*(split-1)/2;
+        passed &= right_sum == N*(N-1)/2-left_sum;
+        for (size_t pos=0; pos<split; pos++) passed &= array[pos] < split;
+        for (size_t pos=split; pos<N; pos++) passed &= array[pos] >= split;
+      }
+
+      return passed;
+    }
+  };
+
+  parallel_partition_regression_test parallel_partition_regression("parallel_partition_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h
new file mode 100644
index 0000000000..3b3ad7c854
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_partition.h
@@ -0,0 +1,283 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+#include "../math/range.h"
+
+namespace embree
+{
+  /* serial in-place partitioning of array[begin,end): items with is_left(item) end up in front; every item is folded via reduction_t into the reduction of its final side */
+  template<typename T, typename V, typename IsLeft, typename Reduction_T>
+    __forceinline size_t serial_partitioning(T* array, 
+                                             const size_t begin,
+                                             const size_t end, 
+                                             V& leftReduction,
+                                             V& rightReduction,
+                                             const IsLeft& is_left, 
+                                             const Reduction_T& reduction_t)
+  {
+    T* l = array + begin;
+    T* r = array + end - 1;
+    /* l scans upwards and r scans downwards until the two cursors cross */
+    while(1)
+    {
+      /* skip items that already are on the left side, folding them into leftReduction */
+      while (likely(l <= r && is_left(*l) )) 
+      {
+        //prefetchw(l+4); // FIXME: enable?
+        reduction_t(leftReduction,*l);
+        ++l;
+      }
+      /* skip items that already are on the right side, folding them into rightReduction */
+      while (likely(l <= r && !is_left(*r)))
+      {
+        //prefetchw(r-4); FIXME: enable?
+        reduction_t(rightReduction,*r);
+        --r;
+      }
+      if (r<l) break;
+      /* here *l is a right item and *r a left item: account for each at its destination side, then pass both to xchg */
+      reduction_t(leftReduction ,*r);
+      reduction_t(rightReduction,*l);
+      xchg(*l,*r);
+      l++; r--;
+    }
+    /* position of l relative to array, i.e. the index of the first item of the right side */
+    return l - array;        
+  }
+
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    class __aligned(64) parallel_partition_task
+  {
+    ALIGNED_CLASS_(64);
+  private:
+    /* upper bound for numTasks; sizes the per-task arrays below */
+    static const size_t MAX_TASKS = 64;
+
+    T* array;                         // first item of the range to partition
+    size_t N;                         // number of items in that range
+    const IsLeft& is_left;            // the functors and the identity are held by reference, not copied
+    const Reduction_T& reduction_t;
+    const Reduction_V& reduction_v;
+    const Vi& identity;
+
+    size_t numTasks; 
+    __aligned(64) size_t counter_start[MAX_TASKS+1];              // first index of each task's slice; entry numTasks is set to N
+    __aligned(64) size_t counter_left[MAX_TASKS+1];               // number of left items found in each task's slice
+    __aligned(64) range<ssize_t> leftMisplacedRanges[MAX_TASKS];  // right items that lie inside the global left part
+    __aligned(64) range<ssize_t> rightMisplacedRanges[MAX_TASKS]; // left items that lie inside the global right part
+    __aligned(64) V leftReductions[MAX_TASKS];                    // per-task reduction of the left items
+    __aligned(64) V rightReductions[MAX_TASKS];                   // per-task reduction of the right items
+
+  public:
+    /* numTasks is one task per started BLOCK_SIZE items, capped by the scheduler's thread count and MAX_TASKS */
+    __forceinline parallel_partition_task(T* array, 
+                                          const size_t N, 
+                                          const Vi& identity, 
+                                          const IsLeft& is_left, 
+                                          const Reduction_T& reduction_t, 
+                                          const Reduction_V& reduction_v,
+                                          const size_t BLOCK_SIZE) 
+
+      : array(array), N(N), is_left(is_left), reduction_t(reduction_t), reduction_v(reduction_v), identity(identity),
+      numTasks(min((N+BLOCK_SIZE-1)/BLOCK_SIZE,min(TaskScheduler::threadCount(),MAX_TASKS))) {}
+    /* finds the range that contains the index-th item of the concatenated ranges r; index is reduced to the offset inside that range */
+    __forceinline const range<ssize_t>* findStartRange(size_t& index, const range<ssize_t>* const r, const size_t numRanges)
+    {
+      size_t i = 0;
+      while(index >= (size_t)r[i].size())
+      {
+        assert(i < numRanges);
+        index -= (size_t)r[i].size();
+        i++;
+      }	    
+      return &r[i];
+    }
+    /* exchanges the items startID..endID-1 of the concatenated left misplaced ranges with those of the right misplaced ranges */
+    __forceinline void swapItemsInMisplacedRanges(const size_t numLeftMisplacedRanges,
+                                                  const size_t numRightMisplacedRanges,
+                                                  const size_t startID,
+                                                  const size_t endID)
+    {
+      size_t leftLocalIndex  = startID;
+      size_t rightLocalIndex = startID;
+      const range<ssize_t>* l_range = findStartRange(leftLocalIndex,leftMisplacedRanges,numLeftMisplacedRanges);
+      const range<ssize_t>* r_range = findStartRange(rightLocalIndex,rightMisplacedRanges,numRightMisplacedRanges);
+      /* l_left / r_left: items remaining in the current left / right range; size: items this call still has to exchange */
+      size_t l_left = l_range->size() - leftLocalIndex;
+      size_t r_left = r_range->size() - rightLocalIndex;
+      T *__restrict__ l = &array[l_range->begin() + leftLocalIndex];
+      T *__restrict__ r = &array[r_range->begin() + rightLocalIndex];
+      size_t size = endID - startID;
+      size_t items = min(size,min(l_left,r_left)); 
+      /* items: length of the next run that can be exchanged without leaving either current range */
+      while (size)
+      {
+        if (unlikely(l_left == 0))
+        {
+          l_range++;
+          l_left = l_range->size();
+          l = &array[l_range->begin()];
+          items = min(size,min(l_left,r_left));
+        }
+
+        if (unlikely(r_left == 0))
+        {		
+          r_range++;
+          r_left = r_range->size();
+          r = &array[r_range->begin()];          
+          items = min(size,min(l_left,r_left));
+        }
+
+        size   -= items;
+        l_left -= items;
+        r_left -= items;
+
+        while(items) {
+          items--;
+          xchg(*l++,*r++);
+        }
+      }
+    }
+    /* partitions array[0,N) in parallel, merges the per-task reductions into leftReduction/rightReduction and returns the number of left items */
+    __forceinline size_t partition(V& leftReduction, V& rightReduction)
+    {
+      /* partition the individual ranges for each task */
+      parallel_for(numTasks,[&] (const size_t taskID) {
+          const size_t startID = (taskID+0)*N/numTasks;
+          const size_t endID   = (taskID+1)*N/numTasks;
+          V local_left(identity);
+          V local_right(identity);
+          const size_t mid = serial_partitioning(array,startID,endID,local_left,local_right,is_left,reduction_t);
+          counter_start[taskID] = startID;
+          counter_left [taskID] = mid-startID;
+          leftReductions[taskID]  = local_left;
+          rightReductions[taskID] = local_right;
+        });
+      counter_start[numTasks] = N;
+      counter_left[numTasks]  = 0;
+      /* the extra entry at index numTasks lets the loop below read counter_start[i+1] for the last task */
+      /* finalize the reductions */
+      for (size_t i=0; i<numTasks; i++) {
+        reduction_v(leftReduction,leftReductions[i]);
+        reduction_v(rightReduction,rightReductions[i]);
+      }
+
+      /* calculate mid point for partitioning */
+      size_t mid = counter_left[0];
+      for (size_t i=1; i<numTasks; i++)
+        mid += counter_left[i];
+      const range<ssize_t> globalLeft (0,mid);
+      const range<ssize_t> globalRight(mid,N);
+
+      /* calculate all left and right ranges that are on the wrong global side */
+      size_t numMisplacedRangesLeft  = 0;
+      size_t numMisplacedRangesRight = 0;
+      size_t numMisplacedItemsLeft   = 0;
+      size_t numMisplacedItemsRight  = 0;
+
+      for (size_t i=0; i<numTasks; i++)
+      {	    
+        const range<ssize_t> left_range (counter_start[i], counter_start[i] + counter_left[i]);
+        const range<ssize_t> right_range(counter_start[i] + counter_left[i], counter_start[i+1]);
+        const range<ssize_t> left_misplaced  = globalLeft. intersect(right_range);
+        const range<ssize_t> right_misplaced = globalRight.intersect(left_range);
+
+        if (!left_misplaced.empty())  
+        {
+          numMisplacedItemsLeft += left_misplaced.size();
+          leftMisplacedRanges[numMisplacedRangesLeft++] = left_misplaced;
+        }
+
+        if (!right_misplaced.empty()) 
+        {
+          numMisplacedItemsRight += right_misplaced.size();
+          rightMisplacedRanges[numMisplacedRangesRight++] = right_misplaced;
+        }
+      }
+      assert( numMisplacedItemsLeft == numMisplacedItemsRight );
+
+      /* if no items are misplaced we are done */
+      if (numMisplacedItemsLeft == 0)
+        return mid;
+
+      /* otherwise we copy the items to the right place in parallel */
+      parallel_for(numTasks,[&] (const size_t taskID) {
+          const size_t startID = (taskID+0)*numMisplacedItemsLeft/numTasks;
+          const size_t endID   = (taskID+1)*numMisplacedItemsLeft/numTasks;
+          swapItemsInMisplacedRanges(numMisplacedRangesLeft,numMisplacedRangesRight,startID,endID);	                             
+        });
+
+      return mid;
+    }
+  };
+
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    __noinline size_t parallel_partitioning(T* array, 
+                                            const size_t begin,
+                                            const size_t end, 
+                                            const Vi &identity,
+                                            V &leftReduction,
+                                            V &rightReduction,
+                                            const IsLeft& is_left, 
+                                            const Reduction_T& reduction_t,
+                                            const Reduction_V& reduction_v,
+                                            size_t BLOCK_SIZE = 128)
+  {
+    /* Partitions array[begin,end) by is_left and returns the index of the first right item.
+       Inputs with fewer than BLOCK_SIZE items are handled by the single threaded code. */
+    const size_t count = end-begin;
+    if (unlikely(count < BLOCK_SIZE))
+      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+    /* larger inputs run through a heap allocated task object that is released on return */
+    typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+    std::unique_ptr<partition_task> task(new partition_task(array+begin,count,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+    return begin+task->partition(leftReduction,rightReduction);
+  }
+
+  template<typename T, typename V, typename Vi, typename IsLeft, typename Reduction_T, typename Reduction_V>
+    __noinline size_t parallel_partitioning(T* array, 
+                                            const size_t begin,
+                                            const size_t end, 
+                                            const Vi &identity,
+                                            V &leftReduction,
+                                            V &rightReduction,
+                                            const IsLeft& is_left, 
+                                            const Reduction_T& reduction_t,
+                                            const Reduction_V& reduction_v,
+                                            size_t BLOCK_SIZE,
+                                            size_t PARALLEL_THRESHOLD)
+  {
+    /* Partitions array[begin,end) by is_left and returns the index of the first right item.
+       Inputs with fewer than PARALLEL_THRESHOLD items are handled by the single threaded code. */
+    const size_t count = end-begin;
+    if (unlikely(count < PARALLEL_THRESHOLD))
+      return serial_partitioning(array,begin,end,leftReduction,rightReduction,is_left,reduction_t);
+
+    /* larger inputs run through a heap allocated task object that is released on return */
+    typedef parallel_partition_task<T,V,Vi,IsLeft,Reduction_T,Reduction_V> partition_task;
+    std::unique_ptr<partition_task> task(new partition_task(array+begin,count,identity,is_left,reduction_t,reduction_v,BLOCK_SIZE));
+    return begin+task->partition(leftReduction,rightReduction);
+  }
+
+
+  template<typename T, typename IsLeft>
+    inline size_t parallel_partitioning(T* array, 
+                                        const size_t begin,
+                                        const size_t end, 
+                                        const IsLeft& is_left, 
+                                        size_t BLOCK_SIZE = 128)
+  {
+    /* partitioning without user reductions: forwards dummy accumulators and no-op reduction functors */
+    size_t unusedLeft = 0, unusedRight = 0;
+    const auto ignoreItem    = [] (size_t&, const T&) {};
+    const auto ignorePartial = [] (size_t&, size_t&) {};
+    return parallel_partitioning(array,begin,end,0,unusedLeft,unusedRight,is_left,
+                                 ignoreItem,ignorePartial,
+                                 BLOCK_SIZE);
+  }
+
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp
new file mode 100644
index 0000000000..685952c3dc
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.cpp
@@ -0,0 +1,48 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_prefix_sum.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  struct parallel_prefix_sum_regression_test : public RegressionTest
+  {
+    parallel_prefix_sum_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+    /* Compares parallel_prefix_sum against a serial scan for growing array sizes N; sums wrap modulo 2^32. */
+    bool run ()
+    {
+      bool passed = true;
+      const size_t M = 10; // number of repetitions per array size
+
+      for (size_t N=10; N<10000000; N=size_t(2.1*N))
+      {
+        /* initialize array with random numbers and compute the reference total */
+        uint32_t sum0 = 0;
+        std::vector<uint32_t> src(N);
+        for (size_t i=0; i<N; i++) {
+          sum0 += src[i] = rand();
+        }
+
+        /* calculate parallel prefix sum */
+        std::vector<uint32_t> dst(N);
+        for (auto& v : dst) v = 0;
+        /* the identity is passed as uint32_t so that Value is deduced as uint32_t instead of int */
+        for (size_t i=0; i<M; i++) {
+          uint32_t sum1 = parallel_prefix_sum(src,dst,N,uint32_t(0),std::plus<uint32_t>());
+          passed &= (sum0 == sum1);
+        }
+
+        /* check if prefix sum is correct: dst[i] has to equal the sum of src[0..i-1] */
+        for (size_t i=0, sum=0; i<N; sum+=src[i++])
+          passed &= ((uint32_t)sum == dst[i]);
+      }
+
+      return passed;
+    }
+  };
+
+  parallel_prefix_sum_regression_test parallel_prefix_sum_regression("parallel_prefix_sum_regression");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h
new file mode 100644
index 0000000000..117c7a79b0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_prefix_sum.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  template<typename Value>
+    struct ParallelPrefixSumState 
+  {
+    enum { MAX_TASKS = 64 };   // upper bound on the task count used by parallel_prefix_sum
+    Value counts[MAX_TASKS];   // per-task result of the user function
+    Value sums  [MAX_TASKS];   // exclusive scan of counts: reduction of the counts of all preceding tasks
+  };
+
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction)
+  {
+    /* Runs func on contiguous slices of [first,last) in parallel, handing each task its state.sums
+       entry, stores the results in state.counts and then rewrites state.sums with the exclusive
+       scan of the counts. Returns the reduction over all counts. */
+    const size_t numThreads = TaskScheduler::threadCount();
+    const size_t numBlocks  = (last-first+minStepSize-1)/minStepSize;
+    const size_t numTasks   = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS));
+
+    parallel_for(numTasks, [&](const size_t taskIndex)
+    {
+      const size_t sliceBegin = first+(taskIndex+0)*(last-first)/numTasks;
+      const size_t sliceEnd   = first+(taskIndex+1)*(last-first)/numTasks;
+      state.counts[taskIndex] = func(range<size_t>(sliceBegin,sliceEnd),state.sums[taskIndex]);
+    });
+
+    /* exclusive scan over the per-task results */
+    Value sum = identity;
+    for (size_t t=0; t<numTasks; t++) {
+      const Value taskResult = state.counts[t];
+      state.sums[t] = sum;
+      sum = reduction(sum,taskResult);
+    }
+
+    return sum;
+  }
+
+  /*! parallel calculation of prefix sums: dst[i] receives the sum of src[0..i-1] (the identity for i=0), the sum of all N items is returned */
+  template<typename SrcArray, typename DstArray, typename Value, typename Add>
+    __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096) 
+  {
+    /* small inputs are scanned directly on the calling thread */
+    if (N < SINGLE_THREAD_THRESHOLD)
+    {
+      Value sum = identity;
+      for (size_t i=0; i<N; i++) {
+        dst[i] = sum;
+        sum = add(sum,src[i]);
+      }
+      return sum;
+    }
+
+    /* large inputs use two parallel passes that share this state */
+    ParallelPrefixSumState<Value> state;
+
+    /* pass 1 only totals each block; the helper turns the totals into per-block start values */
+    const auto blockTotal = [&](const range<size_t>& r, const Value& /*start*/) -> Value {
+      Value s = identity;
+      for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]);
+      return s;
+    };
+    parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, blockTotal, add);
+
+    /* pass 2 writes the prefix sums of each block, offset by the start value of that block */
+    const auto blockScan = [&](const range<size_t>& r, const Value& start) -> Value {
+      Value s = identity;
+      for (size_t i=r.begin(); i<r.end(); i++) {
+        dst[i] = add(start,s);
+        s = add(s,src[i]);
+      }
+      return s;
+    };
+
+    /* the second pass also yields the overall total */
+    return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, blockScan, add);
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp
new file mode 100644
index 0000000000..331fe4288e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.cpp
@@ -0,0 +1,49 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_reduce.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  struct parallel_reduce_regression_test : public RegressionTest
+  {
+    parallel_reduce_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+    /* Compares parallel_reduce against a serial sum of squares for growing range sizes N. */
+    bool run ()
+    {
+      bool passed = true;
+
+      const size_t M = 10; // number of repetitions per range size
+      for (size_t N=10; N<10000000; N=size_t(2.1*N))
+      {
+        /* sequentially calculate sum of squares */
+        size_t sum0 = 0;
+        for (size_t i=0; i<N; i++) {
+          sum0 += i*i;
+        }
+
+        /* parallel calculation of sum of squares */
+        for (size_t m=0; m<M; m++)
+        {
+          size_t sum1 = parallel_reduce( size_t(0), size_t(N), size_t(1024), size_t(0), [&](const range<size_t>& r) -> size_t
+          {
+            size_t s = 0;
+            for (size_t i=r.begin(); i<r.end(); i++) 
+              s += i*i;
+            return s;
+          }, 
+          [](const size_t v0, const size_t v1) {
+            return v0+v1;
+          });
+          passed &= (sum0 == sum1); // accumulate: a plain assignment would let a later success hide an earlier failure
+        }
+      }
+      return passed;
+    }
+  };
+
+  parallel_reduce_regression_test parallel_reduce_regression("parallel_reduce_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
new file mode 100644
index 0000000000..0daf94e50e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
@@ -0,0 +1,150 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_for.h"
+
+namespace embree
+{
+  /*! Sequential fallback: evaluates func once on [first,last); identity and reduction are not used. */
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction ) {
+    return func(range<Index>(first,last));
+  }
+
+  /*! Same fallback with a minStepSize parameter, which is ignored: forwards to the overload above. */
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction ) {
+    return sequential_reduce(first,last,identity,func,reduction);
+  }
+
+  /*! Runs func on at most min(taskCount,threadCount,512) contiguous chunks of [first,last) via parallel_for and folds the chunk results, in chunk order, with reduction starting from identity. minStepSize is not used here. */
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    const Index maxTasks = 512;
+    const Index threadCount = (Index) TaskScheduler::threadCount();
+    taskCount = min(taskCount,threadCount,maxTasks);
+
+    /* parallel invocation of all tasks; values[t] receives the partial result of chunk t */
+    dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
+    parallel_for(taskCount, [&](const Index taskIndex) {
+        const range<Index> chunk(first+(taskIndex+0)*(last-first)/taskCount, first+(taskIndex+1)*(last-first)/taskCount);
+        values[taskIndex] = func(chunk);
+      });
+
+    /* perform reduction over all tasks */
+    Value result = identity;
+    for (Index t=0; t<taskCount; t++) result = reduction(result,values[t]);
+    return result;
+  }
+  /*! Reduces func over [first,last): func is evaluated on sub-ranges and the partial results are merged with reduction, starting from identity. The backend is chosen at compile time by the TASKING_* macros. */
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+#if defined(TASKING_INTERNAL) || (defined(TASKING_GCD) && defined(BUILD_IOS))
+    /* backend 1: parallel_reduce_internal, which distributes chunks with parallel_for */
+    /* fast path for small number of iterations */
+    Index taskCount = (last-first+minStepSize-1)/minStepSize; // (last-first)/minStepSize, rounded up
+    if (likely(taskCount == 1)) {
+      return func(range<Index>(first,last)); // single step: func alone yields the result, reduction is not applied
+    }
+    return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction);
+
+#elif defined(TASKING_TBB)
+  #if TBB_INTERFACE_VERSION >= 12002
+    tbb::task_group_context context; // explicit context handed to tbb::parallel_reduce below
+    const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, // minStepSize is the third blocked_range argument
+      [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+      reduction,context);
+    // -- GODOT start --
+    // if (context.is_group_execution_cancelled())
+    //   throw std::runtime_error("task cancelled");
+    // -- GODOT end --
+    return v; // returned without the cancellation check, which is commented out in the GODOT block above
+  #else
+    const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity, // same call without an explicit context
+      [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+      reduction);
+    // -- GODOT start --
+    // if (tbb::task::self().is_cancelled())
+    //   throw std::runtime_error("task cancelled");
+    // -- GODOT end --
+    return v; // returned without the cancellation check, which is commented out in the GODOT block above
+  #endif
+#else // TASKING_PPL
+    struct AlignedValue // holds a copy of a Value inside raw storage that is aligned by hand
+    {
+      char storage[__alignof(Value)+sizeof(Value)]; // room for one Value plus worst-case alignment padding
+      static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); }; // rounds p up to a multiple of a (assumes a is a power of two)
+      Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
+      const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
+      AlignedValue(const Value& v) { new(getValuePtr()) Value(v); } // placement-new copy of v into the aligned slot
+      AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); }
+      AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); }; // "move" also copies
+      AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
+      AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
+      operator Value() const { return *getValuePtr(); } // NOTE(review): no destructor is declared, so the stored Value is never destroyed explicitly
+    };
+    /* iterator type that only wraps an Index; it is what concurrency::parallel_reduce is given as range bounds */
+    struct Iterator_Index
+    {
+      Index v;
+      typedef std::forward_iterator_tag iterator_category;
+      typedef AlignedValue value_type;
+      typedef Index difference_type;
+      typedef Index distance_type;
+      typedef AlignedValue* pointer;
+      typedef AlignedValue& reference;
+      __forceinline Iterator_Index() {} // leaves v uninitialized
+      __forceinline Iterator_Index(Index v) : v(v) {}
+      __forceinline bool operator== (Iterator_Index other) { return v == other.v; }
+      __forceinline bool operator!= (Iterator_Index other) { return v != other.v; }
+      __forceinline Iterator_Index operator++() { return Iterator_Index(++v); }
+      __forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); }
+    };
+    /* merges the result of func on [begin,end) into the running value 'start' */
+    auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) {
+      assert(begin.v < end.v);
+      return reduction(start, func(range<Index>(begin.v, end.v)));
+    };
+    const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction); // minStepSize is not used by this backend
+    return v;
+#endif
+  }
+  /*! Thresholded overload: ranges shorter than parallel_threshold are handled by one sequential call of func, all others are reduced in parallel. */
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    if (likely(last-first < parallel_threshold))
+      return func(range<Index>(first,last)); // small range: func alone yields the result, reduction is not applied
+    return parallel_reduce(first,last,minStepSize,identity,func,reduction);
+  }
+
+  /*! Same as above with the interval passed as a range<Index>. */
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction ) 
+  {
+    const Index first = range.begin(), last = range.end();
+    return parallel_reduce(first,last,minStepSize,parallel_threshold,identity,func,reduction);
+  }
+  /*! Per-index overload: func maps one index to a Value; every sub-range is folded with reduction, starting from identity. */
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
+  {
+    const auto foldRange = [&] ( const range<Index> sub ) {
+      Value acc = identity;
+      for (Index idx=sub.begin(); idx<sub.end(); idx++)
+        acc = reduction(acc,func(idx));
+      return acc;
+    };
+    return parallel_reduce(first,last,Index(1),identity,foldRange,reduction);
+  }
+
+  /*! Range overload: func receives whole sub-ranges (range<Index>); the minimal step size is fixed to 1. */
+  template<typename Index, typename Value, typename Func, typename Reduction>
+    __forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction ) {
+    return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction);
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp
new file mode 100644
index 0000000000..20b639c1c9
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_set.cpp
@@ -0,0 +1,43 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_set.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  /*! Regression test: a parallel_set must contain exactly the values it was built from. */
+  struct parallel_set_regression_test : public RegressionTest
+  {
+    /*! Forwards the name to RegressionTest and registers this instance. */
+    parallel_set_regression_test(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+
+    /*! Returns true if every inserted (even) value is found and none of the odd successors is. */
+    bool run ()
+    {
+      bool passed = true;
+      /* create vector with random even numbers; multiply as uint32_t because the int product 2*rand() overflows once rand() exceeds INT_MAX/2 */
+      const size_t N = 10000;
+      std::vector<uint32_t> unsorted(N);
+      for (size_t i=0; i<N; i++) unsorted[i] = uint32_t(2)*uint32_t(rand());
+
+      /* create set from numbers */
+      parallel_set<uint32_t> sorted;
+      sorted.init(unsorted);
+
+      /* check that all elements are in the set */
+      for (size_t i=0; i<N; i++)
+        passed &= sorted.lookup(unsorted[i]);
+
+      /* every stored value is even, so value+1 is odd and must not be found */
+      for (size_t i=0; i<N; i++)
+        passed &= !sorted.lookup(unsorted[i]+1);
+
+      return passed;
+    }
+  };
+
+  parallel_set_regression_test parallel_set_regression("parallel_set_regression_test");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_set.h b/thirdparty/embree-aarch64/common/algorithms/parallel_set.h
new file mode 100644
index 0000000000..640beba7ec
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_set.h
@@ -0,0 +1,52 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "parallel_sort.h"
+
+namespace embree
+{
+  /*! Set of values kept as a sorted vector: filled and sorted by init(), queried by binary search. */
+  template<typename T>
+  class parallel_set
+  {
+  public:
+
+    /*! creates an empty set */
+    parallel_set () {}
+
+    /*! creates a set holding the elements of the passed vector */
+    template<typename Vector>
+      parallel_set (const Vector& in) { init(in); }
+
+    /*! replaces the content of the set with the elements of the passed vector */
+    template<typename Vector>
+      void init(const Vector& in)
+    {
+      const size_t count = in.size();
+
+      /* copy data to internal vector */
+      vec.resize(count);
+      parallel_for( size_t(0), count, size_t(4*4096), [&](const range<size_t>& block) {
+        for (size_t k=block.begin(); k<block.end(); k++)
+          vec[k] = in[k];
+      });
+
+      /* sort the data; radix_sort is given a scratch buffer of the same length */
+      std::vector<T> scratch(count);
+      radix_sort<T>(vec.data(),scratch.data(),count);
+    }
+
+    /*! tests if some element is in the set */
+    __forceinline bool lookup(const T& elt) const {
+      return std::binary_search(vec.begin(), vec.end(), elt);
+    }
+
+    /*! removes all elements */
+    void clear() { vec.clear(); }
+
+  private:
+    std::vector<T> vec;   //!< vector containing sorted elements
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp
new file mode 100644
index 0000000000..5e7ec79ac1
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.cpp
@@ -0,0 +1,50 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "parallel_sort.h"
+#include "../sys/regression.h"
+
+namespace embree
+{
+  /*! Regression test: radix_sort must turn random input into an ascending sequence with unchanged checksum. */
+  template<typename Key>
+  struct RadixSortRegressionTest : public RegressionTest
+  {
+    RadixSortRegressionTest(const char* name) : RegressionTest(name) {
+      registerRegressionTest(this);
+    }
+
+    /*! Returns true if, for every tested size, the checksum is preserved and the result is ascending. */
+    bool run ()
+    {
+      bool passed = true;
+      const size_t M = 10;
+      for (size_t N=10; N<1000000; N=size_t(2.1*N))
+      {
+        /* zeroed input and scratch buffers; the input is then filled with products of two rand() values */
+        std::vector<Key> src(N); memset(src.data(),0,N*sizeof(Key));
+        std::vector<Key> tmp(N); memset(tmp.data(),0,N*sizeof(Key));
+        for (size_t k=0; k<N; k++) src[k] = uint64_t(rand())*uint64_t(rand());
+
+        /* calculate checksum before sorting */
+        Key sum0 = 0; for (size_t k=0; k<N; k++) sum0 += src[k];
+
+        /* sort numbers M times */
+        for (size_t round=0; round<M; round++)
+          radix_sort<Key>(src.data(),tmp.data(),N);
+
+        /* calculate checksum after sorting: it has to be unchanged */
+        Key sum1 = 0; for (size_t k=0; k<N; k++) sum1 += src[k];
+        passed &= (sum0 == sum1);
+
+        /* check if numbers are sorted */
+        for (size_t k=1; k<N; k++)
+          passed &= src[k-1] <= src[k];
+      }
+      return passed;
+    }
+  };
+
+  RadixSortRegressionTest<uint32_t> test_u32("RadixSortRegressionTestU32");
+  RadixSortRegressionTest<uint64_t> test_u64("RadixSortRegressionTestU64");
+}
diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h
new file mode 100644
index 0000000000..a758227c1b
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/algorithms/parallel_sort.h
@@ -0,0 +1,457 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../simd/simd.h"
+#include "parallel_for.h"
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+#include "../sys/alloc.h"
+#endif
+#include <algorithm>
+
+namespace embree
+{
+  /*! In-place insertion sort of array[0..length) into ascending order, comparing with operator< of T. */
+  template<class T>
+    __forceinline void insertionsort_ascending(T *__restrict__ array, const size_t length)
+  {
+    for (size_t next = 1; next < length; ++next)
+    {
+      /* array[0..next) is already sorted: shift larger elements one slot to the
+         right until the position of array[next] is found */
+      T key = array[next];
+      size_t slot = next;
+      for (; slot > 0 && key < array[slot-1]; --slot)
+        array[slot] = array[slot-1];
+      array[slot] = key;
+    }
+  }
+  
+  /*! In-place insertion sort of array[0..length) into descending order, comparing with operator> of T. */
+  template<class T>
+    __forceinline void insertionsort_decending(T *__restrict__ array, const size_t length)
+  {
+    for (size_t next = 1; next < length; ++next)
+    {
+      /* array[0..next) is already sorted: shift smaller elements one slot to the
+         right until the position of array[next] is found */
+      T key = array[next];
+      size_t slot = next;
+      for (; slot > 0 && key > array[slot-1]; --slot)
+        array[slot] = array[slot-1];
+      array[slot] = key;
+    }
+  }
+  
+  /*! Recursive quicksort of t[begin..end] (end inclusive) into ascending order; the first element of the range provides the pivot value. */
+  template<class T> 
+    void quicksort_ascending(T *__restrict__ t, 
+                             const ssize_t begin, 
+                             const ssize_t end)
+  {
+    if (likely(begin < end)) 
+    {      
+      const T pivotvalue = t[begin];
+      ssize_t left  = begin - 1;
+      ssize_t right = end   + 1;
+      /* two cursors move towards each other and exchange elements found on the wrong side of the pivot value */
+      while(1) 
+      {
+        while (t[--right] > pivotvalue);
+        while (t[++left] < pivotvalue);
+        if (left >= right) break;
+
+        const T temp = t[right];
+        t[right] = t[left];
+        t[left] = temp;
+      }
+      /* the split index stays ssize_t: storing it in an 'int' truncated indices beyond INT_MAX */
+      const ssize_t pivot = right;
+      quicksort_ascending(t, begin, pivot);
+      quicksort_ascending(t, pivot + 1, end);
+    }
+  }
+  
+  /*! Recursive quicksort of t[begin..end] (end inclusive) into descending order; the first element of the range provides the pivot value. */
+  template<class T> 
+    void quicksort_decending(T *__restrict__ t, 
+                             const ssize_t begin, 
+                             const ssize_t end)
+  {
+    if (likely(begin < end)) 
+    {
+      const T pivotvalue = t[begin];
+      ssize_t left  = begin - 1;
+      ssize_t right = end   + 1;
+      /* two cursors move towards each other and exchange elements found on the wrong side of the pivot value */
+      while(1) 
+      {
+        while (t[--right] < pivotvalue);
+        while (t[++left] > pivotvalue);
+        if (left >= right) break;
+
+        const T temp = t[right];
+        t[right] = t[left];
+        t[left] = temp;
+      }
+      /* the split index stays ssize_t: storing it in an 'int' truncated indices beyond INT_MAX */
+      const ssize_t pivot = right;
+      quicksort_decending(t, begin, pivot);
+      quicksort_decending(t, pivot + 1, end);
+    }
+  }
+  
+  
+  /*! Sorts t[begin..end] (end inclusive) into ascending order: quicksort that hands every
+   *  partition of at most THRESHOLD elements to insertionsort_ascending. */
+  template<class T, ssize_t THRESHOLD>
+    void quicksort_insertionsort_ascending(T *__restrict__ t,
+                                           const ssize_t begin,
+                                           const ssize_t end)
+  {
+    /* fewer than two elements: nothing to sort */
+    if (!likely(begin < end)) return;
+
+    /* small partitions are finished by insertion sort */
+    const ssize_t size = end-begin+1;
+    if (likely(size <= THRESHOLD)) {
+      insertionsort_ascending<T>(&t[begin],size);
+      return;
+    }
+
+    /* partition around the value of the first element: two cursors move towards
+       each other and exchange elements found on the wrong side */
+    const T pivotvalue = t[begin];
+    ssize_t left  = begin - 1;
+    ssize_t right = end   + 1;
+
+    for (;;)
+    {
+      do { --right; } while (t[right] > pivotvalue);
+      do { ++left;  } while (t[left]  < pivotvalue);
+      if (left >= right) break;
+      std::swap(t[left],t[right]);
+    }
+
+    /* 'right' is the last index of the lower partition */
+    const ssize_t split = right;
+    quicksort_insertionsort_ascending<T,THRESHOLD>(t, begin, split);
+    quicksort_insertionsort_ascending<T,THRESHOLD>(t, split + 1, end);
+  }
+  
+  
+  /*! Sorts t[begin..end] (end inclusive) into descending order: quicksort that hands every
+   *  partition of at most THRESHOLD elements to insertionsort_decending. */
+  template<class T, ssize_t THRESHOLD>
+    void quicksort_insertionsort_decending(T *__restrict__ t,
+                                           const ssize_t begin,
+                                           const ssize_t end)
+  {
+    /* fewer than two elements: nothing to sort */
+    if (!likely(begin < end)) return;
+
+    /* small partitions are finished by insertion sort */
+    const ssize_t size = end-begin+1;
+    if (likely(size <= THRESHOLD)) {
+      insertionsort_decending<T>(&t[begin],size);
+      return;
+    }
+
+    /* partition around the value of the first element: two cursors move towards
+       each other and exchange elements found on the wrong side */
+    const T pivotvalue = t[begin];
+    ssize_t left  = begin - 1;
+    ssize_t right = end   + 1;
+
+    for (;;)
+    {
+      do { --right; } while (t[right] < pivotvalue);
+      do { ++left;  } while (t[left]  > pivotvalue);
+      if (left >= right) break;
+      std::swap(t[left],t[right]);
+    }
+
+    /* 'right' is the last index of the first partition, which holds the
+       larger values in this descending variant */
+    const ssize_t split = right;
+    quicksort_insertionsort_decending<T,THRESHOLD>(t, begin, split);
+    quicksort_insertionsort_decending<T,THRESHOLD>(t, split + 1, end);
+  }
+  /*! Sorts morton[0..num) in place by the bits of unsigned(element): one 8-bit digit per recursion level, starting at bit 'shift' (default 24, i.e. the top byte of a 32-bit key) and descending to bit 0. */
+  template<typename T>
+    static void radixsort32(T* const morton, const size_t num, const unsigned int shift = 3*8)
+  {
+    static const unsigned int BITS = 8;                 // digit width in bits
+    static const unsigned int BUCKETS = (1 << BITS);    // one bucket per digit value
+    static const unsigned int CMP_SORT_THRESHOLD = 16;  // buckets smaller than this are finished by insertion sort
+    
+    __aligned(64) unsigned int count[BUCKETS];
+    
+    /* clear buckets */
+    for (size_t i=0;i<BUCKETS;i++) count[i] = 0;
+    
+    /* count buckets */
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+    for (size_t i=0;i<num;i++)
+      count[(unsigned(morton[i]) >> shift) & (BUCKETS-1)]++;
+    
+    /* prefix sums */
+    __aligned(64) unsigned int head[BUCKETS]; // head[b]: next slot of bucket b that is not yet final
+    __aligned(64) unsigned int tail[BUCKETS]; // tail[b]: one past the last slot of bucket b
+    
+    head[0] = 0;
+    for (size_t i=1; i<BUCKETS; i++)    
+      head[i] = head[i-1] + count[i-1];
+    
+    for (size_t i=0; i<BUCKETS-1; i++)    
+      tail[i] = head[i+1];
+    
+    tail[BUCKETS-1] = head[BUCKETS-1] + count[BUCKETS-1];
+    
+    assert(tail[BUCKETS-1] == head[BUCKETS-1] + count[BUCKETS-1]);      
+    assert(tail[BUCKETS-1] == num);      
+    
+    /* in-place swap */      
+    for (size_t i=0;i<BUCKETS;i++)
+    {
+      /* process bucket */
+      while(head[i] < tail[i])
+      {
+        T v = morton[head[i]]; // element currently occupying the next open slot of bucket i
+        while(1)
+        {
+          const size_t b = (unsigned(v) >> shift) & (BUCKETS-1); // bucket that v belongs to
+          if (b == i) break;
+          std::swap(v,morton[head[b]++]); // drop v into its own bucket and continue with the element displaced from there
+        }
+        assert((unsigned(v) >> shift & (BUCKETS-1)) == i);
+        morton[head[i]++] = v;
+      }
+    }
+    if (shift == 0) return; // lowest digit processed: nothing left to refine
+    
+    size_t offset = 0; // start of the current bucket inside morton
+    for (size_t i=0;i<BUCKETS;i++)
+      if (count[i])
+      {
+        /* debug check that the bucket holds digit i (the loop bound leaves out the last element of the bucket) */
+        for (size_t j=offset;j<offset+count[i]-1;j++)
+          assert(((unsigned(morton[j]) >> shift) & (BUCKETS-1)) == i);
+        /* small buckets: insertion sort (compares whole T values); otherwise recurse on the next lower digit */
+        if (unlikely(count[i] < CMP_SORT_THRESHOLD))
+          insertionsort_ascending(morton + offset, count[i]);
+        else
+          radixsort32(morton + offset, count[i], shift-BITS);
+        /* debug check that the bucket is now ascending */
+        for (size_t j=offset;j<offset+count[i]-1;j++)
+          assert(morton[j] <= morton[j+1]);
+        
+        offset += count[i];
+      }      
+  }    
+  /*! Sorts N elements of type Ty by the key obtained through a (Key) conversion: std::sort for small inputs, otherwise a multi-task radix sort that processes 8 key bits per pass and uses tmp as scratch buffer. */
+  template<typename Ty, typename Key>
+    class ParallelRadixSort
+  {
+    static const size_t MAX_TASKS = 64;         // upper bound for the number of tasks (and rows of radixCount)
+    static const size_t BITS = 8;               // key bits handled per pass
+    static const size_t BUCKETS = (1 << BITS);  // number of buckets per pass
+    typedef unsigned int TyRadixCount[BUCKETS]; // bucket counters of one task
+    /*! ordering used by the std::sort fallback: compares the (Key) conversions */
+    template<typename T>
+      static bool compare(const T& v0, const T& v1) {
+      return (Key)v0 < (Key)v1;
+    }
+
+  private:
+    ParallelRadixSort (const ParallelRadixSort& other) DELETED; // do not implement
+    ParallelRadixSort& operator= (const ParallelRadixSort& other) DELETED; // do not implement
+
+    
+  public:
+    ParallelRadixSort (Ty* const src, Ty* const tmp, const size_t N)
+      : radixCount(nullptr), src(src), tmp(tmp), N(N) {}
+    /*! Sorts src in place. Inputs of at most blockSize elements are sorted on the calling thread; larger ones are split into up to MAX_TASKS tasks. */
+    void sort(const size_t blockSize)
+    {
+      assert(blockSize > 0);
+      
+      /* perform single threaded sort for small N */
+      if (N<=blockSize) // handles also special case of 0!
+      {	  
+        /* do inplace sort inside destination array */
+        std::sort(src,src+N,compare<Ty>);
+      }
+      
+      /* perform parallel sort for large N */
+      else 
+      {
+        const size_t numThreads = min((N+blockSize-1)/blockSize,TaskScheduler::threadCount(),size_t(MAX_TASKS));
+        tbbRadixSort(numThreads);
+      }
+    }
+    /*! Releases the bucket counters; radixCount is still nullptr here when only the std::sort path ran (alignedFree is presumably null-safe - confirm). */
+    ~ParallelRadixSort()
+    {
+      alignedFree(radixCount); 
+      radixCount = nullptr;
+    }
+    
+  private:
+    /*! Counting pass: task threadIndex counts, for its slice of src, how many keys fall into each bucket of the digit at 'shift'. dst is not used. */
+    void tbbRadixIteration0(const Key shift, 
+                            const Ty* __restrict const src, 
+                            Ty* __restrict const dst, 
+                            const size_t threadIndex, const size_t threadCount)
+    {
+      const size_t startID = (threadIndex+0)*N/threadCount;
+      const size_t endID   = (threadIndex+1)*N/threadCount;
+      
+      /* mask to extract some number of bits */
+      const Key mask = BUCKETS-1;
+      
+      /* count how many items go into the buckets */
+      for (size_t i=0; i<BUCKETS; i++)
+        radixCount[threadIndex][i] = 0;
+
+      /* iterate over src array and count buckets */
+      unsigned int * __restrict const count = radixCount[threadIndex];
+#if defined(__INTEL_COMPILER)
+#pragma nounroll      
+#endif
+      for (size_t i=startID; i<endID; i++) {
+#if defined(__X86_64__) || defined(__aarch64__)
+        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
+#else
+        const Key index = ((Key)src[i] >> shift) & mask;
+#endif
+        count[index]++;
+      }
+    }
+    /*! Scatter pass: derives from the counters of all tasks where this task's keys of each bucket start in dst, then copies its slice of src there. */
+    void tbbRadixIteration1(const Key shift, 
+                            const Ty* __restrict const src, 
+                            Ty* __restrict const dst, 
+                            const size_t threadIndex, const size_t threadCount)
+    {
+      const size_t startID = (threadIndex+0)*N/threadCount;
+      const size_t endID   = (threadIndex+1)*N/threadCount;
+      
+      /* mask to extract some number of bits */
+      const Key mask = BUCKETS-1;
+      
+      /* calculate total number of items for each bucket */
+      __aligned(64) unsigned int total[BUCKETS];
+      /*
+      for (size_t i=0; i<BUCKETS; i++)
+        total[i] = 0;
+      */
+      for (size_t i=0; i<BUCKETS; i+=VSIZEX)
+        vintx::store(&total[i], zero);
+      
+      for (size_t i=0; i<threadCount; i++)
+      {
+        /*
+        for (size_t j=0; j<BUCKETS; j++)
+          total[j] += radixCount[i][j];
+        */
+        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
+          vintx::store(&total[j], vintx::load(&total[j]) + vintx::load(&radixCount[i][j]));
+      }
+      
+      /* calculate start offset of each bucket */
+      __aligned(64) unsigned int offset[BUCKETS];
+      offset[0] = 0;
+      for (size_t i=1; i<BUCKETS; i++)    
+        offset[i] = offset[i-1] + total[i-1];
+      
+      /* calculate start offset of each bucket for this thread */
+      for (size_t i=0; i<threadIndex; i++)
+      {
+        /*
+        for (size_t j=0; j<BUCKETS; j++)
+          offset[j] += radixCount[i][j];
+        */
+        for (size_t j=0; j<BUCKETS; j+=VSIZEX)
+          vintx::store(&offset[j], vintx::load(&offset[j]) + vintx::load(&radixCount[i][j]));
+      }
+      
+      /* copy items into their buckets */
+#if defined(__INTEL_COMPILER)
+#pragma nounroll
+#endif
+      for (size_t i=startID; i<endID; i++) {
+        const Ty elt = src[i];
+#if defined(__X86_64__) || defined(__aarch64__)
+        const size_t index = ((size_t)(Key)src[i] >> (size_t)shift) & (size_t)mask;
+#else
+        const size_t index = ((Key)src[i] >> shift) & mask;
+#endif
+        dst[offset[index]++] = elt;
+      }
+    }
+    /*! One pass for the digit at 'shift': counting over all tasks, then scattering over all tasks. 'last' is not used. The scatter needs every counter to be complete - presumably parallel_for_affinity returns only after all tasks finished (confirm). */
+    void tbbRadixIteration(const Key shift, const bool last,
+                           const Ty* __restrict src, Ty* __restrict dst,
+                           const size_t numTasks)
+    {
+      affinity_partitioner ap;
+      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration0(shift,src,dst,taskIndex,numTasks); },ap);
+      parallel_for_affinity(numTasks,[&] (size_t taskIndex) { tbbRadixIteration1(shift,src,dst,taskIndex,numTasks); },ap);
+    }
+    /*! Runs 4 passes (32-bit Key) or 8 passes (64-bit Key), lowest digit first, alternating between src and tmp so the result ends in src. Other key sizes are not handled: no pass runs. */
+    void tbbRadixSort(const size_t numTasks)
+    {
+      /* allocate the counters only once: allocating on every call leaked the previous buffer when sort() ran again on the same object */
+      if (radixCount == nullptr) radixCount = (TyRadixCount*) alignedMalloc(MAX_TASKS*sizeof(TyRadixCount),64);
+      if (sizeof(Key) == sizeof(uint32_t)) {
+        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(3*BITS,1,tmp,src,numTasks);
+      }
+      else if (sizeof(Key) == sizeof(uint64_t))
+      {
+        tbbRadixIteration(0*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(1*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(2*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(3*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(4*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(5*BITS,0,tmp,src,numTasks);
+        tbbRadixIteration(6*BITS,0,src,tmp,numTasks);
+        tbbRadixIteration(7*BITS,1,tmp,src,numTasks);
+      }
+    }
+    
+  private:
+    TyRadixCount* radixCount; // MAX_TASKS rows of bucket counters; nullptr until the parallel path allocates them
+    Ty* const src;            // array to sort; also receives the sorted result
+    Ty* const tmp;            // scratch array written by every other pass
+    const size_t N;           // number of elements in src
+  };
+  /*! Sorts src[0..N) in place with the elements themselves as keys; tmp is handed to the sorter as scratch buffer. */
+  template<typename Ty>
+    void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+    ParallelRadixSort<Ty,Ty> sorter(src,tmp,N);
+    sorter.sort(blockSize);
+  }
+  /*! Same as above, but the elements are ordered by their conversion to Key. */
+  template<typename Ty, typename Key>
+    void radix_sort(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+    ParallelRadixSort<Ty,Key> sorter(src,tmp,N);
+    sorter.sort(blockSize);
+  }
+  /*! Sorts src[0..N) in place, ordering the elements by their conversion to uint32_t. */
+  template<typename Ty>
+    void radix_sort_u32(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+    ParallelRadixSort<Ty,uint32_t>(src,tmp,N).sort(blockSize);
+  }
+  /*! Sorts src[0..N) in place, ordering the elements by their conversion to uint64_t. */
+  template<typename Ty>
+    void radix_sort_u64(Ty* const src, Ty* const tmp, const size_t N, const size_t blockSize = 8192) {
+    ParallelRadixSort<Ty,uint64_t>(src,tmp,N).sort(blockSize);
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/parsestream.h b/thirdparty/embree-aarch64/common/lexers/parsestream.h
new file mode 100644
index 0000000000..db46dc114f
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/parsestream.h
@@ -0,0 +1,101 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stringstream.h"
+#include "../sys/filename.h"
+#include "../math/vec2.h"
+#include "../math/vec3.h"
+#include "../math/col3.h"
+#include "../math/color.h"
+
+namespace embree
+{
+  /*! helper class for simple command line parsing: reads tokens and converts them to typed values */
+  class ParseStream : public Stream<std::string>
+  {
+  public:
+    /*! reads tokens from an existing string stream */
+    ParseStream (const Ref<Stream<std::string> >& cin) : cin(cin) {}
+
+    /*! reads tokens from a StringStream constructed over the passed character stream */
+    ParseStream (const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ",
+                 const std::string& endl = "", bool multiLine = false)
+      : cin(new StringStream(cin,seps,endl,multiLine)) {}
+
+  public:
+    ParseLocation location() { return cin->loc(); }
+    std::string next() { return cin->get(); }
+
+    /*! consumes one token and invokes THROW_RUNTIME_ERROR unless it equals the expected one */
+    void force(const std::string& next) {
+      const std::string token = getString();
+      if (token == next) return;
+      THROW_RUNTIME_ERROR("token \""+next+"\" expected but token \""+token+"\" found");
+    }
+
+    /* scalar accessors: each one consumes a single token */
+    std::string getString() { return get(); }
+    FileName getFileName() { return FileName(get()); }
+    int getInt () { return readInt(); }
+    float getFloat() { return readFloat(); }
+
+    /* vector accessors: components are read in separate statements so that tokens are consumed in component order */
+    Vec2i getVec2i() {
+      const int x = readInt();
+      const int y = readInt();
+      return Vec2i(x,y);
+    }
+
+    Vec3ia getVec3ia() {
+      const int x = readInt();
+      const int y = readInt();
+      const int z = readInt();
+      return Vec3ia(x,y,z);
+    }
+
+    Vec2f getVec2f() {
+      const float x = readFloat();
+      const float y = readFloat();
+      return Vec2f(x,y);
+    }
+
+    Vec3f getVec3f() {
+      const float x = readFloat();
+      const float y = readFloat();
+      const float z = readFloat();
+      return Vec3f(x,y,z);
+    }
+
+    Vec3fa getVec3fa() {
+      const float x = readFloat();
+      const float y = readFloat();
+      const float z = readFloat();
+      return Vec3fa(x,y,z);
+    }
+
+    Col3f getCol3f() {
+      const float x = readFloat();
+      const float y = readFloat();
+      const float z = readFloat();
+      return Col3f(x,y,z);
+    }
+
+    Color getColor() {
+      const float r = readFloat();
+      const float g = readFloat();
+      const float b = readFloat();
+      return Color(r,g,b);
+    }
+
+  private:
+    /*! consumes one token and converts it with atoi */
+    int readInt() { return atoi(get().c_str()); }
+
+    /*! consumes one token and converts it with atof, narrowed to float */
+    float readFloat() { return (float)atof(get().c_str()); }
+
+    Ref<Stream<std::string> > cin;
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/stream.h b/thirdparty/embree-aarch64/common/lexers/stream.h
new file mode 100644
index 0000000000..3f75677e68
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/stream.h
@@ -0,0 +1,215 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/ref.h"
+#include "../sys/filename.h"
+#include "../sys/string.h"
+
+#include <vector>
+#include <iostream>
+#include <cstdio>
+#include <string.h>
+
+namespace embree
+{
+  /*! stores the location of a stream element in the source */
+  class ParseLocation
+  {
+  public:
+    ParseLocation () : lineNumber(-1), colNumber(-1) {}
+    ParseLocation (std::shared_ptr<std::string> fileName, ssize_t lineNumber, ssize_t colNumber, ssize_t /*charNumber*/)
+      : fileName(fileName), lineNumber(lineNumber), colNumber(colNumber) {}
+
+    /*! formats the location as "<file> line <l> character <c>"; "unknown" stands in for a missing file name */
+    std::string str() const {
+      std::string text = fileName ? *fileName : std::string("unknown");
+      if (lineNumber < 0) return text;  // without a line number the column is not printed either
+      text += " line " + toString(lineNumber);
+      if (colNumber >= 0) text += " character " + toString(colNumber);
+      return text;
+    }
+
+  private:
+    std::shared_ptr<std::string> fileName;         /// name of the file (or stream) the token is from
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+  };
+
+  /*! a stream class templated over the stream elements */
+  template<typename T> class Stream : public RefCount
+  {
+    enum { BUF_SIZE = 1024 }; // capacity of the ring buffer of consumed + look-ahead elements
+    
+  private:
+    virtual T next() = 0;                  // implemented by subclasses: produce the next element
+    virtual ParseLocation location() = 0;  // implemented by subclasses: current source position
+    __forceinline std::pair<T,ParseLocation> nextHelper() {
+      ParseLocation l = location(); // sampled before next() so it is the position the element starts at
+      T v = next();
+      return std::pair<T,ParseLocation>(v,l);
+    }
+    __forceinline void push_back(const std::pair<T,ParseLocation>& v) {
+      if (past+future == BUF_SIZE) pop_front(); // buffer full: give up the oldest consumed element
+      size_t end = (start+past+future++)%BUF_SIZE; // slot just after the last look-ahead element
+      buffer[end] = v;
+    }
+    __forceinline void pop_front() {
+      if (past == 0) THROW_RUNTIME_ERROR("stream buffer empty"); // nothing consumed that could be discarded
+      start = (start+1)%BUF_SIZE; past--;
+    }
+  public:
+    Stream () : start(0), past(0), future(0), buffer(BUF_SIZE) {}
+    virtual ~Stream() {}
+    
+  public:
+    
+    const ParseLocation& loc() { // location of the element the next get()/peek() returns
+      if (future == 0) push_back(nextHelper());
+      return buffer[(start+past)%BUF_SIZE].second;
+    }
+    T get() { // returns the next element and consumes it
+      if (future == 0) push_back(nextHelper());
+      T t = buffer[(start+past)%BUF_SIZE].first;
+      past++; future--;
+      return t;
+    }
+    const T& peek() { // returns the next element without consuming it
+      if (future == 0) push_back(nextHelper());
+      return buffer[(start+past)%BUF_SIZE].first;
+    }
+    const T& unget(size_t n = 1) { // makes the last n consumed elements readable again
+      if (past < n) THROW_RUNTIME_ERROR ("cannot unget that many items");
+      past -= n; future += n;
+      return peek();
+    }
+    void drop() { // consumes the next element without returning it
+      if (future == 0) push_back(nextHelper());
+      past++; future--;
+    }
+  private:
+    size_t start,past,future; // ring start index, count of consumed elements, count of look-ahead elements
+    std::vector<std::pair<T,ParseLocation> > buffer;
+  };
+  
+  /*! wraps an iostream stream */
+  class StdStream : public Stream<int>
+  {
+  public:
+    StdStream (std::istream& cin, const std::string& name = "std::stream") // cin is kept by reference, not copied
+      : cin(cin), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {}
+    ~StdStream() {}
+    ParseLocation location() {
+      return ParseLocation(name,lineNumber,colNumber,charNumber);
+    }
+    int next() {
+      int c = cin.get();
+      if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; // '\r' leaves the column unchanged
+      charNumber++;
+      return c;
+    }
+  private:
+    std::istream& cin;
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+    ssize_t charNumber;           /// the character in the file
+    std::shared_ptr<std::string> name;             /// name of buffer
+  };
+
+  /*! creates a stream from a file */
+  class FileStream : public Stream<int>
+  {
+  public:
+
+    FileStream (FILE* file, const std::string& name = "file") // the handle is fclose'd by the destructor
+      : file(file), lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name))) {}
+
+    FileStream (const FileName& fileName) // opens the file in mode "r"; reports failure via THROW_RUNTIME_ERROR
+      : lineNumber(1), colNumber(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(fileName.str())))
+    {
+      file = fopen(fileName.c_str(),"r");
+      if (file == nullptr) THROW_RUNTIME_ERROR("cannot open file " + fileName.str());
+    }
+    ~FileStream() { if (file) fclose(file); }
+
+  public:
+    ParseLocation location() {
+      return ParseLocation(name,lineNumber,colNumber,charNumber);
+    }
+
+    int next() {
+      int c = fgetc(file);
+      if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++; // '\r' leaves the column unchanged
+      charNumber++;
+      return c;
+    }
+
+  private:
+    FILE* file;
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+    ssize_t charNumber;           /// the character in the file
+    std::shared_ptr<std::string> name;             /// name of buffer
+  };
+
+  /*! creates a stream from a string */
+  class StrStream : public Stream<int>
+  {
+  public:
+
+    StrStream (const char* str) // str is not copied: the NUL-terminated buffer must outlive the stream
+      : str(str), lineNumber(1), colNumber(0), charNumber(0) {}
+
+  public:
+    ParseLocation location() {
+      return ParseLocation(std::shared_ptr<std::string>(),lineNumber,colNumber,charNumber);
+    }
+
+    int next() {
+      int c = (unsigned char)str[charNumber]; // widen as unsigned so bytes >= 0x80 stay non-negative and never equal EOF
+      if (c == 0) return EOF; // the terminating NUL marks the end; the position is not advanced past it
+      if (c == '\n') { lineNumber++; colNumber = 0; } else if (c != '\r') colNumber++;
+      charNumber++;
+      return c;
+    }
+
+  private:
+    const char* str;
+    ssize_t lineNumber;           /// the line number the token is from
+    ssize_t colNumber;            /// the character number in the current line
+    ssize_t charNumber;           /// the character in the file
+  };
+
+  /*! creates a character stream from a command line */
+  class CommandLineStream : public Stream<int>
+  {
+  public:
+    CommandLineStream (int argc, char** argv, const std::string& name = "command line")
+      : i(0), j(0), charNumber(0), name(std::shared_ptr<std::string>(new std::string(name)))
+    {
+      if (argc > 0) { // offset starts past the program name (at most 1024 chars counted) plus one separator
+        for (size_t n=0; n<1024 && argv[0][n]; n++) charNumber++;
+        charNumber++;
+      }
+      for (ssize_t k=1; k<argc; k++) args.push_back(argv[k]);
+    }
+    ~CommandLineStream() {}
+  public:
+    ParseLocation location() {
+      return ParseLocation(name,0,charNumber,charNumber);
+    }
+    int next() {
+      if (i == args.size()) return EOF;
+      if (j == args[i].size()) { i++; j=0; charNumber++; return ' '; } // a single space is emitted after each argument
+      charNumber++;
+      return (unsigned char)args[i][j++]; // widen as unsigned so bytes >= 0x80 stay non-negative and never equal EOF
+    }
+  private:
+    size_t i,j;
+    std::vector<std::string> args;
+    ssize_t charNumber;           /// the character in the file
+    std::shared_ptr<std::string> name;             /// name of buffer
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/streamfilters.h b/thirdparty/embree-aarch64/common/lexers/streamfilters.h
new file mode 100644
index 0000000000..25580a77b8
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/streamfilters.h
@@ -0,0 +1,39 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+
+namespace embree
+{
+  /* removes all line comments from a stream */
+  class LineCommentFilter : public Stream<int>
+  {
+  public:
+    LineCommentFilter (const FileName& fileName, const std::string& lineComment)
+      : cin(new FileStream(fileName)), lineComment(lineComment) {}
+    LineCommentFilter (Ref<Stream<int> > cin, const std::string& lineComment)
+      : cin(cin), lineComment(lineComment) {}
+
+    ParseLocation location() { return cin->loc(); }
+
+    int next()
+    {
+      /* try to match the comment marker at the current position */
+      size_t matched = 0;
+      while (matched < lineComment.size() && cin->peek() == lineComment[matched]) {
+        cin->get(); matched++;
+      }
+      /* partial match: rewind; full match: skip up to (not including) '\n' or EOF */
+      if (matched < lineComment.size()) cin->unget(matched);
+      else while (cin->peek() != '\n' && cin->peek() != EOF) cin->get();
+
+      return cin->get();
+    }
+
+  private:
+    Ref<Stream<int> > cin;
+    std::string lineComment;
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp
new file mode 100644
index 0000000000..98dc80ad59
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp
@@ -0,0 +1,51 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "stringstream.h"
+
+namespace embree
+{
+  static const std::string stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\";
+  
+  /* creates map for fast categorization of characters */
+  static void createCharMap(bool map[256], const std::string& chrs) {
+    for (size_t slot=0; slot<256; slot++) map[slot] = false;   // clear the whole table first
+    for (const char ch : chrs) map[uint8_t(ch)] = true;         // then flag each listed character by byte value
+  }
+
+  /* simple tokenizer */
+  StringStream::StringStream(const Ref<Stream<int> >& cin, const std::string& seps, const std::string& endl, bool multiLine)
+    : cin(cin), endl(endl), multiLine(multiLine)
+  {
+    createCharMap(isSepMap,seps);
+    createCharMap(isValidCharMap,stringChars);
+  }
+
+  std::string StringStream::next()
+  {
+    /* phase 1: consume separators in front of the token */
+    for (int ahead = cin->peek(); ahead != EOF; ahead = cin->peek()) {
+      if (ahead == '\n' && endl != "") { cin->drop(); return endl; }
+      if (multiLine && ahead == '\\') {
+        cin->drop();
+        if (cin->peek() == '\n') { cin->drop(); continue; }
+        cin->unget();
+      }
+      if (!isSeparator(ahead)) break;
+      cin->drop();
+    }
+
+    /* phase 2: accumulate characters up to the next separator or EOF */
+    std::string word; word.reserve(64);
+    while (cin->peek() != EOF && !isSeparator(cin->peek())) {
+      const int c = cin->get();
+      // -- GODOT start --
+      // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
+      if (!isValidChar(c)) abort();
+      // -- GODOT end --
+      word.push_back((char)c);
+    }
+    /* every accepted character passed isValidChar, so the token never contains a NUL */
+    return word;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.h b/thirdparty/embree-aarch64/common/lexers/stringstream.h
new file mode 100644
index 0000000000..e6dbd4aecc
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/stringstream.h
@@ -0,0 +1,29 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+
+namespace embree
+{
+  /*! simple tokenizer that produces a string stream */
+  class StringStream : public Stream<std::string>
+  {
+  public:
+    StringStream(const Ref<Stream<int> >& cin, const std::string& seps = "\n\t\r ",
+                 const std::string& endl = "", bool multiLine = false);
+  public:
+    ParseLocation location() { return cin->loc(); }
+    std::string next();
+  private:
+    __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; }
+    __forceinline bool isValidChar(unsigned int c) const { return c<256 && isValidCharMap[c]; }
+  private:
+    Ref<Stream<int> > cin; /*! source character stream */
+    bool isSepMap[256];    /*! map for fast classification of separators */
+    bool isValidCharMap[256];  /*! map for valid characters */
+    std::string endl;      /*! the token of the end of line */
+    bool multiLine;        /*! whether to parse lines wrapped with \ */
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp b/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp
new file mode 100644
index 0000000000..d05be65862
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/tokenstream.cpp
@@ -0,0 +1,181 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tokenstream.h"
+#include "../math/math.h"
+
+namespace embree
+{
+  /* shorthands for common sets of characters */
+  const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz";
+  const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+  const std::string TokenStream::numbers = "0123456789";
+  const std::string TokenStream::separators = "\n\t\r ";
+  const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\";
+
+  /* creates map for fast categorization of characters */
+  static void createCharMap(bool map[256], const std::string& chrs) {
+    for (size_t i=0; i<256; i++) map[i] = false; // clear all 256 entries
+    for (size_t i=0; i<chrs.size(); i++) map[uint8_t(chrs[i])] = true; // index by unsigned byte value so the index stays within 0..255
+  }
+
+  /* build full tokenizer that takes list of valid characters and keywords */
+  TokenStream::TokenStream(const Ref<Stream<int> >& cin,            //< stream to read from
+                                   const std::string& alpha,                //< valid characters for identifiers
+                                   const std::string& seps,                 //< characters that act as separators
+                                   const std::vector<std::string>& symbols) //< symbols
+    : cin(cin), symbols(symbols)
+  {
+    createCharMap(isAlphaMap,alpha);
+    createCharMap(isSepMap,seps);
+    createCharMap(isStringCharMap,stringChars);
+  }
+
+  bool TokenStream::decDigits(std::string& str_o)
+  {
+    std::string text; /* optional sign, then at least one digit */
+    const int sign = cin->peek();
+    if (sign == '+' || sign == '-') text += (char)cin->get();
+    const size_t signLen = text.size();
+    while (isDigit(cin->peek())) text += (char)cin->get();
+    if (text.size() == signLen) { cin->unget(signLen); return false; } /* no digit: rewind the sign, if any */
+    str_o += text; return true;
+  }
+
+  bool TokenStream::decDigits1(std::string& str_o)
+  {
+    bool ok = false;
+    std::string str;
+    while (isDigit(cin->peek())) { ok = true; str += (char)cin->get(); }
+    if (ok) str_o += str; else cin->unget(str.size());
+    return ok;
+  }
+
+  bool TokenStream::trySymbol(const std::string& symbol)
+  {
+    /* match character by character; any mismatch rewinds what was consumed so far */
+    for (size_t matched = 0; matched < symbol.size(); matched++) {
+      if (cin->peek() != symbol[matched]) { cin->unget(matched); return false; }
+      cin->drop();
+    }
+    return true;
+  }
+
+  bool TokenStream::trySymbols(Token& token, const ParseLocation& loc)
+  {
+    for (const std::string& candidate : symbols) {  /* tried in list order; the first match wins */
+      if (!trySymbol(candidate)) continue;
+      token = Token(candidate,Token::TY_SYMBOL,loc);
+      return true;
+    }
+    return false;
+  }
+
+  bool TokenStream::tryFloat(Token& token, const ParseLocation& loc)
+  { // parses nan, +inf, -inf, or a decimal number that has a '.' and/or an exponent
+    bool ok = false; // set only once a complete float form has been read
+    std::string str;
+    if (trySymbol("nan")) {
+      token = Token(float(nan),loc); // keep the source location, like every other token
+      return true;
+    }
+    if (trySymbol("+inf")) {
+      token = Token(float(pos_inf),loc);
+      return true;
+    }
+    if (trySymbol("-inf")) {
+      token = Token(float(neg_inf),loc);
+      return true;
+    }
+
+    if (decDigits(str)) // forms that start with (optionally signed) digits
+    {
+      if (cin->peek() == '.') {
+        str += (char)cin->get();
+        decDigits(str);
+        if (cin->peek() == 'e' || cin->peek() == 'E') {
+          str += (char)cin->get();
+          if (decDigits(str)) ok = true; // 1.[2]E2
+        }
+        else ok = true; // 1.[2]
+      }
+      else if (cin->peek() == 'e' || cin->peek() == 'E') {
+        str += (char)cin->get();
+        if (decDigits(str)) ok = true; // 1E2
+      }
+    }
+    else // forms that start with '.'
+    {
+      if (cin->peek() == '.') {
+        str += (char)cin->get();
+        if (decDigits(str)) {
+          if (cin->peek() == 'e' || cin->peek() == 'E') {
+            str += (char)cin->get();
+            if (decDigits(str)) ok = true; // .3E2
+          }
+          else ok = true; // .3
+        }
+      }
+    }
+    if (ok) {
+      token = Token((float)atof(str.c_str()),loc);
+    }
+    else cin->unget(str.size()); // not a float (e.g. a plain integer): push everything read back
+    return ok;
+  }
+
+  bool TokenStream::tryInt(Token& token, const ParseLocation& loc) {
+    /* an optionally signed run of decimal digits, converted with atoi */
+    std::string digits;
+    const bool found = decDigits(digits);
+    if (!found) return false;
+    token = Token(atoi(digits.c_str()),loc);
+    return true;
+  }
+
+  bool TokenStream::tryString(Token& token, const ParseLocation& loc)
+  {
+    std::string str;
+    if (cin->peek() != '\"') return false; // not a string literal: nothing has been consumed
+    cin->drop(); // opening quote
+    while (cin->peek() != '\"') {
+      const int c = cin->get();
+      if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character "+std::string(1,c)+" at "+loc.str()); // also reached when the input ends before the closing quote, since EOF is not a string character
+      str += (char)c;
+    }
+    cin->drop(); // closing quote
+    token = Token(str,Token::TY_STRING,loc);
+    return true;
+  }
+
+  bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc)
+  {
+    /* first character must be from the identifier alphabet, later ones may also be digits */
+    if (!isAlpha(cin->peek())) return false;
+    std::string name(1,(char)cin->get());
+    while (isAlphaNum(cin->peek())) name.push_back((char)cin->get());
+    token = Token(name,Token::TY_IDENTIFIER,loc);
+    return true;
+  }
+
+  void TokenStream::skipSeparators()
+  {
+    /* drop characters until a non-separator or the end of the stream is next */
+    for (int ahead = cin->peek(); ahead != EOF && isSeparator(ahead); ahead = cin->peek())
+      cin->drop();
+  }
+
+  Token TokenStream::next()
+  {
+    Token token; // filled in by whichever try* helper succeeds first; the order below is the precedence
+    skipSeparators();
+    ParseLocation loc = cin->loc(); // position of the first character after the skipped separators
+    if (trySymbols   (token,loc)) return token;      /**< try to parse a symbol */
+    if (tryFloat     (token,loc)) return token;      /**< try to parse float */
+    if (tryInt       (token,loc)) return token;      /**< try to parse integer */
+    if (tryString    (token,loc)) return token;      /**< try to parse string */
+    if (tryIdentifier(token,loc)) return token;      /**< try to parse identifier */
+    if (cin->peek() == EOF  )     return Token(loc); /**< return EOF token */
+    return Token((char)cin->get(),loc);              /**< return invalid character token */
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/lexers/tokenstream.h b/thirdparty/embree-aarch64/common/lexers/tokenstream.h
new file mode 100644
index 0000000000..72a7b4f2f3
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/lexers/tokenstream.h
@@ -0,0 +1,164 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "stream.h"
+#include <string>
+#include <vector>
+
+namespace embree
+{
+  /*! token class */
+  class Token
+  {
+  public:
+
+    enum Type { TY_EOF, TY_CHAR, TY_INT, TY_FLOAT, TY_IDENTIFIER, TY_STRING, TY_SYMBOL };
+
+    Token (        const ParseLocation& loc = ParseLocation()) : ty(TY_EOF  ),       loc(loc) {}
+    Token (char c, const ParseLocation& loc = ParseLocation()) : ty(TY_CHAR ), c(c), loc(loc) {}
+    Token (int i,  const ParseLocation& loc = ParseLocation()) : ty(TY_INT  ), i(i), loc(loc) {}
+    Token (float f,const ParseLocation& loc = ParseLocation()) : ty(TY_FLOAT), f(f), loc(loc) {}
+    Token (std::string str, Type ty, const ParseLocation& loc = ParseLocation()) : ty(ty),   str(str), loc(loc) {}
+
+    static Token Eof()                { return Token(); }
+    static Token Sym(std::string str) { return Token(str,TY_SYMBOL); }
+    static Token Str(std::string str) { return Token(str,TY_STRING); }
+    static Token Id (std::string str) { return Token(str,TY_IDENTIFIER); }
+
+    char Char() const {
+      if (ty == TY_CHAR) return c;
+      THROW_RUNTIME_ERROR(loc.str()+": character expected");
+    }
+
+    int Int() const {
+      if (ty == TY_INT) return i;
+      THROW_RUNTIME_ERROR(loc.str()+": integer expected");
+    }
+
+    float Float(bool cast = true)  const {
+      if (ty == TY_FLOAT) return f;
+      if (ty == TY_INT && cast) return (float)i;
+      THROW_RUNTIME_ERROR(loc.str()+": float expected");
+    }
+
+    std::string Identifier() const {
+      if (ty == TY_IDENTIFIER) return str;
+      THROW_RUNTIME_ERROR(loc.str()+": identifier expected");
+    }
+
+    std::string String() const {
+      if (ty == TY_STRING) return str;
+      THROW_RUNTIME_ERROR(loc.str()+": string expected");
+    }
+
+    std::string Symbol() const {
+      if (ty == TY_SYMBOL) return str;
+      THROW_RUNTIME_ERROR(loc.str()+": symbol expected");
+    }
+
+    const ParseLocation& Location() const { return loc; }
+
+    friend bool operator==(const Token& a, const Token& b)
+    {
+      if (a.ty != b.ty) return false;
+      if (a.ty == TY_CHAR) return a.c == b.c;
+      if (a.ty == TY_INT) return a.i == b.i;
+      if (a.ty == TY_FLOAT) return a.f == b.f;
+      if (a.ty == TY_IDENTIFIER) return a.str == b.str;
+      if (a.ty == TY_STRING) return a.str == b.str;
+      if (a.ty == TY_SYMBOL) return a.str == b.str;
+      return true;
+    }
+
+    friend bool operator!=(const Token& a, const Token& b) {
+      return !(a == b);
+    }
+
+    friend bool operator <( const Token& a, const Token& b ) {
+      if (a.ty != b.ty) return (int)a.ty < (int)b.ty;
+      if (a.ty == TY_CHAR) return a.c < b.c;
+      if (a.ty == TY_INT) return a.i < b.i;
+      if (a.ty == TY_FLOAT) return a.f < b.f;
+      if (a.ty == TY_IDENTIFIER) return a.str < b.str;
+      if (a.ty == TY_STRING) return a.str < b.str;
+      if (a.ty == TY_SYMBOL) return a.str < b.str;
+      return false;
+    }
+
+    friend std::ostream& operator<<(std::ostream& cout, const Token& t)
+    {
+      if (t.ty == TY_EOF) return cout << "eof";
+      if (t.ty == TY_CHAR) return cout << "Char(" << t.c << ")";
+      if (t.ty == TY_INT) return cout << "Int(" << t.i << ")";
+      if (t.ty == TY_FLOAT) return cout << "Float(" << t.f << ")";
+      if (t.ty == TY_IDENTIFIER) return cout << "Id(" << t.str << ")";
+      if (t.ty == TY_STRING) return cout << "String(" << t.str << ")";
+      if (t.ty == TY_SYMBOL) return cout << "Symbol(" << t.str << ")";
+      return cout << "unknown";
+    }
+
+  private:
+    Type ty;            //< the type of the token
+    union {
+      char c;           //< data for char tokens
+      int i;            //< data for int tokens
+      float f;          //< data for float tokens
+    };
+    std::string str;    //< data for string and identifier tokens
+    ParseLocation loc;  //< the location the token is from
+  };
+
+  /*! build full tokenizer that takes list of valid characters and keywords */
+  class TokenStream : public Stream<Token>
+  {
+  public:
+
+    /*! shorthands for common sets of characters */
+    static const std::string alpha;
+    static const std::string ALPHA;
+    static const std::string numbers;
+    static const std::string separators;
+    static const std::string stringChars;
+
+  public:
+    TokenStream(const Ref<Stream<int> >& cin,
+                const std::string& alpha, //< valid characters for identifiers
+                const std::string& seps,  //< characters that act as separators
+                const std::vector<std::string>& symbols = std::vector<std::string>()); //< symbols
+  public:
+    ParseLocation location() { return cin->loc(); }
+    Token next();
+    bool trySymbol(const std::string& symbol);
+
+  private:
+    void skipSeparators();
+    bool decDigits(std::string& str);
+    bool decDigits1(std::string& str);
+    bool trySymbols(Token& token, const ParseLocation& loc);
+    bool tryFloat(Token& token, const ParseLocation& loc);
+    bool tryInt(Token& token, const ParseLocation& loc);
+    bool tryString(Token& token, const ParseLocation& loc);
+    bool tryIdentifier(Token& token, const ParseLocation& loc);
+
+    Ref<Stream<int> > cin;
+    bool isSepMap[256];
+    bool isAlphaMap[256];
+    bool isStringCharMap[256];
+    std::vector<std::string> symbols;
+
+    /*! checks if a character is a separator */
+    __forceinline bool isSeparator(unsigned int c) const { return c<256 && isSepMap[c]; }
+
+    /*! checks if a character is a number */
+    __forceinline bool isDigit(unsigned int c) const {  return c >= '0' && c <= '9'; }
+
+    /*! checks if a character is valid inside a string */
+    __forceinline bool isStringChar(unsigned int c) const { return c<256 && isStringCharMap[c]; }
+
+    /*! checks if a character is legal for an identifier */
+    __forceinline bool isAlpha(unsigned int c) const {  return c<256 && isAlphaMap[c];  }
+    __forceinline bool isAlphaNum(unsigned int c) const { return isAlpha(c) || isDigit(c); }
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/math/AVX2NEON.h b/thirdparty/embree-aarch64/common/math/AVX2NEON.h
new file mode 100644
index 0000000000..e8698ac56d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/AVX2NEON.h
@@ -0,0 +1,986 @@
+#pragma once
+
+#include "SSE2NEON.h"
+
+
+#define AVX2NEON_ABI static inline  __attribute__((always_inline))
+
+
+struct __m256d;
+
+struct __m256 {
+    __m128 lo,hi;
+    __m256() {}
+};
+
+
+
+
+struct __m256i {
+    __m128i lo,hi;
+    explicit __m256i(const __m256 a) : lo(__m128i(a.lo)),hi(__m128i(a.hi)) {}
+    operator __m256() const {__m256 res; res.lo = __m128(lo);res.hi = __m128(hi); return res;}
+    __m256i() {}
+};
+ 
+
+
+
+struct __m256d {
+    float64x2_t lo,hi;
+    __m256d() {}
+    __m256d(const __m256& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
+    __m256d(const __m256i& a) : lo(float64x2_t(a.lo)),hi(float64x2_t(a.hi)) {}
+};
+
+#define UNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a) {type res;res.lo=basic_func(a.lo);res.hi=basic_func(a.hi);return res;}
+
+
+#define BINARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=basic_func(a.lo,b.lo);res.hi=basic_func(a.hi,b.hi);return res;}
+#define BINARY_AVX_OP_CAST(type,func,basic_func,bdst,bsrc) AVX2NEON_ABI type func(const type& a,const type& b) {type res;res.lo=bdst(basic_func(bsrc(a.lo),bsrc(b.lo)));res.hi=bdst(basic_func(bsrc(a.hi),bsrc(b.hi)));return res;}
+
+#define TERNARY_AVX_OP(type,func,basic_func) AVX2NEON_ABI type func(const type& a,const type& b,const type& c) {type res;res.lo=basic_func(a.lo,b.lo,c.lo);res.hi=basic_func(a.hi,b.hi,c.hi);return res;}
+
+
+#define CAST_SIMD_TYPE(to,name,from,basic_dst) AVX2NEON_ABI to name(const from& a) { to res; res.lo = basic_dst(a.lo); res.hi=basic_dst(a.hi); return res;}
+
+
+
+#define _mm_stream_load_si128 _mm_load_si128
+#define _mm256_stream_load_si256 _mm256_load_si256
+
+
+AVX2NEON_ABI
+__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8)
+{
+    // Lane-wise select controlled by the low four bits of imm8:
+    //   bit i set   -> lane i comes from b
+    //   bit i clear -> lane i comes from a
+    // Implemented as a scalar per-lane loop.
+    __m128 blended;
+    for (int lane=0;lane<4;lane++)
+    {
+        const int bit = 1<<lane;
+        const bool take_b = (imm8 & bit) != 0;
+        blended[lane] = take_b ? b[lane] : a[lane];
+    }
+
+    return blended;
+}
+
+AVX2NEON_ABI
+__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)
+{
+    __m128i res;
+    for (int i=0;i<4;i++)
+    {
+        if (imm8 & (1<<i))
+        {
+            res[i] = b[i];
+        }
+        else{
+            res[i] = a[i];
+        }
+    }
+    return res;
+}
+
+AVX2NEON_ABI
+__m128 _mm_cmpngt_ps (__m128 a, __m128 b)
+{
+    return __m128(vmvnq_s32(__m128i(_mm_cmpgt_ps(a,b))));
+}
+
+
+AVX2NEON_ABI
+__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
+{
+    int64x2_t y;
+    y[0] = *(int64_t *)mem_addr;
+    y[1] = 0;
+    return __m128i(y);
+}
+
+AVX2NEON_ABI
+int _mm_movemask_popcnt(__m128 a)
+{
+    return __builtin_popcount(_mm_movemask_ps(a));
+}
+
+AVX2NEON_ABI
+__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
+{
+    // A lane is loaded only when the top bit of its mask element is set; other lanes become 0
+    // and their memory is never read.
+    __m128 loaded;
+    for (int lane=0;lane<4;lane++) loaded[lane] = (mask[lane] & 0x80000000) ? mem_addr[lane] : 0;
+    return loaded;
+}
+
+AVX2NEON_ABI
+void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
+{
+    for (int i=0;i<4;i++) {
+        if (mask[i] & 0x80000000) mem_addr[i] = a[i];
+    }
+}
+
+AVX2NEON_ABI
+void _mm_maskstore_epi32 (int * mem_addr, __m128i mask, __m128i a)
+{
+    for (int i=0;i<4;i++) {
+        if (mask[i] & 0x80000000) mem_addr[i] = a[i];
+    }
+}
+
+AVX2NEON_ABI
+__m128 _mm_fnmsub_ps (__m128 a, __m128 b, __m128 c)
+{
+    return vnegq_f32(vfmaq_f32(c,a,b)); // vfmaq gives c + a*b; negated: -(a*b) - c
+}
+
+#define _mm_fnmsub_ss _mm_fnmsub_ps
+
+AVX2NEON_ABI
+__m128 _mm_fnmadd_ps (__m128 a, __m128 b, __m128 c)
+{
+    return vfmsq_f32(c,a,b); // vfmsq gives c - a*b, i.e. -(a*b) + c
+}
+
+#define _mm_fnmadd_ss _mm_fnmadd_ps
+
+
+AVX2NEON_ABI
+__m128 _mm_broadcast_ss (float const * mem_addr)
+{
+    return vdupq_n_f32(*mem_addr);
+}
+
+
+AVX2NEON_ABI
+__m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c)
+{
+    return vfmaq_f32(vnegq_f32(c),a,b); // (-c) + a*b, i.e. a*b - c
+}
+
+#define _mm_fmsub_ss _mm_fmsub_ps
+#define _mm_fmadd_ps _mm_madd_ps
+#define _mm_fmadd_ss _mm_madd_ps
+
+
+
+template<int code>
+AVX2NEON_ABI float32x4_t dpps_neon(const float32x4_t& a,const float32x4_t& b)
+{
+    float v;
+    v = 0;
+    v += (code & 0x10) ? a[0]*b[0] : 0;
+    v += (code & 0x20) ? a[1]*b[1] : 0;
+    v += (code & 0x40) ? a[2]*b[2] : 0;
+    v += (code & 0x80) ? a[3]*b[3] : 0;
+    float32x4_t res;
+    res[0] = (code & 0x1) ? v : 0;
+    res[1] = (code & 0x2) ? v : 0;
+    res[2] = (code & 0x4) ? v : 0;
+    res[3] = (code & 0x8) ? v : 0;
+    return res;
+}
+
+template<>
+inline float32x4_t dpps_neon<0x7f>(const float32x4_t& a,const float32x4_t& b)
+{
+    float v;
+    float32x4_t m = _mm_mul_ps(a,b);
+    m[3] = 0;
+    v = vaddvq_f32(m);
+    return _mm_set1_ps(v);
+}
+
+template<>
+inline float32x4_t dpps_neon<0xff>(const float32x4_t& a,const float32x4_t& b)
+{
+    float v;
+    float32x4_t m = _mm_mul_ps(a,b);
+    v = vaddvq_f32(m);
+    return _mm_set1_ps(v);
+}
+
+#define _mm_dp_ps(a,b,c) dpps_neon<c>((a),(b))
+
+
+
+AVX2NEON_ABI
+__m128 _mm_cmpnge_ps (__m128 a, __m128 b)
+{
+    return __m128(vmvnq_s32(__m128i(_mm_cmpge_ps(a,b))));
+}
+
+
+AVX2NEON_ABI
+__m128 _mm_permutevar_ps (__m128 a, __m128i b)
+{
+    // Result lane i is a[sel], where sel is the low two bits of control element b[i].
+    // The mask is applied to the control value (not the loop index) so sel is always 0..3.
+    __m128 x;
+    for (int i=0;i<4;i++)
+        x[i] = a[b[i] & 3];
+    return x;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_setzero_si256()
+{
+    __m256i res;
+    res.lo = res.hi = vdupq_n_s32(0);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_setzero_ps()
+{
+    __m256 res;
+    res.lo = res.hi = vdupq_n_f32(0.0f);
+    return res;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_undefined_si256()
+{
+    return _mm256_setzero_si256();
+}
+
+AVX2NEON_ABI
+__m256 _mm256_undefined_ps()
+{
+    return _mm256_setzero_ps();
+}
+
+CAST_SIMD_TYPE(__m256d,_mm256_castps_pd,__m256,float64x2_t)
+CAST_SIMD_TYPE(__m256i,_mm256_castps_si256,__m256,__m128i)
+CAST_SIMD_TYPE(__m256, _mm256_castsi256_ps, __m256i,__m128)
+CAST_SIMD_TYPE(__m256, _mm256_castpd_ps ,__m256d,__m128)
+CAST_SIMD_TYPE(__m256d, _mm256_castsi256_pd, __m256i,float64x2_t)
+CAST_SIMD_TYPE(__m256i, _mm256_castpd_si256, __m256d,__m128i)
+
+
+
+
+AVX2NEON_ABI
+__m128 _mm256_castps256_ps128 (__m256 a)
+{
+    return a.lo;
+}
+
+AVX2NEON_ABI
+__m256i _mm256_castsi128_si256 (__m128i a)
+{
+    __m256i res;
+    res.lo = a ;
+    res.hi = vdupq_n_s32(0);
+    return res;
+}
+
+AVX2NEON_ABI
+__m128i _mm256_castsi256_si128 (__m256i a)
+{
+    return a.lo;
+}
+
+AVX2NEON_ABI
+__m256 _mm256_castps128_ps256 (__m128 a)
+{
+    __m256 res;
+    res.lo = a;
+    res.hi = vdupq_n_f32(0);
+    return res;
+}
+
+
+AVX2NEON_ABI
+__m256 _mm256_broadcast_ss (float const * mem_addr)
+{
+    __m256 res;
+    res.lo = res.hi = vdupq_n_f32(*mem_addr);
+    return res;
+}
+
+
+
+AVX2NEON_ABI
+__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
+{
+    __m128i lo = {e0,e1,e2,e3}, hi = {e4,e5,e6,e7};
+    __m256i res;
+    res.lo = lo; res.hi = hi;
+    return res;
+    
+}
+
+AVX2NEON_ABI
+__m256i _mm256_set1_epi32 (int a)
+{
+    __m256i res;
+    res.lo = res.hi = vdupq_n_s32(a);
+    return res;
+}
+
+
+
+
+AVX2NEON_ABI
+int _mm256_movemask_ps(const __m256& v)
+{
+    return (_mm_movemask_ps(v.hi) << 4) | _mm_movemask_ps(v.lo); // hi's mask sits above lo's; assumes _mm_movemask_ps yields a 4-bit value (confirm in SSE2NEON.h)
+}
+
+template<int imm8>
+AVX2NEON_ABI
+__m256 __mm256_permute_ps (const __m256& a)
+{
+    __m256 res;
+    res.lo = _mm_shuffle_ps(a.lo,a.lo,imm8);
+    res.hi = _mm_shuffle_ps(a.hi,a.hi,imm8);
+    return res;
+
+}
+
+#define _mm256_permute_ps(a,c) __mm256_permute_ps<c>(a)
+
+
+// Shuffles a and b half by half: each 128-bit half of the result is
+// _mm_shuffle_ps of the corresponding halves of a and b with control imm8.
+template<int imm8>
+AVX2NEON_ABI
+__m256 __mm256_shuffle_ps (const __m256 a,const __m256& b)
+{
+    const __m128 low_half  = _mm_shuffle_ps(a.lo,b.lo,imm8);
+    const __m128 high_half = _mm_shuffle_ps(a.hi,b.hi,imm8);
+    __m256 shuffled;
+    shuffled.lo = low_half;
+    shuffled.hi = high_half;
+    return shuffled;
+}
+
+// Intrinsic-style spelling: the control value becomes the template argument.
+#define _mm256_shuffle_ps(a,b,c) __mm256_shuffle_ps<c>(a,b)
+
+// Replicates the 64-bit integer a into all four 64-bit lanes.
+AVX2NEON_ABI
+__m256i _mm256_set1_epi64x (long long a)
+{
+    const __m128i splat = __m128i(vdupq_n_s64(a));
+    __m256i res;
+    res.hi = splat;
+    res.lo = splat;
+    return res;
+}
+
+
+// Picks one 128-bit half for _mm256_permute2f128_ps from a 4-bit control
+// field: bits [1:0] choose a.lo, a.hi, b.lo or b.hi, and bit 3 forces a zero
+// result. Bit 2 is ignored, as in the x86 definition; the previous code
+// switched on (control & 0x7) and left the result uninitialised when bit 2
+// was set.
+AVX2NEON_ABI
+__m128 _mm256_permute2f128_ps_select (const __m256& a, const __m256& b, int control)
+{
+    if (control & 0x8)
+        return _mm_setzero_ps();
+
+    switch (control & 0x3)
+    {
+        case 0:  return a.lo;
+        case 1:  return a.hi;
+        case 2:  return b.lo;
+        default: return b.hi;
+    }
+}
+
+// Builds a 256-bit vector from 128-bit halves of a and b.
+// imm8[3:0] controls the low half of the result, imm8[7:4] the high half.
+AVX2NEON_ABI
+__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
+{
+    __m256 res;
+    res.lo = _mm256_permute2f128_ps_select(a,b,imm8);
+    res.hi = _mm256_permute2f128_ps_select(a,b,imm8 >> 4);
+    return res;
+}
+
+// Duplicates the even-indexed floats of each half: lanes 0 and 1 receive
+// lane 0, lanes 2 and 3 receive lane 2.
+AVX2NEON_ABI
+__m256 _mm256_moveldup_ps (__m256 a)
+{
+    __m256 res;
+    for (int pair = 0; pair < 4; pair += 2)
+    {
+        res.lo[pair] = res.lo[pair + 1] = a.lo[pair];
+        res.hi[pair] = res.hi[pair + 1] = a.hi[pair];
+    }
+    return res;
+}
+
+// Duplicates the odd-indexed floats of each half: lanes 0 and 1 receive
+// lane 1, lanes 2 and 3 receive lane 3.
+AVX2NEON_ABI
+__m256 _mm256_movehdup_ps (__m256 a)
+{
+    __m256 res;
+    for (int pair = 0; pair < 4; pair += 2)
+    {
+        res.lo[pair] = res.lo[pair + 1] = a.lo[pair + 1];
+        res.hi[pair] = res.hi[pair + 1] = a.hi[pair + 1];
+    }
+    return res;
+}
+
+// Returns a copy of a with one 128-bit half replaced by b:
+// bit 0 of imm8 selects the high half when set, the low half otherwise.
+AVX2NEON_ABI
+__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)
+{
+    const bool replace_high = (imm8 & 1) != 0;
+    __m256 res;
+    res.lo = replace_high ? a.lo : b;
+    res.hi = replace_high ? b : a.hi;
+    return res;
+}
+
+
+// Returns one 128-bit half of a: the high half when bit 0 of imm8 is set,
+// the low half otherwise.
+AVX2NEON_ABI
+__m128 _mm256_extractf128_ps (__m256 a, const int imm8)
+{
+    return (imm8 & 1) ? a.hi : a.lo;
+}
+
+
+// Duplicates the even-indexed doubles: result = { a0, a0, a2, a2 }.
+// The high half used to be copied through unchanged ({a2, a3}), which does
+// not match the x86 definition of the intrinsic.
+AVX2NEON_ABI
+__m256d _mm256_movedup_pd (__m256d a)
+{
+    __m256d res;
+    res.lo[0] = res.lo[1] = a.lo[0];
+    res.hi[0] = res.hi[1] = a.hi[0];
+    return res;
+}
+
+// Absolute value of each signed 32-bit lane, computed per half with vabsq_s32.
+AVX2NEON_ABI
+__m256i _mm256_abs_epi32(__m256i a)
+{
+   const __m128i abs_low  = vabsq_s32(a.lo);
+   const __m128i abs_high = vabsq_s32(a.hi);
+   __m256i res;
+   res.lo = abs_low;
+   res.hi = abs_high;
+   return res;
+}
+// Unary float wrappers generated by UNARY_AVX_OP (macro defined earlier in this header; presumably it applies the last argument to the lo and hi halves -- confirm). floor/ceil use the NEON vrndmq_f32/vrndpq_f32 rounding intrinsics.
+UNARY_AVX_OP(__m256,_mm256_sqrt_ps,_mm_sqrt_ps)
+UNARY_AVX_OP(__m256,_mm256_rsqrt_ps,_mm_rsqrt_ps)
+UNARY_AVX_OP(__m256,_mm256_rcp_ps,_mm_rcp_ps)
+UNARY_AVX_OP(__m256,_mm256_floor_ps,vrndmq_f32)
+UNARY_AVX_OP(__m256,_mm256_ceil_ps,vrndpq_f32)
+
+// Wrappers below come from the *_AVX_OP macros defined earlier in this header (presumably each applies the named 128-bit op to the lo and hi halves -- confirm). First group: 32-bit integer add, subtract and low multiply.
+BINARY_AVX_OP(__m256i,_mm256_add_epi32,_mm_add_epi32)
+BINARY_AVX_OP(__m256i,_mm256_sub_epi32,_mm_sub_epi32)
+BINARY_AVX_OP(__m256i,_mm256_mullo_epi32,_mm_mullo_epi32)
+// 32-bit integer min/max; the unsigned forms call vminq_u32/vmaxq_u32 through a uint32x4_t cast.
+BINARY_AVX_OP(__m256i,_mm256_min_epi32,_mm_min_epi32)
+BINARY_AVX_OP(__m256i,_mm256_max_epi32,_mm_max_epi32)
+BINARY_AVX_OP_CAST(__m256i,_mm256_min_epu32,vminq_u32,__m128i,uint32x4_t)
+BINARY_AVX_OP_CAST(__m256i,_mm256_max_epu32,vmaxq_u32,__m128i,uint32x4_t)
+// Single-precision min/max.
+BINARY_AVX_OP(__m256,_mm256_min_ps,_mm_min_ps)
+BINARY_AVX_OP(__m256,_mm256_max_ps,_mm_max_ps)
+// Single-precision arithmetic.
+BINARY_AVX_OP(__m256,_mm256_add_ps,_mm_add_ps)
+BINARY_AVX_OP(__m256,_mm256_mul_ps,_mm_mul_ps)
+BINARY_AVX_OP(__m256,_mm256_sub_ps,_mm_sub_ps)
+BINARY_AVX_OP(__m256,_mm256_div_ps,_mm_div_ps)
+// Bitwise logic on single-precision vectors.
+BINARY_AVX_OP(__m256,_mm256_and_ps,_mm_and_ps)
+BINARY_AVX_OP(__m256,_mm256_andnot_ps,_mm_andnot_ps)
+BINARY_AVX_OP(__m256,_mm256_or_ps,_mm_or_ps)
+BINARY_AVX_OP(__m256,_mm256_xor_ps,_mm_xor_ps)
+// Bitwise logic on double-precision vectors, done with the 64-bit integer NEON ops via an int64x2_t cast.
+BINARY_AVX_OP_CAST(__m256d,_mm256_and_pd,vandq_s64,float64x2_t,int64x2_t)
+BINARY_AVX_OP_CAST(__m256d,_mm256_or_pd,vorrq_s64,float64x2_t,int64x2_t)
+BINARY_AVX_OP_CAST(__m256d,_mm256_xor_pd,veorq_s64,float64x2_t,int64x2_t)
+
+
+// Bitwise logic on 256-bit integer vectors.
+BINARY_AVX_OP(__m256i,_mm256_and_si256,_mm_and_si128)
+BINARY_AVX_OP(__m256i,_mm256_or_si256,_mm_or_si128)
+BINARY_AVX_OP(__m256i,_mm256_xor_si256,_mm_xor_si128)
+
+// Single-precision interleaves and variable blend.
+BINARY_AVX_OP(__m256,_mm256_unpackhi_ps,_mm_unpackhi_ps)
+BINARY_AVX_OP(__m256,_mm256_unpacklo_ps,_mm_unpacklo_ps)
+TERNARY_AVX_OP(__m256,_mm256_blendv_ps,_mm_blendv_ps)
+
+// Fused multiply-add family.
+TERNARY_AVX_OP(__m256,_mm256_fmadd_ps,_mm_fmadd_ps)
+TERNARY_AVX_OP(__m256,_mm256_fnmadd_ps,_mm_fnmadd_ps)
+TERNARY_AVX_OP(__m256,_mm256_fmsub_ps,_mm_fmsub_ps)
+TERNARY_AVX_OP(__m256,_mm256_fnmsub_ps,_mm_fnmsub_ps)
+
+// 32-bit integer interleaves.
+BINARY_AVX_OP(__m256i,_mm256_unpackhi_epi32,_mm_unpackhi_epi32)
+BINARY_AVX_OP(__m256i,_mm256_unpacklo_epi32,_mm_unpacklo_epi32)
+
+// Comparisons: 32-bit integer equal/greater-than, then the single-precision predicates.
+BINARY_AVX_OP(__m256i,_mm256_cmpeq_epi32,_mm_cmpeq_epi32)
+BINARY_AVX_OP(__m256i,_mm256_cmpgt_epi32,_mm_cmpgt_epi32)
+BINARY_AVX_OP(__m256,_mm256_cmpeq_ps,_mm_cmpeq_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpneq_ps,_mm_cmpneq_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpnlt_ps,_mm_cmpnlt_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpngt_ps,_mm_cmpngt_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpge_ps,_mm_cmpge_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpnge_ps,_mm_cmpnge_ps)
+BINARY_AVX_OP(__m256,_mm256_cmplt_ps,_mm_cmplt_ps)
+BINARY_AVX_OP(__m256,_mm256_cmple_ps,_mm_cmple_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpgt_ps,_mm_cmpgt_ps)
+BINARY_AVX_OP(__m256,_mm256_cmpnle_ps,_mm_cmpnle_ps)
+
+
+// Converts eight floats to 32-bit integers by applying _mm_cvtps_epi32 to
+// each 128-bit half.
+AVX2NEON_ABI
+__m256i _mm256_cvtps_epi32 (__m256 a)
+{
+    const __m128i low_ints  = _mm_cvtps_epi32(a.lo);
+    const __m128i high_ints = _mm_cvtps_epi32(a.hi);
+    __m256i converted;
+    converted.lo = low_ints;
+    converted.hi = high_ints;
+    return converted;
+}
+
+// Converts eight floats to 32-bit integers by applying _mm_cvttps_epi32 to
+// each 128-bit half.
+AVX2NEON_ABI
+__m256i _mm256_cvttps_epi32 (__m256 a)
+{
+    const __m128i low_ints  = _mm_cvttps_epi32(a.lo);
+    const __m128i high_ints = _mm_cvttps_epi32(a.hi);
+    __m256i converted;
+    converted.lo = low_ints;
+    converted.hi = high_ints;
+    return converted;
+}
+
+// Loads eight consecutive floats starting at mem_addr; no alignment is
+// required. The loads go through vld1q_f32 rather than dereferencing a cast
+// __m128 pointer, so the compiler is never told the address is 16-byte
+// aligned.
+AVX2NEON_ABI
+__m256 _mm256_loadu_ps (float const * mem_addr)
+{
+    __m256 res;
+    res.lo = vld1q_f32(mem_addr + 0);
+    res.hi = vld1q_f32(mem_addr + 4);
+    return res;
+}
+// The aligned load shares the unaligned implementation.
+#define _mm256_load_ps _mm256_loadu_ps
+
+
+// Returns 1 when no lane of (a AND b) has its sign bit set, 0 otherwise.
+// When a and b refer to the same object the AND is skipped.
+AVX2NEON_ABI
+int _mm256_testz_ps (const __m256& a, const __m256& b)
+{
+    __m256 masked = a;
+    if (&a != &b)
+        masked = _mm256_and_ps(a,b);
+
+    // An arithmetic shift by 31 turns each lane into 0 or -1 depending on its
+    // sign bit, so the horizontal sum is zero only if no sign bit was set.
+    const __m128i low_signs  = vshrq_n_s32(__m128i(masked.lo),31);
+    const __m128i high_signs = vshrq_n_s32(__m128i(masked.hi),31);
+    const int sign_sum = vaddvq_s32(vaddq_s32(low_signs,high_signs));
+    return sign_sum == 0;
+}
+
+
+// Builds a 256-bit integer vector from four 64-bit values given highest
+// lane first: e0,e1 fill the low half and e2,e3 the high half.
+AVX2NEON_ABI
+__m256i _mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0)
+{
+    const int64x2_t low_pair  = {e0,e1};
+    const int64x2_t high_pair = {e2,e3};
+    __m256i packed;
+    packed.lo = __m128i(low_pair);
+    packed.hi = __m128i(high_pair);
+    return packed;
+}
+
+// Returns a double-precision vector with all four lanes set to zero.
+AVX2NEON_ABI
+__m256d _mm256_setzero_pd ()
+{
+    const float64x2_t zeros = vdupq_n_f64(0);
+    __m256d res;
+    res.hi = zeros;
+    res.lo = zeros;
+    return res;
+}
+
+// Collects the sign bit of each of the four doubles into bits 0..3 of the
+// result (bit i comes from lane i). Bits 1 and 3 previously re-read lane 0
+// of each half instead of lane 1.
+AVX2NEON_ABI
+int _mm256_movemask_pd (__m256d a)
+{
+    const uint64x2_t lo = uint64x2_t(a.lo);
+    const uint64x2_t hi = uint64x2_t(a.hi);
+
+    int res = 0;
+    res |= (lo[0] >> 63) ? 1 : 0;
+    res |= (lo[1] >> 63) ? 2 : 0;
+    res |= (hi[0] >> 63) ? 4 : 0;
+    res |= (hi[1] >> 63) ? 8 : 0;
+    return res;
+}
+
+// Compares the four 64-bit integer lanes of a and b for equality using
+// vceqq_s64 on each half.
+AVX2NEON_ABI
+__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)
+{
+    const __m128i low_mask  = __m128i(vceqq_s64(int64x2_t(a.lo),int64x2_t(b.lo)));
+    const __m128i high_mask = __m128i(vceqq_s64(int64x2_t(a.hi),int64x2_t(b.hi)));
+    __m256i res;
+    res.lo = low_mask;
+    res.hi = high_mask;
+    return res;
+}
+
+// Compares the four double lanes of a and b for equality using vceqq_f64 on
+// each half. Note the mask is returned as an integer vector (__m256i).
+AVX2NEON_ABI
+__m256i _mm256_cmpeq_pd (__m256d a, __m256d b)
+{
+    const __m128i low_mask  = __m128i(vceqq_f64(a.lo,b.lo));
+    const __m128i high_mask = __m128i(vceqq_f64(a.hi,b.hi));
+    __m256i res;
+    res.lo = low_mask;
+    res.hi = high_mask;
+    return res;
+}
+
+
+// Returns 1 when _mm256_movemask_pd of (a AND b) is zero, 0 otherwise.
+// When a and b refer to the same object the AND is skipped.
+AVX2NEON_ABI
+int _mm256_testz_pd (const __m256d& a, const __m256d& b)
+{
+    __m256d masked = a;
+    if (&a != &b)
+        masked = _mm256_and_pd(a,b);
+
+    const int sign_bits = _mm256_movemask_pd(masked);
+    return sign_bits == 0;
+}
+
+// Per-lane select between a and b: a lane takes b's value when the top bit
+// of the matching mask lane is set, and a's value otherwise.
+AVX2NEON_ABI
+__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)
+{
+    const uint64x2_t sel_lo = uint64x2_t(mask.lo);
+    const uint64x2_t sel_hi = uint64x2_t(mask.hi);
+
+    __m256d res;
+    for (int lane = 0; lane < 2; lane++)
+    {
+        res.lo[lane] = (sel_lo[lane] >> 63) ? b.lo[lane] : a.lo[lane];
+        res.hi[lane] = (sel_hi[lane] >> 63) ? b.hi[lane] : a.hi[lane];
+    }
+    return res;
+}
+
+// Dot product per 128-bit half: each half of the result is _mm_dp_ps of the
+// corresponding halves of a and b with control imm8.
+// Marked AVX2NEON_ABI like the other templated helpers in this header
+// (__mm256_permute_ps, __mm256_shuffle_ps).
+template<int imm8>
+AVX2NEON_ABI
+__m256 __mm256_dp_ps (__m256 a, __m256 b)
+{
+    __m256 res;
+    res.lo = _mm_dp_ps(a.lo,b.lo,imm8);
+    res.hi = _mm_dp_ps(a.hi,b.hi,imm8);
+    return res;
+}
+
+// Intrinsic-style spelling: the control value becomes the template argument.
+#define _mm256_dp_ps(a,b,c) __mm256_dp_ps<c>(a,b)
+
+// Returns the double selected by the low two bits of imm8:
+// 0 -> a.lo[0], 1 -> a.lo[1], 2 -> a.hi[0], 3 -> a.hi[1].
+AVX2NEON_ABI
+double _mm256_permute4x64_pd_select(__m256d a, const int imm8)
+{
+    const int selector = imm8 & 3;
+    // bit 1 picks the half, bit 0 picks the element inside that half
+    const float64x2_t half = (selector & 2) ? a.hi : a.lo;
+    return half[selector & 1];
+}
+
+// Permutes the four doubles of a: result lane i is the source lane named by
+// bits [2i+1:2i] of imm8.
+AVX2NEON_ABI
+__m256d _mm256_permute4x64_pd (__m256d a, const int imm8)
+{
+    __m256d permuted;
+    for (int lane = 0; lane < 2; lane++)
+    {
+        permuted.lo[lane] = _mm256_permute4x64_pd_select(a,imm8 >> (2 * lane));
+        permuted.hi[lane] = _mm256_permute4x64_pd_select(a,imm8 >> (2 * lane + 4));
+    }
+    return permuted;
+}
+
+// Integer form of _mm256_insertf128_ps: casts the operands to the float
+// vector types, delegates, and casts the result back.
+AVX2NEON_ABI
+__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8)
+{
+    const __m256 a_as_ps = (__m256)a;
+    const __m128 b_as_ps = (__m128)b;
+    return __m256i(_mm256_insertf128_ps(a_as_ps,b_as_ps,imm8));
+}
+
+
+// Loads 256 bits of integer data from mem_addr; no alignment is required.
+// Uses vld1q_s32 instead of dereferencing a cast __m128i pointer, and keeps
+// the source pointer const instead of casting it away.
+AVX2NEON_ABI
+__m256i _mm256_loadu_si256 (__m256i const * mem_addr)
+{
+    const int32_t * words = (const int32_t *)mem_addr;
+    __m256i res;
+    res.lo = vld1q_s32(words + 0);
+    res.hi = vld1q_s32(words + 4);
+    return res;
+}
+
+// The aligned load shares the unaligned implementation.
+#define _mm256_load_si256 _mm256_loadu_si256
+
+// Stores the eight floats of a to mem_addr; no alignment is required.
+// Uses vst1q_f32 instead of writing through a cast __m128 pointer, so the
+// compiler is never told the address is 16-byte aligned.
+AVX2NEON_ABI
+void _mm256_storeu_ps (float * mem_addr, __m256 a)
+{
+    vst1q_f32(mem_addr + 0, a.lo);
+    vst1q_f32(mem_addr + 4, a.hi);
+}
+
+// Aligned and streaming stores share the unaligned implementation.
+#define _mm256_store_ps _mm256_storeu_ps
+#define _mm256_stream_ps _mm256_storeu_ps
+
+
+// Stores the 256 bits of a to mem_addr; no alignment is required.
+// Uses vst1q_s32 instead of writing through a cast __m128i pointer, so the
+// compiler is never told the address is 16-byte aligned.
+AVX2NEON_ABI
+void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a)
+{
+    int32_t * words = (int32_t *)mem_addr;
+    vst1q_s32(words + 0, a.lo);
+    vst1q_s32(words + 4, a.hi);
+}
+
+// The aligned store shares the unaligned implementation.
+#define _mm256_store_si256 _mm256_storeu_si256
+
+
+
+// Masked load of eight floats, done as two _mm_maskload_ps calls:
+// floats 0-3 use mask.lo, floats 4-7 use mask.hi.
+AVX2NEON_ABI
+__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
+{
+    float const * const high_addr = mem_addr + 4;
+    __m256 loaded;
+    loaded.lo = _mm_maskload_ps(mem_addr,mask.lo);
+    loaded.hi = _mm_maskload_ps(high_addr,mask.hi);
+    return loaded;
+}
+
+
+// Zero-extends the first eight unsigned bytes of a to eight 32-bit lanes.
+AVX2NEON_ABI
+__m256i _mm256_cvtepu8_epi32 (__m128i a)
+{
+    const uint8x16_t bytes = uint8x16_t(a);
+    __m256i widened;
+    for (int lane = 0; lane < 4; ++lane)
+        widened.lo[lane] = bytes[lane];
+    for (int lane = 0; lane < 4; ++lane)
+        widened.hi[lane] = bytes[lane + 4];
+    return widened;
+}
+
+
+// Sign-extends the first eight signed bytes of a to eight 32-bit lanes.
+AVX2NEON_ABI
+__m256i _mm256_cvtepi8_epi32 (__m128i a)
+{
+    const int8x16_t bytes = int8x16_t(a);
+    __m256i widened;
+    for (int lane = 0; lane < 4; ++lane)
+        widened.lo[lane] = bytes[lane];
+    for (int lane = 0; lane < 4; ++lane)
+        widened.hi[lane] = bytes[lane + 4];
+    return widened;
+}
+
+
+// Zero-extends the eight unsigned 16-bit values of a to eight 32-bit lanes.
+AVX2NEON_ABI
+__m256i _mm256_cvtepu16_epi32 (__m128i a)
+{
+    const uint16x8_t halfwords = uint16x8_t(a);
+    __m256i widened;
+    for (int lane = 0; lane < 4; ++lane)
+        widened.lo[lane] = halfwords[lane];
+    for (int lane = 0; lane < 4; ++lane)
+        widened.hi[lane] = halfwords[lane + 4];
+    return widened;
+}
+
+// Sign-extends the eight signed 16-bit values of a to eight 32-bit lanes.
+AVX2NEON_ABI
+__m256i _mm256_cvtepi16_epi32 (__m128i a)
+{
+    const int16x8_t halfwords = int16x8_t(a);
+    __m256i widened;
+    for (int lane = 0; lane < 4; ++lane)
+        widened.lo[lane] = halfwords[lane];
+    for (int lane = 0; lane < 4; ++lane)
+        widened.hi[lane] = halfwords[lane + 4];
+    return widened;
+}
+
+
+
+// Masked store of eight 32-bit integers, done as two _mm_maskstore_epi32
+// calls: elements 0-3 use mask.lo, elements 4-7 use mask.hi.
+AVX2NEON_ABI
+void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)
+{
+    int* const low_dst  = mem_addr;
+    int* const high_dst = mem_addr + 4;
+    _mm_maskstore_epi32(low_dst,mask.lo,a.lo);
+    _mm_maskstore_epi32(high_dst,mask.hi,a.hi);
+}
+
+// Shifts every 32-bit lane left by imm8 bits via _mm_slli_epi32 on each half.
+AVX2NEON_ABI
+__m256i _mm256_slli_epi32 (__m256i a, int imm8)
+{
+    __m256i shifted;
+    shifted.lo = _mm_slli_epi32(a.lo,imm8);
+    shifted.hi = _mm_slli_epi32(a.hi,imm8);
+    return shifted;
+}
+
+
+// Shifts every 32-bit lane right by imm8 bits via _mm_srli_epi32 on each half.
+AVX2NEON_ABI
+__m256i _mm256_srli_epi32 (__m256i a, int imm8)
+{
+    __m256i shifted;
+    shifted.lo = _mm_srli_epi32(a.lo,imm8);
+    shifted.hi = _mm_srli_epi32(a.hi,imm8);
+    return shifted;
+}
+
+// Shifts every 32-bit lane right by imm8 bits via _mm_srai_epi32 on each half.
+AVX2NEON_ABI
+__m256i _mm256_srai_epi32 (__m256i a, int imm8)
+{
+    __m256i shifted;
+    shifted.lo = _mm_srai_epi32(a.lo,imm8);
+    shifted.hi = _mm_srai_epi32(a.hi,imm8);
+    return shifted;
+}
+
+
+// Per-lane variable left shift: each 32-bit lane of a is shifted by the
+// matching lane of count using vshlq_s32.
+AVX2NEON_ABI
+__m256i _mm256_sllv_epi32 (__m256i a, __m256i count)
+{
+    const __m128i low_shifted  = vshlq_s32(a.lo,count.lo);
+    const __m128i high_shifted = vshlq_s32(a.hi,count.hi);
+    __m256i res;
+    res.lo = low_shifted;
+    res.hi = high_shifted;
+    return res;
+}
+
+
+// Per-lane variable arithmetic right shift: vshlq_s32 is called with the
+// negated counts, which makes the signed shift go right.
+AVX2NEON_ABI
+__m256i _mm256_srav_epi32 (__m256i a, __m256i count)
+{
+    const __m128i neg_low  = vnegq_s32(count.lo);
+    const __m128i neg_high = vnegq_s32(count.hi);
+    __m256i res;
+    res.lo = vshlq_s32(a.lo,neg_low);
+    res.hi = vshlq_s32(a.hi,neg_high);
+    return res;
+}
+
+// Per-lane variable logical right shift: the data is treated as unsigned
+// and vshlq_u32 is called with the negated counts.
+AVX2NEON_ABI
+__m256i _mm256_srlv_epi32 (__m256i a, __m256i count)
+{
+    const __m128i neg_low  = vnegq_s32(count.lo);
+    const __m128i neg_high = vnegq_s32(count.hi);
+    __m256i res;
+    res.lo = __m128i(vshlq_u32(uint32x4_t(a.lo),neg_low));
+    res.hi = __m128i(vshlq_u32(uint32x4_t(a.hi),neg_high));
+    return res;
+}
+
+
+// Integer form of _mm256_permute2f128_ps: casts the operands to __m256,
+// delegates, and casts the result back.
+AVX2NEON_ABI
+__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)
+{
+    const __m256 a_as_ps = __m256(a);
+    const __m256 b_as_ps = __m256(b);
+    return __m256i(_mm256_permute2f128_ps(a_as_ps,b_as_ps,imm8));
+}
+
+
+// Returns one 128-bit half of a: the high half when bit 0 of imm8 is set,
+// the low half otherwise.
+AVX2NEON_ABI
+__m128i _mm256_extractf128_si256 (__m256i a, const int imm8)
+{
+    return (imm8 & 1) ? a.hi : a.lo;
+}
+
+// Replicates the float x into all eight lanes.
+AVX2NEON_ABI
+__m256 _mm256_set1_ps(float x)
+{
+    const __m128 splat = vdupq_n_f32(x);
+    __m256 res;
+    res.hi = splat;
+    res.lo = splat;
+    return res;
+}
+
+// Builds a 256-bit float vector from eight values given highest lane first:
+// e0..e3 form the low half and e4..e7 the high half.
+AVX2NEON_ABI
+__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
+{
+    const __m128 low_half  = _mm_set_ps(e3,e2,e1,e0);
+    const __m128 high_half = _mm_set_ps(e7,e6,e5,e4);
+    __m256 packed;
+    packed.lo = low_half;
+    packed.hi = high_half;
+    return packed;
+}
+
+// Reads one 128-bit vector from mem_addr and copies it into both halves.
+AVX2NEON_ABI
+__m256 _mm256_broadcast_ps (__m128 const * mem_addr)
+{
+    const __m128 source = *mem_addr;
+    __m256 res;
+    res.hi = source;
+    res.lo = source;
+    return res;
+}
+
+// Converts eight 32-bit integers to floats by applying _mm_cvtepi32_ps to
+// each 128-bit half.
+AVX2NEON_ABI
+__m256 _mm256_cvtepi32_ps (__m256i a)
+{
+    const __m128 low_floats  = _mm_cvtepi32_ps(a.lo);
+    const __m128 high_floats = _mm_cvtepi32_ps(a.hi);
+    __m256 converted;
+    converted.lo = low_floats;
+    converted.hi = high_floats;
+    return converted;
+}
+// Masked store of eight floats: element i is written to mem_addr[i] only
+// when the sign bit of the matching 32-bit mask lane is set.
+AVX2NEON_ABI
+void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)
+{
+    // a set sign bit makes the signed 32-bit lane negative
+    for (int lane = 0; lane < 4; ++lane)
+        if (mask.lo[lane] < 0) mem_addr[lane] = a.lo[lane];
+    for (int lane = 0; lane < 4; ++lane)
+        if (mask.hi[lane] < 0) mem_addr[lane + 4] = a.hi[lane];
+}
+
+// AND-NOT on double vectors, done by passing each half through
+// _mm_andnot_ps as a float vector and casting the result back.
+AVX2NEON_ABI
+__m256d _mm256_andnot_pd (__m256d a, __m256d b)
+{
+    const __m128 low_bits  = _mm_andnot_ps(__m128(a.lo),__m128(b.lo));
+    const __m128 high_bits = _mm_andnot_ps(__m128(a.hi),__m128(b.hi));
+    __m256d res;
+    res.lo = float64x2_t(low_bits);
+    res.hi = float64x2_t(high_bits);
+    return res;
+}
+
+// Blends a and b with an immediate mask: the low four bits of imm8 are
+// passed to _mm_blend_ps for the low half, the remaining bits for the high half.
+AVX2NEON_ABI
+__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)
+{
+    const int low_select  = imm8 & 0xf;
+    const int high_select = imm8 >> 4;
+    __m256 blended;
+    blended.lo = _mm_blend_ps(a.lo,b.lo,low_select);
+    blended.hi = _mm_blend_ps(a.hi,b.hi,high_select);
+    return blended;
+}
+
+
+// Blends a and b with an immediate mask: the low four bits of imm8 are
+// passed to _mm_blend_epi32 for the low half, the remaining bits for the high half.
+AVX2NEON_ABI
+__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)
+{
+    const int low_select  = imm8 & 0xf;
+    const int high_select = imm8 >> 4;
+    __m256i blended;
+    blended.lo = _mm_blend_epi32(a.lo,b.lo,low_select);
+    blended.hi = _mm_blend_epi32(a.hi,b.hi,high_select);
+    return blended;
+}
+
+// Gathers eight 32-bit integers: lane i is read from the byte address
+// base_addr + vindex[i] * scale.
+AVX2NEON_ABI
+__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
+{
+    const char * const base_bytes = (const char *) base_addr;
+    __m256i gathered;
+    for (int lane = 0; lane < 4; ++lane)
+        gathered.lo[lane] = *(const int *)(base_bytes + (vindex.lo[lane]*scale));
+    for (int lane = 0; lane < 4; ++lane)
+        gathered.hi[lane] = *(const int *)(base_bytes + (vindex.hi[lane]*scale));
+    return gathered;
+}
+
+
+// Masked gather of eight 32-bit integers. Lanes whose mask sign bit is set
+// are read from the byte address base_addr + vindex[i] * scale; all other
+// lanes keep the value from src. The result used to start from zero, which
+// ignored src and disagreed with the x86 definition of the intrinsic.
+AVX2NEON_ABI
+__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
+{
+    __m256i res = src;
+    for (int i=0;i<4;i++)
+    {
+        // an arithmetic shift by 31 is non-zero exactly when the sign bit is set
+        if (mask.lo[i] >> 31) res.lo[i] = *(int *)((char *) base_addr + (vindex.lo[i]*scale));
+        if (mask.hi[i] >> 31) res.hi[i] = *(int *)((char *) base_addr + (vindex.hi[i]*scale));
+    }
+
+    return res;
+}
+
+
diff --git a/thirdparty/embree-aarch64/common/math/SSE2NEON.h b/thirdparty/embree-aarch64/common/math/SSE2NEON.h
new file mode 100644
index 0000000000..2013151d31
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/SSE2NEON.h
@@ -0,0 +1,1753 @@
+#ifndef SSE2NEON_H
+#define SSE2NEON_H
+
+// This header file provides a simple API translation layer
+// between SSE intrinsics to their corresponding ARM NEON versions
+//
+// This header file does not (yet) translate *all* of the SSE intrinsics.
+// Since this is in support of a specific porting effort, I have only
+// included the intrinsics I needed to get my port to work.
+//
+// Questions/Comments/Feedback send to: jratcliffscarab@gmail.com
+//
+// If you want to improve or add to this project, send me an
+// email and I will probably approve your access to the depot.
+//
+// Project is located here:
+//
+//	https://github.com/jratcliff63367/sse2neon
+//
+// Show your appreciation for open source by sending me a bitcoin tip to the following
+// address.
+//
+// TipJar: 1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p :
+// https://blockchain.info/address/1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p
+//
+//
+// Contributors to this project are:
+//
+// John W. Ratcliff : jratcliffscarab@gmail.com
+// Brandon Rowlett  : browlett@nvidia.com
+// Ken Fast         : kfast@gdeb.com
+// Eric van Beurden : evanbeurden@nvidia.com
+//
+//
+// *********************************************************************************************************************
+// Release notes for January 20, 2017 version:
+//
+// The unit tests have been refactored.  They no longer assert on an error, instead they return a pass/fail condition
+// The unit-tests now test 10,000 random float and int values against each intrinsic.
+//
+// SSE2NEON now supports 95 SSE intrinsics.  39 of them have formal unit tests which have been implemented and
+// fully tested on NEON/ARM.  The remaining 56 still need unit tests implemented.
+//
+// A struct is now defined in this header file called 'SIMDVec' which can be used by applications which
+// attempt to access the contents of an _m128 struct directly.  It is important to note that accessing the __m128
+// struct directly is bad coding practice by Microsoft: @see: https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
+//
+// However, some legacy source code may try to access the contents of an __m128 struct directly so the developer
+// can use the SIMDVec as an alias for it.  Any casting must be done manually by the developer, as you cannot
+// cast or otherwise alias the base NEON data type for intrinsic operations.
+//
+// A bug was found with the _mm_shuffle_ps intrinsic.  If the shuffle permutation was not one of the ones with
+// a custom/unique implementation causing it to fall through to the default shuffle implementation it was failing
+// to return the correct value.  This is now fixed.
+//
+// A bug was found with the _mm_cvtps_epi32 intrinsic.  This converts floating point values to integers.
+// It was not honoring the correct rounding mode.  In SSE the default rounding mode when converting from float to int
+// is to use 'round to even' otherwise known as 'bankers rounding'.  ARMv7 did not support this feature but ARMv8 does.
+// As it stands today, this header file assumes ARMv8.  If you are trying to target really old ARM devices, you may get
+// a build error.
+//
+// Support for a number of new intrinsics was added, however, none of them yet have unit-tests to 100% confirm they are
+// producing the correct results on NEON.  These unit tests will be added as soon as possible.
+//
+// Here is the list of new intrinsics which have been added:
+//
+// _mm_cvtss_f32     :  extracts the lower order floating point value from the parameter
+// _mm_add_ss        : adds the scalar single - precision floating point values of a and b
+// _mm_div_ps        : Divides the four single - precision, floating - point values of a and b.
+// _mm_div_ss        : Divides the scalar single - precision floating point value of a by b.
+// _mm_sqrt_ss       : Computes the approximation of the square root of the scalar single - precision floating point value of in.
+// _mm_rsqrt_ps      : Computes the approximations of the reciprocal square roots of the four single - precision floating point values of in.
+// _mm_comilt_ss     : Compares the lower single - precision floating point scalar values of a and b using a less than operation
+// _mm_comigt_ss     : Compares the lower single - precision floating point scalar values of a and b using a greater than operation.
+// _mm_comile_ss     :  Compares the lower single - precision floating point scalar values of a and b using a less than or equal operation.
+// _mm_comige_ss     : Compares the lower single - precision floating point scalar values of a and b using a greater than or equal operation.
+// _mm_comieq_ss     :  Compares the lower single - precision floating point scalar values of a and b using an equality operation.
+// _mm_comineq_ss    :  Compares the lower single - precision floating point scalar values of a and b using an inequality operation
+// _mm_unpackhi_epi8 : Interleaves the upper 8 signed or unsigned 8 - bit integers in a with the upper 8 signed or unsigned 8 - bit integers in b.
+// _mm_unpackhi_epi16:  Interleaves the upper 4 signed or unsigned 16 - bit integers in a with the upper 4 signed or unsigned 16 - bit integers in b.
+//
+// *********************************************************************************************************************
+/*
+** The MIT license:
+**
+** Permission is hereby granted, free of charge, to any person obtaining a copy
+** of this software and associated documentation files (the "Software"), to deal
+** in the Software without restriction, including without limitation the rights
+** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+** copies of the Software, and to permit persons to whom the Software is furnished
+** to do so, subject to the following conditions:
+**
+** The above copyright notice and this permission notice shall be included in all
+** copies or substantial portions of the Software.
+
+** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+** WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#pragma once
+
+#define GCC 1
+#define ENABLE_CPP_VERSION 0
+
+// enable precise emulation of _mm_min_ps and _mm_max_ps?
+// This would slow down the computation a bit, but gives consistent result with x86 SSE2.
+// (e.g. would solve a hole or NaN pixel in the rendering result)
+#define USE_PRECISE_MINMAX_IMPLEMENTATION (1)
+
+#if GCC
+#define FORCE_INLINE					inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x)					__attribute__((aligned(x)))
+#else
+#define FORCE_INLINE					inline
+#define ALIGN_STRUCT(x)					__declspec(align(x))
+#endif
+
+#include <stdint.h>
+#include "arm_neon.h"
+#if defined(__aarch64__)
+#include "constants.h"
+#endif
+
+
+#if !defined(__has_builtin)
+#define __has_builtin(x) (0)
+#endif
+
+/*******************************************************/
+/* MACRO for shuffle parameter for _mm_shuffle_ps().   */
+/* Argument fp3 is a digit[0123] that represents the fp*/
+/* from argument "b" of mm_shuffle_ps that will be     */
+/* placed in fp3 of result. fp2 is the same for fp2 in */
+/* result. fp1 is a digit[0123] that represents the fp */
+/* from argument "a" of mm_shuffle_ps that will be     */
+/* placed in fp1 of result. fp0 is the same for fp0 of */
+/* result                                              */
+/*******************************************************/
+#if defined(__aarch64__)
+#define _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3),  (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3),  (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3),  (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } )
+#define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3),  (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+2), (((fp2)*4)+3),  (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3),  (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*4)+16+3) } )
+#endif
+
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) (((fp3) << 6) | ((fp2) << 4) | \
+  ((fp1) << 2) | ((fp0)))
+
+typedef float32x4_t __m128;
+typedef int32x4_t __m128i;
+
+// union intended to allow direct access to an __m128 variable using the names that the MSVC
+// compiler provides.  This union should really only be used when trying to access the members
+// of the vector as integer values.  GCC/clang allow native access to the float members through
+// a simple array access operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause a performance
+// hit.  If it really is needed however, the original __m128 variable can be aliased with a
+// pointer to this union and used to access individual components.  The use of this union should
+// be hidden behind a macro that is used throughout the codebase to access the members instead
+// of always declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec
+{
+  float       m128_f32[4];    // as floats - do not use this.  Added for convenience.
+  int8_t      m128_i8[16];    // as signed 8-bit integers.
+  int16_t     m128_i16[8];    // as signed 16-bit integers.
+  int32_t     m128_i32[4];    // as signed 32-bit integers.
+  int64_t     m128_i64[2];    // as signed 64-bit integers.
+  uint8_t     m128_u8[16];    // as unsigned 8-bit integers.
+  uint16_t    m128_u16[8];    // as unsigned 16-bit integers.
+  uint32_t    m128_u32[4];    // as unsigned 32-bit integers.
+  uint64_t    m128_u64[2];    // as unsigned 64-bit integers.
+  double	    m128_f64[2];    // as double-precision floats.
+} SIMDVec;
+
+// ******************************************
+// CPU stuff
+// ******************************************
+
+typedef SIMDVec __m128d;
+
+#include <stdlib.h>
+// x86 MXCSR-style constants, provided so code that names them still compiles. The three _MM_SET_* macros below expand to nothing, so none of these modes is applied.
+#ifndef _MM_MASK_MASK
+#define _MM_MASK_MASK 0x1f80
+#define _MM_MASK_DIV_ZERO 0x200
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_DENORMALS_ZERO_ON 0x40
+#define _MM_MASK_DENORM 0x100
+#endif
+#define _MM_SET_EXCEPTION_MASK(x)
+#define _MM_SET_FLUSH_ZERO_MODE(x)
+#define _MM_SET_DENORMALS_ZERO_MODE(x)
+
+FORCE_INLINE void _mm_pause()
+{   // Empty body: this stand-in for the x86 pause hint does nothing.
+}
+
+FORCE_INLINE void _mm_mfence()
+{
+    __sync_synchronize();   // compiler builtin that issues a full memory barrier
+}
+
+#define _MM_HINT_T0 3
+#define _MM_HINT_T1 2
+#define _MM_HINT_T2 1
+#define _MM_HINT_NTA 0
+
+// Requests a prefetch of the data at ptr. The level hint is accepted for
+// source compatibility but not forwarded: __builtin_prefetch is called with
+// its default arguments.
+FORCE_INLINE void _mm_prefetch(const void* ptr, unsigned int level)
+{
+   (void)level;
+   __builtin_prefetch(ptr);
+}
+
+// Allocates size bytes aligned to align bytes.
+// Alignments below sizeof(void *) are raised to sizeof(void *). If the
+// alignment is a multiple of sizeof(void *) the block comes from
+// posix_memalign (0 is returned when that call fails); otherwise the request
+// falls back to plain malloc.
+FORCE_INLINE void* _mm_malloc(int size, int align)
+{
+    void *block;
+
+    // posix_memalign needs an alignment that is a multiple of sizeof(void *)
+    if (align < sizeof(void *))
+        align = sizeof(void *);
+
+    if ((align % sizeof(void *)) == 0)
+        return posix_memalign(&block, align, size) ? 0 : block;
+
+    // unsupported alignment: fall back to malloc
+    block = malloc(size);
+    return block;
+}
+
+FORCE_INLINE void _mm_free(void* ptr)
+{
+        free(ptr);   // _mm_malloc allocates with posix_memalign or malloc, so plain free releases it
+}
+
+FORCE_INLINE int _mm_getcsr()
+{
+        return 0;   // no control/status register is read; always reports 0
+}
+
+// Accepts an MXCSR-style value and discards it; no state is changed.
+FORCE_INLINE void _mm_setcsr(int val)
+{
+        (void)val;
+}
+
+// ******************************************
+// Set/get methods
+// ******************************************
+
+// Returns lane 0 (the lowest float) of the vector.
+// https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396
+#if defined(__aarch64__)
+FORCE_INLINE float _mm_cvtss_f32(const __m128& x)
+{
+    const float lowest = x[0];
+    return lowest;
+}
+#else
+FORCE_INLINE float _mm_cvtss_f32(__m128 a)
+{
+    const float lowest = vgetq_lane_f32(a, 0);
+    return lowest;
+}
+#endif
+
+// Returns a 128-bit integer vector with every lane set to zero.
+// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_setzero_si128()
+{
+  const __m128i zeros = vdupq_n_s32(0);
+  return zeros;
+}
+
+// Returns a float vector with all four lanes set to zero.
+// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setzero_ps(void)
+{
+  const __m128 zeros = vdupq_n_f32(0);
+  return zeros;
+}
+
+// Replicates _w into all four float lanes.
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set1_ps(float _w)
+{
+  const __m128 splat = vdupq_n_f32(_w);
+  return splat;
+}
+
+// Replicates _w into all four float lanes (same operation as _mm_set1_ps).
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set_ps1(float _w)
+{
+  const __m128 splat = vdupq_n_f32(_w);
+  return splat;
+}
+
+// _mm_set_ps builds a vector from four floats given highest lane first, so
+// x lands in lane 0 and w in lane 3.
+// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
+// _mm_setr_ps takes the same four floats in lane order: its first argument
+// lands in lane 0 and its last in lane 3.
+// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
+#if defined(__aarch64__)
+FORCE_INLINE __m128 _mm_set_ps(const float w, const float z, const float y, const float x)
+{
+    const float32x4_t lanes = { x, y, z, w };
+    return lanes;
+}
+
+FORCE_INLINE __m128 _mm_setr_ps(const float w, const float z , const float y , const float x )
+{
+    const float32x4_t lanes = { w, z, y, x };
+    return lanes;
+}
+#else
+FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
+{
+    float __attribute__((aligned(16))) lanes[4] = { x, y, z, w };
+    return vld1q_f32(lanes);
+}
+
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z , float y , float x )
+{
+    float __attribute__ ((aligned (16))) lanes[4] = { w, z, y, x };
+    return vld1q_f32(lanes);
+}
+#endif
+
+// Replicates the 32-bit integer _i into all four lanes.
+// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+{
+  const __m128i splat = vdupq_n_s32(_i);
+  return splat;
+}
+
+// _mm_set_ss puts _w in lane 0 of a float vector and zeros the other three lanes.
+// _mm_set_epi32 builds an integer vector from four 32-bit values given
+// highest lane first, so i0 lands in lane 0 and i3 in lane 3.
+// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
+#if defined(__aarch64__)
+FORCE_INLINE __m128 _mm_set_ss(float _w)
+{
+    const float32x4_t lanes = {_w, 0, 0, 0};
+    return lanes;
+}
+
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    const int32x4_t lanes = {i0,i1,i2,i3};
+    return lanes;
+}
+#else
+FORCE_INLINE __m128 _mm_set_ss(float _w)
+{
+    const __m128 zeros = _mm_setzero_ps();
+    return vsetq_lane_f32(_w, zeros, 0);
+}
+
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t __attribute__((aligned(16))) lanes[4] = { i0, i1, i2, i3 };
+    return vld1q_s32(lanes);
+}
+#endif
+
+// Writes the four floats of a to p[0..3] with vst1q_f32.
+// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+  float * const dst = p;
+  vst1q_f32(dst, a);
+}
+
+// Writes the four floats of a to p[0..3]; implemented exactly like _mm_store_ps.
+// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+  float * const dst = p;
+  vst1q_f32(dst, a);
+}
+
+// Writes the 128-bit integer vector a to the memory at p as four 32-bit words.
+FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
+{
+  int32_t * const words = (int32_t*) p;
+  vst1q_s32(words, a);
+}
+
+// Writes the 128-bit integer vector a to the memory at p as four 32-bit words.
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a )
+{
+  int32_t * const words = (int32_t*) p;
+  vst1q_s32(words, a);
+}
+
+// Writes lane 0 of a to *p; nothing else is stored.
+// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+  float * const dst = p;
+  vst1q_lane_f32(dst, a, 0);
+}
+
+// Stores the lower 64 bits of b to the memory at a.
+// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
+// Only 8 bytes are written. The previous implementation loaded the full
+// 128-bit destination, patched its low lane and wrote all 16 bytes back,
+// which touches memory past an 8-byte destination.
+FORCE_INLINE void _mm_storel_epi64(__m128i* a, __m128i b)
+{
+  vst1_s32((int32_t*)a, vget_low_s32(b));
+}
+
+// Loads one float from p and replicates it into all four lanes.
+// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load1_ps(const float * p)
+{
+  const __m128 splat = vld1q_dup_f32(p);
+  return splat;
+}
+
+// Loads four consecutive floats from p with vld1q_f32.
+// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load_ps(const float * p)
+{
+  const __m128 loaded = vld1q_f32(p);
+  return loaded;
+}
+
+// Loads four consecutive floats from p.
+// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
+// Implemented with the same vld1q_f32 call as _mm_load_ps, so the aligned
+// and unaligned loads are equivalent here.
+FORCE_INLINE __m128 _mm_loadu_ps(const float * p)
+{
+  const __m128 loaded = vld1q_f32(p);
+  return loaded;
+}
+
+// Loads a single float from p into lane 0 and clears the upper three lanes.
+// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_load_ss(const float * p)
+{
+  const __m128 zeros = vdupq_n_f32(0);
+  return vsetq_lane_f32(*p, zeros, 0);
+}
+
+// Loads 128 bits of integer data from p (unaligned-load variant). p is only read, so it is taken as pointer-to-const.
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) {
+  return (__m128i)vld1q_s32((const int32_t*) p);
+}
+
+
+// ******************************************
+// Logic/Binary operations
+// ******************************************
+
+// Lane-wise inequality test: the bitwise inverse of the a == b mask.  https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) {
+  const uint32x4_t equal = vceqq_f32(a, b);
+  return vreinterpretq_f32_u32(vmvnq_u32(equal));
+}
+
+// Bitwise (~a) & b on the raw bits of the four single-precision values. https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) {
+  const int32x4_t keep = vreinterpretq_s32_f32(b), clear = vreinterpretq_s32_f32(a);
+  return vreinterpretq_f32_s32(vbicq_s32(keep, clear)); // *NOTE* operands are swapped relative to the SSE signature
+}
+
+// Bitwise b & (~a) over the whole 128-bit value. https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) {
+  const int32x4_t masked = vbicq_s32(b, a); // *NOTE* operands are swapped relative to the SSE signature
+  return masked;
+}
+
+// Bitwise a & b over the whole 128-bit value. https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) {
+  const int32x4_t both = vandq_s32(a, b);
+  return both;
+}
+
+// Bitwise AND of the raw bit patterns of the four floats in a and b. https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) {
+  const int32x4_t bits = vandq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b));
+  return vreinterpretq_f32_s32(bits);
+}
+
+// Bitwise OR of the raw bit patterns of the four floats in a and b. https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) {
+  const int32x4_t bits = vorrq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b));
+  return vreinterpretq_f32_s32(bits);
+}
+
+// Bitwise exclusive-OR of the raw bit patterns of the four floats in a and b. https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) {
+  const int32x4_t bits = veorq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b));
+  return vreinterpretq_f32_s32(bits);
+}
+
+// Bitwise a | b over the whole 128-bit value. https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) {
+  const int32x4_t either = vorrq_s32(a, b);
+  return either;
+}
+
+// Bitwise a ^ b over the whole 128-bit value.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) {
+  const int32x4_t toggled = veorq_s32(a, b);
+  return toggled;
+}
+
+// NEON does not provide this operation directly, so it is assembled by hand below.
+// Creates a 4-bit mask from the most significant bits of the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+#if ENABLE_CPP_VERSION // I am not yet convinced that the NEON version is faster than the C version of this
+  // scalar variant: move each lane's top bit down to bit 0..3 and OR the four together
+  uint32x4_t &lanes = *(uint32x4_t *)&a;
+  return (lanes[0] >> 31) | ((lanes[1] >> 30) & 2) | ((lanes[2] >> 29) & 4) | ((lanes[3] >> 28) & 8);
+#elif defined(__aarch64__)
+  // AND every lane with embree::movemask_mask (declared outside this function), then sum the lanes
+  const uint32x4_t selected = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask);
+  return vaddvq_u32(selected);
+#else
+  static const uint32x4_t lane_bit = { 1, 2, 4, 8 };
+  static const uint32x4_t sign_bit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+  const uint32x4_t raw = vreinterpretq_u32_f32(a);
+  // vtstq_u32 flags the lanes whose sign bit is set; keep one distinct output bit per flagged lane
+  const uint32x4_t flagged = vtstq_u32(raw, sign_bit);
+  const uint32x4_t bits = vandq_u32(flagged, lane_bit);
+  // OR the four lanes together: first the two halves, then the remaining pair
+  const uint32x2_t folded = vorr_u32(vget_low_u32(bits), vget_high_u32(bits));
+  return vget_lane_u32(folded, 0) | vget_lane_u32(folded, 1);
+#endif
+}
+
+#if defined(__aarch64__)
+// ANDs every lane with embree::movemask_mask, counts the set bits of each byte, and adds up the resulting 32-bit lanes.
+FORCE_INLINE int _mm_movemask_popcnt_ps(__m128 a)
+{
+    const uint32x4_t selected = vandq_u32(vreinterpretq_u32_f32(a), embree::movemask_mask);
+    const uint8x16_t per_byte = vcntq_u8(vreinterpretq_u8_u32(selected));
+    return vaddvq_u32(vreinterpretq_u32_u8(per_byte));
+}
+#endif
+
+// _MM_SHUFFLE(1,0,3,2): result = { a[2], a[3], b[0], b[1] } (upper half of a below, lower half of b above).
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) {
+  const float32x2_t lower = vget_high_f32(a);
+  const float32x2_t upper = vget_low_f32(b);
+  return vcombine_f32(lower, upper);
+}
+
+// _MM_SHUFFLE(2,3,0,1): result = { a[1], a[0], b[3], b[2] } (swapped low pair of a below, swapped high pair of b above).
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) {
+  const float32x2_t lower = vrev64_f32(vget_low_f32(a));
+  const float32x2_t upper = vrev64_f32(vget_high_f32(b));
+  return vcombine_f32(lower, upper);
+}
+
+// _MM_SHUFFLE(3,2,1,0): result = { a[0], a[1], b[2], b[3] } (low 64 bits from a, high 64 bits from b).
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) {
+  const float32x2_t upper = vget_high_f32(b);
+  return vcombine_f32(vget_low_f32(a), upper);
+}
+
+// _MM_SHUFFLE(0,0,1,1): result = { a[1], a[1], b[0], b[0] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) {
+  return vcombine_f32(vdup_lane_f32(vget_low_f32(a), 1), vdup_lane_f32(vget_low_f32(b), 0));
+}
+
+// _MM_SHUFFLE(0,0,2,2): result = { a[2], a[2], b[0], b[0] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) {
+  return vcombine_f32(vdup_lane_f32(vget_high_f32(a), 0), vdup_lane_f32(vget_low_f32(b), 0));
+}
+
+// _MM_SHUFFLE(2,2,0,0): result = { a[0], a[0], b[2], b[2] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) {
+  return vcombine_f32(vdup_lane_f32(vget_low_f32(a), 0), vdup_lane_f32(vget_high_f32(b), 0));
+}
+
+// _MM_SHUFFLE(3,2,0,2): result = { a[2], a[0], b[2], b[3] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) {
+  // start from { a[2], a[2] } and overwrite the second lane with a[0]
+  float32x2_t lower = vdup_n_f32(vgetq_lane_f32(a, 2));
+  lower = vset_lane_f32(vgetq_lane_f32(a, 0), lower, 1);
+  const float32x2_t upper = vget_high_f32(b);
+  return vcombine_f32(lower, upper);
+}
+
+// _MM_SHUFFLE(1,1,3,3): result = { a[3], a[3], b[1], b[1] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) {
+  return vcombine_f32(vdup_lane_f32(vget_high_f32(a), 1), vdup_lane_f32(vget_low_f32(b), 1));
+}
+
+// _MM_SHUFFLE(2,0,1,0): result = { a[0], a[1], b[0], b[2] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) {
+  const float32x2_t lower = vget_low_f32(a);
+  // build { b[0], b[2] }: broadcast b[0], then patch lane 1 with b[2]
+  float32x2_t upper = vdup_n_f32(vgetq_lane_f32(b, 0));
+  upper = vset_lane_f32(vgetq_lane_f32(b, 2), upper, 1);
+  return vcombine_f32(lower, upper);
+}
+
+// _MM_SHUFFLE(2,0,0,1): result = { a[1], a[0], b[0], b[2] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) {
+  const float32x2_t lower = vrev64_f32(vget_low_f32(a));
+  // build { b[0], b[2] }: broadcast b[0], then patch lane 1 with b[2]
+  float32x2_t upper = vdup_n_f32(vgetq_lane_f32(b, 0));
+  upper = vset_lane_f32(vgetq_lane_f32(b, 2), upper, 1);
+  return vcombine_f32(lower, upper);
+}
+
+// _MM_SHUFFLE(2,0,3,2): result = { a[2], a[3], b[0], b[2] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) {
+  const float32x2_t lower = vget_high_f32(a);
+  // build { b[0], b[2] }: broadcast b[0], then patch lane 1 with b[2]
+  float32x2_t upper = vdup_n_f32(vgetq_lane_f32(b, 0));
+  upper = vset_lane_f32(vgetq_lane_f32(b, 2), upper, 1);
+  return vcombine_f32(lower, upper);
+}
+
+// _MM_SHUFFLE(0,3,2,1): result = { a[1], a[2], b[3], b[0] }, taken from each input rotated by three lanes.
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) {
+  const float32x4_t a_rot = vextq_f32(a, a, 3); // { a[3], a[0], a[1], a[2] }
+  const float32x4_t b_rot = vextq_f32(b, b, 3); // { b[3], b[0], b[1], b[2] }
+  return vcombine_f32(vget_high_f32(a_rot), vget_low_f32(b_rot));
+}
+
+// _MM_SHUFFLE(2,1,0,3): result = { a[3], a[0], b[1], b[2] }, taken from each input rotated by three lanes.
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) {
+  const float32x4_t a_rot = vextq_f32(a, a, 3); // { a[3], a[0], a[1], a[2] }
+  const float32x4_t b_rot = vextq_f32(b, b, 3); // { b[3], b[0], b[1], b[2] }
+  return vcombine_f32(vget_low_f32(a_rot), vget_high_f32(b_rot));
+}
+
+// _MM_SHUFFLE(1,0,1,0): result = { a[0], a[1], b[0], b[1] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) {
+  const float32x2_t lower = vget_low_f32(a);
+  const float32x2_t upper = vget_low_f32(b);
+  return vcombine_f32(lower, upper);
+}
+
+// _MM_SHUFFLE(1,0,0,1): result = { a[1], a[0], b[0], b[1] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) {
+  const float32x2_t lower = vrev64_f32(vget_low_f32(a));
+  const float32x2_t upper = vget_low_f32(b);
+  return vcombine_f32(lower, upper);
+}
+
+// _MM_SHUFFLE(0,1,0,1): result = { a[1], a[0], b[1], b[0] }.
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) {
+  const float32x2_t lower = vrev64_f32(vget_low_f32(a));
+  const float32x2_t upper = vrev64_f32(vget_low_f32(b));
+  return vcombine_f32(lower, upper);
+}
+
+// NEON does not support a general purpose permute intrinsic
+// Currently I am not sure whether the C implementation is faster or slower than the NEON version.
+// Note, this has to be expanded as a template because the shuffle value must be an immediate value.
+// The same is true on SSE as well.
+// Selects four specific single-precision, floating-point values from a and b, based on the mask i: result = { a[i & 3], a[(i >> 2) & 3], b[(i >> 4) & 3], b[(i >> 6) & 3] }. https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
+template <int i>
+FORCE_INLINE __m128 _mm_shuffle_ps_default(const __m128& a, const __m128& b)
+{
+#if ENABLE_CPP_VERSION // I am not convinced that the NEON version is faster than the C version yet.
+  __m128 ret;
+  ret[0] = a[i & 0x3];
+  ret[1] = a[(i >> 2) & 0x3];
+  ret[2] = b[(i >> 4) & 0x03];
+  ret[3] = b[(i >> 6) & 0x03];
+  return ret;
+#else
+# if __has_builtin(__builtin_shufflevector)
+    return __builtin_shufflevector(
+        a, b, (i) & (0x3), ((i) >> 2) & 0x3,
+        (((i) >> 4) & 0x3) + 4, (((i) >> 6) & 0x3) + 4);
+# else
+    const int i0 = (i >> 0)&0x3;
+    const int i1 = (i >> 2)&0x3;
+    const int i2 = (i >> 4)&0x3;
+    const int i3 = (i >> 6)&0x3;
+
+    if (&a == &b)
+     {
+         if (i0 == i1 && i0 == i2 && i0 == i3)
+         {
+             return (float32x4_t)vdupq_laneq_f32(a,i0);
+         }
+         static const uint8_t tbl[16] = {
+             (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3,
+             (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3,
+             (i2*4) + 0,(i2*4) + 1,(i2*4) + 2,(i2*4) + 3,
+             (i3*4) + 0,(i3*4) + 1,(i3*4) + 2,(i3*4) + 3
+         };
+         // Load the byte indices with vld1q_u8: tbl is a plain const byte array, so it must not be read through a uint8x16_t pointer.
+         return (float32x4_t)vqtbl1q_s8(int8x16_t(b),vld1q_u8(tbl));
+
+     }
+     else
+     {
+         // Two-register lookup: byte indices 0-15 address a, 16-31 address b.
+         static const uint8_t tbl[16] = {
+             (i0*4) + 0,(i0*4) + 1,(i0*4) + 2,(i0*4) + 3,
+             (i1*4) + 0,(i1*4) + 1,(i1*4) + 2,(i1*4) + 3,
+             (i2*4) + 0 + 16,(i2*4) + 1 + 16,(i2*4) + 2 + 16,(i2*4) + 3 + 16,
+             (i3*4) + 0 + 16,(i3*4) + 1 + 16,(i3*4) + 2 + 16,(i3*4) + 3 + 16
+         };
+
+         return float32x4_t(vqtbl2q_s8((int8x16x2_t){int8x16_t(a),int8x16_t(b)},vld1q_u8(tbl)));
+     }
+# endif //builtin(shufflevector)
+#endif
+}
+
+// Routes the shuffle immediate i to a hand-written NEON sequence when one exists; every other pattern uses the generic version.
+template <int i >
+FORCE_INLINE __m128 _mm_shuffle_ps_function(const __m128& a, const __m128& b)
+{
+  switch (i)
+  {
+    case _MM_SHUFFLE(1, 0, 3, 2):
+      return _mm_shuffle_ps_1032(a, b);
+
+    case _MM_SHUFFLE(2, 3, 0, 1):
+      return _mm_shuffle_ps_2301(a, b);
+
+    case _MM_SHUFFLE(3, 2, 1, 0):
+      return _mm_shuffle_ps_3210(a, b);
+
+    case _MM_SHUFFLE(0, 0, 1, 1):
+      return _mm_shuffle_ps_0011(a, b);
+
+    case _MM_SHUFFLE(0, 0, 2, 2):
+      return _mm_shuffle_ps_0022(a, b);
+
+    case _MM_SHUFFLE(2, 2, 0, 0):
+      return _mm_shuffle_ps_2200(a, b);
+
+    case _MM_SHUFFLE(3, 2, 0, 2):
+      return _mm_shuffle_ps_3202(a, b);
+
+    case _MM_SHUFFLE(1, 1, 3, 3):
+      return _mm_shuffle_ps_1133(a, b);
+
+    case _MM_SHUFFLE(2, 0, 1, 0):
+      return _mm_shuffle_ps_2010(a, b);
+
+    case _MM_SHUFFLE(2, 0, 0, 1):
+      return _mm_shuffle_ps_2001(a, b);
+
+    case _MM_SHUFFLE(2, 0, 3, 2):
+      return _mm_shuffle_ps_2032(a, b);
+
+    case _MM_SHUFFLE(0, 3, 2, 1):
+      return _mm_shuffle_ps_0321(a, b);
+
+    case _MM_SHUFFLE(2, 1, 0, 3):
+      return _mm_shuffle_ps_2103(a, b);
+
+    case _MM_SHUFFLE(1, 0, 1, 0):
+      return _mm_shuffle_ps_1010(a, b);
+
+    case _MM_SHUFFLE(1, 0, 0, 1):
+      return _mm_shuffle_ps_1001(a, b);
+
+    case _MM_SHUFFLE(0, 1, 0, 1):
+      return _mm_shuffle_ps_0101(a, b);
+  }
+  return _mm_shuffle_ps_default<i>(a, b);
+}
+
+# if __has_builtin(__builtin_shufflevector) // the generic template already uses the compiler's vector shuffle, so call it directly
+#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_default<i>(a,b)
+# else // no builtin: go through the dispatcher, which tries the hand-written NEON special cases first
+#define _mm_shuffle_ps(a,b,i) _mm_shuffle_ps_function<i>(a,b)
+#endif
+
+// _MM_SHUFFLE(1,0,3,2): result = { a[2], a[3], b[0], b[1] } (upper half of a below, lower half of b above).
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a, __m128i b) {
+  const int32x2_t lower = vget_high_s32(a);
+  const int32x2_t upper = vget_low_s32(b);
+  return vcombine_s32(lower, upper);
+}
+
+// _MM_SHUFFLE(2,3,0,1): result = { a[1], a[0], b[3], b[2] } (swapped low pair of a below, swapped high pair of b above).
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a, __m128i b) {
+  const int32x2_t lower = vrev64_s32(vget_low_s32(a));
+  const int32x2_t upper = vrev64_s32(vget_high_s32(b));
+  return vcombine_s32(lower, upper);
+}
+
+// Drops lane 0 of a and appends lane 0 of b: result = { a[1], a[2], a[3], b[0] }.
+// When a and b are the same vector this rotates it down by one lane (the least significant lane wraps to the top).
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a, __m128i b) {
+  const int32x4_t shifted = vextq_s32(a, b, 1);
+  return shifted;
+}
+
+// Keeps only lane 3 of a and follows it with the first three lanes of b: result = { a[3], b[0], b[1], b[2] }.
+// When a and b are the same vector this rotates it up by one lane (the most significant lane wraps to the bottom).
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a, __m128i b) {
+  const int32x4_t shifted = vextq_s32(a, b, 3);
+  return shifted;
+}
+
+// _MM_SHUFFLE(1,0,1,0): result = { a[0], a[1], b[0], b[1] }.
+// The lower 64 bits of a fill the lower half of the result and the lower 64 bits of b fill the upper half.
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a, __m128i b)
+{
+  return vcombine_s32(vget_low_s32(a), vget_low_s32(b)); // upper half must come from b; the same as a only when both arguments are one vector
+}
+
+// _MM_SHUFFLE(1,0,0,1): result = { a[1], a[0], b[0], b[1] } (swapped low pair of a below, low pair of b above).
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a, __m128i b) {
+  const int32x2_t lower = vrev64_s32(vget_low_s32(a));
+  const int32x2_t upper = vget_low_s32(b);
+  return vcombine_s32(lower, upper);
+}
+
+// _MM_SHUFFLE(0,1,0,1): result = { a[1], a[0], b[1], b[0] } (swapped low pair of a below, swapped low pair of b above).
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a, __m128i b) {
+  const int32x2_t lower = vrev64_s32(vget_low_s32(a));
+  const int32x2_t upper = vrev64_s32(vget_low_s32(b));
+  return vcombine_s32(lower, upper);
+}
+
+// _MM_SHUFFLE(2,2,1,1): result = { a[1], a[1], b[2], b[2] }.
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a, __m128i b) {
+  return vcombine_s32(vdup_lane_s32(vget_low_s32(a), 1), vdup_lane_s32(vget_high_s32(b), 0));
+}
+
+// _MM_SHUFFLE(0,1,2,2): result = { a[2], a[2], b[1], b[0] }.
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a, __m128i b) {
+  return vcombine_s32(vdup_lane_s32(vget_high_s32(a), 0), vrev64_s32(vget_low_s32(b)));
+}
+
+// _MM_SHUFFLE(3,3,3,2): result = { a[2], a[3], b[3], b[3] }.
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a, __m128i b) {
+  return vcombine_s32(vget_high_s32(a), vdup_lane_s32(vget_high_s32(b), 1));
+}
+
+// Generic two-source 32-bit shuffle: result = { a[i & 3], a[(i >> 2) & 3], b[(i >> 4) & 3], b[(i >> 6) & 3] }.
+template <int i >
+FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __m128i b) {
+  const int sel0 = i & 0x3, sel1 = (i >> 2) & 0x3;
+  const int sel2 = (i >> 4) & 0x3, sel3 = (i >> 6) & 0x3;
+#if ENABLE_CPP_VERSION
+  __m128i out;
+  out[0] = a[sel0]; out[1] = a[sel1];
+  out[2] = b[sel2]; out[3] = b[sel3];
+  return out;
+#else
+  __m128i out = vmovq_n_s32(vgetq_lane_s32(a, sel0));
+  out = vsetq_lane_s32(vgetq_lane_s32(a, sel1), out, 1);
+  out = vsetq_lane_s32(vgetq_lane_s32(b, sel2), out, 2);
+  out = vsetq_lane_s32(vgetq_lane_s32(b, sel3), out, 3);
+  return out;
+#endif
+}
+
+// Routes the shuffle immediate i to a dedicated NEON helper when one exists;
+// any other pattern is assembled lane by lane by _mm_shuffle_epi32_default.
+template <int i >
+FORCE_INLINE __m128i _mm_shuffle_epi32_function(__m128i a, __m128i b) {
+  switch (i) {
+    case _MM_SHUFFLE(1, 0, 3, 2): return _mm_shuffle_epi_1032(a, b);
+    case _MM_SHUFFLE(2, 3, 0, 1): return _mm_shuffle_epi_2301(a, b);
+    case _MM_SHUFFLE(0, 3, 2, 1): return _mm_shuffle_epi_0321(a, b);
+    case _MM_SHUFFLE(2, 1, 0, 3): return _mm_shuffle_epi_2103(a, b);
+    case _MM_SHUFFLE(1, 0, 1, 0): return _mm_shuffle_epi_1010(a, b);
+    case _MM_SHUFFLE(1, 0, 0, 1): return _mm_shuffle_epi_1001(a, b);
+    case _MM_SHUFFLE(0, 1, 0, 1): return _mm_shuffle_epi_0101(a, b);
+    case _MM_SHUFFLE(2, 2, 1, 1): return _mm_shuffle_epi_2211(a, b);
+    case _MM_SHUFFLE(0, 1, 2, 2): return _mm_shuffle_epi_0122(a, b);
+    case _MM_SHUFFLE(3, 3, 3, 2): return _mm_shuffle_epi_3332(a, b);
+    default:                      return _mm_shuffle_epi32_default<i>(a, b);
+  }
+}
+
+// Broadcasts lane i of a to all four 32-bit lanes.
+template <int i >
+FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a) {
+  return vdupq_n_s32(vgetq_lane_s32(a, i));
+}
+
+// Single-source shuffle entry point: the four "same lane everywhere" immediates become a
+// broadcast; everything else is forwarded to the two-source dispatcher with a as both inputs.
+template <int i>
+FORCE_INLINE __m128i _mm_shuffle_epi32_single(__m128i a) {
+  switch (i) {
+    case _MM_SHUFFLE(0, 0, 0, 0): return _mm_shuffle_epi32_splat<0>(a);
+    case _MM_SHUFFLE(1, 1, 1, 1): return _mm_shuffle_epi32_splat<1>(a);
+    case _MM_SHUFFLE(2, 2, 2, 2): return _mm_shuffle_epi32_splat<2>(a);
+    case _MM_SHUFFLE(3, 3, 3, 3): return _mm_shuffle_epi32_splat<3>(a);
+    default:                      return _mm_shuffle_epi32_function<i>(a, a);
+  }
+}
+
+// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by the immediate i (passed on as a template argument). https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
+#define _mm_shuffle_epi32(a,i) _mm_shuffle_epi32_single<i>(a)
+
+// Permutes the upper four 16-bit lanes of a using the 2-bit selectors packed in i; lanes 0-3 are kept as they are.
+template <int i>
+FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a) {
+  const int16x4_t upper = vget_high_s16((int16x8_t)a);
+  int16x8_t out = (int16x8_t)a;
+  out = vsetq_lane_s16(vget_lane_s16(upper, (i >> 0) & 0x3), out, 4);
+  out = vsetq_lane_s16(vget_lane_s16(upper, (i >> 2) & 0x3), out, 5);
+  out = vsetq_lane_s16(vget_lane_s16(upper, (i >> 4) & 0x3), out, 6);
+  out = vsetq_lane_s16(vget_lane_s16(upper, (i >> 6) & 0x3), out, 7);
+  return (__m128i)out;
+}
+
+// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified by the immediate i (passed on as a template argument).  https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
+#define _mm_shufflehi_epi16(a,i) _mm_shufflehi_epi16_function<i>(a)
+
+// Shifts the 4 signed or unsigned 32-bit integers in a left by imm8 bits while shifting in zeros; a count above 31 yields all zeros. : https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
+//#define _mm_slli_epi32(a, imm) (__m128i)vshlq_n_s32(a,imm)
+
+// Based on SIMDe
+FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, const int imm8)
+{
+#if defined(__aarch64__)
+    const int32x4_t s = vdupq_n_s32(imm8);
+    return vshlq_s32(a, s);
+#else
+  // Scalar fallback. Work on unsigned lanes: left-shifting a negative int32_t is undefined behaviour.
+  uint32_t __attribute__((aligned(16))) data[4];
+  vst1q_u32(data, vreinterpretq_u32_s32(a));
+  // A count above 31 clears every lane, as on SSE and in the branch above (a C shift by >= 32 is undefined).
+  const bool overflow = imm8 > 31;
+  for (int lane = 0; lane < 4; ++lane) {
+    data[lane] = overflow ? 0u : (data[lane] << imm8);
+  }
+  return vreinterpretq_s32_u32(vld1q_u32(data));
+#endif
+}
+
+
+//Shifts the 4 signed or unsigned 32-bit integers in a right by imm8 bits while shifting in zeros (a logical shift).  https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx
+//#define _mm_srli_epi32( a, imm ) (__m128i)vshrq_n_u32((uint32x4_t)a, imm)
+
+// Based on SIMDe
+FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, const int imm8)
+{
+#if defined(__aarch64__)
+    const int shift = (imm8 > 31) ? 0 : imm8;  // Unfortunately, we need to check for this case for embree.
+    const int32x4_t s = vdupq_n_s32(-shift);
+    return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(a), s));
+#else
+  // Scalar fallback. The lanes are held as unsigned so that >> shifts in zeros;
+  // on int32_t it would replicate the sign bit and disagree with the branch above.
+  uint32_t __attribute__((aligned(16))) data[4];
+  vst1q_u32(data, vreinterpretq_u32_s32(a));
+
+  const int s = (imm8 > 31) ? 0 : imm8; // same treatment of oversized counts as the branch above
+  for (int lane = 0; lane < 4; ++lane) {
+    data[lane] = data[lane] >> s;
+  }
+
+  return vreinterpretq_s32_u32(vld1q_u32(data));
+#endif
+}
+
+
+// Shifts the 4 signed 32-bit integers in a right by imm8 bits while shifting in the sign bit.  https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx
+//#define _mm_srai_epi32( a, imm ) vshrq_n_s32(a, imm)
+
+// Based on SIMDe
+FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, const int imm8)
+{
+#if defined(__aarch64__)
+    const int32x4_t s = vdupq_n_s32(-imm8);
+    return vshlq_s32(a, s);
+#else
+  uint32_t __attribute__((aligned(16))) data[4];
+  vst1q_u32(data, vreinterpretq_u32_s32(a));
+  // Counts above 31 act like 31 (every bit becomes the sign bit); clamping also keeps the C shifts below defined.
+  const int s = (imm8 > 31) ? 31 : imm8;
+  for (int i = 0; i < 4; i++) {
+    // All-ones for a negative lane, zero otherwise; its top s bits replace the zeros that >> shifted in.
+    const uint32_t sign = 0u - (data[i] >> 31);
+    data[i] = (s == 0) ? data[i] : ((data[i] >> s) | (sign << (32 - s)));
+  }
+  return vreinterpretq_s32_u32(vld1q_u32(data));
+#endif
+}
+
+// Shifts the 128-bit value in a right by imm bytes while shifting in zeros. imm must be an immediate; it is passed straight to vextq_s8 as the byte offset. https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
+//#define _mm_srli_si128( a, imm ) (__m128i)vmaxq_s8((int8x16_t)a, vextq_s8((int8x16_t)a, vdupq_n_s8(0), imm))
+#define _mm_srli_si128( a, imm ) (__m128i)vextq_s8((int8x16_t)(a), vdupq_n_s8(0), (imm))
+
+// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate; 16 - imm is passed to vextq_s8 as the byte offset.  https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
+#define _mm_slli_si128( a, imm ) (__m128i)vextq_s8(vdupq_n_s8(0), (int8x16_t)(a), 16 - (imm))
+
+// NEON has no equivalent of this function; the following article discusses ways to reproduce its result:
+// http://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
+// Creates a 16-bit mask from the most significant bits of the 16 signed or unsigned 8-bit integers in a and zero extends the upper bits. https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_epi8(__m128i _a)
+{
+  const uint8x16_t bytes = (uint8x16_t)_a;
+  // Per-lane shift counts for vshl_u8. They are negative, i.e. right shifts, chosen so that
+  // bit 7 of byte k ends up at bit k of that byte.
+  const int8_t __attribute__((aligned(16))) xr[8] = { -7, -6, -5, -4, -3, -2, -1, 0 };
+  const int8x8_t lane_shift = vld1_s8(xr);
+  const uint8x8_t sign_only = vdup_n_u8(0x80);
+
+  // isolate the top bit of every byte and move it to its own bit position, one 8-byte half at a time
+  uint8x8_t low_half = vget_low_u8(bytes);
+  low_half = vand_u8(low_half, sign_only);
+  low_half = vshl_u8(low_half, lane_shift);
+  uint8x8_t high_half = vget_high_u8(bytes);
+  high_half = vand_u8(high_half, sign_only);
+  high_half = vshl_u8(high_half, lane_shift);
+
+  // three rounds of pairwise adds accumulate all eight lanes of each half into its lane 0
+  for (int round = 0; round < 3; ++round) {
+    low_half = vpadd_u8(low_half, low_half);
+    high_half = vpadd_u8(high_half, high_half);
+  }
+
+  // low half supplies mask bits 0-7, high half supplies bits 8-15
+  return ((high_half[0] << 8) | (low_half[0] & 0xFF));
+}
+
+
+// ******************************************
+// Math operations
+// ******************************************
+
+// Lane-wise a - b over the four single-precision values. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) {
+  const float32x4_t difference = vsubq_f32(a, b);
+  return difference;
+}
+
+// Subtracts lane 0 of b from lane 0 of a; lanes 1-3 of the result are copied from a unchanged (scalar form, like _mm_add_ss).
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) {
+  return vsetq_lane_f32(vgetq_lane_f32(vsubq_f32(a, b), 0), a, 0);
+}
+
+// Lane-wise a - b over the 4 signed or unsigned 32-bit integers. https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) {
+  const int32x4_t difference = vsubq_s32(a, b);
+  return difference;
+}
+
+// Lane-wise a + b over the four single-precision values. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) {
+  const float32x4_t sum = vaddq_f32(a, b);
+  return sum;
+}
+
+// Adds lane 0 of a and b; lanes 1-3 of the result are copied from a unchanged.  https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+  const float32x4_t sum = vaddq_f32(a, b);
+
+  // The upper values in the result must be the remnants of <a>. Insert only lane 0
+  // of the sum into a rather than adding a zero-padded b: a + 0.0f would turn a
+  // -0.0f upper lane into +0.0f.
+  return vsetq_lane_f32(vgetq_lane_f32(sum, 0), a, 0);
+}
+
+// Lane-wise a + b over the 4 signed or unsigned 32-bit integers. https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) {
+  const int32x4_t sum = vaddq_s32(a, b);
+  return sum;
+}
+
+// Lane-wise a + b over the 8 signed or unsigned 16-bit integers. https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) {
+  const int16x8_t sum = vaddq_s16(vreinterpretq_s16_s32(a), vreinterpretq_s16_s32(b));
+  return vreinterpretq_s32_s16(sum);
+}
+
+// Lane-wise 16-bit product of the 8 signed or unsigned 16-bit integers in a and b. https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) {
+  const int16x8_t product = vmulq_s16(vreinterpretq_s16_s32(a), vreinterpretq_s16_s32(b));
+  return vreinterpretq_s32_s16(product);
+}
+
+// Lane-wise 32-bit product of the 4 signed or unsigned 32-bit integers in a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi32 (__m128i a, __m128i b) {
+  const int32x4_t product = vmulq_s32(a, b);
+  return product;
+}
+
+// Lane-wise a * b over the four single-precision values. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) {
+  const float32x4_t product = vmulq_f32(a, b);
+  return product;
+}
+
+// Multiplies lane 0 of a by lane 0 of b; lanes 1-3 of the result are copied from a unchanged (scalar form, like _mm_add_ss).
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) {
+  return vsetq_lane_f32(vgetq_lane_f32(vmulq_f32(a, b), 0), a, 0);
+}
+
+// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+#if defined(BUILD_IOS)
+  return vdivq_f32(vdupq_n_f32(1.0f),in);
+#else
+  // The two paths are exclusive, so the estimate code is not compiled as dead code after the return above.
+  // Get an initial estimate of 1/in.
+  float32x4_t reciprocal = vrecpeq_f32(in);
+
+  // Refine the estimate with two Newton-Raphson steps: each multiplies the current
+  // value by the correction factor from vrecpsq_f32(in, reciprocal). Callers that
+  // need more accuracy (_mm_div_ps) run further steps on top of these.
+  reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal);
+  reciprocal = vmulq_f32(vrecpsq_f32(in, reciprocal), reciprocal);
+
+  return reciprocal;
+#endif
+}
+
+// Approximate reciprocal of lane 0 only; lanes 1-3 of in pass through unchanged.
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 in) {
+  // run the four-lane routine, then keep just its lowest lane
+  const float32x4_t all_lanes = _mm_rcp_ps(in);
+  const float low = vgetq_lane_f32(all_lanes, 0);
+
+  return vsetq_lane_f32(low, in, 0);
+}
+
+// Lane-wise a / b over the four single-precision values. https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(BUILD_IOS)
+  return vdivq_f32(a,b);
+#else
+  // The quotient is formed as a * (1/b), starting from the _mm_rcp_ps approximation of 1/b.
+  float32x4_t inv_b = _mm_rcp_ps(b);
+
+  // Four further Newton-Raphson refinements: two as a baseline, a third because
+  // NEON's reciprocal estimation has less accuracy than SSE2's rcp, and a fourth
+  // for safety.
+  for (int step = 0; step < 4; ++step) {
+    const float32x4_t correction = vrecpsq_f32(b, inv_b);
+    inv_b = vmulq_f32(correction, inv_b);
+  }
+
+  const float32x4_t quotient = vmulq_f32(a, inv_b);
+  return quotient;
+#endif
+}
+
+// Divides lane 0 of a by lane 0 of b; lanes 1-3 of a pass through unchanged.  https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) {
+  // run the four-lane division, then keep just its lowest lane
+  const float32x4_t quotients = _mm_div_ps(a, b);
+  const float low = vgetq_lane_f32(quotients, 0);
+
+  return vsetq_lane_f32(low, a, 0);
+}
+
+// Approximates 1/sqrt(x) for each of the four single-precision values of in.  https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+  // initial estimate; refined by the loop below
+  float32x4_t estimate = vrsqrteq_f32(in);
+
+  // TODO: We must debug and ensure that rsqrt(0) and rsqrt(-0) yield proper values.
+  // Related code snippets can be found here: https://cpp.hotexamples.com/examples/-/-/vrsqrteq_f32/cpp-vrsqrteq_f32-function-examples.html
+  // If we adapt this function, we might be able to avoid special zero treatment in _mm_sqrt_ps
+
+  // Four refinement rounds: two as a baseline, one more for better precision and
+  // another for safety. Each round multiplies the estimate by
+  // vrsqrtsq_f32(in * estimate, estimate).
+  for (int round = 0; round < 4; ++round) {
+    const float32x4_t scaled = vmulq_f32(in, estimate);
+    const float32x4_t correction = vrsqrtsq_f32(scaled, estimate);
+    estimate = vmulq_f32(estimate, correction);
+  }
+
+  return estimate;
+}
+
+// Approximate reciprocal square root of lane 0 only; lanes 1-3 of in pass through unchanged.
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) {
+  // run the four-lane routine, then keep just its lowest lane
+  const __m128 all_lanes = _mm_rsqrt_ps(in);
+  const float low = vgetq_lane_f32(all_lanes, 0);
+
+  return vsetq_lane_f32(low, in, 0);
+}
+
+
+// Square roots of the four single-precision values of in; outside of iOS builds they are approximated as in * (1/sqrt(in)). https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+{
+#if defined(BUILD_IOS)
+  return vsqrtq_f32(in);
+#else
+  const __m128 inv_root = _mm_rsqrt_ps(in);
+  const float32x4_t zeros = vdupq_n_f32(0.0f);
+
+  // Lanes where in == 0 need special treatment: there inv_root holds garbage because
+  // vrsqrteq_f32(0) returns +inf, so 0 is substituted for it in exactly those lanes.
+  const uint32x4_t is_zero = vceqq_f32(in, zeros);
+  const float32x4_t safe_inv_root = vbslq_f32(is_zero, zeros, inv_root);
+
+  // sqrt(x) = x * (1 / sqrt(x))
+  return vmulq_f32(in, safe_inv_root);
+#endif
+}
+
+// Approximate square root of lane 0 only; lanes 1-3 of in pass through unchanged.  https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+  // run the four-lane routine, then keep just its lowest lane
+  const float32x4_t all_lanes = _mm_sqrt_ps(in);
+  const float low = vgetq_lane_f32(all_lanes, 0);
+
+  return vsetq_lane_f32(low, in, 0);
+}
+
+
+// Lane-wise maximum of the four single-precision values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) {
+#if USE_PRECISE_MINMAX_IMPLEMENTATION
+  const uint32x4_t a_is_greater = vcltq_f32(b, a);
+  return vbslq_f32(a_is_greater, a, b); // b is chosen in every lane where b < a does not hold
+#else
+  // Faster, but would give inconsistent rendering (e.g. holes, NaN pixels)
+  return vmaxq_f32(a, b);
+#endif
+}
+
+// Lane-wise minimum of the four single-precision values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) {
+#if USE_PRECISE_MINMAX_IMPLEMENTATION
+  const uint32x4_t a_is_smaller = vcltq_f32(a, b);
+  return vbslq_f32(a_is_smaller, a, b); // b is chosen in every lane where a < b does not hold
+#else
+  // Faster, but would give inconsistent rendering (e.g. holes, NaN pixels)
+  return vminq_f32(a, b);
+#endif
+}
+
+// Maximum of lane 0 of a and lane 0 of b; lanes 1-3 of a pass through unchanged.  https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
+{
+  // run the four-lane maximum, then keep just its lowest lane
+  const float32x4_t all_lanes = _mm_max_ps(a, b);
+  const float low = vgetq_lane_f32(all_lanes, 0);
+
+  return vsetq_lane_f32(low, a, 0);
+}
+
+// Minimum of lane 0 of a and lane 0 of b; lanes 1-3 of a pass through unchanged.  https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+{
+  // run the four-lane minimum, then keep just its lowest lane
+  const float32x4_t all_lanes = _mm_min_ps(a, b);
+  const float low = vgetq_lane_f32(all_lanes, 0);
+
+  // everything above lane 0 is taken from a
+  return vsetq_lane_f32(low, a, 0);
+}
+
+// Lane-wise minimum of the 8 signed 16-bit integers in a and b. https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) {
+  const int16x8_t smaller = vminq_s16(vreinterpretq_s16_s32(a), vreinterpretq_s16_s32(b));
+  return vreinterpretq_s32_s16(smaller);
+}
+
+// epi versions of min/max
+// Lane-wise maximum of the four signed 32-bit integers in a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b ) {
+  const int32x4_t larger = vmaxq_s32(a, b);
+  return larger;
+}
+
+// Lane-wise minimum of the four signed 32-bit integers in a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b ) {
+  const int32x4_t smaller = vminq_s32(a, b);
+  return smaller;
+}
+
+// Multiplies the 8 signed 16-bit integers from a by those from b and keeps the upper 16 bits of each 32-bit product. Uses widening multiplies: a saturating doubling multiply gets -32768 * -32768 wrong. https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) {
+  const int16x8_t x = (int16x8_t)a, y = (int16x8_t)b;
+  const int32x4_t lo = vmull_s16(vget_low_s16(x), vget_low_s16(y));   // full 32-bit products of lanes 0-3
+  const int32x4_t hi = vmull_s16(vget_high_s16(x), vget_high_s16(y)); // full 32-bit products of lanes 4-7
+  return (__m128i)vcombine_s16(vshrn_n_s32(lo, 16), vshrn_n_s32(hi, 16));
+}
+
+// Horizontal pairwise add: result = { a[0]+a[1], a[2]+a[3], b[0]+b[1], b[2]+b[3] }.
+//https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b )
+{
+#if defined(__aarch64__)
+    const float32x4_t pair_sums = vpaddq_f32(a,b);
+    return pair_sums;
+#else
+    // vpaddq_f32 does not work here (no 128-bit pairwise add), so the result is
+    // built from two 64-bit pairwise adds instead.
+    //
+    // a: split into its two float32x2_t halves and vpadd them
+    //    -> becomes the low half of the result
+    const float32x2_t from_a = vpadd_f32( vget_low_f32(a), vget_high_f32(a) );
+    // b: split the same way and vpadd
+    //    -> becomes the high half of the result
+    const float32x2_t from_b = vpadd_f32( vget_low_f32(b), vget_high_f32(b) );
+    //
+    // combine the two halves
+    return vcombine_f32( from_a, from_b );
+#endif
+}
+
+// ******************************************
+// Compare operations
+// ******************************************
+
+// Lane-wise a < b; a true lane is all ones, a false lane all zeros. https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+  return vreinterpretq_f32_u32(vcltq_f32(a, b));
+}
+
+FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
+{
+  return vreinterpretq_f32_u32(vmvnq_u32(vcltq_f32(a, b)));  // bitwise NOT of the a < b mask
+}
+
+// Lane-wise a > b; a true lane is all ones, a false lane all zeros. https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+  return vreinterpretq_f32_u32(vcgtq_f32(a, b));
+}
+
+// Lane-wise NOT(a <= b). Inverting the <= mask (rather than testing a > b) makes lanes that contain a NaN compare true.
+FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
+{
+  return vreinterpretq_f32_u32(vmvnq_u32(vcleq_f32(a, b)));
+}
+
+// Lane-wise a >= b; a true lane is all ones, a false lane all zeros. https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+  return vreinterpretq_f32_u32(vcgeq_f32(a, b));
+}
+
+// Lane-wise a <= b; a true lane is all ones, a false lane all zeros. https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+{
+  return vreinterpretq_f32_u32(vcleq_f32(a, b));
+}
+
+// Lane-wise a == b; a true lane is all ones, a false lane all zeros. https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
+{
+  return vreinterpretq_f32_u32(vceqq_f32(a, b));
+}
+
+// Lane-wise signed 32-bit a < b; a true lane is all ones, a false lane all zeros. https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
+{
+  return vreinterpretq_s32_u32(vcltq_s32(a, b));
+}
+
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+  return vreinterpretq_s32_u32(vceqq_s32(a, b));  // all-ones lane where a == b, zero lane elsewhere
+}
+
+// Lane-wise signed 32-bit a > b; a true lane is all ones, a false lane all zeros. https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
+{
+  return vreinterpretq_s32_u32(vcgtq_s32(a, b));
+}
+
+// Ordered compare: a result lane is all ones when neither a nor b holds a NaN in that lane, and all zeros otherwise. https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx
+// see also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
+{
+  // NEON has no ordered compare; a value compares equal to itself only when it is not NaN.
+  const uint32x4_t a_is_number = vceqq_f32(a, a);
+  const uint32x4_t b_is_number = vceqq_f32(b, b);
+  return vreinterpretq_f32_u32(vandq_u32(a_is_number, b_is_number));
+}
+
+// Returns 1 if the lower single-precision scalar of a is less than that of b, otherwise 0. : https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
+{
+  const uint32x4_t mask = vcltq_f32(a, b);
+
+  // A true NEON compare lane is all ones, which would read as -1 through the int return; normalize to 1.
+  return vgetq_lane_u32(mask, 0) != 0 ? 1 : 0;
+}
+
+// Returns 1 if the lower single-precision scalar of a is greater than that of b, otherwise 0. : https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
+FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
+{
+  const uint32x4_t mask = vcgtq_f32(a, b);
+
+  // A true NEON compare lane is all ones, which would read as -1 through the int return; normalize to 1.
+  return vgetq_lane_u32(mask, 0) != 0 ? 1 : 0;
+}
+
+// Returns 1 if the lower single-precision scalar of a is less than or equal to that of b, otherwise 0. : https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
+FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
+{
+  const uint32x4_t mask = vcleq_f32(a, b);
+
+  // A true NEON compare lane is all ones, which would read as -1 through the int return; normalize to 1.
+  return vgetq_lane_u32(mask, 0) != 0 ? 1 : 0;
+}
+
+// Returns 1 if the lower single-precision scalar of a is greater than or equal to that of b, otherwise 0. : https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
+FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
+{
+  const uint32x4_t mask = vcgeq_f32(a, b);
+
+  // A true NEON compare lane is all ones, which would read as -1 through the int return; normalize to 1.
+  return vgetq_lane_u32(mask, 0) != 0 ? 1 : 0;
+}
+
+// Returns 1 if the lower single-precision scalars of a and b compare equal, otherwise 0. : https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
+FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
+{
+  const uint32x4_t mask = vceqq_f32(a, b);
+
+  // A true NEON compare lane is all ones, which would read as -1 through the int return; normalize to 1.
+  return vgetq_lane_u32(mask, 0) != 0 ? 1 : 0;
+}
+
+// Returns 1 if the lower single-precision scalars of a and b do not compare equal, otherwise 0. : https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
+FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+{
+  const uint32x4_t equal_mask = vceqq_f32(a, b);
+
+  // Lane 0 of the equality mask is zero exactly when the two scalars did not compare equal.
+  return vgetq_lane_u32(equal_mask, 0) == 0;
+}
+
+// According to the documentation, the 'u' (unordered) variants return the same results as the plain comi versions, so each one is simply aliased to its counterpart here.
+#define _mm_ucomilt_ss      _mm_comilt_ss
+#define _mm_ucomile_ss      _mm_comile_ss
+#define _mm_ucomigt_ss      _mm_comigt_ss
+#define _mm_ucomige_ss      _mm_comige_ss
+#define _mm_ucomieq_ss      _mm_comieq_ss
+#define _mm_ucomineq_ss     _mm_comineq_ss
+
+// ******************************************
+// Conversions
+// ******************************************
+
+// Converts the four single-precision, floating-point values of a to signed 32-bit integer values, truncating toward zero. https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
+{
+  return vcvtq_s32_f32(a);
+}
+
+// Converts the four signed 32-bit integer values of a to single-precision, floating-point values (a numeric conversion, not a bit cast). https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+{
+  return vcvtq_f32_s32(a);
+}
+
+// Converts the four single-precision, floating-point values of a to signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ArmV7 does not support!
+// It is supported on ARMv8 (vcvtnq_s32_f32), and that is the path currently compiled in.
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
+{
+#if 1
+  return vcvtnq_s32_f32(a);
+#else
+  // Disabled fallback: add 0.5, subtract 1.0 where the sign bit is set, then truncate.
+  const __m128 sign_as_float = vcvtq_f32_u32(vshrq_n_u32(vreinterpretq_u32_f32(a), 31));
+  const __m128 biased = vaddq_f32(a, vdupq_n_f32(0.5f));
+  const __m128 adjusted = vsubq_f32(biased, sign_as_float);
+  return vcvtq_s32_f32(adjusted);
+#endif
+}
+
+// Returns lane 0 (the least significant 32 bits) of a as a 32-bit integer. https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx
+FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
+{
+  return vgetq_lane_s32(a, 0);
+}
+
+// Places the 32-bit integer a in the least significant 32 bits of the result and zeroes the upper bits. https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+{
+  // Start from an all-zero vector so lanes 1-3 stay zero, then write a into lane 0.
+  return vsetq_lane_s32(a, vdupq_n_s32(0), 0);
+}
+
+
+// Applies a type cast to reinterpret four 32-bit floating point values passed in as a 128-bit parameter as packed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514099.aspx
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+  // vreinterpretq is a pure bit-pattern reinterpretation (no value
+  // conversion) and exists on both 32-bit and 64-bit ARM, so a single code
+  // path suffices. It also avoids type punning through a pointer cast, which
+  // the non-AArch64 path used to rely on.
+  return vreinterpretq_s32_f32(a);
+}
+
+// Applies a type cast to reinterpret four 32-bit integers passed in as a 128-bit parameter as packed 32-bit floating point values. https://msdn.microsoft.com/en-us/library/bb514029.aspx
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+  // vreinterpretq is a pure bit-pattern reinterpretation (no value
+  // conversion) and exists on both 32-bit and 64-bit ARM, so a single code
+  // path suffices. It also avoids type punning through a pointer cast, which
+  // the non-AArch64 path used to rely on.
+  return vreinterpretq_f32_s32(a);
+}
+
+// Loads a 128-bit value from p as four 32-bit lanes. : https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+{
+  return vld1q_s32((const int32_t *)p);
+}
+
+FORCE_INLINE __m128d _mm_castps_pd(const __m128 a)
+{
+  return *(const __m128d *)&a;  // reads the same 128 bits back as __m128d; no value conversion
+}
+
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+{
+  return *(const __m128d *)&a;  // reads the same 128 bits back as __m128d; no value conversion
+}
+// ******************************************
+// Miscellaneous Operations
+// ******************************************
+
+// Narrows the 16 signed 16-bit integers from a and b to signed 8-bit integers with saturation; a fills the low half, b the high half. https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+{
+  return vreinterpretq_s32_s8(vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(a)), vqmovn_s16(vreinterpretq_s16_s32(b))));
+}
+
+// Narrows the 16 signed 16-bit integers from a and b to unsigned 8-bit integers with saturation; a fills the low half, b the high half. https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+{
+  return vreinterpretq_s32_u8(vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(a)), vqmovun_s16(vreinterpretq_s16_s32(b))));
+}
+
+// Narrows the 8 signed 32-bit integers from a and b to signed 16-bit integers with saturation; a fills the low half, b the high half. https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+{
+  return vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(a), vqmovn_s32(b)));
+}
+
+// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b.  https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
+{
+  // Take the low 64 bits of each input as eight bytes and zip them: a0, b0, a1, b1, ...
+  const int8x8_t a_low = vget_low_s8(vreinterpretq_s8_s32(a));
+  const int8x8_t b_low = vget_low_s8(vreinterpretq_s8_s32(b));
+
+  const int8x8x2_t zipped = vzip_s8(a_low, b_low);
+  return vreinterpretq_s32_s8(vcombine_s8(zipped.val[0], zipped.val[1]));
+}
+
+// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b.  https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
+{
+  // Take the low four 16-bit lanes of each input and zip them: a0, b0, a1, b1, ...
+  const int16x4_t a_low = vget_low_s16(vreinterpretq_s16_s32(a));
+  const int16x4_t b_low = vget_low_s16(vreinterpretq_s16_s32(b));
+
+  const int16x4x2_t zipped = vzip_s16(a_low, b_low);
+  return vreinterpretq_s32_s16(vcombine_s16(zipped.val[0], zipped.val[1]));
+}
+
+// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the lower 2 signed or unsigned 32-bit integers in b.  https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
+{
+  // Zip the low two 32-bit lanes of each input: a0, b0, a1, b1.
+  const int32x2_t a_low = vget_low_s32(a);
+  const int32x2_t b_low = vget_low_s32(b);
+
+  const int32x2x2_t zipped = vzip_s32(a_low, b_low);
+  return vcombine_s32(zipped.val[0], zipped.val[1]);
+}
+
+// Interleaves the lower two single-precision values of a and b: { a0, b0, a1, b1 }. https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
+{
+  const float32x2x2_t zipped = vzip_f32(vget_low_f32(a), vget_low_f32(b));  // val[0] = { a0, b0 }, val[1] = { a1, b1 }
+  return vcombine_f32(zipped.val[0], zipped.val[1]);
+}
+
+// Interleaves the upper two single-precision values of a and b: { a2, b2, a3, b3 }. https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
+{
+  const float32x2x2_t zipped = vzip_f32(vget_high_f32(a), vget_high_f32(b));  // val[0] = { a2, b2 }, val[1] = { a3, b3 }
+  return vcombine_f32(zipped.val[0], zipped.val[1]);
+}
+
+// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b.  https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
+{
+  // Take the high 64 bits of each input as eight bytes and zip them: a8, b8, a9, b9, ...
+  const int8x8_t a_high = vget_high_s8(vreinterpretq_s8_s32(a));
+  const int8x8_t b_high = vget_high_s8(vreinterpretq_s8_s32(b));
+
+  const int8x8x2_t zipped = vzip_s8(a_high, b_high);
+  return vreinterpretq_s32_s8(vcombine_s8(zipped.val[0], zipped.val[1]));
+}
+
+// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b.  https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
+{
+  // Take the high four 16-bit lanes of each input and zip them: a4, b4, a5, b5, ...
+  const int16x4_t a_high = vget_high_s16(vreinterpretq_s16_s32(a));
+  const int16x4_t b_high = vget_high_s16(vreinterpretq_s16_s32(b));
+
+  const int16x4x2_t zipped = vzip_s16(a_high, b_high);
+  return vreinterpretq_s32_s16(vcombine_s16(zipped.val[0], zipped.val[1]));
+}
+
+// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the upper 2 signed or unsigned 32-bit integers in b.  https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
+{
+  // Zip the high two 32-bit lanes of each input: a2, b2, a3, b3.
+  const int32x2_t a_high = vget_high_s32(a);
+  const int32x2_t b_high = vget_high_s32(b);
+
+  const int32x2x2_t zipped = vzip_s32(a_high, b_high);
+  return vcombine_s32(zipped.val[0], zipped.val[1]);
+}
+
+// Extracts the selected signed or unsigned 16-bit integer from a and zero extends; the lane is read as unsigned so that a lane with its top bit set does not sign extend into the int result.  https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
+#define _mm_extract_epi16( a, imm ) ((int)vgetq_lane_u16((uint16x8_t)(a), (imm)))
+
+// ******************************************
+// Streaming Extensions
+// ******************************************
+
+// Guarantees that every preceding store is globally visible before any subsequent store.  https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_sfence(void)
+{
+  __sync_synchronize();  // compiler builtin that issues a full memory barrier, which covers the store-store ordering required here
+}
+
+// SSE semantics: stores the data in a to the address p without polluting the caches. If the cache line containing address p is already in the cache, the cache will be updated. Address p must be 16-byte aligned.  https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
+{
+  *p = a;  // implemented here as an ordinary store; no non-temporal hint is applied
+}
+
+// SSE semantics: the cache line containing p is flushed and invalidated from all caches in the coherency domain. : https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
+FORCE_INLINE void _mm_clflush(void const*p)
+{
+  (void)p;  // deliberately does nothing: no corollary for Neon?
+}
+
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t a, int64_t b)
+{
+  // Keep the flipped x86 argument order: b lands in the low 64-bit lane, a in the high one.
+  const int64x1_t low_lane = vdup_n_s64(b);
+  return (__m128i)vcombine_s64(low_lane, vdup_n_s64(a));
+}
+
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+{
+  return (__m128i)vdupq_n_s64(_i);  // broadcast _i into both 64-bit lanes
+}
+
+#if defined(__aarch64__)
+FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 c)
+{
+    // An arithmetic shift by 31 smears each lane's sign bit across the lane: sign set selects b, sign clear selects a.
+    const uint32x4_t take_b = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(c), 31));
+    return vbslq_f32(take_b, b, a);
+}
+
+FORCE_INLINE __m128i _mm_load4epu8_epi32(__m128i *ptr)
+{
+    const uint8x8_t  bytes  = vld1_u8((uint8_t*)ptr);           // note: reads 8 bytes, only the first 4 are used
+    const uint16x8_t halves = vmovl_u8(bytes);                  // zero-extend 8 -> 16 bit
+    const uint32x4_t words  = vmovl_u16(vget_low_u16(halves));  // zero-extend the low four lanes 16 -> 32 bit
+    return vreinterpretq_s32_u32(words);
+}
+
+FORCE_INLINE __m128i _mm_load4epu16_epi32(__m128i *ptr)
+{
+    const uint16x8_t halves = vld1q_u16((uint16_t*)ptr);        // note: reads 8 halfwords, only the first 4 are used
+    const uint32x4_t words  = vmovl_u16(vget_low_u16(halves));  // zero-extend the low four lanes 16 -> 32 bit
+    return vreinterpretq_s32_u32(words);
+}
+
+FORCE_INLINE __m128i _mm_load4epi8_f32(__m128i *ptr)
+{
+    const int8x8_t    bytes  = vld1_s8((int8_t*)ptr);            // note: reads 8 bytes, only the first 4 are used
+    const int16x8_t   halves = vmovl_s8(bytes);                  // sign-extend 8 -> 16 bit
+    const int32x4_t   words  = vmovl_s16(vget_low_s16(halves));  // sign-extend the low four lanes 16 -> 32 bit
+    const float32x4_t values = vcvtq_f32_s32(words);             // numeric int -> float conversion
+    return vreinterpretq_s32_f32(values);                        // float bit patterns carried in an __m128i
+}
+
+FORCE_INLINE __m128i _mm_load4epu8_f32(__m128i *ptr)
+{
+    // Zero-extend four unsigned bytes to 32 bits, then convert them to float as the sibling *_f32 loaders do; the float bit patterns are carried in an __m128i.
+    const uint16x8_t halves = vmovl_u8(vld1_u8((uint8_t*)ptr));  // note: reads 8 bytes, only the first 4 are used
+    const uint32x4_t words  = vmovl_u16(vget_low_u16(halves));
+    return vreinterpretq_s32_f32(vcvtq_f32_u32(words));
+}
+
+FORCE_INLINE __m128i _mm_load4epi16_f32(__m128i *ptr)
+{
+    const int16x8_t   halves = vld1q_s16((int16_t*)ptr);         // note: reads 8 halfwords, only the first 4 are used
+    const int32x4_t   words  = vmovl_s16(vget_low_s16(halves));  // sign-extend the low four lanes 16 -> 32 bit
+    const float32x4_t values = vcvtq_f32_s32(words);             // numeric int -> float conversion
+    return vreinterpretq_s32_f32(values);                        // float bit patterns carried in an __m128i
+}
+
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(a), vqmovun_s32(b)));  // saturate the signed 32-bit lanes of a (low half) and b (high half) to unsigned 16-bit
+}
+
+FORCE_INLINE __m128i _mm_stream_load_si128(__m128i* ptr)
+{
+    // No non-temporal load on a single register on ARM, so this is an ordinary 16-byte load.
+    return vreinterpretq_s32_u8(vld1q_u8((uint8_t*)ptr));
+}
+
+FORCE_INLINE void _mm_stream_ps(float* ptr, __m128i a)
+{
+    // No non-temporal store on a single register on ARM, so this is an ordinary store. Note that a arrives as __m128i and its bits are reinterpreted as floats.
+    vst1q_f32(ptr, vreinterpretq_f32_s32(a));
+}
+
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b)));  // lanes are compared as unsigned 32-bit values
+}
+
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(a), vreinterpretq_u32_s32(b)));  // lanes are compared as unsigned 32-bit values
+}
+
+FORCE_INLINE __m128 _mm_abs_ps(__m128 a)
+{
+    return vabsq_f32(a);  // lane-wise absolute value
+}
+
+FORCE_INLINE __m128 _mm_madd_ps(__m128 a, __m128 b, __m128 c)
+{
+    return vmlaq_f32(c, a, b);  // lane-wise c + a * b
+}
+
+FORCE_INLINE __m128 _mm_msub_ps(__m128 a, __m128 b, __m128 c)
+{
+    return vmlsq_f32(c, a, b);  // lane-wise c - a * b (note: not a * b - c)
+}
+
+FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
+{
+  return vabsq_s32(a);  // lane-wise absolute value of the signed 32-bit lanes
+}
+#endif  //defined(__aarch64__)
+
+// Returns the number of bits set to 1 in the unsigned 32-bit integer a.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
+FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
+{
+  const uint8x8_t bits_per_byte = vcnt_u8(vcreate_u8((uint64_t)a));  // a zero-extended to 64 bits, one count per byte
+  return (int)vaddlv_u8(bits_per_byte);                              // sum of the eight per-byte counts
+}
+
+// Returns the number of bits set to 1 in the unsigned 64-bit integer a.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
+FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
+{
+  const uint8x8_t bits_per_byte = vcnt_u8(vcreate_u8(a));  // one count per byte of a
+  return (int64_t)vaddlv_u8(bits_per_byte);                // sum of the eight per-byte counts
+}
+
+#endif
diff --git a/thirdparty/embree-aarch64/common/math/affinespace.h b/thirdparty/embree-aarch64/common/math/affinespace.h
new file mode 100644
index 0000000000..32452fbe72
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/affinespace.h
@@ -0,0 +1,361 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "linearspace2.h"
+#include "linearspace3.h"
+#include "quaternion.h"
+#include "bbox.h"
+#include "vec4.h"
+
+namespace embree
+{
+  #define VectorT typename L::Vector
+  #define ScalarT typename L::Vector::Scalar
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Affine Space
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L>
+    struct AffineSpaceT
+    {
+      L l;           /*< linear part of affine space */
+      VectorT p;     /*< affine part of affine space (added after the linear part is applied, see xfmPoint) */
+
+      ////////////////////////////////////////////////////////////////////////////////
+      // Constructors, Assignment, Cast, Copy Operations
+      ////////////////////////////////////////////////////////////////////////////////
+
+      __forceinline AffineSpaceT           ( )                           { }
+      __forceinline AffineSpaceT           ( const AffineSpaceT& other ) { l = other.l; p = other.p; }
+      __forceinline AffineSpaceT           ( const L           & other ) { l = other  ; p = VectorT(zero); }
+      __forceinline AffineSpaceT& operator=( const AffineSpaceT& other ) { l = other.l; p = other.p; return *this; }
+
+      __forceinline AffineSpaceT( const VectorT& vx, const VectorT& vy, const VectorT& vz, const VectorT& p ) : l(vx,vy,vz), p(p) {}
+      __forceinline AffineSpaceT( const L& l, const VectorT& p ) : l(l), p(p) {}
+
+      template<typename L1> __forceinline AffineSpaceT( const AffineSpaceT<L1>& s ) : l(s.l), p(s.p) {}
+
+      ////////////////////////////////////////////////////////////////////////////////
+      // Constants
+      ////////////////////////////////////////////////////////////////////////////////
+
+      __forceinline AffineSpaceT( ZeroTy ) : l(zero), p(zero) {}
+      __forceinline AffineSpaceT( OneTy )  : l(one),  p(zero) {}
+
+      /*! return matrix for scaling; built from L::scale(s) through the L-only constructor, so the affine part is zero */
+      static __forceinline AffineSpaceT scale(const VectorT& s) { return L::scale(s); }
+
+      /*! return matrix for translation: linear part constructed from `one`, affine part p */
+      static __forceinline AffineSpaceT translate(const VectorT& p) { return AffineSpaceT(one,p); }
+
+      /*! return matrix for rotation, only in 2D; the affine part is zero */
+      static __forceinline AffineSpaceT rotate(const ScalarT& r) { return L::rotate(r); }
+
+      /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */
+      static __forceinline AffineSpaceT rotate(const VectorT& u, const ScalarT& r) { return L::rotate(u,r); }
+
+      /*! return matrix for rotation around arbitrary axis and point, only in 3D: translate by -p, rotate around u, translate back by +p */
+      static __forceinline AffineSpaceT rotate(const VectorT& p, const VectorT& u, const ScalarT& r) { return translate(+p) * rotate(u,r) * translate(-p);  }
+
+      /*! return matrix for looking at given point, only in 3D: Z points from eye to point, U and V complete the frame using up, and eye becomes the affine part */
+      static __forceinline AffineSpaceT lookat(const VectorT& eye, const VectorT& point, const VectorT& up) {
+        VectorT Z = normalize(point-eye);
+        VectorT U = normalize(cross(up,Z));
+        VectorT V = normalize(cross(Z,U));
+        return AffineSpaceT(L(U,V,Z),eye);
+      }
+
+    };
+  
+  // template specialization to get correct identity matrix for type AffineSpace3ff: the affine part gets w = 1 instead of all zeros
+  template<>
+    __forceinline AffineSpaceT<LinearSpace3ff>::AffineSpaceT( OneTy )  : l(one),  p(0.f, 0.f, 0.f, 1.f) {}
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline AffineSpaceT<L> operator -( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(-a.l,-a.p); }
+  template<typename L> __forceinline AffineSpaceT<L> operator +( const AffineSpaceT<L>& a ) { return AffineSpaceT<L>(+a.l,+a.p); }
+  template<typename L> __forceinline AffineSpaceT<L>        rcp( const AffineSpaceT<L>& a ) { L il = rcp(a.l); return AffineSpaceT<L>(il,-(il*a.p)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline const AffineSpaceT<L> operator +( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l+b.l,a.p+b.p); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator -( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l-b.l,a.p-b.p); }
+
+  template<typename L> __forceinline const AffineSpaceT<L> operator *( const ScalarT        & a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a*b.l,a*b.p); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator *( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return AffineSpaceT<L>(a.l*b.l,a.l*b.p+a.p); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a * rcp(b); }
+  template<typename L> __forceinline const AffineSpaceT<L> operator /( const AffineSpaceT<L>& a, const ScalarT        & b ) { return rcp(b) * a; } // scalar goes on the left: only operator*(ScalarT, AffineSpaceT) is declared
+
+  template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a * b; }
+  template<typename L> __forceinline AffineSpaceT<L>& operator *=( AffineSpaceT<L>& a, const ScalarT        & b ) { return a = b * a; } // scalar goes on the left: only operator*(ScalarT, AffineSpaceT) is declared
+  template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a = a / b; }
+  template<typename L> __forceinline AffineSpaceT<L>& operator /=( AffineSpaceT<L>& a, const ScalarT        & b ) { return a = rcp(b) * a; } // scale by the reciprocal, with the scalar on the left to match operator*(ScalarT, AffineSpaceT)
+
+  template<typename L> __forceinline VectorT xfmPoint (const AffineSpaceT<L>& m, const VectorT& p) { return madd(VectorT(p.x),m.l.vx,madd(VectorT(p.y),m.l.vy,madd(VectorT(p.z),m.l.vz,m.p))); }
+  template<typename L> __forceinline VectorT xfmVector(const AffineSpaceT<L>& m, const VectorT& v) { return xfmVector(m.l,v); }
+  template<typename L> __forceinline VectorT xfmNormal(const AffineSpaceT<L>& m, const VectorT& n) { return xfmNormal(m.l,n); }
+
+  __forceinline const BBox<Vec3fa> xfmBounds(const AffineSpaceT<LinearSpace3<Vec3fa> >& m, const BBox<Vec3fa>& b)
+  {
+    // Transform all eight corners of b and return the bounds of the results.
+    // Bit 2 of the corner index selects x, bit 1 selects y and bit 0 selects z
+    // (clear = lower, set = upper), so the corners are visited in the order
+    // lll, llu, lul, luu, ull, ulu, uul, uuu.
+    BBox3fa bounds = empty;
+    for (int corner = 0; corner < 8; corner++) {
+      const Vec3fa point((corner & 4) ? b.upper.x : b.lower.x, (corner & 2) ? b.upper.y : b.lower.y, (corner & 1) ? b.upper.z : b.lower.z);
+      bounds.extend(xfmPoint(m,point));
+    }
+    return bounds;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline bool operator ==( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l == b.l && a.p == b.p; }
+  template<typename L> __forceinline bool operator !=( const AffineSpaceT<L>& a, const AffineSpaceT<L>& b ) { return a.l != b.l || a.p != b.p; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> __forceinline AffineSpaceT<L> select ( const typename L::Vector::Scalar::Bool& s, const AffineSpaceT<L>& t, const AffineSpaceT<L>& f ) {
+    return AffineSpaceT<L>(select(s,t.l,f.l),select(s,t.p,f.p));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename L> static embree_ostream operator<<(embree_ostream cout, const AffineSpaceT<L>& m) {
+    return cout << "{ l = " << m.l << ", p = " << m.p << " }";
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Template Instantiations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  typedef AffineSpaceT<LinearSpace2f> AffineSpace2f;
+  typedef AffineSpaceT<LinearSpace3f> AffineSpace3f;
+  typedef AffineSpaceT<LinearSpace3fa> AffineSpace3fa;
+  typedef AffineSpaceT<LinearSpace3fx> AffineSpace3fx;
+  typedef AffineSpaceT<LinearSpace3ff> AffineSpace3ff;
+  typedef AffineSpaceT<Quaternion3f > OrthonormalSpace3f;
+
+  template<int N> using AffineSpace3vf = AffineSpaceT<LinearSpace3<Vec3<vfloat<N>>>>;
+  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<4>>>>  AffineSpace3vf4;
+  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<8>>>>  AffineSpace3vf8;
+  typedef AffineSpaceT<LinearSpace3<Vec3<vfloat<16>>>> AffineSpace3vf16;
+
+  template<int N> using AffineSpace3vff = AffineSpaceT<LinearSpace3<Vec4<vfloat<N>>>>;
+  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<4>>>>  AffineSpace3vfa4;
+  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<8>>>>  AffineSpace3vfa8;
+  typedef AffineSpaceT<LinearSpace3<Vec4<vfloat<16>>>> AffineSpace3vfa16;
+
+  //////////////////////////////////////////////////////////////////////////////
+  /// Interpolation
+  //////////////////////////////////////////////////////////////////////////////
+  template<typename T, typename R>
+  __forceinline AffineSpaceT<T> lerp(const AffineSpaceT<T>& M0,
+                                     const AffineSpaceT<T>& M1,
+                                     const R& t)
+  {
+    return AffineSpaceT<T>(lerp(M0.l,M1.l,t),lerp(M0.p,M1.p,t));
+  }
+
+  // slerp interprets the 16 floats of the matrix M = D * R * S as components of
+  // three matrices (D, R, S) that are interpolated individually.
+  template<typename T> __forceinline AffineSpaceT<LinearSpace3<Vec3<T>>>
+  slerp(const AffineSpaceT<LinearSpace3<Vec4<T>>>& M0,
+        const AffineSpaceT<LinearSpace3<Vec4<T>>>& M1,
+        const T& t)
+  {
+    QuaternionT<T> q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); // the quaternion is read from the four w slots
+    QuaternionT<T> q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
+    QuaternionT<T> q = slerp(q0, q1, t);
+
+    AffineSpaceT<LinearSpace3<Vec3<T>>> S = lerp(M0, M1, t); // all remaining components are interpolated linearly
+    AffineSpaceT<LinearSpace3<Vec3<T>>> D(one);
+    D.p.x = S.l.vx.y; // move the three values packed into S's linear part over to D's affine part ...
+    D.p.y = S.l.vx.z;
+    D.p.z = S.l.vy.z;
+    S.l.vx.y = 0; // ... and only afterwards clear their slots in S
+    S.l.vx.z = 0;
+    S.l.vy.z = 0;
+
+    AffineSpaceT<LinearSpace3<Vec3<T>>> R = LinearSpace3<Vec3<T>>(q);
+    return D * R * S;
+  }
+
+  // this is a specialized version for Vec3fa because that does
+  // not play along nicely with the other templated Vec3/Vec4 types
+  __forceinline AffineSpace3fa slerp(const AffineSpace3ff& M0,
+                                     const AffineSpace3ff& M1,
+                                     const float& t)
+  {
+    Quaternion3f q0(M0.p.w, M0.l.vx.w, M0.l.vy.w, M0.l.vz.w); // the quaternion is read from the four w slots
+    Quaternion3f q1(M1.p.w, M1.l.vx.w, M1.l.vy.w, M1.l.vz.w);
+    Quaternion3f q = slerp(q0, q1, t);
+
+    AffineSpace3fa S = lerp(M0, M1, t); // all remaining components are interpolated linearly
+    AffineSpace3fa D(one);
+    D.p.x = S.l.vx.y; // move the three values packed into S's linear part over to D's affine part ...
+    D.p.y = S.l.vx.z;
+    D.p.z = S.l.vy.z;
+    S.l.vx.y = 0; // ... and only afterwards clear their slots in S
+    S.l.vx.z = 0;
+    S.l.vy.z = 0;
+
+    AffineSpace3fa R = LinearSpace3fa(q);
+    return D * R * S;
+  }
+  
+  __forceinline AffineSpace3fa quaternionDecompositionToAffineSpace(const AffineSpace3ff& qd)
+  {
+    // compute affine transform from quaternion decomposition: the result is D * R * M
+    Quaternion3f q(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); // the quaternion is read from the four w slots
+    AffineSpace3fa M = qd;
+    AffineSpace3fa D(one);
+    D.p.x = M.l.vx.y; // move the three values packed into M's linear part over to D's affine part ...
+    D.p.y = M.l.vx.z;
+    D.p.z = M.l.vy.z;
+    M.l.vx.y = 0; // ... and only afterwards clear their slots in M
+    M.l.vx.z = 0;
+    M.l.vy.z = 0;
+    AffineSpace3fa R = LinearSpace3fa(q);
+    return D * R * M;
+  }
+  
+  __forceinline void quaternionDecomposition(const AffineSpace3ff& qd, Vec3fa& T, Quaternion3f& q, AffineSpace3fa& S)
+  {
+    q = Quaternion3f(qd.p.w, qd.l.vx.w, qd.l.vy.w, qd.l.vz.w); // out: quaternion read from the four w slots
+    S = qd;                                                    // out: copy of qd with the three T slots cleared below
+    T.x = qd.l.vx.y;                                           // out: T is read from vx.y, vx.z and vy.z
+    T.y = qd.l.vx.z;
+    T.z = qd.l.vy.z;
+    S.l.vx.y = 0;
+    S.l.vx.z = 0;
+    S.l.vy.z = 0;
+  }
+
+  __forceinline AffineSpace3fx quaternionDecomposition(Vec3fa const& T, Quaternion3f const& q, AffineSpace3fa const& S)
+  {
+    AffineSpace3ff M = S; // start from S, then pack q and T into the slots the unpacking overload reads them from
+    M.l.vx.w = q.i;
+    M.l.vy.w = q.j;
+    M.l.vz.w = q.k;
+    M.p.w    = q.r;
+    M.l.vx.y = T.x; // overwrites whatever S held in vx.y, vx.z and vy.z
+    M.l.vx.z = T.y;
+    M.l.vy.z = T.z;
+    return M; // NOTE(review): M is AffineSpace3ff but the return type is AffineSpace3fx, so a conversion happens here - confirm this is intended
+  }
+
+  struct __aligned(16) QuaternionDecomposition // 16 named floats; the defaults are unit scale, zero skew/shift/translation and quaternion (1,0,0,0)
+  {
+    float scale_x = 1.f;
+    float scale_y = 1.f;
+    float scale_z = 1.f;
+    float skew_xy = 0.f;
+    float skew_xz = 0.f;
+    float skew_yz = 0.f;
+    float shift_x = 0.f;
+    float shift_y = 0.f;
+    float shift_z = 0.f;
+    float quaternion_r = 1.f; // real part; i, j, k below are the imaginary parts
+    float quaternion_i = 0.f;
+    float quaternion_j = 0.f;
+    float quaternion_k = 0.f;
+    float translation_x = 0.f;
+    float translation_y = 0.f;
+    float translation_z = 0.f;
+  };
+
+  __forceinline QuaternionDecomposition quaternionDecomposition(AffineSpace3ff const& M)
+  {
+    QuaternionDecomposition result;
+    result.scale_x       = M.l.vx.x;  // column vx
+    result.translation_x = M.l.vx.y;
+    result.translation_y = M.l.vx.z;
+    result.quaternion_i  = M.l.vx.w;
+    result.skew_xy       = M.l.vy.x;  // column vy
+    result.scale_y       = M.l.vy.y;
+    result.translation_z = M.l.vy.z;
+    result.quaternion_j  = M.l.vy.w;
+    result.skew_xz       = M.l.vz.x;  // column vz
+    result.skew_yz       = M.l.vz.y;
+    result.scale_z       = M.l.vz.z;
+    result.quaternion_k  = M.l.vz.w;
+    result.shift_x       = M.p.x;     // affine part p
+    result.shift_y       = M.p.y;
+    result.shift_z       = M.p.z;
+    result.quaternion_r  = M.p.w;
+    return result;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /*
+   * ! Template Specialization for 2D: return matrix for rotation by r around point p
+   * (rotation around an arbitrary vector is not meaningful in 2D)
+   */
+  template<> __forceinline
+  AffineSpace2f AffineSpace2f::rotate(const Vec2f& p, const float& r) {
+    return translate(+p)*AffineSpace2f(LinearSpace2f::rotate(r))*translate(-p);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Similarity Transform
+  //
+  // checks whether M is a similarity transformation, i.e. whether there exists a
+  // factor D such that for all x,y: distance(Mx, My) = D * distance(x, y)
+  ////////////////////////////////////////////////////////////////////////////////
+  __forceinline bool similarityTransform(const AffineSpace3fa& M, float* D)
+  {
+    const float tolerance = 1e-5f;
+    if (D) *D = 0.f;  // reported factor when the test fails
+    // the three axes of the linear part must be pairwise orthogonal ...
+    if (abs(dot(M.l.vx, M.l.vy)) > tolerance ||
+        abs(dot(M.l.vx, M.l.vz)) > tolerance ||
+        abs(dot(M.l.vy, M.l.vz)) > tolerance)
+      return false;
+
+    // ... and must all have the same squared length
+    const float len2_x = dot(M.l.vx, M.l.vx);
+    const float len2_y = dot(M.l.vy, M.l.vy);
+    const float len2_z = dot(M.l.vz, M.l.vz);
+    if (abs(len2_x - len2_y) > tolerance || abs(len2_x - len2_z) > tolerance || abs(len2_y - len2_z) > tolerance) return false;
+
+    if (D) *D = sqrtf(len2_x);
+    return true;
+  }
+
+  __forceinline void AffineSpace3fa_store_unaligned(const AffineSpace3fa &source, AffineSpace3fa* ptr)
+  {
+    Vec3fa::storeu(&ptr->l.vx, source.l.vx); // each of the four vectors is written through Vec3fa::storeu
+    Vec3fa::storeu(&ptr->l.vy, source.l.vy);
+    Vec3fa::storeu(&ptr->l.vz, source.l.vz);
+    Vec3fa::storeu(&ptr->p, source.p);
+  }
+
+  __forceinline AffineSpace3fa AffineSpace3fa_load_unaligned(AffineSpace3fa* ptr)
+  {
+    AffineSpace3fa space;
+    space.l.vx = Vec3fa::loadu(&ptr->l.vx); // each of the four vectors is read through Vec3fa::loadu
+    space.l.vy = Vec3fa::loadu(&ptr->l.vy);
+    space.l.vz = Vec3fa::loadu(&ptr->l.vz);
+    space.p    = Vec3fa::loadu(&ptr->p);
+    return space;
+  }
+
+  #undef VectorT
+  #undef ScalarT
+}
diff --git a/thirdparty/embree-aarch64/common/math/bbox.h b/thirdparty/embree-aarch64/common/math/bbox.h
new file mode 100644
index 0000000000..29bb13912b
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/bbox.h
@@ -0,0 +1,331 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+#include "vec3.h"
+
+namespace embree
+{
+  namespace internal {
+    /* halving helper used by the center() functions below */
+    template <typename T> __forceinline T divideByTwo(const T& v) { return v / T(2); } // generic case: division by T(2)
+    template <> __forceinline float divideByTwo<float>(const float& v) { return v * 0.5f; } // float: multiplies by 0.5f instead of dividing
+    template <> __forceinline double divideByTwo<double>(const double& v) { return v * 0.5; } // double: multiplies by 0.5 instead of dividing
+
+  } // namespace internal
+  template<typename T>
+  struct BBox // bounding box described by a lower and an upper bound of type T
+  {
+    T lower, upper;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline BBox           ( )                   { } // assigns nothing to lower/upper
+    template<typename T1> // converting constructor: builds both bounds from a box with another bound type
+    __forceinline BBox           ( const BBox<T1>& other ) : lower(other.lower), upper(other.upper) {}
+    __forceinline BBox& operator=( const BBox& other ) { lower = other.lower; upper = other.upper; return *this; }
+
+    __forceinline BBox ( const T& v                     ) : lower(v), upper(v) {} // box whose two bounds are both v
+    __forceinline BBox ( const T& lower, const T& upper ) : lower(lower), upper(upper) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Extending Bounds
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const BBox& extend(const BBox& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; } // in place: min of lowers, max of uppers
+    __forceinline const BBox& extend(const T   & other) { lower = min(lower,other      ); upper = max(upper,other      ); return *this; } // same with a single value
+
+    /*! tests if box is empty, i.e. lower[i] > upper[i] for at least one of the T::N components */
+    __forceinline bool empty() const { for (int i=0; i<T::N; i++) if (lower[i] > upper[i]) return true; return false; }
+
+    /*! computes the size of the box (upper - lower) */
+    __forceinline T size() const { return upper - lower; }
+
+    /*! computes the center of the box as half of lower+upper */
+    __forceinline T center() const { return internal::divideByTwo<T>(lower+upper); }
+
+    /*! computes twice the center of the box (lower+upper without the halving) */
+    __forceinline T center2() const { return lower+upper; }
+
+    /*! merges two boxes into a new one (min of the lower bounds, max of the upper bounds) */
+    __forceinline static const BBox merge (const BBox& a, const BBox& b) {
+      return BBox(min(a.lower, b.lower), max(a.upper, b.upper));
+    }
+
+    /*! enlarge box by some scaling factor: each bound moves outward by a times its own absolute value */
+    __forceinline BBox enlarge_by(const float a) const {
+      return BBox(lower - T(a)*abs(lower), upper + T(a)*abs(upper));
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (construction from tag types)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline BBox( EmptyTy ) : lower(pos_inf), upper(neg_inf) {} // inverted bounds: lower = +inf, upper = -inf
+    __forceinline BBox( FullTy  ) : lower(neg_inf), upper(pos_inf) {} // lower = -inf, upper = +inf
+    __forceinline BBox( FalseTy ) : lower(pos_inf), upper(neg_inf) {} // same bounds as EmptyTy
+    __forceinline BBox( TrueTy  ) : lower(neg_inf), upper(pos_inf) {} // same bounds as FullTy
+    __forceinline BBox( NegInfTy ): lower(pos_inf), upper(neg_inf) {} // same bounds as EmptyTy
+    __forceinline BBox( PosInfTy ): lower(neg_inf), upper(pos_inf) {} // same bounds as FullTy
+  };
+
+  template<> __forceinline bool BBox<float>::empty() const { // scalar bounds: a single comparison replaces the component loop
+    return lower > upper;
+  }
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+  template<> __forceinline bool BBox<Vec3fa>::empty() const { // mask-based test: empty unless le_mask(lower,upper) passes all()
+    return !all(le_mask(lower,upper)); // NOTE(review): if le_mask is an ordered compare, NaN bounds are empty here but not in the generic loop - confirm
+  }
+  template<> __forceinline bool BBox<Vec3fx>::empty() const { // same test for Vec3fx bounds
+    return !all(le_mask(lower,upper));
+  }
+#endif
+
+  /*! tests if box is finite: lower must be above -FLT_LARGE and upper below +FLT_LARGE (lower has no upper limit, upper no lower limit) */
+  __forceinline bool isvalid( const BBox<Vec3fa>& v ) {
+    return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)));
+  }
+
+  /*! tests if box is finite and non-empty (same range test as isvalid plus lower <= upper) */
+  __forceinline bool isvalid_non_empty( const BBox<Vec3fa>& v ) {
+    return all(gt_mask(v.lower,Vec3fa_t(-FLT_LARGE)) & lt_mask(v.upper,Vec3fa_t(+FLT_LARGE)) & le_mask(v.lower,v.upper));
+  }
+  
+  /*! tests if box has finite entries; both bounds must pass is_finite */
+  __forceinline bool is_finite( const BBox<Vec3fa>& b) {
+    return is_finite(b.lower) && is_finite(b.upper);
+  }
+
+  /*! test if point contained in box; both comparisons are inclusive, so points on the bounds count as inside */
+  __forceinline bool inside ( const BBox<Vec3fa>& b, const Vec3fa& p ) { return all(ge_mask(p,b.lower) & le_mask(p,b.upper)); }
+
+  /*! computes the center of the box; center2 returns lower+upper, i.e. twice the center */
+  template<typename T> __forceinline const T center2(const BBox<T>& box) { return box.lower + box.upper; }
+  template<typename T> __forceinline const T center (const BBox<T>& box) { return internal::divideByTwo<T>(center2(box)); }
+
+  /*! computes the volume of a bounding box via reduce_mul of its size; safeVolume returns 0 for an empty box */
+  __forceinline float volume    ( const BBox<Vec3fa>& b ) { return reduce_mul(b.size()); }
+  __forceinline float safeVolume( const BBox<Vec3fa>& b ) { if (b.empty()) return 0.0f; else return volume(b); }
+
+  /*! computes the volume of a bounding box (Vec3f bounds) */
+  __forceinline float volume( const BBox<Vec3f>& b )  { return reduce_mul(b.size()); }
+
+  /*! computes the surface area of a bounding box; for 2D bounds this is size.x * size.y */
+  template<typename T> __forceinline const T area( const BBox<Vec2<T> >& b ) { const Vec2<T> d = b.size(); return d.x*d.y; }
+
+  template<typename T> __forceinline const T halfArea( const BBox<Vec3<T> >& b ) { return halfArea(b.size()); } // forwards to halfArea of the size vector
+  template<typename T> __forceinline const T     area( const BBox<Vec3<T> >& b ) { return T(2)*halfArea(b); }   // twice the half area
+
+  __forceinline float halfArea( const BBox<Vec3fa>& b ) { return halfArea(b.size()); } // Vec3fa overloads
+  __forceinline float     area( const BBox<Vec3fa>& b ) { return 2.0f*halfArea(b); }
+
+  __forceinline float halfArea( const BBox<Vec3fx>& b ) { return halfArea(b.size()); } // Vec3fx overloads
+  __forceinline float     area( const BBox<Vec3fx>& b ) { return 2.0f*halfArea(b); }
+
+  template<typename Vec> __forceinline float safeArea( const BBox<Vec>& b ) { if (b.empty()) return 0.0f; else return area(b); } // 0 for an empty box
+
+  template<typename T> __forceinline float expectedApproxHalfArea(const BBox<T>& box) { // for a plain BBox this is exactly halfArea(box)
+    return halfArea(box);
+  }
+
+  /*! merges bounding boxes and points: result takes the min of the lower operands and the max of the upper operands */
+  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const       T& b ) { return BBox<T>(min(a.lower, b    ), max(a.upper, b    )); }
+  template<typename T> __forceinline const BBox<T> merge( const       T& a, const BBox<T>& b ) { return BBox<T>(min(a    , b.lower), max(a    , b.upper)); }
+  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(min(a.lower, b.lower), max(a.upper, b.upper)); }
+
+  /*! Merges three boxes as merge(a, merge(b,c)). */
+  template<typename T> __forceinline const BBox<T> merge( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return merge(a,merge(b,c)); }
+
+  /*! Merges four boxes pairwise: (a,b) and (c,d) first, then the two results. */
+  template<typename T> __forceinline BBox<T> merge(const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d) {
+    return merge(merge(a,b),merge(c,d));
+  }
+
+  /*! Comparison Operators: boxes are equal when both bounds are equal, and differ when either bound differs */
+  template<typename T> __forceinline bool operator==( const BBox<T>& a, const BBox<T>& b ) { return a.lower == b.lower && a.upper == b.upper; }
+  template<typename T> __forceinline bool operator!=( const BBox<T>& a, const BBox<T>& b ) { return a.lower != b.lower || a.upper != b.upper; }
+
+  /*! scaling: multiplies both bounds by a; the bounds are not re-sorted afterwards */
+  template<typename T> __forceinline BBox<T> operator *( const float& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
+  template<typename T> __forceinline BBox<T> operator *( const     T& a, const BBox<T>& b ) { return BBox<T>(a*b.lower,a*b.upper); }
+
+  /*! translations: box-box variants combine lower with lower and upper with upper; box-value variants shift both bounds by b */
+  template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower+b.lower,a.upper+b.upper); }
+  template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(a.lower-b.lower,a.upper-b.upper); }
+  template<typename T> __forceinline BBox<T> operator +( const BBox<T>& a, const      T & b ) { return BBox<T>(a.lower+b      ,a.upper+b      ); }
+  template<typename T> __forceinline BBox<T> operator -( const BBox<T>& a, const      T & b ) { return BBox<T>(a.lower-b      ,a.upper-b      ); }
+
+  /*! extension: lower is decreased by b and upper is increased by b */
+  template<typename T> __forceinline BBox<T> enlarge(const BBox<T>& a, const T& b) { return BBox<T>(a.lower-b, a.upper+b); }
+
+  /*! intersect bounding boxes: max of the lower bounds, min of the upper bounds (the result can have lower > upper) */
+  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b ) { return BBox<T>(max(a.lower, b.lower), min(a.upper, b.upper)); }
+  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c ) { return intersect(a,intersect(b,c)); }
+  template<typename T> __forceinline const BBox<T> intersect( const BBox<T>& a, const BBox<T>& b, const BBox<T>& c, const BBox<T>& d ) { return intersect(intersect(a,b),intersect(c,d)); }
+
+  /*! subtract bounds from each other: writes c = [a.lower, min(a.upper,b.lower)] and d = [max(a.lower,b.upper), a.upper] */
+  template<typename T> __forceinline void subtract(const BBox<T>& a, const BBox<T>& b, BBox<T>& c, BBox<T>& d)
+  {
+    c.lower = a.lower; // a and b are re-read after each write, so the statement order matters if c or d alias them
+    c.upper = min(a.upper,b.lower);
+    d.lower = max(a.lower,b.upper);
+    d.upper = a.upper;
+  }
+
+  /*! tests if bounding boxes (and points) are disjoint (empty intersection); a point is wrapped into BBox<T>(point) first */
+  template<typename T> __inline bool disjoint( const BBox<T>& a, const BBox<T>& b ) { return intersect(a,b).empty(); }
+  template<typename T> __inline bool disjoint( const BBox<T>& a, const       T& b ) { return disjoint(a,BBox<T>(b)); }
+  template<typename T> __inline bool disjoint( const       T& a, const BBox<T>& b ) { return disjoint(BBox<T>(a),b); }
+
+  /*! tests if bounding boxes (and points) are conjoint (non-empty intersection); a point is wrapped into BBox<T>(point) first */
+  template<typename T> __inline bool conjoint( const BBox<T>& a, const BBox<T>& b ) { return !intersect(a,b).empty(); }
+  template<typename T> __inline bool conjoint( const BBox<T>& a, const       T& b ) { return conjoint(a,BBox<T>(b)); }
+  template<typename T> __inline bool conjoint( const       T& a, const BBox<T>& b ) { return conjoint(BBox<T>(a),b); }
+
+  /*! subset relation: true if a does not extend beyond b in any of the T::N components (equal bounds are accepted) */
+  template<typename T> __inline bool subset( const BBox<T>& a, const BBox<T>& b )
+  {
+    for ( size_t dim = 0; dim < T::N; ++dim )
+      if ( a.lower[dim] < b.lower[dim] || a.upper[dim] > b.upper[dim] ) return false; // a sticks out of b along dim
+    return true;
+  }
+
+  template<> __inline bool subset( const BBox<Vec3fa>& a, const BBox<Vec3fa>& b ) { // mask-based variant for Vec3fa bounds
+    return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
+  }
+
+  template<> __inline bool subset( const BBox<Vec3fx>& a, const BBox<Vec3fx>& b ) { // mask-based variant for Vec3fx bounds
+    return all(ge_mask(a.lower,b.lower)) & all(le_mask(a.upper,b.upper));
+  }
+  
+  /*! blending: interpolates the lower bounds and the upper bounds separately with the same parameter t */
+  template<typename T>
+    __forceinline BBox<T> lerp(const BBox<T>& b0, const BBox<T>& b1, const float t) {
+    return BBox<T>(lerp(b0.lower,b1.lower,t),lerp(b0.upper,b1.upper,t));
+  }
+
+  /*! output operator: prints the box as "[lower; upper]" */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const BBox<T>& box) {
+    return cout << "[" << box.lower << "; " << box.upper << "]";
+  }
+
+  /*! default template instantiations */
+  typedef BBox<float> BBox1f;
+  typedef BBox<Vec2f> BBox2f;
+  typedef BBox<Vec2fa> BBox2fa;
+  typedef BBox<Vec3f> BBox3f;
+  typedef BBox<Vec3fa> BBox3fa;
+  typedef BBox<Vec3fx> BBox3fx;
+  typedef BBox<Vec3ff> BBox3ff;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / AVX-512 specializations (the SSE header is also used for ARM NEON)
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__SSE__) || defined(__ARM_NEON)
+#include "../simd/sse.h"
+#endif
+
+#if defined (__AVX__)
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  template<int N> // declaration only; the specializations for N=4 and N=8 follow
+    __forceinline BBox<Vec3<vfloat<N>>> transpose(const BBox3fa* bounds);
+  
+  template<>
+    __forceinline BBox<Vec3<vfloat4>> transpose<4>(const BBox3fa* bounds) // reads bounds[0..3]
+  {
+    BBox<Vec3<vfloat4>> dest;
+    
+    transpose((vfloat4&)bounds[0].lower, // each bound is reinterpreted as a vfloat4 (the cast also drops const)
+              (vfloat4&)bounds[1].lower,
+              (vfloat4&)bounds[2].lower,
+              (vfloat4&)bounds[3].lower,
+              dest.lower.x,              // outputs: presumably one vector per coordinate holding that coordinate of all four boxes
+              dest.lower.y,
+              dest.lower.z);
+    
+    transpose((vfloat4&)bounds[0].upper, // same for the upper bounds
+              (vfloat4&)bounds[1].upper,
+              (vfloat4&)bounds[2].upper,
+              (vfloat4&)bounds[3].upper,
+              dest.upper.x,
+              dest.upper.y,
+              dest.upper.z);
+    
+    return dest;
+  }
+  
+#if defined(__AVX__)
+  template<>
+    __forceinline BBox<Vec3<vfloat8>> transpose<8>(const BBox3fa* bounds) // reads bounds[0..7]
+  {
+    BBox<Vec3<vfloat8>> dest;
+    
+    transpose((vfloat4&)bounds[0].lower, // each bound is reinterpreted as a vfloat4 (the cast also drops const)
+              (vfloat4&)bounds[1].lower,
+              (vfloat4&)bounds[2].lower,
+              (vfloat4&)bounds[3].lower,
+              (vfloat4&)bounds[4].lower,
+              (vfloat4&)bounds[5].lower,
+              (vfloat4&)bounds[6].lower,
+              (vfloat4&)bounds[7].lower,
+              dest.lower.x,              // outputs: presumably one vfloat8 per coordinate holding that coordinate of all eight boxes
+              dest.lower.y,
+              dest.lower.z);
+    
+    transpose((vfloat4&)bounds[0].upper, // same for the upper bounds
+              (vfloat4&)bounds[1].upper,
+              (vfloat4&)bounds[2].upper,
+              (vfloat4&)bounds[3].upper,
+              (vfloat4&)bounds[4].upper,
+              (vfloat4&)bounds[5].upper,
+              (vfloat4&)bounds[6].upper,
+              (vfloat4&)bounds[7].upper,
+              dest.upper.x,
+              dest.upper.y,
+              dest.upper.z);
+    
+    return dest;
+  }
+#endif
+  
+  template<int N> // declaration only; merges the N boxes bounds[0..N-1], see the specializations
+    __forceinline BBox3fa merge(const BBox3fa* bounds);
+  
+  template<>
+    __forceinline BBox3fa merge<4>(const BBox3fa* bounds)
+  {
+    /* pairwise reduction: boxes (0,1) and (2,3) first, then the two partial results */
+    const BBox3fa& b0 = bounds[0]; const BBox3fa& b1 = bounds[1];
+    const BBox3fa& b2 = bounds[2]; const BBox3fa& b3 = bounds[3];
+    return BBox3fa(min(min(b0.lower,b1.lower),min(b2.lower,b3.lower)),
+                   max(max(b0.upper,b1.upper),max(b2.upper,b3.upper)));
+  }
+  
+#if defined(__AVX__)
+  template<>
+    __forceinline BBox3fa merge<8>(const BBox3fa* bounds)
+  {
+    /* reduce each half with the 4-box merge, then combine the two partial
+       boxes; this is the same min/max tree as merging all eight directly */
+    const BBox3fa front = merge<4>(bounds);
+    const BBox3fa back  = merge<4>(bounds+4);
+    return BBox3fa(min(front.lower,back.lower),max(front.upper,back.upper));
+  }
+#endif
+}
+
diff --git a/thirdparty/embree-aarch64/common/math/col3.h b/thirdparty/embree-aarch64/common/math/col3.h
new file mode 100644
index 0000000000..f52015fb88
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/col3.h
@@ -0,0 +1,47 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// RGB Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Col3 // three channels r, g, b of type T
+  {
+    T r, g, b;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col3           ( )                   { } // assigns nothing to the channels
+    __forceinline Col3           ( const Col3& other ) { r = other.r; g = other.g; b = other.b; } // channels are assigned in the body, not in an initializer list
+    __forceinline Col3& operator=( const Col3& other ) { r = other.r; g = other.g; b = other.b; return *this; }
+
+    __forceinline explicit Col3 (const T& v)                         : r(v), g(v), b(v) {} // same value in every channel
+    __forceinline          Col3 (const T& r, const T& g, const T& b) : r(r), g(g), b(b) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (construction from tag types, every channel gets the same constant)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col3 (ZeroTy)   : r(zero)   , g(zero)   , b(zero)    {}
+    __forceinline Col3 (OneTy)    : r(one)    , g(one)    , b(one)     {}
+    __forceinline Col3 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf) {}
+    __forceinline Col3 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf) {}
+  };
+
+  /*! output operator: prints the color as "(r, g, b)" */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col3<T>& a) {
+    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
+  }
+
+  /*! default template instantiations: 8-bit and float channels */
+  typedef Col3<uint8_t      > Col3uc;
+  typedef Col3<float        > Col3f;
+}
diff --git a/thirdparty/embree-aarch64/common/math/col4.h b/thirdparty/embree-aarch64/common/math/col4.h
new file mode 100644
index 0000000000..90df293f8e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/col4.h
@@ -0,0 +1,47 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// RGBA Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Col4 // four channels r, g, b, a of type T
+  {
+    T r, g, b, a;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col4           ( )                   { } // assigns nothing to the channels
+    __forceinline Col4           ( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; } // channels are assigned in the body, not in an initializer list
+    __forceinline Col4& operator=( const Col4& other ) { r = other.r; g = other.g; b = other.b; a = other.a; return *this; }
+
+    __forceinline explicit Col4 (const T& v) : r(v), g(v), b(v), a(v) {} // same value in every channel, including a
+    __forceinline          Col4 (const T& r, const T& g, const T& b, const T& a) : r(r), g(g), b(b), a(a) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (construction from tag types, every channel gets the same constant)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Col4 (ZeroTy)   : r(zero)   , g(zero)   , b(zero)   , a(zero) {}
+    __forceinline Col4 (OneTy)    : r(one)    , g(one)    , b(one)    , a(one) {}
+    __forceinline Col4 (PosInfTy) : r(pos_inf), g(pos_inf), b(pos_inf), a(pos_inf) {}
+    __forceinline Col4 (NegInfTy) : r(neg_inf), g(neg_inf), b(neg_inf), a(neg_inf) {}
+  };
+
+  /*! output operator: prints the color as "(r, g, b, a)" */
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Col4<T>& a) {
+    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ", " << a.a << ")";
+  }
+
+  /*! default template instantiations: 8-bit and float channels */
+  typedef Col4<uint8_t      > Col4uc;
+  typedef Col4<float        > Col4f;
+}
diff --git a/thirdparty/embree-aarch64/common/math/color.h b/thirdparty/embree-aarch64/common/math/color.h
new file mode 100644
index 0000000000..c3083e4fc0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/color.h
@@ -0,0 +1,257 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "constants.h"
+#include "col3.h"
+#include "col4.h"
+
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE RGBA Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct Color4 // RGBA color stored in one __m128
+  {
+    union { // the four floats r,g,b,a overlay the lanes of m128
+      __m128 m128;
+      struct { float r,g,b,a; };
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color4 () {} // assigns nothing to m128
+    __forceinline Color4 ( const __m128 a ) : m128(a) {}
+
+    __forceinline explicit Color4 (const float v) : m128(_mm_set1_ps(v)) {} // v in all four lanes
+    __forceinline          Color4 (const float r, const float g, const float b, const float a) : m128(_mm_set_ps(a,b,g,r)) {} // _mm_set_ps takes its arguments in reverse channel order
+
+    __forceinline explicit Color4 ( const Col3uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(255.0f,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } // 8-bit RGB scaled by 1/255; the constant 255 makes alpha 1
+    __forceinline explicit Color4 ( const Col3f&  other ) { m128 = _mm_set_ps(1.0f,other.b,other.g,other.r); } // alpha is set to 1.0f
+    __forceinline explicit Color4 ( const Col4uc& other ) { m128 = _mm_mul_ps(_mm_set_ps(other.a,other.b,other.g,other.r),_mm_set1_ps(one_over_255)); } // all four 8-bit channels scaled by 1/255
+    __forceinline explicit Color4 ( const Col4f&  other ) { m128 = _mm_set_ps(other.a,other.b,other.g,other.r); }
+
+    __forceinline Color4           ( const Color4& other ) : m128(other.m128) {}
+    __forceinline Color4& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+    __forceinline operator const __m128&() const { return m128; } // implicit access to the underlying register value
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Set (writes this color into the given destination struct)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; } // alpha is dropped
+    __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = a; }
+    __forceinline void set(Col3uc& d) const 
+    {
+      vfloat4 s = clamp(vfloat4(m128))*255.0f; // clamp with its default range (presumably [0,1]), then scale to 8-bit range
+      d.r = (uint8_t)(s[0]); // the casts discard the fractional part
+      d.g = (uint8_t)(s[1]); 
+      d.b = (uint8_t)(s[2]); 
+    }
+    __forceinline void set(Col4uc& d) const 
+    {
+      vfloat4 s = clamp(vfloat4(m128))*255.0f; // same conversion as above, including alpha
+      d.r = (uint8_t)(s[0]); 
+      d.g = (uint8_t)(s[1]); 
+      d.b = (uint8_t)(s[2]); 
+      d.a = (uint8_t)(s[3]); 
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (construction from tag types, all four lanes get the same value)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color4( ZeroTy   ) : m128(_mm_set1_ps(0.0f)) {}
+    __forceinline Color4( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Color4( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Color4( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE RGB Color Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct Color // RGB color stored in one __m128; the fourth lane has no named member
+  {
+    union { // the three floats r,g,b overlay the first lanes of m128
+      __m128 m128;
+      struct { float r,g,b; };
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color () {} // assigns nothing to m128
+    __forceinline Color ( const __m128 a ) : m128(a) {}
+
+    __forceinline explicit Color  (const float v)                               : m128(_mm_set1_ps(v)) {} // v in all four lanes
+    __forceinline          Color  (const float r, const float g, const float b) : m128(_mm_set_ps(0.0f,b,g,r)) {} // fourth lane is set to 0.0f
+
+    __forceinline Color           ( const Color& other ) : m128(other.m128) {}
+    __forceinline Color& operator=( const Color& other ) { m128 = other.m128; return *this; }
+
+    __forceinline Color           ( const Color4& other ) : m128(other.m128) {} // copies the whole register, alpha lane included
+    __forceinline Color& operator=( const Color4& other ) { m128 = other.m128; return *this; }
+
+    __forceinline operator const __m128&() const { return m128; } // implicit access to the underlying register value
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Set (writes this color into the given destination struct)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline void set(Col3f& d) const { d.r = r; d.g = g; d.b = b; }
+    __forceinline void set(Col4f& d) const { d.r = r; d.g = g; d.b = b; d.a = 1.0f; } // alpha is fixed to 1.0f
+    __forceinline void set(Col3uc& d) const 
+    { 
+      vfloat4 s = clamp(vfloat4(m128))*255.0f; // clamp with its default range (presumably [0,1]), then scale to 8-bit range
+      d.r = (uint8_t)(s[0]); // the casts discard the fractional part
+      d.g = (uint8_t)(s[1]); 
+      d.b = (uint8_t)(s[2]); 
+    }
+    __forceinline void set(Col4uc& d) const 
+    { 
+      vfloat4 s = clamp(vfloat4(m128))*255.0f; // same conversion as above
+      d.r = (uint8_t)(s[0]); 
+      d.g = (uint8_t)(s[1]); 
+      d.b = (uint8_t)(s[2]); 
+      d.a = 255; // alpha is fixed to the maximum 8-bit value
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (construction from tag types, all four lanes get the same value)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Color( ZeroTy   ) : m128(_mm_set1_ps(0.0f)) {}
+    __forceinline Color( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Color( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Color( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator +( const Color& a ) { return a; }
+  __forceinline const Color operator -( const Color& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); // sign bit of every lane
+    return _mm_xor_ps(a.m128, mask); // flipping the sign bit negates
+  }
+  __forceinline const Color abs  ( const Color& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); // everything except the sign bit
+    return _mm_and_ps(a.m128, mask); // clearing the sign bit gives the absolute value
+  }
+  __forceinline const Color rcp  ( const Color& a ) // approximate reciprocal: hardware estimate plus refinement
+  {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    __m128 reciprocal = _mm_rcp_ps(a.m128); // initial estimate
+    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); // two refinement steps built on the NEON vrecpsq_f32 step
+    reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
+    return (const Color)reciprocal;
+#else
+#if defined(__AVX512VL__)
+    const Color r = _mm_rcp14_ps(a.m128);
+#else
+    const Color r = _mm_rcp_ps(a.m128);
+#endif
+    return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); // one refinement step: 2*r - r*r*a
+#endif  //defined(__aarch64__) && defined(BUILD_IOS)
+  }
+  __forceinline const Color rsqrt( const Color& a ) // approximate reciprocal square root: hardware estimate plus refinement
+  {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    __m128 r = _mm_rsqrt_ps(a.m128); // initial estimate
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); // two refinement steps built on the NEON vrsqrtsq_f32 step
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
+    return r;
+#else
+      
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128);
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128);
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); // one refinement step: 1.5*r - 0.5*a*r*r*r
+      
+#endif  //defined(__aarch64__) && defined(BUILD_IOS)
+  }
+  __forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator +( const Color& a, const Color& b ) { return _mm_add_ps(a.m128, b.m128); } // lane-wise on the whole register
+  __forceinline const Color operator -( const Color& a, const Color& b ) { return _mm_sub_ps(a.m128, b.m128); }
+  __forceinline const Color operator *( const Color& a, const Color& b ) { return _mm_mul_ps(a.m128, b.m128); }
+  __forceinline const Color operator *( const Color& a, const float  b ) { return a * Color(b); } // scalar is broadcast to a Color first
+  __forceinline const Color operator *( const float  a, const Color& b ) { return Color(a) * b; }
+  __forceinline const Color operator /( const Color& a, const Color& b ) { return a * rcp(b); } // multiplies by the approximate reciprocal from rcp(Color)
+  __forceinline const Color operator /( const Color& a, const float  b ) { return a * rcp(b); } // rcp is applied to the float b, then Color * float
+
+  __forceinline const Color min( const Color& a, const Color& b ) { return _mm_min_ps(a.m128,b.m128); } // lane-wise minimum
+  __forceinline const Color max( const Color& a, const Color& b ) { return _mm_max_ps(a.m128,b.m128); } // lane-wise maximum
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color operator+=(Color& a, const Color& b) { return a = a + b; } // all of these update a and return a copy of the new value, not a reference
+  __forceinline const Color operator-=(Color& a, const Color& b) { return a = a - b; }
+  __forceinline const Color operator*=(Color& a, const Color& b) { return a = a * b; }
+  __forceinline const Color operator/=(Color& a, const Color& b) { return a = a / b; }
+  __forceinline const Color operator*=(Color& a, const float b      ) { return a = a * b; }
+  __forceinline const Color operator/=(Color& a, const float b      ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Color& v) { return v.r+v.g+v.b; } // all reductions use r, g, b only; the fourth lane is not read
+  __forceinline float reduce_mul(const Color& v) { return v.r*v.g*v.b; }
+  __forceinline float reduce_min(const Color& v) { return min(v.r,v.g,v.b); }
+  __forceinline float reduce_max(const Color& v) { return max(v.r,v.g,v.b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Color& a, const Color& b ) { const int eq = _mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)); return (eq & 7) == 7; } // only mask bits 0..2 (r,g,b) are tested
+  __forceinline bool operator !=( const Color& a, const Color& b ) { const int ne = _mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)); return (ne & 7) != 0; } // true if any of r,g,b differs
+  __forceinline bool operator < ( const Color& a, const Color& b ) {
+    /* lexicographic order on (r,g,b): the first channel that differs decides */
+    return (a.r != b.r) ? (a.r < b.r)
+         : (a.g != b.g) ? (a.g < b.g)
+         : (a.b != b.b) ? (a.b < b.b) : false;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline const Color select( bool s, const Color& t, const Color& f ) { // the bool is widened to a full-register mask
+    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); // 0 == 0 compare yields all bits set; otherwise all bits clear
+    return blendv_ps(f, t, mask); // presumably picks t where the mask is set and f elsewhere
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Special Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! computes luminance of a color as a weighted sum of r, g, b (weights 0.212671, 0.715160, 0.072169) */
+  __forceinline float luminance (const Color& a) { return madd(0.212671f,a.r,madd(0.715160f,a.g,0.072169f*a.b)); }
+
+  /*! output operator: prints the color as "(r, g, b)" */
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Color& a) {
+    return cout << "(" << a.r << ", " << a.g << ", " << a.b << ")";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/math/constants.cpp b/thirdparty/embree-aarch64/common/math/constants.cpp
new file mode 100644
index 0000000000..eeff131664
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/constants.cpp
@@ -0,0 +1,61 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#if defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
+#include "constants.h"
+
+namespace embree
+{
+  TrueTy True; // definitions of the global tag objects (types come from constants.h)
+  FalseTy False;
+  ZeroTy zero;
+  OneTy one;
+  NegInfTy neg_inf;
+  PosInfTy inf; // inf and pos_inf are two objects of the same tag type
+  PosInfTy pos_inf;
+  NaNTy nan;
+  UlpTy ulp;
+  PiTy pi;
+  OneOverPiTy one_over_pi;
+  TwoPiTy two_pi;
+  OneOverTwoPiTy one_over_two_pi;
+  FourPiTy four_pi;
+  OneOverFourPiTy one_over_four_pi;
+  StepTy step;
+  ReverseStepTy reverse_step;
+  EmptyTy empty;
+  UndefinedTy undefined;
+
+#if defined(__aarch64__)
+const uint32x4_t movemask_mask = { 1, 2, 4, 8 }; // one distinct bit per lane; presumably used to emulate movemask
+const uint32x4_t vzero = { 0, 0, 0, 0 };
+const uint32x4_t v0x80000000 = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; // sign bit in every lane
+const uint32x4_t v0x7fffffff = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; // all bits but the sign bit in every lane
+const uint32x4_t v000F = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }; // lane masks: the name lists lanes 0..3 in order, F = all bits set, 0 = clear
+const uint32x4_t v00F0 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 };
+const uint32x4_t v00FF = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
+const uint32x4_t v0F00 = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 };
+const uint32x4_t v0F0F = { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF };
+const uint32x4_t v0FF0 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
+const uint32x4_t v0FFF = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+const uint32x4_t vF000 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
+const uint32x4_t vF00F = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF };
+const uint32x4_t vF0F0 = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 };
+const uint32x4_t vF0FF = { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
+const uint32x4_t vFF00 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
+const uint32x4_t vFF0F = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF };
+const uint32x4_t vFFF0 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
+const uint32x4_t vFFFF = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+const uint8x16_t v0022 = {0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11}; // byte indices of 32-bit lanes 0,0,2,2; presumably shuffle tables
+const uint8x16_t v1133 = {4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15}; // byte indices of lanes 1,1,3,3
+const uint8x16_t v0101 = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7}; // byte indices of lanes 0,1,0,1
+const float32x4_t vOne = { 1.0f, 1.0f, 1.0f, 1.0f };
+const float32x4_t vmOne = { -1.0f, -1.0f, -1.0f, -1.0f };
+const float32x4_t vInf = { INFINITY, INFINITY, INFINITY, INFINITY };
+const float32x4_t vmInf = { -INFINITY, -INFINITY, -INFINITY, -INFINITY };
+#endif
+
+}
diff --git a/thirdparty/embree-aarch64/common/math/constants.h b/thirdparty/embree-aarch64/common/math/constants.h
new file mode 100644
index 0000000000..e80abec80f
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/constants.h
@@ -0,0 +1,239 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+
+#include <limits>
+
+#define _USE_MATH_DEFINES
+#include <math.h> // using cmath causes issues under Windows
+#include <cfloat>
+#include <climits>
+
+// Math constants may not be defined in libcxx + mingw + strict C++ standard
+#if defined(__MINGW32__)
+
+// TODO(LTE): use constexpr
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+#ifndef M_1_PI
+#define M_1_PI 0.31830988618379067154
+#endif
+
+#endif // __MINGW32__
+
+namespace embree
+{
+  static MAYBE_UNUSED const float one_over_255 = 1.0f/255.0f;  // reciprocal of 255
+  static MAYBE_UNUSED const float min_rcp_input = 1E-18f;  // for abs(x) >= min_rcp_input the newton raphson rcp calculation does not fail
+  // NOTE(review): FLT_LARGE below is static but not const, so every translation unit gets its own writable copy -- confirm this is intended
+  /* we consider floating point numbers in that range as valid input numbers */
+  static MAYBE_UNUSED float FLT_LARGE = 1.844E18f;
+
+  struct TrueTy {  // tag type whose instances convert implicitly to bool true
+    __forceinline operator bool( ) const { return true; }
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED TrueTy True;
+
+  struct FalseTy {  // tag type whose instances convert implicitly to bool false
+    __forceinline operator bool( ) const { return false; }
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED FalseTy False;
+  
+  struct ZeroTy  // tag type that converts implicitly to 0 in each of the listed arithmetic types
+  {
+    __forceinline operator          double   ( ) const { return 0; }
+    __forceinline operator          float    ( ) const { return 0; }
+    __forceinline operator          long long( ) const { return 0; }
+    __forceinline operator unsigned long long( ) const { return 0; }
+    __forceinline operator          long     ( ) const { return 0; }
+    __forceinline operator unsigned long     ( ) const { return 0; }
+    __forceinline operator          int      ( ) const { return 0; }
+    __forceinline operator unsigned int      ( ) const { return 0; }
+    __forceinline operator          short    ( ) const { return 0; }
+    __forceinline operator unsigned short    ( ) const { return 0; }
+    __forceinline operator          int8_t     ( ) const { return 0; }
+    __forceinline operator uint8_t     ( ) const { return 0; }
+  }; 
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED ZeroTy zero;
+
+  struct OneTy  // tag type that converts implicitly to 1 in each of the listed arithmetic types
+  {
+    __forceinline operator          double   ( ) const { return 1; }
+    __forceinline operator          float    ( ) const { return 1; }
+    __forceinline operator          long long( ) const { return 1; }
+    __forceinline operator unsigned long long( ) const { return 1; }
+    __forceinline operator          long     ( ) const { return 1; }
+    __forceinline operator unsigned long     ( ) const { return 1; }
+    __forceinline operator          int      ( ) const { return 1; }
+    __forceinline operator unsigned int      ( ) const { return 1; }
+    __forceinline operator          short    ( ) const { return 1; }
+    __forceinline operator unsigned short    ( ) const { return 1; }
+    __forceinline operator          int8_t     ( ) const { return 1; }
+    __forceinline operator uint8_t     ( ) const { return 1; }
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED OneTy one;
+
+  struct NegInfTy  // tag type converting to the lowest value of the target type
+  {
+    __forceinline operator          double   ( ) const { return -std::numeric_limits<double>::infinity(); }
+    __forceinline operator          float    ( ) const { return -std::numeric_limits<float>::infinity(); }
+    __forceinline operator          long long( ) const { return std::numeric_limits<long long>::min(); }
+    __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::min(); }
+    __forceinline operator          long     ( ) const { return std::numeric_limits<long>::min(); }
+    __forceinline operator unsigned long     ( ) const { return std::numeric_limits<unsigned long>::min(); }
+    __forceinline operator          int      ( ) const { return std::numeric_limits<int>::min(); }
+    __forceinline operator unsigned int      ( ) const { return std::numeric_limits<unsigned int>::min(); }
+    __forceinline operator          short    ( ) const { return std::numeric_limits<short>::min(); }
+    __forceinline operator unsigned short    ( ) const { return std::numeric_limits<unsigned short>::min(); }
+    __forceinline operator          int8_t     ( ) const { return std::numeric_limits<int8_t>::min(); }
+    __forceinline operator uint8_t     ( ) const { return std::numeric_limits<uint8_t>::min(); }
+    // floating-point targets get -infinity; integer targets get numeric_limits::min(), which is 0 for the unsigned types
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED NegInfTy neg_inf;
+
+  struct PosInfTy  // tag type converting to +infinity (floating point) or numeric_limits::max() (integers)
+  {
+    __forceinline operator          double   ( ) const { return std::numeric_limits<double>::infinity(); }
+    __forceinline operator          float    ( ) const { return std::numeric_limits<float>::infinity(); }
+    __forceinline operator          long long( ) const { return std::numeric_limits<long long>::max(); }
+    __forceinline operator unsigned long long( ) const { return std::numeric_limits<unsigned long long>::max(); }
+    __forceinline operator          long     ( ) const { return std::numeric_limits<long>::max(); }
+    __forceinline operator unsigned long     ( ) const { return std::numeric_limits<unsigned long>::max(); }
+    __forceinline operator          int      ( ) const { return std::numeric_limits<int>::max(); }
+    __forceinline operator unsigned int      ( ) const { return std::numeric_limits<unsigned int>::max(); }
+    __forceinline operator          short    ( ) const { return std::numeric_limits<short>::max(); }
+    __forceinline operator unsigned short    ( ) const { return std::numeric_limits<unsigned short>::max(); }
+    __forceinline operator          int8_t     ( ) const { return std::numeric_limits<int8_t>::max(); }
+    __forceinline operator uint8_t     ( ) const { return std::numeric_limits<uint8_t>::max(); }
+  };
+  // two names are declared for the same tag type; both are declarations only (extern)
+  extern MAYBE_UNUSED PosInfTy inf;
+  extern MAYBE_UNUSED PosInfTy pos_inf;
+
+  struct NaNTy  // tag type converting to a quiet NaN; only double and float conversions exist
+  {
+    __forceinline operator double( ) const { return std::numeric_limits<double>::quiet_NaN(); }
+    __forceinline operator float ( ) const { return std::numeric_limits<float>::quiet_NaN(); }
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED NaNTy nan;
+
+  struct UlpTy  // tag type converting to numeric_limits<T>::epsilon() (gap between 1 and the next representable value)
+  {
+    __forceinline operator double( ) const { return std::numeric_limits<double>::epsilon(); }
+    __forceinline operator float ( ) const { return std::numeric_limits<float>::epsilon(); }
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED UlpTy ulp;
+
+  struct PiTy  // tag type converting implicitly to pi in double or float precision
+  {
+    __forceinline operator double( ) const { return static_cast<double>(M_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(M_PI); }
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED PiTy pi;
+
+  struct OneOverPiTy  // tag type converting implicitly to 1/pi (M_1_PI) in double or float precision
+  {
+    __forceinline operator double( ) const { return static_cast<double>(M_1_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(M_1_PI); }
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED OneOverPiTy one_over_pi;
+
+  struct TwoPiTy  // tag type converting implicitly to 2*pi; the product is formed in double, then cast
+  {
+    __forceinline operator double( ) const { return static_cast<double>(2.0*M_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(2.0*M_PI); }
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED TwoPiTy two_pi;
+
+  struct OneOverTwoPiTy  // tag type converting implicitly to 1/(2*pi), computed as 0.5*M_1_PI in double
+  {
+    __forceinline operator double( ) const { return static_cast<double>(0.5*M_1_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(0.5*M_1_PI); }
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED OneOverTwoPiTy one_over_two_pi;
+
+  struct FourPiTy  // tag type converting implicitly to 4*pi; the product is formed in double, then cast
+  {
+    __forceinline operator double( ) const { return static_cast<double>(4.0*M_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(4.0*M_PI); }
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED FourPiTy four_pi;
+
+  struct OneOverFourPiTy  // tag type converting implicitly to 1/(4*pi), computed as 0.25*M_1_PI in double
+  {
+    __forceinline operator double( ) const { return static_cast<double>(0.25*M_1_PI); }
+    __forceinline operator float ( ) const { return static_cast<float>(0.25*M_1_PI); }
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED OneOverFourPiTy one_over_four_pi;
+
+  struct StepTy {  // empty tag type: no members and no conversions
+  };
+  // declaration only (extern); NOTE(review): intended meaning of 'step' is not visible in this header
+  extern MAYBE_UNUSED StepTy step;
+
+  struct ReverseStepTy {  // empty tag type: no members and no conversions
+  };
+  // declaration only (extern); NOTE(review): intended meaning of 'reverse_step' is not visible in this header
+  extern MAYBE_UNUSED ReverseStepTy reverse_step;
+
+  struct EmptyTy {  // empty tag type; presumably selects "empty" constructors of range-like types -- confirm at use sites
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED EmptyTy empty;
+
+  struct FullTy {  // empty tag type; presumably selects "full" constructors of range-like types -- confirm at use sites
+  };
+  // declaration only (extern); this header contains no definition of the object
+  extern MAYBE_UNUSED FullTy full;
+
+  struct UndefinedTy {  // empty tag type: no members and no conversions
+  };
+  // declaration only (extern); NOTE(review): intended meaning of 'undefined' is not visible in this header
+  extern MAYBE_UNUSED UndefinedTy undefined;
+    
+#if defined(__aarch64__)  // 128-bit NEON vector constants; declarations only, the values are defined out of line
+  extern const uint32x4_t movemask_mask;
+  extern const uint32x4_t vzero;
+  extern const uint32x4_t v0x80000000;  // name spells the 32-bit value replicated in each lane
+  extern const uint32x4_t v0x7fffffff;  // name spells the 32-bit value replicated in each lane
+  extern const uint32x4_t v000F;  // vXXXX lane masks: presumably one character per lane, 'F' = 0xFFFFFFFF, '0' = 0 -- confirm against the definitions
+  extern const uint32x4_t v00F0;
+  extern const uint32x4_t v00FF;
+  extern const uint32x4_t v0F00;
+  extern const uint32x4_t v0F0F;
+  extern const uint32x4_t v0FF0;
+  extern const uint32x4_t v0FFF;
+  extern const uint32x4_t vF000;
+  extern const uint32x4_t vF00F;
+  extern const uint32x4_t vF0F0;
+  extern const uint32x4_t vF0FF;
+  extern const uint32x4_t vFF00;
+  extern const uint32x4_t vFF0F;
+  extern const uint32x4_t vFFF0;
+  extern const uint32x4_t vFFFF;
+  extern const uint8x16_t v0022;  // byte-vector constants; presumably byte-index tables picking the named 32-bit lanes -- confirm
+  extern const uint8x16_t v1133;
+  extern const uint8x16_t v0101;
+  extern const float32x4_t vOne;   // float vectors; presumably 1, -1, +inf and -inf in every lane -- confirm against the definitions
+  extern const float32x4_t vmOne;
+  extern const float32x4_t vInf;
+  extern const float32x4_t vmInf;
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/math/interval.h b/thirdparty/embree-aarch64/common/math/interval.h
new file mode 100644
index 0000000000..f06478e881
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/interval.h
@@ -0,0 +1,161 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+#include "vec3.h"
+#include "bbox.h"
+
+namespace embree
+{
+  template<typename V>
+    struct Interval  // pair of bounds [lower, upper] over a value type V, with interval-arithmetic operators
+    {
+      V lower, upper;
+      
+      __forceinline Interval() {}  // lower and upper are left default-initialized
+      __forceinline Interval           ( const Interval& other ) { lower = other.lower; upper = other.upper; }
+      __forceinline Interval& operator=( const Interval& other ) { lower = other.lower; upper = other.upper; return *this; }
+
+      __forceinline Interval(const V& a) : lower(a), upper(a) {}  // degenerate interval [a, a]
+      __forceinline Interval(const V& lower, const V& upper) : lower(lower), upper(upper) {}
+      __forceinline Interval(const BBox<V>& a) : lower(a.lower), upper(a.upper) {}
+          
+      /*! tests if box is empty */
+      //__forceinline bool empty() const { return lower > upper; }
+      
+      /*! computes the size of the interval */
+      __forceinline V size() const { return upper - lower; }
+      /*! computes the midpoint of the interval */
+      __forceinline V center() const { return 0.5f*(lower+upper); }
+      /*! grows this interval in place so that it also covers the argument; returns *this */
+      __forceinline const Interval& extend(const Interval& other) { lower = min(lower,other.lower); upper = max(upper,other.upper); return *this; }
+      __forceinline const Interval& extend(const V   & other) { lower = min(lower,other      ); upper = max(upper,other      ); return *this; }
+      
+      __forceinline friend Interval operator +( const Interval& a, const Interval& b ) {
+        return Interval(a.lower+b.lower,a.upper+b.upper);
+      }
+      /*! interval subtraction: the lower bound pairs a.lower with b.upper, the upper bound pairs a.upper with b.lower */
+      __forceinline friend Interval operator -( const Interval& a, const Interval& b ) {
+        return Interval(a.lower-b.upper,a.upper-b.lower);
+      }
+      /*! subtracting a single value shifts both bounds by -b */
+      __forceinline friend Interval operator -( const Interval& a, const V& b ) {
+        return Interval(a.lower-b,a.upper-b);
+      }
+      /*! interval product: bounds are the min and max over the four products of the operands' bounds */
+      __forceinline friend Interval operator *( const Interval& a, const Interval& b )
+      {
+        const V ll = a.lower*b.lower;
+        const V lu = a.lower*b.upper;
+        const V ul = a.upper*b.lower;
+        const V uu = a.upper*b.upper;
+        return Interval(min(ll,lu,ul,uu),max(ll,lu,ul,uu));
+      }
+      /*! smallest interval covering all arguments (min of lowers, max of uppers) */
+      __forceinline friend Interval merge( const Interval& a, const Interval& b) {
+        return Interval(min(a.lower,b.lower),max(a.upper,b.upper));
+      }
+      
+      __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c) {
+        return merge(merge(a,b),c);
+      }
+      
+      __forceinline friend Interval merge( const Interval& a, const Interval& b, const Interval& c, const Interval& d) {
+        return merge(merge(a,b),merge(c,d));
+      }
+      
+      /*! intersect intervals (max of lowers, min of uppers); the result has lower > upper when the inputs do not overlap */
+      __forceinline friend const Interval intersect( const Interval& a, const Interval& b ) { return Interval(max(a.lower, b.lower), min(a.upper, b.upper)); }
+      __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c ) { return intersect(a,intersect(b,c)); }
+      __forceinline friend const Interval intersect( const Interval& a, const Interval& b, const Interval& c, const Interval& d ) { return intersect(intersect(a,b),intersect(c,d)); }       
+      /*! prints the interval as "[lower, upper]" */
+      friend embree_ostream operator<<(embree_ostream cout, const Interval& a) {
+        return cout << "[" << a.lower << ", " << a.upper << "]";
+      }
+      
+      ////////////////////////////////////////////////////////////////////////////////
+      /// Constants
+      ////////////////////////////////////////////////////////////////////////////////
+      
+      __forceinline Interval( EmptyTy ) : lower(pos_inf), upper(neg_inf) {}  // inverted bounds: lower above upper
+      __forceinline Interval( FullTy  ) : lower(neg_inf), upper(pos_inf) {}  // widest range V can represent
+    };
+
+  __forceinline bool isEmpty(const Interval<float>& v) { 
+    return v.lower > v.upper;  // empty means inverted bounds; lower == upper is not empty
+  }
+
+  __forceinline vboolx isEmpty(const Interval<vfloatx>& v) {
+    return v.lower > v.upper;  // vector variant: result of comparing the two vfloatx bounds, returned as vboolx
+  }
+  
+  /*! subset relation; the comparison is strict, so a must lie strictly inside b (equal bounds do not count) */
+  template<typename T> __forceinline bool subset( const Interval<T>& a, const Interval<T>& b ) { 
+    return (a.lower > b.lower) && (a.upper < b.upper);
+  }
+
+  template<typename T> __forceinline bool subset( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) { 
+    return subset(a.x,b.x) && subset(a.y,b.y);  // both components must satisfy the interval subset test
+  }
+
+  template<typename T> __forceinline const Vec2<Interval<T>> intersect( const Vec2<Interval<T>>& a, const Vec2<Interval<T>>& b ) {
+    return Vec2<Interval<T>>(intersect(a.x,b.x),intersect(a.y,b.y));  // component-wise interval intersection
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /*! applies select(s, ., .) separately to the lower bounds and to the upper bounds of t and f */
+  template<typename T> __forceinline Interval<T> select ( bool s, const Interval<T>& t, const Interval<T>& f )
+  { const T lo = select(s,t.lower,f.lower), hi = select(s,t.upper,f.upper); return Interval<T>(lo,hi); }
+
+  /*! mask variant: applies select(s, ., .) separately to the lower bounds and to the upper bounds of t and f */
+  template<typename T> __forceinline Interval<T> select ( const typename T::Bool& s, const Interval<T>& t, const Interval<T>& f )
+  { const T lo = select(s,t.lower,f.lower), hi = select(s,t.upper,f.upper); return Interval<T>(lo,hi); }
+
+  /*! returns 1 when, within a tolerance of 1E-4, one of p0/p1 may be negative and one of p0/p1 may be positive; otherwise 0 */
+  __forceinline int numRoots(const Interval<float>& p0, const Interval<float>& p1) {
+    const float tol = 1E-4f;
+    const bool anyMayBeNegative = (p0.lower < tol)  || (p1.lower < tol);
+    const bool anyMayBePositive = (p0.upper > -tol) || (p1.upper > -tol);
+    return (anyMayBeNegative && anyMayBePositive) ? 1 : 0;  // same truth table as the four pairwise neg/pos combinations
+  }
+  
+  typedef Interval<float> Interval1f;
+  typedef Vec2<Interval<float>> Interval2f;
+  typedef Vec3<Interval<float>> Interval3f;
+
+inline void swap(float& a, float& b) { const float previousA = a; a = b; b = previousA; } // exchanges the two values in place
+
+inline Interval1f shift(const Interval1f& v, float shift) { const float lo = v.lower + shift; const float hi = v.upper + shift; return Interval1f(lo, hi); } // translates both bounds by the same offset
+
+#define TWO_PI (2.0*M_PI)
+inline Interval1f sin(Interval1f interval) /* conservative range of sin over the given interval */
+{
+  if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
+  if (interval.upper > TWO_PI)                 { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
+  if (interval.lower < 0)                      { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
+  float sinLower = sin(interval.lower), sinUpper = sin(interval.upper); /* lower is now in [0,2pi] and width < pi, so upper < 3pi */
+  if (sinLower > sinUpper) swap(sinLower, sinUpper);
+  if (interval.lower <       M_PI / 2.0 && interval.upper >       M_PI / 2.0) sinUpper =  1.0;
+  if (interval.lower < 5.0 * M_PI / 2.0 && interval.upper > 5.0 * M_PI / 2.0) sinUpper =  1.0; /* maximum reached when the reduced interval wraps past 2pi, e.g. [-0.1pi,0.6pi] */
+  if (interval.lower < 3.0 * M_PI / 2.0 && interval.upper > 3.0 * M_PI / 2.0) sinLower = -1.0;
+  return Interval1f(sinLower, sinUpper);
+}
+
+inline Interval1f cos(Interval1f interval) /* conservative range of cos over the given interval */
+{
+  if (interval.upper-interval.lower >= M_PI) { return Interval1f(-1.0, 1.0); }
+  if (interval.upper > TWO_PI)                 { interval = shift(interval, -TWO_PI*floor(interval.upper/TWO_PI)); }
+  if (interval.lower < 0)                      { interval = shift(interval, -TWO_PI*floor(interval.lower/TWO_PI)); }
+  float cosLower = cos(interval.lower), cosUpper = cos(interval.upper); /* lower is now in [0,2pi] and width < pi, so upper < 3pi */
+  if (cosLower > cosUpper) swap(cosLower, cosUpper);
+  if (interval.lower < M_PI && interval.upper > M_PI) cosLower = -1.0;
+  if (interval.lower < TWO_PI && interval.upper > TWO_PI) cosUpper = 1.0; /* maximum at 2pi, hit by any input that straddles a multiple of 2pi, e.g. [-0.1pi,0.1pi] */
+  return Interval1f(cosLower, cosUpper);
+}
+#undef TWO_PI
+}
diff --git a/thirdparty/embree-aarch64/common/math/lbbox.h b/thirdparty/embree-aarch64/common/math/lbbox.h
new file mode 100644
index 0000000000..95df4a918d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/lbbox.h
@@ -0,0 +1,289 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bbox.h"
+#include "range.h"
+
+namespace embree
+{
+  /*! given the values v of a linear function at the two ends of dt, returns its values at times 0 and 1 */
+  template<typename T>
+    __forceinline std::pair<T,T> globalLinear(const std::pair<T,T>& v, const BBox1f& dt) {
+    const float invExtent = float(1.0f)/dt.size();
+    const T atTimeZero = lerp(v.first,v.second,-dt.lower*invExtent);       // lerp parameter of time 0 relative to dt
+    const T atTimeOne  = lerp(v.first,v.second,(1.0f-dt.lower)*invExtent); // lerp parameter of time 1 relative to dt
+    return std::pair<T,T>(atTimeZero,atTimeOne);
+  }
+
+  template<typename T>
+  struct LBBox  // two boxes, bounds0 and bounds1; interpolate(t) lerps between them
+  {
+  public:
+    __forceinline LBBox () {}
+
+    template<typename T1>
+    __forceinline LBBox ( const LBBox<T1>& other )
+    : bounds0(other.bounds0), bounds1(other.bounds1) {} 
+
+    __forceinline LBBox& operator= ( const LBBox& other ) { 
+      bounds0 = other.bounds0; bounds1 = other.bounds1; return *this; 
+    }
+
+    __forceinline LBBox (EmptyTy) 
+      : bounds0(EmptyTy()), bounds1(EmptyTy()) {}
+    
+    __forceinline explicit LBBox ( const BBox<T>& bounds) 
+      : bounds0(bounds), bounds1(bounds) { }
+    
+    __forceinline LBBox ( const BBox<T>& bounds0, const BBox<T>& bounds1) 
+      : bounds0(bounds0), bounds1(bounds1) { }
+    /*! fits linear bounds to a sequence of boxes: starts from the first and last box and grows both ends until the lerp covers every box in between */
+    LBBox ( const avector<BBox<T>>& bounds ) 
+    {
+      assert(bounds.size());
+      BBox<T> b0 = bounds.front();
+      BBox<T> b1 = bounds.back();
+      for (size_t i=1; i<bounds.size()-1; i++) {
+        const float f = float(i)/float(bounds.size()-1);
+        const BBox<T> bt = lerp(b0,b1,f);
+        const T dlower = min(bounds[i].lower-bt.lower,T(zero));  // <= 0: how far box i sticks out below the current lerp
+        const T dupper = max(bounds[i].upper-bt.upper,T(zero));  // >= 0: how far box i sticks out above the current lerp
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+    /*! calculates the linear bounds of a primitive for the specified time range */
+    template<typename BoundsFunc>
+    __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range, float numTimeSegments)
+    {
+      const float lower = time_range.lower*numTimeSegments;
+      const float upper = time_range.upper*numTimeSegments;
+      const float ilowerf = floor(lower);
+      const float iupperf = ceil(upper);
+      const int ilower = (int)ilowerf;
+      const int iupper = (int)iupperf;
+
+      const BBox<T> blower0 = bounds(ilower);
+      const BBox<T> bupper1 = bounds(iupper);
+
+      if (iupper-ilower == 1) {  // the whole time range lies inside one segment
+        bounds0 = lerp(blower0, bupper1, lower-ilowerf);
+        bounds1 = lerp(bupper1, blower0, iupperf-upper);
+        return;
+      }
+
+      const BBox<T> blower1 = bounds(ilower+1);
+      const BBox<T> bupper0 = bounds(iupper-1);
+      BBox<T> b0 = lerp(blower0, blower1, lower-ilowerf);
+      BBox<T> b1 = lerp(bupper1, bupper0, iupperf-upper);
+
+      for (int i = ilower+1; i < iupper; i++)
+      {
+        const float f = (float(i)/numTimeSegments - time_range.lower) / time_range.size();
+        const BBox<T> bt = lerp(b0, b1, f);
+        const BBox<T> bi = bounds(i);
+        const T dlower = min(bi.lower-bt.lower, T(zero));
+        const T dupper = max(bi.upper-bt.upper, T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+    /*! calculates the linear bounds of a primitive for the specified time range */
+    template<typename BoundsFunc>
+    __forceinline LBBox(const BoundsFunc& bounds, const BBox1f& time_range_in, const BBox1f& geom_time_range, float geom_time_segments)
+    {
+      /* normalize global time_range_in to local geom_time_range */
+      const BBox1f time_range((time_range_in.lower-geom_time_range.lower)/geom_time_range.size(),
+                              (time_range_in.upper-geom_time_range.lower)/geom_time_range.size());
+        
+      const float lower = time_range.lower*geom_time_segments;
+      const float upper = time_range.upper*geom_time_segments;
+      const float ilowerf = floor(lower);
+      const float iupperf = ceil(upper);
+      const float ilowerfc = max(0.0f,ilowerf);
+      const float iupperfc = min(iupperf,geom_time_segments);
+      const int   ilowerc = (int)ilowerfc;
+      const int   iupperc = (int)iupperfc;
+      assert(iupperc-ilowerc > 0);
+
+      /* this larger iteration range guarantees that the borders of geom_time_range are processed when it lies (partially) inside time_range_in */
+      const int ilower_iter = max(-1,(int)ilowerf);
+      const int iupper_iter = min((int)iupperf,(int)geom_time_segments+1);
+        
+      const BBox<T> blower0 = bounds(ilowerc);
+      const BBox<T> bupper1 = bounds(iupperc);
+      if (iupper_iter-ilower_iter == 1) {
+        bounds0 = lerp(blower0, bupper1, max(0.0f,lower-ilowerfc));
+        bounds1 = lerp(bupper1, blower0, max(0.0f,iupperfc-upper));
+        return;
+      }
+
+      const BBox<T> blower1 = bounds(ilowerc+1);
+      const BBox<T> bupper0 = bounds(iupperc-1);
+      BBox<T> b0 = lerp(blower0, blower1, max(0.0f,lower-ilowerfc));
+      BBox<T> b1 = lerp(bupper1, bupper0, max(0.0f,iupperfc-upper));
+
+      for (int i = ilower_iter+1; i < iupper_iter; i++)
+      {
+        const float f = (float(i)/geom_time_segments - time_range.lower) / time_range.size();
+        const BBox<T> bt = lerp(b0, b1, f);
+        const BBox<T> bi = bounds(i);
+        const T dlower = min(bi.lower-bt.lower, T(zero));
+        const T dupper = max(bi.upper-bt.upper, T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+    /*! calculates the linear bounds of a primitive for the specified time range */
+    template<typename BoundsFunc>
+    __forceinline LBBox(const BoundsFunc& bounds, const range<int>& time_range, int numTimeSegments)
+    {
+      const int ilower = time_range.begin();
+      const int iupper = time_range.end();
+
+      BBox<T> b0 = bounds(ilower);
+      BBox<T> b1 = bounds(iupper);
+
+      if (iupper-ilower == 1)
+      {
+        bounds0 = b0;
+        bounds1 = b1;
+        return;
+      }
+  
+      for (int i = ilower+1; i<iupper; i++)
+      {
+        const float f = float(i - time_range.begin()) / float(time_range.size());
+        const BBox<T> bt = lerp(b0, b1, f);
+        const BBox<T> bi = bounds(i);
+        const T dlower = min(bi.lower-bt.lower, T(zero));
+        const T dupper = max(bi.upper-bt.upper, T(zero));
+        b0.lower += dlower; b1.lower += dlower;
+        b0.upper += dupper; b1.upper += dupper;
+      }
+
+      bounds0 = b0;
+      bounds1 = b1;
+    }
+
+  public:
+
+    __forceinline bool empty() const {
+      return bounds().empty();
+    }
+    /*! merge of bounds0 and bounds1 */
+    __forceinline BBox<T> bounds () const {
+      return merge(bounds0,bounds1);
+    }
+    /*! box obtained by lerping from bounds0 to bounds1 with parameter t */
+    __forceinline BBox<T> interpolate( const float t ) const {
+      return lerp(bounds0,bounds1,t);
+    }
+    /*! linear bounds whose two boxes are the interpolated boxes at dt.lower and dt.upper */
+    __forceinline LBBox<T> interpolate( const BBox1f& dt ) const {
+      return LBBox<T>(interpolate(dt.lower),interpolate(dt.upper));
+    }
+
+    __forceinline void extend( const LBBox& other ) {
+      bounds0.extend(other.bounds0);
+      bounds1.extend(other.bounds1);
+    }
+    /*! only declared here; no generic definition appears inside this struct */
+    __forceinline float expectedHalfArea() const;
+
+    __forceinline float expectedHalfArea(const BBox1f& dt) const {
+      return interpolate(dt).expectedHalfArea();
+    }
+    /*! mean of the half areas of bounds0 and bounds1 */
+    __forceinline float expectedApproxHalfArea() const {
+      return 0.5f*(halfArea(bounds0) + halfArea(bounds1));
+    }
+
+    /* calculates bounds for [0,1] time range from bounds in dt time range */
+    __forceinline LBBox global(const BBox1f& dt) const 
+    {
+      const float rcp_dt_size = 1.0f/dt.size();
+      const BBox<T> b0 = interpolate(-dt.lower*rcp_dt_size);
+      const BBox<T> b1 = interpolate((1.0f-dt.lower)*rcp_dt_size);
+      return LBBox(b0,b1);
+    }
+
+    /*! Comparison Operators */
+    //template<typename TT> friend __forceinline bool operator==( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
+    //template<typename TT> friend __forceinline bool operator!=( const LBBox<TT>& a, const LBBox<TT>& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
+    friend __forceinline bool operator==( const LBBox& a, const LBBox& b ) { return a.bounds0 == b.bounds0 && a.bounds1 == b.bounds1; }
+    friend __forceinline bool operator!=( const LBBox& a, const LBBox& b ) { return a.bounds0 != b.bounds0 || a.bounds1 != b.bounds1; }
+    
+    /*! output operator */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const LBBox& box) {
+      return cout << "LBBox { " << box.bounds0 << "; " << box.bounds1 << " }";
+    }
+
+  public:
+    BBox<T> bounds0, bounds1;
+  };
+
+  /*! tests if box is finite: true only when isvalid holds for both bounds0 and bounds1 */
+  template<typename T>
+    __forceinline bool isvalid( const LBBox<T>& v ) {
+    return isvalid(v.bounds0) && isvalid(v.bounds1);
+  }
+
+  template<typename T>
+    __forceinline bool isvalid_non_empty( const LBBox<T>& v ) {
+    return isvalid_non_empty(v.bounds0) && isvalid_non_empty(v.bounds1);  // both boxes must pass the BBox-level check
+  }
+  
+  /*! integral over t in [0,1] of (a0+t*(a1-a0))*(b0+t*(b1-b0)): the mean product of two linearly varying quantities */
+  template<typename T>
+    __forceinline T expectedArea(const T& a0, const T& a1, const T& b0, const T& b1) {
+    const T deltaA = a1-a0, deltaB = b1-b0;
+    const T linearTerm = (a0*deltaB+deltaA*b0)*T(0.5f), quadraticTerm = deltaA*deltaB*T(1.0f/3.0f);
+    return a0*b0+linearTerm + quadraticTerm;  // same left-to-right summation order as the closed form
+  }
+  
+  template<> __forceinline float LBBox<Vec3fa>::expectedHalfArea() const 
+  {
+    const Vec3fa d0 = bounds0.size();  // extents of the first box
+    const Vec3fa d1 = bounds1.size();  // extents of the second box
+    return reduce_add(expectedArea(Vec3fa(d0.x,d0.y,d0.z),   // pairs each extent with the next one: (x,y), (y,z), (z,x)
+                                   Vec3fa(d1.x,d1.y,d1.z),
+                                   Vec3fa(d0.y,d0.z,d0.x),   // rotated copy of the extents supplies the second factor
+                                   Vec3fa(d1.y,d1.z,d1.x)));
+  }
+
+  template<typename T>
+  __forceinline float expectedApproxHalfArea(const LBBox<T>& box) {
+    return box.expectedApproxHalfArea();  // free-function form that forwards to the member
+  }
+
+  template<typename T>
+  __forceinline LBBox<T> merge(const LBBox<T>& a, const LBBox<T>& b) {
+    return LBBox<T>(merge(a.bounds0, b.bounds0), merge(a.bounds1, b.bounds1));  // merges the first boxes and the second boxes independently
+  }
+
+   /*! subset relation: a is a subset of b when both of its boxes are subsets of the corresponding boxes of b */
+  template<typename T> __inline bool subset( const LBBox<T>& a, const LBBox<T>& b ) {
+    return subset(a.bounds0,b.bounds0) && subset(a.bounds1,b.bounds1);
+  }
+
+  /*! default template instantiations */
+  typedef LBBox<float> LBBox1f;
+  typedef LBBox<Vec2f> LBBox2f;
+  typedef LBBox<Vec3f> LBBox3f;
+  typedef LBBox<Vec3fa> LBBox3fa;
+  typedef LBBox<Vec3fx> LBBox3fx;
+}
diff --git a/thirdparty/embree-aarch64/common/math/linearspace2.h b/thirdparty/embree-aarch64/common/math/linearspace2.h
new file mode 100644
index 0000000000..b9a382962c
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/linearspace2.h
@@ -0,0 +1,148 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec2.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 2D Linear Transform (2x2 Matrix)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct LinearSpace2
+  {
+    typedef T Vector;
+    typedef typename T::Scalar Scalar;
+
+    /*! default matrix constructor */
+    __forceinline LinearSpace2           ( ) {}
+    __forceinline LinearSpace2           ( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; }
+    __forceinline LinearSpace2& operator=( const LinearSpace2& other ) { vx = other.vx; vy = other.vy; return *this; }
+
+    template<typename L1> __forceinline LinearSpace2( const LinearSpace2<L1>& s ) : vx(s.vx), vy(s.vy) {}
+
+    /*! matrix construction from column vectors */
+    __forceinline LinearSpace2(const Vector& vx, const Vector& vy)
+      : vx(vx), vy(vy) {}
+
+    /*! matrix construction from row major data */
+    __forceinline LinearSpace2(const Scalar& m00, const Scalar& m01, 
+                               const Scalar& m10, const Scalar& m11)
+      : vx(m00,m10), vy(m01,m11) {}
+
+    /*! compute the determinant of the matrix */
+    __forceinline const Scalar det() const { return vx.x*vy.y - vx.y*vy.x; }
+
+    /*! compute adjoint matrix */
+    __forceinline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y,-vy.x,-vx.y,vx.x); }
+
+    /*! compute inverse matrix (adjoint divided by the determinant; no check for a zero determinant) */
+    __forceinline const LinearSpace2 inverse() const { return adjoint()/det(); }
+
+    /*! compute transposed matrix */
+    __forceinline const LinearSpace2 transposed() const { return LinearSpace2(vx.x,vx.y,vy.x,vy.y); }
+
+    /*! returns first row of matrix */
+    __forceinline Vector row0() const { return Vector(vx.x,vy.x); }
+
+    /*! returns second row of matrix */
+    __forceinline Vector row1() const { return Vector(vx.y,vy.y); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline LinearSpace2( ZeroTy ) : vx(zero), vy(zero) {}
+    __forceinline LinearSpace2( OneTy ) : vx(one, zero), vy(zero, one) {}  // identity matrix
+
+    /*! return matrix for scaling */
+    static __forceinline LinearSpace2 scale(const Vector& s) {
+      return LinearSpace2(s.x,   0,
+                          0  , s.y);
+    }
+
+    /*! return matrix for rotation */
+    static __forceinline LinearSpace2 rotate(const Scalar& r) {
+      Scalar s = sin(r), c = cos(r);
+      return LinearSpace2(c, -s,
+                          s,  c);
+    }
+
+    /*! return closest orthogonal matrix (i.e. a general rotation including reflection) */
+    LinearSpace2 orthogonal() const 
+    {
+      LinearSpace2 m = *this;
+
+      // mirrored?
+      Scalar mirror(one);
+      if (m.det() < Scalar(zero)) {
+        m.vx = -m.vx;
+        mirror = -mirror;
+      }
+
+      // rotation
+      for (int i = 0; i < 99; i++) {  // at most 99 rounds of averaging m with its inverse transpose
+        const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse());
+        const LinearSpace2 d = m_next - m;
+        m = m_next;
+        // norm^2 of difference small enough?
+        if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8)
+          break;
+      }
+
+      // rotation * mirror_x
+      return LinearSpace2(mirror*m.vx, m.vy);
+    }
+
+  public:
+
+    /*! the column vectors of the matrix */
+    Vector vx,vy;
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a ) { return LinearSpace2<T>(-a.vx,-a.vy); }  // negates both columns
+  template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a ) { return LinearSpace2<T>(+a.vx,+a.vy); }  // unary plus applied per column
+  template<typename T> __forceinline LinearSpace2<T> rcp       ( const LinearSpace2<T>& a ) { return a.inverse(); }  // reciprocal of a matrix is its inverse
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace2<T> operator +( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx+b.vx,a.vy+b.vy); }  // column-wise sum
+  template<typename T> __forceinline LinearSpace2<T> operator -( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return LinearSpace2<T>(a.vx-b.vx,a.vy-b.vy); }  // column-wise difference
+  // products: scalar * matrix, matrix * vector (linear combination of the columns), matrix * matrix (a applied to each column of b)
+  template<typename T> __forceinline LinearSpace2<T> operator*(const typename T::Scalar & a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); }
+  template<typename T> __forceinline T               operator*(const LinearSpace2<T>& a, const T              & b) { return b.x*a.vx + b.y*a.vy; }
+  template<typename T> __forceinline LinearSpace2<T> operator*(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return LinearSpace2<T>(a*b.vx, a*b.vy); }
+  // quotients: matrix / scalar divides each column; matrix / matrix multiplies by the inverse of the right operand
+  template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const typename T::Scalar & b) { return LinearSpace2<T>(a.vx/b, a.vy/b); }
+  template<typename T> __forceinline LinearSpace2<T> operator/(const LinearSpace2<T>& a, const LinearSpace2<T>& b) { return a * rcp(b); }
+  // compound assignment, defined in terms of the binary operators above
+  template<typename T> __forceinline LinearSpace2<T>& operator *=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a * b; }
+  template<typename T> __forceinline LinearSpace2<T>& operator /=( LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx == b.vx && a.vy == b.vy; }  // equal when both columns are equal
+  template<typename T> __forceinline bool operator !=( const LinearSpace2<T>& a, const LinearSpace2<T>& b ) { return a.vx != b.vx || a.vy != b.vy; }  // differs when either column differs
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace2<T>& m) {
+    return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}";  // prints the two column vectors
+  }
+
+  /*! Shortcuts for common linear spaces. */
+  typedef LinearSpace2<Vec2f> LinearSpace2f;
+  typedef LinearSpace2<Vec2fa> LinearSpace2fa;
+}
diff --git a/thirdparty/embree-aarch64/common/math/linearspace3.h b/thirdparty/embree-aarch64/common/math/linearspace3.h
new file mode 100644
index 0000000000..12b5bb776b
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/linearspace3.h
@@ -0,0 +1,213 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec3.h"
+#include "quaternion.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 3D Linear Transform (3x3 Matrix)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct LinearSpace3
+  {
+    typedef T Vector; // type of the three column vectors
+    typedef typename T::Scalar Scalar; // component type exported by the vector type
+
+    /*! default constructor (initializes nothing explicitly), copy constructor and copy assignment */
+    __forceinline LinearSpace3           ( ) {}
+    __forceinline LinearSpace3           ( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; }
+    __forceinline LinearSpace3& operator=( const LinearSpace3& other ) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; }
+
+    template<typename L1> __forceinline LinearSpace3( const LinearSpace3<L1>& s ) : vx(s.vx), vy(s.vy), vz(s.vz) {} // conversion from a linear space over another vector type
+
+    /*! matrix construction from column vectors */
+    __forceinline LinearSpace3(const Vector& vx, const Vector& vy, const Vector& vz)
+      : vx(vx), vy(vy), vz(vz) {}
+
+    /*! construction from quaternion q = (r, i, j, k); each initializer below fills one column */
+    __forceinline LinearSpace3( const QuaternionT<Scalar>& q )
+      : vx((q.r*q.r + q.i*q.i - q.j*q.j - q.k*q.k), 2.0f*(q.i*q.j + q.r*q.k), 2.0f*(q.i*q.k - q.r*q.j))
+      , vy(2.0f*(q.i*q.j - q.r*q.k), (q.r*q.r - q.i*q.i + q.j*q.j - q.k*q.k), 2.0f*(q.j*q.k + q.r*q.i))
+      , vz(2.0f*(q.i*q.k + q.r*q.j), 2.0f*(q.j*q.k - q.r*q.i), (q.r*q.r - q.i*q.i - q.j*q.j + q.k*q.k)) {}
+
+    /*! matrix construction from row-major data (mRC is the entry in row R, column C) */
+    __forceinline LinearSpace3(const Scalar& m00, const Scalar& m01, const Scalar& m02,
+                               const Scalar& m10, const Scalar& m11, const Scalar& m12,
+                               const Scalar& m20, const Scalar& m21, const Scalar& m22)
+      : vx(m00,m10,m20), vy(m01,m11,m21), vz(m02,m12,m22) {}
+
+    /*! compute the determinant of the matrix as dot(vx, cross(vy, vz)) */
+    __forceinline const Scalar det() const { return dot(vx,cross(vy,vz)); }
+
+    /*! compute adjoint matrix: the transpose of the matrix whose columns are the pairwise cross products of the columns */
+    __forceinline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy,vz),cross(vz,vx),cross(vx,vy)).transposed(); }
+
+    /*! compute inverse matrix as adjoint()/det(); a zero determinant is not checked for */
+    __forceinline const LinearSpace3 inverse() const { return adjoint()/det(); }
+
+    /*! compute transposed matrix by feeding the columns to the row-major constructor as rows */
+    __forceinline const LinearSpace3 transposed() const { return LinearSpace3(vx.x,vx.y,vx.z,vy.x,vy.y,vy.z,vz.x,vz.y,vz.z); }
+
+    /*! returns first row of matrix (the x components of the three columns) */
+    __forceinline Vector row0() const { return Vector(vx.x,vy.x,vz.x); }
+
+    /*! returns second row of matrix (the y components of the three columns) */
+    __forceinline Vector row1() const { return Vector(vx.y,vy.y,vz.y); }
+
+    /*! returns third row of matrix (the z components of the three columns) */
+    __forceinline Vector row2() const { return Vector(vx.z,vy.z,vz.z); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline LinearSpace3( ZeroTy ) : vx(zero), vy(zero), vz(zero) {} // every column built from the zero constant
+    __forceinline LinearSpace3( OneTy ) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) {} // one on the diagonal, zero elsewhere
+
+    /*! return matrix for scaling: s.x, s.y, s.z on the diagonal, zero elsewhere */
+    static __forceinline LinearSpace3 scale(const Vector& s) {
+      return LinearSpace3(s.x,   0,   0,
+                          0  , s.y,   0,
+                          0  ,   0, s.z);
+    }
+
+    /*! return matrix for rotation around arbitrary axis _u (normalized here) by the angle r passed to sin/cos */
+    static __forceinline LinearSpace3 rotate(const Vector& _u, const Scalar& r) {
+      Vector u = normalize(_u);
+      Scalar s = sin(r), c = cos(r);
+      return LinearSpace3(u.x*u.x+(1-u.x*u.x)*c,  u.x*u.y*(1-c)-u.z*s,    u.x*u.z*(1-c)+u.y*s,
+                          u.x*u.y*(1-c)+u.z*s,    u.y*u.y+(1-u.y*u.y)*c,  u.y*u.z*(1-c)-u.x*s,
+                          u.x*u.z*(1-c)-u.y*s,    u.y*u.z*(1-c)+u.x*s,    u.z*u.z+(1-u.z*u.z)*c);
+    }
+
+  public:
+
+    /*! the column vectors of the matrix */
+    Vector vx,vy,vz;
+  };
+
+  /*! compute transposed matrix; Vec3fa specialization that views the columns as vfloat4 and calls the 4-input transpose helper */
+  template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const { 
+    vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz); // the fourth input is a zero vector
+    return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz)); 
+  }
+
+  template<typename T>
+    __forceinline const LinearSpace3<T> transposed(const LinearSpace3<T>& xfm) { // free-function form of xfm.transposed()
+    return xfm.transposed();
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a ) { return LinearSpace3<T>(-a.vx,-a.vy,-a.vz); } // negates every column
+  template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a ) { return LinearSpace3<T>(+a.vx,+a.vy,+a.vz); } // unary plus applied to every column
+  template<typename T> __forceinline LinearSpace3<T> rcp       ( const LinearSpace3<T>& a ) { return a.inverse(); } // reciprocal of a matrix is its inverse
+
+  /* constructs a coordinate frame from a normalized normal N; N itself becomes the third column */
+  template<typename T> __forceinline LinearSpace3<T> frame(const T& N) 
+  {
+    const T dx0(0,N.z,-N.y); // two candidate tangents; dot(dx0,N) and dot(dx1,N) are both zero
+    const T dx1(-N.z,0,N.x);
+    const T dx = normalize(select(dot(dx0,dx0) > dot(dx1,dx1),dx0,dx1)); // keep the longer candidate
+    const T dy = normalize(cross(N,dx));
+    return LinearSpace3<T>(dx,dy,N);
+  }
+
+  /* constructs a coordinate frame from a normal N and an approximate x-direction dxi; N becomes the third column */
+  template<typename T> __forceinline LinearSpace3<T> frame(const T& N, const T& dxi)
+  {
+    if (abs(dot(dxi,N)) > 0.99f) return frame(N); // fallback in case N and dxi are very parallel
+    const T dx = normalize(cross(dxi,N)); // first column: normalized cross(dxi,N)
+    const T dy = normalize(cross(N,dx)); // second column: normalized cross(N,dx)
+    return LinearSpace3<T>(dx,dy,N);
+  }
+  
+  /* clamps each column of the linear space to the range T(-1.0f) to T(1.0f) */
+  template<typename T> __forceinline LinearSpace3<T> clamp(const LinearSpace3<T>& space) {
+    return LinearSpace3<T>(clamp(space.vx,T(-1.0f),T(1.0f)),
+                           clamp(space.vy,T(-1.0f),T(1.0f)),
+                           clamp(space.vz,T(-1.0f),T(1.0f)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace3<T> operator +( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx+b.vx,a.vy+b.vy,a.vz+b.vz); } // column-wise sum
+  template<typename T> __forceinline LinearSpace3<T> operator -( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return LinearSpace3<T>(a.vx-b.vx,a.vy-b.vy,a.vz-b.vz); } // column-wise difference
+
+  template<typename T> __forceinline LinearSpace3<T> operator*(const typename T::Scalar & a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } // scalar times matrix
+  template<typename T> __forceinline T               operator*(const LinearSpace3<T>& a, const T              & b) { return madd(T(b.x),a.vx,madd(T(b.y),a.vy,T(b.z)*a.vz)); } // matrix times vector: b.x*vx + b.y*vy + b.z*vz
+  template<typename T> __forceinline LinearSpace3<T> operator*(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return LinearSpace3<T>(a*b.vx, a*b.vy, a*b.vz); } // matrix product: a applied to each column of b
+
+  template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const typename T::Scalar & b) { return LinearSpace3<T>(a.vx/b, a.vy/b, a.vz/b); } // divides every column by the scalar b
+  template<typename T> __forceinline LinearSpace3<T> operator/(const LinearSpace3<T>& a, const LinearSpace3<T>& b) { return a * rcp(b); } // a times the inverse of b
+
+  template<typename T> __forceinline LinearSpace3<T>& operator *=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a * b; } // in-place form of a * b
+  template<typename T> __forceinline LinearSpace3<T>& operator /=( LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a = a / b; } // in-place form of a / b
+
+  template<typename T> __forceinline T       xfmPoint (const LinearSpace3<T>& s, const T      & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } // same expression as operator*(s,a)
+  template<typename T> __forceinline T       xfmVector(const LinearSpace3<T>& s, const T      & a) { return madd(T(a.x),s.vx,madd(T(a.y),s.vy,T(a.z)*s.vz)); } // same expression as xfmPoint
+  template<typename T> __forceinline T       xfmNormal(const LinearSpace3<T>& s, const T      & a) { return xfmVector(s.inverse().transposed(),a); } // transforms a by the inverse transpose of s
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } // equal when all three columns compare equal
+  template<typename T> __forceinline bool operator !=( const LinearSpace3<T>& a, const LinearSpace3<T>& b ) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } // different when any column differs
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline LinearSpace3<T> select ( const typename T::Scalar::Bool& s, const LinearSpace3<T>& t, const LinearSpace3<T>& f ) { // column-wise select between t and f using the mask s
+    return LinearSpace3<T>(select(s,t.vx,f.vx),select(s,t.vy,f.vy),select(s,t.vz,f.vz));
+  }
+
+  /*! blending: column-wise lerp between l0 and l1 with a float parameter t */
+  template<typename T>
+    __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0, const LinearSpace3<T>& l1, const float t) 
+  {
+    return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
+                           lerp(l0.vy,l1.vy,t),
+                           lerp(l0.vz,l1.vz,t));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> static embree_ostream operator<<(embree_ostream cout, const LinearSpace3<T>& m) { // prints m as "{ vx = ..., vy = ..., vz = ...}"
+    return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}";
+  }
+
+  /*! Shortcuts for common linear spaces. */
+  typedef LinearSpace3<Vec3f> LinearSpace3f;
+  typedef LinearSpace3<Vec3fa> LinearSpace3fa;
+  typedef LinearSpace3<Vec3fx> LinearSpace3fx;
+  typedef LinearSpace3<Vec3ff> LinearSpace3ff;
+
+  template<int N> using LinearSpace3vf = LinearSpace3<Vec3<vfloat<N>>>;
+  typedef LinearSpace3<Vec3<vfloat<4>>>  LinearSpace3vf4;
+  typedef LinearSpace3<Vec3<vfloat<8>>>  LinearSpace3vf8;
+  typedef LinearSpace3<Vec3<vfloat<16>>> LinearSpace3vf16;
+
+  /*! blending: column-wise lerp between l0 and l1 with a parameter t of arbitrary type S */
+  template<typename T, typename S>
+    __forceinline LinearSpace3<T> lerp(const LinearSpace3<T>& l0,
+                                       const LinearSpace3<T>& l1,
+                                       const S& t)
+  {
+    return LinearSpace3<T>(lerp(l0.vx,l1.vx,t),
+                           lerp(l0.vy,l1.vy,t),
+                           lerp(l0.vz,l1.vz,t));
+  }
+
+}
diff --git a/thirdparty/embree-aarch64/common/math/math.h b/thirdparty/embree-aarch64/common/math/math.h
new file mode 100644
index 0000000000..6d54abd44d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/math.h
@@ -0,0 +1,451 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "constants.h"
+#include <cmath>
+
+#if defined(__ARM_NEON)
+#include "SSE2NEON.h"
+#if defined(NEON_AVX2_EMULATION)
+#include "AVX2NEON.h"
+#endif
+#else
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#include <immintrin.h>
+#endif
+
+#if defined(__WIN32__) && !defined(__MINGW32__)
+#if defined(_MSC_VER) && (_MSC_VER <= 1700) // shims only for Visual C++ 2012 (_MSC_VER 1700) and older; was the misspelled, never-defined __MSV_VER
+namespace std
+{
+  __forceinline bool isinf ( const float x ) { return _finite(x) == 0 && _isnan(x) == 0; } // infinite means not finite and not NaN
+  __forceinline bool isnan ( const float x ) { return _isnan(x) != 0; }
+  __forceinline bool isfinite (const float x) { return _finite(x) != 0; }
+}
+#endif
+#endif
+
+namespace embree
+{
+  __forceinline bool isvalid ( const float& v ) { // true when v lies strictly between -FLT_LARGE and +FLT_LARGE
+    const bool aboveMin = v > -FLT_LARGE, belowMax = v < +FLT_LARGE; return aboveMin & belowMax;
+  }
+
+  __forceinline int cast_f2i(float f) { // reinterprets the bits of f as an int through a union
+    union { float asFloat; int asInt; } bits; bits.asFloat = f; return bits.asInt;
+  }
+
+  __forceinline float cast_i2f(int i) { // reinterprets the bits of i as a float through a union
+    union { float asFloat; int asInt; } bits; bits.asInt = i; return bits.asFloat;
+  }
+
+  __forceinline int   toInt  (const float& a) { return static_cast<int>(a); }   // value conversion float -> int
+  __forceinline float toFloat(const int&   a) { return static_cast<float>(a); } // value conversion int -> float
+
+#if defined(__WIN32__) && !defined(__MINGW32__)
+  __forceinline bool finite ( const float x ) { return 0 != _finite(x); } // forwards to the CRT _finite
+#endif
+
+  __forceinline float sign ( const float x ) { return (x < 0) ? -1.0f : 1.0f; } // -1 for x below zero, +1 for everything else
+  __forceinline float sqr  ( const float x ) { const float squared = x*x; return squared; } // x squared
+
+  __forceinline float rcp  ( const float x ) // approximate 1/x from a hardware reciprocal estimate plus refinement
+  {
+#if defined(__aarch64__)
+      // Move scalar to lane 0 of a vector register and do rcp; only lane 0 is assigned here and read back.
+      __m128 a;
+      a[0] = x;
+      float32x4_t reciprocal = vrecpeq_f32(a); // initial estimate
+      reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); // first refinement step
+      reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); // second refinement step
+      return reciprocal[0];
+#else
+    // x86 path: estimate r, then return r*(2 - r*a)
+    const __m128 a = _mm_set_ss(x);
+
+#if defined(__AVX512VL__)
+    const __m128 r = _mm_rcp14_ss(_mm_set_ss(0.0f),a);
+#else
+    const __m128 r = _mm_rcp_ss(a);
+#endif
+
+#if defined(__AVX2__)
+    return _mm_cvtss_f32(_mm_mul_ss(r,_mm_fnmadd_ss(r, a, _mm_set_ss(2.0f)))); // fused form of 2 - r*a
+#else
+    return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
+#endif
+
+#endif  //defined(__aarch64__)
+  }
+
+  __forceinline float signmsk ( const float x ) { // bitwise AND of x with 0x80000000, i.e. keeps only the top bit of x
+#if defined(__aarch64__)
+      // FP and Neon share the same vector registers on arm64; only lane 0 of a and b is assigned and used
+      __m128 a;
+      __m128i b;
+      a[0] = x;
+      b[0] = 0x80000000;
+      a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+      return a[0];
+#else
+    return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#endif
+  }
+  __forceinline float xorf( const float x, const float y ) { // bitwise XOR of the representations of x and y, returned as float
+#if defined(__aarch64__)
+      // FP and Neon share the same vector registers on arm64; only lane 0 of a and b is assigned and used
+      __m128 a;
+      __m128 b;
+      a[0] = x;
+      b[0] = y;
+      a = _mm_xor_ps(a, b);
+      return a[0];
+#else
+    return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
+#endif
+  }
+  __forceinline float andf( const float x, const unsigned y ) { // bitwise AND of the representation of x with the mask y, returned as float
+#if defined(__aarch64__) 
+      // FP and Neon share the same vector registers on arm64; only lane 0 of a and b is assigned and used
+      __m128 a;
+      __m128i b;
+      a[0] = x;
+      b[0] = y;
+      a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
+      return a[0];
+#else
+    return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
+#endif
+  }
+  __forceinline float rsqrt( const float x ) // approximate 1/sqrt(x) from a hardware estimate plus refinement
+  {
+#if defined(__aarch64__)
+      // FP and Neon share the same vector registers on arm64; only lane 0 of a is assigned and read back
+      __m128 a;
+      a[0] = x;
+      __m128 value = _mm_rsqrt_ps(a); // initial estimate
+      value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); // first refinement step
+      value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value)); // second refinement step
+      return value[0];
+#else
+    // x86 path: estimate r, then return 1.5*r - 0.5*a*r*r*r
+    const __m128 a = _mm_set_ss(x);
+#if defined(__AVX512VL__)
+    const __m128 r = _mm_rsqrt14_ss(_mm_set_ss(0.0f),a);
+#else
+    const __m128 r = _mm_rsqrt_ss(a);
+#endif
+    const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r),
+                                _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r)));
+    return _mm_cvtss_f32(c);
+#endif
+  }
+
+#if defined(__WIN32__) && defined(_MSC_VER) && (_MSC_VER <= 1700) && !defined(__MINGW32__) // fallbacks for Visual C++ 2012 and older only; was the misspelled, never-defined __MSC_VER
+  __forceinline float nextafter(float x, float y) { if ((x<y) == (x>0)) return x*(1.1f+float(ulp)); else return x*(0.9f-float(ulp)); } // crude stand-in: scales x instead of stepping to the adjacent float
+  __forceinline double nextafter(double x, double y) { return _nextafter(x, y); }
+  __forceinline int roundf(float f) { return (int)(f + 0.5f); } // adds 0.5 and truncates toward zero
+#else
+  __forceinline float nextafter(float x, float y) { return ::nextafterf(x, y); } // forwards to the C library
+  __forceinline double nextafter(double x, double y) { return ::nextafter(x, y); }
+#endif
+
+  __forceinline float abs  ( const float x ) { return ::fabsf(x); } // float overloads below forward to the global C math functions with the f suffix
+  __forceinline float acos ( const float x ) { return ::acosf (x); }
+  __forceinline float asin ( const float x ) { return ::asinf (x); }
+  __forceinline float atan ( const float x ) { return ::atanf (x); }
+  __forceinline float atan2( const float y, const float x ) { return ::atan2f(y, x); }
+  __forceinline float cos  ( const float x ) { return ::cosf  (x); }
+  __forceinline float cosh ( const float x ) { return ::coshf (x); }
+  __forceinline float exp  ( const float x ) { return ::expf  (x); }
+  __forceinline float fmod ( const float x, const float y ) { return ::fmodf (x, y); }
+  __forceinline float log  ( const float x ) { return ::logf  (x); }
+  __forceinline float log10( const float x ) { return ::log10f(x); }
+  __forceinline float pow  ( const float x, const float y ) { return ::powf  (x, y); }
+  __forceinline float sin  ( const float x ) { return ::sinf  (x); }
+  __forceinline float sinh ( const float x ) { return ::sinhf (x); }
+  __forceinline float sqrt ( const float x ) { return ::sqrtf (x); }
+  __forceinline float tan  ( const float x ) { return ::tanf  (x); }
+  __forceinline float tanh ( const float x ) { return ::tanhf (x); }
+  __forceinline float floor( const float x ) { return ::floorf (x); }
+  __forceinline float ceil ( const float x ) { return ::ceilf (x); }
+  __forceinline float frac ( const float x ) { return x-floor(x); } // x minus floor(x)
+
+  __forceinline double abs  ( const double x ) { return ::fabs(x); } // double overloads below mostly forward to the global C math functions
+  __forceinline double sign ( const double x ) { return x<0?-1.0:1.0; } // -1 for x below zero, +1 for everything else
+  __forceinline double acos ( const double x ) { return ::acos (x); }
+  __forceinline double asin ( const double x ) { return ::asin (x); }
+  __forceinline double atan ( const double x ) { return ::atan (x); }
+  __forceinline double atan2( const double y, const double x ) { return ::atan2(y, x); }
+  __forceinline double cos  ( const double x ) { return ::cos  (x); }
+  __forceinline double cosh ( const double x ) { return ::cosh (x); }
+  __forceinline double exp  ( const double x ) { return ::exp  (x); }
+  __forceinline double fmod ( const double x, const double y ) { return ::fmod (x, y); }
+  __forceinline double log  ( const double x ) { return ::log  (x); }
+  __forceinline double log10( const double x ) { return ::log10(x); }
+  __forceinline double pow  ( const double x, const double y ) { return ::pow  (x, y); }
+  __forceinline double rcp  ( const double x ) { return 1.0/x; } // exact division, unlike the estimate-based float rcp
+  __forceinline double rsqrt( const double x ) { return 1.0/::sqrt(x); } // computed via sqrt and a division
+  __forceinline double sin  ( const double x ) { return ::sin  (x); }
+  __forceinline double sinh ( const double x ) { return ::sinh (x); }
+  __forceinline double sqr  ( const double x ) { return x*x; }
+  __forceinline double sqrt ( const double x ) { return ::sqrt (x); }
+  __forceinline double tan  ( const double x ) { return ::tan  (x); }
+  __forceinline double tanh ( const double x ) { return ::tanh (x); }
+  __forceinline double floor( const double x ) { return ::floor (x); }
+  __forceinline double ceil ( const double x ) { return ::ceil (x); }
+
+#if defined(__aarch64__) 
+    __forceinline float mini(float a, float b) { // arm64: floating-point minimum of lane 0 via _mm_min_ps
+        // FP and Neon share the same vector registers on arm64; only lane 0 of x and y is assigned and used
+        __m128 x;
+        __m128 y;
+        x[0] = a;
+        y[0] = b;
+        x = _mm_min_ps(x, y);
+        return x[0];
+    }
+#elif defined(__SSE4_1__)
+  __forceinline float mini(float a, float b) { // SSE4.1: minimum of the two bit patterns compared as signed 32-bit integers
+    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+    const __m128i ci = _mm_min_epi32(ai,bi); // NOTE(review): integer ordering of float bits differs from the arm64 float compare when both inputs are negative - confirm callers only pass non-negative values
+    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+  }
+#endif
+
+#if defined(__aarch64__) 
+    __forceinline float maxi(float a, float b) { // arm64: floating-point maximum of lane 0 via _mm_max_ps
+        // FP and Neon share the same vector registers on arm64; only lane 0 of x and y is assigned and used
+        __m128 x;
+        __m128 y;
+        x[0] = a;
+        y[0] = b;
+        x = _mm_max_ps(x, y);
+        return x[0];
+    }
+#elif defined(__SSE4_1__)
+  __forceinline float maxi(float a, float b) { // SSE4.1: maximum of the two bit patterns compared as signed 32-bit integers
+    const __m128i ai = _mm_castps_si128(_mm_set_ss(a));
+    const __m128i bi = _mm_castps_si128(_mm_set_ss(b));
+    const __m128i ci = _mm_max_epi32(ai,bi); // NOTE(review): integer ordering of float bits differs from the arm64 float compare when both inputs are negative - confirm callers only pass non-negative values
+    return _mm_cvtss_f32(_mm_castsi128_ps(ci));
+  }
+#endif
+
+  template<typename T>
+    __forceinline T twice(const T& a) { return a+a; } // doubles a by adding it to itself
+
+  __forceinline      int min(int      a, int      b) { return a<b ? a:b; } // two-argument min overloads: return a when a<b, otherwise b
+  __forceinline unsigned min(unsigned a, unsigned b) { return a<b ? a:b; }
+  __forceinline  int64_t min(int64_t  a, int64_t  b) { return a<b ? a:b; }
+  __forceinline    float min(float    a, float    b) { return a<b ? a:b; }
+  __forceinline   double min(double   a, double   b) { return a<b ? a:b; }
+#if defined(__X86_64__) || defined(__aarch64__)
+  __forceinline   size_t min(size_t   a, size_t   b) { return a<b ? a:b; } // size_t overload only on the 64-bit targets named in the guard
+#endif
+
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c) { return min(min(a,b),c); } // 3- to 5-argument forms reduce to the two-argument min
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d) { return min(min(a,b),min(c,d)); }
+  template<typename T> __forceinline T min(const T& a, const T& b, const T& c, const T& d, const T& e) { return min(min(min(a,b),min(c,d)),e); }
+
+  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c) { return mini(mini(a,b),c); } // 3- to 5-argument forms reduce to the two-argument mini
+  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d) { return mini(mini(a,b),mini(c,d)); }
+  template<typename T> __forceinline T mini(const T& a, const T& b, const T& c, const T& d, const T& e) { return mini(mini(mini(a,b),mini(c,d)),e); }
+
+  __forceinline      int max(int      a, int      b) { return a<b ? b:a; } // two-argument max overloads: return b when a<b, otherwise a
+  __forceinline unsigned max(unsigned a, unsigned b) { return a<b ? b:a; }
+  __forceinline  int64_t max(int64_t  a, int64_t  b) { return a<b ? b:a; }
+  __forceinline    float max(float    a, float    b) { return a<b ? b:a; }
+  __forceinline   double max(double   a, double   b) { return a<b ? b:a; }
+#if defined(__X86_64__) || defined(__aarch64__)
+  __forceinline   size_t max(size_t   a, size_t   b) { return a<b ? b:a; } // size_t overload only on the 64-bit targets named in the guard
+#endif
+
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c) { return max(max(a,b),c); } // 3- to 5-argument forms reduce to the two-argument max
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d) { return max(max(a,b),max(c,d)); }
+  template<typename T> __forceinline T max(const T& a, const T& b, const T& c, const T& d, const T& e) { return max(max(max(a,b),max(c,d)),e); }
+
+  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c) { return maxi(maxi(a,b),c); } // 3- to 5-argument forms reduce to the two-argument maxi
+  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d) { return maxi(maxi(a,b),maxi(c,d)); }
+  template<typename T> __forceinline T maxi(const T& a, const T& b, const T& c, const T& d, const T& e) { return maxi(maxi(maxi(a,b),maxi(c,d)),e); }
+
+#if defined(__MACOSX__)
+  __forceinline ssize_t min(ssize_t a, ssize_t b) { return a<b ? a:b; } // ssize_t overloads, provided on macOS only
+  __forceinline ssize_t max(ssize_t a, ssize_t b) { return a<b ? b:a; }
+#endif
+
+#if defined(__MACOSX__) && !defined(__INTEL_COMPILER)
+  __forceinline void sincosf(float x, float *sin, float *cos) { // macOS: forwards to __sincosf
+    __sincosf(x,sin,cos);
+  }
+#endif
+
+#if defined(__WIN32__) || defined(__FreeBSD__)
+  __forceinline void sincosf(float x, float *s, float *c) { // Windows/FreeBSD: two separate sinf and cosf calls
+    *s = sinf(x); *c = cosf(x);
+  }
+#endif
+
+  template<typename T> __forceinline T clamp(const T& x, const T& lower = T(zero), const T& upper = T(one)) { return max(min(x,upper),lower); } // limits x to [lower, upper]; defaults are T(zero) and T(one)
+  template<typename T> __forceinline T clampz(const T& x, const T& upper) { return max(T(zero), min(x,upper)); } // limits x to [T(zero), upper]
+
+  template<typename T> __forceinline T  deg2rad ( const T& x )  { return x * T(1.74532925199432957692e-2f); } // multiplies by pi/180
+  template<typename T> __forceinline T  rad2deg ( const T& x )  { return x * T(5.72957795130823208768e1f); } // multiplies by 180/pi
+  template<typename T> __forceinline T  sin2cos ( const T& x )  { return sqrt(max(T(zero),T(one)-x*x)); } // sqrt(1 - x*x), with the radicand floored at zero
+  template<typename T> __forceinline T  cos2sin ( const T& x )  { return sin2cos(x); } // same formula as sin2cos
+
+#if defined(__AVX2__)
+  __forceinline float madd  ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } // a*b + c via the fused scalar intrinsic
+  __forceinline float msub  ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } // a*b - c
+  __forceinline float nmadd ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmadd_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } // -(a*b) + c
+  __forceinline float nmsub ( const float a, const float b, const float c) { return _mm_cvtss_f32(_mm_fnmsub_ss(_mm_set_ss(a),_mm_set_ss(b),_mm_set_ss(c))); } // -(a*b) - c
+#elif defined (__aarch64__) && defined(__clang__)
+#pragma clang fp contract(fast)
+// The pragma above lets clang contract the plain expressions below into fused operations;
+// the pragma after the four functions switches back to contract(on).
+__forceinline float madd  ( const float a, const float b, const float c) { return a*b + c; }
+__forceinline float msub  ( const float a, const float b, const float c) { return a*b - c; }
+__forceinline float nmadd ( const float a, const float b, const float c) { return c - a*b; }
+__forceinline float nmsub ( const float a, const float b, const float c) { return -(c + a*b); }
+
+#pragma clang fp contract(on)
+#else
+  __forceinline float madd  ( const float a, const float b, const float c) { return a*b+c; } // plain-expression fallbacks with the same formulas
+  __forceinline float msub  ( const float a, const float b, const float c) { return a*b-c; }
+  __forceinline float nmadd ( const float a, const float b, const float c) { return -a*b+c;}
+  __forceinline float nmsub ( const float a, const float b, const float c) { return -a*b-c; }
+#endif
+
+  /*! random functions built on rand(); the unspecialized template returns T(0) */
+  template<typename T> T random() { return T(0); }
+#if defined(_WIN32)
+  template<> __forceinline int      random() { return int(rand()) ^ (int(rand()) << 8) ^ (int(rand()) << 16); } // XORs three rand() results shifted by 0, 8 and 16 bits
+  template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 8) ^ (uint32_t(rand()) << 16); }
+#else
+  template<> __forceinline int      random() { return int(rand()); }
+  template<> __forceinline uint32_t random() { return uint32_t(rand()) ^ (uint32_t(rand()) << 16); } // XORs two rand() results, the second shifted by 16 bits
+#endif
+  template<> __forceinline float  random() { return rand()/float(RAND_MAX); } // rand() scaled by 1/RAND_MAX
+  template<> __forceinline double random() { return rand()/double(RAND_MAX); }
+
+#if _WIN32
+  __forceinline double drand48() { // Windows stand-in built on rand()
+    return double(rand())/double(RAND_MAX);
+  }
+
+  __forceinline void srand48(long seed) { // Windows stand-in that seeds rand() through srand
+    return srand(seed);
+  }
+#endif
+
+  /*! selects: return t when s is true, otherwise f */
+  __forceinline bool  select(bool s, bool  t , bool f) { return s ? t : f; }
+  __forceinline int   select(bool s, int   t,   int f) { return s ? t : f; }
+  __forceinline float select(bool s, float t, float f) { return s ? t : f; }
+
+  __forceinline bool all(bool s) { return s; } // scalar overload: returns s unchanged
+
+  __forceinline float lerp(const float v0, const float v1, const float t) { // linear interpolation: (1-t)*v0 + t*v1
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  template<typename T>
+    __forceinline T lerp2(const float x0, const float x1, const float x2, const float x3, const T& u, const T& v) { // bilinear blend: (1-u)*((1-v)*x0 + v*x2) + u*((1-v)*x1 + v*x3)
+    return madd((1.0f-u),madd((1.0f-v),T(x0),v*T(x2)),u*madd((1.0f-v),T(x1),v*T(x3)));
+  }
+
+  /*! exchange: swaps the values of a and b through a temporary copy of a */
+  template<typename T> __forceinline void xchg ( T& a, T& b ) { const T saved = a; a = b; b = saved; }
+
+
+  template<typename T> __forceinline T prod_diff(const T& a,const T& b,const T& c,const T& d) { // computes a*b - c*d
+#if 1//!defined(__aarch64__)
+      return msub(a,b,c*d);
+#else
+      return nmadd(c,d,a*b); // disabled alternative (the condition above is hard-wired to 1)
+#endif
+  }
+
+  /*! bit reverse operation; the masks and the final 16-bit rotation are written for 32-bit values */
+  template<class T>
+    __forceinline T bitReverse(const T& vin)
+  {
+    T v = vin;
+    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); // swap adjacent bits
+    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); // swap adjacent bit pairs
+    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); // swap nibbles
+    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); // swap bytes
+    v = ( v >> 16             ) | ( v               << 16); // swap 16-bit halves
+    return v;
+  }
+
+  /*! bit interleave operation: spreads 10 bits of x, y and z to every third bit and merges them with x at bit 0, y at bit 1, z at bit 2 */
+  template<class T>
+    __forceinline T bitInterleave(const T& xin, const T& yin, const T& zin)
+  {
+	  T x = xin, y = yin, z = zin;
+    x = (x | (x << 16)) & 0x030000FF;
+    x = (x | (x <<  8)) & 0x0300F00F;
+    x = (x | (x <<  4)) & 0x030C30C3;
+    x = (x | (x <<  2)) & 0x09249249; // final mask keeps bits 0,3,6,...,27
+
+    y = (y | (y << 16)) & 0x030000FF;
+    y = (y | (y <<  8)) & 0x0300F00F;
+    y = (y | (y <<  4)) & 0x030C30C3;
+    y = (y | (y <<  2)) & 0x09249249;
+
+    z = (z | (z << 16)) & 0x030000FF;
+    z = (z | (z <<  8)) & 0x0300F00F;
+    z = (z | (z <<  4)) & 0x030C30C3;
+    z = (z | (z <<  2)) & 0x09249249;
+
+    return x | (y << 1) | (z << 2);
+  }
+
+#if defined(__AVX2__) && !defined(__aarch64__)
+  // unsigned int specialization: pdep deposits each input into every third bit, starting at bit 0 (x), 1 (y) and 2 (z)
+  template<>
+    __forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
+  {
+    const unsigned int xx = pdep(xi,0x49249249 /* 0b01001001001001001001001001001001 */ );
+    const unsigned int yy = pdep(yi,0x92492492 /* 0b10010010010010010010010010010010 */);
+    const unsigned int zz = pdep(zi,0x24924924 /* 0b00100100100100100100100100100100 */);
+    return xx | yy | zz;
+  }
+
+#endif
+
+  /*! bit interleave operation for 64bit data types: interleaves the low 21 bits of x, y and z with x at bit 0, y at bit 1, z at bit 2 */
+  template<class T>
+    __forceinline T bitInterleave64(const T& xin, const T& yin, const T& zin){
+    T x = xin & 0x1fffff; // keep only the low 21 bits of each input
+    T y = yin & 0x1fffff;
+    T z = zin & 0x1fffff;
+
+    x = (x | x << 32) & 0x1f00000000ffff;
+    x = (x | x << 16) & 0x1f0000ff0000ff;
+    x = (x | x << 8) & 0x100f00f00f00f00f;
+    x = (x | x << 4) & 0x10c30c30c30c30c3;
+    x = (x | x << 2) & 0x1249249249249249; // final mask keeps every third bit starting at bit 0
+
+    y = (y | y << 32) & 0x1f00000000ffff;
+    y = (y | y << 16) & 0x1f0000ff0000ff;
+    y = (y | y << 8) & 0x100f00f00f00f00f;
+    y = (y | y << 4) & 0x10c30c30c30c30c3;
+    y = (y | y << 2) & 0x1249249249249249;
+
+    z = (z | z << 32) & 0x1f00000000ffff;
+    z = (z | z << 16) & 0x1f0000ff0000ff;
+    z = (z | z << 8) & 0x100f00f00f00f00f;
+    z = (z | z << 4) & 0x10c30c30c30c30c3;
+    z = (z | z << 2) & 0x1249249249249249;
+
+    return x | (y << 1) | (z << 2);
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/math/obbox.h b/thirdparty/embree-aarch64/common/math/obbox.h
new file mode 100644
index 0000000000..032b56904e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/obbox.h
@@ -0,0 +1,39 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bbox.h"
+#include "linearspace3.h"
+
+namespace embree
+{
+  /*! Oriented bounding box: a LinearSpace3 together with a BBox expressed in that space */
+  template<typename T>
+    struct OBBox 
+  {
+  public:
+    
+    __forceinline OBBox () {} // leaves space and bounds default-constructed
+    
+    __forceinline OBBox (EmptyTy) // space built from the one constant, bounds from the empty constant
+      : space(one), bounds(empty) {}
+    
+    __forceinline OBBox (const BBox<T>& bounds) // wraps a plain box; space is built from the one constant
+      : space(one), bounds(bounds) {}
+      
+    __forceinline OBBox (const LinearSpace3<T>& space, const BBox<T>& bounds) // stores both arguments as given
+      : space(space), bounds(bounds) {}
+    
+    friend embree_ostream operator<<(embree_ostream cout, const OBBox& p) { // prints p as "{ space = ..., bounds = ...}"
+      return cout << "{ space = " << p.space << ", bounds = " << p.bounds << "}";
+    }
+    
+  public:
+    LinearSpace3<T> space; //!< orthonormal transformation
+    BBox<T> bounds;        //!< bounds in transformed space
+  };
+
+  typedef OBBox<Vec3f> OBBox3f;
+  typedef OBBox<Vec3fa> OBBox3fa;
+}
diff --git a/thirdparty/embree-aarch64/common/math/quaternion.h b/thirdparty/embree-aarch64/common/math/quaternion.h
new file mode 100644
index 0000000000..20c69bc62f
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/quaternion.h
@@ -0,0 +1,254 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "vec3.h"
+#include "vec4.h"
+
+#include "transcendental.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////
+  // Quaternion Struct
+  ////////////////////////////////////////////////////////////////
+
+  template<typename T>
+  struct QuaternionT
+  {
+    typedef Vec3<T> Vector;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline QuaternionT           ()                     { }
+    __forceinline QuaternionT           ( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; }
+    __forceinline QuaternionT& operator=( const QuaternionT& other ) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; }
+
+    __forceinline          QuaternionT( const T& r       ) : r(r), i(zero), j(zero), k(zero) {}
+    __forceinline explicit QuaternionT( const Vec3<T>& v ) : r(zero), i(v.x), j(v.y), k(v.z) {}
+    __forceinline explicit QuaternionT( const Vec4<T>& v ) : r(v.x), i(v.y), j(v.z), k(v.w) {}
+    __forceinline          QuaternionT( const T& r, const T& i, const T& j, const T& k ) : r(r), i(i), j(j), k(k) {}
+    __forceinline          QuaternionT( const T& r, const Vec3<T>& v ) : r(r), i(v.x), j(v.y), k(v.z) {}
+
+    __inline QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz );
+    __inline QuaternionT( const T& yaw, const T& pitch, const T& roll );
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline QuaternionT( ZeroTy ) : r(zero), i(zero), j(zero), k(zero) {}
+    __forceinline QuaternionT( OneTy  ) : r( one), i(zero), j(zero), k(zero) {}
+
+    /*! return quaternion for rotation around arbitrary axis */
+    static __forceinline QuaternionT rotate(const Vec3<T>& u, const T& r) {
+      return QuaternionT<T>(cos(T(0.5)*r),sin(T(0.5)*r)*normalize(u));
+    }
+
+    /*! returns the vector (imaginary) part (i, j, k) of the quaternion, not normalized */
+    __forceinline Vec3<T> v( ) const { return Vec3<T>(i, j, k); }
+
+  public:
+    T r, i, j, k;
+  };
+
+  template<typename T> __forceinline QuaternionT<T> operator *( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a * b.r, a * b.i, a * b.j, a * b.k); }
+  template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r * b, a.i * b, a.j * b, a.k * b); }
+
+  ////////////////////////////////////////////////////////////////
+  // Unary Operators
+  ////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a ) { return QuaternionT<T>(+a.r, +a.i, +a.j, +a.k); }
+  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a ) { return QuaternionT<T>(-a.r, -a.i, -a.j, -a.k); }
+  template<typename T> __forceinline QuaternionT<T> conj      ( const QuaternionT<T>& a ) { return QuaternionT<T>(a.r, -a.i, -a.j, -a.k); }
+  template<typename T> __forceinline T              abs       ( const QuaternionT<T>& a ) { return sqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
+  template<typename T> __forceinline QuaternionT<T> rcp       ( const QuaternionT<T>& a ) { return conj(a)*rcp(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
+  template<typename T> __forceinline QuaternionT<T> normalize ( const QuaternionT<T>& a ) { return a*rsqrt(a.r*a.r + a.i*a.i + a.j*a.j + a.k*a.k); }
+
+  // evaluates a*q-p
+  template<typename T> __forceinline QuaternionT<T>
+  msub(const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
+  {
+    return QuaternionT<T>(msub(a, q.r, p.r),
+                          msub(a, q.i, p.i),
+                          msub(a, q.j, p.j),
+                          msub(a, q.k, p.k));
+  }
+  // evaluates a*q+p
+  template<typename T> __forceinline QuaternionT<T>
+  madd (const T& a, const QuaternionT<T>& q, const QuaternionT<T>& p)
+  {
+    return QuaternionT<T>(madd(a, q.r, p.r),
+                          madd(a, q.i, p.i),
+                          madd(a, q.j, p.j),
+                          madd(a, q.k, p.k));
+  }
+
+  ////////////////////////////////////////////////////////////////
+  // Binary Operators
+  ////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline QuaternionT<T> operator +( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a + b.r,  b.i,  b.j,  b.k); }
+  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r + b, a.i, a.j, a.k); }
+  template<typename T> __forceinline QuaternionT<T> operator +( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); }
+  template<typename T> __forceinline QuaternionT<T> operator -( const T             & a, const QuaternionT<T>& b ) { return QuaternionT<T>(a - b.r, -b.i, -b.j, -b.k); }
+  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const T             & b ) { return QuaternionT<T>(a.r - b, a.i, a.j, a.k); }
+  template<typename T> __forceinline QuaternionT<T> operator -( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return QuaternionT<T>(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); }
+
+  template<typename T> __forceinline Vec3<T>       operator *( const QuaternionT<T>& a, const Vec3<T>      & b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+  template<typename T> __forceinline QuaternionT<T> operator *( const QuaternionT<T>& a, const QuaternionT<T>& b ) {
+    return QuaternionT<T>(a.r*b.r - a.i*b.i - a.j*b.j - a.k*b.k,
+                          a.r*b.i + a.i*b.r + a.j*b.k - a.k*b.j,
+                          a.r*b.j - a.i*b.k + a.j*b.r + a.k*b.i,
+                          a.r*b.k + a.i*b.j - a.j*b.i + a.k*b.r);
+  }
+  template<typename T> __forceinline QuaternionT<T> operator /( const T             & a, const QuaternionT<T>& b ) { return a*rcp(b); }
+  template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const T             & b ) { return a*rcp(b); }
+  template<typename T> __forceinline QuaternionT<T> operator /( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a*rcp(b); }
+
+  template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const T             & b ) { return a = a+b; }
+  template<typename T> __forceinline QuaternionT<T>& operator +=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a+b; }
+  template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const T             & b ) { return a = a-b; }
+  template<typename T> __forceinline QuaternionT<T>& operator -=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a-b; }
+  template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const T             & b ) { return a = a*b; }
+  template<typename T> __forceinline QuaternionT<T>& operator *=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*b; }
+  template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const T             & b ) { return a = a*rcp(b); }
+  template<typename T> __forceinline QuaternionT<T>& operator /=( QuaternionT<T>& a, const QuaternionT<T>& b ) { return a = a*rcp(b); }
+
+  template<typename T, typename M> __forceinline QuaternionT<T>
+  select(const M& m, const QuaternionT<T>& q, const QuaternionT<T>& p)
+  {
+    return QuaternionT<T>(select(m, q.r, p.r),
+                          select(m, q.i, p.i),
+                          select(m, q.j, p.j),
+                          select(m, q.k, p.k));
+  }
+
+
+  template<typename T> __forceinline Vec3<T> xfmPoint ( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+  template<typename T> __forceinline Vec3<T> xfmVector( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+  template<typename T> __forceinline Vec3<T> xfmNormal( const QuaternionT<T>& a, const Vec3<T>&       b ) { return (a*QuaternionT<T>(b)*conj(a)).v(); }
+
+  template<typename T> __forceinline T dot(const QuaternionT<T>& a, const QuaternionT<T>& b) { return a.r*b.r + a.i*b.i + a.j*b.j + a.k*b.k; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; }
+  template<typename T> __forceinline bool operator !=( const QuaternionT<T>& a, const QuaternionT<T>& b ) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Orientation Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> QuaternionT<T>::QuaternionT( const Vec3<T>& vx, const Vec3<T>& vy, const Vec3<T>& vz )
+  {
+    if ( vx.x + vy.y + vz.z >= T(zero) )
+    {
+      const T t = T(one) + (vx.x + vy.y + vz.z);
+      const T s = rsqrt(t)*T(0.5f);
+      r = t*s;
+      i = (vy.z - vz.y)*s;
+      j = (vz.x - vx.z)*s;
+      k = (vx.y - vy.x)*s;
+    }
+    else if ( vx.x >= max(vy.y, vz.z) )
+    {
+      const T t = (T(one) + vx.x) - (vy.y + vz.z);
+      const T s = rsqrt(t)*T(0.5f);
+      r = (vy.z - vz.y)*s;
+      i = t*s;
+      j = (vx.y + vy.x)*s;
+      k = (vz.x + vx.z)*s;
+    }
+    else if ( vy.y >= vz.z ) // if ( vy.y >= max(vz.z, vx.x) )
+    {
+      const T t = (T(one) + vy.y) - (vz.z + vx.x);
+      const T s = rsqrt(t)*T(0.5f);
+      r = (vz.x - vx.z)*s;
+      i = (vx.y + vy.x)*s;
+      j = t*s;
+      k = (vy.z + vz.y)*s;
+    }
+    else //if ( vz.z >= max(vy.y, vx.x) )
+    {
+      const T t = (T(one) + vz.z) - (vx.x + vy.y);
+      const T s = rsqrt(t)*T(0.5f);
+      r = (vx.y - vy.x)*s;
+      i = (vz.x + vx.z)*s;
+      j = (vy.z + vz.y)*s;
+      k = t*s;
+    }
+  }
+
+  template<typename T> QuaternionT<T>::QuaternionT( const T& yaw, const T& pitch, const T& roll )
+  {
+    const T cya = cos(yaw  *T(0.5f));
+    const T cpi = cos(pitch*T(0.5f));
+    const T cro = cos(roll *T(0.5f));
+    const T sya = sin(yaw  *T(0.5f));
+    const T spi = sin(pitch*T(0.5f));
+    const T sro = sin(roll *T(0.5f));
+    r = cro*cya*cpi + sro*sya*spi;
+    i = cro*cya*spi + sro*sya*cpi;
+    j = cro*sya*cpi - sro*cya*spi;
+    k = sro*cya*cpi - cro*sya*spi;
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  //////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> static embree_ostream operator<<(embree_ostream cout, const QuaternionT<T>& q) {
+    return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }";
+  }
+
+  /*! default template instantiations */
+  typedef QuaternionT<float>  Quaternion3f;
+  typedef QuaternionT<double> Quaternion3d;
+
+  template<int N> using Quaternion3vf = QuaternionT<vfloat<N>>;
+  typedef QuaternionT<vfloat<4>>  Quaternion3vf4;
+  typedef QuaternionT<vfloat<8>>  Quaternion3vf8;
+  typedef QuaternionT<vfloat<16>> Quaternion3vf16;
+
+  //////////////////////////////////////////////////////////////////////////////
+  /// Interpolation
+  //////////////////////////////////////////////////////////////////////////////
+  template<typename T>
+  __forceinline QuaternionT<T>lerp(const QuaternionT<T>& q0,
+                                   const QuaternionT<T>& q1,
+                                   const T& factor)
+  {
+    QuaternionT<T> q;
+    q.r = lerp(q0.r, q1.r, factor);
+    q.i = lerp(q0.i, q1.i, factor);
+    q.j = lerp(q0.j, q1.j, factor);
+    q.k = lerp(q0.k, q1.k, factor);
+    return q;
+  }
+
+  template<typename T>
+  __forceinline QuaternionT<T> slerp(const QuaternionT<T>& q0,
+                                     const QuaternionT<T>& q1_,
+                                     const T& t)
+  {
+    T cosTheta = dot(q0, q1_);
+    QuaternionT<T> q1 = select(cosTheta < 0.f, -q1_, q1_);
+    cosTheta          = select(cosTheta < 0.f, -cosTheta, cosTheta);
+    if (unlikely(all(cosTheta > 0.9995f))) {
+      return normalize(lerp(q0, q1, t));
+    }
+    const T phi = t * fastapprox::acos(cosTheta);
+    T sinPhi, cosPhi;
+    fastapprox::sincos(phi, sinPhi, cosPhi);
+    QuaternionT<T> qperp = sinPhi * normalize(msub(cosTheta, q0, q1));
+    return msub(cosPhi, q0, qperp);
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/math/range.h b/thirdparty/embree-aarch64/common/math/range.h
new file mode 100644
index 0000000000..762d9cd9ea
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/range.h
@@ -0,0 +1,137 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../math/math.h"
+
+namespace embree
+{
+  template<typename Ty>
+    struct range 
+    {
+      __forceinline range() {}
+
+      __forceinline range(const Ty& begin)
+        : _begin(begin), _end(begin+1) {}
+      
+      __forceinline range(const Ty& begin, const Ty& end)
+        : _begin(begin), _end(end) {}
+ 
+      __forceinline range(const range& other)
+        : _begin(other._begin), _end(other._end) {}
+
+      template<typename T1>
+      __forceinline range(const range<T1>& other)
+        : _begin(Ty(other._begin)), _end(Ty(other._end)) {}
+
+      template<typename T1>
+      __forceinline range& operator =(const range<T1>& other) {
+        _begin = other._begin;
+        _end = other._end;
+        return *this;
+      }
+      
+      __forceinline Ty begin() const {
+        return _begin;
+      }
+      
+      __forceinline Ty end() const {
+	return _end;
+      }
+
+      __forceinline range intersect(const range& r) const {
+        return range (max(_begin,r._begin),min(_end,r._end));
+      }
+
+      __forceinline Ty size() const {
+        return _end - _begin;
+      }
+
+      __forceinline bool empty() const { 
+        return _end <= _begin; 
+      }
+
+      __forceinline Ty center() const {
+        return (_begin + _end)/2;
+      }
+
+      __forceinline std::pair<range,range> split() const 
+      {
+        const Ty _center = center();
+        return std::make_pair(range(_begin,_center),range(_center,_end));
+      }
+
+      __forceinline void split(range& left_o, range& right_o) const 
+      {
+        const Ty _center = center();
+        left_o = range(_begin,_center);
+        right_o = range(_center,_end);
+      }
+
+      __forceinline friend bool operator< (const range& r0, const range& r1) {
+        return r0.size() < r1.size();
+      }
+	
+      friend embree_ostream operator<<(embree_ostream cout, const range& r) {
+        return cout << "range [" << r.begin() << ", " << r.end() << "]";
+      }
+      
+      Ty _begin, _end;
+    };
+
+  template<typename Ty>
+    range<Ty> make_range(const Ty& begin, const Ty& end) {
+    return range<Ty>(begin,end);
+  }
+
+  template<typename Ty>
+    struct extended_range : public range<Ty>
+    {
+      __forceinline extended_range () {}
+
+      __forceinline extended_range (const Ty& begin)
+        : range<Ty>(begin), _ext_end(begin+1) {}
+      
+      __forceinline extended_range (const Ty& begin, const Ty& end)
+        : range<Ty>(begin,end), _ext_end(end) {}
+
+      __forceinline extended_range (const Ty& begin, const Ty& end, const Ty& ext_end)
+        : range<Ty>(begin,end), _ext_end(ext_end) {}
+      
+      __forceinline Ty ext_end() const {
+	return _ext_end;
+      }
+
+      __forceinline Ty ext_size() const {
+        return _ext_end - range<Ty>::_begin;
+      }
+
+      __forceinline Ty ext_range_size() const {
+        return _ext_end - range<Ty>::_end;
+      }
+
+      __forceinline bool has_ext_range() const {
+        assert(_ext_end >= range<Ty>::_end);
+        return (_ext_end - range<Ty>::_end) > 0;
+      }
+
+      __forceinline void set_ext_range(const size_t ext_end){
+        assert(ext_end >= range<Ty>::_end);
+        _ext_end = ext_end;
+      }
+
+      __forceinline void move_right(const size_t plus){
+        range<Ty>::_begin   += plus;
+        range<Ty>::_end     += plus;
+        _ext_end += plus;
+      }
+
+      friend embree_ostream operator<<(embree_ostream cout, const extended_range& r) {
+        return cout << "extended_range [" << r.begin() << ", " << r.end() <<  " (" << r.ext_end() << ")]";
+      }
+      
+      Ty _ext_end;
+    };
+}
diff --git a/thirdparty/embree-aarch64/common/math/transcendental.h b/thirdparty/embree-aarch64/common/math/transcendental.h
new file mode 100644
index 0000000000..6855d82b53
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/transcendental.h
@@ -0,0 +1,525 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+// Transcendental functions from "ispc": https://github.com/ispc/ispc/
+// Most of the transcendental implementations in ispc code come from
+// Solomon Boulos's "syrah": https://github.com/boulos/syrah/
+
+#include "../simd/simd.h"
+
+namespace embree
+{
+
+namespace fastapprox
+{
+
+template <typename T>
+__forceinline T sin(const T &v)
+{
+  static const float piOverTwoVec = 1.57079637050628662109375;
+  static const float twoOverPiVec = 0.636619746685028076171875;
+  auto scaled = v * twoOverPiVec;
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * piOverTwoVec;
+  auto kMod4 = k & 3;
+  auto sinUseCos = (kMod4 == 1 | kMod4 == 3);
+  auto flipSign = (kMod4 > 1);
+
+  // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
+  // 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
+  static const float sinC2  = -0.16666667163372039794921875;
+  static const float sinC4  = +8.333347737789154052734375e-3;
+  static const float sinC6  = -1.9842604524455964565277099609375e-4;
+  static const float sinC8  = +2.760012648650445044040679931640625e-6;
+  static const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+  static const float cosC2  = -0.5;
+  static const float cosC4  = +4.166664183139801025390625e-2;
+  static const float cosC6  = -1.388833043165504932403564453125e-3;
+  static const float cosC8  = +2.47562347794882953166961669921875e-5;
+  static const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+  auto outside = select(sinUseCos, 1., x);
+  auto c2  = select(sinUseCos, T(cosC2),  T(sinC2));
+  auto c4  = select(sinUseCos, T(cosC4),  T(sinC4));
+  auto c6  = select(sinUseCos, T(cosC6),  T(sinC6));
+  auto c8  = select(sinUseCos, T(cosC8),  T(sinC8));
+  auto c10 = select(sinUseCos, T(cosC10), T(sinC10));
+
+  auto x2 = x * x;
+  auto formula = x2 * c10 + c8;
+  formula = x2 * formula + c6;
+  formula = x2 * formula + c4;
+  formula = x2 * formula + c2;
+  formula = x2 * formula + 1.;
+  formula *= outside;
+
+  formula = select(flipSign, -formula, formula);
+  return formula;
+}
+
+template <typename T>
+__forceinline T cos(const T &v)
+{
+  static const float piOverTwoVec = 1.57079637050628662109375;
+  static const float twoOverPiVec = 0.636619746685028076171875;
+  auto scaled = v * twoOverPiVec;
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * piOverTwoVec;
+
+  auto kMod4 = k & 3;
+  auto cosUseCos = (kMod4 == 0 | kMod4 == 2);
+  auto flipSign = (kMod4 == 1 | kMod4 == 2);
+
+  const float sinC2  = -0.16666667163372039794921875;
+  const float sinC4  = +8.333347737789154052734375e-3;
+  const float sinC6  = -1.9842604524455964565277099609375e-4;
+  const float sinC8  = +2.760012648650445044040679931640625e-6;
+  const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+  const float cosC2  = -0.5;
+  const float cosC4  = +4.166664183139801025390625e-2;
+  const float cosC6  = -1.388833043165504932403564453125e-3;
+  const float cosC8  = +2.47562347794882953166961669921875e-5;
+  const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+  auto outside = select(cosUseCos, 1., x);
+  auto c2  = select(cosUseCos, T(cosC2),  T(sinC2));
+  auto c4  = select(cosUseCos, T(cosC4),  T(sinC4));
+  auto c6  = select(cosUseCos, T(cosC6),  T(sinC6));
+  auto c8  = select(cosUseCos, T(cosC8),  T(sinC8));
+  auto c10 = select(cosUseCos, T(cosC10), T(sinC10));
+
+  auto x2 = x * x;
+  auto formula = x2 * c10 + c8;
+  formula = x2 * formula + c6;
+  formula = x2 * formula + c4;
+  formula = x2 * formula + c2;
+  formula = x2 * formula + 1.;
+  formula *= outside;
+
+  formula = select(flipSign, -formula, formula);
+  return formula;
+}
+
+template <typename T>
+__forceinline void sincos(const T &v, T &sinResult, T &cosResult)
+{
+  const float piOverTwoVec = 1.57079637050628662109375;
+  const float twoOverPiVec = 0.636619746685028076171875;
+  auto scaled = v * twoOverPiVec;
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * piOverTwoVec;
+  auto kMod4 = k & 3;
+  auto cosUseCos = ((kMod4 == 0) | (kMod4 == 2));
+  auto sinUseCos = ((kMod4 == 1) | (kMod4 == 3));
+  auto sinFlipSign = (kMod4 > 1);
+  auto cosFlipSign = ((kMod4 == 1) | (kMod4 == 2));
+
+  const float oneVec = +1.;
+  const float sinC2  = -0.16666667163372039794921875;
+  const float sinC4  = +8.333347737789154052734375e-3;
+  const float sinC6  = -1.9842604524455964565277099609375e-4;
+  const float sinC8  = +2.760012648650445044040679931640625e-6;
+  const float sinC10 = -2.50293279435709337121807038784027099609375e-8;
+
+  const float cosC2  = -0.5;
+  const float cosC4  = +4.166664183139801025390625e-2;
+  const float cosC6  = -1.388833043165504932403564453125e-3;
+  const float cosC8  = +2.47562347794882953166961669921875e-5;
+  const float cosC10 = -2.59630184018533327616751194000244140625e-7;
+
+  auto x2 = x * x;
+
+  auto sinFormula = x2 * sinC10 + sinC8;
+  auto cosFormula = x2 * cosC10 + cosC8;
+  sinFormula = x2 * sinFormula + sinC6;
+  cosFormula = x2 * cosFormula + cosC6;
+
+  sinFormula = x2 * sinFormula + sinC4;
+  cosFormula = x2 * cosFormula + cosC4;
+
+  sinFormula = x2 * sinFormula + sinC2;
+  cosFormula = x2 * cosFormula + cosC2;
+
+  sinFormula = x2 * sinFormula + oneVec;
+  cosFormula = x2 * cosFormula + oneVec;
+
+  sinFormula *= x;
+
+  sinResult = select(sinUseCos, cosFormula, sinFormula);
+  cosResult = select(cosUseCos, cosFormula, sinFormula);
+
+  sinResult = select(sinFlipSign, -sinResult, sinResult);
+  cosResult = select(cosFlipSign, -cosResult, cosResult);
+}
+
+template <typename T>
+__forceinline T tan(const T &v)
+{
+  const float piOverFourVec = 0.785398185253143310546875;
+  const float fourOverPiVec = 1.27323949337005615234375;
+
+  auto xLt0 = v < 0.;
+  auto y = select(xLt0, -v, v);
+  auto scaled = y * fourOverPiVec;
+
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  auto x = y - kReal * piOverFourVec;
+
+  // If k & 1, x -= Pi/4
+  auto needOffset = (k & 1) != 0;
+  x = select(needOffset, x - piOverFourVec, x);
+
+  // If k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
+  auto kMod4 = k & 3;
+  auto useCotan = (kMod4 == 1) | (kMod4 == 2);
+
+  const float oneVec = 1.0;
+
+  const float tanC2  = +0.33333075046539306640625;
+  const float tanC4  = +0.13339905440807342529296875;
+  const float tanC6  = +5.3348250687122344970703125e-2;
+  const float tanC8  = +2.46033705770969390869140625e-2;
+  const float tanC10 = +2.892402000725269317626953125e-3;
+  const float tanC12 = +9.500005282461643218994140625e-3;
+
+  const float cotC2  = -0.3333333432674407958984375;
+  const float cotC4  = -2.222204394638538360595703125e-2;
+  const float cotC6  = -2.11752182804048061370849609375e-3;
+  const float cotC8  = -2.0846328698098659515380859375e-4;
+  const float cotC10 = -2.548247357481159269809722900390625e-5;
+  const float cotC12 = -3.5257363606433500535786151885986328125e-7;
+
+  auto x2 = x * x;
+  T z;
+  if (any(useCotan))
+  {
+    auto cotVal = x2 * cotC12 + cotC10;
+    cotVal = x2 * cotVal + cotC8;
+    cotVal = x2 * cotVal + cotC6;
+    cotVal = x2 * cotVal + cotC4;
+    cotVal = x2 * cotVal + cotC2;
+    cotVal = x2 * cotVal + oneVec;
+    // The equation is for x * cot(x) but we need -x * cot(x) for the tan part.
+    cotVal /= -x;
+    z = cotVal;
+  }
+  auto useTan = !useCotan;
+  if (any(useTan))
+  {
+    auto tanVal = x2 * tanC12 + tanC10;
+    tanVal = x2 * tanVal + tanC8;
+    tanVal = x2 * tanVal + tanC6;
+    tanVal = x2 * tanVal + tanC4;
+    tanVal = x2 * tanVal + tanC2;
+    tanVal = x2 * tanVal + oneVec;
+    // Equation was for tan(x)/x
+    tanVal *= x;
+    z = select(useTan, tanVal, z);
+  }
+  return select(xLt0, -z, z);
+}
+
+template <typename T>
+__forceinline T asin(const T &x0)
+{
+  auto isneg = (x0 < 0.f);
+  auto x = abs(x0);
+  auto isnan = (x > 1.f);
+
+  // sollya
+  // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],
+  //           [1e-20;.9999999999999999]);
+  // avg error: 1.1105439e-06, max error 1.3187528e-06
+  auto v = 1.57079517841339111328125f +
+           x * (-0.21450997889041900634765625f +
+                x * (8.78556668758392333984375e-2f +
+                     x * (-4.489909112453460693359375e-2f +
+                          x * (1.928029954433441162109375e-2f +
+                               x * (-4.3095736764371395111083984375e-3f)))));
+
+  v *= -sqrt(1.f - x);
+  v = v + 1.57079637050628662109375f;
+
+  v = select(v < 0.f, T(0.f), v);
+  v = select(isneg, -v, v);
+  v = select(isnan, T(cast_i2f(0x7fc00000)), v);
+
+  return v;
+}
+
+template <typename T>
+__forceinline T acos(const T &v)
+{
+  return 1.57079637050628662109375f - asin(v);
+}
+
+template <typename T>
+__forceinline T atan(const T &v)
+{
+  const float piOverTwoVec = 1.57079637050628662109375;
+  // atan(-x) = -atan(x) (so flip from negative to positive first)
+  // If x > 1 -> atan(x) = Pi/2 - atan(1/x)
+  auto xNeg = v < 0.f;
+  auto xFlipped = select(xNeg, -v, v);
+
+  auto xGt1 = xFlipped > 1.;
+  auto x = select(xGt1, rcpSafe(xFlipped), xFlipped);
+
+  // These coefficients approximate atan(x)/x
+  const float atanC0  = +0.99999988079071044921875;
+  const float atanC2  = -0.3333191573619842529296875;
+  const float atanC4  = +0.199689209461212158203125;
+  const float atanC6  = -0.14015688002109527587890625;
+  const float atanC8  = +9.905083477497100830078125e-2;
+  const float atanC10 = -5.93664981424808502197265625e-2;
+  const float atanC12 = +2.417283318936824798583984375e-2;
+  const float atanC14 = -4.6721356920897960662841796875e-3;
+
+  auto x2 = x * x;
+  auto result = x2 * atanC14 + atanC12;
+  result = x2 * result + atanC10;
+  result = x2 * result + atanC8;
+  result = x2 * result + atanC6;
+  result = x2 * result + atanC4;
+  result = x2 * result + atanC2;
+  result = x2 * result + atanC0;
+  result *= x;
+
+  result = select(xGt1, piOverTwoVec - result, result);
+  result = select(xNeg, -result, result);
+  return result;
+}
+
+template <typename T>
+__forceinline T atan2(const T &y, const T &x)
+{
+  const float piVec = 3.1415926536;
+  // atan2(y, x) =
+  //
+  // atan2(y > 0, x = +-0) ->  Pi/2
+  // atan2(y < 0, x = +-0) -> -Pi/2
+  // atan2(y = +-0, x < +0) -> +-Pi
+  // atan2(y = +-0, x >= +0) -> +-0
+  //
+  // atan2(y >= 0, x < 0) ->  Pi + atan(y/x)
+  // atan2(y <  0, x < 0) -> -Pi + atan(y/x)
+  // atan2(y, x > 0) -> atan(y/x)
+  //
+  // and then a bunch of code for dealing with infinities.
+  auto yOverX = y * rcpSafe(x);
+  auto atanArg = atan(yOverX);
+  auto xLt0 = x < 0.f;
+  auto yLt0 = y < 0.f;
+  auto offset = select(xLt0,
+                select(yLt0, T(-piVec), T(piVec)), 0.f);
+  return offset + atanArg;
+}
+
+template <typename T>
+__forceinline T exp(const T &v)
+{
+  const float ln2Part1 = 0.6931457519;
+  const float ln2Part2 = 1.4286067653e-6;
+  const float oneOverLn2 = 1.44269502162933349609375;
+
+  auto scaled = v * oneOverLn2;
+  auto kReal = floor(scaled);
+  auto k = toInt(kReal);
+
+  // Reduced range version of x
+  auto x = v - kReal * ln2Part1;
+  x -= kReal * ln2Part2;
+
+  // These coefficients are for e^x in [0, ln(2)]
+  const float one = 1.;
+  const float c2 = 0.4999999105930328369140625;
+  const float c3 = 0.166668415069580078125;
+  const float c4 = 4.16539050638675689697265625e-2;
+  const float c5 = 8.378830738365650177001953125e-3;
+  const float c6 = 1.304379315115511417388916015625e-3;
+  const float c7 = 2.7555381529964506626129150390625e-4;
+
+  auto result = x * c7 + c6;
+  result = x * result + c5;
+  result = x * result + c4;
+  result = x * result + c3;
+  result = x * result + c2;
+  result = x * result + one;
+  result = x * result + one;
+
+  // Compute 2^k (should differ for float and double, but I'll avoid
+  // it for now and just do floats)
+  const int fpbias = 127;
+  auto biasedN = k + fpbias;
+  auto overflow = kReal > fpbias;
+  // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
+  // we've got underflow. -127 * ln(2) -> -88.02. So the most
+  // negative float input that doesn't result in zero is like -88.
+  auto underflow = kReal <= -fpbias;
+  const int infBits = 0x7f800000;
+  biasedN <<= 23;
+  // Reinterpret this thing as float
+  auto twoToTheN = asFloat(biasedN);
+  // Handle both doubles and floats (hopefully eliding the copy for float)
+  auto elemtype2n = twoToTheN;
+  result *= elemtype2n;
+  result = select(overflow, cast_i2f(infBits), result);
+  result = select(underflow, 0., result);
+  return result;
+}
+
+// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n
+// * log(2) + log(y) where y is the reduced range (usually in [1/2, 1)).
+template <typename T, typename R>
+__forceinline void __rangeReduceLog(const T &input,
+                                    T &reduced,
+                                    R &exponent)
+{
+  auto intVersion = asInt(input);
+  // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
+  // exponent mask    = 0111 1111 1000 0000 0000 0000 0000 0000
+  //                    0x7  0xF  0x8  0x0  0x0  0x0  0x0  0x0
+  // non-exponent     = 1000 0000 0111 1111 1111 1111 1111 1111
+  //                  = 0x8  0x0  0x7  0xF  0xF  0xF  0xF  0xF
+
+  //const int exponentMask(0x7F800000)
+  static const int nonexponentMask = 0x807FFFFF;
+
+  // We want the reduced version to have an exponent of -1 which is
+  // -1 + 127 after biasing or 126
+  static const int exponentNeg1 = (126l << 23);
+  // NOTE(boulos): We don't need to mask anything out since we know
+  // the sign bit has to be 0. If it's 1, we need to return infinity/nan
+  // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
+  auto biasedExponent = intVersion >> 23; // This number is [0, 255] but it means [-127, 128]
+
+  auto offsetExponent = biasedExponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
+  exponent = offsetExponent - 127;          // get the real value
+
+  // Blend the offset_exponent with the original input (do this in
+  // int for now, until I decide if float can have & and &not)
+  auto blended = (intVersion & nonexponentMask) | (exponentNeg1);
+  reduced = asFloat(blended);
+}
+
+template <typename T> struct ExponentType            { };
+template <int N>      struct ExponentType<vfloat<N>> { typedef vint<N> Ty; };
+template <>           struct ExponentType<float>     { typedef int     Ty; };
+
+template <typename T>
+__forceinline T log(const T &v)
+{
+  T reduced;
+  typename ExponentType<T>::Ty exponent;
+
+  const int nanBits = 0x7fc00000;
+  const int negInfBits = 0xFF800000;
+  const float nan = cast_i2f(nanBits);
+  const float negInf = cast_i2f(negInfBits);
+  auto useNan = v < 0.;
+  auto useInf = v == 0.;
+  auto exceptional = useNan | useInf;
+  const float one = 1.0;
+
+  auto patched = select(exceptional, one, v);
+  __rangeReduceLog(patched, reduced, exponent);
+
+  const float ln2 = 0.693147182464599609375;
+
+  auto x1 = one - reduced;
+  const float c1 = +0.50000095367431640625;
+  const float c2 = +0.33326041698455810546875;
+  const float c3 = +0.2519190013408660888671875;
+  const float c4 = +0.17541764676570892333984375;
+  const float c5 = +0.3424419462680816650390625;
+  const float c6 = -0.599632322788238525390625;
+  const float c7 = +1.98442304134368896484375;
+  const float c8 = -2.4899270534515380859375;
+  const float c9 = +1.7491014003753662109375;
+
+  auto result = x1 * c9 + c8;
+  result = x1 * result + c7;
+  result = x1 * result + c6;
+  result = x1 * result + c5;
+  result = x1 * result + c4;
+  result = x1 * result + c3;
+  result = x1 * result + c2;
+  result = x1 * result + c1;
+  result = x1 * result + one;
+
+  // Equation was for -(ln(red)/(1-red))
+  result *= -x1;
+  result += toFloat(exponent) * ln2;
+
+  return select(exceptional,
+                select(useNan, T(nan), T(negInf)),
+                result);
+}
+
+template <typename T>
+__forceinline T pow(const T &x, const T &y)
+{
+  auto x1 = abs(x);
+  auto z = exp(y * log(x1));
+
+  // Handle special cases
+  const float twoOver23 = 8388608.0f;
+  auto yInt = y == round(y);
+  auto yOddInt = select(yInt, asInt(abs(y) + twoOver23) << 31, 0); // set sign bit
+
+  // x == 0
+  z = select(x == 0.0f,
+      select(y < 0.0f, T(inf) | signmsk(x),
+      select(y == 0.0f, T(1.0f), asFloat(yOddInt) & x)), z);
+
+  // x < 0
+  auto xNegative = x < 0.0f;
+  if (any(xNegative))
+  {
+    auto z1 = z | asFloat(yOddInt);
+    z1 = select(yInt, z1, std::numeric_limits<float>::quiet_NaN());
+    z = select(xNegative, z1, z);
+  }
+
+  auto xFinite = isfinite(x);
+  auto yFinite = isfinite(y);
+  if (all(xFinite & yFinite))
+    return z;
+
+  // x finite and y infinite
+  z = select(andn(xFinite, yFinite),
+      select(x1 == 1.0f, 1.0f,
+      select((x1 > 1.0f) ^ (y < 0.0f), inf, T(0.0f))), z);
+
+  // x infinite
+  z = select(xFinite, z,
+      select(y == 0.0f, 1.0f,
+      select(y < 0.0f, T(0.0f), inf) | (asFloat(yOddInt) & x)));
+
+  return z;
+}
+
+template <typename T>
+__forceinline T pow(const T &x, float y)
+{
+  return pow(x, T(y));
+}
+
+} // namespace fastapprox
+
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/common/math/vec2.h b/thirdparty/embree-aarch64/common/math/vec2.h
new file mode 100644
index 0000000000..a619459e9c
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec2.h
@@ -0,0 +1,235 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+  struct Vec2fa;
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Generic 2D vector class: two components of type T, reachable as x/y or by index
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Vec2
+  {
+    enum { N = 2 }; // number of components
+    union {
+      struct { T x, y; }; // named view of the two components
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+      T components[N]; // indexed view sharing storage with x and y; used by operator[]
+#endif
+    };
+
+    typedef T Scalar;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2( ) {} // performs no explicit initialization of x and y
+    __forceinline explicit Vec2( const T& a             ) : x(a), y(a) {}
+    __forceinline          Vec2( const T& x, const T& y ) : x(x), y(y) {}
+
+    __forceinline Vec2( const Vec2& other ) { x = other.x; y = other.y; }
+    __forceinline Vec2( const Vec2fa& other ); // declared only; specialized per element type at the end of this header
+
+    template<typename T1> __forceinline Vec2( const Vec2<T1>& a ) : x(T(a.x)), y(T(a.y)) {}
+    template<typename T1> __forceinline Vec2& operator =( const Vec2<T1>& other ) { x = other.x; y = other.y; return *this; }
+
+    __forceinline Vec2& operator =( const Vec2& other ) { x = other.x; y = other.y; return *this; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (a tag value is written to both components)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2( ZeroTy   ) : x(zero), y(zero) {}
+    __forceinline Vec2( OneTy    ) : x(one),  y(one) {}
+    __forceinline Vec2( PosInfTy ) : x(pos_inf), y(pos_inf) {}
+    __forceinline Vec2( NegInfTy ) : x(neg_inf), y(neg_inf) {}
+
+#if defined(__WIN32__) && _MSC_VER == 1800 // VS 2013 workaround: `components` is not declared there, so index through &x
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return (&x)[axis]; }
+	__forceinline       T& operator [](const size_t axis)       { assert(axis < 2); return (&x)[axis]; }
+#else
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 2); return components[axis]; }
+	__forceinline       T& operator [](const size_t axis )      { assert(axis < 2); return components[axis]; }
+#endif
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators (each one is applied to x and y independently)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a ) { return Vec2<T>(+a.x, +a.y); }
+  template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a ) { return Vec2<T>(-a.x, -a.y); }
+  template<typename T> __forceinline Vec2<T> abs       ( const Vec2<T>& a ) { return Vec2<T>(abs  (a.x), abs  (a.y)); }
+  template<typename T> __forceinline Vec2<T> rcp       ( const Vec2<T>& a ) { return Vec2<T>(rcp  (a.x), rcp  (a.y)); }
+  template<typename T> __forceinline Vec2<T> rsqrt     ( const Vec2<T>& a ) { return Vec2<T>(rsqrt(a.x), rsqrt(a.y)); }
+  template<typename T> __forceinline Vec2<T> sqrt      ( const Vec2<T>& a ) { return Vec2<T>(sqrt (a.x), sqrt (a.y)); }
+  template<typename T> __forceinline Vec2<T> frac      ( const Vec2<T>& a ) { return Vec2<T>(frac (a.x), frac (a.y)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators (component-wise; a scalar operand is paired with both x and y)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x + b.x, a.y + b.y); }
+  template<typename T> __forceinline Vec2<T> operator +( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x + b  , a.y + b  ); }
+  template<typename T> __forceinline Vec2<T> operator +( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   + b.x, a   + b.y); }
+  template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x - b.x, a.y - b.y); }
+  template<typename T> __forceinline Vec2<T> operator -( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x - b  , a.y - b  ); }
+  template<typename T> __forceinline Vec2<T> operator -( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   - b.x, a   - b.y); }
+  template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x * b.x, a.y * b.y); }
+  template<typename T> __forceinline Vec2<T> operator *( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   * b.x, a   * b.y); }
+  template<typename T> __forceinline Vec2<T> operator *( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x * b  , a.y * b  ); }
+  template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const Vec2<T>& b ) { return Vec2<T>(a.x / b.x, a.y / b.y); }
+  template<typename T> __forceinline Vec2<T> operator /( const Vec2<T>& a, const       T& b ) { return Vec2<T>(a.x / b  , a.y / b  ); }
+  template<typename T> __forceinline Vec2<T> operator /( const       T& a, const Vec2<T>& b ) { return Vec2<T>(a   / b.x, a   / b.y); }
+
+  template<typename T> __forceinline Vec2<T> min(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(min(a.x, b.x), min(a.y, b.y)); }
+  template<typename T> __forceinline Vec2<T> max(const Vec2<T>& a, const Vec2<T>& b) { return Vec2<T>(max(a.x, b.x), max(a.y, b.y)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators (forward per component to the element type's madd/msub/nmadd/nmsub)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> madd  ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> msub  ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> nmadd ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> nmsub ( const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y) ); }
+
+  template<typename T> __forceinline Vec2<T> madd  ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( madd(a,b.x,c.x), madd(a,b.y,c.y) ); } // scalar a is shared by both components
+  template<typename T> __forceinline Vec2<T> msub  ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>( msub(a,b.x,c.x), msub(a,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> nmadd ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y) ); }
+  template<typename T> __forceinline Vec2<T> nmsub ( const T& a, const Vec2<T>& b, const Vec2<T>& c) { return Vec2<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y) ); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators (update a in place and return a reference to it)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T>& operator +=( Vec2<T>& a, const Vec2<T>& b ) { a.x += b.x; a.y += b.y; return a; }
+  template<typename T> __forceinline Vec2<T>& operator -=( Vec2<T>& a, const Vec2<T>& b ) { a.x -= b.x; a.y -= b.y; return a; }
+  template<typename T> __forceinline Vec2<T>& operator *=( Vec2<T>& a, const       T& b ) { a.x *= b  ; a.y *= b  ; return a; } // scalar right-hand side only
+  template<typename T> __forceinline Vec2<T>& operator /=( Vec2<T>& a, const       T& b ) { a.x /= b  ; a.y /= b  ; return a; } // scalar right-hand side only
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operators (combine x and y into a single value of type T)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T reduce_add( const Vec2<T>& a ) { return a.x + a.y; }
+  template<typename T> __forceinline T reduce_mul( const Vec2<T>& a ) { return a.x * a.y; }
+  template<typename T> __forceinline T reduce_min( const Vec2<T>& a ) { return min(a.x, a.y); }
+  template<typename T> __forceinline T reduce_max( const Vec2<T>& a ) { return max(a.x, a.y); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators (each returns one bool for the whole vector)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const Vec2<T>& a, const Vec2<T>& b ) { return a.x == b.x && a.y == b.y; } // both components equal
+  template<typename T> __forceinline bool operator !=( const Vec2<T>& a, const Vec2<T>& b ) { return a.x != b.x || a.y != b.y; } // any component differs
+  template<typename T> __forceinline bool operator < ( const Vec2<T>& a, const Vec2<T>& b ) {
+    // Lexicographic order: x decides whenever the x components differ, otherwise y decides.
+    if (a.x != b.x) return a.x < b.x;
+    return (a.y != b.y) && (a.y < b.y);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Shift Operators (forward the element type's shift_right_1 to x and y)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> shift_right_1( const Vec2<T>& a ) {
+    return Vec2<T>(shift_right_1(a.x),shift_right_1(a.y));
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T       dot      ( const Vec2<T>& a, const Vec2<T>& b ) { return madd(a.x,b.x,a.y*b.y); }
+  template<typename T> __forceinline Vec2<T> cross    ( const Vec2<T>& a )                   { return Vec2<T>(-a.y,a.x); } // perpendicular vector (-y, x)
+  template<typename T> __forceinline T       length   ( const Vec2<T>& a )                   { return sqrt(dot(a,a)); }
+  template<typename T> __forceinline Vec2<T> normalize( const Vec2<T>& a )                   { return a*rsqrt(dot(a,a)); } // no zero-length check here
+  template<typename T> __forceinline T       distance ( const Vec2<T>& a, const Vec2<T>& b ) { return length(a-b); }
+  template<typename T> __forceinline T       det      ( const Vec2<T>& a, const Vec2<T>& b ) { return a.x*b.y - a.y*b.x; }
+
+  template<typename T> __forceinline Vec2<T> normalize_safe( const Vec2<T>& a ) { // like normalize, but guarded against a zero squared length
+    const T lenSq = dot(a,a); const Vec2<T> unit = a*rsqrt(lenSq); return select(lenSq == T( zero ), a, unit);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select (forwards per component to the element type's select(s, t, f))
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec2<T> select ( bool s, const Vec2<T>& t, const Vec2<T>& f ) { // one condition for both components
+    return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y));
+  }
+
+  template<typename T> __forceinline Vec2<T> select ( const Vec2<bool>& s, const Vec2<T>& t, const Vec2<T>& f ) { // separate condition per component
+    return Vec2<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y));
+  }
+
+  template<typename T> __forceinline Vec2<T> select ( const typename T::Bool& s, const Vec2<T>& t, const Vec2<T>& f ) { // T::Bool condition shared by x and y
+    return Vec2<T>(select(s,t.x,f.x),select(s,t.y,f.y));
+  }
+
+  template<typename T> __forceinline Vec2<T> lerp(const Vec2<T>& v0, const Vec2<T>& v1, const T& t) {
+    const Vec2<T> w0(T(1.0f)-t);  // weight applied to v0, written to both components
+    return madd(w0,v0,t*v1);      // madd(1-t, v0, t*v1)
+  }
+
+  template<typename T> __forceinline int maxDim ( const Vec2<T>& a )
+  {
+    // Index (0 = x, 1 = y) of the component with the larger magnitude; a tie yields 1.
+    const Vec2<T> mag = abs(a);
+    return (mag.x > mag.y) ? 0 : 1;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators (writes the vector as "(x, y)")
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2<T>& a) {
+    return cout << "(" << a.x << ", " << a.y << ")";
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Shorthand typedefs for commonly used element types
+  ////////////////////////////////////////////////////////////////////////////////
+
+  typedef Vec2<bool > Vec2b;
+  typedef Vec2<int  > Vec2i;
+  typedef Vec2<float> Vec2f;
+}
+
+#include "vec2fa.h"
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+#include "../simd/sse.h"
+#endif
+
+#if defined(__AVX__)
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree // definitions of the Vec2(const Vec2fa&) constructor declared in Vec2, placed after vec2fa.h is included
+{
+  template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+  template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+#endif
+
+#if defined(__AVX__)
+  template<> __forceinline Vec2<vfloat8>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+#endif
+
+#if defined(__AVX512F__)
+  template<> __forceinline Vec2<vfloat16>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/math/vec2fa.h b/thirdparty/embree-aarch64/common/math/vec2fa.h
new file mode 100644
index 0000000000..451ecd556c
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec2fa.h
@@ -0,0 +1,317 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec2fa Type: two floats (x, y) held in lanes 0 and 1 of an __m128
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec2fa
+  {
+    ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 2 }; // number of logical components
+    union {
+      __m128 m128;
+      struct { float x,y,az,aw; }; // az/aw name the upper two lanes of m128
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2fa( ) {} // performs no initialization
+    __forceinline Vec2fa( const __m128 a ) : m128(a) {}
+
+    __forceinline Vec2fa            ( const Vec2<float>& other  ) { x = other.x; y = other.y; } // az/aw are not assigned
+    __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; } // az/aw keep their previous contents
+
+    __forceinline Vec2fa            ( const Vec2fa& other ) { m128 = other.m128; }
+    __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {} // a in all four lanes
+    __forceinline          Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {} // lane 0 = x, lanes 1..3 = y
+
+    __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} // converts four int32 lanes to float
+
+    __forceinline operator const __m128&() const { return m128; }
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores (all of them move a full 4-float register)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline Vec2fa load( const void* const a ) { // aligned 4-float load, then lanes 2 and 3 are masked to zero
+      return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
+    }
+
+    static __forceinline Vec2fa loadu( const void* const a ) { // unaligned variant of load; still reads 4 floats
+      return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1))));
+    }
+
+    static __forceinline void storeu ( void* ptr, const Vec2fa& v ) { // writes all four lanes (16 bytes) to ptr, not only x and y
+      _mm_storeu_ps((float*)ptr,v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants (the value is written to all four lanes)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec2fa( ZeroTy   ) : m128(_mm_setzero_ps()) {}
+    __forceinline Vec2fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access (index 0 = x, 1 = y; asserted to be below 2)
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; }
+    __forceinline       float& operator []( const size_t index )       { assert(index < 2); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators (operate on all four lanes of m128)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; }
+  __forceinline Vec2fa operator -( const Vec2fa& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); // sign bit of every lane
+    return _mm_xor_ps(a.m128, mask); // XOR flips the sign bit
+  }
+  __forceinline Vec2fa abs  ( const Vec2fa& a ) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); // every bit except the sign bit
+    return _mm_and_ps(a.m128, mask); // AND clears the sign bit
+  }
+  __forceinline Vec2fa sign ( const Vec2fa& a ) { // blends +1 and -1 on the mask a < 0; presumably -1 where a < 0 (blendv_ps is defined elsewhere)
+    return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero)));
+  }
+
+  __forceinline Vec2fa rcp  ( const Vec2fa& a ) // approximate reciprocal, refined with Newton-Raphson steps
+  {
+#if defined(__aarch64__)
+        __m128 reciprocal = _mm_rcp_ps(a.m128); // initial estimate
+        reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); // refinement step 1
+        reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); // refinement step 2
+        return (const Vec2fa)reciprocal;
+#else
+#if defined(__AVX512VL__)
+    const Vec2fa r = _mm_rcp14_ps(a.m128);
+#else
+    const Vec2fa r = _mm_rcp_ps(a.m128);
+#endif
+
+#if defined(__AVX2__)
+    const Vec2fa res = _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); // r * (2 - r*a), fused form
+#else
+    const Vec2fa res = _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); // r * (2 - r*a)
+    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif
+
+    return res;
+#endif  //defined(__aarch64__)
+  }
+
+  __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); }
+  __forceinline Vec2fa sqr  ( const Vec2fa& a ) { return _mm_mul_ps(a,a); } // a*a per lane
+
+  __forceinline Vec2fa rsqrt( const Vec2fa& a ) // approximate 1/sqrt(a), refined with Newton-Raphson steps
+  {
+#if defined(__aarch64__)
+        __m128 r = _mm_rsqrt_ps(a.m128); // initial estimate
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); // refinement step 1
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); // refinement step 2
+        return r;
+#else
+        
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128);
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128);
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); // 1.5*r - 0.5*a*r^3
+        
+#endif
+  }
+
+  __forceinline Vec2fa zero_fix(const Vec2fa& a) { // blends a with +min_rcp_input on the mask |a| < min_rcp_input
+    return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
+  }
+  __forceinline Vec2fa rcp_safe(const Vec2fa& a) { // rcp applied to the zero_fix'ed input
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec2fa log ( const Vec2fa& a ) { // scalar logf on x and y
+    return Vec2fa(logf(a.x),logf(a.y));
+  }
+
+  __forceinline Vec2fa exp ( const Vec2fa& a ) { // scalar expf on x and y
+    return Vec2fa(expf(a.x),expf(a.y));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators (lane-wise on the full __m128; float operands are broadcast)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); }
+  __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
+  __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
+  __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); }
+  __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; }
+  __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); }
+  __forceinline Vec2fa operator /( const Vec2fa& a, const float b        ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
+  __forceinline Vec2fa operator /( const        float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
+
+  __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); }
+  __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); }
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) {
+      // Lane-wise minimum of the float bit patterns, compared as signed 32-bit integers.
+      const vint4 lhsBits = _mm_castps_si128(a);
+      const vint4 rhsBits = _mm_castps_si128(b);
+      return _mm_castsi128_ps(_mm_min_epi32(lhsBits,rhsBits));
+    }
+#endif
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) {
+      // Lane-wise maximum of the float bit patterns, compared as signed 32-bit integers.
+      const vint4 lhsBits = _mm_castps_si128(a);
+      const vint4 rhsBits = _mm_castps_si128(b);
+      return _mm_castsi128_ps(_mm_max_epi32(lhsBits,rhsBits));
+    }
+#endif
+
+    __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) { // scalar powf on x and y with a shared exponent b
+      return Vec2fa(powf(a.x,b),powf(a.y,b));
+    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators (FMA intrinsics under __AVX2__, separate multiply and add otherwise)
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); }
+  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); }
+  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); }
+  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); }
+#else
+  __forceinline Vec2fa madd  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; }
+  __forceinline Vec2fa msub  ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; }
+  __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;}
+  __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; }
+#endif
+
+  __forceinline Vec2fa madd  ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); } // float a is broadcast first
+  __forceinline Vec2fa msub  ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); }
+  __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); }
+  __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators (each is written as a = a op b and returns a)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; }
+  __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; }
+  __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; }
+  __forceinline Vec2fa& operator *=( Vec2fa& a, const float   b ) { return a = a * b; }
+  __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; }
+  __forceinline Vec2fa& operator /=( Vec2fa& a, const float   b ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions (only x and y take part; the upper lanes are ignored)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; }
+  __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; }
+  __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); }
+  __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators (the "& 3" keeps only the mask bits of lanes 0 and 1, i.e. x and y)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; }
+  __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
+    return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F)); // 0x3F: multiply and sum lanes 0 and 1 only, write the sum to every lane
+  }
+#else
+  __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) {
+    return reduce_add(a*b); // reduce_add adds only x and y of the lane-wise product
+  }
+#endif
+
+  __forceinline Vec2fa cross ( const Vec2fa& a ) { // perpendicular vector (-y, x)
+    return Vec2fa(-a.y,a.x);
+  }
+
+  __forceinline float  sqr_length ( const Vec2fa& a )                { return dot(a,a); }
+  __forceinline float  rcp_length ( const Vec2fa& a )                { return rsqrt(dot(a,a)); }
+  __forceinline float  rcp_length2( const Vec2fa& a )                { return rcp(dot(a,a)); }
+  __forceinline float  length   ( const Vec2fa& a )                  { return sqrt(dot(a,a)); }
+  __forceinline Vec2fa normalize( const Vec2fa& a )                  { return a*rsqrt(dot(a,a)); } // no zero-length check here
+  __forceinline float  distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { // one bool decides for the whole vector
+    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); // all-ones (0 == 0 in every lane) when s, else all-zero
+    return blendv_ps(f, t, mask);
+  }
+
+  __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { // madd(1-t, v0, t*v1) through the float-first madd overload
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  __forceinline int maxDim ( const Vec2fa& a )
+  {
+    // Index (0 = x, 1 = y) of the component with the larger magnitude; a tie yields 1.
+    const Vec2fa mag = abs(a);
+    return (mag.x > mag.y) ? 0 : 1;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions (floor and ceil only; every trunc variant below is commented out)
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__)
+__forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); }
+__forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); }
+//__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); }
+#elif defined (__SSE4_1__)
+  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } // NOTE: disabled; this mode rounds to nearest rather than truncating
+  __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
+#else
+  //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); } // NOTE: disabled; refers to a.z, which Vec2fa does not declare
+  __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); }
+  __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators (writes the vector as "(x, y)"; the upper lanes are not printed)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) {
+    return cout << "(" << a.x << ", " << a.y << ")";
+  }
+
+  typedef Vec2fa Vec2fa_t;
+}
diff --git a/thirdparty/embree-aarch64/common/math/vec3.h b/thirdparty/embree-aarch64/common/math/vec3.h
new file mode 100644
index 0000000000..1870321715
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec3.h
@@ -0,0 +1,349 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+
+namespace embree
+{
+  struct Vec3fa;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Generic 3D vector Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> struct Vec3
+  {
+    enum { N  = 3 };
+
+    union {
+      struct {
+	T x, y, z;
+      };
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+      T components[N];
+#endif
+    };
+
+    typedef T Scalar;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3( ) {}
+    __forceinline explicit Vec3( const T& a                         ) : x(a), y(a), z(a) {}
+    __forceinline          Vec3( const T& x, const T& y, const T& z ) : x(x), y(y), z(z) {}
+
+    __forceinline Vec3( const Vec3& other ) { x = other.x; y = other.y; z = other.z; }
+    __forceinline Vec3( const Vec3fa& other );
+
+    template<typename T1> __forceinline Vec3( const Vec3<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)) {}
+    template<typename T1> __forceinline Vec3& operator =(const Vec3<T1>& other) { x = other.x; y = other.y; z = other.z; return *this; }
+
+    __forceinline Vec3& operator =(const Vec3& other) { x = other.x; y = other.y; z = other.z; return *this; }
+	
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3( ZeroTy   ) : x(zero), y(zero), z(zero) {}
+    __forceinline Vec3( OneTy    ) : x(one),  y(one),  z(one) {}
+    __forceinline Vec3( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf) {}
+    __forceinline Vec3( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf) {}
+
+#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler
+    __forceinline const T& operator []( const size_t axis ) const { assert(axis < 3); return (&x)[axis]; }
+    __forceinline       T& operator []( const size_t axis )       { assert(axis < 3); return (&x)[axis]; }
+#else
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 3); return components[axis]; }
+	__forceinline       T& operator [](const size_t axis)       { assert(axis < 3); return components[axis]; }
+#endif
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a ) { return Vec3<T>(+a.x, +a.y, +a.z); }
+  template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a ) { return Vec3<T>(-a.x, -a.y, -a.z); }
+  template<typename T> __forceinline Vec3<T> abs       ( const Vec3<T>& a ) { return Vec3<T>(abs  (a.x), abs  (a.y), abs  (a.z)); }
+  template<typename T> __forceinline Vec3<T> rcp       ( const Vec3<T>& a ) { return Vec3<T>(rcp  (a.x), rcp  (a.y), rcp  (a.z)); }
+  template<typename T> __forceinline Vec3<T> rsqrt     ( const Vec3<T>& a ) { return Vec3<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z)); }
+  template<typename T> __forceinline Vec3<T> sqrt      ( const Vec3<T>& a ) { return Vec3<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z)); }
+
+  template<typename T> __forceinline Vec3<T> zero_fix( const Vec3<T>& a )
+  { // Components with |c| < min_rcp_input become +min_rcp_input (sign is dropped); others pass through.
+    const T lo = T(min_rcp_input);
+    const T fx = select(abs(a.x)<min_rcp_input,lo,a.x), fy = select(abs(a.y)<min_rcp_input,lo,a.y);
+    return Vec3<T>(fx, fy, select(abs(a.z)<min_rcp_input,lo,a.z));
+  }
+  template<typename T> __forceinline Vec3<T> rcp_safe(const Vec3<T>& a) { return rcp(zero_fix(a)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> operator +( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x + b.x, a.y + b.y, a.z + b.z); }
+  template<typename T> __forceinline Vec3<T> operator -( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x - b.x, a.y - b.y, a.z - b.z); }
+  template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x * b.x, a.y * b.y, a.z * b.z); }
+  template<typename T> __forceinline Vec3<T> operator *( const       T& a, const Vec3<T>& b ) { return Vec3<T>(a   * b.x, a   * b.y, a   * b.z); }
+  template<typename T> __forceinline Vec3<T> operator *( const Vec3<T>& a, const       T& b ) { return Vec3<T>(a.x * b  , a.y * b  , a.z * b  ); }
+  template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const       T& b ) { return Vec3<T>(a.x / b  , a.y / b  , a.z / b  ); }
+  template<typename T> __forceinline Vec3<T> operator /( const       T& a, const Vec3<T>& b ) { return Vec3<T>(a   / b.x, a   / b.y, a   / b.z); }
+  template<typename T> __forceinline Vec3<T> operator /( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(a.x / b.x, a.y / b.y, a.z / b.z); }
+
+  template<typename T> __forceinline Vec3<T> min(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); }
+  template<typename T> __forceinline Vec3<T> max(const Vec3<T>& a, const Vec3<T>& b) { return Vec3<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); }
+
+  template<typename T> __forceinline Vec3<T> operator >>( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x >> b, a.y >> b, a.z >> b); }
+  template<typename T> __forceinline Vec3<T> operator <<( const Vec3<T>& a, const int b ) { return Vec3<T>(a.x << b, a.y << b, a.z << b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> madd  ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z)); }
+  template<typename T> __forceinline Vec3<T> msub  ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z)); }
+  template<typename T> __forceinline Vec3<T> nmadd ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z));}
+  template<typename T> __forceinline Vec3<T> nmsub ( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z)); }
+
+  template<typename T> __forceinline Vec3<T> madd  ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z)); }
+  template<typename T> __forceinline Vec3<T> msub  ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z)); }
+  template<typename T> __forceinline Vec3<T> nmadd ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z));}
+  template<typename T> __forceinline Vec3<T> nmsub ( const T& a, const Vec3<T>& b, const Vec3<T>& c) { return Vec3<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const T        b ) { a.x += b;   a.y += b;   a.z += b;   return a; }
+  template<typename T> __forceinline Vec3<T>& operator +=( Vec3<T>& a, const Vec3<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; return a; }
+  template<typename T> __forceinline Vec3<T>& operator -=( Vec3<T>& a, const Vec3<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; }
+  template<typename T> __forceinline Vec3<T>& operator *=( Vec3<T>& a, const       T& b ) { a.x *= b  ; a.y *= b  ; a.z *= b  ; return a; }
+  template<typename T> __forceinline Vec3<T>& operator /=( Vec3<T>& a, const       T& b ) { a.x /= b  ; a.y /= b  ; a.z /= b  ; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T reduce_add( const Vec3<T>& a ) { return a.x + a.y + a.z; }
+  template<typename T> __forceinline T reduce_mul( const Vec3<T>& a ) { return a.x * a.y * a.z; }
+  template<typename T> __forceinline T reduce_min( const Vec3<T>& a ) { return min(a.x, a.y, a.z); }
+  template<typename T> __forceinline T reduce_max( const Vec3<T>& a ) { return max(a.x, a.y, a.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline bool operator ==( const Vec3<T>& a, const Vec3<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z; }
+  template<typename T> __forceinline bool operator !=( const Vec3<T>& a, const Vec3<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z; }
+  template<typename T> __forceinline bool operator < ( const Vec3<T>& a, const Vec3<T>& b ) {
+    // Lexicographic order: x decides first, then y, then z; equal vectors are not less.
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    return (a.z != b.z) && (a.z < b.z);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Shift Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> shift_right_1( const Vec3<T>& a ) {
+    return Vec3<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<T> select ( bool s, const Vec3<T>& t, const Vec3<T>& f ) {
+    return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z));
+  }
+
+  template<typename T> __forceinline Vec3<T> select ( const Vec3<bool>& s, const Vec3<T>& t, const Vec3<T>& f ) {
+    return Vec3<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z));
+  }
+
+  template<typename T> __forceinline Vec3<T> select ( const typename T::Bool& s, const Vec3<T>& t, const Vec3<T>& f ) {
+    return Vec3<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z));
+  }
+
+  template<typename T>
+    __forceinline Vec3<T> lerp(const Vec3<T>& v0, const Vec3<T>& v1, const T& t) {
+    return madd(Vec3<T>(T(1.0f)-t),v0,t*v1);
+  }
+
+  template<typename T> __forceinline int maxDim ( const Vec3<T>& a )
+  {
+    // Axis (0=x, 1=y, 2=z) of the largest-magnitude component.
+    // Comparisons are strict, so on a tie the later axis is returned.
+    const Vec3<T> mag = abs(a);
+    if (mag.x > mag.y)
+      return (mag.x > mag.z) ? 0 : 2;
+    return (mag.y > mag.z) ? 1 : 2;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Component-wise Comparison Masks
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline Vec3<bool> eq_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x==b.x,a.y==b.y,a.z==b.z); }
+  template<typename T> __forceinline Vec3<bool> neq_mask(const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x!=b.x,a.y!=b.y,a.z!=b.z); }
+  template<typename T> __forceinline Vec3<bool> lt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x< b.x,a.y< b.y,a.z< b.z); }
+  template<typename T> __forceinline Vec3<bool> le_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x<=b.x,a.y<=b.y,a.z<=b.z); }
+  template<typename T> __forceinline Vec3<bool> gt_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x> b.x,a.y> b.y,a.z> b.z); }
+  template<typename T> __forceinline Vec3<bool> ge_mask( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<bool>(a.x>=b.x,a.y>=b.y,a.z>=b.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline T       sqr      ( const Vec3<T>& a )                   { return dot(a,a); }
+  template<typename T> __forceinline T       dot      ( const Vec3<T>& a, const Vec3<T>& b ) { return madd(a.x,b.x,madd(a.y,b.y,a.z*b.z)); }
+  template<typename T> __forceinline T       length   ( const Vec3<T>& a )                   { return sqrt(sqr(a)); }
+  template<typename T> __forceinline T       rcp_length( const Vec3<T>& a )                  { return rsqrt(sqr(a)); }
+  template<typename T> __forceinline Vec3<T> normalize( const Vec3<T>& a )                   { return a*rsqrt(sqr(a)); }
+  template<typename T> __forceinline T       distance ( const Vec3<T>& a, const Vec3<T>& b ) { return length(a-b); }
+  template<typename T> __forceinline Vec3<T> cross    ( const Vec3<T>& a, const Vec3<T>& b ) { return Vec3<T>(prod_diff(a.y,b.z,a.z,b.y), prod_diff(a.z,b.x,a.x,b.z), prod_diff(a.x,b.y,a.y,b.x)); }
+  template<typename T> __forceinline Vec3<T> stable_triangle_normal( const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& c )
+  {
+    // Builds the cross products of (a,b) and (b,c) component by component (assuming msub(p,q,r) = p*q - r)
+    // and, per component, keeps the one whose subtracted product term has the smaller magnitude.
+    const T sub_ab_x = a.z*b.y, sub_ab_y = a.x*b.z, sub_ab_z = a.y*b.x;
+    const T sub_bc_x = b.z*c.y, sub_bc_y = b.x*c.z, sub_bc_z = b.y*c.x;
+    const Vec3<T> n_ab(msub(a.y,b.z,sub_ab_x), msub(a.z,b.x,sub_ab_y), msub(a.x,b.y,sub_ab_z));
+    const Vec3<T> n_bc(msub(b.y,c.z,sub_bc_x), msub(b.z,c.x,sub_bc_y), msub(b.x,c.y,sub_bc_z));
+    const auto use_ab_x = abs(sub_ab_x) < abs(sub_bc_x);
+    const auto use_ab_y = abs(sub_ab_y) < abs(sub_bc_y);
+    const auto use_ab_z = abs(sub_ab_z) < abs(sub_bc_z);
+    return Vec3<T>(select(use_ab_x,n_ab.x,n_bc.x), select(use_ab_y,n_ab.y,n_bc.y), select(use_ab_z,n_ab.z,n_bc.z));
+  }
+
+  template<typename T> __forceinline T       sum      ( const Vec3<T>& a )                   { return a.x+a.y+a.z; }
+
+  template<typename T> __forceinline      T  halfArea ( const Vec3<T>& d )                  { return madd(d.x,(d.y+d.z),d.y*d.z); }
+  template<typename T> __forceinline      T  area     ( const Vec3<T>& d )                  { return 2.0f*halfArea(d); }
+
+  template<typename T> __forceinline Vec3<T> normalize_safe( const Vec3<T>& a ) {
+    const T d = dot(a,a); return select(d == T( zero ), a ,  a*rsqrt(d) );
+  }
+
+  template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& P, const Vec3<T>& Q0, const Vec3<T>& Q1)
+  { // dot(N,N) / dot(D,D) with N = cross(P-Q0,Q1-Q0) and D = Q1-Q0; the divide is a multiply by rcp().
+    const Vec3<T> dir = Q1-Q0;
+    const Vec3<T> nrm = cross(P-Q0,dir);
+    return dot(nrm,nrm)*rcp(dot(dir,dir));
+  }
+
+  template<typename T> __forceinline T sqr_point_to_line_distance(const Vec3<T>& PmQ0, const Vec3<T>& Q1mQ0)
+  { // dot(N,N) / dot(D,D) with N = cross(PmQ0,Q1mQ0) and D = Q1mQ0; the divide is a multiply by rcp().
+    const Vec3<T> nrm = cross(PmQ0,Q1mQ0);
+    const T nrmSq = dot(nrm,nrm);
+    return nrmSq*rcp(dot(Q1mQ0,Q1mQ0));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3<T>& a) {
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+  }
+
+  typedef Vec3<bool > Vec3b;
+  typedef Vec3<int  > Vec3i;
+  typedef Vec3<float> Vec3f;
+}
+
+#include "vec3ba.h"
+#include "vec3ia.h"
+#include "vec3fa.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / MIC specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+#include "../simd/sse.h"
+#endif
+
+#if defined(__AVX__)
+#include "../simd/avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  template<typename Out, typename In>
+  __forceinline Vec3<Out> broadcast(const Vec3<In>& a, const size_t k) {
+    return Vec3<Out>(Out(a.x[k]), Out(a.y[k]), Out(a.z[k]));
+  }
+
+  template<> __forceinline Vec3<float>::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; }
+
+#if defined(__AVX__)
+  template<> __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
+    x = a.x; y = a.y; z = a.z;
+  }
+#elif defined(__SSE__) || defined(__ARM_NEON)
+  template<>
+  __forceinline Vec3<vfloat4>::Vec3(const Vec3fa& a) {
+    const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v);
+  }
+#endif
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+  __forceinline Vec3<vfloat4> broadcast4f(const Vec3<vfloat4>& a, const size_t k) {
+    return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
+  }
+
+  template<>
+  __forceinline Vec3<vfloat4> broadcast<vfloat4,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
+    return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline Vec3<vfloat4> shuffle(const Vec3<vfloat4>& b) {
+    return Vec3<vfloat4>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z));
+  }
+#endif
+
+#if defined(__AVX__)
+  template<>
+  __forceinline Vec3<vfloat8>::Vec3(const Vec3fa& a) {
+    x = a.x; y = a.y; z = a.z;
+  }
+  __forceinline Vec3<vfloat4> broadcast4f(const Vec3<vfloat8>& a, const size_t k) {
+    return Vec3<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]));
+  }
+  __forceinline Vec3<vfloat8> broadcast8f(const Vec3<vfloat4>& a, const size_t k) {
+    return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
+  }
+  __forceinline Vec3<vfloat8> broadcast8f(const Vec3<vfloat8>& a, const size_t k) {
+    return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
+  }
+
+  template<>
+  __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat4>(const Vec3<vfloat4>& a, const size_t k) {
+    return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
+  }
+  template<>
+  __forceinline Vec3<vfloat8> broadcast<vfloat8,vfloat8>(const Vec3<vfloat8>& a, const size_t k) {
+    return Vec3<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline Vec3<vfloat8> shuffle(const Vec3<vfloat8>& b) {
+    return Vec3<vfloat8>(shuffle<i0,i1,i2,i3>(b.x), shuffle<i0,i1,i2,i3>(b.y), shuffle<i0,i1,i2,i3>(b.z));
+  }
+#endif
+
+#if defined(__AVX512F__)
+  template<> __forceinline Vec3<vfloat16>::Vec3(const Vec3fa& a) : x(a.x), y(a.y), z(a.z) {}
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/math/vec3ba.h b/thirdparty/embree-aarch64/common/math/vec3ba.h
new file mode 100644
index 0000000000..90f31739c2
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec3ba.h
@@ -0,0 +1,120 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3ba Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3ba
+  {
+    ALIGNED_STRUCT_(16);
+    
+    union {
+      __m128 m128;
+      struct { int x,y,z; };
+    };
+
+    typedef int Scalar;
+    enum { N = 3 };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ba( ) {}
+    __forceinline Vec3ba( const __m128  input ) : m128(input) {}
+    __forceinline Vec3ba( const Vec3ba& other ) : m128(other.m128) {}
+    __forceinline Vec3ba& operator =(const Vec3ba& other) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec3ba( bool a )
+      : m128(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {}
+    __forceinline Vec3ba( bool a, bool b, bool c)
+      : m128(mm_lookupmask_ps[(size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {}
+
+    __forceinline operator const __m128&() const { return m128; }
+    __forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ba( FalseTy ) : m128(_mm_setzero_ps()) {}
+    __forceinline Vec3ba( TrueTy  ) : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ba operator !( const Vec3ba& a ) { return _mm_xor_ps(a.m128, Vec3ba(embree::True)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3ba operator &( const Vec3ba& a, const Vec3ba& b ) { return _mm_and_ps(a.m128, b.m128); }
+  __forceinline Vec3ba operator |( const Vec3ba& a, const Vec3ba& b ) { return _mm_or_ps (a.m128, b.m128); }
+  __forceinline Vec3ba operator ^( const Vec3ba& a, const Vec3ba& b ) { return _mm_xor_ps(a.m128, b.m128); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline Vec3ba& operator &=( Vec3ba& a, const Vec3ba& b ) { return a = a & b; }
+  __forceinline Vec3ba& operator |=( Vec3ba& a, const Vec3ba& b ) { return a = a | b; }
+  __forceinline Vec3ba& operator ^=( Vec3ba& a, const Vec3ba& b ) { return a = a ^ b; }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline bool operator ==( const Vec3ba& a, const Vec3ba& b ) { 
+    return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) == 7; 
+  }
+  __forceinline bool operator !=( const Vec3ba& a, const Vec3ba& b ) { 
+    return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.m128), _mm_castps_si128(b.m128)))) & 7) != 7; 
+  }
+  __forceinline bool operator < ( const Vec3ba& a, const Vec3ba& b ) {
+    // Lexicographic order on the integer lane values x, y, z; equal masks are not less.
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    return (a.z != b.z) && (a.z < b.z);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+    
+  __forceinline bool reduce_and( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) == 0x7; }
+  __forceinline bool reduce_or ( const Vec3ba& a ) { return (_mm_movemask_ps(a) & 0x7) != 0x0; }
+
+  __forceinline bool all       ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x7; }
+  __forceinline bool any       ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) != 0x0; }
+  __forceinline bool none      ( const Vec3ba& b ) { return (_mm_movemask_ps(b) & 0x7) == 0x0; }
+
+  __forceinline size_t movemask(const Vec3ba& a) { return _mm_movemask_ps(a) & 0x7; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ba& a) {
+    return cout << "(" << (a.x ? "1" : "0") << ", " << (a.y ? "1" : "0") << ", " << (a.z ? "1" : "0") << ")";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/math/vec3fa.h b/thirdparty/embree-aarch64/common/math/vec3fa.h
new file mode 100644
index 0000000000..6163cfb596
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec3fa.h
@@ -0,0 +1,810 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3fa Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3fa
+  {
+    ALIGNED_STRUCT_(16);
+
+    typedef float Scalar;
+    enum { N = 3 };
+    union {
+      __m128 m128;
+      struct { float x,y,z; };
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fa( ) {}
+    __forceinline Vec3fa( const __m128 a ) : m128(a) {}
+
+    __forceinline Vec3fa            ( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); }
+    //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
+
+    __forceinline Vec3fa            ( const Vec3fa& other ) { m128 = other.m128; }
+    __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {}
+    __forceinline          Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {}
+
+    __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+
+    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
+    __forceinline explicit operator const   vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
+    __forceinline explicit operator const  Vec2fa() const { return Vec2fa(m128); }
+    __forceinline explicit operator const  Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
+    
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline Vec3fa load( const void* const a ) {
+#if defined(__aarch64__)
+        __m128 t = _mm_load_ps((float*)a);
+        t[3] = 0.0f;
+        return Vec3fa(t);
+#else
+      return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
+#endif
+    }
+
+    static __forceinline Vec3fa loadu( const void* const a ) {
+      return Vec3fa(_mm_loadu_ps((float*)a));
+    }
+
+    static __forceinline void storeu ( void* ptr, const Vec3fa& v ) {
+      _mm_storeu_ps((float*)ptr,v.m128);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fa( ZeroTy   ) : m128(_mm_setzero_ps()) {}
+    __forceinline Vec3fa( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; }
+  __forceinline Vec3fa operator -( const Vec3fa& a ) {
+#if defined(__aarch64__)
+    return vnegq_f32(a.m128);
+#else
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+
+    return _mm_xor_ps(a.m128, mask);
+#endif
+  }
+  __forceinline Vec3fa abs  ( const Vec3fa& a ) {
+#if defined(__aarch64__)
+    return _mm_abs_ps(a.m128);
+#else
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+    return _mm_and_ps(a.m128, mask);
+#endif
+  }
+  __forceinline Vec3fa sign ( const Vec3fa& a ) { // per lane: -1 where a < 0, else +1 (0 and NaN compare false, so they yield +1)
+#if defined(__aarch64__)
+    Vec3fa r = blendv_ps(vOne, vmOne, _mm_cmplt_ps (a.m128,vdupq_n_f32(0.0f))); // vOne/vmOne are defined elsewhere; presumably splat +1/-1 -- TODO confirm
+    return r;
+#else
+    return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128));
+#endif
+  }
+
+  __forceinline Vec3fa rcp  ( const Vec3fa& a )
+  { // Per-lane reciprocal 1/a: a true division when BUILD_IOS is defined, otherwise a hardware estimate refined by Newton-Raphson.
+#if defined(__aarch64__) && defined(BUILD_IOS)
+  return vdivq_f32(vdupq_n_f32(1.0f),a.m128);
+#elif defined(__aarch64__)
+  __m128 reciprocal = _mm_rcp_ps(a.m128); // initial estimate
+  reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); // Newton-Raphson step: r = r*(2 - a*r)
+  reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal); // second refinement step
+  return (const Vec3fa)reciprocal;
+#else
+        
+#if defined(__AVX512VL__)
+    const Vec3fa r = _mm_rcp14_ps(a.m128);
+#else
+    const Vec3fa r = _mm_rcp_ps(a.m128);
+#endif
+
+#if defined(__AVX2__)
+    const Vec3fa res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); // one Newton-Raphson step: r*(2 - r*a)
+#else
+    const Vec3fa res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); // same step written without FMA
+    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif
+
+    return res;
+#endif  //defined(__aarch64__)
+  }
+
+  __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); }
+  __forceinline Vec3fa sqr  ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); }
+
+  __forceinline Vec3fa rsqrt( const Vec3fa& a )
+  { // Per-lane reciprocal square root: hardware estimate refined by Newton-Raphson.
+#if defined(__aarch64__)
+        __m128 r = _mm_rsqrt_ps(a.m128); // initial estimate
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); // Newton-Raphson step: r = r*(3 - a*r*r)/2
+        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r)); // second refinement step
+        return r;
+#else
+        
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128);
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128);
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); // one Newton-Raphson step: 1.5*r - 0.5*a*r^3
+#endif
+  }
+
+  __forceinline Vec3fa zero_fix(const Vec3fa& a) { // lanes with |a| < min_rcp_input become +min_rcp_input (assumes blendv_ps picks its 2nd operand where the mask is set - TODO confirm)
+    return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
+  }
+  __forceinline Vec3fa rcp_safe(const Vec3fa& a) { // rcp() of the zero_fix()-adjusted input
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec3fa log ( const Vec3fa& a ) { // logf applied to x, y and z individually
+    return Vec3fa(logf(a.x),logf(a.y),logf(a.z));
+  }
+  // exp: expf applied to x, y and z individually.
+  __forceinline Vec3fa exp ( const Vec3fa& a ) {
+    return Vec3fa(expf(a.x),expf(a.y),expf(a.z));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); }
+  __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); }
+  __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); }
+  __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); }
+  __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; }
+  __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); }
+  __forceinline Vec3fa operator /( const Vec3fa& a, const float b        ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
+  __forceinline Vec3fa operator /( const        float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
+  // The operators above work on the whole 128-bit register (fourth lane included); scalars are first widened via Vec3fa(b) or _mm_set1_ps.
+  __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); }
+  __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); }
+  // mini/maxi: min/max of the raw lane bits compared as signed 32-bit integers (_mm_min_epi32/_mm_max_epi32), not as floats.
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) {
+      const vint4 ai = _mm_castps_si128(a.m128); // bit reinterpretation, no numeric conversion
+      const vint4 bi = _mm_castps_si128(b.m128);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) {
+      const vint4 ai = _mm_castps_si128(a.m128); // bit reinterpretation, no numeric conversion
+      const vint4 bi = _mm_castps_si128(b.m128);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+    // pow: powf with the shared exponent b applied to x, y and z individually.
+    __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) {
+      return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b));
+    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
+  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
+  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
+  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
+#else
+  // Without AVX2: aarch64 goes through _mm_madd_ps/_mm_msub_ps (non-standard helpers defined outside this chunk); other targets use plain operators.
+#if defined(__aarch64__)
+  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
+        return _mm_madd_ps(a.m128, b.m128, c.m128);  // a*b + c
+    }
+  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
+        return _mm_msub_ps(a.m128, b.m128, c.m128);  // -a*b + c (presumably _mm_msub_ps(a,b,c) is c - a*b here; verify against its definition)
+    }
+  __forceinline Vec3fa nmsub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
+        Vec3fa t = _mm_madd_ps(a.m128, b.m128, c.m128);
+        return -t; // -(a*b + c)
+    }
+  __forceinline Vec3fa msub( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) {
+        return _mm_madd_ps(a.m128,b.m128,vnegq_f32(c.m128)); // a*b - c, done as a*b + (-c)
+    }
+
+#else
+  __forceinline Vec3fa madd  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; }
+  __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;}
+  __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; }
+  __forceinline Vec3fa msub  ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; }
+#endif
+
+#endif
+  // Scalar-first overloads: widen a with Vec3fa(a) and forward to the vector versions.
+  __forceinline Vec3fa madd  ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); }
+  __forceinline Vec3fa msub  ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); }
+  __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); }
+  __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; } // each compound form stores the matching binary operator's result in a and returns a
+  __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; }
+  __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; }
+  __forceinline Vec3fa& operator *=( Vec3fa& a, const float   b ) { return a = a * b; }
+  __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; }
+  __forceinline Vec3fa& operator /=( Vec3fa& a, const float   b ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__) && defined(BUILD_IOS)
+  __forceinline float reduce_add(const Vec3fa& v) {
+    float32x4_t t = v.m128;
+    t[3] = 0.0f; // clear the fourth lane so the 4-lane horizontal add only sums x, y, z
+    return vaddvq_f32(t);
+  }
+  // reduce_min/reduce_max below copy z into the fourth lane so it cannot change the 4-lane result.
+  __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
+  __forceinline float reduce_min(const Vec3fa& v) {
+    float32x4_t t = v.m128;
+      t[3] = t[2];
+    return vminvq_f32(t);
+  }
+  __forceinline float reduce_max(const Vec3fa& v) {
+    float32x4_t t = v.m128;
+      t[3] = t[2];
+    return vmaxvq_f32(t);
+  }
+#else
+  __forceinline float reduce_add(const Vec3fa& v) {
+    const vfloat4 a(v.m128);
+    const vfloat4 b = shuffle<1>(a); // presumably moves lane 1 (y) into lane 0 - verify against vfloat4 shuffle
+    const vfloat4 c = shuffle<2>(a); // presumably moves lane 2 (z) into lane 0
+    return _mm_cvtss_f32(a+b+c); // lane 0 of the sum
+  }
+
+  __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; }
+  __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); }
+  __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); }
+#endif
+                                                                                
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } // "& 7" keeps the x,y,z bits; all three must match
+  __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } // true if any of x,y,z differs
+  // The *_mask functions hand back the raw per-lane comparison result as a Vec3ba.
+  __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
+  __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
+  __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
+  __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); }
+ #if defined(__aarch64__)
+  __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpgt_ps (a.m128, b.m128); }
+  __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpge_ps (a.m128, b.m128); }
+#else
+  __forceinline Vec3ba gt_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnle_ps(a.m128, b.m128); } // NOTE(review): "not <=" differs from cmpgt for NaN operands - confirm both branches are meant to agree
+  __forceinline Vec3ba ge_mask(const Vec3fa& a, const Vec3fa& b) { return _mm_cmpnlt_ps(a.m128, b.m128); } // NOTE(review): "not <", same NaN caveat
+#endif
+  // isvalid: components strictly between -FLT_LARGE and +FLT_LARGE (which lanes all() inspects for a Vec3ba is defined outside this chunk).
+  __forceinline bool isvalid ( const Vec3fa& v ) {
+    return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE)));
+  }
+  // is_finite: components within [-FLT_MAX, +FLT_MAX].
+  __forceinline bool is_finite ( const Vec3fa& a ) {
+    return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX)));
+  }
+  // isvalid4 / is_finite4: same range tests, but performed through vfloat4 on the full register.
+  __forceinline bool isvalid4 ( const Vec3fa& v ) {
+    return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite4 ( const Vec3fa& a ) {
+    return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
+    return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); // 0x7F: multiply lanes 0-2 only (high nibble 0x7), write the sum to every lane (low nibble 0xF)
+  }
+#else
+  __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) {
+    return reduce_add(a*b); // x,y,z products summed by reduce_add
+  }
+#endif
+  // cross: 3D cross product via lane rotations; presumably shuffle<1,2,0,3> yields (y,z,x,w) and prod_diff(a0,b0,a1,b1) is a0*b0 - a1*b1 - verify.
+  __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b )
+  {
+    vfloat4 a0 = vfloat4(a.m128);
+    vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
+    vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
+    vfloat4 b1 = vfloat4(b.m128);
+    return Vec3fa(shuffle<1,2,0,3>(prod_diff(a0,b0,a1,b1)));
+  }
+
+  __forceinline float  sqr_length ( const Vec3fa& a )                { return dot(a,a); } // squared length
+  __forceinline float  rcp_length ( const Vec3fa& a )                { return rsqrt(dot(a,a)); } // 1 / length, via the scalar rsqrt
+  __forceinline float  rcp_length2( const Vec3fa& a )                { return rcp(dot(a,a)); } // 1 / squared length
+  __forceinline float  length   ( const Vec3fa& a )                  { return sqrt(dot(a,a)); }
+  __forceinline Vec3fa normalize( const Vec3fa& a )                  { return a*rsqrt(dot(a,a)); } // no guard against a zero-length input; see normalize_safe
+  __forceinline float  distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); }
+  __forceinline float  halfArea ( const Vec3fa& d )                  { return madd(d.x,(d.y+d.z),d.y*d.z); } // x*(y+z) + y*z
+  __forceinline float  area     ( const Vec3fa& d )                  { return 2.0f*halfArea(d); }
+  // normalize_safe: like normalize, but hands back a unchanged when dot(a,a) is exactly 0.
+  __forceinline Vec3fa normalize_safe( const Vec3fa& a ) {
+    const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
+  }
+
+  /*! differentiated normalization: evaluates (pp*dp - pdp*p) * rcp(pp) * rsqrt(pp) with pp = dot(p,p), pdp = dot(p,dp) */
+  __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp)
+  {
+    const float pp  = dot(p,p);
+    const float pdp = dot(p,dp);
+    return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) { // whole-vector choice: t when s is true, otherwise f (assuming blendv_ps takes its 2nd operand where the mask is set)
+    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); // 0 == 0 compare gives all-ones bits
+    return blendv_ps(f.m128, t.m128, mask);
+  }
+  // Per-lane choice between t and f driven by the mask s.
+  __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) {
+    return blendv_ps(f.m128, t.m128, s);
+  }
+  // lerp: (1-t)*v0 + t*v1, with the first product folded into madd.
+  __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) {
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  __forceinline int maxDim ( const Vec3fa& a )
+  {
+    // Axis index (0 = x, 1 = y, 2 = z) of the largest |component|; equal magnitudes favour the higher index.
+    const Vec3fa mag = abs(a);
+    const bool xBeatsY = mag.x > mag.y;
+    const float best = xBeatsY ? mag.x : mag.y;
+    if (best > mag.z) return xBeatsY ? 0 : 1;
+    return 2;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__)
+  __forceinline Vec3fa floor(const Vec3fa& a) { return vrndmq_f32(a.m128); } // round toward -infinity
+  __forceinline Vec3fa ceil (const Vec3fa& a) { return vrndpq_f32(a.m128); } // round toward +infinity
+  __forceinline Vec3fa trunc(const Vec3fa& a) { return vrndq_f32(a.m128); } // round toward zero
+#elif defined (__SSE4_1__)
+  __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_ZERO       ); } // toward zero, so this path agrees with vrndq_f32 and truncf in the other branches
+  __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF    ); }
+#else
+  __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); } // scalar fallback on x, y, z
+  __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); }
+  __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) { // prints "(x, y, z)"; the fourth lane is not printed
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+  }
+  // Vec3fa_t is a plain alias of Vec3fa.
+  typedef Vec3fa Vec3fa_t;
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3fx Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  struct __aligned(16) Vec3fx
+  {
+    ALIGNED_STRUCT_(16);
+    // 3-component float vector stored in one 128-bit register; the fourth lane is exposed as extra payload (a / u / w).
+    typedef float Scalar;
+    enum { N = 3 };
+    union {
+      __m128 m128;
+      struct { float x,y,z; union { int a; unsigned u; float w; }; }; // same 16 bytes as m128; a, u and w all alias the fourth lane
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3fx( ) {} // leaves the contents uninitialized
+    __forceinline Vec3fx( const __m128 a ) : m128(a) {}
+    // Conversions to and from Vec3fa copy the whole register, fourth lane included.
+    __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {}
+    __forceinline operator Vec3fa () const { return Vec3fa(m128); }
+    // From a plain Vec3<float>: the fourth lane is set to 0.
+    __forceinline explicit Vec3fx            ( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); }
+    //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128  = _mm_set_ps(0, other.z, other.y, other.x); return *this; }
+
+    __forceinline Vec3fx            ( const Vec3fx& other ) { m128 = other.m128; }
+
+    __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; }
+
+    __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {} // broadcast to all four lanes
+    __forceinline          Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} // fourth lane = 0
+    // The next three constructors copy other and then overwrite only the fourth lane with the given payload.
+    __forceinline Vec3fx( const Vec3fa& other, const int      a1) { m128 = other.m128; a = a1; }
+    __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; }
+    __forceinline Vec3fx( const Vec3fa& other, const float    w1) {
+#if defined (__aarch64__)
+      m128 = other.m128; m128[3] = w1;
+#elif defined (__SSE4_1__)
+      m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); // imm 0x30: source lane 0 -> destination lane 3, nothing zeroed
+#else
+      const vint4 mask(-1,-1,-1,0); // presumably all-ones for x,y,z and zero for the fourth lane - verify vint4 argument order
+      m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1));
+#endif
+    }
+    //__forceinline Vec3fx( const float x, const float y, const float z, const int      a) : x(x), y(y), z(z), a(a) {} // not working properly!
+    //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly!
+    __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {}
+    
+    //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {}
+    // Casts: the vint4 and Vec3ia casts go through _mm_cvtps_epi32, i.e. a numeric float -> int32 conversion rather than a bit reinterpretation.
+    __forceinline explicit operator const vfloat4() const { return vfloat4(m128); }
+    __forceinline explicit operator const   vint4() const { return vint4(_mm_cvtps_epi32(m128)); }
+    __forceinline explicit operator const  Vec2fa() const { return Vec2fa(m128); }
+    __forceinline explicit operator const  Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); }
+    
+    //__forceinline operator const __m128&() const { return m128; }
+    //__forceinline operator       __m128&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+    // load: aligned 4-float load (_mm_load_ps) with the fourth lane masked to zero.
+    static __forceinline Vec3fx load( const void* const a ) {
+      return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1))));
+    }
+    // loadu: unaligned 4-float load; unlike load(), the fourth lane is kept as read from memory.
+    static __forceinline Vec3fx loadu( const void* const a ) {
+      return Vec3fx(_mm_loadu_ps((float*)a));
+    }
+    // storeu: unaligned store of all four lanes, so four floats are written at ptr.
+    static __forceinline void storeu ( void* ptr, const Vec3fx& v ) {
+      _mm_storeu_ps((float*)ptr,v.m128);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    // Tag-type constructors; each fills all four lanes with the same value.
+    __forceinline Vec3fx( ZeroTy   ) : m128(_mm_setzero_ps()) {}
+    __forceinline Vec3fx( OneTy    ) : m128(_mm_set1_ps(1.0f)) {}
+    __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {}
+    __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    // Indexing covers x, y, z only (index < 3 is asserted) and addresses relative to x.
+    __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       float& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; }
+  __forceinline Vec3fx operator -( const Vec3fx& a ) { // negation by flipping the sign bit of every lane
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+    return _mm_xor_ps(a.m128, mask);
+  }
+  __forceinline Vec3fx abs  ( const Vec3fx& a ) { // absolute value by clearing the sign bit of every lane
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
+    return _mm_and_ps(a.m128, mask);
+  }
+  __forceinline Vec3fx sign ( const Vec3fx& a ) { // -1 in lanes where a < 0, +1 elsewhere (assuming blendv_ps takes its 2nd operand where the mask is set)
+    return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128));
+  }
+
+  __forceinline Vec3fx rcp  ( const Vec3fx& a )
+  { // Per-lane reciprocal: hardware estimate r refined once as res = r * (2 - r*a).
+#if defined(__AVX512VL__)
+    const Vec3fx r = _mm_rcp14_ps(a.m128);
+#else
+    const Vec3fx r = _mm_rcp_ps(a.m128);
+#endif
+    // With AVX2 the (2 - r*a) term is a single _mm_fnmadd_ps; otherwise it is a separate multiply and subtract.
+#if defined(__AVX2__)
+    const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f)));
+#else
+    const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128)));
+    //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
+#endif
+
+    return res;
+  }
+
+  __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); } // per-lane square root
+  __forceinline Vec3fx sqr  ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); } // per-lane a*a
+  // rsqrt: hardware estimate r refined once, evaluated as 1.5*r + (a * -0.5 * r) * (r*r).
+  __forceinline Vec3fx rsqrt( const Vec3fx& a )
+  {
+#if defined(__AVX512VL__)
+    __m128 r = _mm_rsqrt14_ps(a.m128);
+#else
+    __m128 r = _mm_rsqrt_ps(a.m128);
+#endif
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+  }
+
+  __forceinline Vec3fx zero_fix(const Vec3fx& a) { // lanes with |a| < min_rcp_input become +min_rcp_input (assumes blendv_ps picks its 2nd operand where the mask is set - TODO confirm)
+    return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input)));
+  }
+  __forceinline Vec3fx rcp_safe(const Vec3fx& a) { // rcp() of the zero_fix()-adjusted input
+    return rcp(zero_fix(a));
+  }
+  __forceinline Vec3fx log ( const Vec3fx& a ) { // logf on x, y, z; the 3-float constructor sets the fourth lane to 0
+    return Vec3fx(logf(a.x),logf(a.y),logf(a.z));
+  }
+  // exp: expf on x, y, z; the fourth lane of the result is 0.
+  __forceinline Vec3fx exp ( const Vec3fx& a ) {
+    return Vec3fx(expf(a.x),expf(a.y),expf(a.z));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); }
+  __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); }
+  __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); }
+  __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); }
+  __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; }
+  __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); }
+  __forceinline Vec3fx operator /( const Vec3fx& a, const float b        ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); }
+  __forceinline Vec3fx operator /( const        float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); }
+  // The operators above work on the whole 128-bit register, so the fourth (payload) lane is processed too; scalars are broadcast first.
+  __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); }
+  __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); }
+  // mini/maxi: min/max of the raw lane bits compared as signed 32-bit integers (_mm_min_epi32/_mm_max_epi32), not as floats.
+#if defined(__SSE4_1__) || defined(__aarch64__)
+    __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) {
+      const vint4 ai = _mm_castps_si128(a.m128); // bit reinterpretation, no numeric conversion
+      const vint4 bi = _mm_castps_si128(b.m128);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+
+#if defined(__SSE4_1__) || defined(__aarch64__)
+    __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) {
+      const vint4 ai = _mm_castps_si128(a.m128); // bit reinterpretation, no numeric conversion
+      const vint4 bi = _mm_castps_si128(b.m128);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#endif
+    // pow: powf with the shared exponent b on x, y, z; the fourth lane of the result is 0.
+    __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) {
+      return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b));
+    }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); }
+  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); }
+  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); }
+  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); }
+#else
+  __forceinline Vec3fx madd  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; } // unfused fallbacks: separate multiply and add/subtract
+  __forceinline Vec3fx msub  ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; }
+  __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;}
+  __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; }
+#endif
+  // Scalar-first overloads: broadcast a with Vec3fx(a) and forward to the vector versions.
+  __forceinline Vec3fx madd  ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); }
+  __forceinline Vec3fx msub  ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); }
+  __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); }
+  __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; } // each compound form stores the matching binary operator's result in a and returns a
+  __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; }
+  __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; }
+  __forceinline Vec3fx& operator *=( Vec3fx& a, const float   b ) { return a = a * b; }
+  __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; }
+  __forceinline Vec3fx& operator /=( Vec3fx& a, const float   b ) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float reduce_add(const Vec3fx& v) { // x + y + z; the fourth lane does not contribute to the returned lane 0
+    const vfloat4 a(v.m128);
+    const vfloat4 b = shuffle<1>(a); // presumably moves lane 1 (y) into lane 0 - verify against vfloat4 shuffle
+    const vfloat4 c = shuffle<2>(a); // presumably moves lane 2 (z) into lane 0
+    return _mm_cvtss_f32(a+b+c); // lane 0 of the sum
+  }
+  // Remaining reductions are plain scalar expressions over x, y, z.
+  __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; }
+  __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); }
+  __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } // "& 7" keeps the x,y,z bits, so the fourth (payload) lane is ignored
+  __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } // true if any of x,y,z differs
+  // The *_mask functions hand back the raw per-lane comparison result as a Vec3ba.
+  __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); }
+  __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); }
+  __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); }
+  __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); }
+  __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } // "not <="; NOTE(review): differs from a plain greater-than for NaN operands
+  __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } // "not <"; same NaN caveat
+  // isvalid: components strictly between -FLT_LARGE and +FLT_LARGE (which lanes all() inspects for a Vec3ba is defined outside this chunk).
+  __forceinline bool isvalid ( const Vec3fx& v ) {
+    return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE)));
+  }
+  // is_finite: components within [-FLT_MAX, +FLT_MAX].
+  __forceinline bool is_finite ( const Vec3fx& a ) {
+    return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX)));
+  }
+  // isvalid4 / is_finite4: same range tests, but performed through vfloat4 on the full register.
+  __forceinline bool isvalid4 ( const Vec3fx& v ) {
+    return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE)));
+  }
+
+  __forceinline bool is_finite4 ( const Vec3fx& a ) {
+    return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE4_1__)
+  __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
+    return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); // 0x7F: multiply lanes 0-2 only (high nibble 0x7), write the sum to every lane (low nibble 0xF)
+  }
+#else
+  __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) {
+    return reduce_add(a*b); // x,y,z products summed by reduce_add
+  }
+#endif
+  // cross: 3D cross product via lane rotations; presumably shuffle<1,2,0,3> yields (y,z,x,w) and msub(a0,b0,c) is a0*b0 - c - verify.
+  __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b )
+  {
+    vfloat4 a0 = vfloat4(a.m128);
+    vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128));
+    vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128));
+    vfloat4 b1 = vfloat4(b.m128);
+    return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1)));
+  }
+
+  __forceinline float  sqr_length ( const Vec3fx& a )                { return dot(a,a); } // squared length
+  __forceinline float  rcp_length ( const Vec3fx& a )                { return rsqrt(dot(a,a)); } // 1 / length, via the scalar rsqrt
+  __forceinline float  rcp_length2( const Vec3fx& a )                { return rcp(dot(a,a)); } // 1 / squared length
+  __forceinline float  length   ( const Vec3fx& a )                  { return sqrt(dot(a,a)); }
+  __forceinline Vec3fx normalize( const Vec3fx& a )                  { return a*rsqrt(dot(a,a)); } // no guard against a zero-length input; see normalize_safe
+  __forceinline float  distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); }
+  __forceinline float  halfArea ( const Vec3fx& d )                  { return madd(d.x,(d.y+d.z),d.y*d.z); } // x*(y+z) + y*z
+  __forceinline float  area     ( const Vec3fx& d )                  { return 2.0f*halfArea(d); }
+  // normalize_safe: like normalize, but hands back a unchanged when dot(a,a) is exactly 0.
+  __forceinline Vec3fx normalize_safe( const Vec3fx& a ) {
+    const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
+  }
+
+  /*! differentiated normalization: evaluates (pp*dp - pdp*p) * rcp(pp) * rsqrt(pp) with pp = dot(p,p), pdp = dot(p,dp) */
+  __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp)
+  {
+    const float pp  = dot(p,p);
+    const float pdp = dot(p,dp);
+    return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) { // whole-vector choice: t when s is true, otherwise f (assuming blendv_ps takes its 2nd operand where the mask is set)
+    __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); // 0 == 0 compare gives all-ones bits
+    return blendv_ps(f.m128, t.m128, mask);
+  }
+  // Per-lane choice between t and f driven by the mask s.
+  __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) {
+    return blendv_ps(f.m128, t.m128, s);
+  }
+  // lerp: (1-t)*v0 + t*v1, with the first product folded into madd.
+  __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) {
+    return madd(1.0f-t,v0,t*v1);
+  }
+
+  __forceinline int maxDim ( const Vec3fx& a )
+  {
+    // Axis index (0 = x, 1 = y, 2 = z) of the largest |component|; equal magnitudes favour the higher index.
+    const Vec3fx mag = abs(a);
+    const bool xBeatsY = mag.x > mag.y;
+    const float best = xBeatsY ? mag.x : mag.y;
+    if (best > mag.z) return xBeatsY ? 0 : 1;
+    return 2;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__SSE4_1__) && !defined(__aarch64__)
+  __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_ZERO       ); } // toward zero, so this path agrees with the truncf fallback below
+  __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF    ); } // round toward -infinity
+  __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF    ); } // round toward +infinity
+#else
+  __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); } // scalar fallback on x, y, z; fourth lane of the result is 0
+  __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); }
+  __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) { // prints "(x, y, z)"; the fourth (payload) lane is not printed
+    return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")";
+  }
+
+  // Vec3ff is a plain alias of Vec3fx.
+  typedef Vec3fx Vec3ff;
+}
diff --git a/thirdparty/embree-aarch64/common/math/vec3ia.h b/thirdparty/embree-aarch64/common/math/vec3ia.h
new file mode 100644
index 0000000000..737f67fd72
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec3ia.h
@@ -0,0 +1,210 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/alloc.h"
+#include "math.h"
+#include "../simd/sse.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// SSE Vec3ia Type
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Three 32-bit ints overlaid on a single __m128i. Only lanes 0..2 have
+   * names (x,y,z); the fourth lane has no member of its own. */
+  struct __aligned(16) Vec3ia
+  {
+    ALIGNED_STRUCT_(16); // NOTE(review): macro is not defined in this header (presumably from ../sys/alloc.h) - confirm what it injects
+
+    union {
+      __m128i m128;            // whole-register view
+      struct { int x,y,z; };   // scalar view of lanes 0, 1 and 2
+    };
+
+    typedef int Scalar;
+    enum { N = 3 };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    // the default constructor leaves the register uninitialized
+    __forceinline Vec3ia( ) {}
+    __forceinline Vec3ia( const __m128i a ) : m128(a) {}
+    __forceinline Vec3ia( const Vec3ia& other ) : m128(other.m128) {}
+    __forceinline Vec3ia& operator =(const Vec3ia& other) { m128 = other.m128; return *this; }
+
+    // splat one int over all four lanes
+    __forceinline explicit Vec3ia( const int a ) : m128(_mm_set1_epi32(a)) {}
+    // _mm_set_epi32 takes the highest lane first, so the unnamed fourth lane receives a copy of z
+    __forceinline          Vec3ia( const int x, const int y, const int z) : m128(_mm_set_epi32(z, z, y, x)) {}
+    // numeric float -> int conversion via _mm_cvtps_epi32 (not a bit reinterpretation)
+    __forceinline explicit Vec3ia( const __m128 a ) : m128(_mm_cvtps_epi32(a)) {}
+
+    // implicit access to the underlying register
+    __forceinline operator const __m128i&() const { return m128; }
+    __forceinline operator       __m128i&()       { return m128; }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3ia( ZeroTy   ) : m128(_mm_setzero_si128()) {}
+    __forceinline Vec3ia( OneTy    ) : m128(_mm_set1_epi32(1)) {}
+    // NOTE(review): relies on pos_inf / neg_inf converting to int; the resulting values are defined elsewhere - confirm
+    __forceinline Vec3ia( PosInfTy ) : m128(_mm_set1_epi32(pos_inf)) {}
+    __forceinline Vec3ia( NegInfTy ) : m128(_mm_set1_epi32(neg_inf)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    // indexes relative to the address of x; only 0..2 pass the assert
+    __forceinline const int& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; }
+    __forceinline       int& operator []( const size_t index )       { assert(index < 3); return (&x)[index]; }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Unary plus returns its argument unchanged; negation is computed per lane as 0 - a.
+  __forceinline Vec3ia operator +( const Vec3ia& a ) { return a; }
+  __forceinline Vec3ia operator -( const Vec3ia& a ) { return _mm_sub_epi32(_mm_setzero_si128(), a.m128); }
+  // abs is only emitted for aarch64 (vabsq_s32) and SSSE3 (_mm_abs_epi32) builds.
+  // NOTE(review): there is no #else branch, so other targets get no abs(Vec3ia) - confirm no caller needs it there.
+#if (defined(__aarch64__)) 
+  __forceinline Vec3ia abs       ( const Vec3ia& a ) { return vabsq_s32(a.m128); }
+#elif defined(__SSSE3__)
+  __forceinline Vec3ia abs       ( const Vec3ia& a ) { return _mm_abs_epi32(a.m128); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Lane-wise addition; the int forms splat the scalar over all lanes first.
+  __forceinline Vec3ia operator +( const Vec3ia& a, const Vec3ia& b ) {
+    return _mm_add_epi32(a.m128, b.m128);
+  }
+  __forceinline Vec3ia operator +( const Vec3ia& a, const int     b ) {
+    return _mm_add_epi32(a.m128, _mm_set1_epi32(b));
+  }
+  __forceinline Vec3ia operator +( const int     a, const Vec3ia& b ) {
+    return _mm_add_epi32(_mm_set1_epi32(a), b.m128);
+  }
+
+  // Lane-wise subtraction, same scalar handling as addition.
+  __forceinline Vec3ia operator -( const Vec3ia& a, const Vec3ia& b ) {
+    return _mm_sub_epi32(a.m128, b.m128);
+  }
+  __forceinline Vec3ia operator -( const Vec3ia& a, const int     b ) {
+    return _mm_sub_epi32(a.m128, _mm_set1_epi32(b));
+  }
+  __forceinline Vec3ia operator -( const int     a, const Vec3ia& b ) {
+    return _mm_sub_epi32(_mm_set1_epi32(a), b.m128);
+  }
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+  // Lane-wise multiplication keeping the low 32 bits of each product (_mm_mullo_epi32);
+  // only emitted for aarch64 and SSE4.1 builds.
+  __forceinline Vec3ia operator *( const Vec3ia& a, const Vec3ia& b ) {
+    return _mm_mullo_epi32(a.m128, b.m128);
+  }
+  __forceinline Vec3ia operator *( const Vec3ia& a, const int     b ) {
+    return _mm_mullo_epi32(a.m128, _mm_set1_epi32(b));
+  }
+  __forceinline Vec3ia operator *( const int     a, const Vec3ia& b ) {
+    return _mm_mullo_epi32(_mm_set1_epi32(a), b.m128);
+  }
+#endif
+
+  // Bitwise AND; the int forms splat the scalar over all lanes first.
+  __forceinline Vec3ia operator &( const Vec3ia& a, const Vec3ia& b ) {
+    return _mm_and_si128(a.m128, b.m128);
+  }
+  __forceinline Vec3ia operator &( const Vec3ia& a, const int     b ) {
+    return _mm_and_si128(a.m128, _mm_set1_epi32(b));
+  }
+  __forceinline Vec3ia operator &( const int     a, const Vec3ia& b ) {
+    return _mm_and_si128(_mm_set1_epi32(a), b.m128);
+  }
+
+  // Bitwise OR.
+  __forceinline Vec3ia operator |( const Vec3ia& a, const Vec3ia& b ) {
+    return _mm_or_si128(a.m128, b.m128);
+  }
+  __forceinline Vec3ia operator |( const Vec3ia& a, const int     b ) {
+    return _mm_or_si128(a.m128, _mm_set1_epi32(b));
+  }
+  __forceinline Vec3ia operator |( const int     a, const Vec3ia& b ) {
+    return _mm_or_si128(_mm_set1_epi32(a), b.m128);
+  }
+
+  // Bitwise XOR.
+  __forceinline Vec3ia operator ^( const Vec3ia& a, const Vec3ia& b ) {
+    return _mm_xor_si128(a.m128, b.m128);
+  }
+  __forceinline Vec3ia operator ^( const Vec3ia& a, const int     b ) {
+    return _mm_xor_si128(a.m128, _mm_set1_epi32(b));
+  }
+  __forceinline Vec3ia operator ^( const int     a, const Vec3ia& b ) {
+    return _mm_xor_si128(_mm_set1_epi32(a), b.m128);
+  }
+
+#if !defined(__ARM_NEON)
+  // Shifts by a run-time count; this whole group is compiled out when __ARM_NEON is defined.
+  // << is a logical left shift, >> an arithmetic (sign-propagating) right shift.
+  __forceinline Vec3ia operator <<( const Vec3ia& a, const int n ) {
+    return _mm_slli_epi32(a.m128, n);
+  }
+  __forceinline Vec3ia operator >>( const Vec3ia& a, const int n ) {
+    return _mm_srai_epi32(a.m128, n);
+  }
+
+  // Named forms: sll / sra are the operators above, srl is the logical right shift.
+  __forceinline Vec3ia sll ( const Vec3ia& a, const int b ) { return a << b; }
+  __forceinline Vec3ia sra ( const Vec3ia& a, const int b ) { return a >> b; }
+  __forceinline Vec3ia srl ( const Vec3ia& a, const int b ) {
+    return _mm_srli_epi32(a.m128, b);
+  }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Compound assignment: each form evaluates the matching binary operator,
+  // stores the result back into a and returns a by reference.
+  __forceinline Vec3ia& operator +=( Vec3ia& a, const Vec3ia& b ) { a = a + b; return a; }
+  __forceinline Vec3ia& operator +=( Vec3ia& a, const int&   b ) { a = a + b; return a; }
+
+  __forceinline Vec3ia& operator -=( Vec3ia& a, const Vec3ia& b ) { a = a - b; return a; }
+  __forceinline Vec3ia& operator -=( Vec3ia& a, const int&   b ) { a = a - b; return a; }
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+  // available under the same condition as operator *
+  __forceinline Vec3ia& operator *=( Vec3ia& a, const Vec3ia& b ) { a = a * b; return a; }
+  __forceinline Vec3ia& operator *=( Vec3ia& a, const int&    b ) { a = a * b; return a; }
+#endif
+
+  __forceinline Vec3ia& operator &=( Vec3ia& a, const Vec3ia& b ) { a = a & b; return a; }
+  __forceinline Vec3ia& operator &=( Vec3ia& a, const int&    b ) { a = a & b; return a; }
+
+  __forceinline Vec3ia& operator |=( Vec3ia& a, const Vec3ia& b ) { a = a | b; return a; }
+  __forceinline Vec3ia& operator |=( Vec3ia& a, const int&    b ) { a = a | b; return a; }
+
+#if !defined(__ARM_NEON)
+  // available under the same condition as the shift operators
+  __forceinline Vec3ia& operator <<=( Vec3ia& a, const int& b ) { a = a << b; return a; }
+  __forceinline Vec3ia& operator >>=( Vec3ia& a, const int& b ) { a = a >> b; return a; }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+  // Horizontal reductions over the three named lanes x, y, z.
+#if defined(__aarch64__)
+  // aarch64: clear lane 3, then add all four lanes with vaddvq_s32
+  __forceinline int reduce_add(const Vec3ia& v) {
+    int32x4_t t = v.m128;
+    t[3] = 0;
+    return vaddvq_s32(t);
+        
+  }
+  __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
+  // NOTE(review): v0x7fffffff / vFFF0 are defined elsewhere; presumably the blend keeps
+  // lanes 0..2 of v and puts INT_MAX in lane 3 so that it cannot win the minimum - confirm
+  __forceinline int reduce_min(const Vec3ia& v) {
+    int32x4_t t = (__m128i)blendv_ps((__m128)v0x7fffffff, (__m128)v.m128, (__m128)vFFF0);
+    return vminvq_s32(t);
+        
+  }
+  // NOTE(review): same pattern with v0x80000000 (presumably INT_MIN) as the lane-3 filler - confirm
+  __forceinline int reduce_max(const Vec3ia& v) {
+    int32x4_t t = (__m128i)blendv_ps((__m128)v0x80000000, (__m128)v.m128, (__m128)vFFF0);
+    return vmaxvq_s32(t);
+        
+  }
+#else
+  // other targets: plain scalar arithmetic on x, y, z
+  __forceinline int reduce_add(const Vec3ia& v) { return v.x+v.y+v.z; }
+  __forceinline int reduce_mul(const Vec3ia& v) { return v.x*v.y*v.z; }
+  __forceinline int reduce_min(const Vec3ia& v) { return min(v.x,v.y,v.z); }
+  __forceinline int reduce_max(const Vec3ia& v) { return max(v.x,v.y,v.z); }
+#endif
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Equality looks only at lanes 0..2 (movemask bits 0..2); the fourth lane is ignored.
+  __forceinline bool operator ==( const Vec3ia& a, const Vec3ia& b ) {
+    const __m128i sameLanes = _mm_cmpeq_epi32(a.m128, b.m128);
+    const int bits = _mm_movemask_ps(_mm_castsi128_ps(sameLanes));
+    return (bits & 7) == 7;
+  }
+  __forceinline bool operator !=( const Vec3ia& a, const Vec3ia& b ) {
+    const __m128i sameLanes = _mm_cmpeq_epi32(a.m128, b.m128);
+    const int bits = _mm_movemask_ps(_mm_castsi128_ps(sameLanes));
+    return (bits & 7) != 7;
+  }
+  // Lexicographic ordering: x decides first, then y, then z.
+  __forceinline bool operator < ( const Vec3ia& a, const Vec3ia& b ) {
+    return (a.x != b.x) ? (a.x < b.x)
+         : (a.y != b.y) ? (a.y < b.y)
+         :                (a.z < b.z);
+  }
+
+  // Per-lane comparison results reinterpreted as a float-register mask.
+  __forceinline Vec3ba eq_mask( const Vec3ia& a, const Vec3ia& b ) {
+    const __m128i lanes = _mm_cmpeq_epi32 (a.m128, b.m128);
+    return _mm_castsi128_ps(lanes);
+  }
+  __forceinline Vec3ba lt_mask( const Vec3ia& a, const Vec3ia& b ) {
+    const __m128i lanes = _mm_cmplt_epi32 (a.m128, b.m128);
+    return _mm_castsi128_ps(lanes);
+  }
+  __forceinline Vec3ba gt_mask( const Vec3ia& a, const Vec3ia& b ) {
+    const __m128i lanes = _mm_cmpgt_epi32 (a.m128, b.m128);
+    return _mm_castsi128_ps(lanes);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Picks t or f per lane under the mask m: a float blend where available,
+   * otherwise the (m & t) | (~m & f) bit formula. */
+  __forceinline Vec3ia select( const Vec3ba& m, const Vec3ia& t, const Vec3ia& f ) {
+#if defined(__aarch64__) || defined(__SSE4_1__)
+    const __m128 blended = _mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m);
+    return _mm_castps_si128(blended);
+#else
+    const __m128i keptFromT = _mm_and_si128(_mm_castps_si128(m), t);
+    const __m128i keptFromF = _mm_andnot_si128(_mm_castps_si128(m), f);
+    return _mm_or_si128(keptFromT, keptFromF);
+#endif
+  }
+
+  // Lane-wise signed minimum / maximum.
+#if defined(__aarch64__) || defined(__SSE4_1__)
+  // direct intrinsics
+  __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return _mm_min_epi32(a.m128,b.m128); }
+  __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return _mm_max_epi32(a.m128,b.m128); }
+#else
+  // fallback: compare, then select a where the comparison holds and b elsewhere
+  __forceinline Vec3ia min( const Vec3ia& a, const Vec3ia& b ) { return select(lt_mask(a,b),a,b); }
+  __forceinline Vec3ia max( const Vec3ia& a, const Vec3ia& b ) { return select(gt_mask(a,b),a,b); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Writes the vector as "(x, y, z)" and returns the stream. */
+  __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3ia& a)
+  {
+    return cout << "("
+                << a.x << ", "
+                << a.y << ", "
+                << a.z << ")";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/math/vec4.h b/thirdparty/embree-aarch64/common/math/vec4.h
new file mode 100644
index 0000000000..d16542f507
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/math/vec4.h
@@ -0,0 +1,258 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "math.h"
+#include "vec3.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Generic 4D vector Class
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Generic four-component vector. The components are reachable by name
+   * (x,y,z,w) and, except on the VS 2013 workaround path, through the
+   * overlaid array components[N]. */
+  template<typename T> struct Vec4
+  {
+    enum { N = 4 };    
+    union {
+      struct { T x, y, z, w; };
+#if !(defined(__WIN32__) && _MSC_VER == 1800) // workaround for older VS 2013 compiler
+      T components[N];
+#endif
+    };
+
+    typedef T Scalar;
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Construction
+    ////////////////////////////////////////////////////////////////////////////////
+
+    // the default constructor leaves all components uninitialized
+    __forceinline Vec4( ) {}
+    // same value in every component
+    __forceinline explicit Vec4( const T& a                                     ) : x(a), y(a), z(a), w(a) {}
+    __forceinline          Vec4( const T& x, const T& y, const T& z, const T& w ) : x(x), y(y), z(z), w(w) {}
+    // x,y,z taken from a Vec3, w supplied separately
+    __forceinline          Vec4( const Vec3<T>& xyz, const T& w ) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {}
+
+    __forceinline Vec4( const Vec4& other ) { x = other.x; y = other.y; z = other.z; w = other.w; }
+    // declared only; explicit specializations for particular T follow at the end of this header
+    __forceinline Vec4( const Vec3fx& other );
+
+    // converting copy / assignment from a Vec4 with a different component type
+    template<typename T1> __forceinline Vec4( const Vec4<T1>& a ) : x(T(a.x)), y(T(a.y)), z(T(a.z)), w(T(a.w)) {}
+    template<typename T1> __forceinline Vec4& operator =(const Vec4<T1>& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; }
+
+    __forceinline Vec4& operator =(const Vec4& other) { x = other.x; y = other.y; z = other.z; w = other.w; return *this; }
+
+    // implicit conversion to Vec3 drops w
+    __forceinline operator Vec3<T> () const { return Vec3<T>(x,y,z); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec4( ZeroTy   ) : x(zero), y(zero), z(zero), w(zero) {}
+    __forceinline Vec4( OneTy    ) : x(one),  y(one),  z(one),  w(one) {}
+    __forceinline Vec4( PosInfTy ) : x(pos_inf), y(pos_inf), z(pos_inf), w(pos_inf) {}
+    __forceinline Vec4( NegInfTy ) : x(neg_inf), y(neg_inf), z(neg_inf), w(neg_inf) {}
+
+    // component access by index; asserts axis < 4. The VS 2013 path indexes from &x
+    // because the components array is not declared there.
+#if defined(__WIN32__) && (_MSC_VER == 1800) // workaround for older VS 2013 compiler
+	__forceinline const T& operator [](const size_t axis) const { assert(axis < 4); return (&x)[axis]; }
+	__forceinline       T& operator [](const size_t axis)       { assert(axis < 4); return (&x)[axis]; }
+#else
+	__forceinline const T& operator [](const size_t axis ) const { assert(axis < 4); return components[axis]; }
+	__forceinline       T& operator [](const size_t axis)        { assert(axis < 4); return components[axis]; }
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Swizzles
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Vec3<T> xyz() const { return Vec3<T>(x, y, z); }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Each function applies the same-named operation on T to x, y, z and w independently.
+  template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a ) { return Vec4<T>(+a.x, +a.y, +a.z, +a.w); }
+  template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a ) { return Vec4<T>(-a.x, -a.y, -a.z, -a.w); }
+  template<typename T> __forceinline Vec4<T> abs       ( const Vec4<T>& a ) { return Vec4<T>(abs  (a.x), abs  (a.y), abs  (a.z), abs  (a.w)); }
+  template<typename T> __forceinline Vec4<T> rcp       ( const Vec4<T>& a ) { return Vec4<T>(rcp  (a.x), rcp  (a.y), rcp  (a.z), rcp  (a.w)); }
+  template<typename T> __forceinline Vec4<T> rsqrt     ( const Vec4<T>& a ) { return Vec4<T>(rsqrt(a.x), rsqrt(a.y), rsqrt(a.z), rsqrt(a.w)); }
+  template<typename T> __forceinline Vec4<T> sqrt      ( const Vec4<T>& a ) { return Vec4<T>(sqrt (a.x), sqrt (a.y), sqrt (a.z), sqrt (a.w)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Component-wise arithmetic. Addition and subtraction exist only in vector/vector form;
+  // multiplication and division also accept a scalar T on either side.
+  template<typename T> __forceinline Vec4<T> operator +( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
+  template<typename T> __forceinline Vec4<T> operator -( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+  template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+  template<typename T> __forceinline Vec4<T> operator *( const       T& a, const Vec4<T>& b ) { return Vec4<T>(a   * b.x, a   * b.y, a   * b.z, a   * b.w); }
+  template<typename T> __forceinline Vec4<T> operator *( const Vec4<T>& a, const       T& b ) { return Vec4<T>(a.x * b  , a.y * b  , a.z * b  , a.w * b  ); }
+  template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const Vec4<T>& b ) { return Vec4<T>(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); }
+  template<typename T> __forceinline Vec4<T> operator /( const Vec4<T>& a, const       T& b ) { return Vec4<T>(a.x / b  , a.y / b  , a.z / b  , a.w / b  ); }
+  template<typename T> __forceinline Vec4<T> operator /( const       T& a, const Vec4<T>& b ) { return Vec4<T>(a   / b.x, a   / b.y, a   / b.z, a   / b.w); }
+
+  // component-wise minimum / maximum, delegating to min / max on T
+  template<typename T> __forceinline Vec4<T> min(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); }
+  template<typename T> __forceinline Vec4<T> max(const Vec4<T>& a, const Vec4<T>& b) { return Vec4<T>(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Component-wise forwarding to the madd / msub / nmadd / nmsub overloads on T.
+  // NOTE(review): the exact sign conventions of those scalar helpers are defined elsewhere - confirm there.
+  template<typename T> __forceinline Vec4<T> madd  ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a.x,b.x,c.x), madd(a.y,b.y,c.y), madd(a.z,b.z,c.z), madd(a.w,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> msub  ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a.x,b.x,c.x), msub(a.y,b.y,c.y), msub(a.z,b.z,c.z), msub(a.w,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> nmadd ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a.x,b.x,c.x),nmadd(a.y,b.y,c.y),nmadd(a.z,b.z,c.z),nmadd(a.w,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> nmsub ( const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a.x,b.x,c.x),nmsub(a.y,b.y,c.y),nmsub(a.z,b.z,c.z),nmsub(a.w,b.w,c.w)); }
+
+  // same operations with a single scalar a shared by all four components
+  template<typename T> __forceinline Vec4<T> madd  ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( madd(a,b.x,c.x), madd(a,b.y,c.y), madd(a,b.z,c.z), madd(a,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> msub  ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>( msub(a,b.x,c.x), msub(a,b.y,c.y), msub(a,b.z,c.z), msub(a,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> nmadd ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmadd(a,b.x,c.x),nmadd(a,b.y,c.y),nmadd(a,b.z,c.z),nmadd(a,b.w,c.w)); }
+  template<typename T> __forceinline Vec4<T> nmsub ( const T& a, const Vec4<T>& b, const Vec4<T>& c) { return Vec4<T>(nmsub(a,b.x,c.x),nmsub(a,b.y,c.y),nmsub(a,b.z,c.z),nmsub(a,b.w,c.w)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // In-place updates returning a by reference: += and -= take a vector operand,
+  // *= and /= take a scalar applied to every component.
+  template<typename T> __forceinline Vec4<T>& operator +=( Vec4<T>& a, const Vec4<T>& b ) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; }
+  template<typename T> __forceinline Vec4<T>& operator -=( Vec4<T>& a, const Vec4<T>& b ) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; }
+  template<typename T> __forceinline Vec4<T>& operator *=( Vec4<T>& a, const       T& b ) { a.x *= b  ; a.y *= b  ; a.z *= b  ; a.w *= b  ; return a; }
+  template<typename T> __forceinline Vec4<T>& operator /=( Vec4<T>& a, const       T& b ) { a.x /= b  ; a.y /= b  ; a.z /= b  ; a.w /= b  ; return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Fold the four components into a single T (sum, product, minimum, maximum).
+  // reduce_min / reduce_max rely on four-argument min / max overloads for T declared elsewhere.
+  template<typename T> __forceinline T reduce_add( const Vec4<T>& a ) { return a.x + a.y + a.z + a.w; }
+  template<typename T> __forceinline T reduce_mul( const Vec4<T>& a ) { return a.x * a.y * a.z * a.w; }
+  template<typename T> __forceinline T reduce_min( const Vec4<T>& a ) { return min(a.x, a.y, a.z, a.w); }
+  template<typename T> __forceinline T reduce_max( const Vec4<T>& a ) { return max(a.x, a.y, a.z, a.w); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Equality requires all four components to match; inequality holds as soon as one differs.
+  template<typename T> __forceinline bool operator ==( const Vec4<T>& a, const Vec4<T>& b ) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; }
+  template<typename T> __forceinline bool operator !=( const Vec4<T>& a, const Vec4<T>& b ) { return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; }
+  // Lexicographic ordering: the first differing component (x, then y, z, w) decides;
+  // fully equal vectors compare as not-less.
+  template<typename T> __forceinline bool operator < ( const Vec4<T>& a, const Vec4<T>& b ) {
+    if (a.x != b.x) return a.x < b.x;
+    if (a.y != b.y) return a.y < b.y;
+    if (a.z != b.z) return a.z < b.z;
+    if (a.w != b.w) return a.w < b.w;
+    return false;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Shift Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Applies shift_right_1 (an overload on T declared elsewhere) to each component.
+  template<typename T> __forceinline Vec4<T> shift_right_1( const Vec4<T>& a ) {
+    return Vec4<T>(shift_right_1(a.x),shift_right_1(a.y),shift_right_1(a.z),shift_right_1(a.w));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Four-component dot product, accumulated from the w term outwards with nested madd calls.
+  template<typename T> __forceinline T dot( const Vec4<T>& a, const Vec4<T>& b ) {
+    return madd(a.x,b.x,
+           madd(a.y,b.y,
+           madd(a.z,b.z,
+                a.w*b.w)));
+  }
+
+  // Length is the square root of the self dot product.
+  template<typename T> __forceinline T length( const Vec4<T>& a ) {
+    return sqrt(dot(a,a));
+  }
+  // Scales a by the reciprocal square root of its self dot product.
+  template<typename T> __forceinline Vec4<T> normalize( const Vec4<T>& a ) {
+    return a*rsqrt(dot(a,a));
+  }
+  // Distance is the length of the difference vector.
+  template<typename T> __forceinline T distance( const Vec4<T>& a, const Vec4<T>& b ) {
+    return length(a-b);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // One bool decides for all four components.
+  template<typename T> __forceinline Vec4<T> select ( bool s, const Vec4<T>& t, const Vec4<T>& f ) {
+    return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w));
+  }
+
+  // A separate bool per component.
+  template<typename T> __forceinline Vec4<T> select ( const Vec4<bool>& s, const Vec4<T>& t, const Vec4<T>& f ) {
+    return Vec4<T>(select(s.x,t.x,f.x),select(s.y,t.y,f.y),select(s.z,t.z,f.z),select(s.w,t.w,f.w));
+  }
+
+  // For component types that expose a nested Bool type: the same mask s is passed to the
+  // select on T for each of the four components.
+  template<typename T> __forceinline Vec4<T> select ( const typename T::Bool& s, const Vec4<T>& t, const Vec4<T>& f ) {
+    return Vec4<T>(select(s,t.x,f.x),select(s,t.y,f.y),select(s,t.z,f.z),select(s,t.w,f.w));
+  }
+
+  // Linear blend of v0 and v1 with weight t: (1-t) is splatted into a Vec4 and fed to madd with t*v1.
+  template<typename T>
+    __forceinline Vec4<T> lerp(const Vec4<T>& v0, const Vec4<T>& v1, const T& t)
+  {
+    const Vec4<T> weight0(T(1.0f)-t);
+    return madd(weight0,v0,t*v1);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // Writes the vector as "(x, y, z, w)" and returns the stream.
+  template<typename T> __forceinline embree_ostream operator<<(embree_ostream cout, const Vec4<T>& a)
+  {
+    return cout << "("
+                << a.x << ", "
+                << a.y << ", "
+                << a.z << ", "
+                << a.w << ")";
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Default template instantiations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  typedef Vec4<bool         > Vec4b;
+  typedef Vec4<uint8_t      > Vec4uc;
+  typedef Vec4<int          > Vec4i;
+  typedef Vec4<float        > Vec4f;
+}
+
+#include "vec3ba.h"
+#include "vec3ia.h"
+#include "vec3fa.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// SSE / AVX / MIC specializations
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+#include "../simd/sse.h"
+#endif
+
+#if defined __AVX__
+#include "../simd/avx.h"
+#endif
+
+#if defined __AVX512F__
+#include "../simd/avx512.h"
+#endif
+
+namespace embree
+{
+  // Definitions of the Vec4(const Vec3fx&) constructor declared in the class template.
+  // float: copy the four scalar members of the Vec3fx, including w.
+  template<> __forceinline Vec4<float>::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; }
+
+  // vfloat4: each Vec4 component is assigned from one scalar member of a.
+  // NOTE(review): both branches presumably replicate that scalar over all lanes of the
+  // component (scalar assignment vs. shuffle<k,k,k,k>) - confirm against vfloat4.
+#if defined(__AVX__)
+  template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
+    x = a.x; y = a.y; z = a.z; w = a.w;
+  }
+#elif defined(__SSE__) || defined(__ARM_NEON)
+  template<> __forceinline Vec4<vfloat4>::Vec4( const Vec3fx& a ) {
+    const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v);
+  }
+#endif
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+  // Builds a Vec4<vfloat4> from element k of each component of a, passing the address of
+  // that element to vfloat4::broadcast.
+  // NOTE(review): broadcast presumably replicates the pointed-to float over all lanes - confirm.
+  __forceinline Vec4<vfloat4> broadcast4f( const Vec4<vfloat4>& a, const size_t k ) {
+    return Vec4<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k]));
+  }
+#endif
+
+#if defined(__AVX__)
+  // vfloat8 constructor specialization: each component is assigned from one scalar member of a.
+  template<> __forceinline Vec4<vfloat8>::Vec4( const Vec3fx& a ) {
+    x = a.x; y = a.y; z = a.z; w = a.w;
+  }
+  // The broadcast helpers below take element k of every component of a and hand its address
+  // to vfloat4::broadcast (4-wide result) or vfloat8::broadcast (8-wide result).
+  __forceinline Vec4<vfloat4> broadcast4f( const Vec4<vfloat8>& a, const size_t k ) {
+    return Vec4<vfloat4>(vfloat4::broadcast(&a.x[k]), vfloat4::broadcast(&a.y[k]), vfloat4::broadcast(&a.z[k]), vfloat4::broadcast(&a.w[k]));
+  }
+  __forceinline Vec4<vfloat8> broadcast8f( const Vec4<vfloat4>& a, const size_t k ) {
+    return Vec4<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k]));
+  }
+  __forceinline Vec4<vfloat8> broadcast8f( const Vec4<vfloat8>& a, const size_t k ) {
+    return Vec4<vfloat8>(vfloat8::broadcast(&a.x[k]), vfloat8::broadcast(&a.y[k]), vfloat8::broadcast(&a.z[k]), vfloat8::broadcast(&a.w[k]));
+  }
+#endif
+
+#if defined(__AVX512F__)
+  // vfloat16 constructor specialization: components are initialized from the scalar members of a.
+  template<> __forceinline Vec4<vfloat16>::Vec4( const Vec3fx& a ) : x(a.x), y(a.y), z(a.z), w(a.w) {}
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/simd/avx.h b/thirdparty/embree-aarch64/common/simd/avx.h
new file mode 100644
index 0000000000..c840e41805
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/avx.h
@@ -0,0 +1,34 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "sse.h"
+
+#if defined(__AVX512VL__)
+#include "vboolf8_avx512.h"
+#include "vboold4_avx512.h"
+#else
+#include "vboolf8_avx.h"
+#include "vboold4_avx.h"
+#endif
+
+#if defined(__AVX2__)
+#include "vint8_avx2.h"
+#include "vuint8_avx2.h"
+#if defined(__X86_64__)
+#include "vllong4_avx2.h"
+#endif
+#else
+#include "vint8_avx.h"
+#include "vuint8_avx.h"
+#endif
+#include "vfloat8_avx.h"
+#if defined(__X86_64__)
+#include "vdouble4_avx.h"
+#endif
+
+#if defined(__AVX512F__)
+#include "avx512.h"
+#endif
+
diff --git a/thirdparty/embree-aarch64/common/simd/avx512.h b/thirdparty/embree-aarch64/common/simd/avx512.h
new file mode 100644
index 0000000000..25414ab5b1
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/avx512.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "../math/constants.h"
+#include "../sys/alloc.h"
+#include "varying.h"
+
+#include "vboolf16_avx512.h"
+#include "vint16_avx512.h"
+#include "vuint16_avx512.h"
+#include "vfloat16_avx512.h"
+
+#include "vboold8_avx512.h"
+#include "vllong8_avx512.h"
+#include "vdouble8_avx512.h"
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Prefetching
+  ////////////////////////////////////////////////////////////////////////////////
+
+#define PFHINT_L1   0
+#define PFHINT_L2   1
+#define PFHINT_NT   2
+
+  /* Issues one _mm_prefetch for address m; the compile-time mode picks the hint:
+   * PFHINT_L1 -> _MM_HINT_T0, PFHINT_L2 -> _MM_HINT_T1, PFHINT_NT -> _MM_HINT_NTA.
+   * Any other mode does nothing. */
+  template<const unsigned int mode>
+    __forceinline void prefetch(const void * __restrict__ const m)
+  {
+    switch (mode)
+    {
+    case PFHINT_L1: _mm_prefetch((const char*)m,_MM_HINT_T0);  break;
+    case PFHINT_L2: _mm_prefetch((const char*)m,_MM_HINT_T1);  break;
+    case PFHINT_NT: _mm_prefetch((const char*)m,_MM_HINT_NTA); break;
+    default: break;
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/simd.h b/thirdparty/embree-aarch64/common/simd/simd.h
new file mode 100644
index 0000000000..647851110b
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/simd.h
@@ -0,0 +1,110 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/math.h"
+
+/* include SSE wrapper classes */
+#if defined(__SSE__) || defined(__ARM_NEON)
+#  include "sse.h"
+#endif
+
+/* include AVX wrapper classes */
+#if defined(__AVX__)
+#  include "avx.h"
+#endif
+
+/* include AVX512 wrapper classes */
+#if defined (__AVX512F__)
+#  include "avx512.h"
+#endif
+
+namespace embree
+{
+  /* Lane mask that is set where -FLT_MAX <= v <= FLT_MAX. */
+  template <int N>
+  __forceinline vbool<N> isfinite(const vfloat<N>& v)
+  {
+    const float limit = std::numeric_limits<float>::max();
+    return (v >= vfloat<N>(-limit)) & (v <= vfloat<N>(limit));
+  }
+  
+  /* foreach unique */
+  /* Groups the active lanes of vi by value: repeatedly takes the value held by the
+   * lowest still-pending lane, gathers every pending lane with that value, retires
+   * them and calls closure(group_mask, value). */
+  template<typename vbool, typename vint, typename Closure>
+  __forceinline void foreach_unique(const vbool& valid0, const vint& vi, const Closure& closure)
+  {
+    for (vbool pending = valid0; any(pending); )
+    {
+      const int lane  = int(bsf(movemask(pending)));
+      const int value = vi[lane];
+      const vbool group = pending & (value == vi);
+      pending = andn(pending, group);
+      closure(group, value);
+    }
+  }
+
+  /* returns the next unique value i in vi and the corresponding valid_i mask */
+  /* Single step of the unique-value iteration: takes the value of the lowest active
+   * lane, writes the mask of all active lanes sharing it to valid_i, removes those
+   * lanes from valid and returns the value. Requires at least one active lane. */
+  template<typename vbool, typename vint>
+  __forceinline int next_unique(vbool& valid, const vint& vi, /*out*/ vbool& valid_i)
+  {
+    assert(any(valid));
+    const int lane  = int(bsf(movemask(valid)));
+    const int value = vi[lane];
+    valid_i = valid & (value == vi);
+    valid   = andn(valid, valid_i);
+    return value;
+  }
+
+  /* foreach unique index */
+  /* Same grouping as foreach_unique, but the closure additionally receives the index
+   * of the lane the value was taken from: closure(group_mask, value, lane). */
+  template<typename vbool, typename vint, typename Closure>
+  __forceinline void foreach_unique_index(const vbool& valid0, const vint& vi, const Closure& closure)
+  {
+    for (vbool pending = valid0; any(pending); )
+    {
+      const int lane  = int(bsf(movemask(pending)));
+      const int value = vi[lane];
+      const vbool group = pending & (value == vi);
+      pending = andn(pending, group);
+      closure(group, value, lane);
+    }
+  }
+
+  /* returns the index of the next unique value i in vi and the corresponding valid_i mask */
+  /* Like next_unique, but returns the index of the lowest active lane instead of the
+   * value stored there. valid_i receives the lanes sharing that lane's value and those
+   * lanes are removed from valid. Requires at least one active lane. */
+  template<typename vbool, typename vint>
+  __forceinline int next_unique_index(vbool& valid, const vint& vi, /*out*/ vbool& valid_i)
+  {
+    assert(any(valid));
+    const int lane  = int(bsf(movemask(valid)));
+    const int value = vi[lane];
+    valid_i = valid & (value == vi);
+    valid   = andn(valid, valid_i);
+    return lane;
+  }
+
+  /* Visits the integer grid [x0,x1) x [y0,y1) row by row and hands the coordinates to
+   * closure(valid, vx, vy) in packets of VSIZEX lanes. Coordinates are staged in U (x)
+   * and V (y) so that the end of one row and the start of the next can share a packet. */
+  template<typename Closure>
+  __forceinline void foreach2(int x0, int x1, int y0, int y1, const Closure& closure)
+  {
+    // staging buffers are twice the packet width because a full-width store may start at any index < VSIZEX
+    __aligned(64) int U[2*VSIZEX];
+    __aligned(64) int V[2*VSIZEX];
+    int index = 0; // number of staged coordinates
+    for (int y=y0; y<y1; y++) {
+      const bool lasty = y+1>=y1;
+      const vintx vy = y;
+      for (int x=x0; x<x1; ) { //x+=VSIZEX) {
+        const bool lastx = x+VSIZEX >= x1;
+        // NOTE(review): vintx(step) presumably holds 0,1,2,... so vx is x, x+1, ... - confirm
+        vintx vx = x+vintx(step);
+        vintx::storeu(&U[index], vx);
+        vintx::storeu(&V[index], vy);
+        // a full vector is stored, but only the entries that lie inside the row are counted
+        const int dx = min(x1-x,VSIZEX);
+        index += dx;
+        x += dx;
+        // flush once a full packet is staged, or at the very last chunk of the last row
+        if (index >= VSIZEX || (lastx && lasty)) {
+          const vboolx valid = vintx(step) < vintx(index);
+          closure(valid, vintx::load(U), vintx::load(V));
+          // only the first VSIZEX staged entries were consumed: step x back by the surplus
+          // so those coordinates are staged again on the next iteration
+          x-= max(0, index-VSIZEX);
+          index = 0;
+        }
+      }
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/sse.cpp b/thirdparty/embree-aarch64/common/simd/sse.cpp
new file mode 100644
index 0000000000..1732cfa421
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/sse.cpp
@@ -0,0 +1,34 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sse.h"
+
+namespace embree 
+{
+  const __m128 mm_lookupmask_ps[16] = { // entry k: 32-bit lane j is all ones iff bit j of k is set (_mm_set_epi32 lists lanes from high to low)
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0, 0)),
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0, 0,-1)),
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1, 0)),
+    _mm_castsi128_ps(_mm_set_epi32( 0, 0,-1,-1)),
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0, 0)),
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1, 0,-1)),
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1, 0)),
+    _mm_castsi128_ps(_mm_set_epi32( 0,-1,-1,-1)),
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0, 0)),
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0, 0,-1)),
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1, 0)),
+    _mm_castsi128_ps(_mm_set_epi32(-1, 0,-1,-1)),
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0, 0)),
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1, 0,-1)),
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1, 0)),
+    _mm_castsi128_ps(_mm_set_epi32(-1,-1,-1,-1))
+  };
+
+  const __m128d mm_lookupmask_pd[4] = { // entry k: 64-bit lane j is all ones iff bit j of k is set; only indices 0..3 exist
+    _mm_castsi128_pd(_mm_set_epi32( 0, 0, 0, 0)),
+    _mm_castsi128_pd(_mm_set_epi32( 0, 0,-1,-1)),
+    _mm_castsi128_pd(_mm_set_epi32(-1,-1, 0, 0)),
+    _mm_castsi128_pd(_mm_set_epi32(-1,-1,-1,-1))
+  };
+
+}
diff --git a/thirdparty/embree-aarch64/common/simd/sse.h b/thirdparty/embree-aarch64/common/simd/sse.h
new file mode 100644
index 0000000000..6bc818b55b
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/sse.h
@@ -0,0 +1,35 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/intrinsics.h"
+#include "../sys/alloc.h"
+#include "../math/constants.h"
+#include "varying.h"
+
+namespace embree 
+{
+#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__)
+  __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { // per-lane select: t where mask is set, f elsewhere (native blend, keyed on each lane's sign bit)
+    return _mm_blendv_ps(f,t,mask);
+  }
+#else
+  __forceinline __m128 blendv_ps(__m128 f, __m128 t, __m128 mask) { // bitwise fallback (mask & t) | (~mask & f); matches the native path when each mask lane is all ones or all zeros
+    return _mm_or_ps(_mm_and_ps(mask, t), _mm_andnot_ps(mask, f)); 
+  }
+#endif
+
+  extern const __m128  mm_lookupmask_ps[16];
+  extern const __m128d mm_lookupmask_pd[4];
+}
+
+#if defined(__AVX512VL__)
+#include "vboolf4_avx512.h"
+#else
+#include "vboolf4_sse2.h"
+#endif
+#include "vint4_sse2.h"
+#include "vuint4_sse2.h"
+#include "vfloat4_sse2.h"
diff --git a/thirdparty/embree-aarch64/common/simd/varying.h b/thirdparty/embree-aarch64/common/simd/varying.h
new file mode 100644
index 0000000000..9a46817da9
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/varying.h
@@ -0,0 +1,132 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+
+namespace embree
+{
+  /* Varying numeric types */
+  template<int N> // generic N-lane float vector: plain per-lane storage with asserted element access
+  struct vfloat
+  {
+    union { float f[N]; int i[N]; }; // the same N lanes, viewable as float or as int
+    __forceinline const float& operator [](size_t index) const { assert(index < N); return f[index]; }
+    __forceinline       float& operator [](size_t index)       { assert(index < N); return f[index]; }
+  };
+
+  template<int N> // generic N-lane double vector: plain per-lane storage with asserted element access
+  struct vdouble
+  {
+    union { double f[N]; long long i[N]; }; // the same N lanes, viewable as double or as long long
+    __forceinline const double& operator [](size_t index) const { assert(index < N); return f[index]; }
+    __forceinline       double& operator [](size_t index)       { assert(index < N); return f[index]; }
+  };
+
+  template<int N> // generic N-lane int vector: plain per-lane storage with asserted element access
+  struct vint
+  {
+    int i[N]; // lane values
+    __forceinline const int& operator [](size_t index) const { assert(index < N); return i[index]; }
+    __forceinline       int& operator [](size_t index)       { assert(index < N); return i[index]; }
+  };
+
+  template<int N> // generic N-lane unsigned int vector: plain per-lane storage with asserted element access
+  struct vuint
+  {
+    unsigned int i[N]; // lane values
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < N); return i[index]; }
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < N); return i[index]; }
+  };
+
+  template<int N> // generic N-lane long long vector: plain per-lane storage with asserted element access
+  struct vllong
+  {
+    long long i[N]; // lane values
+    __forceinline const long long& operator [](size_t index) const { assert(index < N); return i[index]; }
+    __forceinline       long long& operator [](size_t index)       { assert(index < N); return i[index]; }
+  };
+
+  /* Varying bool types */
+  template<int N> struct vboolf { int       i[N]; }; // for float/int
+  template<int N> struct vboold { long long i[N]; }; // for double/long long
+
+  /* Aliases to default types */
+  template<int N> using vreal = vfloat<N>;
+  template<int N> using vbool = vboolf<N>;
+
+  /* Varying size constants */
+#if defined(__AVX512VL__) // SKX
+  const int VSIZEX = 8;  // default size
+  const int VSIZEL = 16; // large size
+#elif defined(__AVX512F__) // KNL
+  const int VSIZEX = 16;
+  const int VSIZEL = 16;
+#elif defined(__AVX__)
+  const int VSIZEX = 8;
+  const int VSIZEL = 8;
+#else
+  const int VSIZEX = 4;
+  const int VSIZEL = 4;
+#endif
+
+  /* Exposes in `size` the SIMD width to calculate with for a nominal width N: VSIZEX on KNL when N2 is left at its default, N in every other case */
+  template<int N, int N2 = VSIZEX>
+  struct vextend
+  {
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+    /* use 16-wide SIMD calculations on KNL even for 4 and 8 wide SIMD */
+    static const int size = (N2 == VSIZEX) ? VSIZEX : N;
+    #define SIMD_MODE(N) N, 16 // NOTE: macros are not scoped; SIMD_MODE stays defined after the struct ends
+#else
+    /* calculate with same SIMD width otherwise */
+    static const int size = N;
+    #define SIMD_MODE(N) N, N // NOTE: macros are not scoped; SIMD_MODE stays defined after the struct ends
+#endif
+  };
+
+  /* 4-wide shortcuts */
+  typedef vfloat<4>  vfloat4;
+  typedef vdouble<4> vdouble4;
+  typedef vreal<4>   vreal4;
+  typedef vint<4>    vint4;
+  typedef vuint<4>  vuint4;
+  typedef vllong<4>  vllong4;
+  typedef vbool<4>   vbool4;
+  typedef vboolf<4>  vboolf4;
+  typedef vboold<4>  vboold4;
+
+  /* 8-wide shortcuts */
+  typedef vfloat<8>  vfloat8;
+  typedef vdouble<8> vdouble8;
+  typedef vreal<8>   vreal8;
+  typedef vint<8>    vint8;
+  typedef vuint<8>    vuint8;
+  typedef vllong<8>  vllong8;
+  typedef vbool<8>   vbool8;
+  typedef vboolf<8>  vboolf8;
+  typedef vboold<8>  vboold8;
+
+  /* 16-wide shortcuts */
+  typedef vfloat<16>  vfloat16;
+  typedef vdouble<16> vdouble16;
+  typedef vreal<16>   vreal16;
+  typedef vint<16>    vint16;
+  typedef vuint<16>   vuint16;
+  typedef vllong<16>  vllong16;
+  typedef vbool<16>   vbool16;
+  typedef vboolf<16>  vboolf16;
+  typedef vboold<16>  vboold16;
+
+  /* Default shortcuts */
+  typedef vfloat<VSIZEX>  vfloatx;
+  typedef vdouble<VSIZEX> vdoublex;
+  typedef vreal<VSIZEX>   vrealx;
+  typedef vint<VSIZEX>    vintx;
+  typedef vuint<VSIZEX>   vuintx;
+  typedef vllong<VSIZEX>  vllongx;
+  typedef vbool<VSIZEX>   vboolx;
+  typedef vboolf<VSIZEX>  vboolfx;
+  typedef vboold<VSIZEX>  vbooldx;
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboold4_avx.h b/thirdparty/embree-aarch64/common/simd/vboold4_avx.h
new file mode 100644
index 0000000000..6505ee56f3
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboold4_avx.h
@@ -0,0 +1,160 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 4-wide AVX bool type for 64-bit data types; each lane is a 64-bit all-ones (true) or all-zeros (false) mask */
+  template<>
+  struct vboold<4>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboold4 Bool;
+
+    enum  { size = 4 };       // number of SIMD elements
+    union {                   // data
+      __m256d v;                  // all four lanes
+      struct { __m128d vl,vh; };  // lanes 0-1 (vl) and lanes 2-3 (vh)
+      long long i[4];             // per-lane integer view
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold() {}
+    __forceinline vboold(const vboold4& a) { v = a.v; }
+    __forceinline vboold4& operator =(const vboold4& a) { v = a.v; return *this; }
+
+    __forceinline vboold(__m256d a) : v(a) {}
+    __forceinline vboold(__m256i a) : v(_mm256_castsi256_pd(a)) {}
+
+    __forceinline operator const __m256() const { return _mm256_castpd_ps(v); }
+    __forceinline operator const __m256i() const { return _mm256_castpd_si256(v); }
+    __forceinline operator const __m256d() const { return v; }
+
+    __forceinline vboold(int a) // builds the mask from the low 4 bits of a: bit k set -> lane k true; higher bits are ignored
+    {
+      assert(a >= 0 && a <= 255);
+#if defined (__AVX2__)
+      const __m256i mask = _mm256_set_epi64x(0x8, 0x4, 0x2, 0x1); // lane k tests bit k
+      const __m256i b = _mm256_set1_epi64x(a);
+      const __m256i c = _mm256_and_si256(b,mask);
+      v = _mm256_castsi256_pd(_mm256_cmpeq_epi64(c,mask));
+#else
+      vl = mm_lookupmask_pd[a & 0x3];
+      vh = mm_lookupmask_pd[(a >> 2) & 0x3]; // keep the index inside the 4-entry table: the assert admits a up to 255
+#endif
+    }
+    
+    __forceinline vboold(__m128d a, __m128d b) : vl(a), vh(b) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold(FalseTy) : v(_mm256_setzero_pd()) {}
+#if !defined(__aarch64__)
+    __forceinline vboold(TrueTy)  : v(_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), _CMP_EQ_OQ)) {} // 0 == 0 yields all ones in every lane
+#else
+    __forceinline vboold(TrueTy)  : v(_mm256_cmpeq_pd(_mm256_setzero_pd(), _mm256_setzero_pd())) {}
+#endif
+      
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool       operator [](size_t index) const { assert(index < 4); return (_mm256_movemask_pd(v) >> index) & 1; } // reads the lane's sign bit
+    __forceinline long long& operator [](size_t index)       { assert(index < 4); return i[index]; } // writable 64-bit view of the lane
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !(const vboold4& a) { return _mm256_xor_pd(a, vboold4(embree::True)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm256_and_pd(a, b); }
+  __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm256_or_pd (a, b); }
+  __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); }
+
+  __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm256_andnot_pd(b, a); } // a & ~b: the intrinsic negates its first operand, hence the swapped arguments
+
+  __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; }
+  __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; }
+  __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(a, b); }
+  __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm256_xor_pd(_mm256_xor_pd(a,b),vboold4(embree::True)); } // lanewise XNOR: ~(a ^ b)
+
+  __forceinline vboold4 select(const vboold4& mask, const vboold4& t, const vboold4& f) { // per lane: t where mask is set, f elsewhere
+    return _mm256_blendv_pd(f, t, mask); // the blend keys on the sign bit of each mask lane
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if !defined(__aarch64__)
+  __forceinline vboold4 unpacklo(const vboold4& a, const vboold4& b) { return _mm256_unpacklo_pd(a, b); }
+  __forceinline vboold4 unpackhi(const vboold4& a, const vboold4& b) { return _mm256_unpackhi_pd(a, b); }
+#endif
+
+#if defined(__AVX2__)
+  template<int i0, int i1, int i2, int i3> // result lane k takes source lane ik
+  __forceinline vboold4 shuffle(const vboold4& v) {
+    return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i3, i2, i1, i0)); // _MM_SHUFFLE lists selectors from the highest lane down
+  }
+
+  template<int i> // broadcasts source lane i to all four lanes
+  __forceinline vboold4 shuffle(const vboold4& v) {
+    return _mm256_permute4x64_pd(v, _MM_SHUFFLE(i, i, i, i));
+  }
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool reduce_and(const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; }
+  __forceinline bool reduce_or (const vboold4& a) { return !_mm256_testz_pd(a,a); }
+
+  __forceinline bool all (const vboold4& a) { return _mm256_movemask_pd(a) == (unsigned int)0xf; }
+  __forceinline bool any (const vboold4& a) { return !_mm256_testz_pd(a,a); }
+  __forceinline bool none(const vboold4& a) { return _mm256_testz_pd(a,a) != 0; }
+
+  __forceinline bool all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); }
+  __forceinline bool any (const vboold4& valid, const vboold4& b) { return any(valid & b); }
+  __forceinline bool none(const vboold4& valid, const vboold4& b) { return none(valid & b); }
+
+  __forceinline unsigned int movemask(const vboold4& a) { return _mm256_movemask_pd(a); }
+  __forceinline size_t       popcnt  (const vboold4& a) { return popcnt((size_t)_mm256_movemask_pd(a)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboold4& a, size_t index) { return a[index]; }
+  __forceinline void set  (vboold4& a, size_t index)     { a[index] = -1; }
+  __forceinline void clear(vboold4& a, size_t index)     { a[index] =  0; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a) {
+    /* vboold4 has exactly 4 lanes: print a[0]..a[3] only, indices 4..7 would violate the assert in operator[] */
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h b/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h
new file mode 100644
index 0000000000..4fe730d713
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboold4_avx512.h
@@ -0,0 +1,140 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 4-wide AVX-512 bool type, stored as an __mmask8 bit mask with one bit per lane */
+  template<>
+  struct vboold<4>
+  {
+    typedef vboold4 Bool;
+    typedef vint4   Int;
+
+    enum { size = 4 }; // number of SIMD elements
+    __mmask8 v;        // data: bit k stands for lane k (all-true is 0xf, see operator[] and the TrueTy constructor)
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold() {}
+    __forceinline vboold(const vboold4& t) { v = t.v; }
+    __forceinline vboold4& operator =(const vboold4& f) { v = f.v; return *this; }
+
+    __forceinline vboold(const __mmask8 &t) { v = t; }
+    __forceinline operator __mmask8() const { return v; }
+
+    __forceinline vboold(bool b) { v = b ? 0xf : 0x0; } // broadcast: all four lanes true, or all false
+    __forceinline vboold(int t)  { v = (__mmask8)t; } // t is taken as a raw bit mask; bits above the low 4 are not cleared
+    __forceinline vboold(unsigned int t) { v = (__mmask8)t; } // same as the int overload
+
+    /* return int8 mask */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* return int32 mask */
+    __forceinline __m128i mask32() const {
+      return _mm_movm_epi32(v);
+    }
+
+    /* return int64 mask */
+    __forceinline __m256i mask64() const {
+      return _mm256_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold(FalseTy) : v(0x0) {}
+    __forceinline vboold(TrueTy)  : v(0xf) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const { // read-only: returns bit `index` of the mask
+      assert(index < 4); return (mm512_mask2int(v) >> index) & 1;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !(const vboold4& a) { return _mm512_kandn(a, 0xf); } // ~a & 0xf: inverts the 4 lane bits and leaves the unused upper bits clear
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator &(const vboold4& a, const vboold4& b) { return _mm512_kand(a, b); }
+  __forceinline vboold4 operator |(const vboold4& a, const vboold4& b) { return _mm512_kor(a, b); }
+  __forceinline vboold4 operator ^(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); }
+
+  __forceinline vboold4 andn(const vboold4& a, const vboold4& b) { return _mm512_kandn(b, a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4& operator &=(vboold4& a, const vboold4& b) { return a = a & b; }
+  __forceinline vboold4& operator |=(vboold4& a, const vboold4& b) { return a = a | b; }
+  __forceinline vboold4& operator ^=(vboold4& a, const vboold4& b) { return a = a ^ b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold4 operator !=(const vboold4& a, const vboold4& b) { return _mm512_kxor(a, b); }
+  __forceinline vboold4 operator ==(const vboold4& a, const vboold4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } // XNOR restricted to the 4 lane bits
+
+  __forceinline vboold4 select(const vboold4& s, const vboold4& a, const vboold4& b) { // per lane: a where s is set, b elsewhere
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); // (s & a) | (~s & b)
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int all (const vboold4& a) { return a.v == 0xf; }
+  __forceinline int any (const vboold4& a) { return _mm512_kortestz(a, a) == 0; }
+  __forceinline int none(const vboold4& a) { return _mm512_kortestz(a, a) != 0; }
+
+  __forceinline int all (const vboold4& valid, const vboold4& b) { return all((!valid) | b); }
+  __forceinline int any (const vboold4& valid, const vboold4& b) { return any(valid & b); }
+  __forceinline int none(const vboold4& valid, const vboold4& b) { return none(valid & b); }
+
+  __forceinline size_t movemask(const vboold4& a) { return _mm512_kmov(a); }
+  __forceinline size_t popcnt  (const vboold4& a) { return popcnt(a.v); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboold4& a) { return mm512_mask2int(a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboold4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; }
+  __forceinline void set(vboold4& a, size_t index)       { assert(index < 4); a |= 1 << index; }
+  __forceinline void clear(vboold4& a, size_t index)     { assert(index < 4); a = andn(a, 1 << index); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboold4& a)
+  {
+    /* writes the mask as "<b0b1b2b3>", one 0/1 digit per lane with lane 0 first */
+    cout << "<";
+    for (size_t lane=0; lane<4; lane++)
+      cout << (((a.v >> lane) & 1) ? "1" : "0");
+    return cout << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h b/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h
new file mode 100644
index 0000000000..fdf3f00de5
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboold8_avx512.h
@@ -0,0 +1,148 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX-512 bool type, stored as an __mmask8 bit mask with one bit per lane */
+  template<>
+  struct vboold<8>
+  {
+    typedef vboold8 Bool;
+    typedef vint8   Int;
+
+    enum { size = 8 }; // number of SIMD elements
+    __mmask8 v;        // data: bit k stands for lane k (see operator[])
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vboold() {}
+    __forceinline vboold(const vboold8& t) { v = t.v; }
+    __forceinline vboold8& operator =(const vboold8& f) { v = f.v; return *this; }
+
+    __forceinline vboold(const __mmask8& t) { v = t; }
+    __forceinline operator __mmask8() const { return v; }
+    
+    __forceinline vboold(bool b) { v = b ? 0xff : 0x00; } // broadcast: all eight lanes true, or all false
+    __forceinline vboold(int t)  { v = (__mmask8)t; } // t is taken as a raw bit mask and truncated to 8 bits
+    __forceinline vboold(unsigned int t) { v = (__mmask8)t; } // same as the int overload
+
+    /* return int8 mask */
+    __forceinline __m128i mask8() const {
+#if defined(__AVX512BW__)
+      return _mm_movm_epi8(v);
+#else
+      const __m512i f = _mm512_set1_epi64(0);
+      const __m512i t = _mm512_set1_epi64(-1);
+      const __m512i m =  _mm512_mask_or_epi64(f,v,t,t); // all ones in lanes whose mask bit is set, zero elsewhere
+      return _mm512_cvtepi64_epi8(m); // narrow each 64-bit lane to 8 bits
+#endif
+    }
+
+    /* return int64 mask */
+    __forceinline __m512i mask64() const { 
+#if defined(__AVX512DQ__)
+      return _mm512_movm_epi64(v);
+#else
+      const __m512i f = _mm512_set1_epi64(0);
+      const __m512i t = _mm512_set1_epi64(-1);
+      return _mm512_mask_or_epi64(f,v,t,t); // all ones in lanes whose mask bit is set, zero elsewhere
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboold(FalseTy) : v(0x00) {}
+    __forceinline vboold(TrueTy)  : v(0xff) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const { // read-only: returns bit `index` of the mask
+      assert(index < 8); return (mm512_mask2int(v) >> index) & 1;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8 operator !(const vboold8& a) { return _mm512_knot(a); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8 operator &(const vboold8& a, const vboold8& b) { return _mm512_kand(a, b); }
+  __forceinline vboold8 operator |(const vboold8& a, const vboold8& b) { return _mm512_kor(a, b); }
+  __forceinline vboold8 operator ^(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); }
+
+  __forceinline vboold8 andn(const vboold8& a, const vboold8& b) { return _mm512_kandn(b, a); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8& operator &=(vboold8& a, const vboold8& b) { return a = a & b; }
+  __forceinline vboold8& operator |=(vboold8& a, const vboold8& b) { return a = a | b; }
+  __forceinline vboold8& operator ^=(vboold8& a, const vboold8& b) { return a = a ^ b; }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboold8 operator !=(const vboold8& a, const vboold8& b) { return _mm512_kxor(a, b); }
+  __forceinline vboold8 operator ==(const vboold8& a, const vboold8& b) { return _mm512_kxnor(a, b); }
+  
+  __forceinline vboold8 select(const vboold8& s, const vboold8& a, const vboold8& b) { // per lane: a where s is set, b elsewhere
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); // (s & a) | (~s & b)
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline int all (const vboold8& a) { return a.v == 0xff; }
+  __forceinline int any (const vboold8& a) { return _mm512_kortestz(a, a) == 0; }
+  __forceinline int none(const vboold8& a) { return _mm512_kortestz(a, a) != 0; }
+
+  __forceinline int all (const vboold8& valid, const vboold8& b) { return all((!valid) | b); }
+  __forceinline int any (const vboold8& valid, const vboold8& b) { return any(valid & b); }
+  __forceinline int none(const vboold8& valid, const vboold8& b) { return none(valid & b); }
+  
+  __forceinline size_t movemask(const vboold8& a) { return _mm512_kmov(a); }
+  __forceinline size_t popcnt  (const vboold8& a) { return popcnt(a.v); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboold8& a) { return mm512_mask2int(a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboold8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; }
+  __forceinline void set(vboold8& a, size_t index)       { assert(index < 8); a |= 1 << index; }
+  __forceinline void clear(vboold8& a, size_t index)     { assert(index < 8); a = andn(a, 1 << index); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboold8& a)
+  {
+    /* writes the mask as "<b0...b7>", one 0/1 digit per lane with lane 0 first */
+    cout << "<";
+    for (size_t lane=0; lane<8; lane++)
+      cout << (((a.v >> lane) & 1) ? "1" : "0");
+    return cout << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h
new file mode 100644
index 0000000000..238cdc8eb9
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboolf16_avx512.h
@@ -0,0 +1,150 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 16-wide AVX-512 bool type, stored as an __mmask16 bit mask with one bit per lane */
+  template<>
+  struct vboolf<16>
+  {
+    typedef vboolf16 Bool;
+    typedef vint16   Int;
+    typedef vfloat16 Float;
+
+    enum { size = 16 }; // number of SIMD elements
+    __mmask16 v;        // data: bit k stands for lane k (see operator[])
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vboolf() {}
+    __forceinline vboolf(const vboolf16& t) { v = t.v; }
+    __forceinline vboolf16& operator =(const vboolf16& f) { v = f.v; return *this; }
+
+    __forceinline vboolf(const __mmask16& t) { v = t; }
+    __forceinline operator __mmask16() const { return v; }
+    
+    __forceinline vboolf(bool b) { v = b ? 0xFFFF : 0x0000; } // broadcast: all sixteen lanes true, or all false
+    __forceinline vboolf(int t) { v = (__mmask16)t; } // t is taken as a raw bit mask and truncated to 16 bits
+    __forceinline vboolf(unsigned int t) { v = (__mmask16)t; } // same as the int overload
+
+    /* return int8 mask */
+    __forceinline __m128i mask8() const {
+#if defined(__AVX512BW__)
+      return _mm_movm_epi8(v);
+#else
+      const __m512i f = _mm512_set1_epi32(0);
+      const __m512i t = _mm512_set1_epi32(-1);
+      const __m512i m =  _mm512_mask_or_epi32(f,v,t,t); // all ones in lanes whose mask bit is set, zero elsewhere
+      return _mm512_cvtepi32_epi8(m); // narrow each 32-bit lane to 8 bits
+#endif
+    }
+
+    /* return int32 mask */
+    __forceinline __m512i mask32() const {
+#if defined(__AVX512DQ__)
+      return _mm512_movm_epi32(v);
+#else
+      const __m512i f = _mm512_set1_epi32(0);
+      const __m512i t = _mm512_set1_epi32(-1);
+      return _mm512_mask_or_epi32(f,v,t,t); // all ones in lanes whose mask bit is set, zero elsewhere
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(0x0000) {}
+    __forceinline vboolf(TrueTy)  : v(0xffff) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+  
+    __forceinline bool operator [](size_t index) const { // read-only: returns bit `index` of the mask
+      assert(index < 16); return (mm512_mask2int(v) >> index) & 1;
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16 operator !(const vboolf16& a) { return _mm512_knot(a); }
+  
+   ////////////////////////////////////////////////////////////////////////////////
+   /// Binary Operators
+   ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16 operator &(const vboolf16& a, const vboolf16& b) { return _mm512_kand(a,b); }
+  __forceinline vboolf16 operator |(const vboolf16& a, const vboolf16& b) { return _mm512_kor(a,b); }
+  __forceinline vboolf16 operator ^(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a,b); }
+
+  __forceinline vboolf16 andn(const vboolf16& a, const vboolf16& b) { return _mm512_kandn(b,a); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16& operator &=(vboolf16& a, const vboolf16& b) { return a = a & b; }
+  __forceinline vboolf16& operator |=(vboolf16& a, const vboolf16& b) { return a = a | b; }
+  __forceinline vboolf16& operator ^=(vboolf16& a, const vboolf16& b) { return a = a ^ b; }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf16 operator !=(const vboolf16& a, const vboolf16& b) { return _mm512_kxor(a, b); }
+  __forceinline vboolf16 operator ==(const vboolf16& a, const vboolf16& b) { return _mm512_kxnor(a, b); }
+  
+  __forceinline vboolf16 select(const vboolf16& s, const vboolf16& a, const vboolf16& b) {
+    return _mm512_kor(_mm512_kand(s,a),_mm512_kandn(s,b));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline int all (const vboolf16& a) { return  _mm512_kortestc(a,a) != 0; }
+  __forceinline int any (const vboolf16& a) { return  _mm512_kortestz(a,a) == 0; }
+  __forceinline int none(const vboolf16& a) { return  _mm512_kortestz(a,a) != 0; }
+
+  __forceinline int all (const vboolf16& valid, const vboolf16& b) { return all((!valid) | b); }
+  __forceinline int any (const vboolf16& valid, const vboolf16& b) { return any(valid & b); }
+  __forceinline int none(const vboolf16& valid, const vboolf16& b) { return none(valid & b); }
+  
+  __forceinline size_t movemask(const vboolf16& a) { return _mm512_kmov(a); }
+  __forceinline size_t popcnt  (const vboolf16& a) { return popcnt(a.v); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt (const vboolf16& a) { return mm512_mask2int(a); }
+  __forceinline vboolf16     toMask(const int& a)      { return mm512_int2mask(a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf16& a, size_t index) { assert(index < 16); return (toInt(a) >> index) & 1; }
+  __forceinline void set(vboolf16& a, size_t index)       { assert(index < 16); a |= 1 << index; }
+  __forceinline void clear(vboolf16& a, size_t index)     { assert(index < 16); a = andn(a, 1 << index); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf16& a)
+  {
+    /* writes the mask as "<b0...b15>", one 0/1 digit per lane with lane 0 first */
+    cout << "<";
+    for (size_t lane=0; lane<16; lane++)
+      cout << (((a.v >> lane) & 1) ? "1" : "0");
+    return cout << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h
new file mode 100644
index 0000000000..2ae4c4470e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboolf4_avx512.h
@@ -0,0 +1,143 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 4-wide AVX-512 bool type: one bit per lane, stored in bits 0-3 of an 8-bit mask */
+  template<>
+  struct vboolf<4>
+  {
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+
+    enum { size = 4 }; // number of SIMD elements
+    __mmask8 v;        // data: bit i is lane i (see operator[])
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf() {} // v is left uninitialized
+    __forceinline vboolf(const vboolf4& t) { v = t.v; }
+    __forceinline vboolf4& operator =(const vboolf4& f) { v = f.v; return *this; }
+
+    __forceinline vboolf(const __mmask8 &t) { v = t; }
+    __forceinline operator __mmask8() const { return v; }
+
+    __forceinline vboolf(bool b) { v = b ? 0xf : 0x0; } // broadcast b to all 4 lanes
+    __forceinline vboolf(int t)  { v = (__mmask8)t; } // t is truncated to 8 bits; bits 4-7 are not cleared
+    __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } // same truncation as the int overload
+
+    __forceinline vboolf(bool a, bool b, bool c, bool d)
+      : v((__mmask8)((int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} // a -> bit 0, b -> bit 1, c -> bit 2, d -> bit 3
+
+    /* return int8 mask (bit mask expanded with _mm_movm_epi8) */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* return int32 mask (bit mask expanded with _mm_movm_epi32) */
+    __forceinline __m128i mask32() const {
+      return _mm_movm_epi32(v);
+    }
+
+    /* return int64 mask (bit mask expanded with _mm256_movm_epi64) */
+    __forceinline __m256i mask64() const {
+      return _mm256_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(0x0) {}
+    __forceinline vboolf(TrueTy)  : v(0xf) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const {
+      assert(index < 4); return (mm512_mask2int(v) >> index) & 1; // read-only: extracts bit 'index'
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4 operator !(const vboolf4& a) { return _mm512_kandn(a, 0xf); } // ~a & 0xf: only the 4 lane bits are flipped
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm512_kand(a, b); }
+  __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm512_kor(a, b); }
+  __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); }
+
+  __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm512_kandn(b, a); } // a & ~b (kandn negates its first operand)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators (each one reuses the matching binary operator)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; }
+  __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; }
+  __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm512_kxor(a, b); } // lane-wise XOR
+  __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm512_kand(_mm512_kxnor(a, b), 0xf); } // lane-wise XNOR, masked to the 4 lane bits
+
+  __forceinline vboolf4 select(const vboolf4& s, const vboolf4& a, const vboolf4& b) {
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); // (s & a) | (~s & b): a where s is set, b elsewhere
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int all (const vboolf4& a) { return a.v == 0xf; } // exact compare: all 4 lane bits set and no higher bit
+  __forceinline int any (const vboolf4& a) { return _mm512_kortestz(a, a) == 0; } // kortestz yields 1 only for an all-zero mask
+  __forceinline int none(const vboolf4& a) { return _mm512_kortestz(a, a) != 0; } // true when the mask is all zero
+
+  __forceinline int all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } // lanes outside 'valid' count as true
+  __forceinline int any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } // only lanes inside 'valid' are considered
+  __forceinline int none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } // only lanes inside 'valid' are considered
+
+  __forceinline size_t movemask(const vboolf4& a) { return _mm512_kmov(a); } // mask value returned as size_t
+  __forceinline size_t popcnt  (const vboolf4& a) { return popcnt(a.v); } // delegates to popcnt() on the raw mask a.v
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboolf4& a) { return mm512_mask2int(a); } // mask -> unsigned int via mm512_mask2int
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions (single-lane access by bit index)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf4& a, size_t index) { assert(index < 4); return (toInt(a) >> index) & 1; } // reads bit 'index'
+  __forceinline void set(vboolf4& a, size_t index)       { assert(index < 4); a |= 1 << index; } // ORs in a mask holding only bit 'index'
+  __forceinline void clear(vboolf4& a, size_t index)     { assert(index < 4); a = andn(a, 1 << index); } // a & ~(1 << index)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) // writes "<", the 4 lane bits, then ">"
+  {
+    cout << "<";
+    for (size_t i=0; i<4; i++) { // lane 0 is written first
+      if ((a.v >> i) & 1) cout << "1"; else cout << "0";
+    }
+    return cout << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h b/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h
new file mode 100644
index 0000000000..ed53b3c783
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboolf4_sse2.h
@@ -0,0 +1,198 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 4-wide SSE bool type: four 32-bit lanes held in one __m128 */
+  template<>
+  struct vboolf<4>
+  {
+    ALIGNED_STRUCT_(16);
+    
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 };            // number of SIMD elements
+    union { __m128 v; int i[4]; }; // data: the vector and a per-lane int view of the same storage
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vboolf() {} // storage is left uninitialized
+    __forceinline vboolf(const vboolf4& other) { v = other.v; }
+    __forceinline vboolf4& operator =(const vboolf4& other) { v = other.v; return *this; }
+
+    __forceinline vboolf(__m128 input) : v(input) {}
+    __forceinline operator const __m128&() const { return v; }
+    __forceinline operator const __m128i() const { return _mm_castps_si128(v); } // bit-preserving cast
+    __forceinline operator const __m128d() const { return _mm_castps_pd(v); } // bit-preserving cast
+    
+    __forceinline vboolf(bool a)
+      : v(mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)]) {} // table index is 0 or 15
+    __forceinline vboolf(bool a, bool b)
+      : v(mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)]) {} // index bits 0..3 are a, b, a, b
+    __forceinline vboolf(bool a, bool b, bool c, bool d)
+      : v(mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)]) {} // index bits 0..3 are a, b, c, d
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      __forceinline vboolf(int mask) { v = mm_lookupmask_ps[mask]; } // same lookup as below, without the range assert
+      __forceinline vboolf(unsigned int mask) { v = mm_lookupmask_ps[mask]; } // same lookup as below, without the range assert
+#else
+    __forceinline vboolf(int mask) { assert(mask >= 0 && mask < 16); v = mm_lookupmask_ps[mask]; } // mask indexes the lookup table
+    __forceinline vboolf(unsigned int mask) { assert(mask < 16); v = mm_lookupmask_ps[mask]; } // mask indexes the lookup table
+#endif
+    /* return int32 mask (the same bits reinterpreted as __m128i) */
+    __forceinline __m128i mask32() const { 
+      return _mm_castps_si128(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(_mm_setzero_ps()) {}
+    __forceinline vboolf(TrueTy)  : v(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))) {} // 0 == 0 sets every bit
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      __forceinline bool operator [](size_t index) const { return (_mm_movemask_ps(v) >> index) & 1; } // sign bit of lane 'index'
+      __forceinline int& operator [](size_t index)       { return i[index]; } // writable 32-bit view of lane 'index'
+#else
+    __forceinline bool operator [](size_t index) const { assert(index < 4); return (_mm_movemask_ps(v) >> index) & 1; } // sign bit of lane 'index'
+    __forceinline int& operator [](size_t index)       { assert(index < 4); return i[index]; } // writable 32-bit view of lane 'index'
+#endif
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 operator !(const vboolf4& a) { return _mm_xor_ps(a, vboolf4(embree::True)); } // XOR with the True constant flips every bit
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 operator &(const vboolf4& a, const vboolf4& b) { return _mm_and_ps(a, b); }
+  __forceinline vboolf4 operator |(const vboolf4& a, const vboolf4& b) { return _mm_or_ps (a, b); }
+  __forceinline vboolf4 operator ^(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); }
+
+  __forceinline vboolf4 andn(const vboolf4& a, const vboolf4& b) { return _mm_andnot_ps(b, a); } // a & ~b (_mm_andnot_ps negates its first operand)
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators (each one reuses the matching binary operator)
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4& operator &=(vboolf4& a, const vboolf4& b) { return a = a & b; }
+  __forceinline vboolf4& operator |=(vboolf4& a, const vboolf4& b) { return a = a | b; }
+  __forceinline vboolf4& operator ^=(vboolf4& a, const vboolf4& b) { return a = a ^ b; }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 operator !=(const vboolf4& a, const vboolf4& b) { return _mm_xor_ps(a, b); } // bitwise XOR
+  __forceinline vboolf4 operator ==(const vboolf4& a, const vboolf4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } // 32-bit integer equality per lane
+  
+  __forceinline vboolf4 select(const vboolf4& m, const vboolf4& t, const vboolf4& f) {
+#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__)
+    return _mm_blendv_ps(f, t, m); // per lane: t where the sign bit of m is set, otherwise f
+#else
+    return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); // bitwise (m & t) | (~m & f)
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vboolf4 unpacklo(const vboolf4& a, const vboolf4& b) { return _mm_unpacklo_ps(a, b); } // <a0, b0, a1, b1>
+  __forceinline vboolf4 unpackhi(const vboolf4& a, const vboolf4& b) { return _mm_unpackhi_ps(a, b); } // <a2, b2, a3, b3>
+
+#if defined(__aarch64__)
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf4 shuffle(const vboolf4& v) {
+    return vreinterpretq_f32_u8(vqtbl1q_u8( vreinterpretq_u8_s32(v), _MN_SHUFFLE(i0, i1, i2, i3))); // NEON byte-table lookup; _MN_SHUFFLE is defined outside this file
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
+    return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); // two-register table lookup; _MF_SHUFFLE is defined outside this file
+  }
+#else
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf4 shuffle(const vboolf4& v) {
+    return _mm_castsi128_ps(_mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0))); // _MM_SHUFFLE lists the highest lane first, hence the reversed order
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf4 shuffle(const vboolf4& a, const vboolf4& b) {
+    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); // <a[i0], a[i1], b[i2], b[i3]>
+  }
+#endif
+                                                                
+  template<int i0>
+  __forceinline vboolf4 shuffle(const vboolf4& v) {
+    return shuffle<i0,i0,i0,i0>(v); // broadcast lane i0 to all four lanes
+  }
+
+#if defined(__SSE3__)
+  template<> __forceinline vboolf4 shuffle<0, 0, 2, 2>(const vboolf4& v) { return _mm_moveldup_ps(v); } // single-intrinsic form of this pattern
+  template<> __forceinline vboolf4 shuffle<1, 1, 3, 3>(const vboolf4& v) { return _mm_movehdup_ps(v); } // single-intrinsic form of this pattern
+  template<> __forceinline vboolf4 shuffle<0, 1, 0, 1>(const vboolf4& v) { return _mm_castpd_ps(_mm_movedup_pd(v)); } // duplicates the low 64 bits
+#endif
+
+#if defined(__SSE4_1__) && !defined(__aarch64__)
+  template<int dst, int src, int clr> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); } // imm8: bits 7:6 = lane of b, bits 5:4 = lane of a, bits 3:0 = zero mask
+  template<int dst, int src> __forceinline vboolf4 insert(const vboolf4& a, const vboolf4& b) { return insert<dst, src, 0>(a, b); } // no lanes zeroed
+  template<int dst> __forceinline vboolf4 insert(const vboolf4& a, const bool b) { return insert<dst, 0>(a, vboolf4(b)); } // b is broadcast first, then lane 0 is inserted
+#endif
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+    
+  __forceinline bool reduce_and(const vboolf4& a) { return _mm_movemask_ps(a) == 0xf; } // all four sign bits set
+  __forceinline bool reduce_or (const vboolf4& a) { return _mm_movemask_ps(a) != 0x0; } // at least one sign bit set
+
+  __forceinline bool all (const vboolf4& b) { return _mm_movemask_ps(b) == 0xf; } // same test as reduce_and
+  __forceinline bool any (const vboolf4& b) { return _mm_movemask_ps(b) != 0x0; } // same test as reduce_or
+  __forceinline bool none(const vboolf4& b) { return _mm_movemask_ps(b) == 0x0; } // no sign bit set
+
+  __forceinline bool all (const vboolf4& valid, const vboolf4& b) { return all((!valid) | b); } // lanes outside 'valid' count as true
+  __forceinline bool any (const vboolf4& valid, const vboolf4& b) { return any(valid & b); } // only lanes inside 'valid' are considered
+  __forceinline bool none(const vboolf4& valid, const vboolf4& b) { return none(valid & b); } // only lanes inside 'valid' are considered
+  
+  __forceinline size_t movemask(const vboolf4& a) { return _mm_movemask_ps(a); } // one bit per lane, taken from the lane sign bits
+#if defined(__aarch64__) && defined(BUILD_IOS)
+__forceinline size_t popcnt(const vboolf4& a) { return _mm_movemask_popcnt_ps(a); } // iOS/aarch64 build uses a combined helper defined elsewhere
+#else
+#if defined(__SSE4_2__)
+  __forceinline size_t popcnt(const vboolf4& a) { return popcnt((size_t)_mm_movemask_ps(a)); } // popcnt() of the 4-bit lane mask
+#else
+  __forceinline size_t popcnt(const vboolf4& a) { return bool(a[0])+bool(a[1])+bool(a[2])+bool(a[3]); } // fallback: sum the four lane booleans
+#endif
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf4& a, size_t index) { return a[index]; } // const operator[]: lane as bool
+  __forceinline void set(vboolf4& a, size_t index)       { a[index] = -1; } // -1 sets all 32 bits of the lane
+  __forceinline void clear(vboolf4& a, size_t index)     { a[index] =  0; } // zeroes all 32 bits of the lane
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf4& a) { // a is const, so a[k] yields each lane as bool
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h b/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h
new file mode 100644
index 0000000000..4f64741b55
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboolf8_avx.h
@@ -0,0 +1,189 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX bool type: eight 32-bit lanes held in one __m256 */
+  template<>
+  struct vboolf<8>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };       // number of SIMD elements
+    union {                   // data: three views of the same 32 bytes
+      __m256 v;
+      struct { __m128 vl,vh; }; // low and high 128-bit halves
+      int i[8];
+    };  
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf() {} // storage is left uninitialized
+    __forceinline vboolf(const vboolf8& a) { v = a.v; }
+    __forceinline vboolf8& operator =(const vboolf8& a) { v = a.v; return *this; }
+
+    __forceinline vboolf(__m256 a) : v(a) {}
+    __forceinline operator const __m256&() const { return v; }
+    __forceinline operator const __m256i() const { return _mm256_castps_si256(v); } // bit-preserving cast
+    __forceinline operator const __m256d() const { return _mm256_castps_pd(v); } // bit-preserving cast
+
+    __forceinline vboolf(int a) // builds the vector from an 8-bit lane mask
+    {
+      assert(a >= 0 && a <= 255);
+#if defined (__AVX2__)
+      const __m256i mask = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1); // highest lane listed first: lane k holds 1 << k
+      const __m256i b = _mm256_set1_epi32(a);
+      const __m256i c = _mm256_and_si256(b,mask);
+      v = _mm256_castsi256_ps(_mm256_cmpeq_epi32(c,mask)); // lane k becomes all-ones iff bit k of a is set
+#else
+      vl = mm_lookupmask_ps[a & 0xF]; // low nibble selects the low half
+      vh = mm_lookupmask_ps[a >> 4]; // high nibble selects the high half
+#endif
+    }
+
+    __forceinline vboolf(const vboolf4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} // a copied into both halves
+    __forceinline vboolf(const vboolf4& a, const vboolf4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} // a = low half, b = high half
+    __forceinline vboolf(__m128 a, __m128 b) : vl(a), vh(b) {}
+
+    __forceinline vboolf(bool a) : v(vboolf8(vboolf4(a), vboolf4(a))) {} // broadcast a to all 8 lanes
+    __forceinline vboolf(bool a, bool b) : v(vboolf8(vboolf4(a), vboolf4(b))) {} // low half from a, high half from b
+    __forceinline vboolf(bool a, bool b, bool c, bool d) : v(vboolf8(vboolf4(a,b), vboolf4(c,d))) {} // low half from (a,b), high half from (c,d)
+    __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) : v(vboolf8(vboolf4(a,b,c,d), vboolf4(e,f,g,h))) {} // low half from (a,b,c,d), high half from (e,f,g,h)
+
+    /* return int32 mask (the same bits reinterpreted as __m256i) */
+    __forceinline __m256i mask32() const { 
+      return _mm256_castps_si256(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(_mm256_setzero_ps()) {}
+#if !defined(__aarch64__)
+    __forceinline vboolf(TrueTy)  : v(_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_OQ)) {} // 0 == 0 in every lane
+#else
+    __forceinline vboolf(TrueTy)  : v(_mm256_cmpeq_ps(_mm256_setzero_ps(), _mm256_setzero_ps())) {} // aarch64 build uses _mm256_cmpeq_ps instead
+#endif
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const { assert(index < 8); return (_mm256_movemask_ps(v) >> index) & 1; } // sign bit of lane 'index'
+    __forceinline int& operator [](size_t index)       { assert(index < 8); return i[index]; } // writable 32-bit view of lane 'index'
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !(const vboolf8& a) { return _mm256_xor_ps(a, vboolf8(embree::True)); } // XOR with the True constant flips every bit
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm256_and_ps(a, b); }
+  __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm256_or_ps (a, b); }
+  __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); }
+
+  __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm256_andnot_ps(b, a); } // a & ~b (_mm256_andnot_ps negates its first operand)
+
+  __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; }
+  __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; }
+  __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(a, b); } // bitwise XOR
+  __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm256_xor_ps(_mm256_xor_ps(a,b),vboolf8(embree::True)); } // bitwise XNOR: XOR, then invert
+
+  __forceinline vboolf8 select(const vboolf8& mask, const vboolf8& t, const vboolf8& f) {
+    return _mm256_blendv_ps(f, t, mask); // per lane: t where the sign bit of mask is set, otherwise f
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 unpacklo(const vboolf8& a, const vboolf8& b) { return _mm256_unpacklo_ps(a, b); } // interleave, applied within each 128-bit half
+  __forceinline vboolf8 unpackhi(const vboolf8& a, const vboolf8& b) { return _mm256_unpackhi_ps(a, b); } // interleave, applied within each 128-bit half
+
+  template<int i>
+  __forceinline vboolf8 shuffle(const vboolf8& v) {
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i)); // broadcast lane i within each 128-bit half
+  }
+
+  template<int i0, int i1>
+  __forceinline vboolf8 shuffle4(const vboolf8& v) {
+    return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0)); // moves whole 128-bit halves: i0 picks the low result half, i1 the high
+  }
+
+  template<int i0, int i1>
+  __forceinline vboolf8 shuffle4(const vboolf8& a, const vboolf8& b) {
+    return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0)); // selector 0/1 = low/high half of a, 2/3 = low/high half of b
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf8 shuffle(const vboolf8& v) {
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0)); // same 4-lane permutation applied to each 128-bit half
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vboolf8 shuffle(const vboolf8& a, const vboolf8& b) {
+    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); // per half: <a[i0], a[i1], b[i2], b[i3]>
+  }
+
+  template<> __forceinline vboolf8 shuffle<0, 0, 2, 2>(const vboolf8& v) { return _mm256_moveldup_ps(v); } // single-intrinsic form of this pattern
+  template<> __forceinline vboolf8 shuffle<1, 1, 3, 3>(const vboolf8& v) { return _mm256_movehdup_ps(v); } // single-intrinsic form of this pattern
+  template<> __forceinline vboolf8 shuffle<0, 1, 0, 1>(const vboolf8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); } // duplicates the low 64 bits of each half
+
+  template<int i> __forceinline vboolf8 insert4(const vboolf8& a, const vboolf4& b) { return _mm256_insertf128_ps(a, b, i); } // replaces 128-bit half i of a with b
+  template<int i> __forceinline vboolf4 extract4   (const vboolf8& a) { return _mm256_extractf128_ps(a, i); } // returns 128-bit half i
+  template<>      __forceinline vboolf4 extract4<0>(const vboolf8& a) { return _mm256_castps256_ps128(a);   } // the low half needs only a cast
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool reduce_and(const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } // all eight sign bits set
+  __forceinline bool reduce_or (const vboolf8& a) { return !_mm256_testz_ps(a,a); } // testz yields 1 only when no sign bit is set
+
+  __forceinline bool all (const vboolf8& a) { return _mm256_movemask_ps(a) == (unsigned int)0xff; } // same test as reduce_and
+  __forceinline bool any (const vboolf8& a) { return !_mm256_testz_ps(a,a); } // same test as reduce_or
+  __forceinline bool none(const vboolf8& a) { return _mm256_testz_ps(a,a) != 0; } // no sign bit set
+
+  __forceinline bool all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } // lanes outside 'valid' count as true
+  __forceinline bool any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } // only lanes inside 'valid' are considered
+  __forceinline bool none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } // only lanes inside 'valid' are considered
+
+  __forceinline unsigned int movemask(const vboolf8& a) { return _mm256_movemask_ps(a); } // one bit per lane, taken from the lane sign bits
+  __forceinline size_t       popcnt  (const vboolf8& a) { return popcnt((size_t)_mm256_movemask_ps(a)); } // popcnt() of the 8-bit lane mask
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf8& a, size_t index) { return a[index]; } // const operator[]: lane as bool
+  __forceinline void set(vboolf8& a, size_t index)       { a[index] = -1; } // -1 sets all 32 bits of the lane
+  __forceinline void clear(vboolf8& a, size_t index)     { a[index] =  0; } // zeroes all 32 bits of the lane
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) { // a is const, so a[k] yields each lane as bool
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", "
+                       << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h b/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h
new file mode 100644
index 0000000000..2a52b554c7
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vboolf8_avx512.h
@@ -0,0 +1,143 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX-512 bool type: one bit per lane, stored in an 8-bit mask */
+  template<>
+  struct vboolf<8>
+  {
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+
+    enum { size = 8 }; // number of SIMD elements
+    __mmask8 v;        // data: bit i is lane i (see operator[])
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf() {} // v is left uninitialized
+    __forceinline vboolf(const vboolf8& t) { v = t.v; }
+    __forceinline vboolf8& operator =(const vboolf8& f) { v = f.v; return *this; }
+
+    __forceinline vboolf(const __mmask8 &t) { v = t; }
+    __forceinline operator __mmask8() const { return v; }
+
+    __forceinline vboolf(bool b) { v = b ? 0xff : 0x00; } // broadcast b to all 8 lanes
+    __forceinline vboolf(int t)  { v = (__mmask8)t; } // t is truncated to its low 8 bits
+    __forceinline vboolf(unsigned int t) { v = (__mmask8)t; } // t is truncated to its low 8 bits
+
+    __forceinline vboolf(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h)
+      : v((__mmask8)((int(h) << 7) | (int(g) << 6) | (int(f) << 5) | (int(e) << 4) | (int(d) << 3) | (int(c) << 2) | (int(b) << 1) | int(a))) {} // a -> bit 0 up to h -> bit 7
+
+    /* return int8 mask (bit mask expanded with _mm_movm_epi8) */
+    __forceinline __m128i mask8() const {
+      return _mm_movm_epi8(v);
+    }
+
+    /* return int32 mask (bit mask expanded with _mm256_movm_epi32) */
+    __forceinline __m256i mask32() const {
+      return _mm256_movm_epi32(v);
+    }
+
+    /* return int64 mask (bit mask expanded with _mm512_movm_epi64) */
+    __forceinline __m512i mask64() const {
+      return _mm512_movm_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vboolf(FalseTy) : v(0x00) {}
+    __forceinline vboolf(TrueTy)  : v(0xff) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline bool operator [](size_t index) const {
+      assert(index < 8); return (mm512_mask2int(v) >> index) & 1; // read-only: extracts bit 'index'
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !(const vboolf8& a) { return _mm512_knot(a); } // bitwise NOT; the result is narrowed back to the 8-bit mask
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator &(const vboolf8& a, const vboolf8& b) { return _mm512_kand(a, b); }
+  __forceinline vboolf8 operator |(const vboolf8& a, const vboolf8& b) { return _mm512_kor(a, b); }
+  __forceinline vboolf8 operator ^(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); }
+
+  __forceinline vboolf8 andn(const vboolf8& a, const vboolf8& b) { return _mm512_kandn(b, a); } // a & ~b (kandn negates its first operand)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators (each one reuses the matching binary operator)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8& operator &=(vboolf8& a, const vboolf8& b) { return a = a & b; }
+  __forceinline vboolf8& operator |=(vboolf8& a, const vboolf8& b) { return a = a | b; }
+  __forceinline vboolf8& operator ^=(vboolf8& a, const vboolf8& b) { return a = a ^ b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator !=(const vboolf8& a, const vboolf8& b) { return _mm512_kxor(a, b); } // lane-wise XOR
+  __forceinline vboolf8 operator ==(const vboolf8& a, const vboolf8& b) { return _mm512_kxnor(a, b); } // lane-wise XNOR; the result is narrowed back to the 8-bit mask
+
+  __forceinline vboolf8 select(const vboolf8& s, const vboolf8& a, const vboolf8& b) {
+    return _mm512_kor(_mm512_kand(s, a), _mm512_kandn(s, b)); // (s & a) | (~s & b): a where s is set, b elsewhere
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reduction Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline int all (const vboolf8& a) { return a.v == 0xff; } // all 8 lane bits set
+  __forceinline int any (const vboolf8& a) { return _mm512_kortestz(a, a) == 0; } // kortestz yields 1 only for an all-zero mask
+  __forceinline int none(const vboolf8& a) { return _mm512_kortestz(a, a) != 0; } // true when the mask is all zero
+
+  __forceinline int all (const vboolf8& valid, const vboolf8& b) { return all((!valid) | b); } // lanes outside 'valid' count as true
+  __forceinline int any (const vboolf8& valid, const vboolf8& b) { return any(valid & b); } // only lanes inside 'valid' are considered
+  __forceinline int none(const vboolf8& valid, const vboolf8& b) { return none(valid & b); } // only lanes inside 'valid' are considered
+
+  __forceinline size_t movemask(const vboolf8& a) { return _mm512_kmov(a); } // mask value returned as size_t
+  __forceinline size_t popcnt  (const vboolf8& a) { return popcnt(a.v); } // delegates to popcnt() on the raw mask a.v
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conversion Operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline unsigned int toInt(const vboolf8& a) { return mm512_mask2int(a); } // mask -> unsigned int via mm512_mask2int
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Get/Set Functions (single-lane access by bit index)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline bool get(const vboolf8& a, size_t index) { assert(index < 8); return (toInt(a) >> index) & 1; } // reads bit 'index'
+  __forceinline void set(vboolf8& a, size_t index)       { assert(index < 8); a |= 1 << index; } // ORs in a mask holding only bit 'index'
+  __forceinline void clear(vboolf8& a, size_t index)     { assert(index < 8); a = andn(a, 1 << index); } // a & ~(1 << index)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vboolf8& a) // writes "<", the 8 lane bits, then ">"
+  {
+    cout << "<";
+    for (size_t i=0; i<8; i++) { // lane 0 is written first
+      if ((a.v >> i) & 1) cout << "1"; else cout << "0";
+    }
+    return cout << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h b/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h
new file mode 100644
index 0000000000..1f65b45d7e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vdouble4_avx.h
@@ -0,0 +1,324 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{ 
+  /* 4-wide AVX 64-bit double type */
+  template<>
+  struct vdouble<4>
+  {
+    ALIGNED_STRUCT_(32);
+
+    typedef vboold4 Bool;
+
+    enum  { size = 4 }; // lane count
+    union {             // one 256-bit payload, viewable as a register or as four doubles
+      __m256d v;
+      double i[4];
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vdouble() {} // performs no initialization of v
+    __forceinline vdouble(const vdouble4& t) : v(t.v) {}
+    __forceinline vdouble4& operator =(const vdouble4& f) { this->v = f.v; return *this; }
+
+    __forceinline vdouble(const __m256d& t) : v(t) {}
+    __forceinline operator __m256d() const { return this->v; }
+
+    __forceinline vdouble(double i) // broadcast one scalar to every lane
+      : v(_mm256_set1_pd(i))
+    {}
+
+    __forceinline vdouble(double a, double b, double c, double d) // a ends up in lane 0, d in lane 3
+      : v(_mm256_set_pd(d,c,b,a))
+    {}
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vdouble(ZeroTy) { v = _mm256_setzero_pd(); }
+    __forceinline vdouble(OneTy)  { v = _mm256_set1_pd(1); }
+    __forceinline vdouble(StepTy) { v = _mm256_setr_pd(0.0,1.0,2.0,3.0); }        // lanes 0..3 hold 0,1,2,3
+    __forceinline vdouble(ReverseStepTy) { v = _mm256_set_pd(0.0,1.0,2.0,3.0); }  // lanes 0..3 hold 3,2,1,0
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(double *__restrict__ ptr, const vdouble4& a) {
+      _mm256_stream_pd(ptr, a.v); // streaming store intrinsic
+    }
+
+    static __forceinline vdouble4 loadu(const double* addr) {
+      return vdouble4(_mm256_loadu_pd(addr)); // unaligned-load intrinsic
+    }
+
+    static __forceinline vdouble4 load(const vdouble4* addr) {
+      return vdouble4(_mm256_load_pd((double*)addr)); // reads the object through its double view
+    }
+
+    static __forceinline vdouble4 load(const double* addr) {
+      return vdouble4(_mm256_load_pd(addr)); // aligned-load intrinsic
+    }
+
+    static __forceinline void store(double* ptr, const vdouble4& v) {
+      _mm256_store_pd(ptr, v.v); // aligned-store intrinsic
+    }
+
+    static __forceinline void storeu(double* ptr, const vdouble4& v) {
+      _mm256_storeu_pd(ptr, v.v); // unaligned-store intrinsic
+    }
+
+    static __forceinline vdouble4 broadcast(const void* a) { const double s = *(double*)a; return vdouble4(_mm256_set1_pd(s)); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline       double& operator [](size_t index)       { assert(index < 4); return this->i[index]; }
+    __forceinline const double& operator [](size_t index) const { assert(index < 4); return this->i[index]; }
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  __forceinline vdouble4 asDouble(const vllong4&  a) { return vdouble4(_mm256_castsi256_pd(a)); } // cast intrinsic between register types
+  __forceinline vllong4  asLLong (const vdouble4& a) { return _mm256_castpd_si256(a.v); }
+#endif
+
+  __forceinline vdouble4 operator +(const vdouble4& a) { return vdouble4(a.v); }
+  __forceinline vdouble4 operator -(const vdouble4& a) { const __m256d zero = _mm256_setzero_pd(); return vdouble4(_mm256_sub_pd(zero, a.v)); } // negation computed as 0 - a
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble4 operator +(const vdouble4& a, const vdouble4& b) { return vdouble4(_mm256_add_pd(a.v, b.v)); }
+  __forceinline vdouble4 operator +(const vdouble4& a, double          b) { return vdouble4(_mm256_add_pd(a.v, _mm256_set1_pd(b))); } // scalar operand is broadcast first
+  __forceinline vdouble4 operator +(double          a, const vdouble4& b) { return vdouble4(_mm256_add_pd(_mm256_set1_pd(a), b.v)); }
+
+  __forceinline vdouble4 operator -(const vdouble4& a, const vdouble4& b) { return vdouble4(_mm256_sub_pd(a.v, b.v)); }
+  __forceinline vdouble4 operator -(const vdouble4& a, double          b) { return vdouble4(_mm256_sub_pd(a.v, _mm256_set1_pd(b))); }
+  __forceinline vdouble4 operator -(double          a, const vdouble4& b) { return vdouble4(_mm256_sub_pd(_mm256_set1_pd(a), b.v)); }
+
+  __forceinline vdouble4 operator *(const vdouble4& a, const vdouble4& b) { return vdouble4(_mm256_mul_pd(a.v, b.v)); }
+  __forceinline vdouble4 operator *(const vdouble4& a, double          b) { return vdouble4(_mm256_mul_pd(a.v, _mm256_set1_pd(b))); }
+  __forceinline vdouble4 operator *(double          a, const vdouble4& b) { return vdouble4(_mm256_mul_pd(_mm256_set1_pd(a), b.v)); }
+
+  __forceinline vdouble4 operator &(const vdouble4& a, const vdouble4& b) { return vdouble4(_mm256_and_pd(a.v, b.v)); }
+  __forceinline vdouble4 operator &(const vdouble4& a, double          b) { return vdouble4(_mm256_and_pd(a.v, _mm256_set1_pd(b))); } // scalar operand is broadcast first
+  __forceinline vdouble4 operator &(double          a, const vdouble4& b) { return vdouble4(_mm256_and_pd(_mm256_set1_pd(a), b.v)); }
+
+  __forceinline vdouble4 operator |(const vdouble4& a, const vdouble4& b) { return vdouble4(_mm256_or_pd(a.v, b.v)); }
+  __forceinline vdouble4 operator |(const vdouble4& a, double          b) { return vdouble4(_mm256_or_pd(a.v, _mm256_set1_pd(b))); }
+  __forceinline vdouble4 operator |(double          a, const vdouble4& b) { return vdouble4(_mm256_or_pd(_mm256_set1_pd(a), b.v)); }
+
+  __forceinline vdouble4 operator ^(const vdouble4& a, const vdouble4& b) { return vdouble4(_mm256_xor_pd(a.v, b.v)); }
+  __forceinline vdouble4 operator ^(const vdouble4& a, double          b) { return vdouble4(_mm256_xor_pd(a.v, _mm256_set1_pd(b))); }
+  __forceinline vdouble4 operator ^(double          a, const vdouble4& b) { return vdouble4(_mm256_xor_pd(_mm256_set1_pd(a), b.v)); }
+  
+  __forceinline vdouble4 min(const vdouble4& a, const vdouble4& b) { return vdouble4(_mm256_min_pd(a.v, b.v)); }
+  __forceinline vdouble4 min(const vdouble4& a, double          b) { return vdouble4(_mm256_min_pd(a.v, _mm256_set1_pd(b))); } // operand order kept as (a, b)
+  __forceinline vdouble4 min(double          a, const vdouble4& b) { return vdouble4(_mm256_min_pd(_mm256_set1_pd(a), b.v)); }
+
+  __forceinline vdouble4 max(const vdouble4& a, const vdouble4& b) { return vdouble4(_mm256_max_pd(a.v, b.v)); }
+  __forceinline vdouble4 max(const vdouble4& a, double          b) { return vdouble4(_mm256_max_pd(a.v, _mm256_set1_pd(b))); } // operand order kept as (a, b)
+  __forceinline vdouble4 max(double          a, const vdouble4& b) { return vdouble4(_mm256_max_pd(_mm256_set1_pd(a), b.v)); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__FMA__) // FMA builds: one intrinsic per operation
+  __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmadd_pd(a,b,c); }
+  __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fmsub_pd(a,b,c); }
+  __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmadd_pd(a,b,c); }
+  __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return _mm256_fnmsub_pd(a,b,c); }
+#else // fallback: separate multiply and add/subtract built from the operators of this header
+  __forceinline vdouble4 madd (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b+c; }  // a*b + c
+  __forceinline vdouble4 msub (const vdouble4& a, const vdouble4& b, const vdouble4& c) { return a*b-c; }  // a*b - c
+  __forceinline vdouble4 nmadd(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b+c;}  // parses as ((-a)*b) + c
+  __forceinline vdouble4 nmsub(const vdouble4& a, const vdouble4& b, const vdouble4& c) { return -a*b-c; } // parses as ((-a)*b) - c
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble4& operator +=(vdouble4& a, const vdouble4& b) { a = a + b; return a; } // each compound form reuses the matching binary operator
+  __forceinline vdouble4& operator +=(vdouble4& a, double          b) { a = a + b; return a; }
+
+  __forceinline vdouble4& operator -=(vdouble4& a, const vdouble4& b) { a = a - b; return a; }
+  __forceinline vdouble4& operator -=(vdouble4& a, double          b) { a = a - b; return a; }
+
+  __forceinline vdouble4& operator *=(vdouble4& a, const vdouble4& b) { a = a * b; return a; }
+  __forceinline vdouble4& operator *=(vdouble4& a, double          b) { a = a * b; return a; }
+
+  __forceinline vdouble4& operator &=(vdouble4& a, const vdouble4& b) { a = a & b; return a; }
+  __forceinline vdouble4& operator &=(vdouble4& a, double          b) { a = a & b; return a; }
+
+  __forceinline vdouble4& operator |=(vdouble4& a, const vdouble4& b) { a = a | b; return a; }
+  __forceinline vdouble4& operator |=(vdouble4& a, double          b) { a = a | b; return a; }
+  
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__) // AVX-512VL builds: _mm256_cmp_pd_mask with _MM_CMPINT_* predicates
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_EQ); }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_NE); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LT); }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GE); }
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_GT); }
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd_mask(a, b, _MM_CMPINT_LE); }
+#elif !defined(__aarch64__) // other non-aarch64 builds: _mm256_cmp_pd with _CMP_* predicates
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);  }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS);  }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); } // >= is spelled as the "not less-than" predicate
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); } // > is spelled as the "not less-or-equal" predicate
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS);  }
+#else // __aarch64__ builds: one named helper per comparison; these are not declared in this file, presumably supplied by the port's emulation layer (confirm)
+  __forceinline vboold4 operator ==(const vdouble4& a, const vdouble4& b) { return _mm256_cmpeq_pd(a, b);  }
+  __forceinline vboold4 operator !=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpneq_pd(a, b); }
+  __forceinline vboold4 operator < (const vdouble4& a, const vdouble4& b) { return _mm256_cmplt_pd(a, b);  }
+  __forceinline vboold4 operator >=(const vdouble4& a, const vdouble4& b) { return _mm256_cmpnlt_pd(a, b); }
+  __forceinline vboold4 operator > (const vdouble4& a, const vdouble4& b) { return _mm256_cmpnle_pd(a, b); }
+  __forceinline vboold4 operator <=(const vdouble4& a, const vdouble4& b) { return _mm256_cmple_pd(a, b);  }
+#endif
+
+  __forceinline vboold4 operator ==(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a == rhs; } // scalar side is broadcast, then the vector comparison is used
+  __forceinline vboold4 operator ==(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs == b; }
+
+  __forceinline vboold4 operator !=(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a != rhs; }
+  __forceinline vboold4 operator !=(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs != b; }
+
+  __forceinline vboold4 operator < (const vdouble4& a, double          b) { const vdouble4 rhs(b); return a <  rhs; }
+  __forceinline vboold4 operator < (double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs <  b; }
+
+  __forceinline vboold4 operator >=(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a >= rhs; }
+  __forceinline vboold4 operator >=(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs >= b; }
+
+  __forceinline vboold4 operator > (const vdouble4& a, double          b) { const vdouble4 rhs(b); return a >  rhs; }
+  __forceinline vboold4 operator > (double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs >  b; }
+
+  __forceinline vboold4 operator <=(const vdouble4& a, double          b) { const vdouble4 rhs(b); return a <= rhs; }
+  __forceinline vboold4 operator <=(double          a, const vdouble4& b) { const vdouble4 lhs(a); return lhs <= b; }
+
+  __forceinline vboold4 eq(const vdouble4& a, const vdouble4& b) { const vboold4 result = (a == b); return result; } // named aliases of the comparison operators
+  __forceinline vboold4 ne(const vdouble4& a, const vdouble4& b) { const vboold4 result = (a != b); return result; }
+  __forceinline vboold4 lt(const vdouble4& a, const vdouble4& b) { const vboold4 result = (a <  b); return result; }
+  __forceinline vboold4 ge(const vdouble4& a, const vdouble4& b) { const vboold4 result = (a >= b); return result; }
+  __forceinline vboold4 gt(const vdouble4& a, const vdouble4& b) { const vboold4 result = (a >  b); return result; }
+  __forceinline vboold4 le(const vdouble4& a, const vdouble4& b) { const vboold4 result = (a <= b); return result; }
+
+#if defined(__AVX512VL__) // AVX-512VL builds: the incoming mask is handed to the masked compare intrinsic as its first argument
+  __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return _mm256_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE); }
+#else // fallback: AND the incoming mask with the result of the unmasked comparison operator
+  __forceinline vboold4 eq(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a == b); }
+  __forceinline vboold4 ne(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a != b); }
+  __forceinline vboold4 lt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <  b); }
+  __forceinline vboold4 ge(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >= b); }
+  __forceinline vboold4 gt(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a >  b); }
+  __forceinline vboold4 le(const vboold4& mask, const vdouble4& a, const vdouble4& b) { return mask & (a <= b); }
+#endif
+ 
+  __forceinline vdouble4 select(const vboold4& m, const vdouble4& t, const vdouble4& f) { // blend of t and f under mask m
+#if !defined(__AVX512VL__)
+    return _mm256_blendv_pd(f, t, m);
+#else
+    return _mm256_mask_blend_pd(m, f, t);
+#endif
+  }
+
+  __forceinline void xchg(const vboold4& m, vdouble4& a, vdouble4& b) {
+    const vdouble4 old_a = a, old_b = b; a = select(m,old_b,old_a); b = select(m,old_a,old_b); // both selects read the values a and b held on entry
+  }
+
+  __forceinline vboold4 test(const vdouble4& a, const vdouble4& b) { // integer test intrinsic applied to the bit patterns of a and b
+#if defined(__AVX512VL__)
+    return _mm256_test_epi64_mask(_mm256_castpd_si256(a),_mm256_castpd_si256(b));
+#else
+    return _mm256_testz_si256(_mm256_castpd_si256(a),_mm256_castpd_si256(b)); // NOTE(review): testz appears to yield a single int for the whole register, unlike the per-lane mask of the branch above; confirm the conversion to vboold4 is intended
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i0, int i1>
+  __forceinline vdouble4 shuffle(const vdouble4& v) {
+    enum { sel = (i1 << 3) | (i0 << 2) | (i1 << 1) | i0 }; return vdouble4(_mm256_permute_pd(v.v, sel)); // the (i0, i1) bit pair is repeated in bits 0-1 and bits 2-3 of the immediate
+  }
+
+  template<int i>
+  __forceinline vdouble4 shuffle(const vdouble4& v) {
+    return shuffle<i, i>(v); // single-index form forwards the same index twice
+  }
+
+  template<int i0, int i1>
+  __forceinline vdouble4 shuffle2(const vdouble4& v) {
+    enum { sel = (i1 << 4) | i0 }; return vdouble4(_mm256_permute2f128_pd(v.v, v.v, sel)); // v is passed as both sources
+  }
+
+  __forceinline double toScalar(const vdouble4& v) {
+    const __m128d low = _mm256_castpd256_pd128(v.v); return _mm_cvtsd_f64(low); // scalar taken from the 128-bit cast of v
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble4 vreduce_min2(const vdouble4& x) { const vdouble4 swapped = shuffle<1,0>(x); return min(x, swapped); } // *2 variants combine x with shuffle<1,0>(x)
+  __forceinline vdouble4 vreduce_min (const vdouble4& y) { const vdouble4 pairs = vreduce_min2(y); const vdouble4 crossed = shuffle2<1,0>(pairs); return min(pairs, crossed); } // then combine with shuffle2<1,0>
+
+  __forceinline vdouble4 vreduce_max2(const vdouble4& x) { const vdouble4 swapped = shuffle<1,0>(x); return max(x, swapped); }
+  __forceinline vdouble4 vreduce_max (const vdouble4& y) { const vdouble4 pairs = vreduce_max2(y); const vdouble4 crossed = shuffle2<1,0>(pairs); return max(pairs, crossed); }
+
+  __forceinline vdouble4 vreduce_and2(const vdouble4& x) { const vdouble4 swapped = shuffle<1,0>(x); return x & swapped; }
+  __forceinline vdouble4 vreduce_and (const vdouble4& y) { const vdouble4 pairs = vreduce_and2(y); const vdouble4 crossed = shuffle2<1,0>(pairs); return pairs & crossed; }
+
+  __forceinline vdouble4 vreduce_or2(const vdouble4& x) { const vdouble4 swapped = shuffle<1,0>(x); return x | swapped; }
+  __forceinline vdouble4 vreduce_or (const vdouble4& y) { const vdouble4 pairs = vreduce_or2(y); const vdouble4 crossed = shuffle2<1,0>(pairs); return pairs | crossed; }
+
+  __forceinline vdouble4 vreduce_add2(const vdouble4& x) { const vdouble4 swapped = shuffle<1,0>(x); return x + swapped; }
+  __forceinline vdouble4 vreduce_add (const vdouble4& y) { const vdouble4 pairs = vreduce_add2(y); const vdouble4 crossed = shuffle2<1,0>(pairs); return pairs + crossed; }
+
+  __forceinline double reduce_add(const vdouble4& a) { const vdouble4 total = vreduce_add(a); return toScalar(total); } // scalar forms read the result through toScalar
+  __forceinline double reduce_min(const vdouble4& a) { const vdouble4 lowest = vreduce_min(a); return toScalar(lowest); }
+  __forceinline double reduce_max(const vdouble4& a) { const vdouble4 highest = vreduce_max(a); return toScalar(highest); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Memory load and store operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble4& v)
+  {
+    size_t lane = 0;
+    cout << "<" << v[lane]; // first lane is printed without a separator
+    while (++lane < 4) cout << ", " << v[lane];
+    return cout << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h b/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h
new file mode 100644
index 0000000000..4eec7d2f6a
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vdouble8_avx512.h
@@ -0,0 +1,356 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX-512 64-bit double type */
+  template<>
+  struct vdouble<8>
+  {
+    ALIGNED_STRUCT_(64);
+
+    typedef vboold8 Bool;
+
+    enum  { size = 8 }; // lane count
+    union {              // one 512-bit payload, viewable as a register or as eight doubles
+      __m512d v;
+      double i[8];
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vdouble() {} // performs no initialization of v
+    __forceinline vdouble(const vdouble8& t) : v(t.v) {}
+    __forceinline vdouble8& operator =(const vdouble8& f) { this->v = f.v; return *this; }
+
+    __forceinline vdouble(const __m512d& t) : v(t) {}
+    __forceinline operator __m512d() const { return this->v; }
+    __forceinline operator __m256d() const { return _mm512_castpd512_pd256(this->v); } // 512-to-256 cast intrinsic
+
+    __forceinline vdouble(double i) // broadcast one scalar to every lane
+      : v(_mm512_set1_pd(i))
+    {}
+
+    __forceinline vdouble(double a, double b, double c, double d) // four values handed to _mm512_set4_pd in reverse order
+      : v(_mm512_set4_pd(d,c,b,a))
+    {}
+
+    __forceinline vdouble(double a0, double a1, double a2, double a3,
+                          double a4, double a5, double a6, double a7)
+      : v(_mm512_setr_pd(a0,a1,a2,a3,a4,a5,a6,a7)) // a0 ends up in lane 0, a7 in lane 7
+    {
+    }
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vdouble(ZeroTy) { v = _mm512_setzero_pd(); }
+    __forceinline vdouble(OneTy)  { v = _mm512_set1_pd(1); }
+    __forceinline vdouble(StepTy) { v = _mm512_setr_pd(0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0); }        // lanes 0..7 hold 0..7
+    __forceinline vdouble(ReverseStepTy) { v = _mm512_set_pd(0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0); }  // lanes 0..7 hold 7..0
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) {
+      _mm512_stream_pd((double*)ptr, a.v); // streaming store intrinsic
+    }
+
+    static __forceinline vdouble8 loadu(const void* addr) {
+      return vdouble8(_mm512_loadu_pd((double*)addr)); // unaligned-load intrinsic
+    }
+
+    static __forceinline vdouble8 load(const vdouble8* addr) {
+      return vdouble8(_mm512_load_pd((double*)addr)); // reads the object through its double view
+    }
+
+    static __forceinline vdouble8 load(const double* addr) {
+      return vdouble8(_mm512_load_pd(addr)); // aligned-load intrinsic
+    }
+
+    static __forceinline void store(void* ptr, const vdouble8& v) {
+      _mm512_store_pd(ptr, v.v); // aligned-store intrinsic
+    }
+
+    static __forceinline void storeu(void* ptr, const vdouble8& v) {
+      _mm512_storeu_pd(ptr, v.v); // unaligned-store intrinsic
+    }
+
+    static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) {
+      _mm512_mask_storeu_pd(ptr, mask, f.v); // masked form of the unaligned store
+    }
+
+    static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) {
+      _mm512_mask_store_pd(addr, mask, v2.v); // masked form of the aligned store
+    }
+
+    /* the mask is taken by value (not by reference) to avoid the compiler generating inefficient code */
+    static __forceinline void storeu_compact(const vboold8 mask,void * addr, const vdouble8& reg) {
+      _mm512_mask_compressstoreu_pd(addr, mask, reg.v);
+    }
+
+    static __forceinline vdouble8 compact64bit(const vboold8& mask, vdouble8& v) {
+      return vdouble8(_mm512_mask_compress_pd(v.v, mask, v.v)); // v serves as both pass-through source and data
+    }
+
+    static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) {
+      return vdouble8(_mm512_mask_compress_pd(v.v, mask, v.v)); // same call as compact64bit
+    }
+
+    static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) {
+      return vdouble8(_mm512_mask_compress_pd(a.v, mask, b.v)); // a is the first (source) argument, b the data argument
+    }
+
+    static __forceinline vdouble8 broadcast(const void* a) { const double s = *(double*)a; return vdouble8(_mm512_set1_pd(s)); }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline       double& operator [](size_t index)       { assert(index < 8); return this->i[index]; }
+    __forceinline const double& operator [](size_t index) const { assert(index < 8); return this->i[index]; }
+
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 asDouble(const vllong8&  a) { return vdouble8(_mm512_castsi512_pd(a)); } // cast intrinsic between register types
+  __forceinline vllong8  asLLong (const vdouble8& a) { return _mm512_castpd_si512(a.v); }
+
+  __forceinline vdouble8 operator +(const vdouble8& a) { return vdouble8(a.v); }
+  __forceinline vdouble8 operator -(const vdouble8& a) { const __m512d zero = _mm512_setzero_pd(); return vdouble8(_mm512_sub_pd(zero, a.v)); } // negation computed as 0 - a
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) { return vdouble8(_mm512_add_pd(a.v, b.v)); }
+  __forceinline vdouble8 operator +(const vdouble8& a, double          b) { return vdouble8(_mm512_add_pd(a.v, _mm512_set1_pd(b))); } // scalar operand is broadcast first
+  __forceinline vdouble8 operator +(double          a, const vdouble8& b) { return vdouble8(_mm512_add_pd(_mm512_set1_pd(a), b.v)); }
+
+  __forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) { return vdouble8(_mm512_sub_pd(a.v, b.v)); }
+  __forceinline vdouble8 operator -(const vdouble8& a, double          b) { return vdouble8(_mm512_sub_pd(a.v, _mm512_set1_pd(b))); }
+  __forceinline vdouble8 operator -(double          a, const vdouble8& b) { return vdouble8(_mm512_sub_pd(_mm512_set1_pd(a), b.v)); }
+
+  __forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) { return vdouble8(_mm512_mul_pd(a.v, b.v)); }
+  __forceinline vdouble8 operator *(const vdouble8& a, double          b) { return vdouble8(_mm512_mul_pd(a.v, _mm512_set1_pd(b))); }
+  __forceinline vdouble8 operator *(double          a, const vdouble8& b) { return vdouble8(_mm512_mul_pd(_mm512_set1_pd(a), b.v)); }
+
+  __forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) { return vdouble8(_mm512_and_pd(a.v, b.v)); }
+  __forceinline vdouble8 operator &(const vdouble8& a, double          b) { return vdouble8(_mm512_and_pd(a.v, _mm512_set1_pd(b))); } // scalar operand is broadcast first
+  __forceinline vdouble8 operator &(double          a, const vdouble8& b) { return vdouble8(_mm512_and_pd(_mm512_set1_pd(a), b.v)); }
+
+  __forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) { return vdouble8(_mm512_or_pd(a.v, b.v)); }
+  __forceinline vdouble8 operator |(const vdouble8& a, double          b) { return vdouble8(_mm512_or_pd(a.v, _mm512_set1_pd(b))); }
+  __forceinline vdouble8 operator |(double          a, const vdouble8& b) { return vdouble8(_mm512_or_pd(_mm512_set1_pd(a), b.v)); }
+
+  __forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) { return vdouble8(_mm512_xor_pd(a.v, b.v)); }
+  __forceinline vdouble8 operator ^(const vdouble8& a, double          b) { return vdouble8(_mm512_xor_pd(a.v, _mm512_set1_pd(b))); }
+  __forceinline vdouble8 operator ^(double          a, const vdouble8& b) { return vdouble8(_mm512_xor_pd(_mm512_set1_pd(a), b.v)); }
+
+  __forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) { const __m512i bits = _mm512_castpd_si512(a.v); return vdouble8(_mm512_castsi512_pd(_mm512_slli_epi64(bits, n))); } // shifts act on the 64-bit integer view of each lane
+  __forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) { const __m512i bits = _mm512_castpd_si512(a.v); return vdouble8(_mm512_castsi512_pd(_mm512_srai_epi64(bits, n))); } // uses the srai (arithmetic) form
+
+  __forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) { const __m512i bits = _mm512_castpd_si512(a.v); return vdouble8(_mm512_castsi512_pd(_mm512_sllv_epi64(bits, n))); } // per-lane shift counts
+  __forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) { const __m512i bits = _mm512_castpd_si512(a.v); return vdouble8(_mm512_castsi512_pd(_mm512_srav_epi64(bits, n))); }
+
+  __forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) { return a << b; } // same intrinsic as operator <<
+  __forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) { return a >> b; } // same intrinsic as operator >>
+  __forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) { const __m512i bits = _mm512_castpd_si512(a.v); return vdouble8(_mm512_castsi512_pd(_mm512_srli_epi64(bits, b))); } // uses the srli (logical) form
+
+  __forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) { return vdouble8(_mm512_min_pd(a.v, b.v)); }
+  __forceinline vdouble8 min(const vdouble8& a, double          b) { return vdouble8(_mm512_min_pd(a.v, _mm512_set1_pd(b))); } // operand order kept as (a, b)
+  __forceinline vdouble8 min(double          a, const vdouble8& b) { return vdouble8(_mm512_min_pd(_mm512_set1_pd(a), b.v)); }
+
+  __forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) { return vdouble8(_mm512_max_pd(a.v, b.v)); }
+  __forceinline vdouble8 max(const vdouble8& a, double          b) { return vdouble8(_mm512_max_pd(a.v, _mm512_set1_pd(b))); } // operand order kept as (a, b)
+  __forceinline vdouble8 max(double          a, const vdouble8& b) { return vdouble8(_mm512_max_pd(_mm512_set1_pd(a), b.v)); }
+
+  __forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return vdouble8(_mm512_mask_add_pd(c.v, mask, a.v, b.v)); } // c is passed as the intrinsic's first (source) argument
+  __forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) { return vdouble8(_mm512_mask_sub_pd(c.v, mask, a.v, b.v)); }
+
+  __forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return vdouble8(_mm512_mask_and_pd(c.v, m, a.v, b.v)); }
+  __forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) { return vdouble8(_mm512_mask_or_pd(c.v, m, a.v, b.v)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return vdouble8(_mm512_fmadd_pd(a.v, b.v, c.v)); }  // one fused intrinsic per operation
+  __forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) { return vdouble8(_mm512_fmsub_pd(a.v, b.v, c.v)); }
+  __forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return vdouble8(_mm512_fnmadd_pd(a.v, b.v, c.v)); }
+  __forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) { return vdouble8(_mm512_fnmsub_pd(a.v, b.v, c.v)); }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) { a = a + b; return a; } // each compound form reuses the matching binary operator
+  __forceinline vdouble8& operator +=(vdouble8& a, double          b) { a = a + b; return a; }
+
+  __forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) { a = a - b; return a; }
+  __forceinline vdouble8& operator -=(vdouble8& a, double          b) { a = a - b; return a; }
+
+  __forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) { a = a * b; return a; }
+  __forceinline vdouble8& operator *=(vdouble8& a, double          b) { a = a * b; return a; }
+
+  __forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) { a = a & b; return a; }
+  __forceinline vdouble8& operator &=(vdouble8& a, double          b) { a = a & b; return a; }
+
+  __forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) { a = a | b; return a; }
+  __forceinline vdouble8& operator |=(vdouble8& a, double          b) { a = a | b; return a; }
+
+  __forceinline vdouble8& operator <<=(vdouble8& a, const double b) { a = a << b; return a; } // the shift count arrives as a double and is converted implicitly by the call to operator <<
+  __forceinline vdouble8& operator >>=(vdouble8& a, const double b) { a = a >> b; return a; } // the shift count arrives as a double and is converted implicitly by the call to operator >>
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.v, b.v, _MM_CMPINT_EQ); }
+  __forceinline vboold8 operator ==(const vdouble8& a, double          b) { return _mm512_cmp_pd_mask(a.v, _mm512_set1_pd(b), _MM_CMPINT_EQ); } // scalar side is broadcast first
+  __forceinline vboold8 operator ==(double          a, const vdouble8& b) { return _mm512_cmp_pd_mask(_mm512_set1_pd(a), b.v, _MM_CMPINT_EQ); }
+
+  __forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.v, b.v, _MM_CMPINT_NE); }
+  __forceinline vboold8 operator !=(const vdouble8& a, double          b) { return _mm512_cmp_pd_mask(a.v, _mm512_set1_pd(b), _MM_CMPINT_NE); }
+  __forceinline vboold8 operator !=(double          a, const vdouble8& b) { return _mm512_cmp_pd_mask(_mm512_set1_pd(a), b.v, _MM_CMPINT_NE); }
+
+  __forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.v, b.v, _MM_CMPINT_LT); }
+  __forceinline vboold8 operator < (const vdouble8& a, double          b) { return _mm512_cmp_pd_mask(a.v, _mm512_set1_pd(b), _MM_CMPINT_LT); }
+  __forceinline vboold8 operator < (double          a, const vdouble8& b) { return _mm512_cmp_pd_mask(_mm512_set1_pd(a), b.v, _MM_CMPINT_LT); }
+
+  __forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.v, b.v, _MM_CMPINT_GE); }
+  __forceinline vboold8 operator >=(const vdouble8& a, double          b) { return _mm512_cmp_pd_mask(a.v, _mm512_set1_pd(b), _MM_CMPINT_GE); }
+  __forceinline vboold8 operator >=(double          a, const vdouble8& b) { return _mm512_cmp_pd_mask(_mm512_set1_pd(a), b.v, _MM_CMPINT_GE); }
+
+  __forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.v, b.v, _MM_CMPINT_GT); }
+  __forceinline vboold8 operator > (const vdouble8& a, double          b) { return _mm512_cmp_pd_mask(a.v, _mm512_set1_pd(b), _MM_CMPINT_GT); }
+  __forceinline vboold8 operator > (double          a, const vdouble8& b) { return _mm512_cmp_pd_mask(_mm512_set1_pd(a), b.v, _MM_CMPINT_GT); }
+
+  __forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) { return _mm512_cmp_pd_mask(a.v, b.v, _MM_CMPINT_LE); }
+  __forceinline vboold8 operator <=(const vdouble8& a, double          b) { return _mm512_cmp_pd_mask(a.v, _mm512_set1_pd(b), _MM_CMPINT_LE); }
+  __forceinline vboold8 operator <=(double          a, const vdouble8& b) { return _mm512_cmp_pd_mask(_mm512_set1_pd(a), b.v, _MM_CMPINT_LE); }
+
+  __forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return a == b; } // named aliases; each operator issues the same _mm512_cmp_pd_mask call
+  __forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return a != b; }
+  __forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return a <  b; }
+  __forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return a >= b; }
+  __forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return a >  b; }
+  __forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return a <= b; }
+
+  __forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) { return _mm512_mask_cmp_pd_mask(mask,a,b,_MM_CMPINT_LE); }
+
+  __forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) {
+    return _mm512_mask_blend_pd(m, f, t); // per lane: m ? t : f; plain AVX-512F blend, no AVX512DQ masked-or needed
+  }
+
+  __forceinline void xchg(const vboold8& m, vdouble8& a, vdouble8& b) {
+    const vdouble8 old_a = a; a = select(m,b,old_a); b = select(m,old_a,b); // swap a and b in the lanes where m is set
+  }
+
+  __forceinline vboold8 test(const vboold8& m, const vdouble8& a, const vdouble8& b) {
+    return _mm512_mask_test_epi64_mask(m,_mm512_castpd_si512(a),_mm512_castpd_si512(b)); // bit set where m is set and (a AND b), taken on the raw 64-bit patterns, is non-zero
+  }
+
+  __forceinline vboold8 test(const vdouble8& a, const vdouble8& b) {
+    return _mm512_test_epi64_mask(_mm512_castpd_si512(a),_mm512_castpd_si512(b)); // bit set where (a AND b), taken on the raw 64-bit patterns, is non-zero
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i0, int i1>
+  __forceinline vdouble8 shuffle(const vdouble8& v) {
+    return _mm512_permute_pd(v, (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) | (i1 << 3) | (i0 << 2) | (i1 << 1) | i0); // one selector bit per result element: even elements use i0, odd elements use i1
+  }
+
+  template<int i>
+  __forceinline vdouble8 shuffle(const vdouble8& v) {
+    return shuffle<i, i>(v);
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vdouble8 shuffle(const vdouble8& v) {
+    return _mm512_permutex_pd(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vdouble8 shuffle4(const vdouble8& v) {
+    return _mm512_shuffle_f64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2)); // i0/i1 name a group of 4 doubles; each is expanded to its two 128-bit lane indices
+  }
+
+  template<int i>
+  __forceinline vdouble8 shuffle4(const vdouble8& v) {
+    return shuffle4<i, i>(v);
+  }
+  
+  template<int i>
+  __forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) {
+    return _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(a), _mm512_castpd_si512(b), i));
+  }
+
+  __forceinline double toScalar(const vdouble8& v) {
+    return _mm_cvtsd_f64(_mm512_castpd512_pd128(v));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 vreduce_add2(vdouble8 x) {                      return x + shuffle<1,0,3,2>(x); }
+  __forceinline vdouble8 vreduce_add4(vdouble8 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); }
+  __forceinline vdouble8 vreduce_add (vdouble8 x) { x = vreduce_add4(x); return x + shuffle4<1,0>(x); }
+
+  __forceinline vdouble8 vreduce_min2(vdouble8 x) {                      return min(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vdouble8 vreduce_min4(vdouble8 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vdouble8 vreduce_min (vdouble8 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0>(x)); }
+
+  __forceinline vdouble8 vreduce_max2(vdouble8 x) {                      return max(x, shuffle<1,0,3,2>(x)); }
+  __forceinline vdouble8 vreduce_max4(vdouble8 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vdouble8 vreduce_max (vdouble8 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0>(x)); }
+
+  __forceinline double reduce_add(const vdouble8& v) { return toScalar(vreduce_add(v)); }
+  __forceinline double reduce_min(const vdouble8& v) { return toScalar(vreduce_min(v)); }
+  __forceinline double reduce_max(const vdouble8& v) { return toScalar(vreduce_max(v)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Permutation Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) {
+    return _mm512_permutexvar_pd(index, v);
+  }
+
+  __forceinline vdouble8 reverse(const vdouble8& a) {
+    return permute(a, vllong8(reverse_step));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v)
+  {
+    cout << "<" << v[0];                          // prints "<v0, v1, ..., v7>"
+    size_t lane = 1;
+    while (lane < 8) { cout << ", " << v[lane]; lane++; }
+    cout << ">"; return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h b/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h
new file mode 100644
index 0000000000..aed2419b77
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vfloat16_avx512.h
@@ -0,0 +1,771 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 16-wide AVX-512 float type */
+  template<>
+  struct vfloat<16>
+  {
+    ALIGNED_STRUCT_(64);
+    
+    typedef vboolf16 Bool;
+    typedef vint16   Int;
+    typedef vfloat16 Float;
+
+    enum  { size = 16 }; // number of SIMD elements
+    union {              // data
+      __m512 v; 
+      float f[16];       // per-lane float view, used by operator []
+      int i[16];         // the same storage viewed as 32-bit ints
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+        
+    __forceinline vfloat() {}
+    __forceinline vfloat(const vfloat16& t) { v = t; }
+    __forceinline vfloat16& operator =(const vfloat16& f) { v = f.v; return *this; }
+
+    __forceinline vfloat(const __m512& t) { v = t; }
+    __forceinline operator __m512() const { return v; }
+    __forceinline operator __m256() const { return _mm512_castps512_ps256(v); } // low 8 lanes
+    __forceinline operator __m128() const { return _mm512_castps512_ps128(v); } // low 4 lanes
+
+    __forceinline vfloat(float f) { // broadcast f to all 16 lanes
+      v = _mm512_set1_ps(f);
+    }
+
+    __forceinline vfloat(float a, float b, float c, float d) { // repeat a 4-float pattern across the register
+      v = _mm512_set4_ps(a, b, c, d); // NOTE(review): _mm512_set4_ps takes the highest element first, so lane 0 receives d, not a — confirm this order is intended
+    }
+
+    __forceinline vfloat(const vfloat4& i) { // replicate the 4 floats into every 128-bit lane
+      v = _mm512_broadcast_f32x4(i);
+    }
+
+    __forceinline vfloat(const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d) { // a,b,c,d fill 128-bit lanes 0,1,2,3
+      v = _mm512_castps128_ps512(a);
+      v = _mm512_insertf32x4(v, b, 1);
+      v = _mm512_insertf32x4(v, c, 2);
+      v = _mm512_insertf32x4(v, d, 3);
+    }
+
+    __forceinline vfloat(const vboolf16& mask, const vfloat4& a, const vfloat4& b) { // broadcast of b where mask is set, broadcast of a elsewhere
+      v = _mm512_broadcast_f32x4(a);
+      v = _mm512_mask_broadcast_f32x4(v,mask,b);
+    }
+
+    __forceinline vfloat(const vfloat8& i) { // replicate the 8 floats into both 256-bit halves
+      v = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(i)));
+    }
+
+    __forceinline vfloat(const vfloat8& a, const vfloat8& b) { // a fills the low 8 lanes, b the high 8 lanes
+      v = _mm512_castps256_ps512(a);
+#if defined(__AVX512DQ__)
+      v = _mm512_insertf32x8(v, b, 1);
+#else
+      v = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(v), _mm256_castps_pd(b), 1)); // same bits moved through the f64x4 form when AVX512DQ is not available
+#endif
+    }
+
+    /* WARNING: the f64x4 broadcast is masked per 64-bit element, so the mask is treated as an 8-bit mask */
+    __forceinline vfloat(const vboolf16& mask, const vfloat8& a, const vfloat8& b) {
+      __m512d aa = _mm512_broadcast_f64x4(_mm256_castps_pd(a));
+      aa = _mm512_mask_broadcast_f64x4(aa,mask,_mm256_castps_pd(b));
+      v = _mm512_castpd_ps(aa);
+    }
+    
+    __forceinline explicit vfloat(const vint16& a) { // numeric conversion from signed 32-bit ints
+      v = _mm512_cvtepi32_ps(a);
+    }
+
+    __forceinline explicit vfloat(const vuint16& a) { // numeric conversion from unsigned 32-bit ints
+      v = _mm512_cvtepu32_ps(a);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat(ZeroTy)   : v(_mm512_setzero_ps()) {}
+    __forceinline vfloat(OneTy)    : v(_mm512_set1_ps(1.0f)) {}
+    __forceinline vfloat(PosInfTy) : v(_mm512_set1_ps(pos_inf)) {}
+    __forceinline vfloat(NegInfTy) : v(_mm512_set1_ps(neg_inf)) {}
+    __forceinline vfloat(StepTy)   : v(_mm512_set_ps(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} // lane k holds float(k)
+    __forceinline vfloat(NaNTy)    : v(_mm512_set1_ps(nan)) {}
+    __forceinline vfloat(UndefinedTy) : v(_mm512_undefined_ps()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vfloat16 load (const void* ptr) { return _mm512_load_ps((float*)ptr);  } // aligned-load intrinsic: ptr must be 64-byte aligned
+    static __forceinline vfloat16 loadu(const void* ptr) { return _mm512_loadu_ps((float*)ptr); } // unaligned-load intrinsic
+
+    static __forceinline vfloat16 load (const vboolf16& mask, const void* ptr) { return _mm512_mask_load_ps (_mm512_setzero_ps(),mask,(float*)ptr); } // lanes not in mask are zero
+    static __forceinline vfloat16 loadu(const vboolf16& mask, const void* ptr) { return _mm512_mask_loadu_ps(_mm512_setzero_ps(),mask,(float*)ptr); } // lanes not in mask are zero
+
+    static __forceinline void store (void* ptr, const vfloat16& v) { _mm512_store_ps ((float*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vfloat16& v) { _mm512_storeu_ps((float*)ptr,v); }
+
+    static __forceinline void store (const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_store_ps ((float*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf16& mask, void* ptr, const vfloat16& v) { _mm512_mask_storeu_ps((float*)ptr,mask,v); }
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vfloat16& a) { // non-temporal (streaming) store
+      _mm512_stream_ps((float*)ptr,a);
+    }
+
+    static __forceinline vfloat16 broadcast(const float* f) { // load one float and replicate it to all lanes
+      return _mm512_set1_ps(*f);
+    }
+
+    static __forceinline vfloat16 compact(const vboolf16& mask, vfloat16 &v) { // pack the mask-selected lanes of v to the low end; remaining lanes keep v's values
+      return _mm512_mask_compress_ps(v, mask, v);
+    }
+    static __forceinline vfloat16 compact(const vboolf16& mask, vfloat16 &a, const vfloat16& b) { // pack the mask-selected lanes of b to the low end; remaining lanes come from a
+      return _mm512_mask_compress_ps(a, mask, b);
+    }
+
+    static __forceinline vfloat16 expand(const vboolf16& mask, const vfloat16& a, vfloat16& b) { // spread the low lanes of a into the mask-selected lanes; other lanes come from b
+      return _mm512_mask_expand_ps(b, mask, a);
+    }
+
+    static __forceinline vfloat16 loadu_compact(const vboolf16& mask, const void* ptr) { // contiguous floats at ptr are expanded into the mask-selected lanes; other lanes are zero
+      return _mm512_mask_expandloadu_ps(_mm512_setzero_ps(), mask, (float*)ptr);
+    }
+
+    static __forceinline void storeu_compact(const vboolf16& mask, float *addr, const vfloat16 reg) { // write only the mask-selected lanes, packed contiguously at addr
+      _mm512_mask_compressstoreu_ps(addr, mask, reg);
+    }
+    
+    static __forceinline void storeu_compact_single(const vboolf16& mask, float * addr, const vfloat16& reg) {
+      //_mm512_mask_compressstoreu_ps(addr,mask,reg);
+      *addr = mm512_cvtss_f32(_mm512_mask_compress_ps(reg, mask, reg)); // writes one float: the first mask-selected lane (presumably mm512_cvtss_f32 returns lane 0 — defined elsewhere, confirm)
+    }
+
+    template<int scale = 4>
+    static __forceinline vfloat16 gather(const float* ptr, const vint16& index) { // scale is the byte multiplier applied to each index
+      return _mm512_i32gather_ps(index, ptr, scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vfloat16 gather(const vboolf16& mask, const float* ptr, const vint16& index) {
+      vfloat16 r = zero; // lanes not in mask stay zero
+      return _mm512_mask_i32gather_ps(r, mask, index, ptr, scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(float* ptr, const vint16& index, const vfloat16& v) {
+      _mm512_i32scatter_ps(ptr, index, v, scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf16& mask, float* ptr, const vint16& index, const vfloat16& v) { // only mask-selected lanes are written
+      _mm512_mask_i32scatter_ps(ptr, mask, index, v, scale);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       float& operator [](size_t index)       { assert(index < 16); return f[index]; }
+    __forceinline const float& operator [](size_t index) const { assert(index < 16); return f[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 asFloat(const vint16&   a) { return _mm512_castsi512_ps(a); }
+  __forceinline vint16   asInt  (const vfloat16& a) { return _mm512_castps_si512(a); }
+  __forceinline vuint16  asUInt (const vfloat16& a) { return _mm512_castps_si512(a); }
+
+  __forceinline vint16   toInt  (const vfloat16& a) { return vint16(a); }
+  __forceinline vfloat16 toFloat(const vint16&   a) { return vfloat16(a); }
+
+  __forceinline vfloat16 operator +(const vfloat16& a) { return a; }
+  __forceinline vfloat16 operator -(const vfloat16& a) { return _mm512_mul_ps(a,vfloat16(-1)); }
+
+  __forceinline vfloat16 abs    (const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x7FFFFFFF))); }
+  __forceinline vfloat16 signmsk(const vfloat16& a) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_set1_epi32(0x80000000))); }
+
+  __forceinline vfloat16 rcp(const vfloat16& a) {
+#if defined(__AVX512ER__)
+    return _mm512_rcp28_ps(a); // 28-bit estimate is returned without refinement
+#else
+    const vfloat16 r = _mm512_rcp14_ps(a); // 14-bit estimate
+    return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vfloat16(2.0f))); // one Newton-Raphson step: r*(2 - r*a)
+#endif
+  }
+
+  __forceinline vfloat16 sqr (const vfloat16& a) { return _mm512_mul_ps(a,a); }
+  __forceinline vfloat16 sqrt(const vfloat16& a) { return _mm512_sqrt_ps(a); }
+
+  __forceinline vfloat16 rsqrt(const vfloat16& a)
+  {
+#if defined(__AVX512ER__) // rsqrt28 only exists with AVX512ER; same feature test as rcp()
+    return _mm512_rsqrt28_ps(a);
+#else // every other AVX-512 target: refine the 14-bit estimate with one Newton-Raphson step, r*(1.5 - 0.5*a*r*r)
+    const vfloat16 r = _mm512_rsqrt14_ps(a);
+    return _mm512_fmadd_ps(_mm512_set1_ps(1.5f), r,
+                           _mm512_mul_ps(_mm512_mul_ps(_mm512_mul_ps(a, _mm512_set1_ps(-0.5f)), r), _mm512_mul_ps(r, r)));
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 operator +(const vfloat16& a, const vfloat16& b) { return _mm512_add_ps(a, b); }
+  __forceinline vfloat16 operator +(const vfloat16& a, float           b) { return a + vfloat16(b); }
+  __forceinline vfloat16 operator +(float           a, const vfloat16& b) { return vfloat16(a) + b; }
+
+  __forceinline vfloat16 operator -(const vfloat16& a, const vfloat16& b) { return _mm512_sub_ps(a, b); }
+  __forceinline vfloat16 operator -(const vfloat16& a, float           b) { return a - vfloat16(b); }
+  __forceinline vfloat16 operator -(float           a, const vfloat16& b) { return vfloat16(a) - b; }
+
+  __forceinline vfloat16 operator *(const vfloat16& a, const vfloat16& b) { return _mm512_mul_ps(a, b); }
+  __forceinline vfloat16 operator *(const vfloat16& a, float           b) { return a * vfloat16(b); }
+  __forceinline vfloat16 operator *(float           a, const vfloat16& b) { return vfloat16(a) * b; }
+
+  __forceinline vfloat16 operator /(const vfloat16& a, const vfloat16& b) { return _mm512_div_ps(a,b); }
+  __forceinline vfloat16 operator /(const vfloat16& a, float           b) { return a/vfloat16(b); }
+  __forceinline vfloat16 operator /(float           a, const vfloat16& b) { return vfloat16(a)/b; }
+  
+  __forceinline vfloat16 operator &(const vfloat16& a, const vfloat16& b) { return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b))); } // integer AND on the raw bits: same result as _mm512_and_ps without requiring AVX512DQ
+  __forceinline vfloat16 operator |(const vfloat16& a, const vfloat16& b) { return _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b))); } // integer OR on the raw bits: same result as _mm512_or_ps without requiring AVX512DQ
+  __forceinline vfloat16 operator ^(const vfloat16& a, const vfloat16& b) {
+    return  _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b))); 
+  }
+  
+  __forceinline vfloat16 min(const vfloat16& a, const vfloat16& b) {
+    return _mm512_min_ps(a,b); 
+  }
+  __forceinline vfloat16 min(const vfloat16& a, float b) {
+    return _mm512_min_ps(a,vfloat16(b));
+  }
+  __forceinline vfloat16 min(const float& a, const vfloat16& b) {
+    return _mm512_min_ps(vfloat16(a),b);
+  }
+
+  __forceinline vfloat16 max(const vfloat16& a, const vfloat16& b) {
+    return _mm512_max_ps(a,b); 
+  }
+  __forceinline vfloat16 max(const vfloat16& a, float b) {
+    return _mm512_max_ps(a,vfloat16(b));
+  }
+  __forceinline vfloat16 max(const float& a, const vfloat16& b) {
+    return _mm512_max_ps(vfloat16(a),b);
+  }
+
+  __forceinline vfloat16 mask_add(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { return _mm512_mask_add_ps (c,mask,a,b); }
+  __forceinline vfloat16 mask_min(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) {
+    return _mm512_mask_min_ps(c,mask,a,b); // min(a,b) in the lanes where mask is set, c elsewhere
+  }
+  __forceinline vfloat16 mask_max(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) {
+    return _mm512_mask_max_ps(c,mask,a,b); // max(a,b) in the lanes where mask is set, c elsewhere
+  }
+
+  __forceinline vfloat16 mini(const vfloat16& a, const vfloat16& b) {
+#if !defined(__AVX512ER__) // SKX
+    const vint16 ai = _mm512_castps_si512(a);
+    const vint16 bi = _mm512_castps_si512(b);
+    const vint16 ci = _mm512_min_epi32(ai,bi); // signed integer min on the raw float bits; NOTE(review): agrees with float ordering only when both inputs are non-negative — confirm callers
+    return _mm512_castsi512_ps(ci);
+#else // KNL
+    return min(a,b);
+#endif
+  }
+
+  __forceinline vfloat16 maxi(const vfloat16& a, const vfloat16& b) {
+#if !defined(__AVX512ER__) // SKX
+    const vint16 ai = _mm512_castps_si512(a);
+    const vint16 bi = _mm512_castps_si512(b);
+    const vint16 ci = _mm512_max_epi32(ai,bi); // signed integer max on the raw float bits; NOTE(review): agrees with float ordering only when both inputs are non-negative — confirm callers
+    return _mm512_castsi512_ps(ci);
+#else // KNL
+    return max(a,b);
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(a,b,c); }
+  __forceinline vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); }
+  __forceinline vfloat16 nmadd(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(a,b,c); }
+  __forceinline vfloat16 nmsub(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmsub_ps(a,b,c); }
+
+  __forceinline vfloat16 mask_msub(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_ps(a,mask,b,c); }
+  
+  __forceinline vfloat16 madd231 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_ps(c,b,a); }
+  __forceinline vfloat16 msub213 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(a,b,c); }
+  __forceinline vfloat16 msub231 (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmsub_ps(c,b,a); }
+  __forceinline vfloat16 msubr231(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fnmadd_ps(c,b,a); }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Operators with rounding
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vfloat16 madd_round_down(const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_round_ps(a,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 madd_round_up  (const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_fmadd_round_ps(a,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+
+  __forceinline vfloat16 mul_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_mul_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 mul_round_up  (const vfloat16& a, const vfloat16& b) { return _mm512_mul_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+
+  __forceinline vfloat16 add_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_add_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 add_round_up  (const vfloat16& a, const vfloat16& b) { return _mm512_add_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+
+  __forceinline vfloat16 sub_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_sub_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 sub_round_up  (const vfloat16& a, const vfloat16& b) { return _mm512_sub_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+
+  __forceinline vfloat16 div_round_down(const vfloat16& a, const vfloat16& b) { return _mm512_div_round_ps(a,b,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 div_round_up  (const vfloat16& a, const vfloat16& b) { return _mm512_div_round_ps(a,b,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+
+  __forceinline vfloat16 mask_msub_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 mask_msub_round_up  (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_fmsub_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+  
+  __forceinline vfloat16 mask_mul_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_mul_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 mask_mul_round_up  (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_mul_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+
+  __forceinline vfloat16 mask_sub_round_down(const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_sub_round_ps(a,mask,b,c,_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); }
+  __forceinline vfloat16 mask_sub_round_up  (const vboolf16& mask,const vfloat16& a, const vfloat16& b, const vfloat16& c) { return _mm512_mask_sub_round_ps(a,mask,b,c,_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); }
+
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16& operator +=(vfloat16& a, const vfloat16& b) { return a = a + b; }
+  __forceinline vfloat16& operator +=(vfloat16& a, float           b) { return a = a + b; }
+  
+  __forceinline vfloat16& operator -=(vfloat16& a, const vfloat16& b) { return a = a - b; }
+  __forceinline vfloat16& operator -=(vfloat16& a, float           b) { return a = a - b; }
+  
+  __forceinline vfloat16& operator *=(vfloat16& a, const vfloat16& b) { return a = a * b; }
+  __forceinline vfloat16& operator *=(vfloat16& a, float           b) { return a = a * b; }
+
+  __forceinline vfloat16& operator /=(vfloat16& a, const vfloat16& b) { return a = a / b; }
+  __forceinline vfloat16& operator /=(vfloat16& a, float           b) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 operator ==(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 operator ==(const vfloat16& a, float           b) { return a == vfloat16(b); }
+  __forceinline vboolf16 operator ==(float           a, const vfloat16& b) { return vfloat16(a) == b; }
+
+  __forceinline vboolf16 operator !=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 operator !=(const vfloat16& a, float           b) { return a != vfloat16(b); }
+  __forceinline vboolf16 operator !=(float           a, const vfloat16& b) { return vfloat16(a) != b; }
+
+  __forceinline vboolf16 operator < (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 operator < (const vfloat16& a, float           b) { return a <  vfloat16(b); }
+  __forceinline vboolf16 operator < (float           a, const vfloat16& b) { return vfloat16(a) <  b; }
+
+  __forceinline vboolf16 operator >=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 operator >=(const vfloat16& a, float           b) { return a >= vfloat16(b); }
+  __forceinline vboolf16 operator >=(float           a, const vfloat16& b) { return vfloat16(a) >= b; }
+
+  __forceinline vboolf16 operator > (const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 operator > (const vfloat16& a, float           b) { return a >  vfloat16(b); }
+  __forceinline vboolf16 operator > (float           a, const vfloat16& b) { return vfloat16(a) >  b; }
+
+  __forceinline vboolf16 operator <=(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 operator <=(const vfloat16& a, float           b) { return a <= vfloat16(b); }
+  __forceinline vboolf16 operator <=(float           a, const vfloat16& b) { return vfloat16(a) <= b; }
+
+  __forceinline vboolf16 eq(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 ne(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vfloat16& a, const vfloat16& b) { return _mm512_cmp_ps_mask(a,b,_MM_CMPINT_LE); }
+
+  __forceinline vboolf16 eq(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf16 ne(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vboolf16& mask, const vfloat16& a, const vfloat16& b) { return _mm512_mask_cmp_ps_mask(mask,a,b,_MM_CMPINT_LE); }
+  
+  __forceinline vfloat16 select(const vboolf16& s, const vfloat16& t, const vfloat16& f) {
+    return _mm512_mask_blend_ps(s, f, t);
+  }
+
+  __forceinline vfloat16 lerp(const vfloat16& a, const vfloat16& b, const vfloat16& t) {
+    const vfloat16 span = b-a; return madd(t,span,a); // a + t*(b-a) through the fused multiply-add
+  }
+
+  __forceinline void xchg(vboolf16 m, vfloat16& a, vfloat16& b)
+  {
+    const vfloat16 old_a = a;  // a is overwritten first, so keep a copy
+    a = select(m,b,old_a);     // lanes set in m take b
+    b = select(m,old_a,b);     // the same lanes of b take the old a
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vfloat16 floor(const vfloat16& a) {
+    return _mm512_floor_ps(a);
+  }
+  __forceinline vfloat16 ceil (const vfloat16& a) {
+    return _mm512_ceil_ps(a);
+  }
+  __forceinline vfloat16 round (const vfloat16& a) {
+    return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+  }
+  __forceinline vint16 floori (const vfloat16& a) {
+    return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 unpacklo(const vfloat16& a, const vfloat16& b) { return _mm512_unpacklo_ps(a, b); }
+  __forceinline vfloat16 unpackhi(const vfloat16& a, const vfloat16& b) { return _mm512_unpackhi_ps(a, b); }
+
+  template<int i>
+  __forceinline vfloat16 shuffle(const vfloat16& v) {
+    return _mm512_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat16 shuffle(const vfloat16& v) {
+    return _mm512_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  template<int i>
+  __forceinline vfloat16 shuffle4(const vfloat16& v) {
+    return _mm512_shuffle_f32x4(v, v ,_MM_SHUFFLE(i, i, i, i));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat16 shuffle4(const vfloat16& v) {
+    return _mm512_shuffle_f32x4(v, v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  __forceinline vfloat16 interleave_even(const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_mask_shuffle_epi32(_mm512_castps_si512(a), mm512_int2mask(0xaaaa), _mm512_castps_si512(b), (_MM_PERM_ENUM)0xb1));
+  }
+
+  __forceinline vfloat16 interleave_odd(const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_mask_shuffle_epi32(_mm512_castps_si512(b), mm512_int2mask(0x5555), _mm512_castps_si512(a), (_MM_PERM_ENUM)0xb1));
+  }
+
+  __forceinline vfloat16 interleave2_even(const vfloat16& a, const vfloat16& b) {
+    /* mask should be 8-bit but is 16-bit to reuse for interleave_even */
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xaaaa), _mm512_castps_si512(b), (_MM_PERM_ENUM)0xb1));
+  }
+
+  __forceinline vfloat16 interleave2_odd(const vfloat16& a, const vfloat16& b) {
+    /* mask should be 8-bit but is 16-bit to reuse for interleave_odd */
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x5555), _mm512_castps_si512(a), (_MM_PERM_ENUM)0xb1));
+  }
+
+  __forceinline vfloat16 interleave4_even(const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(a), mm512_int2mask(0xcc), _mm512_castps_si512(b), (_MM_PERM_ENUM)0x4e));
+  }
+
+  __forceinline vfloat16 interleave4_odd(const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_mask_permutex_epi64(_mm512_castps_si512(b), mm512_int2mask(0x33), _mm512_castps_si512(a), (_MM_PERM_ENUM)0x4e));
+  }
+
+  __forceinline vfloat16 permute(vfloat16 v, __m512i index) {
+    return _mm512_castsi512_ps(_mm512_permutexvar_epi32(index, _mm512_castps_si512(v)));
+  }
+
+  __forceinline vfloat16 reverse(const vfloat16& v) {
+    return permute(v,_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0));
+  }
+
+  template<int i>
+  __forceinline vfloat16 align_shift_right(const vfloat16& a, const vfloat16& b) {
+    return _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(a),_mm512_castps_si512(b),i)); // lanes i..i+15 of the 32-lane sequence formed by b (low) followed by a (high)
+  }
+
+  template<int i> // i: number of 32-bit lanes to shift by (compile-time immediate)
+  __forceinline vfloat16 mask_align_shift_right(const vboolf16& mask, const vfloat16& c, const vfloat16& a, const vfloat16& b) { // like align_shift_right, but lanes whose mask bit is clear are copied from c (c is only read)
+    return _mm512_castsi512_ps(_mm512_mask_alignr_epi32(_mm512_castps_si512(c),mask,_mm512_castps_si512(a),_mm512_castps_si512(b),i));
+  }
+ 
+  __forceinline vfloat16 shift_left_1(const vfloat16& a) { // result lanes: {0, a0, a1, ..., a14}
+    vfloat16 lane0_fill(zero); // supplies lane 0, the only lane excluded by mask 0xfffe
+    return mask_align_shift_right<15>(0xfffe,lane0_fill,a,a);
+  }
+
+  __forceinline vfloat16 shift_right_1(const vfloat16& x) { // result lanes: {x1, x2, ..., x15, 0}
+    return align_shift_right<1>(zero,x); // zero forms the upper half of the concatenation, so a zero enters at lane 15
+  }
+
+  __forceinline float toScalar(const vfloat16& v) { return mm512_cvtss_f32(v); } // scalar extraction via mm512_cvtss_f32 (defined elsewhere; presumably returns lane 0 - confirm)
+
+
+  template<int i> __forceinline vfloat16 insert4(const vfloat16& a, const vfloat4& b) { return _mm512_insertf32x4(a, b, i); } // copy of a with 128-bit group i (lanes 4i..4i+3) replaced by b
+
+  template<int N, int i> // N: width of the extracted vector, i: index of the N-lane group
+  vfloat<N> extractN(const vfloat16& v); // declared only; the supported (N,i) pairs are the specializations below
+
+  template<> __forceinline vfloat4 extractN<4,0>(const vfloat16& v) { return _mm512_castps512_ps128(v);    } // group 0 is a plain cast of the low 128 bits
+  template<> __forceinline vfloat4 extractN<4,1>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 1); } // lanes 4..7
+  template<> __forceinline vfloat4 extractN<4,2>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 2); } // lanes 8..11
+  template<> __forceinline vfloat4 extractN<4,3>(const vfloat16& v) { return _mm512_extractf32x4_ps(v, 3); } // lanes 12..15
+
+  template<> __forceinline vfloat8 extractN<8,0>(const vfloat16& v) { return _mm512_castps512_ps256(v);    } // low 256 bits, plain cast
+  template<> __forceinline vfloat8 extractN<8,1>(const vfloat16& v) { return _mm512_extractf32x8_ps(v, 1); } // lanes 8..15
+
+  template<int i> __forceinline vfloat4 extract4   (const vfloat16& v) { return _mm512_extractf32x4_ps(v, i); } // 128-bit group i (lanes 4i..4i+3)
+  template<>      __forceinline vfloat4 extract4<0>(const vfloat16& v) { return _mm512_castps512_ps128(v);    } // group 0 specialised to a cast
+
+  template<int i> __forceinline vfloat8 extract8   (const vfloat16& v) { return _mm512_extractf32x8_ps(v, i); } // 256-bit half i (lanes 8i..8i+7)
+  template<>      __forceinline vfloat8 extract8<0>(const vfloat16& v) { return _mm512_castps512_ps256(v);    } // half 0 specialised to a cast
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Transpose
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3)
+  { // independent 4x4 transpose inside every group of four lanes: ck receives element k of r0,r1,r2,r3
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+    vfloat16 a0a1_c0c1 = interleave_even(r0, r1); // per group: {r0[0], r1[0], r0[2], r1[2]}
+    vfloat16 a2a3_c2c3 = interleave_even(r2, r3); // per group: {r2[0], r3[0], r2[2], r3[2]}
+    vfloat16 b0b1_d0d1 = interleave_odd (r0, r1); // per group: {r0[1], r1[1], r0[3], r1[3]}
+    vfloat16 b2b3_d2d3 = interleave_odd (r2, r3); // per group: {r2[1], r3[1], r2[3], r3[3]}
+
+    c0 = interleave2_even(a0a1_c0c1, a2a3_c2c3); // element 0 of each row
+    c1 = interleave2_even(b0b1_d0d1, b2b3_d2d3); // element 1 of each row
+    c2 = interleave2_odd (a0a1_c0c1, a2a3_c2c3); // element 2 of each row
+    c3 = interleave2_odd (b0b1_d0d1, b2b3_d2d3); // element 3 of each row
+#else
+    vfloat16 a0a2_b0b2 = unpacklo(r0, r2); // unpacklo/unpackhi are defined elsewhere; the names record the expected lane contents
+    vfloat16 c0c2_d0d2 = unpackhi(r0, r2);
+    vfloat16 a1a3_b1b3 = unpacklo(r1, r3);
+    vfloat16 c1c3_d1d3 = unpackhi(r1, r3);
+
+    c0 = unpacklo(a0a2_b0b2, a1a3_b1b3);
+    c1 = unpackhi(a0a2_b0b2, a1a3_b1b3);
+    c2 = unpacklo(c0c2_d0d2, c1c3_d1d3);
+    c3 = unpackhi(c0c2_d0d2, c1c3_d1d3);
+#endif
+  }
+
+  __forceinline void transpose(const vfloat4& r0,  const vfloat4& r1,  const vfloat4& r2,  const vfloat4& r3,
+                               const vfloat4& r4,  const vfloat4& r5,  const vfloat4& r6,  const vfloat4& r7,
+                               const vfloat4& r8,  const vfloat4& r9,  const vfloat4& r10, const vfloat4& r11,
+                               const vfloat4& r12, const vfloat4& r13, const vfloat4& r14, const vfloat4& r15,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3)
+  { // combines rows k, k+4, k+8, k+12 into one 16-wide register each and forwards to the vfloat16 overload with four outputs
+    transpose(vfloat16(r0, r4, r8, r12), vfloat16(r1, r5, r9, r13),
+              vfloat16(r2, r6, r10, r14), vfloat16(r3, r7, r11, r15), c0, c1, c2, c3);
+  }
+
+  __forceinline void transpose(const vfloat16& r0, const vfloat16& r1, const vfloat16& r2, const vfloat16& r3,
+                               const vfloat16& r4, const vfloat16& r5, const vfloat16& r6, const vfloat16& r7,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3,
+                               vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)
+  { // 8x8 transpose inside each 8-lane half: ck holds element k (lanes 0..7) and element 8+k (lanes 8..15) of r0..r7
+    vfloat16 a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3;
+    transpose(r0, r1, r2, r3, a0a1a2a3_e0e1e2e3, b0b1b2b3_f0f1f2f3, c0c1c2c3_g0g1g2g3, d0d1d2d3_h0h1h2h3); // 4x4 transpose of rows 0..3
+
+    vfloat16 a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7;
+    transpose(r4, r5, r6, r7, a4a5a6a7_e4e5e6e7, b4b5b6b7_f4f5f6f7, c4c5c6c7_g4g5g6g7, d4d5d6d7_h4h5h6h7); // 4x4 transpose of rows 4..7
+
+    c0 = interleave4_even(a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); // "even" keeps 4-lane groups 0 and 2 of both partial results
+    c1 = interleave4_even(b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7);
+    c2 = interleave4_even(c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7);
+    c3 = interleave4_even(d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7);
+    c4 = interleave4_odd (a0a1a2a3_e0e1e2e3, a4a5a6a7_e4e5e6e7); // "odd" keeps 4-lane groups 1 and 3
+    c5 = interleave4_odd (b0b1b2b3_f0f1f2f3, b4b5b6b7_f4f5f6f7);
+    c6 = interleave4_odd (c0c1c2c3_g0g1g2g3, c4c5c6c7_g4g5g6g7);
+    c7 = interleave4_odd (d0d1d2d3_h0h1h2h3, d4d5d6d7_h4h5h6h7);
+  }
+
+  __forceinline void transpose(const vfloat8& r0,  const vfloat8& r1,  const vfloat8& r2,  const vfloat8& r3,
+                               const vfloat8& r4,  const vfloat8& r5,  const vfloat8& r6,  const vfloat8& r7,
+                               const vfloat8& r8,  const vfloat8& r9,  const vfloat8& r10, const vfloat8& r11,
+                               const vfloat8& r12, const vfloat8& r13, const vfloat8& r14, const vfloat8& r15,
+                               vfloat16& c0, vfloat16& c1, vfloat16& c2, vfloat16& c3,
+                               vfloat16& c4, vfloat16& c5, vfloat16& c6, vfloat16& c7)
+  { // combines rows k and k+8 into one 16-wide register each and forwards to the vfloat16 overload with eight outputs
+    transpose(vfloat16(r0, r8), vfloat16(r1, r9), vfloat16(r2, r10), vfloat16(r3, r11),
+              vfloat16(r4, r12), vfloat16(r5, r13), vfloat16(r6, r14), vfloat16(r7, r15),
+              c0, c1, c2, c3, c4, c5, c6, c7);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 vreduce_add2(vfloat16 x) {                      return x + shuffle<1,0,3,2>(x); } // one combining step; shuffle<> is defined elsewhere (presumably permutes within each 4-lane group - confirm)
+  __forceinline vfloat16 vreduce_add4(vfloat16 x) { x = vreduce_add2(x); return x + shuffle<2,3,0,1>(x); } // builds on the 2-wide step
+  __forceinline vfloat16 vreduce_add8(vfloat16 x) { x = vreduce_add4(x); return x + shuffle4<1,0,3,2>(x); } // shuffle4<> is defined elsewhere (presumably permutes whole 4-lane groups - confirm)
+  __forceinline vfloat16 vreduce_add (vfloat16 x) { x = vreduce_add8(x); return x + shuffle4<2,3,0,1>(x); } // full 16-lane reduction, still returned as a vector
+
+  __forceinline vfloat16 vreduce_min2(vfloat16 x) {                      return min(x, shuffle<1,0,3,2>(x)); } // same four-step scheme with min
+  __forceinline vfloat16 vreduce_min4(vfloat16 x) { x = vreduce_min2(x); return min(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vfloat16 vreduce_min8(vfloat16 x) { x = vreduce_min4(x); return min(x, shuffle4<1,0,3,2>(x)); }
+  __forceinline vfloat16 vreduce_min (vfloat16 x) { x = vreduce_min8(x); return min(x, shuffle4<2,3,0,1>(x)); }
+
+  __forceinline vfloat16 vreduce_max2(vfloat16 x) {                      return max(x, shuffle<1,0,3,2>(x)); } // same four-step scheme with max
+  __forceinline vfloat16 vreduce_max4(vfloat16 x) { x = vreduce_max2(x); return max(x, shuffle<2,3,0,1>(x)); }
+  __forceinline vfloat16 vreduce_max8(vfloat16 x) { x = vreduce_max4(x); return max(x, shuffle4<1,0,3,2>(x)); }
+  __forceinline vfloat16 vreduce_max (vfloat16 x) { x = vreduce_max8(x); return max(x, shuffle4<2,3,0,1>(x)); }
+
+  __forceinline float reduce_add(const vfloat16& v) { const vfloat16 r = vreduce_add(v); return toScalar(r); } // scalar form of vreduce_add
+  __forceinline float reduce_min(const vfloat16& v) { const vfloat16 r = vreduce_min(v); return toScalar(r); } // scalar form of vreduce_min
+  __forceinline float reduce_max(const vfloat16& v) { const vfloat16 r = vreduce_max(v); return toScalar(r); } // scalar form of vreduce_max
+ 
+  __forceinline size_t select_min(const vfloat16& v) { // index of a lane holding the minimum; bsf is defined elsewhere (presumably the lowest set bit - confirm)
+    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_min(v)),_MM_CMPINT_EQ))); // lanes are matched by comparing raw 32-bit patterns against the reduced minimum
+  }
+
+  __forceinline size_t select_max(const vfloat16& v) { // index of a lane holding the maximum, found the same way
+    return bsf(_mm512_kmov(_mm512_cmp_epi32_mask(_mm512_castps_si512(v),_mm512_castps_si512(vreduce_max(v)),_MM_CMPINT_EQ))); // raw 32-bit pattern equality against the reduced maximum
+  }
+
+  __forceinline size_t select_min(const vboolf16& valid, const vfloat16& v)
+  { // index of a minimum among the valid lanes; falls back to the valid mask itself when no lane matches
+    const vfloat16 masked = select(valid,v,vfloat16(pos_inf)); // invalid lanes become +inf so they cannot win the reduction
+    const vbool16 is_min = valid & (masked == vreduce_min(masked));
+    return any(is_min) ? bsf(movemask(is_min)) : bsf(movemask(valid));
+  }
+
+  __forceinline size_t select_max(const vboolf16& valid, const vfloat16& v)
+  { // index of a maximum among the valid lanes; falls back to the valid mask itself when no lane matches
+    const vfloat16 masked = select(valid,v,vfloat16(neg_inf)); // invalid lanes become -inf so they cannot win the reduction
+    const vbool16 is_max = valid & (masked == vreduce_max(masked));
+    return any(is_max) ? bsf(movemask(is_max)) : bsf(movemask(valid));
+  }
+  
+  __forceinline vfloat16 prefix_sum(const vfloat16& a)
+  { // inclusive scan: result lane k = a0 + ... + ak, built in four doubling steps
+    vfloat16 acc(a);
+    const vfloat16 fill(zero); // additive identity shifted into the vacated low lanes
+    acc = acc + align_shift_right<15>(acc,fill); // acc moved up by 1 lane
+    acc = acc + align_shift_right<14>(acc,fill); // acc moved up by 2 lanes
+    acc = acc + align_shift_right<12>(acc,fill); // acc moved up by 4 lanes
+    acc = acc + align_shift_right<8>(acc,fill);  // acc moved up by 8 lanes
+    return acc;
+  }
+
+  __forceinline vfloat16 reverse_prefix_sum(const vfloat16& a)
+  { // inclusive suffix scan: result lane k = ak + ... + a15, built in four doubling steps
+    vfloat16 acc(a);
+    const vfloat16 fill(zero); // additive identity shifted into the vacated high lanes
+    acc = acc + align_shift_right<1>(fill,acc); // acc moved down by 1 lane
+    acc = acc + align_shift_right<2>(fill,acc); // acc moved down by 2 lanes
+    acc = acc + align_shift_right<4>(fill,acc); // acc moved down by 4 lanes
+    acc = acc + align_shift_right<8>(fill,acc); // acc moved down by 8 lanes
+    return acc;
+  }
+
+  __forceinline vfloat16 prefix_min(const vfloat16& a)
+  { // running minimum: result lane k = min(a0, ..., ak)
+    vfloat16 acc(a);
+    const vfloat16 fill(pos_inf); // +inf is neutral for min and fills the vacated low lanes
+    acc = min(acc,align_shift_right<15>(acc,fill)); // acc moved up by 1 lane
+    acc = min(acc,align_shift_right<14>(acc,fill)); // acc moved up by 2 lanes
+    acc = min(acc,align_shift_right<12>(acc,fill)); // acc moved up by 4 lanes
+    acc = min(acc,align_shift_right<8>(acc,fill));  // acc moved up by 8 lanes
+    return acc;
+  }
+
+  __forceinline vfloat16 prefix_max(const vfloat16& a)
+  { // running maximum: result lane k = max(a0, ..., ak)
+    vfloat16 acc(a);
+    const vfloat16 fill(neg_inf); // -inf is neutral for max and fills the vacated low lanes
+    acc = max(acc,align_shift_right<15>(acc,fill)); // acc moved up by 1 lane
+    acc = max(acc,align_shift_right<14>(acc,fill)); // acc moved up by 2 lanes
+    acc = max(acc,align_shift_right<12>(acc,fill)); // acc moved up by 4 lanes
+    acc = max(acc,align_shift_right<8>(acc,fill));  // acc moved up by 8 lanes
+    return acc;
+  }
+
+
+  __forceinline vfloat16 reverse_prefix_min(const vfloat16& a)
+  { // running minimum from the top: result lane k = min(ak, ..., a15)
+    vfloat16 acc(a);
+    const vfloat16 fill(pos_inf); // +inf is neutral for min and fills the vacated high lanes
+    acc = min(acc,align_shift_right<1>(fill,acc)); // acc moved down by 1 lane
+    acc = min(acc,align_shift_right<2>(fill,acc)); // acc moved down by 2 lanes
+    acc = min(acc,align_shift_right<4>(fill,acc)); // acc moved down by 4 lanes
+    acc = min(acc,align_shift_right<8>(fill,acc)); // acc moved down by 8 lanes
+    return acc;
+  }
+
+  __forceinline vfloat16 reverse_prefix_max(const vfloat16& a)
+  { // running maximum from the top: result lane k = max(ak, ..., a15)
+    vfloat16 acc(a);
+    const vfloat16 fill(neg_inf); // -inf is neutral for max and fills the vacated high lanes
+    acc = max(acc,align_shift_right<1>(fill,acc)); // acc moved down by 1 lane
+    acc = max(acc,align_shift_right<2>(fill,acc)); // acc moved down by 2 lanes
+    acc = max(acc,align_shift_right<4>(fill,acc)); // acc moved down by 4 lanes
+    acc = max(acc,align_shift_right<8>(fill,acc)); // acc moved down by 8 lanes
+    return acc;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Memory load and store operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat16 loadAOS4to16f(const float& x, const float& y, const float& z)
+  { // overlays broadcasts of x, y and z onto a zero vector, one mask per component
+    const vfloat16 bx = vfloat16::broadcast(&x);
+    const vfloat16 by = vfloat16::broadcast(&y);
+    const vfloat16 bz = vfloat16::broadcast(&z);
+    /* masks: 0x1111 = lanes 0,4,8,12; 0x2222 = lanes 1,5,9,13; 0x4444 = lanes 2,6,10,14; lanes 3,7,11,15 are covered by no mask */
+    return select(0x4444,bz,select(0x2222,by,select(0x1111,bx,vfloat16(zero))));
+  }
+
+  __forceinline vfloat16 loadAOS4to16f(unsigned int index,
+                                       const vfloat16& x,
+                                       const vfloat16& y,
+                                       const vfloat16& z)
+  { // same layout as the scalar overload, reading element `index` of x, y and z through a float pointer
+    const vfloat16 bx = vfloat16::broadcast((float*)&x + index);
+    const vfloat16 by = vfloat16::broadcast((float*)&y + index);
+    const vfloat16 bz = vfloat16::broadcast((float*)&z + index);
+    /* masks: 0x1111 = lanes 0,4,8,12; 0x2222 = lanes 1,5,9,13; 0x4444 = lanes 2,6,10,14; the base vector is zero */
+    return select(0x4444,bz,select(0x2222,by,select(0x1111,bx,vfloat16(zero))));
+  }
+
+  __forceinline vfloat16 loadAOS4to16f(unsigned int index,
+                                       const vfloat16& x,
+                                       const vfloat16& y,
+                                       const vfloat16& z,
+                                       const vfloat16& fill)
+  { // variant whose base vector is `fill` instead of zero
+    const vfloat16 bx = vfloat16::broadcast((float*)&x + index);
+    const vfloat16 by = vfloat16::broadcast((float*)&y + index);
+    const vfloat16 bz = vfloat16::broadcast((float*)&z + index);
+    /* masks: 0x1111 = lanes 0,4,8,12; 0x2222 = lanes 1,5,9,13; 0x4444 = lanes 2,6,10,14; lanes 3,7,11,15 are covered by no mask */
+    return select(0x4444,bz,select(0x2222,by,select(0x1111,bx,fill)));
+  }
+
+  __forceinline vfloat16 rcp_safe(const vfloat16& a) { // reciprocal that never receives an exact zero
+    return rcp(select(a != vfloat16(zero), a, vfloat16(min_rcp_input))); // the zero lanes are replaced by min_rcp_input (constant defined elsewhere) before rcp
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat16& v)
+  { // writes the 16 lanes as "<v0, v1, ..., v15>"
+    cout << "<" << v[0]; // first lane carries no separator
+    for (int lane=1; lane!=16; ++lane) { cout << ", " << v[lane]; }
+    cout << ">";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h b/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h
new file mode 100644
index 0000000000..5732c0fbc8
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vfloat4_sse2.h
@@ -0,0 +1,925 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 4-wide single-precision float vector stored in an __m128 (the aarch64 branches below use NEON intrinsics) */
+  template<>
+  struct vfloat<4>
+  {
+    ALIGNED_STRUCT_(16);
+
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 };                        // number of SIMD elements
+    union { __m128 v; float f[4]; int i[4]; }; // the same 16 bytes viewed as a register, as floats or as ints
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat() {} // leaves the lanes uninitialized
+    __forceinline vfloat(const vfloat4& other) { v = other.v; }
+    __forceinline vfloat4& operator =(const vfloat4& other) { v = other.v; return *this; }
+
+    __forceinline vfloat(__m128 a) : v(a) {} // implicit wrap of a raw register
+    __forceinline operator const __m128&() const { return v; } // lets a vfloat4 be passed directly to intrinsics
+    __forceinline operator       __m128&()       { return v; }
+
+    __forceinline vfloat(float a) : v(_mm_set1_ps(a)) {} // broadcast a to all four lanes
+    __forceinline vfloat(float a, float b, float c, float d) : v(_mm_set_ps(d, c, b, a)) {} // a lands in lane 0 (_mm_set_ps lists the highest lane first)
+
+    __forceinline explicit vfloat(const vint4& a) : v(_mm_cvtepi32_ps(a)) {} // numeric signed int -> float conversion
+#if defined(__aarch64__)
+    __forceinline explicit vfloat(const vuint4& x) { // numeric unsigned int -> float conversion
+        v = vcvtq_f32_u32(vreinterpretq_u32_s32(x.v));
+    }
+#else
+    __forceinline explicit vfloat(const vuint4& x) { // unsigned -> float built from the signed conversion
+      const __m128i a   = _mm_and_si128(x,_mm_set1_epi32(0x7FFFFFFF)); // low 31 bits, safe to convert as signed
+      const __m128i b   = _mm_and_si128(_mm_srai_epi32(x,31),_mm_set1_epi32(0x4F000000)); // float bit pattern of 2^31 in lanes whose top bit was set, 0 elsewhere
+      const __m128  af  = _mm_cvtepi32_ps(a);
+      const __m128  bf  = _mm_castsi128_ps(b);
+      v  = _mm_add_ps(af,bf); // adds 2^31 back where the top bit had been masked off
+    }
+#endif
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat(ZeroTy)   : v(_mm_setzero_ps()) {}
+    __forceinline vfloat(OneTy)    : v(_mm_set1_ps(1.0f)) {}
+    __forceinline vfloat(PosInfTy) : v(_mm_set1_ps(pos_inf)) {}
+    __forceinline vfloat(NegInfTy) : v(_mm_set1_ps(neg_inf)) {}
+    __forceinline vfloat(StepTy)   : v(_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)) {} // lanes {0,1,2,3}
+    __forceinline vfloat(NaNTy)    : v(_mm_set1_ps(nan)) {}
+    __forceinline vfloat(UndefinedTy) : v(_mm_undefined_ps()) {} // contents unspecified
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vfloat4 load (const void* a) { return _mm_load_ps((float*)a); }  // aligned load (_mm_load_ps requires 16-byte alignment)
+    static __forceinline vfloat4 loadu(const void* a) { return _mm_loadu_ps((float*)a); } // unaligned load
+
+    static __forceinline void store (void* ptr, const vfloat4& v) { _mm_store_ps((float*)ptr,v); }  // aligned store
+    static __forceinline void storeu(void* ptr, const vfloat4& v) { _mm_storeu_ps((float*)ptr,v); } // unaligned store
+
+#if defined(__AVX512VL__)
+
+    static __forceinline vfloat4 compact(const vboolf4& mask, vfloat4 &v) { // packs the mask-selected lanes of v towards lane 0; remaining lanes keep v's values
+      return _mm_mask_compress_ps(v, mask, v);
+    }
+    static __forceinline vfloat4 compact(const vboolf4& mask, vfloat4 &a, const vfloat4& b) { // packs the selected lanes of b; remaining lanes come from a
+      return _mm_mask_compress_ps(a, mask, b);
+    }
+
+    static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_ps (_mm_setzero_ps(),mask,(float*)ptr); } // masked-off lanes read as 0
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_ps(_mm_setzero_ps(),mask,(float*)ptr); } // masked-off lanes read as 0
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_store_ps ((float*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_mask_storeu_ps((float*)ptr,mask,v); }
+#elif defined(__AVX__)
+    static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); } // load and loadu share the same instruction on this path
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_maskload_ps((float*)ptr,mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); } // store and storeu share the same instruction on this path
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,v); }
+#else
+    static __forceinline vfloat4 load (const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_load_ps ((float*)ptr),mask); } // reads all 16 bytes, then zeroes the masked-off lanes
+    static __forceinline vfloat4 loadu(const vboolf4& mask, const void* ptr) { return _mm_and_ps(_mm_loadu_ps((float*)ptr),mask); } // reads all 16 bytes, then zeroes the masked-off lanes
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vfloat4& v) { store (ptr,select(mask,v,load (ptr))); } // read-modify-write of the full 16 bytes
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vfloat4& v) { storeu(ptr,select(mask,v,loadu(ptr))); } // read-modify-write of the full 16 bytes
+#endif
+
+#if defined(__AVX__)
+    static __forceinline vfloat4 broadcast(const void* a) { return _mm_broadcast_ss((float*)a); } // replicates the float at a into all lanes
+#else
+    static __forceinline vfloat4 broadcast(const void* a) { return _mm_set1_ps(*(float*)a); } // replicates the float at a into all lanes
+#endif
+
+    static __forceinline vfloat4 load_nt (const float* ptr) { // streaming (non-temporal) load with SSE4.1, plain aligned load otherwise
+#if defined (__SSE4_1__)
+    return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)ptr));
+#else
+    return _mm_load_ps(ptr);
+#endif
+  }
+
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const int8_t* ptr) { // four signed bytes -> four floats; _mm_load4epi8_f32 is not a standard SSE intrinsic (presumably supplied by the NEON shim - confirm)
+        return __m128(_mm_load4epi8_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vfloat4 load(const int8_t* ptr) { // NOTE(review): loads 16 bytes although only the first 4 are converted - confirm callers tolerate the over-read
+      return _mm_cvtepi32_ps(_mm_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+    }
+#else
+    static __forceinline vfloat4 load(const int8_t* ptr) { // scalar fallback, converts element by element
+      return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+#endif
+
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const uint8_t* ptr) { // four unsigned bytes -> four floats (helper presumably from the NEON shim - confirm)
+        return __m128(_mm_load4epu8_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vfloat4 load(const uint8_t* ptr) { // NOTE(review): loads 16 bytes although only the first 4 are converted
+      return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)));
+    }
+#else
+    static __forceinline vfloat4 load(const uint8_t* ptr) {
+      //return _mm_cvtpu8_ps(*(__m64*)ptr); // kept disabled on purpose: it would use MMX instructions
+      return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+#endif
+
+#if defined(__aarch64__)
+    static __forceinline vfloat4 load(const short* ptr) { // four signed 16-bit values -> four floats (helper presumably from the NEON shim - confirm)
+        return __m128(_mm_load4epi16_f32(((__m128i*)ptr)));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vfloat4 load(const short* ptr) { // NOTE(review): loads 16 bytes although only the first 8 are converted
+      return _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
+    }
+#else
+    static __forceinline vfloat4 load(const short* ptr) { // scalar fallback, converts element by element
+      return vfloat4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+#endif
+
+    static __forceinline vfloat4 load(const unsigned short* ptr) { // unlike the other integer loads, the result is scaled by 1/65535
+      return _mm_mul_ps(vfloat4(vint4::load(ptr)),vfloat4(1.0f/65535.0f));
+    }
+
+    static __forceinline void store_nt(void* ptr, const vfloat4& v) // streaming (non-temporal) store with SSE4.1, plain aligned store otherwise
+    {
+#if defined (__SSE4_1__)
+#if defined(__aarch64__)
+      _mm_stream_ps((float*)ptr,vreinterpretq_s32_f32(v.v));
+#else
+      _mm_stream_ps((float*)ptr,v);
+#endif
+#else
+      _mm_store_ps((float*)ptr,v);
+#endif
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to each index
+    static __forceinline vfloat4 gather(const float* ptr, const vint4& index) { // lane k reads the float at byte offset scale*index[k] from ptr
+#if defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_i32gather_ps(ptr, index, scale);
+#else
+      return vfloat4(
+        *(float*)(((int8_t*)ptr)+scale*index[0]),
+        *(float*)(((int8_t*)ptr)+scale*index[1]),
+        *(float*)(((int8_t*)ptr)+scale*index[2]),
+        *(float*)(((int8_t*)ptr)+scale*index[3]));
+#endif
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to each index
+    static __forceinline vfloat4 gather(const vboolf4& mask, const float* ptr, const vint4& index) { // masked gather; lanes that are masked off return 0
+      vfloat4 r = zero;
+#if defined(__AVX512VL__)
+      return _mm_mmask_i32gather_ps(r, mask, index, ptr, scale);
+#elif defined(__AVX2__)  && !defined(__aarch64__)
+      return _mm_mask_i32gather_ps(r, ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]); // memory is only touched for active lanes
+      if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]);
+      return r;
+#endif
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to each index
+    static __forceinline void scatter(void* ptr, const vint4& index, const vfloat4& v) // lane k is written to byte offset scale*index[k] from ptr
+    {
+#if defined(__AVX512VL__)
+      _mm_i32scatter_ps((float*)ptr, index, v, scale);
+#else
+      *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0]; // scalar fallback writes in lane order
+      *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1];
+      *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2];
+      *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to each index
+    static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vfloat4& v) // masked scatter; only active lanes are written
+    {
+#if defined(__AVX512VL__)
+      _mm_mask_i32scatter_ps((float*)ptr ,mask, index, v, scale);
+#else
+      if (likely(mask[0])) *(float*)(((int8_t*)ptr)+scale*index[0]) = v[0];
+      if (likely(mask[1])) *(float*)(((int8_t*)ptr)+scale*index[1]) = v[1];
+      if (likely(mask[2])) *(float*)(((int8_t*)ptr)+scale*index[2]) = v[2];
+      if (likely(mask[3])) *(float*)(((int8_t*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+
+    static __forceinline void store(const vboolf4& mask, int8_t* ptr, const vint4& ofs, const vfloat4& v) { // byte pointer: ofs holds byte offsets
+      scatter<1>(mask,ptr,ofs,v);
+    }
+    static __forceinline void store(const vboolf4& mask, float* ptr, const vint4& ofs, const vfloat4& v) { // float pointer: ofs holds element offsets (4 bytes each)
+      scatter<4>(mask,ptr,ofs,v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const float& operator [](size_t index) const { assert(index < 4); return f[index]; } // lane access through the union; the bound is only checked by assert
+    __forceinline       float& operator [](size_t index)       { assert(index < 4); return f[index]; }
+
+    friend __forceinline vfloat4 select(const vboolf4& m, const vfloat4& t, const vfloat4& f) { // per lane: t where m is set, f otherwise
+#if defined(__AVX512VL__)
+      return _mm_mask_blend_ps(m, f, t);
+#elif defined(__SSE4_1__) || (defined(__aarch64__))
+      return _mm_blendv_ps(f, t, m);
+#else
+      return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f)); // bitwise blend; relies on each mask lane being all ones or all zeros
+#endif
+    }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4 asFloat(const vint4&   a) { return _mm_castsi128_ps(a); } // reinterprets the bits, no numeric conversion
+  __forceinline vint4   asInt  (const vfloat4& a) { return _mm_castps_si128(a); } // reinterprets the bits, no numeric conversion
+  __forceinline vuint4  asUInt (const vfloat4& a) { return _mm_castps_si128(a); } // reinterprets the bits, no numeric conversion
+
+  __forceinline vint4   toInt  (const vfloat4& a) { return vint4(a); }   // numeric conversion via the vint4 constructor (defined elsewhere)
+  __forceinline vfloat4 toFloat(const vint4&   a) { return vfloat4(a); } // numeric conversion via the explicit vfloat4(vint4) constructor
+
+  __forceinline vfloat4 operator +(const vfloat4& a) { return a; } // unary plus: identity
+#if defined(__aarch64__)
+  __forceinline vfloat4 operator -(const vfloat4& a) { // unary minus via the NEON negate intrinsic
+    return vnegq_f32(a);
+  }
+#else
+  __forceinline vfloat4 operator -(const vfloat4& a) { return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } // unary minus: flips the sign bit of every lane
+#endif
+
+#if defined(__aarch64__)
+  __forceinline vfloat4 abs(const vfloat4& a) { return _mm_abs_ps(a); } // _mm_abs_ps is not a standard SSE intrinsic (presumably supplied by the NEON shim - confirm)
+#else
+  __forceinline vfloat4 abs(const vfloat4& a) { return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); } // clears the sign bit of every lane
+#endif
+
+#if defined(__AVX512VL__)
+  __forceinline vfloat4 sign(const vfloat4& a) { return _mm_mask_blend_ps(_mm_cmp_ps_mask(a, vfloat4(zero), _CMP_LT_OQ), vfloat4(one), -vfloat4(one)); } // -1 where a < 0, +1 otherwise (so zero gives +1)
+#else
+  __forceinline vfloat4 sign(const vfloat4& a) { return blendv_ps(vfloat4(one), -vfloat4(one), _mm_cmplt_ps(a, vfloat4(zero))); } // blendv_ps is defined elsewhere; presumably picks -1 where a < 0 and +1 otherwise - confirm
+#endif
+
+#if defined(__aarch64__)
+  __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a, vreinterpretq_f32_u32(v0x80000000)); } // v0x80000000 is defined elsewhere (presumably 0x80000000 in every lane - confirm)
+#else
+  __forceinline vfloat4 signmsk(const vfloat4& a) { return _mm_and_ps(a,_mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } // keeps only the sign bit of every lane
+#endif
+
+  __forceinline vfloat4 rcp(const vfloat4& a) // reciprocal 1/a: a hardware estimate refined by Newton-Raphson steps (a true divide on iOS builds)
+  {
+#if defined(__aarch64__)
+#if defined(BUILD_IOS)
+    return vfloat4(vdivq_f32(vdupq_n_f32(1.0f),a.v)); // plain division, no estimate
+#else //BUILD_IOS
+    __m128 reciprocal = _mm_rcp_ps(a); // initial estimate
+    reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal); // vrecpsq_f32 gives (2 - a*r); times r is one Newton-Raphson step
+    reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+    // one extra refinement round, because NEON's reciprocal estimate is less accurate than SSE2's rcp
+    reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
+    return (const vfloat4)reciprocal;
+#endif // BUILD_IOS
+#else
+
+#if defined(__AVX512VL__)
+    const vfloat4 r = _mm_rcp14_ps(a); // initial estimate (rcp14 variant)
+#else
+    const vfloat4 r = _mm_rcp_ps(a); // initial estimate
+#endif
+
+#if defined(__AVX2__)
+    return _mm_mul_ps(r,_mm_fnmadd_ps(r, a, vfloat4(2.0f))); // one Newton-Raphson step: r * (2 - r*a), fused form
+#else
+    return _mm_mul_ps(r,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r, a))); // one Newton-Raphson step: r * (2 - r*a)
+#endif
+
+#endif  //defined(__aarch64__)
+  }
+  __forceinline vfloat4 sqr (const vfloat4& a) { return _mm_mul_ps(a,a); } // per-lane a*a
+  __forceinline vfloat4 sqrt(const vfloat4& a) { return _mm_sqrt_ps(a); }  // per-lane square root
+
+  __forceinline vfloat4 rsqrt(const vfloat4& a) // reciprocal square root: a hardware estimate refined by Newton-Raphson steps
+  {
+#if defined(__aarch64__)
+    vfloat4 r = _mm_rsqrt_ps(a); // initial estimate
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); // vrsqrtsq_f32(x,y) gives (3 - x*y)/2, so this is r * (3 - a*r*r)/2
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); // second refinement
+    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a, r), r)); // third refinement
+    return r;
+#else
+
+#if defined(__AVX512VL__)
+    const vfloat4 r = _mm_rsqrt14_ps(a); // initial estimate (rsqrt14 variant)
+#else
+    const vfloat4 r = _mm_rsqrt_ps(a); // initial estimate
+#endif
+
+#if defined(__AVX2__)
+    return _mm_fmadd_ps(_mm_set1_ps(1.5f), r, // one step: 1.5*r + (-0.5*a*r)*(r*r) = r * (1.5 - 0.5*a*r*r)
+                        _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#else
+    return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f), r), // same step without a fused multiply-add
+                      _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));
+#endif
+
+#endif
+  }
+
+  __forceinline vboolf4 isnan(const vfloat4& a) { // per-lane NaN test done on the integer bit patterns
+#if defined(__aarch64__)
+    const vfloat4 b = _mm_and_ps(a, vreinterpretq_f32_u32(v0x7fffffff)); // sign bit cleared (v0x7fffffff is defined elsewhere)
+#else
+    const vfloat4 b = _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))); // sign bit cleared
+#endif
+#if defined(__AVX512VL__)
+    return _mm_cmp_epi32_mask(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000), _MM_CMPINT_GT); // true where the pattern is above 0x7f800000 (the bits of +inf)
+#else
+    return _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_castps_si128(b), _mm_set1_epi32(0x7f800000))); // true where the pattern is above 0x7f800000 (the bits of +inf)
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4 operator +(const vfloat4& a, const vfloat4& b) { return vfloat4(_mm_add_ps(a.v, b.v)); }              // per-lane a + b
+  __forceinline vfloat4 operator +(const vfloat4& a, float          b) { return vfloat4(_mm_add_ps(a.v, _mm_set1_ps(b))); }   // scalar is broadcast first
+  __forceinline vfloat4 operator +(float          a, const vfloat4& b) { return vfloat4(_mm_add_ps(_mm_set1_ps(a), b.v)); }
+
+  __forceinline vfloat4 operator -(const vfloat4& a, const vfloat4& b) { return vfloat4(_mm_sub_ps(a.v, b.v)); }              // per-lane a - b
+  __forceinline vfloat4 operator -(const vfloat4& a, float          b) { return vfloat4(_mm_sub_ps(a.v, _mm_set1_ps(b))); }
+  __forceinline vfloat4 operator -(float          a, const vfloat4& b) { return vfloat4(_mm_sub_ps(_mm_set1_ps(a), b.v)); }
+
+  __forceinline vfloat4 operator *(const vfloat4& a, const vfloat4& b) { return vfloat4(_mm_mul_ps(a.v, b.v)); }              // per-lane a * b
+  __forceinline vfloat4 operator *(const vfloat4& a, float          b) { return vfloat4(_mm_mul_ps(a.v, _mm_set1_ps(b))); }
+  __forceinline vfloat4 operator *(float          a, const vfloat4& b) { return vfloat4(_mm_mul_ps(_mm_set1_ps(a), b.v)); }
+
+  __forceinline vfloat4 operator /(const vfloat4& a, const vfloat4& b) { return vfloat4(_mm_div_ps(a.v, b.v)); }              // per-lane a / b
+  __forceinline vfloat4 operator /(const vfloat4& a, float          b) { return vfloat4(_mm_div_ps(a.v, _mm_set1_ps(b))); }
+  __forceinline vfloat4 operator /(float          a, const vfloat4& b) { return vfloat4(_mm_div_ps(_mm_set1_ps(a), b.v)); }
+
+  __forceinline vfloat4 operator &(const vfloat4& a, const vfloat4& b) { return vfloat4(_mm_and_ps(a.v, b.v)); } // bitwise AND of the raw lane bits
+  __forceinline vfloat4 operator |(const vfloat4& a, const vfloat4& b) { return vfloat4(_mm_or_ps(a.v, b.v)); }  // bitwise OR
+  __forceinline vfloat4 operator ^(const vfloat4& a, const vfloat4& b) { return vfloat4(_mm_xor_ps(a.v, b.v)); } // bitwise XOR
+  __forceinline vfloat4 operator ^(const vfloat4& a, const vint4&   b) { const __m128 bits = _mm_castsi128_ps(b); return vfloat4(_mm_xor_ps(a.v, bits)); } // XOR with integer lanes taken as raw bits
+
+  __forceinline vfloat4 min(const vfloat4& a, const vfloat4& b) { return vfloat4(_mm_min_ps(a.v, b.v)); } // per-lane minimum
+  __forceinline vfloat4 min(const vfloat4& a, float          b) { return min(a, vfloat4(b)); }            // scalar is broadcast first
+  __forceinline vfloat4 min(float          a, const vfloat4& b) { return min(vfloat4(a), b); }
+
+  __forceinline vfloat4 max(const vfloat4& a, const vfloat4& b) { return vfloat4(_mm_max_ps(a.v, b.v)); } // per-lane maximum
+  __forceinline vfloat4 max(const vfloat4& a, float          b) { return max(a, vfloat4(b)); }            // scalar is broadcast first
+  __forceinline vfloat4 max(float          a, const vfloat4& b) { return max(vfloat4(a), b); }
+
+#if defined(__SSE4_1__) || defined(__aarch64__)
+
+    __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { // minimum taken on the lanes reinterpreted as signed 32-bit integers
+      const vint4 ai = _mm_castps_si128(a); // bit reinterpretation, no numeric conversion
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+
+    __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { // maximum taken on the lanes reinterpreted as signed 32-bit integers
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epi32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+
+    __forceinline vfloat4 minui(const vfloat4& a, const vfloat4& b) { // minimum taken on the lanes reinterpreted as unsigned 32-bit integers
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_min_epu32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+
+    __forceinline vfloat4 maxui(const vfloat4& a, const vfloat4& b) { // maximum taken on the lanes reinterpreted as unsigned 32-bit integers
+      const vint4 ai = _mm_castps_si128(a);
+      const vint4 bi = _mm_castps_si128(b);
+      const vint4 ci = _mm_max_epu32(ai,bi);
+      return _mm_castsi128_ps(ci);
+    }
+#else
+    __forceinline vfloat4 mini(const vfloat4& a, const vfloat4& b) { // fallback uses the float minimum; minui/maxui are not provided on this path
+      return min(a,b);
+    }
+
+    __forceinline vfloat4 maxi(const vfloat4& a, const vfloat4& b) { // fallback uses the float maximum
+      return max(a,b);
+    }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__)
+  __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a,b,c); }  // a*b+c, fused
+  __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a,b,c); }  // a*b-c, fused
+  __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); } // -a*b+c, fused
+  __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmsub_ps(a,b,c); } // -a*b-c, fused
+#else
+
+#if defined(__aarch64__)
+  __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { // _mm_madd_ps is not a standard SSE intrinsic (presumably supplied by the NEON shim - confirm)
+    return _mm_madd_ps(a, b, c);  //a*b+c;
+  }
+  __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { // _mm_msub_ps is likewise non-standard; the original note gives its result as -a*b+c
+    return _mm_msub_ps(a, b, c);  //-a*b+c;
+  }
+  __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { // -(c + a*b), i.e. -a*b-c
+    return vnegq_f32(vfmaq_f32(c,a, b));
+  }
+#else
+  __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b+c; } // separate multiply and add, not fused
+  __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b+c;} // separate multiply and add, not fused
+  __forceinline vfloat4 nmsub(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return -a*b-c; } // separate multiply and subtract, not fused
+#endif
+  __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return a*b-c; } // shared by the aarch64 and generic non-AVX2 paths; not fused
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4& operator +=(vfloat4& a, const vfloat4& b) { return a = a + b; } // compound forms evaluate the binary operator, store into a and return a
+  __forceinline vfloat4& operator +=(vfloat4& a, float          b) { return a = a + b; } // scalar right-hand side
+
+  __forceinline vfloat4& operator -=(vfloat4& a, const vfloat4& b) { return a = a - b; }
+  __forceinline vfloat4& operator -=(vfloat4& a, float          b) { return a = a - b; } // scalar right-hand side
+
+  __forceinline vfloat4& operator *=(vfloat4& a, const vfloat4& b) { return a = a * b; }
+  __forceinline vfloat4& operator *=(vfloat4& a, float          b) { return a = a * b; } // scalar right-hand side
+
+  __forceinline vfloat4& operator /=(vfloat4& a, const vfloat4& b) { return a = a / b; }
+  __forceinline vfloat4& operator /=(vfloat4& a, float          b) { return a = a / b; } // scalar right-hand side
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__) // comparisons produce the result through _mm_cmp_ps_mask
+  __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmp_ps_mask(a, b, _MM_CMPINT_LE); }
+#else // comparisons use the _mm_cmp*_ps family
+  __forceinline vboolf4 operator ==(const vfloat4& a, const vfloat4& b) { return _mm_cmpeq_ps (a, b); }
+  __forceinline vboolf4 operator !=(const vfloat4& a, const vfloat4& b) { return _mm_cmpneq_ps(a, b); }
+  __forceinline vboolf4 operator < (const vfloat4& a, const vfloat4& b) { return _mm_cmplt_ps (a, b); }
+#if defined(__aarch64__) // aarch64 uses the direct ge/gt compares
+  __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpge_ps (a, b); }
+  __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpgt_ps (a, b); }
+#else // x86 uses the negated compares: >= is "not less than", > is "not less or equal"
+  __forceinline vboolf4 operator >=(const vfloat4& a, const vfloat4& b) { return _mm_cmpnlt_ps(a, b); } // NOTE(review): presumably differs from cmpge for NaN operands - confirm
+  __forceinline vboolf4 operator > (const vfloat4& a, const vfloat4& b) { return _mm_cmpnle_ps(a, b); } // NOTE(review): presumably differs from cmpgt for NaN operands - confirm
+#endif
+  __forceinline vboolf4 operator <=(const vfloat4& a, const vfloat4& b) { return _mm_cmple_ps (a, b); }
+#endif
+
+  __forceinline vboolf4 operator ==(const vfloat4& a, float          b) { return a == vfloat4(b); } // mixed forms promote the scalar with vfloat4(float) and forward to the vector operator
+  __forceinline vboolf4 operator ==(float          a, const vfloat4& b) { return vfloat4(a) == b; }
+
+  __forceinline vboolf4 operator !=(const vfloat4& a, float          b) { return a != vfloat4(b); }
+  __forceinline vboolf4 operator !=(float          a, const vfloat4& b) { return vfloat4(a) != b; }
+
+  __forceinline vboolf4 operator < (const vfloat4& a, float          b) { return a <  vfloat4(b); }
+  __forceinline vboolf4 operator < (float          a, const vfloat4& b) { return vfloat4(a) <  b; }
+
+  __forceinline vboolf4 operator >=(const vfloat4& a, float          b) { return a >= vfloat4(b); }
+  __forceinline vboolf4 operator >=(float          a, const vfloat4& b) { return vfloat4(a) >= b; }
+
+  __forceinline vboolf4 operator > (const vfloat4& a, float          b) { return a >  vfloat4(b); }
+  __forceinline vboolf4 operator > (float          a, const vfloat4& b) { return vfloat4(a) >  b; }
+
+  __forceinline vboolf4 operator <=(const vfloat4& a, float          b) { return a <= vfloat4(b); }
+  __forceinline vboolf4 operator <=(float          a, const vfloat4& b) { return vfloat4(a) <= b; }
+
+  __forceinline vboolf4 eq(const vfloat4& a, const vfloat4& b) { return a == b; } // named aliases that forward to the comparison operators
+  __forceinline vboolf4 ne(const vfloat4& a, const vfloat4& b) { return a != b; }
+  __forceinline vboolf4 lt(const vfloat4& a, const vfloat4& b) { return a <  b; }
+  __forceinline vboolf4 ge(const vfloat4& a, const vfloat4& b) { return a >= b; }
+  __forceinline vboolf4 gt(const vfloat4& a, const vfloat4& b) { return a >  b; }
+  __forceinline vboolf4 le(const vfloat4& a, const vfloat4& b) { return a <= b; }
+
+#if defined(__AVX512VL__) // masked compares: the incoming mask is passed straight to the intrinsic
+  __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return _mm_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); }
+#else // masked compares: unmasked comparison result ANDed with the incoming mask
+  __forceinline vboolf4 eq(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a == b); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a != b); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <  b); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >= b); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a >  b); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vfloat4& a, const vfloat4& b) { return mask & (a <= b); }
+#endif
+
+  template<int mask> // compile-time lane mask: t is chosen where the mask is set, f elsewhere
+    __forceinline vfloat4 select(const vfloat4& t, const vfloat4& f)
+  {
+#if defined(__SSE4_1__)
+    return _mm_blend_ps(f, t, mask); // note the argument order: f first, t second
+#else
+    return select(vboolf4(mask), t, f); // builds a vboolf4 from the integer and uses the runtime select (declared outside this chunk)
+#endif
+  }
+
+#if defined(__aarch64__) // explicit specializations of select<mask> for all 16 values, each blending with a named constant
+    template<> __forceinline vfloat4 select<0>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vzero)); // vzero..vFFFF are defined outside this chunk; presumably per-lane masks - verify
+    }
+    template<> __forceinline vfloat4 select<1>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v000F));
+    }
+    template<> __forceinline vfloat4 select<2>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00F0));
+    }
+    template<> __forceinline vfloat4 select<3>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v00FF));
+    }
+    template<> __forceinline vfloat4 select<4>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F00));
+    }
+    template<> __forceinline vfloat4 select<5>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0F0F));
+    }
+    template<> __forceinline vfloat4 select<6>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FF0));
+    }
+    template<> __forceinline vfloat4 select<7>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(v0FFF));
+    }
+    template<> __forceinline vfloat4 select<8>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF000));
+    }
+    template<> __forceinline vfloat4 select<9>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF00F));
+    }
+    template<> __forceinline vfloat4 select<10>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0F0));
+    }
+    template<> __forceinline vfloat4 select<11>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vF0FF));
+    }
+    template<> __forceinline vfloat4 select<12>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF00));
+    }
+    template<> __forceinline vfloat4 select<13>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFF0F));
+    }
+    template<> __forceinline vfloat4 select<14>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFF0));
+    }
+    template<> __forceinline vfloat4 select<15>(const vfloat4& t, const vfloat4& f) {
+        return _mm_blendv_ps(f, t, vreinterpretq_f32_u32(vFFFF));
+    }
+#endif
+
+  __forceinline vfloat4 lerp(const vfloat4& a, const vfloat4& b, const vfloat4& t) { // per-lane linear interpolation from a (t=0) towards b (t=1)
+    return madd(t,b-a,a); // t*(b-a) + a
+  }
+
+  __forceinline bool isvalid(const vfloat4& v) { // true only if every lane lies strictly inside (-FLT_LARGE, +FLT_LARGE)
+    return all((v > vfloat4(-FLT_LARGE)) & (v < vfloat4(+FLT_LARGE))); // FLT_LARGE is defined outside this chunk
+  }
+
+  __forceinline bool is_finite(const vfloat4& a) { // true only if every lane lies in [-FLT_MAX, +FLT_MAX]
+    return all((a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX)));
+  }
+
+  __forceinline bool is_finite(const vboolf4& valid, const vfloat4& a) { // same range test, combined through the two-argument all(valid, ...)
+    return all(valid, (a >= vfloat4(-FLT_MAX)) & (a <= vfloat4(+FLT_MAX)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__) // NEON rounding instructions
+  __forceinline vfloat4 floor(const vfloat4& a) { return vrndmq_f32(a.v); } // towards -inf
+  __forceinline vfloat4 ceil (const vfloat4& a) { return vrndpq_f32(a.v); } // toward +inf
+  __forceinline vfloat4 trunc(const vfloat4& a) { return vrndq_f32(a.v); } // towards 0
+  __forceinline vfloat4 round(const vfloat4& a) { return vrndnq_f32(a.v); } // to nearest, ties to even. NOTE(LTE): arm clang uses vrndnq, old gcc uses vrndqn?
+#elif defined (__SSE4_1__) // single _mm_round_ps with an explicit rounding mode
+  __forceinline vfloat4 floor(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF   ); }
+  __forceinline vfloat4 ceil (const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF   ); }
+  __forceinline vfloat4 trunc(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_ZERO      ); }
+  __forceinline vfloat4 round(const vfloat4& a) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } // (even) https://www.felixcloutier.com/x86/roundps
+#else // scalar fallback: one C math call per lane
+  __forceinline vfloat4 floor(const vfloat4& a) { return vfloat4(floorf(a[0]),floorf(a[1]),floorf(a[2]),floorf(a[3])); }
+  __forceinline vfloat4 ceil (const vfloat4& a) { return vfloat4(ceilf (a[0]),ceilf (a[1]),ceilf (a[2]),ceilf (a[3])); }
+  __forceinline vfloat4 trunc(const vfloat4& a) { return vfloat4(truncf(a[0]),truncf(a[1]),truncf(a[2]),truncf(a[3])); }
+  __forceinline vfloat4 round(const vfloat4& a) { return vfloat4(rintf(a[0]),rintf(a[1]),rintf(a[2]),rintf(a[3])); } // rintf, not roundf: ties go to even (default rounding mode) like the two branches above; roundf rounds ties away from zero
+#endif
+  __forceinline vfloat4 frac(const vfloat4& a) { return a-floor(a); } // per-lane fractional part, computed against floor()
+
+  __forceinline vint4 floori(const vfloat4& a) { // per-lane floor(a) as 32-bit integers
+#if defined(__aarch64__)
+    return vcvtq_s32_f32(floor(a));
+#elif defined(__SSE4_1__)
+    return vint4(floor(a));
+#else
+    return vint4(floor(a)); // floor first so the int conversion sees an exact integer; the former vint4(a-0.5f) shortcut gave 0 for a == 1.0f
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4 unpacklo(const vfloat4& a, const vfloat4& b) { return _mm_unpacklo_ps(a, b); } // thin wrapper over _mm_unpacklo_ps (interleave of the low lanes)
+  __forceinline vfloat4 unpackhi(const vfloat4& a, const vfloat4& b) { return _mm_unpackhi_ps(a, b); } // thin wrapper over _mm_unpackhi_ps (interleave of the high lanes)
+
+#if defined(__aarch64__) // NEON: byte table lookups; _MN_SHUFFLE/_MF_SHUFFLE are defined outside this chunk
+      template<int i0, int i1, int i2, int i3>
+      __forceinline vfloat4 shuffle(const vfloat4& v) {
+          return vreinterpretq_f32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3))); // one-register table lookup on v
+      }
+      template<int i0, int i1, int i2, int i3>
+      __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
+          return vreinterpretq_f32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3))); // two-register table lookup over {a, b}
+      }
+#else // x86: indices are passed to _MM_SHUFFLE in reverse order so that i0 describes result lane 0
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat4 shuffle(const vfloat4& v) {
+    return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(i3, i2, i1, i0))); // integer shuffle on the reinterpreted bits: result = (v[i0], v[i1], v[i2], v[i3])
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat4 shuffle(const vfloat4& a, const vfloat4& b) {
+    return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0)); // per _mm_shuffle_ps: result = (a[i0], a[i1], b[i2], b[i3])
+  }
+#endif
+
+#if defined (__SSSE3__) // only available with SSSE3 (_mm_shuffle_epi8)
+  __forceinline vfloat4 shuffle8(const vfloat4& a, const vint4& shuf) { // byte-granular shuffle of a, controlled by the runtime vector shuf
+    return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf)); // casts only reinterpret the bits
+  }
+#endif
+
+#if defined(__aarch64__) // specializations for three common patterns using lookup tables v0022/v1133/v0101 (defined outside this chunk)
+  template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0022 )); }
+  template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v1133)); }
+  template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return __m128(vqtbl1q_u8( uint8x16_t(v.v), v0101)); }
+#elif defined(__SSE3__) // SSE3 has dedicated duplicate instructions for the same three patterns
+  template<> __forceinline vfloat4 shuffle<0, 0, 2, 2>(const vfloat4& v) { return _mm_moveldup_ps(v); }
+  template<> __forceinline vfloat4 shuffle<1, 1, 3, 3>(const vfloat4& v) { return _mm_movehdup_ps(v); }
+  template<> __forceinline vfloat4 shuffle<0, 1, 0, 1>(const vfloat4& v) { return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(v))); } // duplicates the low 64 bits via the double-precision form
+#endif
+
+  template<int i> // single-index form: replicate lane i into all four lanes
+  __forceinline vfloat4 shuffle(const vfloat4& v) {
+    return shuffle<i,i,i,i>(v);
+  }
+
+#if defined(__aarch64__) // only indices 0..3 are specialized; the primary template is declared but not defined here
+  template<int i> __forceinline float extract(const vfloat4& a);
+  template<> __forceinline float extract<0>(const vfloat4& b) {
+      return b[0];
+  }
+  template<> __forceinline float extract<1>(const vfloat4& b) {
+      return b[1];
+  }
+  template<> __forceinline float extract<2>(const vfloat4& b) {
+      return b[2];
+  }
+  template<> __forceinline float extract<3>(const vfloat4& b) {
+      return b[3];
+  }
+#elif defined (__SSE4_1__) && !defined(__GNUC__) // _mm_extract_ps path, not taken when __GNUC__ is defined
+  template<int i> __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(_mm_extract_ps(a,i)); }
+  template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } // lane 0 needs no extract/shuffle
+#else // replicate lane i with shuffle, then read lane 0
+  template<int i> __forceinline float extract(const vfloat4& a) { return _mm_cvtss_f32(shuffle<i,i,i,i>(a)); }
+  template<> __forceinline float extract<0>(const vfloat4& a) { return _mm_cvtss_f32(a); } // lane 0 needs no extract/shuffle
+#endif
+
+
+#if defined(__aarch64__) // only insert<dst>(vector, float) for dst 0..3 exists here; the vector-source overloads of the other branches are absent
+  template<int dst>  __forceinline vfloat4 insert(const vfloat4& a, float b);
+  template<> __forceinline vfloat4 insert<0>(const vfloat4& a, float b)
+  {
+        vfloat4 c = a; // copy a, overwrite one lane, return the copy
+        c[0] = b;
+        return c;
+  }
+  template<> __forceinline vfloat4 insert<1>(const vfloat4& a, float b)
+  {
+        vfloat4 c = a;
+        c[1] = b;
+        return c;
+  }
+  template<> __forceinline vfloat4 insert<2>(const vfloat4& a, float b)
+  {
+        vfloat4 c = a;
+        c[2] = b;
+        return c;
+  }
+  template<> __forceinline vfloat4 insert<3>(const vfloat4& a, float b)
+  {
+        vfloat4 c = a;
+        c[3] = b;
+        return c;
+  }
+#elif defined (__SSE4_1__) // _mm_insert_ps immediate is assembled as (dst << 4) | (src << 6) | clr
+  template<int dst, int src, int clr> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr); }
+  template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { return insert<dst, src, 0>(a, b); } // clr = 0
+  template<int dst> __forceinline vfloat4 insert(const vfloat4& a, const float b) { return insert<dst, 0>(a, _mm_set_ss(b)); } // scalar goes into lane 0 of a temporary, then is inserted from src lane 0
+#else // generic fallback: element-wise write, with indices wrapped into 0..3 by "& 3"
+  template<int dst, int src> __forceinline vfloat4 insert(const vfloat4& a, const vfloat4& b) { vfloat4 c = a; c[dst&3] = b[src&3]; return c; }
+  template<int dst>  __forceinline vfloat4 insert(const vfloat4& a, float b) { vfloat4 c = a; c[dst&3] = b; return c; }
+#endif
+
+#if defined(__aarch64__) // both branches return lane 0 of v
+  __forceinline float toScalar(const vfloat4& v) {
+    return v[0];
+  }
+#else
+  __forceinline float toScalar(const vfloat4& v) { return _mm_cvtss_f32(v); }
+#endif
+  __forceinline vfloat4 broadcast4f(const vfloat4& a, size_t k) { // lane k is selected at run time and handed to vfloat4::broadcast
+    return vfloat4::broadcast(&a[k]); // passes the address of element k; no bounds check on k here
+  }
+
+  __forceinline vfloat4 shift_right_1(const vfloat4& x) { // shifts the whole register by one float lane
+    return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(x), 4)); // byte shift by 4 = sizeof(float); casts only reinterpret the bits
+  }
+
+#if defined (__AVX2__) // both helpers are only compiled when __AVX2__ is defined
+  __forceinline vfloat4 permute(const vfloat4 &a, const __m128i &index) { // runtime-indexed lane permutation
+    return _mm_permutevar_ps(a,index);
+  }
+
+  __forceinline vfloat4 broadcast1f(const void* a) { return _mm_broadcast_ss((float*)a); } // reads one float through a and replicates it; a is reinterpreted as float*
+
+#endif
+
+#if defined(__AVX512VL__) // only compiled when __AVX512VL__ is defined
+  template<int i> // i is the compile-time shift count passed to _mm_alignr_epi32
+  __forceinline vfloat4 align_shift_right(const vfloat4& a, const vfloat4& b) {
+    return _mm_castsi128_ps(_mm_alignr_epi32(_mm_castps_si128(a), _mm_castps_si128(b), i)); // casts only reinterpret the bits
+  }
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting Network
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat4 sort_ascending(const vfloat4& v)
+  {
+    // stage 1: compare-exchange lane pairs (0,1) and (2,3)
+    const vfloat4 swapped1 = shuffle<1,0,3,2>(v);
+    const vfloat4 lower1   = min(v,swapped1);
+    const vfloat4 upper1   = max(v,swapped1);
+    const vfloat4 stage1   = select<0x5 /* 0b0101 */>(lower1,upper1);
+    const vfloat4 swapped2 = shuffle<2,3,0,1>(stage1); // stage 2: pairs (0,2) and (1,3)
+    const vfloat4 lower2   = min(stage1,swapped2);
+    const vfloat4 upper2   = max(stage1,swapped2);
+    const vfloat4 stage2   = select<0x3 /* 0b0011 */>(lower2,upper2);
+    const vfloat4 swapped3 = shuffle<0,2,1,3>(stage2); // stage 3: pair (1,2)
+    const vfloat4 lower3   = min(stage2,swapped3);
+    const vfloat4 upper3   = max(stage2,swapped3);
+    // lanes 0 and 3 are compared with themselves, so only lanes 1 and 2 can change
+    return select<0x2 /* 0b0010 */>(lower3,upper3);
+  }
+
+  __forceinline vfloat4 sort_descending(const vfloat4& v)
+  {
+    // stage 1: compare-exchange lane pairs (0,1) and (2,3), larger value first
+    const vfloat4 swapped1 = shuffle<1,0,3,2>(v);
+    const vfloat4 upper1   = max(v,swapped1);
+    const vfloat4 lower1   = min(v,swapped1);
+    const vfloat4 stage1   = select<0x5 /* 0b0101 */>(upper1,lower1);
+    const vfloat4 swapped2 = shuffle<2,3,0,1>(stage1); // stage 2: pairs (0,2) and (1,3)
+    const vfloat4 upper2   = max(stage1,swapped2);
+    const vfloat4 lower2   = min(stage1,swapped2);
+    const vfloat4 stage2   = select<0x3 /* 0b0011 */>(upper2,lower2);
+    const vfloat4 swapped3 = shuffle<0,2,1,3>(stage2); // stage 3: pair (1,2)
+    const vfloat4 upper3   = max(stage2,swapped3);
+    const vfloat4 lower3   = min(stage2,swapped3);
+    // lanes 0 and 3 are compared with themselves, so only lanes 1 and 2 can change
+    return select<0x2 /* 0b0010 */>(upper3,lower3);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Transpose
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2, vfloat4& c3)
+  {
+    const vfloat4 even_lo = unpacklo(r0,r2); // first pass: interleave rows 0/2 and rows 1/3
+    const vfloat4 even_hi = unpackhi(r0,r2);
+    const vfloat4 odd_lo  = unpacklo(r1,r3);
+    const vfloat4 odd_hi  = unpackhi(r1,r3);
+    c0 = unpacklo(even_lo,odd_lo); // second pass: interleave the partial results into columns
+    c1 = unpackhi(even_lo,odd_lo);
+    c2 = unpacklo(even_hi,odd_hi);
+    c3 = unpackhi(even_hi,odd_hi);
+  }
+
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, vfloat4& c0, vfloat4& c1, vfloat4& c2)
+  {
+    const vfloat4 even_lo = unpacklo(r0,r2); // first pass: interleave rows 0/2 and rows 1/3
+    const vfloat4 even_hi = unpackhi(r0,r2);
+    const vfloat4 odd_lo  = unpacklo(r1,r3);
+    const vfloat4 odd_hi  = unpackhi(r1,r3);
+    c0 = unpacklo(even_lo,odd_lo); // second pass: only the first three columns are produced
+    c1 = unpackhi(even_lo,odd_lo);
+    c2 = unpacklo(even_hi,odd_hi);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+#if defined(__aarch64__) // horizontal reduction, then the scalar is replicated into every lane
+      __forceinline vfloat4 vreduce_min(const vfloat4& v) { float h = vminvq_f32(v); return vdupq_n_f32(h); }
+      __forceinline vfloat4 vreduce_max(const vfloat4& v) { float h = vmaxvq_f32(v); return vdupq_n_f32(h); }
+      __forceinline vfloat4 vreduce_add(const vfloat4& v) { float h = vaddvq_f32(v); return vdupq_n_f32(h); }
+#else // two shuffle+combine steps: first within lane pairs, then across the two halves
+  __forceinline vfloat4 vreduce_min(const vfloat4& v) { vfloat4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
+  __forceinline vfloat4 vreduce_max(const vfloat4& v) { vfloat4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+  __forceinline vfloat4 vreduce_add(const vfloat4& v) { vfloat4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
+#endif
+
+#if defined(__aarch64__) // scalar-returning reductions map directly onto the NEON across-vector intrinsics
+  __forceinline float reduce_min(const vfloat4& v) { return vminvq_f32(v); }
+  __forceinline float reduce_max(const vfloat4& v) { return vmaxvq_f32(v); }
+  __forceinline float reduce_add(const vfloat4& v) { return vaddvq_f32(v); }
+#else // run the vector reduction, then read lane 0
+  __forceinline float reduce_min(const vfloat4& v) { return _mm_cvtss_f32(vreduce_min(v)); }
+  __forceinline float reduce_max(const vfloat4& v) { return _mm_cvtss_f32(vreduce_max(v)); }
+  __forceinline float reduce_add(const vfloat4& v) { return _mm_cvtss_f32(vreduce_add(v)); }
+#endif
+
+  __forceinline size_t select_min(const vboolf4& valid, const vfloat4& v)
+  {
+    const vfloat4 candidates = select(valid,v,vfloat4(pos_inf)); // lanes outside valid are replaced by +inf
+    const vbool4 at_minimum = valid & (candidates == vreduce_min(candidates));
+    return bsf(movemask(any(at_minimum) ? at_minimum : valid)); // if no lane matched, pick from valid instead
+  }
+  __forceinline size_t select_max(const vboolf4& valid, const vfloat4& v)
+  {
+    const vfloat4 candidates = select(valid,v,vfloat4(neg_inf)); // lanes outside valid are replaced by -inf
+    const vbool4 at_maximum = valid & (candidates == vreduce_max(candidates));
+    return bsf(movemask(any(at_maximum) ? at_maximum : valid)); // if no lane matched, pick from valid instead
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline float dot(const vfloat4& a, const vfloat4& b) { // sum over all four lanes of a*b (lane 3 is included)
+    return reduce_add(a*b);
+  }
+
+  __forceinline vfloat4 cross(const vfloat4& a, const vfloat4& b)
+  {
+    // shuffle<1,2,0,3> rotates the first three lanes (x,y,z,w) -> (y,z,x,w);
+    // prod_diff (defined elsewhere) combines the two products, and the result is rotated once more
+    const vfloat4 a_yzx = shuffle<1,2,0,3>(a);
+    const vfloat4 b_yzx = shuffle<1,2,0,3>(b);
+    return shuffle<1,2,0,3>(prod_diff(a,b_yzx,a_yzx,b));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat4& a) { // prints the four lanes as "<a0, a1, a2, a3>"
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+  }
+
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h b/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h
new file mode 100644
index 0000000000..3c7e4a8cdc
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vfloat8_avx.h
@@ -0,0 +1,847 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX float type */
+  template<>
+  struct vfloat<8>
+  {
+    ALIGNED_STRUCT_(32);
+   
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };                        // number of SIMD elements
+    union { __m256 v; float f[8]; int i[8]; }; // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat() {} // default constructor does not initialize v
+    __forceinline vfloat(const vfloat8& other) { v = other.v; } // copy: transfers the __m256 member
+    __forceinline vfloat8& operator =(const vfloat8& other) { v = other.v; return *this; }
+
+    __forceinline vfloat(__m256 a) : v(a) {} // implicit wrap of a raw register
+    __forceinline operator const __m256&() const { return v; } // implicit unwrap, so a vfloat8 can be passed directly to _mm256_* intrinsics
+    __forceinline operator       __m256&()       { return v; }
+
+    __forceinline explicit vfloat(const vfloat4& a) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),a,1)) {} // a fills both the low and the high 128-bit half
+    __forceinline vfloat(const vfloat4& a, const vfloat4& b) : v(_mm256_insertf128_ps(_mm256_castps128_ps256(a),b,1)) {} // a is the low half, b is inserted as the high half
+
+    __forceinline explicit vfloat(const int8_t* a) : v(_mm256_loadu_ps((const float*)a)) {} // pointer is reinterpreted as float*: an unaligned load of 8 floats, not a conversion of int8 values
+    __forceinline vfloat(float a) : v(_mm256_set1_ps(a)) {} // same value in all 8 lanes
+    __forceinline vfloat(float a, float b) : v(_mm256_set_ps(b, a, b, a, b, a, b, a)) {} // _mm256_set_ps lists lane 7 first, so lanes 0..7 = a,b,a,b,a,b,a,b
+    __forceinline vfloat(float a, float b, float c, float d) : v(_mm256_set_ps(d, c, b, a, d, c, b, a)) {} // lanes 0..7 = a,b,c,d,a,b,c,d
+    __forceinline vfloat(float a, float b, float c, float d, float e, float f, float g, float h) : v(_mm256_set_ps(h, g, f, e, d, c, b, a)) {} // lanes 0..7 = a..h
+
+    __forceinline explicit vfloat(__m256i a) : v(_mm256_cvtepi32_ps(a)) {} // numeric conversion of 8 packed 32-bit ints to float (not a bit reinterpretation)
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vfloat(ZeroTy)   : v(_mm256_setzero_ps()) {} // tag-type constructors: the argument type alone selects the constant
+    __forceinline vfloat(OneTy)    : v(_mm256_set1_ps(1.0f)) {}
+    __forceinline vfloat(PosInfTy) : v(_mm256_set1_ps(pos_inf)) {}
+    __forceinline vfloat(NegInfTy) : v(_mm256_set1_ps(neg_inf)) {}
+    __forceinline vfloat(StepTy)   : v(_mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)) {} // lane k holds the value k
+    __forceinline vfloat(NaNTy)    : v(_mm256_set1_ps(nan)) {}
+    __forceinline vfloat(UndefinedTy) : v(_mm256_undefined_ps()) {} // contents unspecified
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vfloat8 broadcast(const void* a) { // reads one float through a and replicates it into all 8 lanes
+      return _mm256_broadcast_ss((float*)a); // a is reinterpreted as float*
+    }
+
+    static __forceinline vfloat8 broadcast2(const float* a, const float* b) { // lanes 0..3 = *a, lanes 4..7 = *b
+#if defined(__INTEL_COMPILER)
+      const vfloat8 v0 = _mm256_broadcast_ss(a); 
+      const vfloat8 v1 = _mm256_broadcast_ss(b); 
+      return _mm256_blend_ps(v1, v0, 0xf); // mask 0xf: the low four lanes come from v0, the rest from v1
+#else
+      return _mm256_set_ps(*b,*b,*b,*b,*a,*a,*a,*a); // _mm256_set_ps lists lane 7 first
+#endif
+    }
+
+    static __forceinline vfloat8 broadcast4f(const vfloat4* ptr) { // loads one 128-bit vector and replicates it into both halves
+      return _mm256_broadcast_ps((__m128*)ptr); 
+    }
+
+    static __forceinline vfloat8 load(const int8_t* ptr) { // converts 8 signed bytes to 8 floats
+#if defined(__AVX2__)
+      return _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)ptr))); // NOTE(review): the load is 128 bits wide although only 8 bytes are converted - confirm callers guarantee readable padding
+#else
+      return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); // two 4-element conversions: elements 0..3 and 4..7
+#endif
+    }
+
+    static __forceinline vfloat8 load(const uint8_t* ptr) { // converts 8 unsigned bytes to 8 floats
+#if defined(__AVX2__)
+      return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr))); // NOTE(review): same 128-bit load as the int8_t overload - confirm padding
+#else
+      return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); // two 4-element conversions: elements 0..3 and 4..7
+#endif
+    }
+
+    static __forceinline vfloat8 load(const short* ptr) { // converts 8 signed 16-bit values to 8 floats
+#if defined(__AVX2__)
+      return _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)ptr)));
+#else
+      return vfloat8(vfloat4::load(ptr),vfloat4::load(ptr+4)); // two 4-element conversions: elements 0..3 and 4..7
+#endif
+    }
+      
+    static __forceinline vfloat8 load (const void* ptr) { return _mm256_load_ps((float*)ptr); } // aligned-load intrinsic
+    static __forceinline vfloat8 loadu(const void* ptr) { return _mm256_loadu_ps((float*)ptr); } // unaligned-load intrinsic
+
+    static __forceinline void store (void* ptr, const vfloat8& v) { return _mm256_store_ps((float*)ptr,v); } // aligned-store intrinsic
+    static __forceinline void storeu(void* ptr, const vfloat8& v) { return _mm256_storeu_ps((float*)ptr,v); } // unaligned-store intrinsic
+
+#if defined(__AVX512VL__) // mask-register variants; compact() exists only in this branch
+
+    static __forceinline vfloat8 compact(const vboolf8& mask, vfloat8 &v) {
+      return _mm256_mask_compress_ps(v, mask, v); // v is both the pass-through source and the data being compressed
+    }
+    static __forceinline vfloat8 compact(const vboolf8& mask, vfloat8 &a, const vfloat8& b) {
+      return _mm256_mask_compress_ps(a, mask, b); // a is the pass-through source, b the data being compressed
+    }
+
+    static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_ps (_mm256_setzero_ps(),mask,(float*)ptr); } // zero vector is the pass-through source
+    static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_ps(_mm256_setzero_ps(),mask,(float*)ptr); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_store_ps ((float*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_mask_storeu_ps((float*)ptr,mask,v); }
+#elif defined(__aarch64__) // same as the generic branch below, except the mask is read through its .v member
+    static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); }
+    static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask.v); } // load and loadu are identical here
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,v); } // store and storeu are identical here
+#else // AVX maskload/maskstore with the mask cast to __m256i
+    static __forceinline vfloat8 load (const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); }
+    static __forceinline vfloat8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_maskload_ps((float*)ptr,(__m256i)mask); } // load and loadu are identical here
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vfloat8& v) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,v); } // store and storeu are identical here
+#endif
+    
+#if defined(__AVX2__) // streaming load is only compiled when __AVX2__ is defined
+    static __forceinline vfloat8 load_nt(void* ptr) { // load through the streaming (non-temporal) intrinsic
+      return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i*)ptr)); // loaded as integer data, then reinterpreted as floats
+    }
+#endif
+    
+    static __forceinline void store_nt(void* ptr, const vfloat8& v) { // store through the streaming (non-temporal) intrinsic
+      _mm256_stream_ps((float*)ptr,v);
+    }
+
+    template<int scale = 4> // scale is the byte multiplier applied to every index (default 4)
+    static __forceinline vfloat8 gather(const float* ptr, const vint8& index) { // lane k is read from byte offset scale*index[k] relative to ptr
+#if defined(__AVX2__) && !defined(__aarch64__)
+      return _mm256_i32gather_ps(ptr, index ,scale);
+#else // scalar emulation: eight independent loads through a byte pointer
+      return vfloat8(
+          *(float*)(((int8_t*)ptr)+scale*index[0]),
+          *(float*)(((int8_t*)ptr)+scale*index[1]),
+          *(float*)(((int8_t*)ptr)+scale*index[2]),
+          *(float*)(((int8_t*)ptr)+scale*index[3]),
+          *(float*)(((int8_t*)ptr)+scale*index[4]),
+          *(float*)(((int8_t*)ptr)+scale*index[5]),
+          *(float*)(((int8_t*)ptr)+scale*index[6]),
+          *(float*)(((int8_t*)ptr)+scale*index[7]));
+#endif
+    }
+
+    template<int scale = 4> // scale is the byte multiplier applied to every index (default 4)
+    static __forceinline vfloat8 gather(const vboolf8& mask, const float* ptr, const vint8& index) { // masked gather
+      vfloat8 r = zero; // starting value; the scalar branch below leaves lanes with a false mask at this zero
+#if defined(__AVX512VL__)
+      return _mm256_mmask_i32gather_ps(r, mask, index, ptr, scale);
+#elif defined(__AVX2__) && !defined(__aarch64__)
+      return _mm256_mask_i32gather_ps(r, ptr, index, mask, scale);
+#else // scalar emulation: memory is only touched for lanes whose mask is set
+      if (likely(mask[0])) r[0] = *(float*)(((int8_t*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(float*)(((int8_t*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(float*)(((int8_t*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(float*)(((int8_t*)ptr)+scale*index[3]);
+      if (likely(mask[4])) r[4] = *(float*)(((int8_t*)ptr)+scale*index[4]);
+      if (likely(mask[5])) r[5] = *(float*)(((int8_t*)ptr)+scale*index[5]);
+      if (likely(mask[6])) r[6] = *(float*)(((int8_t*)ptr)+scale*index[6]);
+      if (likely(mask[7])) r[7] = *(float*)(((int8_t*)ptr)+scale*index[7]);
+      return r;
+    #endif
+    }
+
+    // Scatters the eight lanes of v to byte offsets scale*ofs[k] from ptr.
+    // With AVX-512VL this is one scatter intrinsic; otherwise the lanes are
+    // written one after another starting with lane 0, so for overlapping
+    // destinations the highest lane is the one written last.
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vfloat8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_i32scatter_ps((float*)ptr, ofs, v, scale);
+#else
+      int8_t* const base = (int8_t*)ptr;
+      for (size_t lane = 0; lane < 8; lane++) {
+        *(float*)(base+scale*ofs[lane]) = v[lane];
+      }
+#endif
+    }
+
+    // Masked scatter: only lanes whose mask bit is set are written to
+    // byte offset scale*ofs[k] from ptr. The scalar fallback visits the
+    // lanes in ascending order.
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vfloat8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_mask_i32scatter_ps((float*)ptr, mask, ofs, v, scale);
+#else
+      int8_t* const base = (int8_t*)ptr;
+      for (size_t lane = 0; lane < 8; lane++) {
+        if (likely(mask[lane]))
+          *(float*)(base+scale*ofs[lane]) = v[lane];
+      }
+#endif
+    }
+
+    static __forceinline void store(const vboolf8& mask, int8_t* ptr, const vint8& ofs, const vfloat8& v) { // masked scatter with scale 1: ofs is used as a byte offset
+      scatter<1>(mask,ptr,ofs,v);
+    }
+    static __forceinline void store(const vboolf8& mask, float* ptr, const vint8& ofs, const vfloat8& v) { // masked scatter with scale 4: ofs is multiplied by 4 bytes
+      scatter<4>(mask,ptr,ofs,v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    // Per-lane access into f[]; the index is only range-checked by assert(index < 8).
+    __forceinline const float& operator [](size_t index) const { assert(index < 8); return f[index]; }
+    __forceinline       float& operator [](size_t index)       { assert(index < 8); return f[index]; }
+  };
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 asFloat(const vint8&   a) { return _mm256_castsi256_ps(a); } // reinterprets the bits, no value conversion
+  __forceinline vint8   asInt  (const vfloat8& a) { return _mm256_castps_si256(a); } // reinterprets the bits, no value conversion
+  // Value conversions, delegated to the vint8(vfloat8) and vfloat8(vint8) constructors.
+  __forceinline vint8   toInt  (const vfloat8& a) { return vint8(a); }
+  __forceinline vfloat8 toFloat(const vint8&   a) { return vfloat8(a); }
+
+  __forceinline vfloat8 operator +(const vfloat8& a) { return a; }
+#if !defined(__aarch64__)
+  // Negation: flip the sign bit of every lane.
+  __forceinline vfloat8 operator -(const vfloat8& a) {
+    return _mm256_xor_ps(a, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
+  }
+#else
+  // Negation: vnegq_f32 applied to the lo and hi parts separately.
+  __forceinline vfloat8 operator -(const vfloat8& a) {
+    __m256 negated;
+    negated.lo = vnegq_f32(a.v.lo); negated.hi = vnegq_f32(a.v.hi);
+    return negated;
+  }
+#endif
+
+#if !defined(__aarch64__)
+  // Absolute value: clear the sign bit of every lane.
+  __forceinline vfloat8 abs(const vfloat8& a) {
+    return _mm256_and_ps(a, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
+  }
+#else
+  // Absolute value: vabsq_f32 applied to the lo and hi parts separately.
+  __forceinline vfloat8 abs(const vfloat8& a) {
+    __m256 magnitude;
+    magnitude.lo = vabsq_f32(a.v.lo); magnitude.hi = vabsq_f32(a.v.hi);
+    return magnitude;
+  }
+#endif
+
+#if !defined(__aarch64__) // sign(): -1 in lanes where the _CMP_NGE_UQ test of a against 0 holds, +1 elsewhere
+  __forceinline vfloat8 sign   (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmp_ps(a, vfloat8(zero), _CMP_NGE_UQ)); }
+#else // aarch64: -1 where a < 0, +1 elsewhere. NOTE(review): NaN lanes may be treated differently from the x86 branch - confirm
+  __forceinline vfloat8 sign   (const vfloat8& a) { return _mm256_blendv_ps(vfloat8(one), -vfloat8(one), _mm256_cmplt_ps(a, vfloat8(zero))); }
+#endif
+  __forceinline vfloat8 signmsk(const vfloat8& a) { return _mm256_and_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } // keeps only the sign bit of each lane
+
+
+  static __forceinline vfloat8 rcp(const vfloat8& a)
+  { // Reciprocal 1/a per lane: a full divide on iOS/aarch64, otherwise an estimate r refined once as r*(2 - r*a).
+#if defined(BUILD_IOS) && defined(__aarch64__)
+    // ios devices are faster doing full divide, no need for NR fixup
+    vfloat8 ret;
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    ret.v.lo = vdivq_f32(one, a.v.lo);
+    ret.v.hi = vdivq_f32(one, a.v.hi);
+    return ret;
+#else // estimate + refinement; excluded from the iOS build, where it would be unreachable after the return above
+#if defined(__AVX512VL__)
+    const vfloat8 r = _mm256_rcp14_ps(a);
+#else
+    const vfloat8 r = _mm256_rcp_ps(a);
+#endif
+    // Refinement step r*(2 - r*a); the inner term uses an FMA intrinsic when __AVX2__ is defined.
+#if defined(__AVX2__) //&& !defined(aarch64)
+    return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vfloat8(2.0f)));
+#else
+    return _mm256_mul_ps(r, _mm256_sub_ps(vfloat8(2.0f), _mm256_mul_ps(r, a)));
+#endif
+#endif
+  }
+  __forceinline vfloat8 sqr (const vfloat8& a) { return _mm256_mul_ps(a,a); } // a*a per lane
+  __forceinline vfloat8 sqrt(const vfloat8& a) { return _mm256_sqrt_ps(a); } // per-lane square root via _mm256_sqrt_ps
+
+  static __forceinline vfloat8 rsqrt(const vfloat8& a)
+  {
+    // Reciprocal square root: estimate r, then one refinement 1.5*r + ((-0.5*a)*r)*(r*r).
+#if defined(__AVX512VL__)
+    const vfloat8 r = _mm256_rsqrt14_ps(a);
+#else
+    const vfloat8 r = _mm256_rsqrt_ps(a);
+#endif
+    const __m256 minus_half_a_r = _mm256_mul_ps(_mm256_mul_ps(a, _mm256_set1_ps(-0.5f)), r);
+    const __m256 correction     = _mm256_mul_ps(minus_half_a_r, _mm256_mul_ps(r, r));
+#if defined(__AVX2__)
+    return _mm256_fmadd_ps(_mm256_set1_ps(1.5f), r, correction);
+#else
+    return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(1.5f), r), correction);
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  // Lane-wise addition; the float overloads first broadcast the scalar through vfloat8(float).
+  __forceinline vfloat8 operator +(const vfloat8& a, const vfloat8& b) { return _mm256_add_ps(a, b); }
+  __forceinline vfloat8 operator +(const vfloat8& a, float          b) { return a + vfloat8(b); }
+  __forceinline vfloat8 operator +(float          a, const vfloat8& b) { return vfloat8(a) + b; }
+  // Lane-wise subtraction, same overload pattern.
+  __forceinline vfloat8 operator -(const vfloat8& a, const vfloat8& b) { return _mm256_sub_ps(a, b); }
+  __forceinline vfloat8 operator -(const vfloat8& a, float          b) { return a - vfloat8(b); }
+  __forceinline vfloat8 operator -(float          a, const vfloat8& b) { return vfloat8(a) - b; }
+  // Lane-wise multiplication, same overload pattern.
+  __forceinline vfloat8 operator *(const vfloat8& a, const vfloat8& b) { return _mm256_mul_ps(a, b); }
+  __forceinline vfloat8 operator *(const vfloat8& a, float          b) { return a * vfloat8(b); }
+  __forceinline vfloat8 operator *(float          a, const vfloat8& b) { return vfloat8(a) * b; }
+  // Lane-wise division (_mm256_div_ps), same overload pattern.
+  __forceinline vfloat8 operator /(const vfloat8& a, const vfloat8& b) { return _mm256_div_ps(a, b); }
+  __forceinline vfloat8 operator /(const vfloat8& a, float          b) { return a / vfloat8(b); }
+  __forceinline vfloat8 operator /(float          a, const vfloat8& b) { return vfloat8(a) / b; }
+  // Bitwise operations on the raw lane bits; the vint8 overload of ^ reinterprets b as float bits first.
+  __forceinline vfloat8 operator &(const vfloat8& a, const vfloat8& b) { return _mm256_and_ps(a,b); }
+  __forceinline vfloat8 operator |(const vfloat8& a, const vfloat8& b) { return _mm256_or_ps(a,b); }
+  __forceinline vfloat8 operator ^(const vfloat8& a, const vfloat8& b) { return _mm256_xor_ps(a,b); }
+  __forceinline vfloat8 operator ^(const vfloat8& a, const vint8&   b) { return _mm256_xor_ps(a,_mm256_castsi256_ps(b)); }
+
+  __forceinline vfloat8 min(const vfloat8& a, const vfloat8& b) { return _mm256_min_ps(a, b); } // lane-wise minimum; float overloads broadcast the scalar
+  __forceinline vfloat8 min(const vfloat8& a, float          b) { return _mm256_min_ps(a, vfloat8(b)); }
+  __forceinline vfloat8 min(float          a, const vfloat8& b) { return _mm256_min_ps(vfloat8(a), b); }
+  // Lane-wise maximum, same overload pattern as min().
+  __forceinline vfloat8 max(const vfloat8& a, const vfloat8& b) { return _mm256_max_ps(a, b); }
+  __forceinline vfloat8 max(const vfloat8& a, float          b) { return _mm256_max_ps(a, vfloat8(b)); }
+  __forceinline vfloat8 max(float          a, const vfloat8& b) { return _mm256_max_ps(vfloat8(a), b); }
+
+  /* need "static __forceinline" for MSVC, otherwise we'll link the wrong version in debug mode */
+#if defined(__AVX2__)
+
+  static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) {
+    // Signed 32-bit integer minimum over the raw bit patterns of the lanes.
+    const __m256i a_bits = _mm256_castps_si256(a);
+    const __m256i b_bits = _mm256_castps_si256(b);
+    return _mm256_castsi256_ps(_mm256_min_epi32(a_bits, b_bits));
+  }
+
+  static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) {
+    // Signed 32-bit integer maximum over the raw bit patterns of the lanes.
+    const __m256i a_bits = _mm256_castps_si256(a);
+    const __m256i b_bits = _mm256_castps_si256(b);
+    return _mm256_castsi256_ps(_mm256_max_epi32(a_bits, b_bits));
+  }
+
+  static __forceinline vfloat8 minui(const vfloat8& a, const vfloat8& b) {
+    // Unsigned 32-bit integer minimum over the raw bit patterns of the lanes.
+    const __m256i a_bits = _mm256_castps_si256(a);
+    const __m256i b_bits = _mm256_castps_si256(b);
+    return _mm256_castsi256_ps(_mm256_min_epu32(a_bits, b_bits));
+  }
+
+  static __forceinline vfloat8 maxui(const vfloat8& a, const vfloat8& b) {
+    // Unsigned 32-bit integer maximum over the raw bit patterns of the lanes.
+    const __m256i a_bits = _mm256_castps_si256(a);
+    const __m256i b_bits = _mm256_castps_si256(b);
+    return _mm256_castsi256_ps(_mm256_max_epu32(a_bits, b_bits));
+  }
+
+#else
+
+  static __forceinline vfloat8 mini(const vfloat8& a, const vfloat8& b) { // non-AVX2 path: integer min on the reinterpreted bits via the vint8 min()
+    return asFloat(min(asInt(a),asInt(b)));
+  }
+  // Integer max on the reinterpreted bits via the vint8 max().
+  static __forceinline vfloat8 maxi(const vfloat8& a, const vfloat8& b) {
+    return asFloat(max(asInt(a),asInt(b)));
+  }
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Ternary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX2__) // FMA intrinsics: madd = a*b+c, msub = a*b-c, nmadd = -(a*b)+c, nmsub = -(a*b)-c
+  static __forceinline vfloat8 madd  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmadd_ps(a,b,c); }
+  static __forceinline vfloat8 msub  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fmsub_ps(a,b,c); }
+  static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmadd_ps(a,b,c); }
+  static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return _mm256_fnmsub_ps(a,b,c); }
+#else // same formulas spelled with the separate multiply, add and subtract operators
+  static __forceinline vfloat8 madd  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b+c; }
+  static __forceinline vfloat8 msub  (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return a*b-c; }
+  static __forceinline vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b+c;}
+  static __forceinline vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c) { return -a*b-c; }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8& operator +=(vfloat8& a, const vfloat8& b) { return a = a + b; } // compound assignments forward to the matching binary operator and return a
+  __forceinline vfloat8& operator +=(vfloat8& a, float          b) { return a = a + b; }
+  // a -= b
+  __forceinline vfloat8& operator -=(vfloat8& a, const vfloat8& b) { return a = a - b; }
+  __forceinline vfloat8& operator -=(vfloat8& a, float          b) { return a = a - b; }
+  // a *= b
+  __forceinline vfloat8& operator *=(vfloat8& a, const vfloat8& b) { return a = a * b; }
+  __forceinline vfloat8& operator *=(vfloat8& a, float          b) { return a = a * b; }
+  // a /= b
+  __forceinline vfloat8& operator /=(vfloat8& a, const vfloat8& b) { return a = a / b; }
+  __forceinline vfloat8& operator /=(vfloat8& a, float          b) { return a = a / b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__) // mask path via _mm256_cmp_ps_mask. NOTE(review): _MM_CMPINT_* names are passed as the float-compare predicate - confirm their values match the intended _CMP_* predicates
+  static __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_EQ); }
+  static __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_NE); }
+  static __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LT); }
+  static __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GE); }
+  static __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_GT); }
+  static __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps_mask(a, b, _MM_CMPINT_LE); }
+  // select(): per lane, t where m is set and f elsewhere.
+  static __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
+    return _mm256_mask_blend_ps(m, f, t);
+  }
+#elif !defined(__aarch64__) // vector-mask path via _mm256_cmp_ps. NOTE(review): >= and > use the negated predicates _CMP_NLT_US / _CMP_NLE_US, which presumably also hold for NaN operands - confirm
+  __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);  }
+  __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
+  __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS);  }
+  __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
+  __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
+  __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS);  }
+  // select(): per lane, t where m is set and f elsewhere (blendv takes the second source where the mask is set).
+  __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
+    return _mm256_blendv_ps(f, t, m); 
+  }
+#else // aarch64 path: dedicated _mm256_cmpXX_ps helpers instead of a predicate argument
+  __forceinline vboolf8 operator ==(const vfloat8& a, const vfloat8& b) { return _mm256_cmpeq_ps(a, b);  }
+  __forceinline vboolf8 operator !=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpneq_ps(a, b); }
+  __forceinline vboolf8 operator < (const vfloat8& a, const vfloat8& b) { return _mm256_cmplt_ps(a, b);  }
+  __forceinline vboolf8 operator >=(const vfloat8& a, const vfloat8& b) { return _mm256_cmpge_ps(a, b);  }
+  __forceinline vboolf8 operator > (const vfloat8& a, const vfloat8& b) { return _mm256_cmpgt_ps(a, b);  }
+  __forceinline vboolf8 operator <=(const vfloat8& a, const vfloat8& b) { return _mm256_cmple_ps(a, b);  }
+  // select(): same argument order as the x86 vector-mask path.
+  __forceinline vfloat8 select(const vboolf8& m, const vfloat8& t, const vfloat8& f) {
+    return _mm256_blendv_ps(f, t, m);
+  }
+
+#endif
+
+  template<int mask> // compile-time select: lane k takes t when bit k of mask is set, f otherwise
+    __forceinline vfloat8 select(const vfloat8& t, const vfloat8& f) {
+    return _mm256_blend_ps(f, t, mask);
+  }
+
+  __forceinline vboolf8 operator ==(const vfloat8& a, const float&   b) { return a == vfloat8(b); } // scalar overloads: broadcast the float, then use the vfloat8 comparison
+  __forceinline vboolf8 operator ==(const float&   a, const vfloat8& b) { return vfloat8(a) == b; }
+
+  __forceinline vboolf8 operator !=(const vfloat8& a, const float&   b) { return a != vfloat8(b); }
+  __forceinline vboolf8 operator !=(const float&   a, const vfloat8& b) { return vfloat8(a) != b; }
+
+  __forceinline vboolf8 operator < (const vfloat8& a, const float&   b) { return a <  vfloat8(b); }
+  __forceinline vboolf8 operator < (const float&   a, const vfloat8& b) { return vfloat8(a) <  b; }
+
+  __forceinline vboolf8 operator >=(const vfloat8& a, const float&   b) { return a >= vfloat8(b); }
+  __forceinline vboolf8 operator >=(const float&   a, const vfloat8& b) { return vfloat8(a) >= b; }
+
+  __forceinline vboolf8 operator > (const vfloat8& a, const float&   b) { return a >  vfloat8(b); }
+  __forceinline vboolf8 operator > (const float&   a, const vfloat8& b) { return vfloat8(a) >  b; }
+
+  __forceinline vboolf8 operator <=(const vfloat8& a, const float&   b) { return a <= vfloat8(b); }
+  __forceinline vboolf8 operator <=(const float&   a, const vfloat8& b) { return vfloat8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vfloat8& a, const vfloat8& b) { return a == b; } // named forms of the comparison operators above
+  __forceinline vboolf8 ne(const vfloat8& a, const vfloat8& b) { return a != b; }
+  __forceinline vboolf8 lt(const vfloat8& a, const vfloat8& b) { return a <  b; }
+  __forceinline vboolf8 ge(const vfloat8& a, const vfloat8& b) { return a >= b; }
+  __forceinline vboolf8 gt(const vfloat8& a, const vfloat8& b) { return a >  b; }
+  __forceinline vboolf8 le(const vfloat8& a, const vfloat8& b) { return a <= b; }
+
+#if defined(__AVX512VL__) // masked comparisons: the incoming mask is passed straight to _mm256_mask_cmp_ps_mask
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_EQ); }
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_NE); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LT); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GE); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_GT); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return _mm256_mask_cmp_ps_mask(mask, a, b, _MM_CMPINT_LE); }
+#else // masked comparisons: the unmasked comparison result ANDed with the incoming mask
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a == b); }
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a != b); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <  b); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >= b); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a >  b); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vfloat8& a, const vfloat8& b) { return mask & (a <= b); }
+#endif
+
+  __forceinline vfloat8 lerp(const vfloat8& a, const vfloat8& b, const vfloat8& t) { // linear interpolation per lane
+    return madd(t,b-a,a); // t*(b-a) + a
+  }
+
+  __forceinline bool isvalid (const vfloat8& v) { // true when every lane lies strictly between -FLT_LARGE and +FLT_LARGE
+    return all((v > vfloat8(-FLT_LARGE)) & (v < vfloat8(+FLT_LARGE)));
+  }
+  // True when every lane lies within [-FLT_MAX, +FLT_MAX].
+  __forceinline bool is_finite (const vfloat8& a) {
+    return all((a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX)));
+  }
+  // Same range test combined with a lane mask through all(valid, ...); presumably only lanes set in valid must pass - confirm against all().
+  __forceinline bool is_finite (const vboolf8& valid, const vfloat8& a) {
+    return all(valid, (a >= vfloat8(-FLT_MAX)) & (a <= vfloat8(+FLT_MAX)));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Rounding Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if !defined(__aarch64__) // x86: all four rounding modes go through _mm256_round_ps
+  __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF    ); }
+  __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF    ); }
+  __forceinline vfloat8 trunc(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO       ); }
+  __forceinline vfloat8 round(const vfloat8& a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT); }
+#else // aarch64. NOTE(review): only floor and ceil are defined in this branch; trunc and round have no counterpart here
+  __forceinline vfloat8 floor(const vfloat8& a) { return _mm256_floor_ps(a); }
+  __forceinline vfloat8 ceil (const vfloat8& a) { return _mm256_ceil_ps(a); }
+#endif
+
+
+  __forceinline vfloat8 frac (const vfloat8& a) { return a-floor(a); } // fractional part relative to floor(): a - floor(a)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 unpacklo(const vfloat8& a, const vfloat8& b) { return _mm256_unpacklo_ps(a, b); } // interleave via _mm256_unpacklo_ps
+  __forceinline vfloat8 unpackhi(const vfloat8& a, const vfloat8& b) { return _mm256_unpackhi_ps(a, b); } // interleave via _mm256_unpackhi_ps
+  // Broadcasts element i of each 128-bit half to all four lanes of that half.
+  template<int i>
+  __forceinline vfloat8 shuffle(const vfloat8& v) {
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i, i, i, i));
+  }
+  // Selects 128-bit halves of v: the low result half is half i0, the high result half is half i1.
+  template<int i0, int i1>
+  __forceinline vfloat8 shuffle4(const vfloat8& v) {
+    return _mm256_permute2f128_ps(v, v, (i1 << 4) | (i0 << 0));
+  }
+  // Two-source variant: selectors 0/1 pick the low/high half of a, 2/3 those of b.
+  template<int i0, int i1>
+  __forceinline vfloat8 shuffle4(const vfloat8& a, const vfloat8& b) {
+    return _mm256_permute2f128_ps(a, b, (i1 << 4) | (i0 << 0));
+  }
+  // In-half permutation: lane k of each 128-bit half receives element i_k of that half of v.
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat8 shuffle(const vfloat8& v) {
+    return _mm256_permute_ps(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+  // Two-source shuffle: within each half, lanes 0-1 come from a (i0,i1) and lanes 2-3 from b (i2,i3).
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vfloat8 shuffle(const vfloat8& a, const vfloat8& b) {
+    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+  // Specializations that map three fixed patterns onto the dedicated duplicate intrinsics (not on aarch64).
+#if !defined(__aarch64__)
+  template<> __forceinline vfloat8 shuffle<0, 0, 2, 2>(const vfloat8& v) { return _mm256_moveldup_ps(v); }
+  template<> __forceinline vfloat8 shuffle<1, 1, 3, 3>(const vfloat8& v) { return _mm256_movehdup_ps(v); }
+  template<> __forceinline vfloat8 shuffle<0, 1, 0, 1>(const vfloat8& v) { return _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(v))); }
+#endif
+
+  __forceinline vfloat8 broadcast(const float* ptr) { return _mm256_broadcast_ss(ptr); } // replicates *ptr into all 8 lanes
+  template<size_t i> __forceinline vfloat8 insert4(const vfloat8& a, const vfloat4& b) { return _mm256_insertf128_ps(a, b, i); } // replaces 128-bit half i of a with b
+  template<size_t i> __forceinline vfloat4 extract4   (const vfloat8& a) { return _mm256_extractf128_ps(a, i); } // returns 128-bit half i
+  template<>         __forceinline vfloat4 extract4<0>(const vfloat8& a) { return _mm256_castps256_ps128(a);   } // low half through a plain cast
+  // Lane 0 as a scalar float.
+  __forceinline float toScalar(const vfloat8& v) { return _mm_cvtss_f32(_mm256_castps256_ps128(v)); }
+  // Places a in the low half through _mm256_castps128_ps256; NOTE(review): the upper half is whatever that cast leaves there - confirm callers do not read it.
+  __forceinline vfloat8 assign(const vfloat4& a) { return _mm256_castps128_ps256(a); }
+  // Full 8-lane permutation by a runtime index vector (AVX2 only, not on aarch64).
+#if defined (__AVX2__) && !defined(__aarch64__)
+  __forceinline vfloat8 permute(const vfloat8& a, const __m256i& index) {
+    return _mm256_permutevar8x32_ps(a, index);
+  }
+#endif
+
+#if defined(__AVX512VL__) // align_shift_right<i>(a,b): forwards to _mm256_alignr_epi32 on the reinterpreted bits of a and b with shift count i
+  template<int i>
+  static __forceinline vfloat8 align_shift_right(const vfloat8& a, const vfloat8& b) {
+    return _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(a), _mm256_castps_si256(b), i));
+  }  
+#endif
+
+#if defined (__AVX_I__) // half-precision conversions; mode is forwarded unchanged to _mm256_cvtps_ph
+  template<const int mode>
+  static __forceinline vint4 convert_to_hf16(const vfloat8& a) {
+    return _mm256_cvtps_ph(a, mode);
+  }
+  // Inverse direction: expands the packed half floats held in a vint4 through _mm256_cvtph_ps.
+  static __forceinline vfloat8 convert_from_hf16(const vint4& a) {
+    return _mm256_cvtph_ps(a);
+  }
+#endif
+
+  __forceinline vfloat4 broadcast4f(const vfloat8& a, const size_t k) { // lane k of a replicated into a vfloat4
+    return vfloat4::broadcast(&a[k]);
+  }
+  // Lane k of a replicated into a vfloat8.
+  __forceinline vfloat8 broadcast8f(const vfloat8& a, const size_t k) {
+    return vfloat8::broadcast(&a[k]);
+  }
+
+#if defined(__AVX512VL__) // shift_right_1: lane k receives x[k+1]; this path passes zero as the upper operand
+  static __forceinline vfloat8 shift_right_1(const vfloat8& x) {
+    return align_shift_right<1>(zero,x);
+  }
+#else // NOTE(review): by my reading this path leaves x[0] in lane 7 (a rotation), not the zero of the AVX-512VL path - confirm callers ignore lane 7
+  static __forceinline vfloat8 shift_right_1(const vfloat8& x) {
+    const vfloat8 t0 = shuffle<1,2,3,0>(x); // rotate by one within each 128-bit half
+    const vfloat8 t1 = shuffle4<1,0>(t0); // the same data with the two halves swapped
+    return _mm256_blend_ps(t0,t1,0x88); // lanes 3 and 7 come from the swapped copy
+  }
+#endif
+
+  __forceinline vint8 floori(const vfloat8& a) { // floor(a) converted through the vint8(vfloat8) constructor
+    return vint8(floor(a));
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Transpose
+  ////////////////////////////////////////////////////////////////////////////////
+
+  // 4-row transpose within each 128-bit half: c_k collects element k of r0..r3.
+  // All temporaries are formed before the first output is written.
+  __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3)
+  {
+    const vfloat8 lo02 = unpacklo(r0,r2), hi02 = unpackhi(r0,r2);
+    const vfloat8 lo13 = unpacklo(r1,r3), hi13 = unpackhi(r1,r3);
+    c0 = unpacklo(lo02,lo13);
+    c1 = unpackhi(lo02,lo13);
+    c2 = unpacklo(hi02,hi13);
+    c3 = unpackhi(hi02,hi13);
+  }
+
+  // Variant of the 4-row transpose that produces only the first three outputs
+  // (element 0, 1 and 2 of r0..r3 within each 128-bit half).
+  __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, vfloat8& c0, vfloat8& c1, vfloat8& c2)
+  {
+    const vfloat8 lo02 = unpacklo(r0,r2), hi02 = unpackhi(r0,r2);
+    const vfloat8 lo13 = unpacklo(r1,r3), hi13 = unpackhi(r1,r3);
+    c0 = unpacklo(lo02,lo13);
+    c1 = unpackhi(lo02,lo13);
+    c2 = unpacklo(hi02,hi13);
+  }
+
+  __forceinline void transpose(const vfloat8& r0, const vfloat8& r1, const vfloat8& r2, const vfloat8& r3, const vfloat8& r4, const vfloat8& r5, const vfloat8& r6, const vfloat8& r7,
+                               vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3, vfloat8& c4, vfloat8& c5, vfloat8& c6, vfloat8& c7)
+  { // 8x8 transpose: two 4-row transposes, then the 128-bit halves are regrouped
+    vfloat8 top0,top1,top2,top3; transpose(r0,r1,r2,r3,top0,top1,top2,top3); // rows 0..3
+    vfloat8 bot0,bot1,bot2,bot3; transpose(r4,r5,r6,r7,bot0,bot1,bot2,bot3); // rows 4..7
+    c0 = shuffle4<0,2>(top0,bot0); // low halves -> columns 0..3
+    c1 = shuffle4<0,2>(top1,bot1);
+    c2 = shuffle4<0,2>(top2,bot2);
+    c3 = shuffle4<0,2>(top3,bot3);
+    c4 = shuffle4<1,3>(top0,bot0); // high halves -> columns 4..7
+    c5 = shuffle4<1,3>(top1,bot1);
+    c6 = shuffle4<1,3>(top2,bot2);
+    c7 = shuffle4<1,3>(top3,bot3);
+  }
+
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7,
+                               vfloat8& c0, vfloat8& c1, vfloat8& c2, vfloat8& c3)
+  { // pairs rows (r0,r4) .. (r3,r7) into vfloat8 values and reuses the 4-output vfloat8 transpose
+    transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2, c3);
+  }
+  // Same pairing, forwarding to the 3-output vfloat8 transpose.
+  __forceinline void transpose(const vfloat4& r0, const vfloat4& r1, const vfloat4& r2, const vfloat4& r3, const vfloat4& r4, const vfloat4& r5, const vfloat4& r6, const vfloat4& r7,
+                               vfloat8& c0, vfloat8& c1, vfloat8& c2)
+  {
+    transpose(vfloat8(r0,r4), vfloat8(r1,r5), vfloat8(r2,r6), vfloat8(r3,r7), c0, c1, c2);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+#if !defined(__aarch64__) // shuffle-based reductions: *2 combines neighbouring pairs, *4 each 128-bit half, the unsuffixed form all 8 lanes
+  __forceinline vfloat8 vreduce_min2(const vfloat8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+  __forceinline vfloat8 vreduce_min4(const vfloat8& v) { vfloat8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+  __forceinline vfloat8 vreduce_min (const vfloat8& v) { vfloat8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+  // Maximum, same three-step scheme.
+  __forceinline vfloat8 vreduce_max2(const vfloat8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+  __forceinline vfloat8 vreduce_max4(const vfloat8& v) { vfloat8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+  __forceinline vfloat8 vreduce_max (const vfloat8& v) { vfloat8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+  // Sum, same three-step scheme.
+  __forceinline vfloat8 vreduce_add2(const vfloat8& v) { return v + shuffle<1,0,3,2>(v); }
+  __forceinline vfloat8 vreduce_add4(const vfloat8& v) { vfloat8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); }
+  __forceinline vfloat8 vreduce_add (const vfloat8& v) { vfloat8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); }
+  // Scalar results: lane 0 of the corresponding full reduction.
+  __forceinline float reduce_min(const vfloat8& v) { return toScalar(vreduce_min(v)); }
+  __forceinline float reduce_max(const vfloat8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline float reduce_add(const vfloat8& v) { return toScalar(vreduce_add(v)); }
+#else // aarch64: lo/hi parts combined first, then a NEON across-vector reduction. NOTE(review): no vreduce_add or *2/*4 variants are defined in this branch
+  __forceinline float reduce_min(const vfloat8& v) { return vminvq_f32(_mm_min_ps(v.v.lo,v.v.hi)); }
+  __forceinline float reduce_max(const vfloat8& v) { return vmaxvq_f32(_mm_max_ps(v.v.lo,v.v.hi)); }
+  __forceinline vfloat8 vreduce_min(const vfloat8& v) { return vfloat8(reduce_min(v)); }
+  __forceinline vfloat8 vreduce_max(const vfloat8& v) { return vfloat8(reduce_max(v)); }
+  __forceinline float reduce_add(const vfloat8& v) { return vaddvq_f32(_mm_add_ps(v.v.lo,v.v.hi)); }
+
+#endif
+  __forceinline size_t select_min(const vboolf8& valid, const vfloat8& v)
+  { // lowest lane index among the valid lanes that equal the minimum; if none compares equal, the lowest valid lane
+    const vfloat8 candidates = select(valid,v,vfloat8(pos_inf)); // invalid lanes are replaced by +inf
+    const vbool8 at_min = valid & (candidates == vreduce_min(candidates));
+    return bsf(movemask(any(at_min) ? at_min : valid));
+  }
+
+  __forceinline size_t select_max(const vboolf8& valid, const vfloat8& v)
+  { // lowest lane index among the valid lanes that equal the maximum; if none compares equal, the lowest valid lane
+    const vfloat8 candidates = select(valid,v,vfloat8(neg_inf)); // invalid lanes are replaced by -inf
+    const vbool8 at_max = valid & (candidates == vreduce_max(candidates));
+    return bsf(movemask(any(at_max) ? at_max : valid));
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Euclidean Space Operators (pairs of Vec3fa's)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  //__forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) {
+  //  return vreduce_add4(a*b);
+  //}
+
+  __forceinline vfloat8 dot(const vfloat8& a, const vfloat8& b) { // per 128-bit half: dot product of the first three components
+    return _mm256_dp_ps(a,b,0x7F); // 0x7 multiplies components 0..2, 0xF writes the sum to all four lanes of the half
+  }
+
+  __forceinline vfloat8 cross(const vfloat8& a, const vfloat8& b)
+  {
+    // Per 128-bit half: cross product of the first three components; the
+    // fourth lane evaluates to a.w*b.w - a.w*b.w.
+    const vfloat8 a_yzx = shuffle<1,2,0,3>(a);
+    const vfloat8 b_yzx = shuffle<1,2,0,3>(b);
+    return shuffle<1,2,0,3>(msub(a,b_yzx,a_yzx*b));
+  }
+
+  //__forceinline float sqr_length (const vfloat<8>& a) { return dot(a,a); }
+  //__forceinline float rcp_length (const vfloat<8>& a) { return rsqrt(dot(a,a)); }
+  //__forceinline float rcp_length2(const vfloat<8>& a) { return rcp(dot(a,a)); }
+  //__forceinline float length     (const vfloat<8>& a) { return sqrt(dot(a,a)); }
+  __forceinline vfloat<8> normalize(const vfloat<8>& a) { return a*rsqrt(dot(a,a)); } // scales a by rsqrt(dot(a,a)), i.e. by the rsqrt() estimate of 1/length
+  //__forceinline float distance(const vfloat<8>& a, const vfloat<8>& b) { return length(a-b); }
+  //__forceinline float halfArea(const vfloat<8>& d) { return madd(d.x,(d.y+d.z),d.y*d.z); }
+  //__forceinline float area    (const vfloat<8>& d) { return 2.0f*halfArea(d); }
+  //__forceinline vfloat<8> reflect(const vfloat<8>& V, const vfloat<8>& N) { return 2.0f*dot(V,N)*N-V; }
+
+  //__forceinline vfloat<8> normalize_safe(const vfloat<8>& a) {
+  //  const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d);
+  //}
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// In Register Sorting
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vfloat8 sort_ascending(const vfloat8& v)
+  { // six compare-exchange stages: each pairs every lane with a partner, takes min and max, and blends them by a fixed lane pattern
+    const vfloat8 a0 = v;
+    const vfloat8 b0 = shuffle<1,0,3,2>(a0); // partner: neighbouring lane
+    const vfloat8 c0 = min(a0,b0);
+    const vfloat8 d0 = max(a0,b0);
+    const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); // set bits take the min (c0), clear bits the max (d0)
+    const vfloat8 b1 = shuffle<2,3,0,1>(a1); // partner: two lanes away within the 128-bit half
+    const vfloat8 c1 = min(a1,b1);
+    const vfloat8 d1 = max(a1,b1);
+    const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+    const vfloat8 b2 = shuffle<1,0,3,2>(a2);
+    const vfloat8 c2 = min(a2,b2);
+    const vfloat8 d2 = max(a2,b2);
+    const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+    const vfloat8 b3 = shuffle4<1,0>(a3); // partner: same position in the other 128-bit half
+    const vfloat8 c3 = min(a3,b3);
+    const vfloat8 d3 = max(a3,b3);
+    const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); // low half keeps the minima, high half the maxima
+    const vfloat8 b4 = shuffle<2,3,0,1>(a4);
+    const vfloat8 c4 = min(a4,b4);
+    const vfloat8 d4 = max(a4,b4);
+    const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+    const vfloat8 b5 = shuffle<1,0,3,2>(a5);
+    const vfloat8 c5 = min(a5,b5);
+    const vfloat8 d5 = max(a5,b5);
+    const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); // even lanes take the min of each final pair
+    return a6;
+  }
+
+   __forceinline vfloat8 sort_descending(const vfloat8& v)
+  { // same six-stage network as sort_ascending with min and max exchanged
+    const vfloat8 a0 = v;
+    const vfloat8 b0 = shuffle<1,0,3,2>(a0); // partner: neighbouring lane
+    const vfloat8 c0 = max(a0,b0);
+    const vfloat8 d0 = min(a0,b0);
+    const vfloat8 a1 = select<0x99 /* 0b10011001 */>(c0,d0); // set bits take the max (c0), clear bits the min (d0)
+    const vfloat8 b1 = shuffle<2,3,0,1>(a1); // partner: two lanes away within the 128-bit half
+    const vfloat8 c1 = max(a1,b1);
+    const vfloat8 d1 = min(a1,b1);
+    const vfloat8 a2 = select<0xc3 /* 0b11000011 */>(c1,d1);
+    const vfloat8 b2 = shuffle<1,0,3,2>(a2);
+    const vfloat8 c2 = max(a2,b2);
+    const vfloat8 d2 = min(a2,b2);
+    const vfloat8 a3 = select<0xa5 /* 0b10100101 */>(c2,d2);
+    const vfloat8 b3 = shuffle4<1,0>(a3); // partner: same position in the other 128-bit half
+    const vfloat8 c3 = max(a3,b3);
+    const vfloat8 d3 = min(a3,b3);
+    const vfloat8 a4 = select<0xf /* 0b00001111 */>(c3,d3); // low half keeps the maxima, high half the minima
+    const vfloat8 b4 = shuffle<2,3,0,1>(a4);
+    const vfloat8 c4 = max(a4,b4);
+    const vfloat8 d4 = min(a4,b4);
+    const vfloat8 a5 = select<0x33 /* 0b00110011 */>(c4,d4);
+    const vfloat8 b5 = shuffle<1,0,3,2>(a5);
+    const vfloat8 c5 = max(a5,b5);
+    const vfloat8 d5 = min(a5,b5);
+    const vfloat8 a6 = select<0x55 /* 0b01010101 */>(c5,d5); // even lanes take the max of each final pair
+    return a6;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vfloat8& a) { // prints the 8 lanes as "<a0, a1, ..., a7>"
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vint16_avx512.h b/thirdparty/embree-aarch64/common/simd/vint16_avx512.h
new file mode 100644
index 0000000000..3249bc2b45
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vint16_avx512.h
@@ -0,0 +1,490 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{ 
+  /* 16-wide AVX-512 integer type: sixteen 32-bit int lanes held in one __m512i */
+  template<>
+  struct vint<16>
+  {
+    ALIGNED_STRUCT_(64);
+    
+    typedef vboolf16 Bool;
+    typedef vint16   Int;
+    typedef vfloat16 Float;
+
+    enum  { size = 16 }; // number of SIMD elements
+    union {              // data
+      __m512i v;         // the whole register
+      int i[16];         // the same storage viewed lane by lane
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vint() {} // leaves the lanes uninitialized
+    __forceinline vint(const vint16& t) { v = t.v; }
+    __forceinline vint16& operator =(const vint16& f) { v = f.v; return *this; }
+
+    __forceinline vint(const __m512i& t) { v = t; }
+    __forceinline operator __m512i() const { return v; }
+    __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } // lower 8 lanes only
+
+    __forceinline vint(int i) { // broadcast i to all 16 lanes
+      v = _mm512_set1_epi32(i);
+    }
+    
+    __forceinline vint(int a, int b, int c, int d) { // pattern a,b,c,d repeated in every group of four lanes
+      v = _mm512_set4_epi32(d,c,b,a);      
+    }
+
+    __forceinline vint(int a0 , int a1 , int a2 , int a3,
+                       int a4 , int a5 , int a6 , int a7,
+                       int a8 , int a9 , int a10, int a11,
+                       int a12, int a13, int a14, int a15)
+    {
+      v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); // set_epi32 lists the highest lane first, so a0 lands in lane 0
+    }
+
+    __forceinline vint(const vint4& i) { // replicate the 4 lanes of i into all four 128-bit quarters
+      v = _mm512_broadcast_i32x4(i);
+    }
+
+    __forceinline vint(const vint4& a, const vint4& b, const vint4& c, const vint4& d) { // a -> lanes 0-3, b -> 4-7, c -> 8-11, d -> 12-15
+      v = _mm512_castsi128_si512(a);
+      v = _mm512_inserti32x4(v, b, 1);
+      v = _mm512_inserti32x4(v, c, 2);
+      v = _mm512_inserti32x4(v, d, 3);
+    }
+
+    __forceinline vint(const vint8& i) { // replicate the 8 lanes of i into both 256-bit halves (done through pd/ps casts)
+      v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i))));
+    }
+
+    __forceinline vint(const vint8& a, const vint8& b) { // a -> lanes 0-7, b -> lanes 8-15
+      v = _mm512_castsi256_si512(a);
+      v = _mm512_inserti64x4(v, b, 1);
+    }
+   
+    __forceinline explicit vint(const __m512& f) { // numeric float -> int32 conversion, not a bit cast
+      v = _mm512_cvtps_epi32(f);
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vint(ZeroTy)   : v(_mm512_setzero_epi32()) {}
+    __forceinline vint(OneTy)    : v(_mm512_set1_epi32(1)) {}
+    __forceinline vint(PosInfTy) : v(_mm512_set1_epi32(pos_inf)) {}
+    __forceinline vint(NegInfTy) : v(_mm512_set1_epi32(neg_inf)) {}
+    __forceinline vint(StepTy)   : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} // lane k holds k
+    __forceinline vint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} // setr lists lane 0 first: lane k holds 15-k
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint16 load (const void* addr) { return _mm512_load_si512((int*)addr); } // aligned 64-byte load
+
+    static __forceinline vint16 load(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_load_si128((__m128i*)ptr)); } // aligned load of 16 bytes, each zero-extended to 32 bits
+    static __forceinline vint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_load_si256((__m256i*)ptr)); } // aligned load of 16 shorts, each zero-extended to 32 bits
+
+    static __forceinline vint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); } // unaligned variant of load(const uint8_t*)
+    static __forceinline vint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); } // unaligned variant of load(const unsigned short*)
+
+    static __forceinline vint16 loadu(const void* addr) { return _mm512_loadu_si512(addr); } // unaligned 64-byte load
+
+    static __forceinline vint16 load (const vboolf16& mask, const void* addr) { return _mm512_mask_load_epi32 (_mm512_setzero_epi32(),mask,addr); } // lanes with a clear mask bit become 0
+    static __forceinline vint16 loadu(const vboolf16& mask, const void* addr) { return _mm512_mask_loadu_epi32(_mm512_setzero_epi32(),mask,addr); } // unaligned; lanes with a clear mask bit become 0
+
+    static __forceinline void store (void* ptr, const vint16& v) { _mm512_store_si512 (ptr,v); } // aligned store
+    static __forceinline void storeu(void* ptr, const vint16& v) { _mm512_storeu_si512(ptr,v); } // unaligned store
+ 
+    static __forceinline void store (const vboolf16& mask, void* addr, const vint16& v2) { _mm512_mask_store_epi32(addr,mask,v2); } // writes only lanes whose mask bit is set
+    static __forceinline void storeu(const vboolf16& mask, void* ptr,  const vint16& f) { _mm512_mask_storeu_epi32((int*)ptr,mask,f); } // unaligned; writes only lanes whose mask bit is set
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vint16& a) { _mm512_stream_si512((__m512i*)ptr,a); } // streaming (non-temporal) store
+
+    /* pass by value to avoid compiler generating inefficient code */
+    static __forceinline void storeu_compact(const vboolf16 mask, void* addr, vint16 reg) { // writes the active lanes contiguously starting at addr
+      _mm512_mask_compressstoreu_epi32(addr,mask,reg);
+    }
+
+    static __forceinline void storeu_compact_single(const vboolf16 mask, void* addr, vint16 reg) { // writes 4 bytes: the first active lane
+      //_mm512_mask_compressstoreu_epi32(addr,mask,reg);
+      *(float*)addr = mm512_cvtss_f32(_mm512_mask_compress_ps(_mm512_castsi512_ps(reg),mask,_mm512_castsi512_ps(reg))); // compress in-register, then take lane 0; mm512_cvtss_f32 is declared outside this file
+    }
+
+    static __forceinline vint16 compact64bit(const vboolf16& mask, vint16 &v) { // compress treating the register as 64-bit elements
+      return _mm512_mask_compress_epi64(v,mask,v);
+    }
+
+    static __forceinline vint16 compact(const vboolf16& mask, vint16 &v) { // active lanes packed toward lane 0; the remaining lanes keep v's values
+      return _mm512_mask_compress_epi32(v,mask,v);
+    }
+
+    static __forceinline vint16 compact(const vboolf16& mask, const vint16 &a, vint16 &b) { // active lanes of b packed toward lane 0; the remaining lanes come from a
+      return _mm512_mask_compress_epi32(a,mask,b);
+    }
+
+    static __forceinline vint16 expand(const vboolf16& mask, const vint16& a, vint16& b) { // consecutive low lanes of a are spread to the active positions; inactive lanes come from b
+      return _mm512_mask_expand_epi32(b,mask,a);
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to every index
+    static __forceinline vint16 gather(const int* ptr, const vint16& index) {
+      return _mm512_i32gather_epi32(index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vint16 gather(const vboolf16& mask, const int* ptr, const vint16& index) { // inactive lanes are left undefined
+      return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vint16 gather(const vboolf16& mask, vint16& dest, const int* ptr, const vint16& index) { // inactive lanes are taken from dest
+      return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(int* ptr, const vint16& index, const vint16& v) { // lane k of v is written to ptr + index[k]*scale bytes
+      _mm512_i32scatter_epi32((int*)ptr,index,v,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf16& mask, int* ptr, const vint16& index, const vint16& v) { // only lanes whose mask bit is set are written
+      _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale);
+    }
+
+    static __forceinline vint16 broadcast64bit(size_t v) { // the 64-bit value is replicated into all eight 64-bit slots
+      return _mm512_set1_epi64(v);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       int& operator [](size_t index)       { assert(index < 16); return i[index]; }
+    __forceinline const int& operator [](size_t index) const { assert(index < 16); return i[index]; }
+
+    __forceinline unsigned int uint    (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; } // lane reinterpreted as unsigned
+    __forceinline size_t&      uint64_t(size_t index) const { assert(index < 8);  return ((size_t*)i)[index]; } // storage viewed as 8 size_t slots; NOTE(review): the cast hands out a mutable reference from a const method
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 asBool(const vint16& a) { const __m512i lanes = a; return _mm512_movepi32_mask(lanes); } // one mask bit per lane, taken from the lane's top bit
+
+  __forceinline vint16 operator +(const vint16& a) { return vint16(a); } // identity
+  __forceinline vint16 operator -(const vint16& a) { const __m512i zeros = _mm512_setzero_epi32(); return _mm512_sub_epi32(zeros, a); } // 0 - a per lane
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 operator +(const vint16& a, const vint16& b) { return _mm512_add_epi32(a, b); } // lane-wise sum; scalar operands below are broadcast first
+  __forceinline vint16 operator +(const vint16& a, int           b) { return _mm512_add_epi32(a, _mm512_set1_epi32(b)); }
+  __forceinline vint16 operator +(int           a, const vint16& b) { return _mm512_add_epi32(_mm512_set1_epi32(a), b); }
+
+  __forceinline vint16 operator -(const vint16& a, const vint16& b) { return _mm512_sub_epi32(a, b); } // lane-wise difference
+  __forceinline vint16 operator -(const vint16& a, int           b) { return _mm512_sub_epi32(a, _mm512_set1_epi32(b)); }
+  __forceinline vint16 operator -(int           a, const vint16& b) { return _mm512_sub_epi32(_mm512_set1_epi32(a), b); }
+
+  __forceinline vint16 operator *(const vint16& a, const vint16& b) { return _mm512_mullo_epi32(a, b); } // low 32 bits of each lane product
+  __forceinline vint16 operator *(const vint16& a, int           b) { return _mm512_mullo_epi32(a, _mm512_set1_epi32(b)); }
+  __forceinline vint16 operator *(int           a, const vint16& b) { return _mm512_mullo_epi32(_mm512_set1_epi32(a), b); }
+
+  __forceinline vint16 operator &(const vint16& a, const vint16& b) { return _mm512_and_epi32(a, b); } // bitwise AND
+  __forceinline vint16 operator &(const vint16& a, int           b) { return _mm512_and_epi32(a, _mm512_set1_epi32(b)); }
+  __forceinline vint16 operator &(int           a, const vint16& b) { return _mm512_and_epi32(_mm512_set1_epi32(a), b); }
+
+  __forceinline vint16 operator |(const vint16& a, const vint16& b) { return _mm512_or_epi32(a, b); } // bitwise OR
+  __forceinline vint16 operator |(const vint16& a, int           b) { return _mm512_or_epi32(a, _mm512_set1_epi32(b)); }
+  __forceinline vint16 operator |(int           a, const vint16& b) { return _mm512_or_epi32(_mm512_set1_epi32(a), b); }
+
+  __forceinline vint16 operator ^(const vint16& a, const vint16& b) { return _mm512_xor_epi32(a, b); } // bitwise XOR
+  __forceinline vint16 operator ^(const vint16& a, int           b) { return _mm512_xor_epi32(a, _mm512_set1_epi32(b)); }
+  __forceinline vint16 operator ^(int           a, const vint16& b) { return _mm512_xor_epi32(_mm512_set1_epi32(a), b); }
+
+  __forceinline vint16 operator <<(const vint16& a, int n) { return _mm512_slli_epi32(a, n); } // same shift count for every lane
+  __forceinline vint16 operator >>(const vint16& a, int n) { return _mm512_srai_epi32(a, n); } // arithmetic (sign-propagating) right shift
+
+  __forceinline vint16 operator <<(const vint16& a, const vint16& n) { return _mm512_sllv_epi32(a, n); } // per-lane shift counts
+  __forceinline vint16 operator >>(const vint16& a, const vint16& n) { return _mm512_srav_epi32(a, n); } // per-lane counts, arithmetic
+
+  __forceinline vint16 sll (const vint16& a, int b) { return a << b; }                  // named form of operator <<
+  __forceinline vint16 sra (const vint16& a, int b) { return a >> b; }                  // named form of operator >>
+  __forceinline vint16 srl (const vint16& a, int b) { return _mm512_srli_epi32(a, b); } // logical (zero-filling) right shift
+  
+  __forceinline vint16 min(const vint16& a, const vint16& b) { return _mm512_min_epi32(a, b); } // signed lane-wise minimum
+  __forceinline vint16 min(const vint16& a, int           b) { return _mm512_min_epi32(a, _mm512_set1_epi32(b)); }
+  __forceinline vint16 min(int           a, const vint16& b) { return _mm512_min_epi32(_mm512_set1_epi32(a), b); }
+
+  __forceinline vint16 max(const vint16& a, const vint16& b) { return _mm512_max_epi32(a, b); } // signed lane-wise maximum
+  __forceinline vint16 max(const vint16& a, int           b) { return _mm512_max_epi32(a, _mm512_set1_epi32(b)); }
+  __forceinline vint16 max(int           a, const vint16& b) { return _mm512_max_epi32(_mm512_set1_epi32(a), b); }
+  
+  __forceinline vint16 umin(const vint16& a, const vint16& b) { return _mm512_min_epu32(a, b); } // lanes compared as unsigned
+  __forceinline vint16 umax(const vint16& a, const vint16& b) { return _mm512_max_epu32(a, b); } // lanes compared as unsigned
+
+  __forceinline vint16 mask_add(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } // a+b where mask is set, c elsewhere
+  __forceinline vint16 mask_sub(const vboolf16& mask, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } // a-b where mask is set, c elsewhere
+
+  __forceinline vint16 mask_and(const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } // a&b where m is set, c elsewhere
+  __forceinline vint16 mask_or (const vboolf16& m, vint16& c, const vint16& a, const vint16& b) { return _mm512_mask_or_epi32(c,m,a,b); }  // a|b where m is set, c elsewhere
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16& operator +=(vint16& a, const vint16& b) { a = a + b; return a; } // each compound form applies the matching binary operator and stores into a
+  __forceinline vint16& operator +=(vint16& a, int           b) { a = a + b; return a; }
+  
+  __forceinline vint16& operator -=(vint16& a, const vint16& b) { a = a - b; return a; }
+  __forceinline vint16& operator -=(vint16& a, int           b) { a = a - b; return a; }
+
+  __forceinline vint16& operator *=(vint16& a, const vint16& b) { a = a * b; return a; }
+  __forceinline vint16& operator *=(vint16& a, int           b) { a = a * b; return a; }
+  
+  __forceinline vint16& operator &=(vint16& a, const vint16& b) { a = a & b; return a; }
+  __forceinline vint16& operator &=(vint16& a, int           b) { a = a & b; return a; }
+  
+  __forceinline vint16& operator |=(vint16& a, const vint16& b) { a = a | b; return a; }
+  __forceinline vint16& operator |=(vint16& a, int           b) { a = a | b; return a; }
+  
+  __forceinline vint16& operator <<=(vint16& a, int b) { a = a << b; return a; }
+  __forceinline vint16& operator >>=(vint16& a, int b) { a = a >> b; return a; }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 operator ==(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } // signed compares; result is one mask bit per lane
+  __forceinline vboolf16 operator ==(const vint16& a, int           b) { return _mm512_cmp_epi32_mask(a,_mm512_set1_epi32(b),_MM_CMPINT_EQ); }
+  __forceinline vboolf16 operator ==(int           a, const vint16& b) { return _mm512_cmp_epi32_mask(_mm512_set1_epi32(a),b,_MM_CMPINT_EQ); }
+  
+  __forceinline vboolf16 operator !=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 operator !=(const vint16& a, int           b) { return _mm512_cmp_epi32_mask(a,_mm512_set1_epi32(b),_MM_CMPINT_NE); }
+  __forceinline vboolf16 operator !=(int           a, const vint16& b) { return _mm512_cmp_epi32_mask(_mm512_set1_epi32(a),b,_MM_CMPINT_NE); }
+  
+  __forceinline vboolf16 operator < (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 operator < (const vint16& a, int           b) { return _mm512_cmp_epi32_mask(a,_mm512_set1_epi32(b),_MM_CMPINT_LT); }
+  __forceinline vboolf16 operator < (int           a, const vint16& b) { return _mm512_cmp_epi32_mask(_mm512_set1_epi32(a),b,_MM_CMPINT_LT); }
+  
+  __forceinline vboolf16 operator >=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 operator >=(const vint16& a, int           b) { return _mm512_cmp_epi32_mask(a,_mm512_set1_epi32(b),_MM_CMPINT_GE); }
+  __forceinline vboolf16 operator >=(int           a, const vint16& b) { return _mm512_cmp_epi32_mask(_mm512_set1_epi32(a),b,_MM_CMPINT_GE); }
+
+  __forceinline vboolf16 operator > (const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 operator > (const vint16& a, int           b) { return _mm512_cmp_epi32_mask(a,_mm512_set1_epi32(b),_MM_CMPINT_GT); }
+  __forceinline vboolf16 operator > (int           a, const vint16& b) { return _mm512_cmp_epi32_mask(_mm512_set1_epi32(a),b,_MM_CMPINT_GT); }
+
+  __forceinline vboolf16 operator <=(const vint16& a, const vint16& b) { return _mm512_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 operator <=(const vint16& a, int           b) { return _mm512_cmp_epi32_mask(a,_mm512_set1_epi32(b),_MM_CMPINT_LE); }
+  __forceinline vboolf16 operator <=(int           a, const vint16& b) { return _mm512_cmp_epi32_mask(_mm512_set1_epi32(a),b,_MM_CMPINT_LE); }
+
+  __forceinline vboolf16 eq(const vint16& a, const vint16& b) { return a == b; } // named forms of the operators above
+  __forceinline vboolf16 ne(const vint16& a, const vint16& b) { return a != b; }
+  __forceinline vboolf16 lt(const vint16& a, const vint16& b) { return a <  b; }
+  __forceinline vboolf16 ge(const vint16& a, const vint16& b) { return a >= b; }
+  __forceinline vboolf16 gt(const vint16& a, const vint16& b) { return a >  b; }
+  __forceinline vboolf16 le(const vint16& a, const vint16& b) { return a <= b; }
+  __forceinline vboolf16 uint_le(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_LE); } // lanes compared as unsigned
+  __forceinline vboolf16 uint_gt(const vint16& a, const vint16& b) { return _mm512_cmp_epu32_mask(a,b,_MM_CMPINT_GT); } // lanes compared as unsigned
+
+  __forceinline vboolf16 eq(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_EQ); } // masked forms: result bits are 0 wherever mask is 0
+  __forceinline vboolf16 ne(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf16 lt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf16 ge(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf16 gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf16 le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epi32_mask(mask,a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 uint_le(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_LE); }
+  __forceinline vboolf16 uint_gt(const vboolf16 mask, const vint16& a, const vint16& b) { return _mm512_mask_cmp_epu32_mask(mask,a,b,_MM_CMPINT_GT); }
+    
+ 
+  __forceinline vint16 select(const vboolf16& m, const vint16& t, const vint16& f) {
+    return _mm512_mask_blend_epi32(m,f,t); // lane = t where the mask bit is set, f otherwise
+  }
+
+  __forceinline void xchg(const vboolf16& m, vint16& a, vint16& b) { // swaps a and b in the lanes where m is set
+    const vint16 old_a = a, old_b = b; a = select(m,old_b,old_a); b = select(m,old_a,old_b);
+  }
+
+  __forceinline vboolf16 test(const vboolf16& m, const vint16& a, const vint16& b) {
+    const __m512i lhs = a, rhs = b; return _mm512_mask_test_epi32_mask(m,lhs,rhs); // bit set where (a & b) != 0 and m is set
+  }
+
+  __forceinline vboolf16 test(const vint16& a, const vint16& b) {
+    const __m512i lhs = a, rhs = b; return _mm512_test_epi32_mask(lhs,rhs); // bit set where (a & b) != 0
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 unpacklo(const vint16& a, const vint16& b) { return _mm512_unpacklo_epi32(a, b); } // interleaves the low elements of a and b within each 128-bit block
+  __forceinline vint16 unpackhi(const vint16& a, const vint16& b) { return _mm512_unpackhi_epi32(a, b); } // interleaves the high elements of a and b within each 128-bit block
+
+  template<int i> // every element of a 4-lane group becomes element i of that same group
+    __forceinline vint16 shuffle(const vint16& v) {
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1, int i2, int i3> // element k of each 4-lane group is taken from element ik of that same group
+  __forceinline vint16 shuffle(const vint16& v) {
+    return _mm512_castps_si512(_mm512_permute_ps(_mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i> // every 128-bit block becomes block i of v
+  __forceinline vint16 shuffle4(const vint16& v) {
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1, int i2, int i3> // 128-bit block k of the result is block ik of v
+  __forceinline vint16 shuffle4(const vint16& v) {
+    return _mm512_castps_si512(_mm512_shuffle_f32x4(_mm512_castsi512_ps(v), _mm512_castsi512_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i> // i: number of 32-bit elements to shift by
+  __forceinline vint16 align_shift_right(const vint16& a, const vint16& b) {
+    return _mm512_alignr_epi32(a, b, i); // concatenates a (upper) with b (lower), shifts right by i elements, keeps the low 16
+  }
+
+  __forceinline int toScalar(const vint16& v) {
+    const __m128i low_quarter = _mm512_castsi512_si128(v); return _mm_cvtsi128_si32(low_quarter); // lane 0
+  }
+
+  template<int i> __forceinline vint16 insert4(const vint16& a, const vint4& b) { const __m512i merged = _mm512_inserti32x4(a, b, i); return merged; } // a with 128-bit block i replaced by b
+
+  __forceinline size_t extract64bit(const vint16& v) {
+    const __m128i low_quarter = _mm512_castsi512_si128(v); return _mm_cvtsi128_si64(low_quarter); // lowest 64 bits
+  }
+
+  template<int N, int i> // declaration only: just the <N,i> pairs specialized below are defined in this file
+  vint<N> extractN(const vint16& v);
+
+  template<> __forceinline vint4 extractN<4,0>(const vint16& v) { return _mm512_castsi512_si128(v);       } // block 0 is a plain cast
+  template<> __forceinline vint4 extractN<4,1>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 1); } // 128-bit block 1 (lanes 4-7)
+  template<> __forceinline vint4 extractN<4,2>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 2); } // 128-bit block 2 (lanes 8-11)
+  template<> __forceinline vint4 extractN<4,3>(const vint16& v) { return _mm512_extracti32x4_epi32(v, 3); } // 128-bit block 3 (lanes 12-15)
+
+  template<> __forceinline vint8 extractN<8,0>(const vint16& v) { return _mm512_castsi512_si256(v);       } // lower 256-bit half is a plain cast
+  template<> __forceinline vint8 extractN<8,1>(const vint16& v) { return _mm512_extracti32x8_epi32(v, 1); } // upper 256-bit half (lanes 8-15)
+
+  template<int i> __forceinline vint4 extract4   (const vint16& v) { return _mm512_extracti32x4_epi32(v, i); } // 128-bit block i
+  template<>      __forceinline vint4 extract4<0>(const vint16& v) { return _mm512_castsi512_si128(v);       } // block 0 specialized to a cast
+
+  template<int i> __forceinline vint8 extract8   (const vint16& v) { return _mm512_extracti32x8_epi32(v, i); } // 256-bit half i
+  template<>      __forceinline vint8 extract8<0>(const vint16& v) { return _mm512_castsi512_si256(v);       } // half 0 specialized to a cast
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 vreduce_min2(vint16 x) { const vint16 partner = shuffle<1,0,3,2>(x); return min(x, partner); } // vreduce_*N: each step combines with a shuffled copy, building on the previous step
+  __forceinline vint16 vreduce_min4(vint16 x) { const vint16 r = vreduce_min2(x); return min(r, shuffle<2,3,0,1>(r)); }
+  __forceinline vint16 vreduce_min8(vint16 x) { const vint16 r = vreduce_min4(x); return min(r, shuffle4<1,0,3,2>(r)); }
+  __forceinline vint16 vreduce_min (vint16 x) { const vint16 r = vreduce_min8(x); return min(r, shuffle4<2,3,0,1>(r)); }
+
+  __forceinline vint16 vreduce_max2(vint16 x) { const vint16 partner = shuffle<1,0,3,2>(x); return max(x, partner); }
+  __forceinline vint16 vreduce_max4(vint16 x) { const vint16 r = vreduce_max2(x); return max(r, shuffle<2,3,0,1>(r)); }
+  __forceinline vint16 vreduce_max8(vint16 x) { const vint16 r = vreduce_max4(x); return max(r, shuffle4<1,0,3,2>(r)); }
+  __forceinline vint16 vreduce_max (vint16 x) { const vint16 r = vreduce_max8(x); return max(r, shuffle4<2,3,0,1>(r)); }
+
+  __forceinline vint16 vreduce_and2(vint16 x) { const vint16 partner = shuffle<1,0,3,2>(x); return x & partner; }
+  __forceinline vint16 vreduce_and4(vint16 x) { const vint16 r = vreduce_and2(x); return r & shuffle<2,3,0,1>(r); }
+  __forceinline vint16 vreduce_and8(vint16 x) { const vint16 r = vreduce_and4(x); return r & shuffle4<1,0,3,2>(r); }
+  __forceinline vint16 vreduce_and (vint16 x) { const vint16 r = vreduce_and8(x); return r & shuffle4<2,3,0,1>(r); }
+
+  __forceinline vint16 vreduce_or2(vint16 x) { const vint16 partner = shuffle<1,0,3,2>(x); return x | partner; }
+  __forceinline vint16 vreduce_or4(vint16 x) { const vint16 r = vreduce_or2(x); return r | shuffle<2,3,0,1>(r); }
+  __forceinline vint16 vreduce_or8(vint16 x) { const vint16 r = vreduce_or4(x); return r | shuffle4<1,0,3,2>(r); }
+  __forceinline vint16 vreduce_or (vint16 x) { const vint16 r = vreduce_or8(x); return r | shuffle4<2,3,0,1>(r); }
+
+  __forceinline vint16 vreduce_add2(vint16 x) { const vint16 partner = shuffle<1,0,3,2>(x); return x + partner; }
+  __forceinline vint16 vreduce_add4(vint16 x) { const vint16 r = vreduce_add2(x); return r + shuffle<2,3,0,1>(r); }
+  __forceinline vint16 vreduce_add8(vint16 x) { const vint16 r = vreduce_add4(x); return r + shuffle4<1,0,3,2>(r); }
+  __forceinline vint16 vreduce_add (vint16 x) { const vint16 r = vreduce_add8(x); return r + shuffle4<2,3,0,1>(r); }
+  
+  __forceinline int reduce_min(const vint16& v) { const vint16 r = vreduce_min(v); return toScalar(r); } // scalar forms: lane 0 of the full vector reduction
+  __forceinline int reduce_max(const vint16& v) { const vint16 r = vreduce_max(v); return toScalar(r); }
+  __forceinline int reduce_and(const vint16& v) { const vint16 r = vreduce_and(v); return toScalar(r); }
+  __forceinline int reduce_or (const vint16& v) { const vint16 r = vreduce_or (v); return toScalar(r); }
+  __forceinline int reduce_add(const vint16& v) { const vint16 r = vreduce_add(v); return toScalar(r); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Conflict Detection, Conversion, Permutation and Prefix Sums
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint16 conflict(const vint16& index) {
+    const __m512i duplicates = _mm512_conflict_epi32(index); // per lane: bit set of the lower lanes holding the same value
+    return duplicates;
+  }
+
+  __forceinline vint16 conflict(const vboolf16& mask, vint16& dest, const vint16& index) {
+    const __m512i duplicates = _mm512_mask_conflict_epi32(dest,mask,index); // lanes with a clear mask bit come from dest
+    return duplicates;
+  }    
+
+  __forceinline vint16 convert_uint32_t(const __m512& f) {
+    const __m512i as_unsigned = _mm512_cvtps_epu32(f); return as_unsigned; // float -> unsigned 32-bit value conversion
+  }
+
+  __forceinline vint16 permute(vint16 v, vint16 index) {
+    const __m512i gathered = _mm512_permutexvar_epi32(index,v); return gathered; // result lane k = v[index[k]]
+  }
+
+  __forceinline vint16 reverse(const vint16 &a) {
+    const vint16 descending(reverse_step); return permute(a,descending); // lane k receives lane 15-k
+  }
+
+  __forceinline vint16 prefix_sum(const vint16& a) 
+  {
+    const vint16 zeros(zero);
+    vint16 sum = a;                             // lane k ends up holding a[0] + ... + a[k]
+    sum += align_shift_right<16-1>(sum,zeros);  // add a copy moved up by 1 lane, zero-filled
+    sum += align_shift_right<16-2>(sum,zeros);  // ... by 2 lanes
+    sum += align_shift_right<16-4>(sum,zeros);  // ... by 4 lanes
+    sum += align_shift_right<16-8>(sum,zeros);  // ... by 8 lanes
+    return sum;
+  }
+
+  __forceinline vint16 reverse_prefix_sum(const vint16& a) 
+  {
+    const vint16 zeros(zero);
+    vint16 sum = a;                          // lane k ends up holding a[k] + ... + a[15]
+    sum += align_shift_right<1>(zeros,sum);  // add a copy moved down by 1 lane, zero-filled
+    sum += align_shift_right<2>(zeros,sum);  // ... by 2 lanes
+    sum += align_shift_right<4>(zeros,sum);  // ... by 4 lanes
+    sum += align_shift_right<8>(zeros,sum);  // ... by 8 lanes
+    return sum;
+  }
+
+  /* masked gather-prefetch through 64-bit indices: offset is handed to a 64-bit-index intrinsic; this should use a vbool8 and a vint8_64... */
+  template<int scale = 1, int hint = _MM_HINT_T0> // scale: byte multiplier for each offset; hint: prefetch locality hint
+    __forceinline void gather_prefetch64(const void* base_addr, const vbool16& mask, const vint16& offset)
+  {
+#if defined(__AVX512PF__) // without AVX-512PF the function body is empty
+    _mm512_mask_prefetch_i64gather_pd(offset, mask, base_addr, scale, hint);
+#endif
+  }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint16& v)
+  {
+    const char* separator = "<"; // opening bracket before lane 0, ", " before every later lane
+    for (int lane=0; lane<16; lane++) { cout << separator << v[lane]; separator = ", "; }
+    cout << ">";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vint4_sse2.h b/thirdparty/embree-aarch64/common/simd/vint4_sse2.h
new file mode 100644
index 0000000000..96f105a7c5
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vint4_sse2.h
@@ -0,0 +1,681 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/math.h"
+
+namespace embree
+{
+  /* 4-wide SSE integer type */
+  template<>
+  struct vint<4>
+  {
+    ALIGNED_STRUCT_(16);
+    
+    typedef vboolf4 Bool;
+    typedef vint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 };             // number of SIMD elements
+    union { __m128i v; int i[4]; }; // data
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint() {} // leaves the lanes uninitialized
+    __forceinline vint(const vint4& a) { v = a.v; }
+    __forceinline vint4& operator =(const vint4& a) { v = a.v; return *this; }
+
+    __forceinline vint(__m128i a) : v(a) {} // implicit wrap of a raw SSE register
+    __forceinline operator const __m128i&() const { return v; } // lets a vint4 be passed straight to intrinsics
+    __forceinline operator       __m128i&()       { return v; }
+
+    __forceinline vint(int a) : v(_mm_set1_epi32(a)) {} // broadcast a to all four lanes
+    __forceinline vint(int a, int b, int c, int d) : v(_mm_set_epi32(d, c, b, a)) {} // a lands in lane 0, d in lane 3 (_mm_set_epi32 lists the highest lane first)
+
+    __forceinline explicit vint(__m128 a) : v(_mm_cvtps_epi32(a)) {} // numeric float->int conversion, not a bit cast
+#if defined(__AVX512VL__)
+    __forceinline explicit vint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} // expands each mask bit to an all-ones or all-zeros lane
+#else
+    __forceinline explicit vint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} // reinterprets the float-typed mask bits unchanged
+#endif
+
+    __forceinline vint(long long a, long long b) : v(_mm_set_epi64x(b,a)) {} // a fills the low 64 bits (lanes 0-1), b the high 64 bits (lanes 2-3)
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint(ZeroTy)        : v(_mm_setzero_si128()) {}
+    __forceinline vint(OneTy)         : v(_mm_set_epi32(1, 1, 1, 1)) {}
+    __forceinline vint(PosInfTy)      : v(_mm_set_epi32(pos_inf, pos_inf, pos_inf, pos_inf)) {}
+    __forceinline vint(NegInfTy)      : v(_mm_set_epi32(neg_inf, neg_inf, neg_inf, neg_inf)) {}
+    __forceinline vint(StepTy)        : v(_mm_set_epi32(3, 2, 1, 0)) {} // lanes 0..3 hold 0,1,2,3
+    __forceinline vint(ReverseStepTy) : v(_mm_set_epi32(0, 1, 2, 3)) {} // lanes 0..3 hold 3,2,1,0
+    /* TrueTy sets every bit; built from a constant so the not-yet-initialized member is never read */
+    __forceinline vint(TrueTy)        : v(_mm_set1_epi32(-1)) {}
+    __forceinline vint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {} // contents deliberately unspecified
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } // a must be 16-byte aligned
+    static __forceinline vint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } // no alignment requirement
+
+    static __forceinline void store (void* ptr, const vint4& v) { _mm_store_si128((__m128i*)ptr,v); } // ptr must be 16-byte aligned
+    static __forceinline void storeu(void* ptr, const vint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } // no alignment requirement
+
+#if defined(__AVX512VL__)
+    /* AVX-512VL: native mask registers; compact() packs the mask-selected lanes into the low lanes, the rest keep the first argument's values */
+    static __forceinline vint4 compact(const vboolf4& mask, vint4 &v) {
+      return _mm_mask_compress_epi32(v, mask, v);
+    }
+    static __forceinline vint4 compact(const vboolf4& mask, vint4 &a, const vint4& b) {
+      return _mm_mask_compress_epi32(a, mask, b);
+    }
+
+    static __forceinline vint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); } // masked-off lanes come back as 0
+    static __forceinline vint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_store_epi32 (ptr,mask,v); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); }
+#elif defined(__AVX__)
+    static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); } // load and loadu are the same maskload here
+    static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); } // store and storeu are the same maskstore here
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+#else
+    static __forceinline vint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); } // reads all 16 bytes, then zeroes the masked-off lanes
+    static __forceinline vint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vint4& i) { store (ptr,select(mask,i,load (ptr))); } // read-modify-write: all 16 bytes are loaded and written back
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); }
+#endif
+
+
+#if defined(__aarch64__) // load/loadu(uint8_t*): four bytes become four 32-bit lanes; both names share one body in every branch
+    static __forceinline vint4 load(const uint8_t* ptr) {
+        return _mm_load4epu8_epi32(((__m128i*)ptr)); // non-standard helper, presumably from the project's SSE-on-NEON layer - TODO confirm
+    }
+    static __forceinline vint4 loadu(const uint8_t* ptr) {
+        return  _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vint4 load(const uint8_t* ptr) {
+      return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); // NOTE(review): reads 8 bytes although only the low 4 are zero-extended
+    }
+
+    static __forceinline vint4 loadu(const uint8_t* ptr) {
+      return  _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); // same 8-byte read as load() above
+    }
+#else
+
+    static __forceinline vint4 load(const uint8_t* ptr) {
+      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]); // scalar fallback: reads exactly four bytes
+    }
+
+    static __forceinline vint4 loadu(const uint8_t* ptr) {
+      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+    }
+
+#endif
+
+    static __forceinline vint4 load(const unsigned short* ptr) { // zero-extends ptr[0..3] into four 32-bit lanes
+#if defined(__aarch64__)
+      return __m128i(vmovl_u16(vld1_u16(ptr)));
+#elif defined (__SSE4_1__)
+      return _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)ptr)); // 8-byte load: only the low 64 bits are converted, so nothing past ptr[3] is read
+#else
+      return vint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+#endif
+    }
+
+    static __forceinline void store(uint8_t* ptr, const vint4& v) { // narrows the four lanes to bytes and writes ptr[0..3]
+#if defined(__aarch64__)
+        int32x4_t x = v;
+        uint16x4_t y = vqmovn_u32(uint32x4_t(x)); // saturating narrow 32 -> 16 bit, lanes treated as unsigned
+        uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); // saturating narrow 16 -> 8 bit
+        vst1_lane_u32((uint32_t *)ptr,uint32x2_t(z), 0); // the four result bytes go out as one 32-bit store
+#elif defined(__SSE4_1__)
+      __m128i x = v;
+      x = _mm_packus_epi32(x, x); // signed 32 -> unsigned 16 bit with saturation
+      x = _mm_packus_epi16(x, x); // signed 16 -> unsigned 8 bit with saturation
+      *(int*)ptr = _mm_cvtsi128_si32(x); // the four result bytes go out as one 32-bit store
+#else
+      for (size_t i=0;i<4;i++)
+        ptr[i] = (uint8_t)v[i]; // NOTE(review): plain cast, so out-of-range lanes wrap here while the SIMD branches saturate
+#endif
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vint4& v) { // narrows the four lanes to 16 bit and writes ptr[0..3]
+#if defined(__aarch64__)
+      uint32x4_t x = uint32x4_t(v.v);
+      uint16x4_t y = vqmovn_u32(x); // saturating narrow, lanes treated as unsigned
+      vst1_u16(ptr, y);
+#else
+      for (size_t i=0;i<4;i++)
+        ptr[i] = (unsigned short)v[i]; // NOTE(review): plain cast, so out-of-range lanes wrap here while the aarch64 branch saturates
+#endif
+    }
+
+    static __forceinline vint4 load_nt(void* ptr) { // non-temporal (streaming) load where available
+#if defined(__aarch64__) || defined(__SSE4_1__)
+      return _mm_stream_load_si128((__m128i*)ptr);
+#else
+      return _mm_load_si128((__m128i*)ptr); // fallback: ordinary aligned load
+#endif
+    }
+
+    static __forceinline void store_nt(void* ptr, const vint4& v) { // non-temporal (streaming) store where available
+#if !defined(__aarch64__) && defined(__SSE4_1__)
+      _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); // bits are streamed through the float variant of the intrinsic
+#else
+      _mm_store_si128((__m128i*)ptr,v); // aarch64 and pre-SSE4.1 builds: ordinary aligned store
+#endif
+    }
+
+    template<int scale = 4> // scale is the byte stride applied to every index (default 4 = sizeof(int))
+    static __forceinline vint4 gather(const int* ptr, const vint4& index) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_i32gather_epi32(ptr, index, scale);
+#else
+      return vint4( // scalar fallback: lane k reads the int at byte address ptr + scale*index[k]
+          *(int*)(((int8_t*)ptr)+scale*index[0]),
+          *(int*)(((int8_t*)ptr)+scale*index[1]),
+          *(int*)(((int8_t*)ptr)+scale*index[2]),
+          *(int*)(((int8_t*)ptr)+scale*index[3]));
+#endif
+    }
+
+    template<int scale = 4> // scale is the byte stride applied to every index
+    static __forceinline vint4 gather(const vboolf4& mask, const int* ptr, const vint4& index) {
+      vint4 r = zero; // masked-off lanes of the result stay 0
+#if defined(__AVX512VL__)
+      return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#elif defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_mask_i32gather_epi32(r, ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]); // scalar fallback: only active lanes touch memory
+      if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]);
+      return r;
+#endif
+    }
+
+    template<int scale = 4> // scale is the byte stride applied to every index
+    static __forceinline void scatter(void* ptr, const vint4& index, const vint4& v)
+    {
+#if defined(__AVX512VL__)
+      _mm_i32scatter_epi32((int*)ptr, index, v, scale);
+#else
+      *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; // scalar fallback: lanes are written in order 0..3
+      *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1];
+      *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2];
+      *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+
+    template<int scale = 4> // scale is the byte stride applied to every index
+    static __forceinline void scatter(const vboolf4& mask, void* ptr, const vint4& index, const vint4& v)
+    {
+#if defined(__AVX512VL__)
+      _mm_mask_i32scatter_epi32((int*)ptr, mask, index, v, scale);
+#else
+      if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*index[0]) = v[0]; // scalar fallback: only active lanes are written
+      if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*index[1]) = v[1];
+      if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*index[2]) = v[2];
+      if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*index[3]) = v[3];
+#endif
+    }
+
+#if defined(__x86_64__) || defined(__aarch64__) // only provided on 64-bit targets
+    static __forceinline vint4 broadcast64(long long a) { return _mm_set1_epi64x(a); } // replicates the 64-bit value into both halves of the register
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator [](size_t index) const { assert(index < 4); return i[index]; } // lane access through the int[4] view of the union
+    __forceinline       int& operator [](size_t index)       { assert(index < 4); return i[index]; } // the bound is checked by assert only
+
+    friend __forceinline vint4 select(const vboolf4& m, const vint4& t, const vint4& f) { // per lane: t where m is set, f elsewhere
+#if defined(__AVX512VL__)
+      return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
+#elif defined(__aarch64__)
+      return _mm_castps_si128(_mm_blendv_ps((__m128)f.v,(__m128) t.v, (__m128)m.v));
+#elif defined(__SSE4_1__)
+      return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); // blendv decides on the sign bit of each mask lane
+#else
+      return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); // bitwise (m & t) | (~m & f)
+#endif
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 asBool(const vint4& a) { return _mm_movepi32_mask(a); } // one mask bit per lane, taken from the lane's top bit
+#else
+  __forceinline vboolf4 asBool(const vint4& a) { return _mm_castsi128_ps(a); } // pure reinterpretation of the register bits
+#endif
+
+  __forceinline vint4 operator +(const vint4& a) { return a; }
+  __forceinline vint4 operator -(const vint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } // computed as 0 - a
+#if defined(__aarch64__)
+  __forceinline vint4 abs(const vint4& a) { return vabsq_s32(a.v); }
+#elif defined(__SSSE3__)
+  __forceinline vint4 abs(const vint4& a) { return _mm_abs_epi32(a); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint4 operator +(const vint4& a, const vint4& b) { return _mm_add_epi32(a, b); } // lane-wise; the int overloads below broadcast the scalar first
+  __forceinline vint4 operator +(const vint4& a, int          b) { return a + vint4(b); }
+  __forceinline vint4 operator +(int          a, const vint4& b) { return vint4(a) + b; }
+
+  __forceinline vint4 operator -(const vint4& a, const vint4& b) { return _mm_sub_epi32(a, b); }
+  __forceinline vint4 operator -(const vint4& a, int          b) { return a - vint4(b); }
+  __forceinline vint4 operator -(int          a, const vint4& b) { return vint4(a) - b; }
+
+#if (defined(__aarch64__)) || defined(__SSE4_1__)
+  __forceinline vint4 operator *(const vint4& a, const vint4& b) { return _mm_mullo_epi32(a, b); } // keeps the low 32 bits of each product
+#else
+  __forceinline vint4 operator *(const vint4& a, const vint4& b) { return vint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); } // scalar fallback for plain SSE2
+#endif
+  __forceinline vint4 operator *(const vint4& a, int          b) { return a * vint4(b); }
+  __forceinline vint4 operator *(int          a, const vint4& b) { return vint4(a) * b; }
+
+  __forceinline vint4 operator &(const vint4& a, const vint4& b) { return _mm_and_si128(a, b); }
+  __forceinline vint4 operator &(const vint4& a, int          b) { return a & vint4(b); }
+  __forceinline vint4 operator &(int          a, const vint4& b) { return vint4(a) & b; }
+
+  __forceinline vint4 operator |(const vint4& a, const vint4& b) { return _mm_or_si128(a, b); }
+  __forceinline vint4 operator |(const vint4& a, int          b) { return a | vint4(b); }
+  __forceinline vint4 operator |(int          a, const vint4& b) { return vint4(a) | b; }
+
+  __forceinline vint4 operator ^(const vint4& a, const vint4& b) { return _mm_xor_si128(a, b); }
+  __forceinline vint4 operator ^(const vint4& a, int          b) { return a ^ vint4(b); }
+  __forceinline vint4 operator ^(int          a, const vint4& b) { return vint4(a) ^ b; }
+
+  __forceinline vint4 operator <<(const vint4& a, const int n) { return _mm_slli_epi32(a, n); }
+  __forceinline vint4 operator >>(const vint4& a, const int n) { return _mm_srai_epi32(a, n); } // arithmetic shift: the sign bit is replicated
+
+  __forceinline vint4 sll (const vint4& a, int b) { return _mm_slli_epi32(a, b); } // shift left, zero fill
+  __forceinline vint4 sra (const vint4& a, int b) { return _mm_srai_epi32(a, b); } // shift right, sign fill
+  __forceinline vint4 srl (const vint4& a, int b) { return _mm_srli_epi32(a, b); } // shift right, zero fill
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint4& operator +=(vint4& a, const vint4& b) { return a = a + b; } // compound forms simply reuse the binary operators
+  __forceinline vint4& operator +=(vint4& a, int          b) { return a = a + b; }
+
+  __forceinline vint4& operator -=(vint4& a, const vint4& b) { return a = a - b; }
+  __forceinline vint4& operator -=(vint4& a, int          b) { return a = a - b; }
+
+  /* operator * is defined for every target (scalar fallback on plain SSE2), so the compound form needs no ISA guard */
+  __forceinline vint4& operator *=(vint4& a, const vint4& b) { return a = a * b; }
+  __forceinline vint4& operator *=(vint4& a, int          b) { return a = a * b; }
+
+
+  __forceinline vint4& operator &=(vint4& a, const vint4& b) { return a = a & b; } // compound forms simply reuse the binary operators
+  __forceinline vint4& operator &=(vint4& a, int          b) { return a = a & b; }
+
+  __forceinline vint4& operator |=(vint4& a, const vint4& b) { return a = a | b; }
+  __forceinline vint4& operator |=(vint4& a, int          b) { return a = a | b; }
+
+  __forceinline vint4& operator <<=(vint4& a, int b) { return a = a << b; }
+  __forceinline vint4& operator >>=(vint4& a, int b) { return a = a >> b; } // arithmetic shift, like operator >>
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__) // signed lane-wise comparisons that produce a mask register directly
+  __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return _mm_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+#else // SSE only has ==, < and > for signed 32-bit lanes; the other three are their negations
+  __forceinline vboolf4 operator ==(const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); }
+  __forceinline vboolf4 operator !=(const vint4& a, const vint4& b) { return !(a == b); }
+  __forceinline vboolf4 operator < (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmplt_epi32(a, b)); }
+  __forceinline vboolf4 operator >=(const vint4& a, const vint4& b) { return !(a <  b); }
+  __forceinline vboolf4 operator > (const vint4& a, const vint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epi32(a, b)); }
+  __forceinline vboolf4 operator <=(const vint4& a, const vint4& b) { return !(a >  b); }
+#endif
+
+  __forceinline vboolf4 operator ==(const vint4& a, int          b) { return a == vint4(b); } // scalar operands are broadcast, then the vector comparison is used
+  __forceinline vboolf4 operator ==(int          a, const vint4& b) { return vint4(a) == b; }
+
+  __forceinline vboolf4 operator !=(const vint4& a, int          b) { return a != vint4(b); }
+  __forceinline vboolf4 operator !=(int          a, const vint4& b) { return vint4(a) != b; }
+
+  __forceinline vboolf4 operator < (const vint4& a, int          b) { return a <  vint4(b); }
+  __forceinline vboolf4 operator < (int          a, const vint4& b) { return vint4(a) <  b; }
+
+  __forceinline vboolf4 operator >=(const vint4& a, int          b) { return a >= vint4(b); }
+  __forceinline vboolf4 operator >=(int          a, const vint4& b) { return vint4(a) >= b; }
+
+  __forceinline vboolf4 operator > (const vint4& a, int          b) { return a >  vint4(b); }
+  __forceinline vboolf4 operator > (int          a, const vint4& b) { return vint4(a) >  b; }
+
+  __forceinline vboolf4 operator <=(const vint4& a, int          b) { return a <= vint4(b); }
+  __forceinline vboolf4 operator <=(int          a, const vint4& b) { return vint4(a) <= b; }
+
+  __forceinline vboolf4 eq(const vint4& a, const vint4& b) { return a == b; } // named spellings of the six comparison operators
+  __forceinline vboolf4 ne(const vint4& a, const vint4& b) { return a != b; }
+  __forceinline vboolf4 lt(const vint4& a, const vint4& b) { return a <  b; }
+  __forceinline vboolf4 ge(const vint4& a, const vint4& b) { return a >= b; }
+  __forceinline vboolf4 gt(const vint4& a, const vint4& b) { return a >  b; }
+  __forceinline vboolf4 le(const vint4& a, const vint4& b) { return a <= b; }
+
+#if defined(__AVX512VL__) // masked comparisons: a result bit can only be set where mask is set
+  __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return _mm_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else // without mask registers: full comparison followed by an AND with mask
+  __forceinline vboolf4 eq(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a == b); }
+  __forceinline vboolf4 ne(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a != b); }
+  __forceinline vboolf4 lt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <  b); }
+  __forceinline vboolf4 ge(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >= b); }
+  __forceinline vboolf4 gt(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a >  b); }
+  __forceinline vboolf4 le(const vboolf4& mask, const vint4& a, const vint4& b) { return mask & (a <= b); }
+#endif
+
+  template<int mask> // compile-time lane mask: bit k set means lane k comes from t, otherwise from f
+  __forceinline vint4 select(const vint4& t, const vint4& f) {
+#if defined(__SSE4_1__)
+    return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask));
+#else
+    return select(vboolf4(mask), t, f); // falls back to the run-time select with a mask built from the constant
+#endif
+  }
+
+      
+#if defined(__aarch64__) || defined(__SSE4_1__)
+  __forceinline vint4 min(const vint4& a, const vint4& b) { return _mm_min_epi32(a, b); } // signed lane-wise minimum
+  __forceinline vint4 max(const vint4& a, const vint4& b) { return _mm_max_epi32(a, b); } // signed lane-wise maximum
+
+  __forceinline vint4 umin(const vint4& a, const vint4& b) { return _mm_min_epu32(a, b); } // lanes compared as unsigned 32-bit values
+  __forceinline vint4 umax(const vint4& a, const vint4& b) { return _mm_max_epu32(a, b); }
+
+#else // plain SSE2: signed min/max via compare + select; umin/umax are not provided in this branch
+  __forceinline vint4 min(const vint4& a, const vint4& b) { return select(a < b,a,b); }
+  __forceinline vint4 max(const vint4& a, const vint4& b) { return select(a < b,b,a); }
+#endif
+
+  __forceinline vint4 min(const vint4& a, int          b) { return min(a,vint4(b)); } // scalar operand is broadcast first
+  __forceinline vint4 min(int          a, const vint4& b) { return min(vint4(a),b); }
+  __forceinline vint4 max(const vint4& a, int          b) { return max(a,vint4(b)); }
+  __forceinline vint4 max(int          a, const vint4& b) { return max(vint4(a),b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint4 unpacklo(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } // interleaves the low halves: a0,b0,a1,b1
+  __forceinline vint4 unpackhi(const vint4& a, const vint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } // interleaves the high halves: a2,b2,a3,b3
+
+#if defined(__aarch64__) // NEON byte-table lookups; _MN_SHUFFLE/_MF_SHUFFLE are defined outside this file
+    template<int i0, int i1, int i2, int i3>
+    __forceinline vint4 shuffle(const vint4& v) {
+        return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+    }
+    template<int i0, int i1, int i2, int i3>
+    __forceinline vint4 shuffle(const vint4& a, const vint4& b) {
+        return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+    }
+#else
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint4 shuffle(const vint4& v) { // result lane k is v[ik]
+    return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0)); // _MM_SHUFFLE lists the highest lane first, hence the reversed order
+  }
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vint4 shuffle(const vint4& a, const vint4& b) { // result is a[i0], a[i1], b[i2], b[i3]
+    return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+#endif
+#if defined(__SSE3__) // index patterns that map to a dedicated SSE3 duplicate instruction
+  template<> __forceinline vint4 shuffle<0, 0, 2, 2>(const vint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vint4 shuffle<1, 1, 3, 3>(const vint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); }
+  template<> __forceinline vint4 shuffle<0, 1, 0, 1>(const vint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); }
+#endif
+
+  template<int i> // single-index form: broadcasts lane i to all four lanes
+  __forceinline vint4 shuffle(const vint4& v) {
+    return shuffle<i,i,i,i>(v);
+  }
+
+#if defined(__aarch64__) // declarations only: the per-lane specializations follow further down
+    template<int src> __forceinline int extract(const vint4& b);
+    template<int dst> __forceinline vint4 insert(const vint4& a, const int b);
+#elif defined(__SSE4_1__)
+  template<int src> __forceinline int extract(const vint4& b) { return _mm_extract_epi32(b, src); } // read lane src
+  template<int dst> __forceinline vint4 insert(const vint4& a, const int b) { return _mm_insert_epi32(a, b, dst); } // copy of a with lane dst replaced by b
+#else
+  template<int src> __forceinline int extract(const vint4& b) { return b[src&3]; } // index is wrapped into 0..3
+  template<int dst> __forceinline vint4 insert(const vint4& a, int b) { vint4 c = a; c[dst&3] = b; return c; }
+#endif
+
+#if defined(__aarch64__) // explicit specializations for lanes 0..3 of the templates declared above
+    template<> __forceinline int extract<0>(const vint4& b) {
+        return b.v[0];
+    }
+    template<> __forceinline int extract<1>(const vint4& b) {
+        return b.v[1];
+    }
+    template<> __forceinline int extract<2>(const vint4& b) {
+        return b.v[2];
+    }
+    template<> __forceinline int extract<3>(const vint4& b) {
+        return b.v[3];
+    }
+    template<> __forceinline vint4 insert<0>(const vint4& a, int b)
+    {
+        vint4 c = a; // work on a copy; the argument is left untouched
+        c[0] = b;
+        return c;
+    }
+    template<> __forceinline vint4 insert<1>(const vint4& a, int b)
+    {
+        vint4 c = a;
+        c[1] = b;
+        return c;
+    }
+    template<> __forceinline vint4 insert<2>(const vint4& a, int b)
+    {
+        vint4 c = a;
+        c[2] = b;
+        return c;
+    }
+    template<> __forceinline vint4 insert<3>(const vint4& a, int b)
+    {
+        vint4 c = a;
+        c[3] = b;
+        return c;
+    }
+      
+    __forceinline int toScalar(const vint4& v) { // value of lane 0
+        return v[0];
+    }
+      
+    __forceinline size_t toSizeT(const vint4& v) { // low 64 bits of the register, i.e. lanes 0 and 1 together
+        uint64x2_t x = uint64x2_t(v.v);
+        return x[0];
+    }
+#else
+  template<> __forceinline int extract<0>(const vint4& b) { return _mm_cvtsi128_si32(b); } // lane 0 has a dedicated move instruction
+
+  __forceinline int toScalar(const vint4& v) { return _mm_cvtsi128_si32(v); } // value of lane 0
+
+  __forceinline size_t toSizeT(const vint4& v) { // low bits of the register as a size_t
+#if defined(__WIN32__) && !defined(__X86_64__) // win32 workaround
+    return toScalar(v); // 32-bit Windows: only lane 0 is used
+#elif defined(__ARM_NEON)
+    // FIXME(LTE): Do we need a swap(i.e. use lane 1)?
+    return vgetq_lane_u64(*(reinterpret_cast<const uint64x2_t *>(&v)), 0);
+#else
+    return _mm_cvtsi128_si64(v); // low 64 bits, i.e. lanes 0 and 1 together
+#endif
+  }
+#endif
+      
+#if defined(__AVX512VL__)
+  /* run-time lane permutation: result lane k is the lane of a selected by index[k] */
+  __forceinline vint4 permute(const vint4 &a, const vint4 &index) {
+    return  _mm_castps_si128(_mm_permutevar_ps(_mm_castsi128_ps(a),index));
+  }
+
+  template<int i> // treats a (high) and b (low) as one 8-lane value, shifts it right by i lanes and returns the low 4 lanes
+  __forceinline vint4 align_shift_right(const vint4& a, const vint4& b) {
+    return _mm_alignr_epi32(a, b, i);
+  }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__aarch64__) || defined(__SSE4_1__)
+      
+#if defined(__aarch64__) // NEON has across-vector reductions; the vreduce_* forms broadcast the scalar result
+    __forceinline vint4 vreduce_min(const vint4& v) { int h = vminvq_s32(v); return vdupq_n_s32(h); }
+    __forceinline vint4 vreduce_max(const vint4& v) { int h = vmaxvq_s32(v); return vdupq_n_s32(h); }
+    __forceinline vint4 vreduce_add(const vint4& v) { int h = vaddvq_s32(v); return vdupq_n_s32(h); }
+      
+    __forceinline int reduce_min(const vint4& v) { return vminvq_s32(v); }
+    __forceinline int reduce_max(const vint4& v) { return vmaxvq_s32(v); }
+    __forceinline int reduce_add(const vint4& v) { return vaddvq_s32(v); }
+#else // SSE4.1: two shuffle-and-combine steps (neighbours, then halves) leave the result in every lane
+  __forceinline vint4 vreduce_min(const vint4& v) { vint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
+  __forceinline vint4 vreduce_max(const vint4& v) { vint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+  __forceinline vint4 vreduce_add(const vint4& v) { vint4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
+
+  __forceinline int reduce_min(const vint4& v) { return toScalar(vreduce_min(v)); }
+  __forceinline int reduce_max(const vint4& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_add(const vint4& v) { return toScalar(vreduce_add(v)); }
+#endif
+      
+  __forceinline size_t select_min(const vint4& v) { return bsf(movemask(v == vreduce_min(v))); } // bsf/movemask are defined elsewhere; presumably the index of the first lane holding the minimum
+  __forceinline size_t select_max(const vint4& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  __forceinline size_t select_min(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } // inactive lanes are replaced by pos_inf before reducing
+  __forceinline size_t select_max(const vboolf4& valid, const vint4& v) { const vint4 a = select(valid,v,vint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } // inactive lanes are replaced by neg_inf before reducing
+
+#else // plain SSE2: scalar reductions only; vreduce_* and select_min/select_max are not provided in this branch
+
+  __forceinline int reduce_min(const vint4& v) { return min(v[0],v[1],v[2],v[3]); }
+  __forceinline int reduce_max(const vint4& v) { return max(v[0],v[1],v[2],v[3]); }
+  __forceinline int reduce_add(const vint4& v) { return v[0]+v[1]+v[2]+v[3]; }
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting networks
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if (defined(__aarch64__)) || defined(__SSE4_1__)
+
+  __forceinline vint4 usort_ascending(const vint4& v)
+  {
+    /* stage 1: compare-exchange inside the pairs (0,1) and (2,3) */
+    const vint4 pairSwap = shuffle<1,0,3,2>(v);
+    const vint4 pairLo   = umin(v,pairSwap);
+    const vint4 pairHi   = umax(v,pairSwap);
+    const vint4 stage1   = select<0x5 /* 0b0101 */>(pairLo,pairHi);
+    /* stage 2: compare-exchange across the halves, lanes (0,2) and (1,3) */
+    const vint4 halfSwap = shuffle<2,3,0,1>(stage1);
+    const vint4 halfLo   = umin(stage1,halfSwap);
+    const vint4 halfHi   = umax(stage1,halfSwap);
+    const vint4 stage2   = select<0x3 /* 0b0011 */>(halfLo,halfHi);
+    /* stage 3: compare-exchange the two middle lanes (1,2) */
+    const vint4 midSwap  = shuffle<0,2,1,3>(stage2);
+    const vint4 sorted   = select<0x2 /* 0b0010 */>(umin(stage2,midSwap),umax(stage2,midSwap));
+    return sorted;
+  }
+
+  __forceinline vint4 usort_descending(const vint4& v)
+  {
+    /* stage 1: compare-exchange inside the pairs (0,1) and (2,3), larger value first */
+    const vint4 pairSwap = shuffle<1,0,3,2>(v);
+    const vint4 pairHi   = umax(v,pairSwap);
+    const vint4 pairLo   = umin(v,pairSwap);
+    const vint4 stage1   = select<0x5 /* 0b0101 */>(pairHi,pairLo);
+    /* stage 2: compare-exchange across the halves, lanes (0,2) and (1,3) */
+    const vint4 halfSwap = shuffle<2,3,0,1>(stage1);
+    const vint4 halfHi   = umax(stage1,halfSwap);
+    const vint4 halfLo   = umin(stage1,halfSwap);
+    const vint4 stage2   = select<0x3 /* 0b0011 */>(halfHi,halfLo);
+    /* stage 3: compare-exchange the two middle lanes (1,2) */
+    const vint4 midSwap  = shuffle<0,2,1,3>(stage2);
+    const vint4 sorted   = select<0x2 /* 0b0010 */>(umax(stage2,midSwap),umin(stage2,midSwap));
+    return sorted;
+  }
+
+#else
+
+  __forceinline vint4 usort_ascending(const vint4& v)
+  {
+    const vint4 biased   = v-vint4(0x80000000);  // flips the sign bit so the signed min/max below order the lanes as unsigned
+    const vint4 pairSwap = shuffle<1,0,3,2>(biased);
+    const vint4 pairLo   = min(biased,pairSwap);
+    const vint4 pairHi   = max(biased,pairSwap);
+    const vint4 stage1   = select<0x5 /* 0b0101 */>(pairLo,pairHi);  // pairs (0,1) and (2,3) ordered
+    const vint4 halfSwap = shuffle<2,3,0,1>(stage1);
+    const vint4 halfLo   = min(stage1,halfSwap);
+    const vint4 halfHi   = max(stage1,halfSwap);
+    const vint4 stage2   = select<0x3 /* 0b0011 */>(halfLo,halfHi);  // lanes (0,2) and (1,3) ordered
+    const vint4 midSwap  = shuffle<0,2,1,3>(stage2);
+    const vint4 midLo    = min(stage2,midSwap);
+    const vint4 midHi    = max(stage2,midSwap);
+    const vint4 sorted   = select<0x2 /* 0b0010 */>(midLo,midHi);    // middle lanes (1,2) ordered
+    return sorted+vint4(0x80000000);  // undo the bias
+  }
+
+  __forceinline vint4 usort_descending(const vint4& v)
+  {
+    const vint4 biased   = v-vint4(0x80000000);  // flips the sign bit so the signed min/max below order the lanes as unsigned
+    const vint4 pairSwap = shuffle<1,0,3,2>(biased);
+    const vint4 pairHi   = max(biased,pairSwap);
+    const vint4 pairLo   = min(biased,pairSwap);
+    const vint4 stage1   = select<0x5 /* 0b0101 */>(pairHi,pairLo);  // pairs (0,1) and (2,3) ordered, larger first
+    const vint4 halfSwap = shuffle<2,3,0,1>(stage1);
+    const vint4 halfHi   = max(stage1,halfSwap);
+    const vint4 halfLo   = min(stage1,halfSwap);
+    const vint4 stage2   = select<0x3 /* 0b0011 */>(halfHi,halfLo);  // lanes (0,2) and (1,3) ordered
+    const vint4 midSwap  = shuffle<0,2,1,3>(stage2);
+    const vint4 midHi    = max(stage2,midSwap);
+    const vint4 midLo    = min(stage2,midSwap);
+    const vint4 sorted   = select<0x2 /* 0b0010 */>(midHi,midLo);    // middle lanes (1,2) ordered
+    return sorted+vint4(0x80000000);  // undo the bias
+  }
+
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint4& a) { // prints the four lanes as "<a0, a1, a2, a3>"
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ">";
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/common/simd/vint8_avx.h b/thirdparty/embree-aarch64/common/simd/vint8_avx.h
new file mode 100644
index 0000000000..25a771284d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vint8_avx.h
@@ -0,0 +1,464 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX integer type */
+  template<> // specialization for 8 lanes: eight int values overlaid on one 256-bit register
+  struct vint<8>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };        // number of SIMD elements
+    union {                    // data
+      __m256i v;                   // all eight lanes as one 256-bit value
+      struct { __m128i vl,vh; };   // the same storage viewed as low (vl) and high (vh) 128-bit halves
+      int i[8];                    // the same storage viewed as eight scalar lanes
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint() {} // leaves the lanes uninitialized
+    __forceinline vint(const vint8& a) { v = a.v; }
+    __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; }
+
+    __forceinline vint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} // a is copied into both 128-bit halves
+    __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} // a -> low half, b -> high half
+    __forceinline vint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {} // a -> low half, b -> high half
+ 
+    __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} // unaligned load of 8 ints
+    __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} // broadcast a to all lanes
+    __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} // lanes 0..7 = a,b,a,b,a,b,a,b
+    __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} // lanes 0..7 = a,b,c,d,a,b,c,d
+    __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} // lanes 0..7 = a..h; the last parameter is h so it does not shadow the vh member
+
+    __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} // float -> int value conversion, not a bit cast
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint(ZeroTy)        : v(_mm256_setzero_si256()) {}
+    __forceinline vint(OneTy)         : v(_mm256_set_epi32(1,1,1,1,1,1,1,1)) {}
+    __forceinline vint(PosInfTy)      : v(_mm256_set_epi32(pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf,pos_inf)) {}
+    __forceinline vint(NegInfTy)      : v(_mm256_set_epi32(neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf,neg_inf)) {}
+    __forceinline vint(StepTy)        : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} // lane k holds k
+    __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} // lane k holds 7-k
+    __forceinline vint(UndefinedTy)   : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); } // aligned load through the float domain; the cast only reinterprets bits
+    static __forceinline vint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); } // unaligned variant
+
+    static __forceinline vint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } // masked load
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); } // same intrinsic as the masked load above
+
+    static __forceinline void store (void* ptr, const vint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(void* ptr, const vint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+    
+#if !defined(__aarch64__)
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+#else
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); } // aarch64 build casts mask.v instead of the mask object
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); }
+#endif
+
+    static __forceinline void store_nt(void* ptr, const vint8& v) { // store issued through the streaming intrinsic _mm256_stream_ps
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline vint8 load(const uint8_t* ptr) { // delegates to the vint4 uint8_t load for elements [0,4) and [4,8)
+      vint4 il = vint4::load(ptr+0);
+      vint4 ih = vint4::load(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline vint8 loadu(const uint8_t* ptr) { // unaligned counterpart of the uint8_t load
+      vint4 il = vint4::loadu(ptr+0);
+      vint4 ih = vint4::loadu(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline vint8 load(const unsigned short* ptr) { // delegates to the vint4 unsigned short load for elements [0,4) and [4,8)
+      vint4 il = vint4::load(ptr+0);
+      vint4 ih = vint4::load(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline vint8 loadu(const unsigned short* ptr) { // unaligned counterpart of the unsigned short load
+      vint4 il = vint4::loadu(ptr+0);
+      vint4 ih = vint4::loadu(ptr+4);
+      return vint8(il,ih);
+    }
+
+    static __forceinline void store(uint8_t* ptr, const vint8& i) { // delegates to the vint4 uint8_t store, low half to ptr+0 and high half to ptr+4
+      vint4 il(i.vl);
+      vint4 ih(i.vh);
+      vint4::store(ptr + 0,il);
+      vint4::store(ptr + 4,ih);
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vint8& v) { // scalar loop; each lane is narrowed with a plain cast
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    template<int scale = 4> // scale is a byte stride: lane k reads the int at byte offset scale*index[k] from ptr
+    static __forceinline vint8 gather(const int* ptr, const vint8& index) {
+      return vint8(
+          *(int*)(((int8_t*)ptr)+scale*index[0]),
+          *(int*)(((int8_t*)ptr)+scale*index[1]),
+          *(int*)(((int8_t*)ptr)+scale*index[2]),
+          *(int*)(((int8_t*)ptr)+scale*index[3]),
+          *(int*)(((int8_t*)ptr)+scale*index[4]),
+          *(int*)(((int8_t*)ptr)+scale*index[5]),
+          *(int*)(((int8_t*)ptr)+scale*index[6]),
+          *(int*)(((int8_t*)ptr)+scale*index[7]));
+    }
+
+    template<int scale = 4> // masked gather: only lanes whose mask entry is set touch memory
+    static __forceinline vint8 gather(const vboolf8& mask, const int* ptr, const vint8& index) {
+      vint8 r = zero; // lanes that are masked off keep this 0
+      if (likely(mask[0])) r[0] = *(int*)(((int8_t*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(int*)(((int8_t*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(int*)(((int8_t*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(int*)(((int8_t*)ptr)+scale*index[3]);
+      if (likely(mask[4])) r[4] = *(int*)(((int8_t*)ptr)+scale*index[4]);
+      if (likely(mask[5])) r[5] = *(int*)(((int8_t*)ptr)+scale*index[5]);
+      if (likely(mask[6])) r[6] = *(int*)(((int8_t*)ptr)+scale*index[6]);
+      if (likely(mask[7])) r[7] = *(int*)(((int8_t*)ptr)+scale*index[7]);
+      return r;
+    }
+
+    template<int scale = 4> // scale is a byte stride: lane k is written at byte offset scale*ofs[k] from ptr
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v)
+    {
+      *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+      *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+    template<int scale = 4> // masked scatter: lanes whose mask entry is clear write nothing
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v)
+    {
+      if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+
+    static __forceinline vint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); } // replicate one 64-bit value across the register
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } // lane access via the int i[8] view; bounds are checked by assert only
+    __forceinline       int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } // reinterprets the bits as a mask; no value conversion
+
+  __forceinline vint8 operator +(const vint8& a) { return a; }
+  __forceinline vint8 operator -(const vint8& a) { return vint8(_mm_sub_epi32(_mm_setzero_si128(), a.vl), _mm_sub_epi32(_mm_setzero_si128(), a.vh)); } // 0 - a, computed separately on the low and high 128-bit halves
+  __forceinline vint8 abs       (const vint8& a) { return vint8(_mm_abs_epi32(a.vl), _mm_abs_epi32(a.vh)); } // 128-bit abs applied to each half
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 operator +(const vint8& a, const vint8& b) { return vint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); } // 128-bit add on the low and high halves separately
+  __forceinline vint8 operator +(const vint8& a, int          b) { return a + vint8(b); } // the scalar is broadcast to all lanes first
+  __forceinline vint8 operator +(int          a, const vint8& b) { return vint8(a) + b; }
+
+  __forceinline vint8 operator -(const vint8& a, const vint8& b) { return vint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); } // 128-bit subtract per half
+  __forceinline vint8 operator -(const vint8& a, int          b) { return a - vint8(b); }
+  __forceinline vint8 operator -(int          a, const vint8& b) { return vint8(a) - b; }
+
+  __forceinline vint8 operator *(const vint8& a, const vint8& b) { return vint8(_mm_mullo_epi32(a.vl, b.vl), _mm_mullo_epi32(a.vh, b.vh)); } // keeps the low 32 bits of each lane product
+  __forceinline vint8 operator *(const vint8& a, int          b) { return a * vint8(b); }
+  __forceinline vint8 operator *(int          a, const vint8& b) { return vint8(a) * b; }
+
+  __forceinline vint8 operator &(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } // bitwise AND done with the 256-bit float instruction; the casts only reinterpret bits
+  __forceinline vint8 operator &(const vint8& a, int          b) { return a & vint8(b); } // the scalar is broadcast to all lanes first
+  __forceinline vint8 operator &(int          a, const vint8& b) { return vint8(a) & b; }
+
+  __forceinline vint8 operator |(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } // bitwise OR through the float domain
+  __forceinline vint8 operator |(const vint8& a, int          b) { return a | vint8(b); }
+  __forceinline vint8 operator |(int          a, const vint8& b) { return vint8(a) | b; }
+
+  __forceinline vint8 operator ^(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } // bitwise XOR through the float domain
+  __forceinline vint8 operator ^(const vint8& a, int          b) { return a ^ vint8(b); }
+  __forceinline vint8 operator ^(int          a, const vint8& b) { return vint8(a) ^ b; }
+
+  __forceinline vint8 operator <<(const vint8& a, int n) { return vint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); } // left shift of every lane by n, per 128-bit half
+  __forceinline vint8 operator >>(const vint8& a, int n) { return vint8(_mm_srai_epi32(a.vl, n), _mm_srai_epi32(a.vh, n)); } // arithmetic right shift (sign bit is replicated); same intrinsic as sra
+
+  __forceinline vint8 sll (const vint8& a, int b) { return vint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); } // shift left logical
+  __forceinline vint8 sra (const vint8& a, int b) { return vint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); } // shift right arithmetic
+  __forceinline vint8 srl (const vint8& a, int b) { return vint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); } // shift right logical (zero fill)
+  
+  __forceinline vint8 min(const vint8& a, const vint8& b) { return vint8(_mm_min_epi32(a.vl, b.vl), _mm_min_epi32(a.vh, b.vh)); } // signed lane-wise minimum, per 128-bit half
+  __forceinline vint8 min(const vint8& a, int          b) { return min(a,vint8(b)); } // the scalar is broadcast to all lanes first
+  __forceinline vint8 min(int          a, const vint8& b) { return min(vint8(a),b); }
+
+  __forceinline vint8 max(const vint8& a, const vint8& b) { return vint8(_mm_max_epi32(a.vl, b.vl), _mm_max_epi32(a.vh, b.vh)); } // signed lane-wise maximum, per 128-bit half
+  __forceinline vint8 max(const vint8& a, int          b) { return max(a,vint8(b)); }
+  __forceinline vint8 max(int          a, const vint8& b) { return max(vint8(a),b); }
+
+  __forceinline vint8 umin(const vint8& a, const vint8& b) { return vint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); } // unsigned lane-wise minimum (lanes compared as 32-bit unsigned)
+  __forceinline vint8 umin(const vint8& a, int          b) { return umin(a,vint8(b)); }
+  __forceinline vint8 umin(int          a, const vint8& b) { return umin(vint8(a),b); }
+
+  __forceinline vint8 umax(const vint8& a, const vint8& b) { return vint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); } // unsigned lane-wise maximum (lanes compared as 32-bit unsigned)
+  __forceinline vint8 umax(const vint8& a, int          b) { return umax(a,vint8(b)); }
+  __forceinline vint8 umax(int          a, const vint8& b) { return umax(vint8(a),b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8& operator +=(vint8& a, const vint8& b) { return a = a + b; } // every compound form here is a = a op b built on the binary operator
+  __forceinline vint8& operator +=(vint8& a, int          b) { return a = a + b; }
+  
+  __forceinline vint8& operator -=(vint8& a, const vint8& b) { return a = a - b; }
+  __forceinline vint8& operator -=(vint8& a, int          b) { return a = a - b; }
+  
+  __forceinline vint8& operator *=(vint8& a, const vint8& b) { return a = a * b; }
+  __forceinline vint8& operator *=(vint8& a, int          b) { return a = a * b; }
+  
+  __forceinline vint8& operator &=(vint8& a, const vint8& b) { return a = a & b; }
+  __forceinline vint8& operator &=(vint8& a, int          b) { return a = a & b; }
+  
+  __forceinline vint8& operator |=(vint8& a, const vint8& b) { return a = a | b; }
+  __forceinline vint8& operator |=(vint8& a, int          b) { return a = a | b; }
+  
+  __forceinline vint8& operator <<=(vint8& a, int b) { return a = a << b; }
+  __forceinline vint8& operator >>=(vint8& a, int b) { return a = a >> b; } // arithmetic shift, because operator >> uses _mm_srai_epi32
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)),
+                                                                                     _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); } // compare each 128-bit half, then join the two half masks
+  __forceinline vboolf8 operator ==(const vint8& a, int          b) { return a == vint8(b); } // the scalar is broadcast to all lanes first
+  __forceinline vboolf8 operator ==(int          a, const vint8& b) { return vint8(a) == b; }
+  
+  __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } // negation of the equality mask
+  __forceinline vboolf8 operator !=(const vint8& a, int          b) { return a != vint8(b); }
+  __forceinline vboolf8 operator !=(int          a, const vint8& b) { return vint8(a) != b; }
+  
+  __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epi32 (a.vl, b.vl)),
+                                                                                     _mm_castsi128_ps(_mm_cmplt_epi32 (a.vh, b.vh))); } // signed less-than per half
+  __forceinline vboolf8 operator < (const vint8& a, int          b) { return a <  vint8(b); }
+  __forceinline vboolf8 operator < (int          a, const vint8& b) { return vint8(a) <  b; }
+  
+  __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a <  b); } // negation of the less-than mask
+  __forceinline vboolf8 operator >=(const vint8& a, int          b) { return a >= vint8(b); }
+  __forceinline vboolf8 operator >=(int          a, const vint8& b) { return vint8(a) >= b; }
+
+  __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epi32 (a.vl, b.vl)),
+                                                                                     _mm_castsi128_ps(_mm_cmpgt_epi32 (a.vh, b.vh))); } // signed greater-than per half
+  __forceinline vboolf8 operator > (const vint8& a, int          b) { return a >  vint8(b); }
+  __forceinline vboolf8 operator > (int          a, const vint8& b) { return vint8(a) >  b; }
+
+  __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a >  b); } // negation of the greater-than mask
+  __forceinline vboolf8 operator <=(const vint8& a, int          b) { return a <= vint8(b); }
+  __forceinline vboolf8 operator <=(int          a, const vint8& b) { return vint8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vint8& a, const vint8& b) { return a == b; } // named forms forward to the comparison operators
+  __forceinline vboolf8 ne(const vint8& a, const vint8& b) { return a != b; }
+  __forceinline vboolf8 lt(const vint8& a, const vint8& b) { return a <  b; }
+  __forceinline vboolf8 ge(const vint8& a, const vint8& b) { return a >= b; }
+  __forceinline vboolf8 gt(const vint8& a, const vint8& b) { return a >  b; }
+  __forceinline vboolf8 le(const vint8& a, const vint8& b) { return a <= b; }
+
+  __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } // masked forms AND the comparison result with mask
+  __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); }
+  __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <  b); }
+  __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); }
+  __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >  b); }
+  __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); }
+
+  __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { // lane-wise m ? t : f
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); // blendv takes its second operand (t) where the mask is set, so f is passed first
+  }
+
+  __forceinline vint8 notand(const vboolf8& m, const vint8& f) { // lane-wise (~m) & f
+    return _mm256_castps_si256(_mm256_andnot_ps(m, _mm256_castsi256_ps(f))); // andnot complements its first operand before the AND
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } // interleave through the float-domain unpack; the casts only reinterpret bits
+  __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); } // high counterpart of unpacklo
+
+  template<int i> // replicate lane i of each 128-bit half across that half
+  __forceinline vint8 shuffle(const vint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1> // pick 128-bit halves of v: i0 selects the new low half, i1 the new high half
+  __forceinline vint8 shuffle4(const vint8& v) {
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1> // two-source form: i0 and i1 index the halves of a and b as encoded by _mm256_permute2f128_si256
+  __forceinline vint8 shuffle4(const vint8& a, const vint8& b) {
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3> // same 4-lane permutation applied inside each 128-bit half: new lane k takes old lane ik
+  __forceinline vint8 shuffle(const vint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i0, int i1, int i2, int i3> // two-source form built on _mm256_shuffle_ps, which draws lanes 0-1 of each half from a and lanes 2-3 from b
+  __forceinline vint8 shuffle(const vint8& a, const vint8& b) {
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } // specialization: dedicated duplicate-even-lanes instruction
+  template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } // specialization: dedicated duplicate-odd-lanes instruction
+  template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } // specialization: duplicate the low 64 bits of each half via the double-precision dup
+
+  __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } // load one int and replicate it to all 8 lanes
+  template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } // copy of a with 128-bit half i replaced by b
+  template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } // 128-bit half i of a
+  template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } // the low half needs only a cast
+
+  __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } // value of lane 0
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 vreduce_min2(const vint8& v) { return min(v,shuffle<1,0,3,2>(v)); } // min of each adjacent lane pair, written to both lanes of the pair
+  __forceinline vint8 vreduce_min4(const vint8& v) { vint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); } // min of each 128-bit half, replicated across that half
+  __forceinline vint8 vreduce_min (const vint8& v) { vint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); } // halves are swapped and combined, so every lane holds the min of all 8
+
+  __forceinline vint8 vreduce_max2(const vint8& v) { return max(v,shuffle<1,0,3,2>(v)); } // max of each adjacent lane pair
+  __forceinline vint8 vreduce_max4(const vint8& v) { vint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); } // max of each 128-bit half
+  __forceinline vint8 vreduce_max (const vint8& v) { vint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); } // every lane holds the max of all 8
+
+  __forceinline vint8 vreduce_add2(const vint8& v) { return v + shuffle<1,0,3,2>(v); } // sum of each adjacent lane pair
+  __forceinline vint8 vreduce_add4(const vint8& v) { vint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } // sum of each 128-bit half
+  __forceinline vint8 vreduce_add (const vint8& v) { vint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } // every lane holds the sum of all 8
+
+  __forceinline int reduce_min(const vint8& v) { return toScalar(vreduce_min(v)); } // scalar result read from lane 0 of the vector reduction
+  __forceinline int reduce_max(const vint8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_add(const vint8& v) { return toScalar(vreduce_add(v)); }
+
+  __forceinline size_t select_min(const vint8& v) { return bsf(movemask(v == vreduce_min(v))); } // bsf over the mask of lanes equal to the minimum; presumably the lowest such lane index - confirm bsf semantics
+  __forceinline size_t select_max(const vint8& v) { return bsf(movemask(v == vreduce_max(v))); } // same for the maximum
+
+  __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); } // invalid lanes are replaced by pos_inf before reducing and masked out of the match
+  __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 a = select(valid,v,vint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); } // invalid lanes are replaced by neg_inf before reducing and masked out of the match
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting networks
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 usort_ascending(const vint8& v) // 8-lane sorting network: six shuffle + umin/umax + select stages
+  {
+    const vint8 a0 = v; // umin/umax compare as unsigned, so no value offset is applied here
+    const vint8 b0 = shuffle<1,0,3,2>(a0); // stage 1 partner: neighbouring lane
+    const vint8 c0 = umin(a0,b0);
+    const vint8 d0 = umax(a0,b0);
+    const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); // merge the umin and umax results lane-wise using the constant mask
+    const vint8 b1 = shuffle<2,3,0,1>(a1); // stage 2 partner: lane two positions away inside the same 128-bit half
+    const vint8 c1 = umin(a1,b1);
+    const vint8 d1 = umax(a1,b1);
+    const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1);
+    const vint8 b2 = shuffle<1,0,3,2>(a2); // stage 3 partner: neighbouring lane
+    const vint8 c2 = umin(a2,b2);
+    const vint8 d2 = umax(a2,b2);
+    const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2);
+    const vint8 b3 = shuffle4<1,0>(a3); // stage 4 partner: same position in the other 128-bit half
+    const vint8 c3 = umin(a3,b3);
+    const vint8 d3 = umax(a3,b3);
+    const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3);
+    const vint8 b4 = shuffle<2,3,0,1>(a4); // stage 5 partner: lane two positions away inside the same 128-bit half
+    const vint8 c4 = umin(a4,b4);
+    const vint8 d4 = umax(a4,b4);
+    const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4);
+    const vint8 b5 = shuffle<1,0,3,2>(a5); // stage 6 partner: neighbouring lane
+    const vint8 c5 = umin(a5,b5);
+    const vint8 d5 = umax(a5,b5);
+    const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5);
+    return a6;
+  }
+
+  __forceinline vint8 usort_descending(const vint8& v) // same six-stage network as usort_ascending with umax and umin exchanged
+  {
+    const vint8 a0 = v; // umax/umin compare as unsigned, so no value offset is applied here
+    const vint8 b0 = shuffle<1,0,3,2>(a0); // stage 1 partner: neighbouring lane
+    const vint8 c0 = umax(a0,b0);
+    const vint8 d0 = umin(a0,b0);
+    const vint8 a1 = select(0x99 /* 0b10011001 */,c0,d0); // merge the umax and umin results lane-wise using the constant mask
+    const vint8 b1 = shuffle<2,3,0,1>(a1); // stage 2 partner: lane two positions away inside the same 128-bit half
+    const vint8 c1 = umax(a1,b1);
+    const vint8 d1 = umin(a1,b1);
+    const vint8 a2 = select(0xc3 /* 0b11000011 */,c1,d1);
+    const vint8 b2 = shuffle<1,0,3,2>(a2); // stage 3 partner: neighbouring lane
+    const vint8 c2 = umax(a2,b2);
+    const vint8 d2 = umin(a2,b2);
+    const vint8 a3 = select(0xa5 /* 0b10100101 */,c2,d2);
+    const vint8 b3 = shuffle4<1,0>(a3); // stage 4 partner: same position in the other 128-bit half
+    const vint8 c3 = umax(a3,b3);
+    const vint8 d3 = umin(a3,b3);
+    const vint8 a4 = select(0xf /* 0b00001111 */,c3,d3);
+    const vint8 b4 = shuffle<2,3,0,1>(a4); // stage 5 partner: lane two positions away inside the same 128-bit half
+    const vint8 c4 = umax(a4,b4);
+    const vint8 d4 = umin(a4,b4);
+    const vint8 a5 = select(0x33 /* 0b00110011 */,c4,d4);
+    const vint8 b5 = shuffle<1,0,3,2>(a5); // stage 6 partner: neighbouring lane
+    const vint8 c5 = umax(a5,b5);
+    const vint8 d5 = umin(a5,b5);
+    const vint8 a6 = select(0x55 /* 0b01010101 */,c5,d5);
+    return a6;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { // stream the eight lanes in index order
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">"; // output has the form <a0, a1, ..., a7>
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vint8_avx2.h b/thirdparty/embree-aarch64/common/simd/vint8_avx2.h
new file mode 100644
index 0000000000..4937d972cf
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vint8_avx2.h
@@ -0,0 +1,512 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX2 integer type: eight int lanes held in one 256-bit register */
+  template<>
+  struct vint<8>
+  {
+    ALIGNED_STRUCT_(32);
+
+    typedef vboolf8 Bool;
+    typedef vint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {             // data
+      __m256i v;        // whole-register view handed to the intrinsics
+      int i[8];         // per-lane view used by operator[]
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint() {} // leaves the lanes uninitialized
+    __forceinline vint(const vint8& a) { v = a.v; }
+    __forceinline vint8& operator =(const vint8& a) { v = a.v; return *this; }
+
+    __forceinline vint(__m256i a) : v(a) {} // implicit wrap of a raw register
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vint(const vint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {} // a copied into both 128-bit halves
+    __forceinline vint(const vint4& a, const vint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} // a -> lanes 0-3, b -> lanes 4-7
+    __forceinline vint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {} // same, from raw registers
+ 
+    __forceinline explicit vint(const int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {} // unaligned load of 8 ints (float-typed load, bits reinterpreted)
+    __forceinline vint(int a) : v(_mm256_set1_epi32(a)) {} // broadcast a to all lanes
+    __forceinline vint(int a, int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {} // lanes = a,b,a,b,a,b,a,b (set_epi32 lists the highest lane first)
+    __forceinline vint(int a, int b, int c, int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {} // lanes = a,b,c,d repeated in both halves
+    __forceinline vint(int a, int b, int c, int d, int e, int f, int g, int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} // lanes 0..7 = a..h
+
+    __forceinline explicit vint(__m256 a) : v(_mm256_cvtps_epi32(a)) {} // numeric float -> int conversion, not a bit cast
+
+#if defined(__AVX512VL__)
+    __forceinline explicit vint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {} // expands each mask bit to an all-ones / all-zeros lane
+#else
+    __forceinline explicit vint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {} // reinterprets the mask register bits
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vint(ZeroTy)        : v(_mm256_setzero_si256()) {}
+    __forceinline vint(OneTy)         : v(_mm256_set1_epi32(1)) {}
+    __forceinline vint(PosInfTy)      : v(_mm256_set1_epi32(pos_inf)) {}
+    __forceinline vint(NegInfTy)      : v(_mm256_set1_epi32(neg_inf)) {}
+    __forceinline vint(StepTy)        : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {} // lane k holds k
+    __forceinline vint(ReverseStepTy) : v(_mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7)) {} // lane k holds 7-k
+    __forceinline vint(UndefinedTy)   : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vint8 load(const uint8_t* ptr)  { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } // reads 8 bytes, zero-extends each to a 32-bit lane
+    static __forceinline vint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); } // identical implementation to load()
+    static __forceinline vint8 load(const unsigned short* ptr)  { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); } // aligned 16-byte read, zero-extends 8 x 16 bit
+    static __forceinline vint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); } // unaligned variant
+
+    static __forceinline vint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); } // aligned 32-byte load
+    static __forceinline vint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); } // unaligned 32-byte load
+
+    static __forceinline void store (void* ptr, const vint8& v) { _mm256_store_si256((__m256i*)ptr,v); } // aligned 32-byte store
+    static __forceinline void storeu(void* ptr, const vint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); } // unaligned store through the float-typed intrinsic
+
+#if defined(__AVX512VL__)
+
+    static __forceinline vint8 compact(const vboolf8& mask, vint8 &v) { // packs the mask-selected lanes of v into the low lanes; the rest keep v's values
+      return _mm256_mask_compress_epi32(v, mask, v);
+    }
+    static __forceinline vint8 compact(const vboolf8& mask, vint8 &a, const vint8& b) { // packs selected lanes of b into the low lanes; the rest come from a
+      return _mm256_mask_compress_epi32(a, mask, b);
+    }
+
+    static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); } // unselected lanes read as 0
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); } // unselected lanes read as 0
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); } // writes only the selected lanes
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); } // writes only the selected lanes
+#else
+    static __forceinline vint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } // masked load via the float intrinsic
+    static __forceinline vint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); } // same intrinsic as load()
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } // writes only the selected lanes
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); } // same intrinsic as store()
+#endif
+    
+    static __forceinline vint8 load_nt(void* ptr) { // non-temporal (streaming) load
+      return _mm256_stream_load_si256((__m256i*)ptr);
+    }
+
+    static __forceinline void store_nt(void* ptr, const vint8& v) { // non-temporal (streaming) store through the float-typed intrinsic
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline void store(uint8_t* ptr, const vint8& i) // scalar loop; each lane is implicitly narrowed to uint8_t
+    {
+      for (size_t j=0; j<8; j++)
+        ptr[j] = i[j];
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vint8& v) { // scalar loop; each lane is cast to unsigned short
+      for (size_t i=0;i<8;i++)
+        ptr[i] = (unsigned short)v[i];
+    }
+
+    template<int scale = 4> // scale: byte multiplier applied to every index
+    static __forceinline vint8 gather(const int *const ptr, const vint8& index) {
+      return _mm256_i32gather_epi32(ptr, index, scale);
+    }
+
+    template<int scale = 4> // masked gather: lanes not selected by mask keep the zero from r
+    static __forceinline vint8 gather(const vboolf8& mask, const int *const ptr, const vint8& index) {
+      vint8 r = zero;
+#if defined(__AVX512VL__)
+      return _mm256_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#else
+      return _mm256_mask_i32gather_epi32(r, ptr, index, mask, scale);
+#endif
+    }
+
+    template<int scale = 4> // writes v[k] to byte address ptr + scale*ofs[k]
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
+#else
+      *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; // no scatter instruction on this path: eight scalar stores
+      *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    template<int scale = 4> // masked scatter: only lanes whose mask entry is set are written
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
+#else
+      if (likely(mask[0])) *(int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0]; // scalar fallback, one guarded store per lane
+      if (likely(mask[1])) *(int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    static __forceinline vint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); } // replicates the 64-bit value into all four 64-bit slots
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const int& operator [](size_t index) const { assert(index < 8); return i[index]; } // lane access through the union; index is only checked by assert
+    __forceinline       int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_movepi32_mask(a); } // mask bit k = most significant bit of lane k
+#else
+  static __forceinline vboolf8 asBool(const vint8& a) { return _mm256_castsi256_ps(a); } // reinterprets the lane bits as a mask register, no conversion
+#endif
+
+  __forceinline vint8 operator +(const vint8& a) { return a; } // unary plus: identity
+  __forceinline vint8 operator -(const vint8& a) { return _mm256_sub_epi32(_mm256_setzero_si256(), a); } // negation computed as 0 - a per lane
+  __forceinline vint8 abs       (const vint8& a) { return _mm256_abs_epi32(a); } // lane-wise absolute value
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 operator +(const vint8& a, const vint8& b) { const __m256i r = _mm256_add_epi32(a, b); return r; } // lane-wise 32-bit add
+  __forceinline vint8 operator +(const vint8& a, int          b) { const vint8 rhs(b); return a + rhs; } // scalar operand is broadcast to all lanes
+  __forceinline vint8 operator +(int          a, const vint8& b) { const vint8 lhs(a); return lhs + b; }
+
+  __forceinline vint8 operator -(const vint8& a, const vint8& b) { const __m256i r = _mm256_sub_epi32(a, b); return r; } // lane-wise 32-bit subtract
+  __forceinline vint8 operator -(const vint8& a, int          b) { const vint8 rhs(b); return a - rhs; }
+  __forceinline vint8 operator -(int          a, const vint8& b) { const vint8 lhs(a); return lhs - b; }
+
+  __forceinline vint8 operator *(const vint8& a, const vint8& b) { const __m256i r = _mm256_mullo_epi32(a, b); return r; } // keeps the low 32 bits of each product
+  __forceinline vint8 operator *(const vint8& a, int          b) { const vint8 rhs(b); return a * rhs; }
+  __forceinline vint8 operator *(int          a, const vint8& b) { const vint8 lhs(a); return lhs * b; }
+
+  __forceinline vint8 operator &(const vint8& a, const vint8& b) { const __m256i r = _mm256_and_si256(a, b); return r; } // bitwise AND over the whole register
+  __forceinline vint8 operator &(const vint8& a, int          b) { const vint8 rhs(b); return a & rhs; }
+  __forceinline vint8 operator &(int          a, const vint8& b) { const vint8 lhs(a); return lhs & b; }
+
+  __forceinline vint8 operator |(const vint8& a, const vint8& b) { const __m256i r = _mm256_or_si256(a, b); return r; } // bitwise OR
+  __forceinline vint8 operator |(const vint8& a, int          b) { const vint8 rhs(b); return a | rhs; }
+  __forceinline vint8 operator |(int          a, const vint8& b) { const vint8 lhs(a); return lhs | b; }
+
+  __forceinline vint8 operator ^(const vint8& a, const vint8& b) { const __m256i r = _mm256_xor_si256(a, b); return r; } // bitwise XOR
+  __forceinline vint8 operator ^(const vint8& a, int          b) { const vint8 rhs(b); return a ^ rhs; }
+  __forceinline vint8 operator ^(int          a, const vint8& b) { const vint8 lhs(a); return lhs ^ b; }
+
+  __forceinline vint8 operator <<(const vint8& a, int n) { return _mm256_slli_epi32(a, n); } // every lane shifted left by the same count
+  __forceinline vint8 operator >>(const vint8& a, int n) { return _mm256_srai_epi32(a, n); } // arithmetic (sign-propagating) right shift
+
+  __forceinline vint8 operator <<(const vint8& a, const vint8& n) { return _mm256_sllv_epi32(a, n); } // per-lane shift counts taken from n
+  __forceinline vint8 operator >>(const vint8& a, const vint8& n) { return _mm256_srav_epi32(a, n); } // per-lane arithmetic right shift
+
+  __forceinline vint8 sll(const vint8& a, int b) { return _mm256_slli_epi32(a, b); } // shift left logical
+  __forceinline vint8 sra(const vint8& a, int b) { return _mm256_srai_epi32(a, b); } // shift right arithmetic (sign bits shifted in)
+  __forceinline vint8 srl(const vint8& a, int b) { return _mm256_srli_epi32(a, b); } // shift right logical (zeros shifted in)
+
+  __forceinline vint8 sll(const vint8& a, const vint8& b) { return _mm256_sllv_epi32(a, b); } // per-lane variants of the three shifts above
+  __forceinline vint8 sra(const vint8& a, const vint8& b) { return _mm256_srav_epi32(a, b); }
+  __forceinline vint8 srl(const vint8& a, const vint8& b) { return _mm256_srlv_epi32(a, b); }
+  
+  __forceinline vint8 min(const vint8& a, const vint8& b) { return _mm256_min_epi32(a, b); } // lane-wise minimum, lanes compared as signed
+  __forceinline vint8 min(const vint8& a, int          b) { return min(a,vint8(b)); } // scalar operand is broadcast to all lanes
+  __forceinline vint8 min(int          a, const vint8& b) { return min(vint8(a),b); }
+
+  __forceinline vint8 max(const vint8& a, const vint8& b) { return _mm256_max_epi32(a, b); } // lane-wise maximum, lanes compared as signed
+  __forceinline vint8 max(const vint8& a, int          b) { return max(a,vint8(b)); }
+  __forceinline vint8 max(int          a, const vint8& b) { return max(vint8(a),b); }
+
+  __forceinline vint8 umin(const vint8& a, const vint8& b) { return _mm256_min_epu32(a, b); } // same lanes compared as unsigned 32-bit values
+  __forceinline vint8 umax(const vint8& a, const vint8& b) { return _mm256_max_epu32(a, b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8& operator +=(vint8& a, const vint8& b) { a = a + b; return a; } // each compound form stores the binary result back into a and returns a
+  __forceinline vint8& operator +=(vint8& a, int          b) { a = a + b; return a; }
+  
+  __forceinline vint8& operator -=(vint8& a, const vint8& b) { a = a - b; return a; }
+  __forceinline vint8& operator -=(vint8& a, int          b) { a = a - b; return a; }
+  
+  __forceinline vint8& operator *=(vint8& a, const vint8& b) { a = a * b; return a; }
+  __forceinline vint8& operator *=(vint8& a, int          b) { a = a * b; return a; }
+  
+  __forceinline vint8& operator &=(vint8& a, const vint8& b) { a = a & b; return a; }
+  __forceinline vint8& operator &=(vint8& a, int          b) { a = a & b; return a; }
+  
+  __forceinline vint8& operator |=(vint8& a, const vint8& b) { a = a | b; return a; }
+  __forceinline vint8& operator |=(vint8& a, int          b) { a = a | b; return a; }
+  
+  __forceinline vint8& operator <<=(vint8& a, const int b) { a = a << b; return a; }
+  __forceinline vint8& operator >>=(vint8& a, const int b) { a = a >> b; return a; } // uses operator >>, i.e. the arithmetic shift
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_EQ); } // signed lane compares producing a bit mask
+  static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_NE); }
+  static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LT); }
+  static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GE); }
+  static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_GT); }
+  static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return _mm256_cmp_epi32_mask(a,b,_MM_CMPINT_LE); }
+
+  static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { // lane k = m[k] ? t[k] : f[k]
+    return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t);
+  }
+#else
+  static __forceinline vboolf8 operator ==(const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); } // all-ones lane where equal
+  static __forceinline vboolf8 operator !=(const vint8& a, const vint8& b) { return !(a == b); } // only == and > exist as AVX2 integer compares; the rest are derived
+  static __forceinline vboolf8 operator < (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b, a)); } // a < b evaluated as b > a (signed)
+  static __forceinline vboolf8 operator >=(const vint8& a, const vint8& b) { return !(a <  b); }
+  static __forceinline vboolf8 operator > (const vint8& a, const vint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, b)); } // signed greater-than
+  static __forceinline vboolf8 operator <=(const vint8& a, const vint8& b) { return !(a >  b); }
+
+  static __forceinline vint8 select(const vboolf8& m, const vint8& t, const vint8& f) { // blendv picks t where the sign bit of the mask lane is set, else f
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+  }
+#endif
+
+  template<int mask> // compile-time blend: bit k of mask set -> lane k comes from t, otherwise from f
+  __forceinline vint8 select(const vint8& t, const vint8& f) {
+    return _mm256_blend_epi32(f, t, mask);
+  }
+
+  __forceinline vboolf8 operator ==(const vint8& a, int          b) { const vint8 rhs(b); return a == rhs; } // scalar operand is broadcast, then the vector comparison is used
+  __forceinline vboolf8 operator ==(int          a, const vint8& b) { const vint8 lhs(a); return lhs == b; }
+
+  __forceinline vboolf8 operator !=(const vint8& a, int          b) { const vint8 rhs(b); return a != rhs; }
+  __forceinline vboolf8 operator !=(int          a, const vint8& b) { const vint8 lhs(a); return lhs != b; }
+
+  __forceinline vboolf8 operator < (const vint8& a, int          b) { const vint8 rhs(b); return a <  rhs; }
+  __forceinline vboolf8 operator < (int          a, const vint8& b) { const vint8 lhs(a); return lhs <  b; }
+
+  __forceinline vboolf8 operator >=(const vint8& a, int          b) { const vint8 rhs(b); return a >= rhs; }
+  __forceinline vboolf8 operator >=(int          a, const vint8& b) { const vint8 lhs(a); return lhs >= b; }
+
+  __forceinline vboolf8 operator > (const vint8& a, int          b) { const vint8 rhs(b); return a >  rhs; }
+  __forceinline vboolf8 operator > (int          a, const vint8& b) { const vint8 lhs(a); return lhs >  b; }
+
+  __forceinline vboolf8 operator <=(const vint8& a, int          b) { const vint8 rhs(b); return a <= rhs; }
+  __forceinline vboolf8 operator <=(int          a, const vint8& b) { const vint8 lhs(a); return lhs <= b; }
+
+  __forceinline vboolf8 eq(const vint8& a, const vint8& b) { const vboolf8 m = (a == b); return m; } // named aliases for the comparison operators
+  __forceinline vboolf8 ne(const vint8& a, const vint8& b) { const vboolf8 m = (a != b); return m; }
+  __forceinline vboolf8 lt(const vint8& a, const vint8& b) { const vboolf8 m = (a <  b); return m; }
+  __forceinline vboolf8 ge(const vint8& a, const vint8& b) { const vboolf8 m = (a >= b); return m; }
+  __forceinline vboolf8 gt(const vint8& a, const vint8& b) { const vboolf8 m = (a >  b); return m; }
+  __forceinline vboolf8 le(const vint8& a, const vint8& b) { const vboolf8 m = (a <= b); return m; }
+
+#if defined(__AVX512VL__)
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_EQ); } // masked compare: result bits outside mask are 0
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_NE); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GE); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_GT); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return _mm256_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  static __forceinline vboolf8 eq(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a == b); } // full comparison, then ANDed with mask
+  static __forceinline vboolf8 ne(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a != b); }
+  static __forceinline vboolf8 lt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <  b); }
+  static __forceinline vboolf8 ge(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >= b); }
+  static __forceinline vboolf8 gt(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a >  b); }
+  static __forceinline vboolf8 le(const vboolf8& mask, const vint8& a, const vint8& b) { return mask & (a <= b); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 unpacklo(const vint8& a, const vint8& b) { return _mm256_unpacklo_epi32(a, b); } // interleaves the low two lanes of a and b within each 128-bit half
+  __forceinline vint8 unpackhi(const vint8& a, const vint8& b) { return _mm256_unpackhi_epi32(a, b); } // interleaves the high two lanes of a and b within each 128-bit half
+
+  template<int i> // replicates lane i of each 128-bit half across that half
+  __forceinline vint8 shuffle(const vint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1> // rearranges whole 128-bit halves: result low half = half i0 of v, high half = half i1
+  __forceinline vint8 shuffle4(const vint8& v) {
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1> // two-source form: selectors 0/1 address the halves of a, 2/3 the halves of b
+  __forceinline vint8 shuffle4(const vint8& a, const vint8& b) {
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3> // same 4-lane permutation applied inside each 128-bit half
+  __forceinline vint8 shuffle(const vint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i0, int i1, int i2, int i3> // per half: result lanes 0-1 are picked from a (i0,i1), lanes 2-3 from b (i2,i3)
+  __forceinline vint8 shuffle(const vint8& a, const vint8& b) {
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<> __forceinline vint8 shuffle<0, 0, 2, 2>(const vint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); } // specialization using the dedicated duplicate-even-lanes instruction
+  template<> __forceinline vint8 shuffle<1, 1, 3, 3>(const vint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); } // specialization using the duplicate-odd-lanes instruction
+  template<> __forceinline vint8 shuffle<0, 1, 0, 1>(const vint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); } // duplicates the low 64 bits of each half
+
+  __forceinline vint8 broadcast(const int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } // loads one 32-bit value from ptr and replicates it into all 8 lanes
+
+  template<int i> __forceinline vint8 insert4(const vint8& a, const vint4& b) { return _mm256_insertf128_si256(a, b, i); } // copy of a with 128-bit half i replaced by b
+  template<int i> __forceinline vint4 extract4(const vint8& a) { return _mm256_extractf128_si256(a, i); } // returns 128-bit half i of a
+  template<> __forceinline vint4 extract4<0>(const vint8& a) { return _mm256_castsi256_si128(a); } // low half needs only a register cast
+
+  __forceinline int toScalar(const vint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } // returns lane 0
+
+#if !defined(__aarch64__)
+
+  __forceinline vint8 permute(const vint8& v, const __m256i& index) { // full cross-half permute: result lane k = v[low 3 bits of index lane k]
+    return _mm256_permutevar8x32_epi32(v, index);
+  }
+
+  __forceinline vint8 shuffle(const vint8& v, const __m256i& index) { // variable shuffle inside each 128-bit half (low 2 bits of each index lane)
+    return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index));
+  }
+
+
+
+  template<int i>
+  static __forceinline vint8 align_shift_right(const vint8& a, const vint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_alignr_epi32(a, b, i); // concatenates a:b across the full 256 bits and shifts right by i 32-bit lanes
+#else
+    return _mm256_alignr_epi8(a, b, 4*i); // NOTE(review): this intrinsic shifts each 128-bit half independently, unlike the AVX512VL path - confirm callers tolerate the difference
+#endif
+  }  
+
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 vreduce_min2(const vint8& v) { const vint8 nb = shuffle<1,0,3,2>(v); return min(v,nb); } // combine each lane with its neighbour
+  __forceinline vint8 vreduce_min4(const vint8& v) { const vint8 m2 = vreduce_min2(v); const vint8 sw = shuffle<2,3,0,1>(m2); return min(m2,sw); } // result covers each 128-bit half
+  __forceinline vint8 vreduce_min (const vint8& v) { const vint8 m4 = vreduce_min4(v); const vint8 xh = shuffle4<1,0>(m4); return min(m4,xh); } // result covers all 8 lanes and is present in every lane
+
+  __forceinline vint8 vreduce_max2(const vint8& v) { const vint8 nb = shuffle<1,0,3,2>(v); return max(v,nb); }
+  __forceinline vint8 vreduce_max4(const vint8& v) { const vint8 m2 = vreduce_max2(v); const vint8 sw = shuffle<2,3,0,1>(m2); return max(m2,sw); }
+  __forceinline vint8 vreduce_max (const vint8& v) { const vint8 m4 = vreduce_max4(v); const vint8 xh = shuffle4<1,0>(m4); return max(m4,xh); }
+
+  __forceinline vint8 vreduce_add2(const vint8& v) { const vint8 nb = shuffle<1,0,3,2>(v); return v + nb; }
+  __forceinline vint8 vreduce_add4(const vint8& v) { const vint8 s2 = vreduce_add2(v); const vint8 sw = shuffle<2,3,0,1>(s2); return s2 + sw; }
+  __forceinline vint8 vreduce_add (const vint8& v) { const vint8 s4 = vreduce_add4(v); const vint8 xh = shuffle4<1,0>(s4); return s4 + xh; }
+
+  __forceinline int reduce_min(const vint8& v) { const vint8 r = vreduce_min(v); return toScalar(r); } // scalar result read from lane 0 of the vector reduction
+  __forceinline int reduce_max(const vint8& v) { const vint8 r = vreduce_max(v); return toScalar(r); }
+  __forceinline int reduce_add(const vint8& v) { const vint8 r = vreduce_add(v); return toScalar(r); }
+
+  __forceinline size_t select_min(const vint8& v) { const vboolf8 hit = (v == vreduce_min(v)); return bsf(movemask(hit)); } // bsf over the mask of lanes equal to the minimum
+  __forceinline size_t select_max(const vint8& v) { const vboolf8 hit = (v == vreduce_max(v)); return bsf(movemask(hit)); } // bsf over the mask of lanes equal to the maximum
+
+  __forceinline size_t select_min(const vboolf8& valid, const vint8& v) { const vint8 masked = select(valid,v,vint8(pos_inf)); const vboolf8 hit = valid & (masked == vreduce_min(masked)); return bsf(movemask(hit)); } // invalid lanes are replaced by pos_inf first; NOTE(review): an all-false valid mask feeds 0 to bsf - confirm callers exclude that
+  __forceinline size_t select_max(const vboolf8& valid, const vint8& v) { const vint8 masked = select(valid,v,vint8(neg_inf)); const vboolf8 hit = valid & (masked == vreduce_max(masked)); return bsf(movemask(hit)); } // invalid lanes are replaced by neg_inf first
+
+
+  __forceinline vint8 assign(const vint4& a) { return _mm256_castsi128_si256(a); } // a becomes lanes 0-3; the cast leaves lanes 4-7 undefined
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Sorting networks
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vint8 usort_ascending(const vint8& v) // 8-lane sorting network on unsigned lane values, smallest first
+  {
+    const vint8 s0  = v;
+    const vint8 p0  = shuffle<1,0,3,2>(s0);                  // partner: neighbouring lane
+    const vint8 lo0 = umin(s0,p0);
+    const vint8 hi0 = umax(s0,p0);
+    const vint8 s1  = select<0x99 /* 0b10011001 */>(lo0,hi0); // set bits take the min, clear bits the max
+    const vint8 p1  = shuffle<2,3,0,1>(s1);                  // partner: two lanes away
+    const vint8 lo1 = umin(s1,p1);
+    const vint8 hi1 = umax(s1,p1);
+    const vint8 s2  = select<0xc3 /* 0b11000011 */>(lo1,hi1);
+    const vint8 p2  = shuffle<1,0,3,2>(s2);
+    const vint8 lo2 = umin(s2,p2);
+    const vint8 hi2 = umax(s2,p2);
+    const vint8 s3  = select<0xa5 /* 0b10100101 */>(lo2,hi2);
+    const vint8 p3  = shuffle4<1,0>(s3);                     // partner: other 128-bit half
+    const vint8 lo3 = umin(s3,p3);
+    const vint8 hi3 = umax(s3,p3);
+    const vint8 s4  = select<0xf /* 0b00001111 */>(lo3,hi3);
+    const vint8 p4  = shuffle<2,3,0,1>(s4);
+    const vint8 lo4 = umin(s4,p4);
+    const vint8 hi4 = umax(s4,p4);
+    const vint8 s5  = select<0x33 /* 0b00110011 */>(lo4,hi4);
+    const vint8 p5  = shuffle<1,0,3,2>(s5);
+    const vint8 lo5 = umin(s5,p5);
+    const vint8 hi5 = umax(s5,p5);
+    const vint8 s6  = select<0x55 /* 0b01010101 */>(lo5,hi5);
+    return s6;
+  }
+
+  __forceinline vint8 usort_descending(const vint8& v) // 8-lane sorting network on unsigned lane values, largest first
+  {
+    const vint8 s0  = v;
+    const vint8 p0  = shuffle<1,0,3,2>(s0);                  // partner: neighbouring lane
+    const vint8 hi0 = umax(s0,p0);
+    const vint8 lo0 = umin(s0,p0);
+    const vint8 s1  = select<0x99 /* 0b10011001 */>(hi0,lo0); // set bits take the max, clear bits the min
+    const vint8 p1  = shuffle<2,3,0,1>(s1);                  // partner: two lanes away
+    const vint8 hi1 = umax(s1,p1);
+    const vint8 lo1 = umin(s1,p1);
+    const vint8 s2  = select<0xc3 /* 0b11000011 */>(hi1,lo1);
+    const vint8 p2  = shuffle<1,0,3,2>(s2);
+    const vint8 hi2 = umax(s2,p2);
+    const vint8 lo2 = umin(s2,p2);
+    const vint8 s3  = select<0xa5 /* 0b10100101 */>(hi2,lo2);
+    const vint8 p3  = shuffle4<1,0>(s3);                     // partner: other 128-bit half
+    const vint8 hi3 = umax(s3,p3);
+    const vint8 lo3 = umin(s3,p3);
+    const vint8 s4  = select<0xf /* 0b00001111 */>(hi3,lo3);
+    const vint8 p4  = shuffle<2,3,0,1>(s4);
+    const vint8 hi4 = umax(s4,p4);
+    const vint8 lo4 = umin(s4,p4);
+    const vint8 s5  = select<0x33 /* 0b00110011 */>(hi4,lo4);
+    const vint8 p5  = shuffle<1,0,3,2>(s5);
+    const vint8 hi5 = umax(s5,p5);
+    const vint8 lo5 = umin(s5,p5);
+    const vint8 s6  = select<0x55 /* 0b01010101 */>(hi5,lo5);
+    return s6;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vint8& a) { // prints the eight lanes as "<l0, l1, ..., l7>"
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h b/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h
new file mode 100644
index 0000000000..de3ebc16a7
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vllong4_avx2.h
@@ -0,0 +1,358 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{ 
+  /* 4-wide AVX2 64-bit long long type */
+  template<>
+  struct vllong<4>
+  {
+    ALIGNED_STRUCT_(32);
+    
+    typedef vboold4 Bool;
+
+    enum  { size = 4 }; // number of SIMD elements
+    union {             // data
+      __m256i v; 
+      long long i[4];
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vllong() {}
+    __forceinline vllong(const vllong4& t) { v = t.v; }
+    __forceinline vllong4& operator =(const vllong4& f) { v = f.v; return *this; }
+
+    __forceinline vllong(const __m256i& t) { v = t; }
+    __forceinline operator __m256i() const { return v; }
+    __forceinline operator __m256d() const { return _mm256_castsi256_pd(v); }
+
+
+    __forceinline vllong(long long i) {
+      v = _mm256_set1_epi64x(i);
+    }
+    
+    __forceinline vllong(long long a, long long b, long long c, long long d) { // builds the vector with lanes 0..3 = a, b, c, d
+      v = _mm256_set_epi64x(d,c,b,a);      // _mm256_set_epi64x takes the highest lane first, hence the reversed argument order
+    }
+   
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vllong(ZeroTy) : v(_mm256_setzero_si256()) {}
+    __forceinline vllong(OneTy)  : v(_mm256_set1_epi64x(1)) {}
+    __forceinline vllong(StepTy) : v(_mm256_set_epi64x(3,2,1,0)) {}
+    __forceinline vllong(ReverseStepTy) : v(_mm256_set_epi64x(0,1,2,3)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vllong4& a) { // non-temporal (streaming) store of all 4 lanes to ptr
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(a)); // bit-cast to float lanes only to reuse the AVX streaming store; the intrinsic requires ptr to be 32-byte aligned
+    }
+
+    static __forceinline vllong4 loadu(const void* addr)
+    {
+      return _mm256_loadu_si256((__m256i*)addr);
+    }
+
+    static __forceinline vllong4 load(const vllong4* addr) {
+      return _mm256_load_si256((__m256i*)addr);
+    }
+
+    static __forceinline vllong4 load(const long long* addr) {
+      return _mm256_load_si256((__m256i*)addr);
+    }
+
+    static __forceinline void store(void* ptr, const vllong4& v) {
+      _mm256_store_si256((__m256i*)ptr,v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vllong4& v) {
+      _mm256_storeu_si256((__m256i*)ptr,v);
+    }
+
+    static __forceinline void storeu(const vboold4& mask, long long* ptr, const vllong4& f) { // masked unaligned store: only lanes selected by mask are written to ptr
+#if defined(__AVX512VL__)
+      _mm256_mask_storeu_epi64(ptr,mask,f); // AVX-512VL: masked 64-bit store without alignment requirement
+#else
+      _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); // AVX fallback: bit-cast to double lanes; a lane is written when the top bit of its mask element is set
+#endif
+    }
+
+    static __forceinline void store(const vboold4& mask, void* ptr, const vllong4& f) { // masked store: only lanes selected by mask are written to ptr
+#if defined(__AVX512VL__)
+      _mm256_mask_store_epi64(ptr,mask,f); // AVX-512VL: aligned variant, ptr must be 32-byte aligned
+#else
+      _mm256_maskstore_pd((double*)ptr,mask,_mm256_castsi256_pd(f)); // AVX fallback: same intrinsic as the masked storeu above, so no alignment is required on this path
+#endif
+    }
+
+    static __forceinline vllong4 broadcast64bit(size_t v) {
+      return _mm256_set1_epi64x(v);
+    }
+
+    static __forceinline size_t extract64bit(const vllong4& v)
+    {
+      return _mm_cvtsi128_si64(_mm256_castsi256_si128(v));
+    }
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       long long& operator [](size_t index)       { assert(index < 4); return i[index]; }
+    __forceinline const long long& operator [](size_t index) const { assert(index < 4); return i[index]; }
+
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Select
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline vllong4 select(const vboold4& m, const vllong4& t, const vllong4& f) { // per-lane choice: t where m is set, f elsewhere
+  #if defined(__AVX512VL__)
+    return _mm256_mask_blend_epi64(m, f, t); // mask_blend takes the "mask clear" operand (f) before the "mask set" operand (t)
+  #else
+    return _mm256_castpd_si256(_mm256_blendv_pd(_mm256_castsi256_pd(f), _mm256_castsi256_pd(t), m)); // blendv_pd selects t for lanes whose mask element has its top bit set; casts are bit-preserving
+  #endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboold4 asBool(const vllong4& a) { return _mm256_movepi64_mask(a); }
+#else
+  __forceinline vboold4 asBool(const vllong4& a) { return _mm256_castsi256_pd(a); }
+#endif
+
+  __forceinline vllong4 operator +(const vllong4& a) { return a; }
+  __forceinline vllong4 operator -(const vllong4& a) { return _mm256_sub_epi64(_mm256_setzero_si256(), a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong4 operator +(const vllong4& a, const vllong4& b) { return _mm256_add_epi64(a, b); }
+  __forceinline vllong4 operator +(const vllong4& a, long long      b) { return a + vllong4(b); }
+  __forceinline vllong4 operator +(long long      a, const vllong4& b) { return vllong4(a) + b; }
+
+  __forceinline vllong4 operator -(const vllong4& a, const vllong4& b) { return _mm256_sub_epi64(a, b); }
+  __forceinline vllong4 operator -(const vllong4& a, long long      b) { return a - vllong4(b); }
+  __forceinline vllong4 operator -(long long      a, const vllong4& b) { return vllong4(a) - b; }
+
+  /* NOTE: not a full 64x64 multiply -- _mm256_mul_epi32 multiplies only the signed low 32 bits of each 64-bit lane (giving a 64-bit product); the upper 32 bits of both operands are ignored */
+  __forceinline vllong4 operator *(const vllong4& a, const vllong4& b) { return _mm256_mul_epi32(a, b); }
+  __forceinline vllong4 operator *(const vllong4& a, long long      b) { return a * vllong4(b); } // scalar is broadcast to all lanes first
+  __forceinline vllong4 operator *(long long      a, const vllong4& b) { return vllong4(a) * b; } // scalar is broadcast to all lanes first
+
+  __forceinline vllong4 operator &(const vllong4& a, const vllong4& b) { return _mm256_and_si256(a, b); }
+  __forceinline vllong4 operator &(const vllong4& a, long long      b) { return a & vllong4(b); }
+  __forceinline vllong4 operator &(long long      a, const vllong4& b) { return vllong4(a) & b; }
+
+  __forceinline vllong4 operator |(const vllong4& a, const vllong4& b) { return _mm256_or_si256(a, b); }
+  __forceinline vllong4 operator |(const vllong4& a, long long      b) { return a | vllong4(b); }
+  __forceinline vllong4 operator |(long long      a, const vllong4& b) { return vllong4(a) | b; }
+
+  __forceinline vllong4 operator ^(const vllong4& a, const vllong4& b) { return _mm256_xor_si256(a, b); }
+  __forceinline vllong4 operator ^(const vllong4& a, long long      b) { return a ^ vllong4(b); }
+  __forceinline vllong4 operator ^(long long      a, const vllong4& b) { return vllong4(a) ^ b; }
+
+  __forceinline vllong4 operator <<(const vllong4& a, long long n) { return _mm256_slli_epi64(a, (int)n); }
+  //__forceinline vllong4 operator >>(const vllong4& a, long long n) { return _mm256_srai_epi64(a, n); }
+
+  __forceinline vllong4 operator <<(const vllong4& a, const vllong4& n) { return _mm256_sllv_epi64(a, n); }
+  //__forceinline vllong4 operator >>(const vllong4& a, const vllong4& n) { return _mm256_srav_epi64(a, n); }
+  //__forceinline vllong4 sra(const vllong4& a, long long b) { return _mm256_srai_epi64(a, b); }
+
+  __forceinline vllong4 srl(const vllong4& a, long long b) { return _mm256_srli_epi64(a, (int)b); }
+  
+  //__forceinline vllong4 min(const vllong4& a, const vllong4& b) { return _mm256_min_epi64(a, b); }
+  //__forceinline vllong4 min(const vllong4& a, long long      b) { return min(a,vllong4(b)); }
+  //__forceinline vllong4 min(long long      a, const vllong4& b) { return min(vllong4(a),b); }
+
+  //__forceinline vllong4 max(const vllong4& a, const vllong4& b) { return _mm256_max_epi64(a, b); }
+  //__forceinline vllong4 max(const vllong4& a, long long      b) { return max(a,vllong4(b)); }
+  //__forceinline vllong4 max(long long      a, const vllong4& b) { return max(vllong4(a),b); }
+
+#if defined(__AVX512VL__)
+  __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_and_epi64(c,m,a,b); }
+  __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return _mm256_mask_or_epi64(c,m,a,b); }
+#else
+  __forceinline vllong4 mask_and(const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a & b, c); }
+  __forceinline vllong4 mask_or (const vboold4& m, const vllong4& c, const vllong4& a, const vllong4& b) { return select(m, a | b, c); }
+#endif
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong4& operator +=(vllong4& a, const vllong4& b) { return a = a + b; }
+  __forceinline vllong4& operator +=(vllong4& a, long long      b) { return a = a + b; }
+  
+  __forceinline vllong4& operator -=(vllong4& a, const vllong4& b) { return a = a - b; }
+  __forceinline vllong4& operator -=(vllong4& a, long long      b) { return a = a - b; }
+
+  __forceinline vllong4& operator *=(vllong4& a, const vllong4& b) { return a = a * b; }
+  __forceinline vllong4& operator *=(vllong4& a, long long      b) { return a = a * b; }
+  
+  __forceinline vllong4& operator &=(vllong4& a, const vllong4& b) { return a = a & b; }
+  __forceinline vllong4& operator &=(vllong4& a, long long      b) { return a = a & b; }
+  
+  __forceinline vllong4& operator |=(vllong4& a, const vllong4& b) { return a = a | b; }
+  __forceinline vllong4& operator |=(vllong4& a, long long      b) { return a = a | b; }
+  
+  __forceinline vllong4& operator <<=(vllong4& a, long long      b) { return a = a << b; }
+  //__forceinline vllong4& operator >>=(vllong4& a, long long      b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return _mm256_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+#else
+  __forceinline vboold4 operator ==(const vllong4& a, const vllong4& b) { return _mm256_cmpeq_epi64(a,b); }
+  __forceinline vboold4 operator !=(const vllong4& a, const vllong4& b) { return !(a == b); }
+  __forceinline vboold4 operator > (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(a,b); }
+  __forceinline vboold4 operator < (const vllong4& a, const vllong4& b) { return _mm256_cmpgt_epi64(b,a); }
+  __forceinline vboold4 operator >=(const vllong4& a, const vllong4& b) { return !(a < b); }
+  __forceinline vboold4 operator <=(const vllong4& a, const vllong4& b) { return !(a > b); }
+#endif
+
+  __forceinline vboold4 operator ==(const vllong4& a, long long      b) { return a == vllong4(b); }
+  __forceinline vboold4 operator ==(long long      a, const vllong4& b) { return vllong4(a) == b; }
+
+  __forceinline vboold4 operator !=(const vllong4& a, long long      b) { return a != vllong4(b); }
+  __forceinline vboold4 operator !=(long long      a, const vllong4& b) { return vllong4(a) != b; }
+
+  __forceinline vboold4 operator > (const vllong4& a, long long      b) { return a >  vllong4(b); }
+  __forceinline vboold4 operator > (long long      a, const vllong4& b) { return vllong4(a) >  b; }
+
+  __forceinline vboold4 operator < (const vllong4& a, long long      b) { return a <  vllong4(b); }
+  __forceinline vboold4 operator < (long long      a, const vllong4& b) { return vllong4(a) <  b; }
+
+  __forceinline vboold4 operator >=(const vllong4& a, long long      b) { return a >= vllong4(b); }
+  __forceinline vboold4 operator >=(long long      a, const vllong4& b) { return vllong4(a) >= b; }
+
+  __forceinline vboold4 operator <=(const vllong4& a, long long      b) { return a <= vllong4(b); }
+  __forceinline vboold4 operator <=(long long      a, const vllong4& b) { return vllong4(a) <= b; }
+
+  __forceinline vboold4 eq(const vllong4& a, const vllong4& b) { return a == b; }
+  __forceinline vboold4 ne(const vllong4& a, const vllong4& b) { return a != b; }
+  __forceinline vboold4 lt(const vllong4& a, const vllong4& b) { return a <  b; }
+  __forceinline vboold4 ge(const vllong4& a, const vllong4& b) { return a >= b; }
+  __forceinline vboold4 gt(const vllong4& a, const vllong4& b) { return a >  b; }
+  __forceinline vboold4 le(const vllong4& a, const vllong4& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+  __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return _mm256_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  __forceinline vboold4 eq(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a == b); }
+  __forceinline vboold4 ne(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a != b); }
+  __forceinline vboold4 lt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <  b); }
+  __forceinline vboold4 ge(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >= b); }
+  __forceinline vboold4 gt(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a >  b); }
+  __forceinline vboold4 le(const vboold4& mask, const vllong4& a, const vllong4& b) { return mask & (a <= b); }
+#endif
+
+  __forceinline void xchg(const vboold4& m, vllong4& a, vllong4& b) { // swaps the lanes of a and b where m is set; other lanes are left unchanged
+    const vllong4 c = a; a = select(m,b,a); b = select(m,c,b); // c keeps the old a so b can receive it after a has been overwritten
+  }
+
+  __forceinline vboold4 test(const vllong4& a, const vllong4& b) { // per-lane mask: set where (a & b) has at least one bit set in that 64-bit lane
+#if defined(__AVX512VL__)
+    return _mm256_test_epi64_mask(a,b);
+#else
+    return (a & b) != vllong4(_mm256_setzero_si256()); // per-lane compare; _mm256_testz_si256 only gives one scalar flag (1 when a&b is all zero), which mismatched the AVX-512 path
+#endif
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int i0, int i1> // i0, i1 in {0,1}: element picked for the even / odd lane of each 128-bit half
+  __forceinline vllong4 shuffle(const vllong4& v) {
+    return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(v), (i1 << 3) | (i0 << 2) | (i1 << 1) | i0)); // immediate repeats the (i0,i1) pattern for both 128-bit halves; casts are bit-preserving
+  }
+
+  template<int i>
+  __forceinline vllong4 shuffle(const vllong4& v) {
+    return shuffle<i, i>(v);
+  }
+
+  template<int i0, int i1> // i0 selects the 128-bit half of v placed in the low half of the result, i1 the one placed in the high half
+  __forceinline vllong4 shuffle2(const vllong4& v) {
+    return _mm256_castpd_si256(_mm256_permute2f128_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(v), (i1 << 4) | i0)); // both sources are v, so the immediate only chooses between v's low and high half
+  }
+
+  __forceinline long long toScalar(const vllong4& v) {
+    return _mm_cvtsi128_si64(_mm256_castsi256_si128(v));
+  }
+
+#if defined(__AVX512VL__)
+  __forceinline vllong4 permute(const vllong4& a, const __m256i& index) { // gathers lanes of a according to the per-lane indices in index
+    // workaround for GCC 7.x: use the two-source permute with both sources set to a, so the source-select bit of each index cannot change the result
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+    return _mm256_permutex2var_epi64(a,index,a);
+#else
+    return _mm256_permutexvar_epi64(index,a);
+#endif
+  }
+
+  __forceinline vllong4 permutex2var(const vllong4& index, const vllong4& a, const vllong4& b) { // two-source permute; note this wrapper takes index first
+    return _mm256_permutex2var_epi64(a,index,b); // the intrinsic expects (a, index, b), i.e. index in the middle
+  }
+
+#endif
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+  
+
+  __forceinline vllong4 vreduce_and2(const vllong4& x) { return x & shuffle<1,0>(x); } // AND of the two lanes inside each 128-bit half, replicated to both lanes of that half
+  __forceinline vllong4 vreduce_and (const vllong4& y) { const vllong4 x = vreduce_and2(y); return x & shuffle2<1,0>(x); } // then combine the two halves: every lane holds the AND of all 4 lanes
+
+  __forceinline vllong4 vreduce_or2(const vllong4& x) { return x | shuffle<1,0>(x); } // OR of the two lanes inside each 128-bit half
+  __forceinline vllong4 vreduce_or (const vllong4& y) { const vllong4 x = vreduce_or2(y); return x | shuffle2<1,0>(x); } // every lane holds the OR of all 4 lanes
+
+  __forceinline vllong4 vreduce_add2(const vllong4& x) { return x + shuffle<1,0>(x); } // sum of the two lanes inside each 128-bit half
+  __forceinline vllong4 vreduce_add (const vllong4& y) { const vllong4 x = vreduce_add2(y); return x + shuffle2<1,0>(x); } // every lane holds the sum of all 4 lanes
+
+  __forceinline long long reduce_add(const vllong4& a) { return toScalar(vreduce_add(a)); } // scalar sum of all lanes (lane 0 of the vector reduction)
+  __forceinline long long reduce_or (const vllong4& a) { return toScalar(vreduce_or(a)); } // scalar OR of all lanes
+  __forceinline long long reduce_and(const vllong4& a) { return toScalar(vreduce_and(a)); } // scalar AND of all lanes
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vllong4& v)
+  {
+    // unrolled: emits the four lanes as "<v0, v1, v2, v3>" and hands the stream back
+    cout << "<" << v[0] << ", " << v[1];
+    cout << ", " << v[2] << ", " << v[3] << ">";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h b/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h
new file mode 100644
index 0000000000..76dddd8991
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vllong8_avx512.h
@@ -0,0 +1,381 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{ 
+  /* 8-wide AVX-512 64-bit long long type */
+  template<>
+  struct vllong<8>
+  {
+    ALIGNED_STRUCT_(64);
+        
+    typedef vboold8 Bool;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {             // data
+      __m512i v; 
+      long long i[8];
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+       
+    __forceinline vllong() {}
+    __forceinline vllong(const vllong8& t) { v = t.v; }
+    __forceinline vllong8& operator =(const vllong8& f) { v = f.v; return *this; }
+
+    __forceinline vllong(const __m512i& t) { v = t; }
+    __forceinline operator __m512i() const { return v; }
+    __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); }
+
+    __forceinline vllong(long long i) {
+      v = _mm512_set1_epi64(i);
+    }
+    
+    __forceinline vllong(long long a, long long b, long long c, long long d) { // fills the 8 lanes with the repeated pattern a, b, c, d, a, b, c, d
+      v = _mm512_set4_epi64(d,c,b,a);      // _mm512_set4_epi64 lists the highest element of the 4-element pattern first, hence the reversed order
+    }
+
+    __forceinline vllong(long long a0, long long a1, long long a2, long long a3,
+                         long long a4, long long a5, long long a6, long long a7)
+    {
+      v = _mm512_set_epi64(a7,a6,a5,a4,a3,a2,a1,a0);
+    }
+   
+    __forceinline vllong(const vllong<4>& i) {
+      v = _mm512_broadcast_i64x4(i);
+    }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vllong(ZeroTy) : v(_mm512_setzero_epi32()) {}
+    __forceinline vllong(OneTy)  : v(_mm512_set1_epi64(1)) {}
+    __forceinline vllong(StepTy) : v(_mm512_set_epi64(7,6,5,4,3,2,1,0)) {}
+    __forceinline vllong(ReverseStepTy) : v(_mm512_setr_epi64(7,6,5,4,3,2,1,0)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vllong8& a) {
+      _mm512_stream_si512((__m512i*)ptr,a);
+    }
+
+    static __forceinline vllong8 loadu(const void* addr) {
+      return _mm512_loadu_si512(addr);
+    }
+
+    static __forceinline vllong8 load(const vllong8* addr) {
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vllong8 load(const long long* addr) {
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vllong8 load(const uint8_t* ptr) { // zero-extends the 8 bytes at ptr into the 8 64-bit lanes; ptr needs no particular alignment
+      return _mm512_cvtepu8_epi64(_mm_loadl_epi64((const __m128i*)ptr)); // load exactly the 8 bytes consumed instead of dereferencing an aligned 16-byte __m128i (over-read / alignment fault)
+    }
+
+    static __forceinline void store(void* ptr, const vllong8& v) {
+      _mm512_store_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vllong8& v) {
+      _mm512_storeu_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(const vboold8& mask, long long* ptr, const vllong8& f) {
+      _mm512_mask_storeu_epi64(ptr,mask,f);
+    }
+
+    static __forceinline void store(const vboold8& mask, void* addr, const vllong8& v2) {
+      _mm512_mask_store_epi64(addr,mask,v2);
+    }
+
+    /* pass by value to avoid compiler generating inefficient code */
+    static __forceinline void storeu_compact(const vboold8 mask, void* addr, const vllong8& reg) {
+      _mm512_mask_compressstoreu_epi64(addr,mask,reg);
+    }
+
+    static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& v) {
+      return _mm512_mask_compress_epi64(v,mask,v);
+    }
+
+    static __forceinline vllong8 compact64bit(const vboold8& mask, vllong8& dest, const vllong8& source) {
+      return _mm512_mask_compress_epi64(dest,mask,source);
+    }
+
+    static __forceinline vllong8 compact(const vboold8& mask, vllong8& v) {
+      return _mm512_mask_compress_epi64(v,mask,v);
+    }
+
+    static __forceinline vllong8 compact(const vboold8& mask, const vllong8& a, vllong8& b) {
+      return _mm512_mask_compress_epi64(a,mask,b);
+    }
+
+    static __forceinline vllong8 expand(const vboold8& mask, const vllong8& a, vllong8& b) {
+      return _mm512_mask_expand_epi64(b,mask,a);
+    }
+
+    static __forceinline vllong8 broadcast64bit(size_t v) {
+      return _mm512_set1_epi64(v);
+    }
+
+    static __forceinline size_t extract64bit(const vllong8& v)
+    {
+      return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));
+    }
+
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline       long long& operator [](size_t index)       { assert(index < 8); return i[index]; }
+    __forceinline const long long& operator [](size_t index) const { assert(index < 8); return i[index]; }
+
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold8 asBool(const vllong8& a) { return _mm512_movepi64_mask(a); }
+
+  __forceinline vllong8 operator +(const vllong8& a) { return a; }
+  __forceinline vllong8 operator -(const vllong8& a) { return _mm512_sub_epi64(_mm512_setzero_epi32(), a); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8 operator +(const vllong8& a, const vllong8& b) { return _mm512_add_epi64(a, b); }
+  __forceinline vllong8 operator +(const vllong8& a, long long      b) { return a + vllong8(b); }
+  __forceinline vllong8 operator +(long long      a, const vllong8& b) { return vllong8(a) + b; }
+
+  __forceinline vllong8 operator -(const vllong8& a, const vllong8& b) { return _mm512_sub_epi64(a, b); }
+  __forceinline vllong8 operator -(const vllong8& a, long long      b) { return a - vllong8(b); }
+  __forceinline vllong8 operator -(long long      a, const vllong8& b) { return vllong8(a) - b; }
+
+  __forceinline vllong8 operator *(const vllong8& a, const vllong8& b) { return _mm512_mullo_epi64(a, b); }
+  __forceinline vllong8 operator *(const vllong8& a, long long      b) { return a * vllong8(b); }
+  __forceinline vllong8 operator *(long long      a, const vllong8& b) { return vllong8(a) * b; }
+
+  __forceinline vllong8 operator &(const vllong8& a, const vllong8& b) { return _mm512_and_epi64(a, b); }
+  __forceinline vllong8 operator &(const vllong8& a, long long      b) { return a & vllong8(b); }
+  __forceinline vllong8 operator &(long long      a, const vllong8& b) { return vllong8(a) & b; }
+
+  __forceinline vllong8 operator |(const vllong8& a, const vllong8& b) { return _mm512_or_epi64(a, b); }
+  __forceinline vllong8 operator |(const vllong8& a, long long      b) { return a | vllong8(b); }
+  __forceinline vllong8 operator |(long long      a, const vllong8& b) { return vllong8(a) | b; }
+
+  __forceinline vllong8 operator ^(const vllong8& a, const vllong8& b) { return _mm512_xor_epi64(a, b); }
+  __forceinline vllong8 operator ^(const vllong8& a, long long      b) { return a ^ vllong8(b); }
+  __forceinline vllong8 operator ^(long long      a, const vllong8& b) { return vllong8(a) ^ b; }
+
+  __forceinline vllong8 operator <<(const vllong8& a, long long n) { return _mm512_slli_epi64(a, n); }
+  __forceinline vllong8 operator >>(const vllong8& a, long long n) { return _mm512_srai_epi64(a, n); }
+
+  __forceinline vllong8 operator <<(const vllong8& a, const vllong8& n) { return _mm512_sllv_epi64(a, n); }
+  __forceinline vllong8 operator >>(const vllong8& a, const vllong8& n) { return _mm512_srav_epi64(a, n); }
+
+  __forceinline vllong8 sll (const vllong8& a, long long b) { return _mm512_slli_epi64(a, b); }
+  __forceinline vllong8 sra (const vllong8& a, long long b) { return _mm512_srai_epi64(a, b); }
+  __forceinline vllong8 srl (const vllong8& a, long long b) { return _mm512_srli_epi64(a, b); }
+
+  __forceinline vllong8 min(const vllong8& a, const vllong8& b) { return _mm512_min_epi64(a, b); }
+  __forceinline vllong8 min(const vllong8& a, long long      b) { return min(a,vllong8(b)); }
+  __forceinline vllong8 min(long long      a, const vllong8& b) { return min(vllong8(a),b); }
+
+  __forceinline vllong8 max(const vllong8& a, const vllong8& b) { return _mm512_max_epi64(a, b); }
+  __forceinline vllong8 max(const vllong8& a, long long      b) { return max(a,vllong8(b)); }
+  __forceinline vllong8 max(long long      a, const vllong8& b) { return max(vllong8(a),b); }
+  
+  __forceinline vllong8 mask_add(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_add_epi64(c,m,a,b); }
+  __forceinline vllong8 mask_sub(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_sub_epi64(c,m,a,b); }
+
+  __forceinline vllong8 mask_and(const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_and_epi64(c,m,a,b); }
+  __forceinline vllong8 mask_or (const vboold8& m, const vllong8& c, const vllong8& a, const vllong8& b) { return _mm512_mask_or_epi64(c,m,a,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8& operator +=(vllong8& a, const vllong8&  b) { return a = a + b; }
+  __forceinline vllong8& operator +=(vllong8& a, long long       b) { return a = a + b; }
+  
+  __forceinline vllong8& operator -=(vllong8& a, const vllong8&  b) { return a = a - b; }
+  __forceinline vllong8& operator -=(vllong8& a, long long       b) { return a = a - b; }
+
+  __forceinline vllong8& operator *=(vllong8& a, const vllong8&  b) { return a = a * b; }
+  __forceinline vllong8& operator *=(vllong8& a, long long       b) { return a = a * b; }
+  
+  __forceinline vllong8& operator &=(vllong8& a, const vllong8&  b) { return a = a & b; }
+  __forceinline vllong8& operator &=(vllong8& a, long long       b) { return a = a & b; }
+  
+  __forceinline vllong8& operator |=(vllong8& a, const vllong8&  b) { return a = a | b; }
+  __forceinline vllong8& operator |=(vllong8& a, long long       b) { return a = a | b; }
+  
+  __forceinline vllong8& operator <<=(vllong8& a, long long b) { return a = a << b; }
+  __forceinline vllong8& operator >>=(vllong8& a, long long b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboold8 operator ==(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 operator ==(const vllong8& a, long long      b) { return a == vllong8(b); }
+  __forceinline vboold8 operator ==(long long      a, const vllong8& b) { return vllong8(a) == b; }
+  
+  __forceinline vboold8 operator !=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 operator !=(const vllong8& a, long long      b) { return a != vllong8(b); }
+  __forceinline vboold8 operator !=(long long      a, const vllong8& b) { return vllong8(a) != b; }
+  
+  __forceinline vboold8 operator < (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 operator < (const vllong8& a, long long      b) { return a <  vllong8(b); }
+  __forceinline vboold8 operator < (long long      a, const vllong8& b) { return vllong8(a) <  b; }
+  
+  __forceinline vboold8 operator >=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 operator >=(const vllong8& a, long long      b) { return a >= vllong8(b); }
+  __forceinline vboold8 operator >=(long long      a, const vllong8& b) { return vllong8(a) >= b; }
+
+  __forceinline vboold8 operator > (const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 operator > (const vllong8& a, long long      b) { return a >  vllong8(b); }
+  __forceinline vboold8 operator > (long long      a, const vllong8& b) { return vllong8(a) >  b; }
+
+  __forceinline vboold8 operator <=(const vllong8& a, const vllong8& b) { return _mm512_cmp_epi64_mask(a,b,_MM_CMPINT_LE); }
+  __forceinline vboold8 operator <=(const vllong8& a, long long      b) { return a <= vllong8(b); }
+  __forceinline vboold8 operator <=(long long      a, const vllong8& b) { return vllong8(a) <= b; }
+
+  __forceinline vboold8 eq(const vllong8& a, const vllong8& b) { return a == b; } // named aliases of the comparison operators defined above
+  __forceinline vboold8 ne(const vllong8& a, const vllong8& b) { return a != b; }
+  __forceinline vboold8 lt(const vllong8& a, const vllong8& b) { return a <  b; }
+  __forceinline vboold8 ge(const vllong8& a, const vllong8& b) { return a >= b; }
+  __forceinline vboold8 gt(const vllong8& a, const vllong8& b) { return a >  b; }
+  __forceinline vboold8 le(const vllong8& a, const vllong8& b) { return a <= b; }
+    
+  __forceinline vboold8 eq(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_EQ); }
+  __forceinline vboold8 ne(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_NE); }
+  __forceinline vboold8 lt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LT); }
+  __forceinline vboold8 ge(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GE); }
+  __forceinline vboold8 gt(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_GT); }
+  __forceinline vboold8 le(const vboold8 mask, const vllong8& a, const vllong8& b) { return _mm512_mask_cmp_epi64_mask(mask,a,b,_MM_CMPINT_LE); }
+
+  /* Lane-wise pick: lanes whose bit is set in m come from t, the others from f. */
+  __forceinline vllong8 select(const vboold8& m, const vllong8& t, const vllong8& f)
+  { return _mm512_mask_blend_epi64(m,f,t); }
+
+  /* Swaps a and b in the lanes flagged by m; unflagged lanes keep their values. */
+  __forceinline void xchg(const vboold8& m, vllong8& a, vllong8& b)
+  { const vllong8 old_a = a; a = select(m,b,a); b = select(m,old_a,b); }
+
+  /* Masked bit test: among the lanes enabled by m, flags those where (a & b) != 0. */
+  __forceinline vboold8 test(const vboold8& m, const vllong8& a, const vllong8& b)
+  { return _mm512_mask_test_epi64_mask(m,a,b); }
+
+  /* Bit test: flags every lane where (a & b) != 0. */
+  __forceinline vboold8 test(const vllong8& a, const vllong8& b)
+  { return _mm512_test_epi64_mask(a,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Pair shuffle applied to every 128-bit block: the even lane of each block
+     reads the block's element i0 and the odd lane reads its element i1. */
+  template<int i0, int i1> __forceinline vllong8 shuffle(const vllong8& v) {
+    enum { pair = i0 | (i1 << 1), imm = pair | (pair << 2) | (pair << 4) | (pair << 6) };
+    return _mm512_castpd_si512(_mm512_permute_pd(_mm512_castsi512_pd(v), imm));
+  }
+
+  /* Single-index form: both lanes of every 128-bit block read the block's element i. */
+  template<int i> __forceinline vllong8 shuffle(const vllong8& v) { return shuffle<i, i>(v); }
+
+  /* In-half permutation: within each 256-bit half, result lane k reads element ik of that half. */
+  template<int i0, int i1, int i2, int i3> __forceinline vllong8 shuffle(const vllong8& v) {
+    return _mm512_permutex_epi64(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+
+  /* 256-bit half shuffle: the low half of the result is half i0 of v, the high half is half i1. */
+  template<int i0, int i1> __forceinline vllong8 shuffle4(const vllong8& v) {
+    return _mm512_shuffle_i64x2(v, v, _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2));
+  }
+
+  /* Single-index form: both halves of the result are half i of v. */
+  template<int i> __forceinline vllong8 shuffle4(const vllong8& v) {
+    return shuffle4<i, i>(v);
+  }
+
+  /* Treats a:b as one 16-lane value (a in the upper lanes), shifts it right by i
+     64-bit lanes and returns the low eight lanes. */
+  template<int i> __forceinline vllong8 align_shift_right(const vllong8& a, const vllong8& b)
+  { return _mm512_alignr_epi64(a, b, i); }
+
+  /* Returns lane 0 as a scalar. */
+  __forceinline long long toScalar(const vllong8& v)
+  { return _mm_cvtsi128_si64(_mm512_castsi512_si128(v)); }
+
+  /* Widens the eight 32-bit values in the low 256 bits of a to 64 bits, filling with zeros. */
+  __forceinline vllong8 zeroExtend32Bit(const __m512i& a)
+  { return _mm512_cvtepu32_epi64(_mm512_castsi512_si256(a)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vllong8 vreduce_min2(vllong8 x) { const vllong8 y = shuffle<1,0,3,2>(x); return min(x, y); }                  // combine neighbouring lanes
+  __forceinline vllong8 vreduce_min4(vllong8 x) { const vllong8 y = vreduce_min2(x); return min(y, shuffle<2,3,0,1>(y)); }    // ... within each 256-bit half
+  __forceinline vllong8 vreduce_min (vllong8 x) { const vllong8 y = vreduce_min4(x); return min(y, shuffle4<1,0>(y)); }       // ... across both halves: every lane holds the result
+
+  __forceinline vllong8 vreduce_max2(vllong8 x) { const vllong8 y = shuffle<1,0,3,2>(x); return max(x, y); }
+  __forceinline vllong8 vreduce_max4(vllong8 x) { const vllong8 y = vreduce_max2(x); return max(y, shuffle<2,3,0,1>(y)); }
+  __forceinline vllong8 vreduce_max (vllong8 x) { const vllong8 y = vreduce_max4(x); return max(y, shuffle4<1,0>(y)); }
+
+  __forceinline vllong8 vreduce_and2(vllong8 x) { const vllong8 y = shuffle<1,0,3,2>(x); return x & y; }
+  __forceinline vllong8 vreduce_and4(vllong8 x) { const vllong8 y = vreduce_and2(x); return y & shuffle<2,3,0,1>(y); }
+  __forceinline vllong8 vreduce_and (vllong8 x) { const vllong8 y = vreduce_and4(x); return y & shuffle4<1,0>(y); }
+
+  __forceinline vllong8 vreduce_or2(vllong8 x) { const vllong8 y = shuffle<1,0,3,2>(x); return x | y; }
+  __forceinline vllong8 vreduce_or4(vllong8 x) { const vllong8 y = vreduce_or2(x); return y | shuffle<2,3,0,1>(y); }
+  __forceinline vllong8 vreduce_or (vllong8 x) { const vllong8 y = vreduce_or4(x); return y | shuffle4<1,0>(y); }
+
+  __forceinline vllong8 vreduce_add2(vllong8 x) { const vllong8 y = shuffle<1,0,3,2>(x); return x + y; }
+  __forceinline vllong8 vreduce_add4(vllong8 x) { const vllong8 y = vreduce_add2(x); return y + shuffle<2,3,0,1>(y); }
+  __forceinline vllong8 vreduce_add (vllong8 x) { const vllong8 y = vreduce_add4(x); return y + shuffle4<1,0>(y); }
+
+  __forceinline long long reduce_min(const vllong8& v) { const vllong8 r = vreduce_min(v); return toScalar(r); } // scalar forms read lane 0 of the vector reduction
+  __forceinline long long reduce_max(const vllong8& v) { const vllong8 r = vreduce_max(v); return toScalar(r); }
+  __forceinline long long reduce_and(const vllong8& v) { const vllong8 r = vreduce_and(v); return toScalar(r); }
+  __forceinline long long reduce_or (const vllong8& v) { const vllong8 r = vreduce_or (v); return toScalar(r); }
+  __forceinline long long reduce_add(const vllong8& v) { const vllong8 r = vreduce_add(v); return toScalar(r); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Lane permutation operations
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Lane gather: result lane k is the lane of v selected by index[k]. */
+  __forceinline vllong8 permute(const vllong8& v, const vllong8& index)
+  { return _mm512_permutexvar_epi64(index,v); }
+
+  /* Permutes a by the reverse_step index vector (presumably 7..0, i.e. reversed lane order). */
+  __forceinline vllong8 reverse(const vllong8& a)
+  { const vllong8 order(reverse_step); return permute(a,order); }
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  /* Prints the eight lanes as "<l0, l1, ..., l7>". */
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vllong8& v) {
+    for (size_t lane=0; lane<8; lane++)
+      cout << (lane == 0 ? "<" : ", ") << v[lane];
+    cout << ">";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h b/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h
new file mode 100644
index 0000000000..39752611bb
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vuint16_avx512.h
@@ -0,0 +1,443 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{ 
+  /* 16-wide AVX-512 unsigned integer type: sixteen 32-bit unsigned lanes held in one __m512i */
+  template<>
+  struct vuint<16>
+  {
+    ALIGNED_STRUCT_(64);
+
+    typedef vboolf16 Bool;
+    typedef vuint16  UInt;
+    typedef vfloat16 Float;
+
+    enum  { size = 16 }; // number of SIMD elements
+    union {              // data
+      __m512i v;           // whole-register view
+      unsigned int i[16];  // per-lane view
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint() {} // lanes are left uninitialized
+    __forceinline vuint(const vuint16& t) { v = t.v; }
+    __forceinline vuint16& operator =(const vuint16& f) { v = f.v; return *this; }
+
+    __forceinline vuint(const __m512i& t) { v = t; }
+    __forceinline operator __m512i() const { return v; }
+    __forceinline operator __m256i() const { return _mm512_castsi512_si256(v); } // low 256 bits (lanes 0..7)
+
+    __forceinline vuint(unsigned int i) { // broadcasts i to all 16 lanes
+      v = _mm512_set1_epi32(i);
+    }
+
+    __forceinline vuint(const vuint4& i) { // repeats the 4 lanes in every 128-bit block
+      v = _mm512_broadcast_i32x4(i);
+    }
+
+    __forceinline vuint(const vuint8& i) { // repeats the 8 lanes in both 256-bit halves
+      v = _mm512_castps_si512(_mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castsi256_pd(i))));
+    }
+
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) { // pattern a,b,c,d repeated in every 128-bit block
+      v = _mm512_set4_epi32(d,c,b,a);
+    }
+
+    __forceinline vuint(unsigned int a0 , unsigned int a1 , unsigned int a2 , unsigned int a3,
+                        unsigned int a4 , unsigned int a5 , unsigned int a6 , unsigned int a7,
+                        unsigned int a8 , unsigned int a9 , unsigned int a10, unsigned int a11,
+                        unsigned int a12, unsigned int a13, unsigned int a14, unsigned int a15)
+    {
+      v = _mm512_set_epi32(a15,a14,a13,a12,a11,a10,a9,a8,a7,a6,a5,a4,a3,a2,a1,a0); // set_epi32 lists the highest lane first, so a0 lands in lane 0
+    }
+
+    __forceinline explicit vuint(const __m512& f) { // converts each float lane to an unsigned 32-bit integer
+      v = _mm512_cvtps_epu32(f);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint(ZeroTy) : v(_mm512_setzero_epi32()) {}
+    __forceinline vuint(OneTy)  : v(_mm512_set1_epi32(1)) {}
+    __forceinline vuint(StepTy) : v(_mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {}         // lane k holds k
+    __forceinline vuint(ReverseStepTy) : v(_mm512_setr_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0)) {} // lane k holds 15-k
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline void store_nt(void* __restrict__ ptr, const vuint16& a) { // non-temporal (streaming) store
+      _mm512_stream_si512((__m512i*)ptr,a);
+    }
+
+    static __forceinline vuint16 loadu(const void* addr)
+    {
+      return _mm512_loadu_si512(addr);
+    }
+
+    static __forceinline vuint16 loadu(const uint8_t* ptr) { return _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)ptr)); }               // 16 bytes, zero-extended to 32 bits
+    static __forceinline vuint16 loadu(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)ptr)); }   // 16 shorts, zero-extended to 32 bits
+
+    static __forceinline vuint16 load(const vuint16* addr) {
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vuint16 load(const unsigned int* addr) {
+      return _mm512_load_si512(addr);
+    }
+
+    static __forceinline vuint16 load(const unsigned short* ptr) { return _mm512_cvtepu16_epi32(*(const __m256i*)ptr); } // read-only: 16 shorts zero-extended; dereferences ptr as a __m256i
+
+
+    static __forceinline void store(void* ptr, const vuint16& v) {
+      _mm512_store_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(void* ptr, const vuint16& v) {
+      _mm512_storeu_si512(ptr,v);
+    }
+
+    static __forceinline void storeu(const vboolf16& mask, void* ptr, const vuint16& f) { // writes only the lanes enabled in mask
+      _mm512_mask_storeu_epi32(ptr,mask,f);
+    }
+
+    static __forceinline void store(const vboolf16& mask, void* addr, const vuint16& v2) { // writes only the lanes enabled in mask
+      _mm512_mask_store_epi32(addr,mask,v2);
+    }
+
+    /* pass by value to avoid compiler generating inefficient code */
+    static __forceinline void storeu_compact(const vboolf16 mask, void* addr, const vuint16 reg) { // stores the selected lanes contiguously at addr
+      _mm512_mask_compressstoreu_epi32(addr,mask,reg);
+    }
+
+    static __forceinline void storeu_compact_single(const vboolf16 mask, void* addr, vuint16 reg) { // writes 32 bits: lane 0 of the compressed register
+      //_mm512_mask_compressstoreu_epi32(addr,mask,reg);
+      *(float*)addr = mm512_cvtss_f32(_mm512_mask_compress_ps(_mm512_castsi512_ps(reg),mask,_mm512_castsi512_ps(reg)));
+    }
+
+    static __forceinline vuint16 compact64bit(const vboolf16& mask, vuint16& v) { // compresses the register viewed as 64-bit elements
+      return _mm512_mask_compress_epi64(v,mask,v);
+    }
+
+    static __forceinline vuint16 compact(const vboolf16& mask, vuint16& v) { // selected lanes of v packed to the front, rest taken from v
+      return _mm512_mask_compress_epi32(v,mask,v);
+    }
+
+    static __forceinline vuint16 compact(const vboolf16& mask, const vuint16& a, vuint16& b) { // selected lanes of b packed to the front, rest taken from a
+      return _mm512_mask_compress_epi32(a,mask,b);
+    }
+
+    static __forceinline vuint16 expand(const vboolf16& mask, const vuint16& a, vuint16& b) { // leading lanes of a spread into the selected lanes, rest taken from b
+      return _mm512_mask_expand_epi32(b,mask,a);
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint16 gather(const unsigned int* ptr, const vint16& index) { // lane k loads from byte offset index[k]*scale
+      return _mm512_i32gather_epi32(index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint16 gather(const vboolf16& mask, const unsigned int* ptr, const vint16& index) { // lanes not in mask are undefined
+      return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(),mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint16 gather(const vboolf16& mask, vuint16& dest, const unsigned int* ptr, const vint16& index) { // lanes not in mask come from dest
+      return _mm512_mask_i32gather_epi32(dest,mask,index,ptr,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(unsigned int* ptr, const vint16& index, const vuint16& v) { // lane k stores to byte offset index[k]*scale
+      _mm512_i32scatter_epi32((int*)ptr,index,v,scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf16& mask, unsigned int* ptr, const vint16& index, const vuint16& v) { // only lanes enabled in mask are stored
+      _mm512_mask_i32scatter_epi32((int*)ptr,mask,index,v,scale);
+    }
+
+    static __forceinline vuint16 broadcast64bit(size_t v) { // fills the register with eight copies of the 64-bit value
+      return _mm512_set1_epi64(v);
+    }
+
+    static __forceinline size_t extract64bit(const vuint16& v) // lowest 64 bits (lanes 0 and 1)
+    {
+      return _mm_cvtsi128_si64(_mm512_castsi512_si128(v));
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 16); return i[index]; }
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 16); return i[index]; }
+
+    __forceinline unsigned int uint    (size_t index) const { assert(index < 16); return ((unsigned int*)i)[index]; }
+    __forceinline size_t&      uint64_t(size_t index) const { assert(index < 8);  return ((size_t*)i)[index]; } // NOTE: hands out a mutable reference from a const method
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 asBool(const vuint16& a) { return _mm512_movepi32_mask(a); } // mask bit k = most significant bit of lane k
+
+  __forceinline vuint16 operator +(const vuint16& a) { return a; } // identity
+  __forceinline vuint16 operator -(const vuint16& a) { return _mm512_sub_epi32(_mm512_setzero_epi32(), a); } // 0 - a per lane (wraps modulo 2^32)
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint16 operator +(const vuint16& a, const vuint16& b) { const __m512i sum = _mm512_add_epi32(a, b); return sum; } // per lane, wraps modulo 2^32
+  __forceinline vuint16 operator +(const vuint16& a, unsigned int   b) { const vuint16 rhs(b); return a + rhs; }
+  __forceinline vuint16 operator +(unsigned int   a, const vuint16& b) { const vuint16 lhs(a); return lhs + b; }
+
+  __forceinline vuint16 operator -(const vuint16& a, const vuint16& b) { const __m512i diff = _mm512_sub_epi32(a, b); return diff; } // per lane, wraps modulo 2^32
+  __forceinline vuint16 operator -(const vuint16& a, unsigned int   b) { const vuint16 rhs(b); return a - rhs; }
+  __forceinline vuint16 operator -(unsigned int   a, const vuint16& b) { const vuint16 lhs(a); return lhs - b; }
+
+  __forceinline vuint16 operator *(const vuint16& a, const vuint16& b) { return _mm512_mullo_epi32(a, b); } // per-lane product modulo 2^32; mullo covers all 16 lanes, whereas _mm512_mul_epu32 multiplies only the even lanes into 64-bit results
+  __forceinline vuint16 operator *(const vuint16& a, unsigned int   b) { return a * vuint16(b); }
+  __forceinline vuint16 operator *(unsigned int   a, const vuint16& b) { return vuint16(a) * b; }
+
+  __forceinline vuint16 operator &(const vuint16& a, const vuint16& b) { return _mm512_and_si512(a, b); } // bitwise, so the lane width does not matter
+  __forceinline vuint16 operator &(const vuint16& a, unsigned int   b) { const vuint16 rhs(b); return a & rhs; }
+  __forceinline vuint16 operator &(unsigned int   a, const vuint16& b) { const vuint16 lhs(a); return lhs & b; }
+
+  __forceinline vuint16 operator |(const vuint16& a, const vuint16& b) { return _mm512_or_si512(a, b); }
+  __forceinline vuint16 operator |(const vuint16& a, unsigned int   b) { const vuint16 rhs(b); return a | rhs; }
+  __forceinline vuint16 operator |(unsigned int   a, const vuint16& b) { const vuint16 lhs(a); return lhs | b; }
+
+  __forceinline vuint16 operator ^(const vuint16& a, const vuint16& b) { return _mm512_xor_si512(a, b); }
+  __forceinline vuint16 operator ^(const vuint16& a, unsigned int   b) { const vuint16 rhs(b); return a ^ rhs; }
+  __forceinline vuint16 operator ^(unsigned int   a, const vuint16& b) { const vuint16 lhs(a); return lhs ^ b; }
+
+  __forceinline vuint16 operator <<(const vuint16& a, unsigned int n) { const __m512i r = _mm512_slli_epi32(a, n); return r; } // same count for every lane
+  __forceinline vuint16 operator >>(const vuint16& a, unsigned int n) { const __m512i r = _mm512_srli_epi32(a, n); return r; } // logical: zeros shifted in
+
+  __forceinline vuint16 operator <<(const vuint16& a, const vuint16& n) { const __m512i r = _mm512_sllv_epi32(a, n); return r; } // per-lane counts
+  __forceinline vuint16 operator >>(const vuint16& a, const vuint16& n) { const __m512i r = _mm512_srlv_epi32(a, n); return r; }
+
+  __forceinline vuint16 sll (const vuint16& a, unsigned int b) { return a << b; }                  // shift left logical
+  __forceinline vuint16 sra (const vuint16& a, unsigned int b) { return _mm512_srai_epi32(a, b); } // shift right arithmetic: bit 31 is replicated
+  __forceinline vuint16 srl (const vuint16& a, unsigned int b) { return a >> b; }                  // shift right logical
+  
+  __forceinline vuint16 min(const vuint16& a, const vuint16& b) { const __m512i lo = _mm512_min_epu32(a, b); return lo; } // unsigned per-lane minimum
+  __forceinline vuint16 min(const vuint16& a, unsigned int   b) { const vuint16 rhs(b); return min(a,rhs); }
+  __forceinline vuint16 min(unsigned int   a, const vuint16& b) { const vuint16 lhs(a); return min(lhs,b); }
+
+  __forceinline vuint16 max(const vuint16& a, const vuint16& b) { const __m512i hi = _mm512_max_epu32(a, b); return hi; } // unsigned per-lane maximum
+  __forceinline vuint16 max(const vuint16& a, unsigned int   b) { const vuint16 rhs(b); return max(a,rhs); }
+  __forceinline vuint16 max(unsigned int   a, const vuint16& b) { const vuint16 lhs(a); return max(lhs,b); }
+  
+  __forceinline vuint16 mask_add(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_add_epi32(c,mask,a,b); } // a+b where mask is set, c elsewhere; c itself is not modified
+  __forceinline vuint16 mask_sub(const vboolf16& mask, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_sub_epi32(c,mask,a,b); } // a-b where mask is set, c elsewhere
+
+  __forceinline vuint16 mask_and(const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_and_epi32(c,m,a,b); } // a&b where m is set, c elsewhere
+  __forceinline vuint16 mask_or (const vboolf16& m, vuint16& c, const vuint16& a, const vuint16& b) { return _mm512_mask_or_epi32(c,m,a,b); } // a|b where m is set, c elsewhere
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint16& operator +=(vuint16& a, const vuint16& b) { a = a + b; return a; } // each compound form applies the binary operator, then stores back into a
+  __forceinline vuint16& operator +=(vuint16& a, unsigned int   b) { a = a + b; return a; }
+
+  __forceinline vuint16& operator -=(vuint16& a, const vuint16& b) { a = a - b; return a; }
+  __forceinline vuint16& operator -=(vuint16& a, unsigned int   b) { a = a - b; return a; }
+
+  __forceinline vuint16& operator *=(vuint16& a, const vuint16& b) { a = a * b; return a; }
+  __forceinline vuint16& operator *=(vuint16& a, unsigned int   b) { a = a * b; return a; }
+
+  __forceinline vuint16& operator &=(vuint16& a, const vuint16& b) { a = a & b; return a; }
+  __forceinline vuint16& operator &=(vuint16& a, unsigned int   b) { a = a & b; return a; }
+
+  __forceinline vuint16& operator |=(vuint16& a, const vuint16& b) { a = a | b; return a; }
+  __forceinline vuint16& operator |=(vuint16& a, unsigned int   b) { a = a | b; return a; }
+
+  __forceinline vuint16& operator <<=(vuint16& a, unsigned int b) { a = a << b; return a; }
+  __forceinline vuint16& operator >>=(vuint16& a, unsigned int b) { a = a >> b; return a; }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf16 operator ==(const vuint16& a, const vuint16& b) { return _mm512_cmpeq_epu32_mask(a,b); } // unsigned 32-bit lanes, one mask bit per lane
+  __forceinline vboolf16 operator ==(const vuint16& a, unsigned int   b) { const vuint16 rhs(b); return a == rhs; }
+  __forceinline vboolf16 operator ==(unsigned int   a, const vuint16& b) { const vuint16 lhs(a); return lhs == b; }
+
+  __forceinline vboolf16 operator !=(const vuint16& a, const vuint16& b) { return _mm512_cmpneq_epu32_mask(a,b); }
+  __forceinline vboolf16 operator !=(const vuint16& a, unsigned int   b) { const vuint16 rhs(b); return a != rhs; }
+  __forceinline vboolf16 operator !=(unsigned int   a, const vuint16& b) { const vuint16 lhs(a); return lhs != b; }
+
+  __forceinline vboolf16 operator < (const vuint16& a, const vuint16& b) { return _mm512_cmplt_epu32_mask(a,b); }
+  __forceinline vboolf16 operator < (const vuint16& a, unsigned int   b) { const vuint16 rhs(b); return a <  rhs; }
+  __forceinline vboolf16 operator < (unsigned int   a, const vuint16& b) { const vuint16 lhs(a); return lhs <  b; }
+
+  __forceinline vboolf16 operator >=(const vuint16& a, const vuint16& b) { return _mm512_cmpge_epu32_mask(a,b); }
+  __forceinline vboolf16 operator >=(const vuint16& a, unsigned int   b) { const vuint16 rhs(b); return a >= rhs; }
+  __forceinline vboolf16 operator >=(unsigned int   a, const vuint16& b) { const vuint16 lhs(a); return lhs >= b; }
+
+  __forceinline vboolf16 operator > (const vuint16& a, const vuint16& b) { return _mm512_cmpgt_epu32_mask(a,b); }
+  __forceinline vboolf16 operator > (const vuint16& a, unsigned int   b) { const vuint16 rhs(b); return a >  rhs; }
+  __forceinline vboolf16 operator > (unsigned int   a, const vuint16& b) { const vuint16 lhs(a); return lhs >  b; }
+
+  __forceinline vboolf16 operator <=(const vuint16& a, const vuint16& b) { return _mm512_cmple_epu32_mask(a,b); }
+  __forceinline vboolf16 operator <=(const vuint16& a, unsigned int   b) { const vuint16 rhs(b); return a <= rhs; }
+  __forceinline vboolf16 operator <=(unsigned int   a, const vuint16& b) { const vuint16 lhs(a); return lhs <= b; }
+
+  __forceinline vboolf16 eq(const vuint16& a, const vuint16& b) { return _mm512_cmpeq_epu32_mask(a,b);  } // function-call spellings of ==, !=, <, >=, >, <=
+  __forceinline vboolf16 ne(const vuint16& a, const vuint16& b) { return _mm512_cmpneq_epu32_mask(a,b); }
+  __forceinline vboolf16 lt(const vuint16& a, const vuint16& b) { return _mm512_cmplt_epu32_mask(a,b);  }
+  __forceinline vboolf16 ge(const vuint16& a, const vuint16& b) { return _mm512_cmpge_epu32_mask(a,b);  }
+  __forceinline vboolf16 gt(const vuint16& a, const vuint16& b) { return _mm512_cmpgt_epu32_mask(a,b);  }
+  __forceinline vboolf16 le(const vuint16& a, const vuint16& b) { return _mm512_cmple_epu32_mask(a,b);  }
+  /* Masked forms: a lane whose bit is clear in mask always reports 0. */
+  __forceinline vboolf16 eq(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmpeq_epu32_mask(mask,a,b);  }
+  __forceinline vboolf16 ne(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmpneq_epu32_mask(mask,a,b); }
+  __forceinline vboolf16 lt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmplt_epu32_mask(mask,a,b);  }
+  __forceinline vboolf16 ge(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmpge_epu32_mask(mask,a,b);  }
+  __forceinline vboolf16 gt(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmpgt_epu32_mask(mask,a,b);  }
+  __forceinline vboolf16 le(const vboolf16 mask, const vuint16& a, const vuint16& b) { return _mm512_mask_cmple_epu32_mask(mask,a,b);  }
+    
+ 
+  /* Lane-wise pick: lanes whose bit is set in m come from t, the others from f. */
+  __forceinline vuint16 select(const vboolf16& m, const vuint16& t, const vuint16& f)
+  { return _mm512_mask_blend_epi32(m,f,t); }
+
+  /* Swaps a and b in the lanes flagged by m; unflagged lanes keep their values. */
+  __forceinline void xchg(const vboolf16& m, vuint16& a, vuint16& b)
+  { const vuint16 old_a = a; a = select(m,b,a); b = select(m,old_a,b); }
+
+  /* Masked bit test: among the lanes enabled by m, flags those where (a & b) != 0. */
+  __forceinline vboolf16 test(const vboolf16& m, const vuint16& a, const vuint16& b)
+  { return _mm512_mask_test_epi32_mask(m,a,b); }
+
+  /* Bit test: flags every lane where (a & b) != 0. */
+  __forceinline vboolf16 test(const vuint16& a, const vuint16& b)
+  { return _mm512_test_epi32_mask(a,b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  /* Broadcasts element i of every 128-bit block to the four lanes of that block. */
+  template<int i> __forceinline vuint16 shuffle(const vuint16& v) {
+    const __m512 f = _mm512_castsi512_ps(v); return _mm512_castps_si512(_mm512_permute_ps(f, _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  /* Within every 128-bit block, result lane k reads the block's element ik. */
+  template<int i0, int i1, int i2, int i3> __forceinline vuint16 shuffle(const vuint16& v) {
+    const __m512 f = _mm512_castsi512_ps(v); return _mm512_castps_si512(_mm512_permute_ps(f, _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  /* Copies 128-bit block i of v into all four blocks of the result. */
+  template<int i> __forceinline vuint16 shuffle4(const vuint16& v) {
+    const __m512 f = _mm512_castsi512_ps(v); return _mm512_castps_si512(_mm512_shuffle_f32x4(f, f, _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  /* Block permutation: 128-bit block k of the result is block ik of v. */
+  template<int i0, int i1, int i2, int i3> __forceinline vuint16 shuffle4(const vuint16& v) {
+    const __m512 f = _mm512_castsi512_ps(v); return _mm512_castps_si512(_mm512_shuffle_f32x4(f, f, _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  /* Treats a:b as one 32-lane value (a in the upper lanes), shifts it right by i
+     32-bit lanes and returns the low sixteen lanes. */
+  template<int i> __forceinline vuint16 align_shift_right(const vuint16& a, const vuint16& b)
+  { return _mm512_alignr_epi32(a, b, i); }
+
+  /* Returns lane 0 as a scalar. */
+  __forceinline unsigned int toScalar(const vuint16& v)
+  { return _mm_cvtsi128_si32(_mm512_castsi512_si128(v)); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint16 vreduce_min2(vuint16 x) { const vuint16 y = shuffle<1,0,3,2>(x); return min(x, y); }                    // combine neighbouring lanes
+  __forceinline vuint16 vreduce_min4(vuint16 x) { const vuint16 y = vreduce_min2(x); return min(y, shuffle<2,3,0,1>(y)); }      // ... within each 128-bit block
+  __forceinline vuint16 vreduce_min8(vuint16 x) { const vuint16 y = vreduce_min4(x); return min(y, shuffle4<1,0,3,2>(y)); }     // ... within each 256-bit half
+  __forceinline vuint16 vreduce_min (vuint16 x) { const vuint16 y = vreduce_min8(x); return min(y, shuffle4<2,3,0,1>(y)); }     // ... across halves: every lane holds the result
+
+  __forceinline vuint16 vreduce_max2(vuint16 x) { const vuint16 y = shuffle<1,0,3,2>(x); return max(x, y); }
+  __forceinline vuint16 vreduce_max4(vuint16 x) { const vuint16 y = vreduce_max2(x); return max(y, shuffle<2,3,0,1>(y)); }
+  __forceinline vuint16 vreduce_max8(vuint16 x) { const vuint16 y = vreduce_max4(x); return max(y, shuffle4<1,0,3,2>(y)); }
+  __forceinline vuint16 vreduce_max (vuint16 x) { const vuint16 y = vreduce_max8(x); return max(y, shuffle4<2,3,0,1>(y)); }
+
+  __forceinline vuint16 vreduce_and2(vuint16 x) { const vuint16 y = shuffle<1,0,3,2>(x); return x & y; }
+  __forceinline vuint16 vreduce_and4(vuint16 x) { const vuint16 y = vreduce_and2(x); return y & shuffle<2,3,0,1>(y); }
+  __forceinline vuint16 vreduce_and8(vuint16 x) { const vuint16 y = vreduce_and4(x); return y & shuffle4<1,0,3,2>(y); }
+  __forceinline vuint16 vreduce_and (vuint16 x) { const vuint16 y = vreduce_and8(x); return y & shuffle4<2,3,0,1>(y); }
+
+  __forceinline vuint16 vreduce_or2(vuint16 x) { const vuint16 y = shuffle<1,0,3,2>(x); return x | y; }
+  __forceinline vuint16 vreduce_or4(vuint16 x) { const vuint16 y = vreduce_or2(x); return y | shuffle<2,3,0,1>(y); }
+  __forceinline vuint16 vreduce_or8(vuint16 x) { const vuint16 y = vreduce_or4(x); return y | shuffle4<1,0,3,2>(y); }
+  __forceinline vuint16 vreduce_or (vuint16 x) { const vuint16 y = vreduce_or8(x); return y | shuffle4<2,3,0,1>(y); }
+
+  __forceinline vuint16 vreduce_add2(vuint16 x) { const vuint16 y = shuffle<1,0,3,2>(x); return x + y; }
+  __forceinline vuint16 vreduce_add4(vuint16 x) { const vuint16 y = vreduce_add2(x); return y + shuffle<2,3,0,1>(y); }
+  __forceinline vuint16 vreduce_add8(vuint16 x) { const vuint16 y = vreduce_add4(x); return y + shuffle4<1,0,3,2>(y); }
+  __forceinline vuint16 vreduce_add (vuint16 x) { const vuint16 y = vreduce_add8(x); return y + shuffle4<2,3,0,1>(y); }
+
+  __forceinline unsigned int reduce_min(const vuint16& v) { const vuint16 r = vreduce_min(v); return toScalar(r); } // scalar forms read lane 0 of the vector reduction
+  __forceinline unsigned int reduce_max(const vuint16& v) { const vuint16 r = vreduce_max(v); return toScalar(r); }
+  __forceinline unsigned int reduce_and(const vuint16& v) { const vuint16 r = vreduce_and(v); return toScalar(r); }
+  __forceinline unsigned int reduce_or (const vuint16& v) { const vuint16 r = vreduce_or (v); return toScalar(r); }
+  __forceinline unsigned int reduce_add(const vuint16& v) { const vuint16 r = vreduce_add(v); return toScalar(r); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Lane permutations and prefix sums
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  /* Lane gather: result lane k is the lane of v selected by index[k]. */
+  __forceinline vuint16 permute(vuint16 v, vuint16 index)
+  { return _mm512_permutexvar_epi32(index,v); }
+
+  /* Reverses the lane order by permuting with the reverse_step index vector (15..0). */
+  __forceinline vuint16 reverse(const vuint16& a)
+  { const vuint16 order(reverse_step); return permute(a,order); }
+
+  /* Inclusive prefix sum: lane k of the result holds a[0] + a[1] + ... + a[k]. */
+  __forceinline vuint16 prefix_sum(const vuint16& a) {
+    const vuint16 zeros(zero);
+    vuint16 sum = a;
+    sum += align_shift_right<15>(sum,zeros); // lanes moved up by 1, zero-filled from below
+    sum += align_shift_right<14>(sum,zeros); // ... by 2
+    sum += align_shift_right<12>(sum,zeros); // ... by 4
+    sum += align_shift_right<8>(sum,zeros);  // ... by 8
+    return sum;
+  }
+
+  /* Inclusive suffix sum: lane k of the result holds a[k] + a[k+1] + ... + a[15]. */
+  __forceinline vuint16 reverse_prefix_sum(const vuint16& a) {
+    const vuint16 zeros(zero);
+    vuint16 sum = a;
+    sum += align_shift_right<1>(zeros,sum); // lanes moved down by 1, zero-filled from above
+    sum += align_shift_right<2>(zeros,sum); // ... by 2
+    sum += align_shift_right<4>(zeros,sum); // ... by 4
+    sum += align_shift_right<8>(zeros,sum); // ... by 8
+    return sum;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+  
+  /* Prints the sixteen lanes as "<l0, l1, ..., l15>". */
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint16& v) {
+    for (int lane=0; lane<16; lane++)
+      cout << (lane == 0 ? "<" : ", ") << v[lane];
+    cout << ">";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h b/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h
new file mode 100644
index 0000000000..a3f393ebf2
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vuint4_sse2.h
@@ -0,0 +1,499 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../math/math.h"
+
+namespace embree
+{
+  /* 4-wide SSE unsigned integer type: four 32-bit lanes stored in one __m128i */
+  template<>
+  struct vuint<4>
+  {
+    ALIGNED_STRUCT_(16);
+    
+    typedef vboolf4 Bool;
+    typedef vuint4   Int;
+    typedef vfloat4 Float;
+
+    enum  { size = 4 }; // number of SIMD elements
+    union { __m128i v; unsigned int i[4]; }; // data: register view (v) and per-lane view (i) of the same storage
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+    
+    __forceinline vuint() {} // lanes are left uninitialized
+    __forceinline vuint(const vuint4& a) { v = a.v; }
+    __forceinline vuint4& operator =(const vuint4& a) { v = a.v; return *this; }
+
+    __forceinline vuint(const __m128i a) : v(a) {}
+    __forceinline operator const __m128i&() const { return v; }
+    __forceinline operator       __m128i&()       { return v; }
+
+
+    __forceinline vuint(unsigned int a) : v(_mm_set1_epi32(a)) {} // same value in every lane
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm_set_epi32(d, c, b, a)) {} // lane 0 = a ... lane 3 = d
+
+#if defined(__AVX512VL__)
+    __forceinline explicit vuint(__m128 a) : v(_mm_cvtps_epu32(a)) {} // numeric float -> unsigned conversion per lane
+#endif
+
+#if defined(__AVX512VL__)
+    __forceinline explicit vuint(const vboolf4& a) : v(_mm_movm_epi32(a)) {} // each mask bit becomes an all-ones or all-zeros lane
+#else
+    __forceinline explicit vuint(const vboolf4& a) : v(_mm_castps_si128((__m128)a)) {} // bit-for-bit reinterpretation of the mask register
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint(ZeroTy)   : v(_mm_setzero_si128()) {}
+    __forceinline vuint(OneTy)    : v(_mm_set1_epi32(1)) {}
+    __forceinline vuint(PosInfTy) : v(_mm_set1_epi32(unsigned(pos_inf))) {}
+    __forceinline vuint(StepTy)   : v(_mm_set_epi32(3, 2, 1, 0)) {} // lanes hold 0,1,2,3
+    __forceinline vuint(TrueTy)   : v(_mm_set1_epi32(-1)) {} // all bits set; a constant avoids reading v before it is initialized
+    __forceinline vuint(UndefinedTy) : v(_mm_castps_si128(_mm_undefined_ps())) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vuint4 load (const void* a) { return _mm_load_si128((__m128i*)a); } // a must be 16-byte aligned
+    static __forceinline vuint4 loadu(const void* a) { return _mm_loadu_si128((__m128i*)a); } // no alignment requirement
+
+    static __forceinline void store (void* ptr, const vuint4& v) { _mm_store_si128((__m128i*)ptr,v); } // ptr must be 16-byte aligned
+    static __forceinline void storeu(void* ptr, const vuint4& v) { _mm_storeu_si128((__m128i*)ptr,v); } // no alignment requirement
+    // Masked variants: loads yield zero in lanes whose mask is off; the plain-SSE fallback still reads/writes all 16 bytes.
+#if defined(__AVX512VL__)
+    static __forceinline vuint4 load (const vboolf4& mask, const void* ptr) { return _mm_mask_load_epi32 (_mm_setzero_si128(),mask,ptr); }
+    static __forceinline vuint4 loadu(const vboolf4& mask, const void* ptr) { return _mm_mask_loadu_epi32(_mm_setzero_si128(),mask,ptr); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_store_epi32 (ptr,mask,v); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& v) { _mm_mask_storeu_epi32(ptr,mask,v); }
+#elif defined(__AVX__)
+    static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+    static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_castps_si128(_mm_maskload_ps((float*)a,mask)); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { _mm_maskstore_ps((float*)ptr,(__m128i)mask,_mm_castsi128_ps(i)); }
+#else
+    static __forceinline vuint4 load (const vbool4& mask, const void* a) { return _mm_and_si128(_mm_load_si128 ((__m128i*)a),mask); }
+    static __forceinline vuint4 loadu(const vbool4& mask, const void* a) { return _mm_and_si128(_mm_loadu_si128((__m128i*)a),mask); }
+
+    static __forceinline void store (const vboolf4& mask, void* ptr, const vuint4& i) { store (ptr,select(mask,i,load (ptr))); } // read-modify-write
+    static __forceinline void storeu(const vboolf4& mask, void* ptr, const vuint4& i) { storeu(ptr,select(mask,i,loadu(ptr))); } // read-modify-write
+#endif
+
+#if defined(__aarch64__)
+    static __forceinline vuint4 load(const uint8_t* ptr) {
+        return _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+    static __forceinline vuint4 loadu(const uint8_t* ptr) {
+        return _mm_load4epu8_epi32(((__m128i*)ptr));
+    }
+#elif defined(__SSE4_1__)
+    static __forceinline vuint4 load(const uint8_t* ptr) {
+      return _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); // NOTE(review): loads 8 bytes although only the low 4 are widened
+    }
+
+    static __forceinline vuint4 loadu(const uint8_t* ptr) {
+      return  _mm_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); // NOTE(review): same 8-byte read as load() above
+    }
+    // NOTE(review): with neither macro defined there is no uint8_t overload; such calls bind to the const void* versions.
+#endif
+
+    static __forceinline vuint4 load(const unsigned short* ptr) { // widens four 16-bit values to four 32-bit lanes
+#if defined(__aarch64__)
+      return _mm_load4epu16_epi32(((__m128i*)ptr));
+#elif defined (__SSE4_1__)
+      return _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i*)ptr)); // 64-bit load: exactly the four values that get widened, no over-read
+#else
+      return vuint4(ptr[0],ptr[1],ptr[2],ptr[3]);
+#endif
+    } 
+
+    static __forceinline void store_uint8(uint8_t* ptr, const vuint4& v) { // writes the four lanes as four bytes
+#if defined(__aarch64__) 
+        uint32x4_t x = uint32x4_t(v.v);
+        uint16x4_t y = vqmovn_u32(x); // saturating narrow 32 -> 16
+        uint8x8_t z = vqmovn_u16(vcombine_u16(y, y)); // saturating narrow 16 -> 8
+        vst1_lane_u32((uint32_t *)ptr, uint32x2_t(z), 0); // the four result bytes go out as one 32-bit store
+#elif defined(__SSE4_1__)
+      __m128i x = v;
+      x = _mm_packus_epi32(x, x); // pack instructions interpret their input as signed
+      x = _mm_packus_epi16(x, x);
+      *(unsigned*)ptr = _mm_cvtsi128_si32(x);
+#else
+      for (size_t i=0;i<4;i++)
+        ptr[i] = (uint8_t)v[i]; // NOTE(review): plain truncation; lanes above 255 give different results on the three paths
+#endif
+    }
+
+    static __forceinline void store_uint8(unsigned short* ptr, const vuint4& v) { // despite the name, writes four 16-bit values
+#if defined(__aarch64__)
+        uint32x4_t x = (uint32x4_t)v.v;
+        uint16x4_t y = vqmovn_u32(x); // saturating narrow 32 -> 16
+        vst1_u16(ptr, y);
+#else
+      for (size_t i=0;i<4;i++)
+        ptr[i] = (unsigned short)v[i]; // NOTE(review): truncates, whereas the NEON path saturates
+#endif
+    }
+
+    static __forceinline vuint4 load_nt(void* ptr) { // non-temporal load where available, ordinary aligned load otherwise
+#if (defined(__aarch64__)) || defined(__SSE4_1__)
+      return _mm_stream_load_si128((__m128i*)ptr); 
+#else
+      return _mm_load_si128((__m128i*)ptr); 
+#endif
+    }
+    
+    static __forceinline void store_nt(void* ptr, const vuint4& v) { // non-temporal store on x86 SSE4.1, ordinary aligned store otherwise
+#if !defined(__aarch64__) && defined(__SSE4_1__)
+      _mm_stream_ps((float*)ptr, _mm_castsi128_ps(v)); 
+#else
+      _mm_store_si128((__m128i*)ptr,v);
+#endif
+    }
+
+    template<int scale = 4> // scale = byte stride multiplied onto each index
+    static __forceinline vuint4 gather(const unsigned int* ptr, const vint4& index) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_i32gather_epi32((const int*)ptr, index, scale);
+#else
+      return vuint4(
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[0]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[1]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[2]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[3]));
+#endif
+    }
+
+    template<int scale = 4> // scale = byte stride multiplied onto each index
+    static __forceinline vuint4 gather(const vboolf4& mask, const unsigned int* ptr, const vint4& index) {
+      vuint4 r = zero; // lanes whose mask is off keep this zero
+#if defined(__AVX512VL__)
+      return _mm_mmask_i32gather_epi32(r, mask, index, ptr, scale);
+#elif defined(__AVX2__) && !defined(__aarch64__)
+      return _mm_mask_i32gather_epi32(r, (const int*)ptr, index, mask, scale);
+#else
+      if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]);
+      return r;
+#endif
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access & Select
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 4); return i[index]; }
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 4); return i[index]; }
+
+    friend __forceinline vuint4 select(const vboolf4& m, const vuint4& t, const vuint4& f) { // per lane: m ? t : f
+#if defined(__AVX512VL__)
+      return _mm_mask_blend_epi32(m, (__m128i)f, (__m128i)t);
+#elif defined(__SSE4_1__)
+      return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), m)); 
+#else
+      return _mm_or_si128(_mm_and_si128(m, t), _mm_andnot_si128(m, f)); 
+#endif
+    }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 asBool(const vuint4& a) { return _mm_movepi32_mask(a); } // mask bit k = most significant bit of lane k
+#else
+  __forceinline vboolf4 asBool(const vuint4& a) { return _mm_castsi128_ps(a); } // register bits reinterpreted unchanged
+#endif
+
+  __forceinline vuint4 operator +(const vuint4& a) { return a; }
+  __forceinline vuint4 operator -(const vuint4& a) { return _mm_sub_epi32(_mm_setzero_si128(), a); } // computes 0 - a per lane
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint4 operator +(const vuint4& a, const vuint4& b) { return _mm_add_epi32(a, b); } // lane-wise 32-bit add
+  __forceinline vuint4 operator +(const vuint4& a, unsigned int  b) { return a + vuint4(b); } // scalar is broadcast first
+  __forceinline vuint4 operator +(unsigned int  a, const vuint4& b) { return vuint4(a) + b; }
+
+  __forceinline vuint4 operator -(const vuint4& a, const vuint4& b) { return _mm_sub_epi32(a, b); } // lane-wise 32-bit subtract
+  __forceinline vuint4 operator -(const vuint4& a, unsigned int  b) { return a - vuint4(b); }
+  __forceinline vuint4 operator -(unsigned int  a, const vuint4& b) { return vuint4(a) - b; }
+
+//#if defined(__SSE4_1__)
+//  __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return _mm_mullo_epu32(a, b); }
+//#else
+//  __forceinline vuint4 operator *(const vuint4& a, const vuint4& b) { return vuint4(a[0]*b[0],a[1]*b[1],a[2]*b[2],a[3]*b[3]); }
+//#endif
+//  __forceinline vuint4 operator *(const vuint4& a, unsigned int  b) { return a * vuint4(b); }
+//  __forceinline vuint4 operator *(unsigned int  a, const vuint4& b) { return vuint4(a) * b; }
+
+  __forceinline vuint4 operator &(const vuint4& a, const vuint4& b) { return _mm_and_si128(a, b); } // bitwise AND
+  __forceinline vuint4 operator &(const vuint4& a, unsigned int  b) { return a & vuint4(b); }
+  __forceinline vuint4 operator &(unsigned int  a, const vuint4& b) { return vuint4(a) & b; }
+
+  __forceinline vuint4 operator |(const vuint4& a, const vuint4& b) { return _mm_or_si128(a, b); } // bitwise OR
+  __forceinline vuint4 operator |(const vuint4& a, unsigned int  b) { return a | vuint4(b); }
+  __forceinline vuint4 operator |(unsigned int  a, const vuint4& b) { return vuint4(a) | b; }
+
+  __forceinline vuint4 operator ^(const vuint4& a, const vuint4& b) { return _mm_xor_si128(a, b); } // bitwise XOR
+  __forceinline vuint4 operator ^(const vuint4& a, unsigned int  b) { return a ^ vuint4(b); }
+  __forceinline vuint4 operator ^(unsigned int  a, const vuint4& b) { return vuint4(a) ^ b; }
+
+  __forceinline vuint4 operator <<(const vuint4& a, unsigned int n) { return _mm_slli_epi32(a, n); } // every lane shifted by the same count n
+  __forceinline vuint4 operator >>(const vuint4& a, unsigned int n) { return _mm_srli_epi32(a, n); } // logical shift: zeros enter from the top
+
+  __forceinline vuint4 sll (const vuint4& a, unsigned int b) { return _mm_slli_epi32(a, b); } // same as operator <<
+  __forceinline vuint4 sra (const vuint4& a, unsigned int b) { return _mm_srai_epi32(a, b); } // arithmetic shift: replicates each lane's top bit although lanes are unsigned
+  __forceinline vuint4 srl (const vuint4& a, unsigned int b) { return _mm_srli_epi32(a, b); } // same as operator >>
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint4& operator +=(vuint4& a, const vuint4& b) { return a = a + b; } // each compound form applies the binary operator, stores into a, returns a
+  __forceinline vuint4& operator +=(vuint4& a, unsigned int  b) { return a = a + b; }
+  
+  __forceinline vuint4& operator -=(vuint4& a, const vuint4& b) { return a = a - b; }
+  __forceinline vuint4& operator -=(vuint4& a, unsigned int  b) { return a = a - b; }
+
+//#if defined(__SSE4_1__)
+//  __forceinline vuint4& operator *=(vuint4& a, const vuint4& b) { return a = a * b; }
+//  __forceinline vuint4& operator *=(vuint4& a, unsigned int  b) { return a = a * b; }
+//#endif
+  
+  __forceinline vuint4& operator &=(vuint4& a, const vuint4& b) { return a = a & b; }
+  __forceinline vuint4& operator &=(vuint4& a, unsigned int  b) { return a = a & b; }
+  
+  __forceinline vuint4& operator |=(vuint4& a, const vuint4& b) { return a = a | b; }
+  __forceinline vuint4& operator |=(vuint4& a, unsigned int  b) { return a = a | b; }
+  
+  __forceinline vuint4& operator <<=(vuint4& a, unsigned int  b) { return a = a << b; } // shift count is a scalar shared by all lanes
+  __forceinline vuint4& operator >>=(vuint4& a, unsigned int  b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); } // result comes from a mask-producing compare
+  __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_NE); }
+  //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+  //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+  //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+  //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return _mm_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+#else
+  __forceinline vboolf4 operator ==(const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b)); } // equal lanes become all-ones, others all-zeros
+  __forceinline vboolf4 operator !=(const vuint4& a, const vuint4& b) { return !(a == b); } // negation of the equality mask
+  //__forceinline vboolf4 operator < (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmplt_epu32(a, b)); }
+  //__forceinline vboolf4 operator >=(const vuint4& a, const vuint4& b) { return !(a <  b); }
+  //__forceinline vboolf4 operator > (const vuint4& a, const vuint4& b) { return _mm_castsi128_ps(_mm_cmpgt_epu32(a, b)); }
+  //__forceinline vboolf4 operator <=(const vuint4& a, const vuint4& b) { return !(a >  b); }
+#endif
+
+  __forceinline vboolf4 operator ==(const vuint4& a, unsigned int  b) { return a == vuint4(b); } // scalar is broadcast before comparing
+  __forceinline vboolf4 operator ==(unsigned int  a, const vuint4& b) { return vuint4(a) == b; }
+
+  __forceinline vboolf4 operator !=(const vuint4& a, unsigned int  b) { return a != vuint4(b); }
+  __forceinline vboolf4 operator !=(unsigned int  a, const vuint4& b) { return vuint4(a) != b; }
+
+  //__forceinline vboolf4 operator < (const vuint4& a, unsigned int  b) { return a <  vuint4(b); }
+  //__forceinline vboolf4 operator < (unsigned int  a, const vuint4& b) { return vuint4(a) <  b; }
+
+  //__forceinline vboolf4 operator >=(const vuint4& a, unsigned int  b) { return a >= vuint4(b); }
+  //__forceinline vboolf4 operator >=(unsigned int  a, const vuint4& b) { return vuint4(a) >= b; }
+
+  //__forceinline vboolf4 operator > (const vuint4& a, unsigned int  b) { return a >  vuint4(b); }
+  //__forceinline vboolf4 operator > (unsigned int  a, const vuint4& b) { return vuint4(a) >  b; }
+
+  //__forceinline vboolf4 operator <=(const vuint4& a, unsigned int  b) { return a <= vuint4(b); }
+  //__forceinline vboolf4 operator <=(unsigned int  a, const vuint4& b) { return vuint4(a) <= b; }
+
+  __forceinline vboolf4 eq(const vuint4& a, const vuint4& b) { return a == b; } // named alias of operator ==
+  __forceinline vboolf4 ne(const vuint4& a, const vuint4& b) { return a != b; } // named alias of operator !=
+  //__forceinline vboolf4 lt(const vuint4& a, const vuint4& b) { return a <  b; }
+  //__forceinline vboolf4 ge(const vuint4& a, const vuint4& b) { return a >= b; }
+  //__forceinline vboolf4 gt(const vuint4& a, const vuint4& b) { return a >  b; }
+  //__forceinline vboolf4 le(const vuint4& a, const vuint4& b) { return a <= b; }
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); } // lanes outside mask report false
+  __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); }
+  //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); }
+  //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); }
+  //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); }
+  //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return _mm_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else
+  __forceinline vboolf4 eq(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a == b); } // comparison result ANDed with mask
+  __forceinline vboolf4 ne(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a != b); }
+  //__forceinline vboolf4 lt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <  b); }
+  //__forceinline vboolf4 ge(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >= b); }
+  //__forceinline vboolf4 gt(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a >  b); }
+  //__forceinline vboolf4 le(const vboolf4& mask, const vuint4& a, const vuint4& b) { return mask & (a <= b); }
+#endif
+
+  template<int mask> // compile-time lane selector
+  __forceinline vuint4 select(const vuint4& t, const vuint4& f) {
+#if defined(__SSE4_1__) 
+    return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(f), _mm_castsi128_ps(t), mask)); // bit k of mask set -> lane k from t, else from f
+#else
+    return select(vboolf4(mask), t, f); // presumably vboolf4(int) expands the same bit pattern to lanes -- constructor defined elsewhere, confirm
+#endif    
+  }
+
+/*#if defined(__SSE4_1__)
+  __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return _mm_min_epu32(a, b); }
+  __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return _mm_max_epu32(a, b); }
+
+#else
+  __forceinline vuint4 min(const vuint4& a, const vuint4& b) { return select(a < b,a,b); }
+  __forceinline vuint4 max(const vuint4& a, const vuint4& b) { return select(a < b,b,a); }
+#endif
+
+  __forceinline vuint4 min(const vuint4& a, unsigned int  b) { return min(a,vuint4(b)); }
+  __forceinline vuint4 min(unsigned int  a, const vuint4& b) { return min(vuint4(a),b); }
+  __forceinline vuint4 max(const vuint4& a, unsigned int  b) { return max(a,vuint4(b)); }
+  __forceinline vuint4 max(unsigned int  a, const vuint4& b) { return max(vuint4(a),b); }*/
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint4 unpacklo(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpacklo_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } // a0,b0,a1,b1
+  __forceinline vuint4 unpackhi(const vuint4& a, const vuint4& b) { return _mm_castps_si128(_mm_unpackhi_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b))); } // a2,b2,a3,b3
+
+#if defined(__aarch64__)
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& v) { // byte-table lookup; table produced by _MN_SHUFFLE (macro defined elsewhere)
+    return vreinterpretq_s32_u8(vqtbl1q_u8( (uint8x16_t)v.v, _MN_SHUFFLE(i0, i1, i2, i3)));
+  }
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { // two-register table lookup; table produced by _MF_SHUFFLE (macro defined elsewhere)
+    return vreinterpretq_s32_u8(vqtbl2q_u8( (uint8x16x2_t){(uint8x16_t)a.v, (uint8x16_t)b.v}, _MF_SHUFFLE(i0, i1, i2, i3)));
+  }
+#else
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& v) { // result = v[i0], v[i1], v[i2], v[i3]
+    return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
+  }
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint4 shuffle(const vuint4& a, const vuint4& b) { // result = a[i0], a[i1], b[i2], b[i3]
+    return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+#endif
+#if defined(__SSE3__)
+  template<> __forceinline vuint4 shuffle<0, 0, 2, 2>(const vuint4& v) { return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(v))); } // dedicated duplicate-even-lanes instruction
+  template<> __forceinline vuint4 shuffle<1, 1, 3, 3>(const vuint4& v) { return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(v))); } // dedicated duplicate-odd-lanes instruction
+  template<> __forceinline vuint4 shuffle<0, 1, 0, 1>(const vuint4& v) { return _mm_castpd_si128(_mm_movedup_pd (_mm_castsi128_pd(v))); } // duplicates the low 64 bits
+#endif
+
+  template<int i>
+  __forceinline vuint4 shuffle(const vuint4& v) { // copies lane i into all four lanes
+    return shuffle<i,i,i,i>(v);
+  }
+
+#if defined(__aarch64__)
+  template<int src> __forceinline unsigned int extract(const vuint4& b); // declared only; specialized for 0..3 further down
+  template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b); // declared only; specialized for 0..3 further down
+#elif defined(__SSE4_1__)
+  template<int src> __forceinline unsigned int extract(const vuint4& b) { return _mm_extract_epi32(b, src); }
+  template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { return _mm_insert_epi32(a, b, dst); }
+#else
+  template<int src> __forceinline unsigned int extract(const vuint4& b) { return b[src&3]; } // index wrapped into 0..3
+  template<int dst> __forceinline vuint4 insert(const vuint4& a, const unsigned b) { vuint4 c = a; c[dst&3] = b; return c; } // index wrapped into 0..3
+#endif
+
+#if defined(__aarch64__)
+  template<> __forceinline unsigned int extract<0>(const vuint4& b) {
+    return b[0];
+  }
+  template<> __forceinline unsigned int extract<1>(const vuint4& b) {
+    return b[1];
+  }
+  template<> __forceinline unsigned int extract<2>(const vuint4& b) {
+    return b[2];
+  }
+  template<> __forceinline unsigned int extract<3>(const vuint4& b) {
+    return b[3];
+  }
+  // insert<N>: returns a copy of a with lane N replaced by b
+  template<> __forceinline vuint4 insert<0>(const vuint4& a, unsigned b){
+    vuint4 c = a;
+    c[0] = b;
+    return c;
+  }
+  template<> __forceinline vuint4 insert<1>(const vuint4& a, unsigned b){
+    vuint4 c = a;
+    c[1] = b;
+    return c;
+  }
+  template<> __forceinline vuint4 insert<2>(const vuint4& a, unsigned b){
+    vuint4 c = a;
+    c[2] = b;
+    return c;
+  }
+  template<> __forceinline vuint4 insert<3>(const vuint4& a, unsigned b){
+    vuint4 c = a;
+    c[3] = b;
+    return c;
+  }
+  // toScalar: value of lane 0
+  __forceinline unsigned int toScalar(const vuint4& v) {
+    return v[0];
+  }
+#else
+  template<> __forceinline unsigned int extract<0>(const vuint4& b) { return _mm_cvtsi128_si32(b); } // lane 0 via a direct register move
+
+  __forceinline unsigned int toScalar(const vuint4& v) { return _mm_cvtsi128_si32(v); } // value of lane 0
+#endif
+                                                                               
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if 0 // disabled: everything down to the matching #endif is compiled out
+#if defined(__SSE4_1__)
+  // these helpers call min/max on vuint4, whose definitions are commented out earlier in this file
+  __forceinline vuint4 vreduce_min(const vuint4& v) { vuint4 h = min(shuffle<1,0,3,2>(v),v); return min(shuffle<2,3,0,1>(h),h); }
+  __forceinline vuint4 vreduce_max(const vuint4& v) { vuint4 h = max(shuffle<1,0,3,2>(v),v); return max(shuffle<2,3,0,1>(h),h); }
+  __forceinline vuint4 vreduce_add(const vuint4& v) { vuint4 h = shuffle<1,0,3,2>(v)   + v ; return shuffle<2,3,0,1>(h)   + h ; }
+
+  __forceinline unsigned int reduce_min(const vuint4& v) { return toScalar(vreduce_min(v)); }
+  __forceinline unsigned int reduce_max(const vuint4& v) { return toScalar(vreduce_max(v)); }
+  __forceinline unsigned int reduce_add(const vuint4& v) { return toScalar(vreduce_add(v)); }
+
+  __forceinline size_t select_min(const vuint4& v) { return bsf(movemask(v == vreduce_min(v))); }
+  __forceinline size_t select_max(const vuint4& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  //__forceinline size_t select_min(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  //__forceinline size_t select_max(const vboolf4& valid, const vuint4& v) { const vuint4 a = select(valid,v,vuint4(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+#else
+  // scalar fallback: reduces the four lanes one element at a time
+  __forceinline unsigned int reduce_min(const vuint4& v) { return min(v[0],v[1],v[2],v[3]); }
+  __forceinline unsigned int reduce_max(const vuint4& v) { return max(v[0],v[1],v[2],v[3]); }
+  __forceinline unsigned int reduce_add(const vuint4& v) { return v[0]+v[1]+v[2]+v[3]; }
+
+#endif
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint4& a) { // writes the lanes as "<a0, a1, a2, a3>"
+    cout << "<" << a[0]; for (size_t lane=1; lane<4; lane++) { cout << ", " << a[lane]; } return cout << ">";
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/common/simd/vuint8_avx.h b/thirdparty/embree-aarch64/common/simd/vuint8_avx.h
new file mode 100644
index 0000000000..d4e86ae92d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vuint8_avx.h
@@ -0,0 +1,379 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX unsigned integer type */
+  template<>
+  struct vuint<8>
+  {
+    ALIGNED_STRUCT_(32);   
+
+    typedef vboolf8 Bool;
+    typedef vuint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 };        // number of SIMD elements
+    union {                    // data
+      __m256i v;
+      struct { __m128i vl,vh; };
+      unsigned int i[8];
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint() {}
+    __forceinline vuint(const vuint8& a) { v = a.v; }
+    __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; }
+
+    __forceinline vuint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
+    __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+    __forceinline vuint(const __m128i& a, const __m128i& b) : vl(a), vh(b) {}
+ 
+    __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
+    __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {} // element 0 = a ... element 7 = h; last parameter named h so it no longer shadows the union member vh
+
+    __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint(ZeroTy)   : v(_mm256_setzero_si256()) {}
+    __forceinline vuint(OneTy)    : v(_mm256_set1_epi32(1)) {}
+    __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(0xFFFFFFFF)) {}
+    __forceinline vuint(StepTy)   : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
+    __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vuint8 load (const void* a) { return _mm256_castps_si256(_mm256_load_ps((float*)a)); }
+    static __forceinline vuint8 loadu(const void* a) { return _mm256_castps_si256(_mm256_loadu_ps((float*)a)); }
+
+    static __forceinline vuint8 load (const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); }
+    static __forceinline vuint8 loadu(const vboolf8& mask, const void* a) { return _mm256_castps_si256(_mm256_maskload_ps((float*)a,mask)); }
+
+    static __forceinline void store (void* ptr, const vuint8& f) { _mm256_store_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(void* ptr, const vuint8& f) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(f)); }
+    
+#if !defined(__aarch64__)
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask,_mm256_castsi256_ps(f)); }
+#else
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& f) { _mm256_maskstore_ps((float*)ptr,(__m256i)mask.v,_mm256_castsi256_ps(f)); }
+#endif
+    static __forceinline void store_nt(void* ptr, const vuint8& v) {
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline vuint8 load(const uint8_t* ptr) {
+      vuint4 il = vuint4::load(ptr+0);
+      vuint4 ih = vuint4::load(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline vuint8 loadu(const uint8_t* ptr) {
+      vuint4 il = vuint4::loadu(ptr+0);
+      vuint4 ih = vuint4::loadu(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline vuint8 load(const unsigned short* ptr) {
+      vuint4 il = vuint4::load(ptr+0);
+      vuint4 ih = vuint4::load(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline vuint8 loadu(const unsigned short* ptr) {
+      vuint4 il = vuint4::loadu(ptr+0);
+      vuint4 ih = vuint4::loadu(ptr+4);
+      return vuint8(il,ih);
+    }
+
+    static __forceinline void store(uint8_t* ptr, const vuint8& i) {
+      vuint4 il(i.vl);
+      vuint4 ih(i.vh);
+      vuint4::store(ptr + 0,il);
+      vuint4::store(ptr + 4,ih);
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vuint8& v) {
+      // Scalar store: every element is narrowed to unsigned short on the way out.
+      for (size_t lane = 0; lane != 8; ++lane) ptr[lane] = (unsigned short)v[lane];
+    }
+
+    template<int scale = 4> // scale: multiplier applied to each index to form a byte offset from ptr
+    static __forceinline vuint8 gather(const unsigned int* ptr, const vint8& index) {
+      return vuint8( // element k is read with a scalar load at byte offset scale*index[k]
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[0]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[1]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[2]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[3]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[4]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[5]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[6]),
+          *(unsigned int*)(((int8_t*)ptr)+scale*index[7]));
+    }
+
+    template<int scale = 4> // scale: multiplier applied to each index to form a byte offset from ptr
+    static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int* ptr, const vint8& index) {
+      vuint8 r = zero; // elements whose mask[k] is false are never loaded and stay zero
+      if (likely(mask[0])) r[0] = *(unsigned int*)(((int8_t*)ptr)+scale*index[0]);
+      if (likely(mask[1])) r[1] = *(unsigned int*)(((int8_t*)ptr)+scale*index[1]);
+      if (likely(mask[2])) r[2] = *(unsigned int*)(((int8_t*)ptr)+scale*index[2]);
+      if (likely(mask[3])) r[3] = *(unsigned int*)(((int8_t*)ptr)+scale*index[3]);
+      if (likely(mask[4])) r[4] = *(unsigned int*)(((int8_t*)ptr)+scale*index[4]);
+      if (likely(mask[5])) r[5] = *(unsigned int*)(((int8_t*)ptr)+scale*index[5]);
+      if (likely(mask[6])) r[6] = *(unsigned int*)(((int8_t*)ptr)+scale*index[6]);
+      if (likely(mask[7])) r[7] = *(unsigned int*)(((int8_t*)ptr)+scale*index[7]);
+      return r;
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v)
+    {
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v)
+    {
+      if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+    }
+
+
+    static __forceinline vuint8 broadcast64(const long long& a) { return _mm256_set1_epi64x(a); }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; }
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); }
+
+  __forceinline vuint8 operator +(const vuint8& a) { return a; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return vuint8(_mm_add_epi32(a.vl, b.vl), _mm_add_epi32(a.vh, b.vh)); }
+  __forceinline vuint8 operator +(const vuint8& a, unsigned int          b) { return a + vuint8(b); }
+  __forceinline vuint8 operator +(unsigned int          a, const vuint8& b) { return vuint8(a) + b; }
+
+  __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return vuint8(_mm_sub_epi32(a.vl, b.vl), _mm_sub_epi32(a.vh, b.vh)); }
+  __forceinline vuint8 operator -(const vuint8& a, unsigned int          b) { return a - vuint8(b); }
+  __forceinline vuint8 operator -(unsigned int          a, const vuint8& b) { return vuint8(a) - b; }
+
+  //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return vuint8(_mm_mullo_epu32(a.vl, b.vl), _mm_mullo_epu32(a.vh, b.vh)); }
+  //__forceinline vuint8 operator *(const vuint8& a, unsigned int          b) { return a * vuint8(b); }
+  //__forceinline vuint8 operator *(unsigned int          a, const vuint8& b) { return vuint8(a) * b; }
+
+  __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vuint8 operator &(const vuint8& a, unsigned int          b) { return a & vuint8(b); }
+  __forceinline vuint8 operator &(unsigned int          a, const vuint8& b) { return vuint8(a) & b; }
+
+  __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vuint8 operator |(const vuint8& a, unsigned int          b) { return a | vuint8(b); }
+  __forceinline vuint8 operator |(unsigned int          a, const vuint8& b) { return vuint8(a) | b; }
+
+  __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vuint8 operator ^(const vuint8& a, unsigned int          b) { return a ^ vuint8(b); }
+  __forceinline vuint8 operator ^(unsigned int          a, const vuint8& b) { return vuint8(a) ^ b; }
+
+  __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return vuint8(_mm_slli_epi32(a.vl, n), _mm_slli_epi32(a.vh, n)); }
+  __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return vuint8(_mm_srli_epi32(a.vl, n), _mm_srli_epi32(a.vh, n)); } // logical (zero-filling) shift on both 128-bit halves, as required for unsigned elements
+
+  __forceinline vuint8 sll (const vuint8& a, unsigned int b) { return vuint8(_mm_slli_epi32(a.vl, b), _mm_slli_epi32(a.vh, b)); }
+  __forceinline vuint8 sra (const vuint8& a, unsigned int b) { return vuint8(_mm_srai_epi32(a.vl, b), _mm_srai_epi32(a.vh, b)); }
+  __forceinline vuint8 srl (const vuint8& a, unsigned int b) { return vuint8(_mm_srli_epi32(a.vl, b), _mm_srli_epi32(a.vh, b)); }
+  
+  __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return vuint8(_mm_min_epu32(a.vl, b.vl), _mm_min_epu32(a.vh, b.vh)); }
+  __forceinline vuint8 min(const vuint8& a, unsigned int          b) { return min(a,vuint8(b)); }
+  __forceinline vuint8 min(unsigned int          a, const vuint8& b) { return min(vuint8(a),b); }
+
+  __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return vuint8(_mm_max_epu32(a.vl, b.vl), _mm_max_epu32(a.vh, b.vh)); }
+  __forceinline vuint8 max(const vuint8& a, unsigned int          b) { return max(a,vuint8(b)); }
+  __forceinline vuint8 max(unsigned int          a, const vuint8& b) { return max(vuint8(a),b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; }
+  __forceinline vuint8& operator +=(vuint8& a, unsigned int          b) { return a = a + b; }
+  
+  __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; }
+  __forceinline vuint8& operator -=(vuint8& a, unsigned int          b) { return a = a - b; }
+  
+  //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; }
+  //__forceinline vuint8& operator *=(vuint8& a, unsigned int          b) { return a = a * b; }
+  
+  __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; }
+  __forceinline vuint8& operator &=(vuint8& a, unsigned int          b) { return a = a & b; }
+  
+  __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; }
+  __forceinline vuint8& operator |=(vuint8& a, unsigned int          b) { return a = a | b; }
+  
+  __forceinline vuint8& operator <<=(vuint8& a, unsigned int b) { return a = a << b; }
+  __forceinline vuint8& operator >>=(vuint8& a, unsigned int b) { return a = a >> b; }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpeq_epi32 (a.vl, b.vl)),
+                                                                                       _mm_castsi128_ps(_mm_cmpeq_epi32 (a.vh, b.vh))); }
+  __forceinline vboolf8 operator ==(const vuint8& a, unsigned int          b) { return a == vuint8(b); }
+  __forceinline vboolf8 operator ==(unsigned int          a, const vuint8& b) { return vuint8(a) == b; }
+  
+  __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); }
+  __forceinline vboolf8 operator !=(const vuint8& a, unsigned int          b) { return a != vuint8(b); }
+  __forceinline vboolf8 operator !=(unsigned int          a, const vuint8& b) { return vuint8(a) != b; }
+  
+  //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmplt_epu32 (a.vl, b.vl)),
+  //                                                                                     _mm_castsi128_ps(_mm_cmplt_epu32 (a.vh, b.vh))); }
+  //__forceinline vboolf8 operator < (const vuint8& a, unsigned int          b) { return a <  vuint8(b); }
+  //__forceinline vboolf8 operator < (unsigned int          a, const vuint8& b) { return vuint8(a) <  b; }
+  
+  //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a <  b); }
+  //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int          b) { return a >= vuint8(b); }
+  //__forceinline vboolf8 operator >=(unsigned int          a, const vuint8& b) { return vuint8(a) >= b; }
+
+  //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return vboolf8(_mm_castsi128_ps(_mm_cmpgt_epu32 (a.vl, b.vl)),
+  //                                                                                     _mm_castsi128_ps(_mm_cmpgt_epu32 (a.vh, b.vh))); }
+  //__forceinline vboolf8 operator > (const vuint8& a, unsigned int          b) { return a >  vuint8(b); }
+  //__forceinline vboolf8 operator > (unsigned int          a, const vuint8& b) { return vuint8(a) >  b; }
+
+  //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a >  b); }
+  //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int          b) { return a <= vuint8(b); }
+  //__forceinline vboolf8 operator <=(unsigned int          a, const vuint8& b) { return vuint8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; }
+  __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; }
+
+  __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); }
+  __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); }
+
+  __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) {
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m)); 
+  }
+
+  __forceinline vuint8 notand(const vboolf8& m, const vuint8& f) {
+    return _mm256_castps_si256(_mm256_andnot_ps(m, _mm256_castsi256_ps(f))); 
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+  __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
+
+  template<int i>
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1>
+  __forceinline vuint8 shuffle4(const vuint8& v) {
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1>
+  __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) {
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i0, int i1, int i2, int i3>
+  __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) {
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+  __forceinline vuint8 broadcast(const unsigned int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); }
+  template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); }
+  template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); }
+  template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); }
+
+  __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } // returns element 0; NOTE(review): result type is signed int although the elements are unsigned
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+  //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+  __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } // each element plus its neighbor in the same adjacent pair
+  __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } // every element holds the sum of the four elements of its 128-bit half
+  __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } // swaps the two 128-bit halves and adds: every element holds the sum of all eight
+
+  //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); }
+  //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } // sum of all eight elements; NOTE(review): returns int while the vuint4 reduce_add returns unsigned int — confirm callers do not depend on the signedness
+
+  //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+  //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) {
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h b/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h
new file mode 100644
index 0000000000..b2a965448d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/simd/vuint8_avx2.h
@@ -0,0 +1,439 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* 8-wide AVX2 unsigned integer type */
+  template<>
+  struct vuint<8>
+  {
+    ALIGNED_STRUCT_(32);
+        
+    typedef vboolf8 Bool;
+    typedef vuint8   Int;
+    typedef vfloat8 Float;
+
+    enum  { size = 8 }; // number of SIMD elements
+    union {             // data
+      __m256i v;
+      unsigned int i[8];
+    }; 
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint() {}
+    __forceinline vuint(const vuint8& a) { v = a.v; }
+    __forceinline vuint8& operator =(const vuint8& a) { v = a.v; return *this; }
+
+    __forceinline vuint(__m256i a) : v(a) {}
+    __forceinline operator const __m256i&() const { return v; }
+    __forceinline operator       __m256i&()       { return v; }
+
+    __forceinline explicit vuint(const vuint4& a) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
+    __forceinline vuint(const vuint4& a, const vuint4& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+    __forceinline vuint(const __m128i& a, const __m128i& b) : v(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
+ 
+    __forceinline explicit vuint(const unsigned int* a) : v(_mm256_castps_si256(_mm256_loadu_ps((const float*)a))) {}
+    __forceinline vuint(unsigned int a) : v(_mm256_set1_epi32(a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b) : v(_mm256_set_epi32(b, a, b, a, b, a, b, a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d) : v(_mm256_set_epi32(d, c, b, a, d, c, b, a)) {}
+    __forceinline vuint(unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int e, unsigned int f, unsigned int g, unsigned int h) : v(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {}
+
+    __forceinline explicit vuint(__m256 a) : v(_mm256_cvtps_epi32(a)) {}
+
+#if defined(__AVX512VL__)
+    __forceinline explicit vuint(const vboolf8& a) : v(_mm256_movm_epi32(a)) {}
+#else
+    __forceinline explicit vuint(const vboolf8& a) : v(_mm256_castps_si256((__m256)a)) {}
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constants
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline vuint(ZeroTy)   : v(_mm256_setzero_si256()) {}
+    __forceinline vuint(OneTy)    : v(_mm256_set1_epi32(1)) {}
+    __forceinline vuint(PosInfTy) : v(_mm256_set1_epi32(pos_inf)) {} // NOTE(review): the AVX (non-AVX2) variant of this type fills with 0xFFFFFFFF here — confirm pos_inf yields the same bits
+    __forceinline vuint(NegInfTy) : v(_mm256_set1_epi32(neg_inf)) {}
+    __forceinline vuint(StepTy)   : v(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
+    __forceinline vuint(UndefinedTy) : v(_mm256_undefined_si256()) {}
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Loads and Stores
+    ////////////////////////////////////////////////////////////////////////////////
+
+    static __forceinline vuint8 load(const uint8_t* ptr)  { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+    static __forceinline vuint8 loadu(const uint8_t* ptr) { return _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)ptr)); }
+    static __forceinline vuint8 load(const unsigned short* ptr)  { return _mm256_cvtepu16_epi32(_mm_load_si128((__m128i*)ptr)); }
+    static __forceinline vuint8 loadu(const unsigned short* ptr) { return _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)ptr)); }
+
+    static __forceinline vuint8 load(const void* ptr) { return _mm256_load_si256((__m256i*)ptr); }
+    static __forceinline vuint8 loadu(const void* ptr) { return _mm256_loadu_si256((__m256i*)ptr); }
+
+    static __forceinline void store (void* ptr, const vuint8& v) { _mm256_store_si256((__m256i*)ptr,v); }
+    static __forceinline void storeu(void* ptr, const vuint8& v) { _mm256_storeu_ps((float*)ptr,_mm256_castsi256_ps(v)); }
+
+#if defined(__AVX512VL__)
+
+    static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &v) {
+      return _mm256_mask_compress_epi32(v, mask, v);
+    }
+    static __forceinline vuint8 compact(const vboolf8& mask, vuint8 &a, const vuint8& b) {
+      return _mm256_mask_compress_epi32(a, mask, b);
+    }
+
+    static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_mask_load_epi32 (_mm256_setzero_si256(),mask,ptr); }
+    static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_mask_loadu_epi32(_mm256_setzero_si256(),mask,ptr); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_store_epi32 (ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_mask_storeu_epi32(ptr,mask,v); }
+#else
+    static __forceinline vuint8 load (const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+    static __forceinline vuint8 loadu(const vboolf8& mask, const void* ptr) { return _mm256_castps_si256(_mm256_maskload_ps((float*)ptr,mask)); }
+
+    static __forceinline void store (const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+    static __forceinline void storeu(const vboolf8& mask, void* ptr, const vuint8& v) { _mm256_maskstore_epi32((int*)ptr,mask,v); }
+#endif
+    
+    static __forceinline vuint8 load_nt(void* ptr) {
+      return _mm256_stream_load_si256((__m256i*)ptr);
+    }
+
+    static __forceinline void store_nt(void* ptr, const vuint8& v) {
+      _mm256_stream_ps((float*)ptr,_mm256_castsi256_ps(v));
+    }
+
+    static __forceinline void store(uint8_t* ptr, const vuint8& i)
+    {
+      // Scalar store: every element is narrowed to uint8_t on the way out.
+      for (size_t lane = 0; lane != 8; ++lane) ptr[lane] = (uint8_t)i[lane];
+    }
+
+    static __forceinline void store(unsigned short* ptr, const vuint8& v) {
+      // Scalar store: every element is narrowed to unsigned short on the way out.
+      for (size_t lane = 0; lane != 8; ++lane) ptr[lane] = (unsigned short)v[lane];
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint8 gather(const unsigned int *const ptr, const vint8& index) {
+      return _mm256_i32gather_epi32((const int*) ptr, index, scale);
+    }
+
+    template<int scale = 4>
+    static __forceinline vuint8 gather(const vboolf8& mask, const unsigned int *const ptr, const vint8& index) {
+      vuint8 r = zero;
+#if defined(__AVX512VL__)
+      return _mm256_mmask_i32gather_epi32(r, mask, index, (const int*) ptr, scale);
+#else
+      return _mm256_mask_i32gather_epi32(r, (const int*) ptr, index, mask, scale);
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(void* ptr, const vint8& ofs, const vuint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_i32scatter_epi32((int*)ptr, ofs, v, scale);
+#else
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[0]) = v[0];
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[1]) = v[1];
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[2]) = v[2];
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[3]) = v[3];
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[4]) = v[4];
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[5]) = v[5];
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[6]) = v[6];
+      *(unsigned int*)(((int8_t*)ptr) + scale * ofs[7]) = v[7];
+#endif
+    }
+
+    template<int scale = 4>
+    static __forceinline void scatter(const vboolf8& mask, void* ptr, const vint8& ofs, const vuint8& v)
+    {
+#if defined(__AVX512VL__)
+      _mm256_mask_i32scatter_epi32((int*)ptr, mask, ofs, v, scale);
+#else
+      if (likely(mask[0])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[0]) = v[0];
+      if (likely(mask[1])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[1]) = v[1];
+      if (likely(mask[2])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[2]) = v[2];
+      if (likely(mask[3])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[3]) = v[3];
+      if (likely(mask[4])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[4]) = v[4];
+      if (likely(mask[5])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[5]) = v[5];
+      if (likely(mask[6])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[6]) = v[6];
+      if (likely(mask[7])) *(unsigned int*)(((int8_t*)ptr)+scale*ofs[7]) = v[7];
+#endif
+    }
+
+    static __forceinline vuint8 broadcast64(const long long &a) { return _mm256_set1_epi64x(a); }
+    
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Array Access
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline const unsigned int& operator [](size_t index) const { assert(index < 8); return i[index]; }
+    __forceinline       unsigned int& operator [](size_t index)       { assert(index < 8); return i[index]; }
+  };
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Unary Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__)
+  __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_movepi32_mask(a); } // mask bit k = most significant bit of lane k
+#else
+  __forceinline vboolf8 asBool(const vuint8& a) { return _mm256_castsi256_ps(a); } // pure bit cast: the lane bits are reinterpreted, not tested
+#endif
+
+  __forceinline vuint8 operator +(const vuint8& a) { return a; } // unary plus is the identity
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Binary Operators (lane-wise on 8 x 32 bit; a scalar operand is first widened with vuint8(b))
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8 operator +(const vuint8& a, const vuint8& b) { return _mm256_add_epi32(a, b); }
+  __forceinline vuint8 operator +(const vuint8& a, unsigned int          b) { return a + vuint8(b); }
+  __forceinline vuint8 operator +(unsigned int          a, const vuint8& b) { return vuint8(a) + b; }
+
+  __forceinline vuint8 operator -(const vuint8& a, const vuint8& b) { return _mm256_sub_epi32(a, b); }
+  __forceinline vuint8 operator -(const vuint8& a, unsigned int          b) { return a - vuint8(b); }
+  __forceinline vuint8 operator -(unsigned int          a, const vuint8& b) { return vuint8(a) - b; }
+
+  //__forceinline vuint8 operator *(const vuint8& a, const vuint8& b) { return _mm256_mullo_epu32(a, b); }
+  //__forceinline vuint8 operator *(const vuint8& a, unsigned int          b) { return a * vuint8(b); }
+  //__forceinline vuint8 operator *(unsigned int          a, const vuint8& b) { return vuint8(a) * b; }
+
+  __forceinline vuint8 operator &(const vuint8& a, const vuint8& b) { return _mm256_and_si256(a, b); }
+  __forceinline vuint8 operator &(const vuint8& a, unsigned int          b) { return a & vuint8(b); }
+  __forceinline vuint8 operator &(unsigned int          a, const vuint8& b) { return vuint8(a) & b; }
+
+  __forceinline vuint8 operator |(const vuint8& a, const vuint8& b) { return _mm256_or_si256(a, b); }
+  __forceinline vuint8 operator |(const vuint8& a, unsigned int          b) { return a | vuint8(b); }
+  __forceinline vuint8 operator |(unsigned int          a, const vuint8& b) { return vuint8(a) | b; }
+
+  __forceinline vuint8 operator ^(const vuint8& a, const vuint8& b) { return _mm256_xor_si256(a, b); }
+  __forceinline vuint8 operator ^(const vuint8& a, unsigned int          b) { return a ^ vuint8(b); }
+  __forceinline vuint8 operator ^(unsigned int          a, const vuint8& b) { return vuint8(a) ^ b; }
+
+  __forceinline vuint8 operator <<(const vuint8& a, unsigned int n) { return _mm256_slli_epi32(a, n); } // same shift count for all lanes
+  __forceinline vuint8 operator >>(const vuint8& a, unsigned int n) { return _mm256_srli_epi32(a, n); } // logical (zero-filling) shift
+
+  __forceinline vuint8 operator <<(const vuint8& a, const vuint8& n) { return _mm256_sllv_epi32(a, n); } // per-lane shift counts
+  __forceinline vuint8 operator >>(const vuint8& a, const vuint8& n) { return _mm256_srlv_epi32(a, n); }
+
+  __forceinline vuint8 sll(const vuint8& a, unsigned int b) { return _mm256_slli_epi32(a, b); }
+  __forceinline vuint8 sra(const vuint8& a, unsigned int b) { return _mm256_srai_epi32(a, b); } // NOTE(review): arithmetic (sign-filling) shift although lanes are unsigned - confirm intended
+  __forceinline vuint8 srl(const vuint8& a, unsigned int b) { return _mm256_srli_epi32(a, b); }
+
+  __forceinline vuint8 sll(const vuint8& a, const vuint8& b) { return _mm256_sllv_epi32(a, b); }
+  __forceinline vuint8 sra(const vuint8& a, const vuint8& b) { return _mm256_srav_epi32(a, b); } // NOTE(review): arithmetic shift, see the scalar-count sra
+  __forceinline vuint8 srl(const vuint8& a, const vuint8& b) { return _mm256_srlv_epi32(a, b); }
+  
+  __forceinline vuint8 min(const vuint8& a, const vuint8& b) { return _mm256_min_epu32(a, b); } // unsigned lane-wise minimum
+  __forceinline vuint8 min(const vuint8& a, unsigned int          b) { return min(a,vuint8(b)); }
+  __forceinline vuint8 min(unsigned int          a, const vuint8& b) { return min(vuint8(a),b); }
+
+  __forceinline vuint8 max(const vuint8& a, const vuint8& b) { return _mm256_max_epu32(a, b); } // unsigned lane-wise maximum
+  __forceinline vuint8 max(const vuint8& a, unsigned int          b) { return max(a,vuint8(b)); }
+  __forceinline vuint8 max(unsigned int          a, const vuint8& b) { return max(vuint8(a),b); }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Assignment Operators (each applies the matching binary operator, stores the result in a and returns a)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8& operator +=(vuint8& a, const vuint8& b) { return a = a + b; }
+  __forceinline vuint8& operator +=(vuint8& a, unsigned int          b) { return a = a + b; }
+  
+  __forceinline vuint8& operator -=(vuint8& a, const vuint8& b) { return a = a - b; }
+  __forceinline vuint8& operator -=(vuint8& a, unsigned int          b) { return a = a - b; }
+  
+  //__forceinline vuint8& operator *=(vuint8& a, const vuint8& b) { return a = a * b; }
+  //__forceinline vuint8& operator *=(vuint8& a, unsigned int          b) { return a = a * b; }
+  
+  __forceinline vuint8& operator &=(vuint8& a, const vuint8& b) { return a = a & b; }
+  __forceinline vuint8& operator &=(vuint8& a, unsigned int          b) { return a = a & b; }
+  
+  __forceinline vuint8& operator |=(vuint8& a, const vuint8& b) { return a = a | b; }
+  __forceinline vuint8& operator |=(vuint8& a, unsigned int          b) { return a = a | b; }
+  
+  __forceinline vuint8& operator <<=(vuint8& a, const unsigned int b) { return a = a << b; }
+  __forceinline vuint8& operator >>=(vuint8& a, const unsigned int b) { return a = a >> b; } // logical shift, as operator >>
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Comparison Operators + Select (comparisons yield a per-lane vboolf8 mask)
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__AVX512VL__) // AVX-512VL path: unsigned compares that produce mask registers
+  __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_EQ); }
+  __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_NE); }
+  __forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LT); }
+  __forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GE); }
+  __forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_GT); }
+  __forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return _mm256_cmp_epu32_mask(a,b,_MM_CMPINT_LE); }
+
+  __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { // lane k = m[k] ? t[k] : f[k]
+    return _mm256_mask_blend_epi32(m, (__m256i)f, (__m256i)t);
+  }
+#else // fallback path: only == and != are available, the ordered comparisons below are disabled
+  __forceinline vboolf8 operator ==(const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b)); }
+  __forceinline vboolf8 operator !=(const vuint8& a, const vuint8& b) { return !(a == b); }
+  //__forceinline vboolf8 operator < (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(b, a)); }
+  //__forceinline vboolf8 operator >=(const vuint8& a, const vuint8& b) { return !(a <  b); }
+  //__forceinline vboolf8 operator > (const vuint8& a, const vuint8& b) { return _mm256_castsi256_ps(_mm256_cmpgt_epu32(a, b)); }
+  //__forceinline vboolf8 operator <=(const vuint8& a, const vuint8& b) { return !(a >  b); }
+
+  __forceinline vuint8 select(const vboolf8& m, const vuint8& t, const vuint8& f) { // lane k = m[k] ? t[k] : f[k]
+    return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
+  }
+#endif
+
+  template<int mask> // compile-time selection: bit k of mask set picks t[k], otherwise f[k]
+  __forceinline vuint8 select(const vuint8& t, const vuint8& f) {
+    return _mm256_blend_epi32(f, t, mask);
+  }
+
+  __forceinline vboolf8 operator ==(const vuint8& a, unsigned int          b) { return a == vuint8(b); }
+  __forceinline vboolf8 operator ==(unsigned int          a, const vuint8& b) { return vuint8(a) == b; }
+
+  __forceinline vboolf8 operator !=(const vuint8& a, unsigned int          b) { return a != vuint8(b); }
+  __forceinline vboolf8 operator !=(unsigned int          a, const vuint8& b) { return vuint8(a) != b; }
+
+  //__forceinline vboolf8 operator < (const vuint8& a, unsigned int          b) { return a <  vuint8(b); }
+  //__forceinline vboolf8 operator < (unsigned int          a, const vuint8& b) { return vuint8(a) <  b; }
+
+  //__forceinline vboolf8 operator >=(const vuint8& a, unsigned int          b) { return a >= vuint8(b); }
+  //__forceinline vboolf8 operator >=(unsigned int          a, const vuint8& b) { return vuint8(a) >= b; }
+
+  //__forceinline vboolf8 operator > (const vuint8& a, unsigned int          b) { return a >  vuint8(b); }
+  //__forceinline vboolf8 operator > (unsigned int          a, const vuint8& b) { return vuint8(a) >  b; }
+
+  //__forceinline vboolf8 operator <=(const vuint8& a, unsigned int          b) { return a <= vuint8(b); }
+  //__forceinline vboolf8 operator <=(unsigned int          a, const vuint8& b) { return vuint8(a) <= b; }
+
+  __forceinline vboolf8 eq(const vuint8& a, const vuint8& b) { return a == b; } // named forms of the operators above
+  __forceinline vboolf8 ne(const vuint8& a, const vuint8& b) { return a != b; }
+  //__forceinline vboolf8 lt(const vuint8& a, const vuint8& b) { return a <  b; }
+  //__forceinline vboolf8 ge(const vuint8& a, const vuint8& b) { return a >= b; }
+  //__forceinline vboolf8 gt(const vuint8& a, const vuint8& b) { return a >  b; }
+  //__forceinline vboolf8 le(const vuint8& a, const vuint8& b) { return a <= b; }
+
+#if defined(__AVX512VL__) // masked comparisons: the mask argument is passed to the masked compare intrinsic
+  __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_EQ); }
+  __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_NE); }
+  __forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); }
+  __forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GE); }
+  __forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_GT); }
+  __forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return _mm256_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LE); }
+#else // fallback: the comparison result is AND-ed with mask
+  __forceinline vboolf8 eq(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a == b); }
+  __forceinline vboolf8 ne(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a != b); }
+  //__forceinline vboolf8 lt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <  b); }
+  //__forceinline vboolf8 ge(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >= b); }
+  //__forceinline vboolf8 gt(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a >  b); }
+  //__forceinline vboolf8 le(const vboolf8& mask, const vuint8& a, const vuint8& b) { return mask & (a <= b); }
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Movement/Shifting/Shuffling Functions
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline vuint8 unpacklo(const vuint8& a, const vuint8& b) { return _mm256_unpacklo_epi32(a, b); }
+  __forceinline vuint8 unpackhi(const vuint8& a, const vuint8& b) { return _mm256_unpackhi_epi32(a, b); }
+
+  template<int i> // replicates element i of each 128-bit half across that half
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i, i, i, i)));
+  }
+
+  template<int i0, int i1> // rearranges whole 128-bit halves of v: i0 selects the low half of the result, i1 the high half
+  __forceinline vuint8 shuffle4(const vuint8& v) {
+    return _mm256_permute2f128_si256(v, v, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1> // as above, but the selectors index the 128-bit halves of the pair (a, b)
+  __forceinline vuint8 shuffle4(const vuint8& a, const vuint8& b) {
+    return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
+  }
+
+  template<int i0, int i1, int i2, int i3> // same 4-element permutation applied inside each 128-bit half
+  __forceinline vuint8 shuffle(const vuint8& v) {
+    return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(v), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<int i0, int i1, int i2, int i3> // per half: elements i0,i1 taken from a, elements i2,i3 taken from b
+  __forceinline vuint8 shuffle(const vuint8& a, const vuint8& b) {
+    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
+  }
+
+  template<> __forceinline vuint8 shuffle<0, 0, 2, 2>(const vuint8& v) { return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vuint8 shuffle<1, 1, 3, 3>(const vuint8& v) { return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(v))); }
+  template<> __forceinline vuint8 shuffle<0, 1, 0, 1>(const vuint8& v) { return _mm256_castps_si256(_mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(v))))); }
+
+  __forceinline vuint8 broadcast(const unsigned int* ptr) { return _mm256_castps_si256(_mm256_broadcast_ss((const float*)ptr)); } // loads *ptr into all 8 lanes
+
+  template<int i> __forceinline vuint8 insert4(const vuint8& a, const vuint4& b) { return _mm256_insertf128_si256(a, b, i); } // replaces 128-bit half i of a with b
+  template<int i> __forceinline vuint4 extract4(const vuint8& a) { return _mm256_extractf128_si256(a, i); } // returns 128-bit half i
+  template<> __forceinline vuint4 extract4<0>(const vuint8& a) { return _mm256_castsi256_si128(a); }
+
+  __forceinline int toScalar(const vuint8& v) { return _mm_cvtsi128_si32(_mm256_castsi256_si128(v)); } // lane 0, returned as a (signed) int
+
+#if !defined(__aarch64__) // the functions in this section are compiled out on aarch64
+
+  __forceinline vuint8 permute(const vuint8& v, const __m256i& index) { // result[k] = v[index[k]] across the full register
+    return _mm256_permutevar8x32_epi32(v, index);
+  }
+
+  __forceinline vuint8 shuffle(const vuint8& v, const __m256i& index) { // variable permutation within each 128-bit half
+    return _mm256_castps_si256(_mm256_permutevar_ps(_mm256_castsi256_ps(v), index));
+  }
+
+  template<int i> // shifts the concatenation (a,b) right by i 32-bit elements
+  __forceinline vuint8 align_shift_right(const vuint8& a, const vuint8& b) {
+#if defined(__AVX512VL__)
+    return _mm256_alignr_epi32(a, b, i);    
+#else
+    return _mm256_alignr_epi8(a, b, 4*i); // NOTE(review): this intrinsic shifts each 128-bit half separately, unlike the branch above - confirm callers tolerate it
+#endif
+  }
+
+#endif
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reductions (only the additive ones are enabled; min/max variants are commented out)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  //__forceinline vuint8 vreduce_min2(const vuint8& v) { return min(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_min4(const vuint8& v) { vuint8 v1 = vreduce_min2(v); return min(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_min (const vuint8& v) { vuint8 v1 = vreduce_min4(v); return min(v1,shuffle4<1,0>(v1)); }
+
+  //__forceinline vuint8 vreduce_max2(const vuint8& v) { return max(v,shuffle<1,0,3,2>(v)); }
+  //__forceinline vuint8 vreduce_max4(const vuint8& v) { vuint8 v1 = vreduce_max2(v); return max(v1,shuffle<2,3,0,1>(v1)); }
+  //__forceinline vuint8 vreduce_max (const vuint8& v) { vuint8 v1 = vreduce_max4(v); return max(v1,shuffle4<1,0>(v1)); }
+
+  __forceinline vuint8 vreduce_add2(const vuint8& v) { return v + shuffle<1,0,3,2>(v); } // every lane = sum of its adjacent pair
+  __forceinline vuint8 vreduce_add4(const vuint8& v) { vuint8 v1 = vreduce_add2(v); return v1 + shuffle<2,3,0,1>(v1); } // every lane = sum of its 128-bit half
+  __forceinline vuint8 vreduce_add (const vuint8& v) { vuint8 v1 = vreduce_add4(v); return v1 + shuffle4<1,0>(v1); } // every lane = sum of all 8 lanes
+
+  //__forceinline int reduce_min(const vuint8& v) { return toScalar(vreduce_min(v)); }
+  //__forceinline int reduce_max(const vuint8& v) { return toScalar(vreduce_max(v)); }
+  __forceinline int reduce_add(const vuint8& v) { return toScalar(vreduce_add(v)); } // NOTE(review): total is returned as int although the lanes are unsigned
+
+  //__forceinline size_t select_min(const vuint8& v) { return bsf(movemask(v == vreduce_min(v))); }
+  //__forceinline size_t select_max(const vuint8& v) { return bsf(movemask(v == vreduce_max(v))); }
+
+  //__forceinline size_t select_min(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(pos_inf)); return bsf(movemask(valid & (a == vreduce_min(a)))); }
+  //__forceinline size_t select_max(const vboolf8& valid, const vuint8& v) { const vuint8 a = select(valid,v,vuint8(neg_inf)); return bsf(movemask(valid & (a == vreduce_max(a)))); }
+
+  __forceinline vuint8 assign(const vuint4& a) { return _mm256_castsi128_si256(a); } // a becomes the low half; the cast leaves the upper 128 bits undefined
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Output Operators
+  ////////////////////////////////////////////////////////////////////////////////
+
+  __forceinline embree_ostream operator <<(embree_ostream cout, const vuint8& a) { // writes the 8 lanes as "<a0, a1, ..., a7>"
+    return cout << "<" << a[0] << ", " << a[1] << ", " << a[2] << ", " << a[3] << ", " << a[4] << ", " << a[5] << ", " << a[6] << ", " << a[7] << ">";
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/sys/alloc.cpp b/thirdparty/embree-aarch64/common/sys/alloc.cpp
new file mode 100644
index 0000000000..12f143f131
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/alloc.cpp
@@ -0,0 +1,327 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "alloc.h"
+#include "intrinsics.h"
+#include "sysinfo.h"
+#include "mutex.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+  
+namespace embree
+{
+  /*! Allocates size bytes aligned to align (asserted to be a power of two).
+   *  Returns nullptr for size == 0; aborts the process if the allocation fails. */
+  void* alignedMalloc(size_t size, size_t align)
+  {
+    void* mem = nullptr;
+    if (size != 0)
+    {
+      assert((align & (align-1)) == 0);
+      mem = _mm_malloc(size,align);
+      // -- GODOT start --
+      // if (mem == nullptr) throw std::bad_alloc();
+      if (mem == nullptr) abort();
+      // -- GODOT end --
+    }
+    return mem;
+  }
+  
+  /*! Releases memory obtained from alignedMalloc; a null pointer is ignored. */
+  void alignedFree(void* ptr)
+  {
+    if (ptr != nullptr) _mm_free(ptr);
+  }
+
+  static bool huge_pages_enabled = false;
+  static MutexSys os_init_mutex;
+
+  /*! True when huge pages are enabled and rounding bytes up to whole 2MB pages wastes little memory. */
+  __forceinline bool isHugePageCandidate(const size_t bytes)
+  {
+    /* padding that rounding the request up to a multiple of 2MB would add */
+    const size_t rounded = (bytes+PAGE_SIZE_2M-1) & ~size_t(PAGE_SIZE_2M-1);
+    const size_t waste = rounded-bytes;
+    /* use huge pages only when memory overhead is low: at most 1.5% overhead */
+    return huge_pages_enabled && 66*waste < bytes;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef _WIN32
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <malloc.h>
+
+namespace embree
+{
+  /*! Tries to enable SeLockMemoryPrivilege on the token of the current process. Returns true on
+   *  success; with verbose set, the reason of a failure is printed. The token handle is always closed. */
+  bool win_enable_selockmemoryprivilege (bool verbose)
+  {
+    HANDLE hToken;
+    if (!OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY | TOKEN_ADJUST_PRIVILEGES, &hToken)) {
+      if (verbose) std::cout << "WARNING: OpenProcessToken failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl;
+      return false;
+    }
+
+    TOKEN_PRIVILEGES tp;
+    tp.PrivilegeCount = 1;
+    tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+    bool enabled = false;
+    if (!LookupPrivilegeValueW(nullptr, L"SeLockMemoryPrivilege", &tp.Privileges[0].Luid)) {
+      if (verbose) std::cout << "WARNING: LookupPrivilegeValue failed while trying to enable SeLockMemoryPrivilege: " << GetLastError() << std::endl;
+    }
+    else {
+      SetLastError(ERROR_SUCCESS);
+      if (!AdjustTokenPrivileges(hToken, FALSE, &tp, sizeof(tp), nullptr, 0)) {
+        if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed while trying to enable SeLockMemoryPrivilege" << std::endl;
+      }
+      else if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) { /* the call succeeded but the privilege was not granted */
+        if (verbose) std::cout << "WARNING: AdjustTokenPrivileges failed to enable SeLockMemoryPrivilege: Add SeLockMemoryPrivilege for current user and run process in elevated mode (Run as administrator)." << std::endl;
+      }
+      else enabled = true;
+    }
+    CloseHandle(hToken); /* closed only after every GetLastError() query; it used to leak on all paths */
+    return enabled;
+  }
+
+  /*! Configures huge page usage under the os_init mutex. Returns false only when huge
+   *  pages were requested but the large page minimum reported by the OS is not 2MB.
+   *  The verbose flag is not used by this implementation. */
+  bool os_init(bool hugepages, bool verbose)
+  {
+    Lock<MutexSys> lock(os_init_mutex);
+
+    /* nothing to probe when huge pages are not requested */
+    if (!hugepages) {
+      huge_pages_enabled = false;
+      return true;
+    }
+
+    /* enabled exactly when the OS large page size is 2MB */
+    huge_pages_enabled = GetLargePageMinimum() == PAGE_SIZE_2M;
+    return huge_pages_enabled;
+  }
+
+  /*! Allocates bytes of committed read/write memory through VirtualAlloc, trying large pages first
+   *  for huge page candidates; hugepages reports which kind was obtained. Aborts on failure. */
+  void* os_malloc(size_t bytes, bool& hugepages)
+  {
+    if (bytes == 0) {
+      hugepages = false;
+      return nullptr;
+    }
+
+    /* try direct huge page allocation first */
+    if (isHugePageCandidate(bytes))
+    {
+      void* large = VirtualAlloc(nullptr,bytes,MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES,PAGE_READWRITE);
+      if (large != nullptr) {
+        hugepages = true;
+        return large;
+      }
+    }
+
+    /* fall back to 4k pages */
+    void* mem = VirtualAlloc(nullptr,bytes,MEM_COMMIT | MEM_RESERVE,PAGE_READWRITE);
+    // -- GODOT start --
+    // if (mem == nullptr) throw std::bad_alloc();
+    if (mem == nullptr) abort();
+    // -- GODOT end --
+    hugepages = false;
+    return mem;
+  }
+
+  /*! Decommits the tail of a block so that only bytesNew bytes, rounded up to 4k pages, stay
+   *  committed. Returns the resulting size; huge page blocks are returned unchanged. */
+  size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages)
+  {
+    if (hugepages) // decommitting huge pages seems not to work under Windows
+      return bytesOld;
+
+    const size_t pageMask = size_t(PAGE_SIZE_4K)-1;
+    const size_t keep = (bytesNew+pageMask) & ~pageMask;
+    const size_t have = (bytesOld+pageMask) & ~pageMask;
+    if (keep >= have)
+      return have;
+
+    // -- GODOT start --
+    // if (!VirtualFree((char*)ptr+keep,have-keep,MEM_DECOMMIT)) throw std::bad_alloc();
+    if (!VirtualFree((char*)ptr+keep,have-keep,MEM_DECOMMIT)) abort();
+    // -- GODOT end --
+    return keep;
+  }
+
+  /*! Releases a block from os_malloc; bytes == 0 is a no-op, hugepages is unused. Aborts on failure. */
+  void os_free(void* ptr, size_t bytes, bool hugepages)
+  {
+    if (bytes != 0)
+    {
+      // -- GODOT start --
+      // if (!VirtualFree(ptr,0,MEM_RELEASE)) throw std::bad_alloc();
+      if (!VirtualFree(ptr,0,MEM_RELEASE)) abort();
+      // -- GODOT end --
+    }
+  }
+
+  void os_advise(void *ptr, size_t bytes) // intentionally empty on Windows: both arguments are ignored
+  {
+  }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sstream>
+
+#if defined(__MACOSX__)
+#include <mach/vm_statistics.h>
+#endif
+
+namespace embree
+{
+  /*! Enables or disables huge page usage. On Linux they are only enabled when /proc/meminfo reports
+   *  a 2MB Hugepagesize. Returns false if requested but unusable; verbose prints the reason. */
+  bool os_init(bool hugepages, bool verbose)
+  {
+    Lock<MutexSys> lock(os_init_mutex);
+
+    if (!hugepages) {
+      huge_pages_enabled = false;
+      return true;
+    }
+
+#if defined(__LINUX__)
+    /* 64 bit wide, so large page sizes (e.g. 16GB) cannot overflow when converted to bytes */
+    unsigned long long hugepagesize = 0;
+    std::ifstream file("/proc/meminfo",std::ios::in);
+    if (!file.is_open()) {
+      if (verbose) std::cout << "WARNING: Could not open /proc/meminfo. Huge page support cannot get enabled!" << std::endl;
+      huge_pages_enabled = false;
+      return false;
+    }
+
+    /* every line is split into tag, value and unit, skipping the padding spaces in between */
+    std::string line;
+    while (getline(file,line))
+    {
+      std::stringstream sline(line);
+      while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+      std::string tag; getline(sline,tag,' ');
+      while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+      std::string val; getline(sline,val,' ');
+      while (!sline.eof() && sline.peek() == ' ') sline.ignore();
+      std::string unit; getline(sline,unit,' ');
+      if (tag == "Hugepagesize:" && unit == "kB") {
+        /* strtoull yields 0 for malformed text instead of throwing like std::stoi */
+        hugepagesize = strtoull(val.c_str(),nullptr,10)*1024ull;
+        break;
+      }
+    }
+    if (hugepagesize != (unsigned long long)PAGE_SIZE_2M) {
+      if (verbose) std::cout << "WARNING: Only 2MB huge pages supported. Huge page support cannot get enabled!" << std::endl;
+      huge_pages_enabled = false;
+      return false;
+    }
+#endif
+
+    huge_pages_enabled = true;
+    return true;
+  }
+
+  void* os_malloc(size_t bytes, bool& hugepages) // maps bytes of anonymous private memory; hugepages reports whether a huge page mapping was obtained
+  { 
+    if (bytes == 0) { // empty request: no mapping is created
+      hugepages = false;
+      return nullptr;
+    }
+
+    /* try direct huge page allocation first */
+    if (isHugePageCandidate(bytes)) 
+    {
+#if defined(__MACOSX__)
+      void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); // the superpage flag is passed in mmap's fd argument
+      if (ptr != MAP_FAILED) {
+        hugepages = true;
+        return ptr;
+      }
+#elif defined(MAP_HUGETLB)
+      void* ptr = mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_HUGETLB, -1, 0);
+      if (ptr != MAP_FAILED) {
+        hugepages = true;
+        return ptr;
+      }
+#endif
+    } // a failed huge page attempt falls through to the regular mapping below
+
+    /* fallback to 4k pages */
+    void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+    // -- GODOT start --
+    // if (ptr == MAP_FAILED) throw std::bad_alloc();
+    if (ptr == MAP_FAILED) abort(); // replaces the upstream exception with process termination
+    // -- GODOT end --
+    hugepages = false;
+
+    /* advise huge page hint for THP */
+    os_advise(ptr,bytes);
+    return ptr;
+  }
+
+  /*! Unmaps the tail of a mapping so that only bytesNew bytes, rounded up to the page size in use,
+   *  remain mapped. Returns the size that stays mapped; aborts if munmap fails. */
+  size_t os_shrink(void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages)
+  {
+    const size_t pageMask = size_t(hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K)-1;
+    const size_t keep = (bytesNew+pageMask) & ~pageMask;
+    const size_t have = (bytesOld+pageMask) & ~pageMask;
+    if (keep >= have)
+      return have;
+
+    // -- GODOT start --
+    // if (munmap((char*)ptr+keep,have-keep) == -1) throw std::bad_alloc();
+    if (munmap((char*)ptr+keep,have-keep) == -1) abort();
+    // -- GODOT end --
+    return keep;
+  }
+
+  /*! Unmaps a block returned by os_malloc; bytes == 0 is a no-op. Aborts if munmap fails. */
+  void os_free(void* ptr, size_t bytes, bool hugepages)
+  {
+    if (bytes == 0)
+      return;
+
+    /* the length handed to munmap is rounded up to whole pages (needed for hugepages) */
+    const size_t pageMask = size_t(hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K)-1;
+    const size_t mapped = (bytes+pageMask) & ~pageMask;
+    // -- GODOT start --
+    // if (munmap(ptr,mapped) == -1) throw std::bad_alloc();
+    if (munmap(ptr,mapped) == -1) abort();
+    // -- GODOT end --
+  }
+
+  /* hint for transparent huge pages (THP); best effort: the madvise result is ignored and nothing happens where MADV_HUGEPAGE is undefined */
+  void os_advise(void* pptr, size_t bytes)
+  {
+#if defined(MADV_HUGEPAGE)
+    madvise(pptr,bytes,MADV_HUGEPAGE); 
+#endif
+  }
+}
+
+#endif
diff --git a/thirdparty/embree-aarch64/common/sys/alloc.h b/thirdparty/embree-aarch64/common/sys/alloc.h
new file mode 100644
index 0000000000..5898ecda70
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/alloc.h
@@ -0,0 +1,164 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include <vector>
+#include <set>
+
+namespace embree
+{
+#define ALIGNED_STRUCT_(align) /* class-scope new/delete (scalar and array) routed through alignedMalloc/alignedFree */ \
+  void* operator new(size_t size) { return alignedMalloc(size,align); } \
+  void operator delete(void* ptr) { alignedFree(ptr); }                 \
+  void* operator new[](size_t size) { return alignedMalloc(size,align); } \
+  void operator delete[](void* ptr) { alignedFree(ptr); }
+
+#define ALIGNED_CLASS_(align) /* ALIGNED_STRUCT_ declared public, then access is switched back to private */ \
+ public:                                                               \
+    ALIGNED_STRUCT_(align)                                              \
+ private:
+  
+  /*! aligned allocation */
+  void* alignedMalloc(size_t size, size_t align);
+  void alignedFree(void* ptr);
+  
+  /*! STL-style allocator whose storage comes from alignedMalloc/alignedFree with a fixed alignment */
+  template<typename T, size_t alignment>
+    struct aligned_allocator
+    {
+      using value_type      = T;
+      using pointer         = T*;
+      using const_pointer   = const T*;
+      using reference       = T&;
+      using const_reference = const T&;
+      using size_type       = std::size_t;
+      using difference_type = std::ptrdiff_t;
+      /*! returns uninitialized storage for n objects of type T */
+      __forceinline pointer allocate( size_type n ) {
+        return static_cast<pointer>(alignedMalloc(n*sizeof(value_type),alignment));
+      }
+      /*! frees storage obtained from allocate; n is ignored */
+      __forceinline void deallocate( pointer p, size_type n ) {
+        alignedFree(p);
+      }
+      /*! copy-constructs val in place at p */
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val);
+      }
+      /*! runs the destructor of *p without releasing its storage */
+      __forceinline void destroy( pointer p ) {
+        p->~T();
+      }
+    };
+
+  /*! allocates pages directly from OS */
+  bool win_enable_selockmemoryprivilege(bool verbose);
+  bool os_init(bool hugepages, bool verbose);
+  void* os_malloc (size_t bytes, bool& hugepages);
+  size_t os_shrink (void* ptr, size_t bytesNew, size_t bytesOld, bool hugepages);
+  void  os_free   (void* ptr, size_t bytes, bool hugepages);
+  void  os_advise (void* ptr, size_t bytes);
+
+  /*! STL-style allocator that takes its storage directly from the OS through os_malloc/os_free */
+  template<typename T>
+    struct os_allocator
+    {
+      using value_type      = T;
+      using pointer         = T*;
+      using const_pointer   = const T*;
+      using reference       = T&;
+      using const_reference = const T&;
+      using size_type       = std::size_t;
+      using difference_type = std::ptrdiff_t;
+
+      /*! starts out without huge pages; the flag is rewritten by every allocate call */
+      __forceinline os_allocator () : hugepages(false) {}
+      /*! returns storage for n objects; os_malloc records in hugepages which page kind it used */
+      __forceinline pointer allocate( size_type n ) {
+        return static_cast<pointer>(os_malloc(n*sizeof(value_type),hugepages));
+      }
+      /*! releases storage of n objects, passing the remembered hugepages flag to os_free */
+      __forceinline void deallocate( pointer p, size_type n ) {
+        os_free(p,n*sizeof(value_type),hugepages);
+      }
+      /*! copy-constructs val in place at p */
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val);
+      }
+      /*! runs the destructor of *p without releasing its storage */
+      __forceinline void destroy( pointer p ) {
+        p->~T();
+      }
+
+      bool hugepages;
+    };
+
+  /*! allocator for IDs: hands out increasing IDs and reuses deallocated ones, smallest first (IDs is an ordered set) */
+  template<typename T, size_t max_id>
+    struct IDPool
+    {
+      typedef T value_type;
+
+      IDPool ()
+      : nextID(0) {}
+
+      T allocate() // returns a free ID, or -1 converted to T once no further ID may be created
+      {
+        /* return ID from list */
+        if (!IDs.empty()) 
+        {
+          T id = *IDs.begin();
+          IDs.erase(IDs.begin());
+          return id;
+        } 
+
+        /* allocate new ID */
+        else
+        {
+          if (size_t(nextID)+1 > max_id) // NOTE(review): stops at max_id-1 although add() accepts max_id itself - confirm intended
+            return -1;
+          
+          return nextID++;
+        }
+      }
+
+      /* adds an ID provided by the user; returns false if id exceeds max_id or is currently in use */
+      bool add(T id)
+      {
+        if (id > max_id)
+          return false;
+        
+        /* check if ID should be in IDs set */
+        if (id < nextID) {
+          auto p = IDs.find(id);
+          if (p == IDs.end()) return false; // below nextID but not in the free set: already handed out
+          IDs.erase(p);
+          return true;
+        }
+
+        /* otherwise increase ID set */
+        else
+        {
+          for (T i=nextID; i<id; i++) { // every skipped ID becomes available for reuse
+            IDs.insert(i);
+          }
+          nextID = id+1;
+          return true;
+        }
+      }
+
+      void deallocate( T id ) // returns id to the free set; misuse (unknown ID, double free) is only caught by the asserts
+      {
+        assert(id < nextID);
+        MAYBE_UNUSED auto done = IDs.insert(id).second;
+        assert(done);
+      }
+
+    private:
+      std::set<T> IDs;   //!< stores deallocated IDs to be reused
+      T nextID;          //!< next ID to use when IDs vector is empty
+    };
+}
+
diff --git a/thirdparty/embree-aarch64/common/sys/array.h b/thirdparty/embree-aarch64/common/sys/array.h
new file mode 100644
index 0000000000..77722a39f6
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/array.h
@@ -0,0 +1,222 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "alloc.h"
+
+namespace embree
+{
+  /*! fixed-size array of exactly N elements stored inline */
+  template<typename T, size_t N>
+    class array_t
+    {
+    public:
+      /********************** Iterators  ****************************/
+      /* const and non-const overloads: a const array_t hands out const T* only */
+      __forceinline       T* begin()       { return items; }
+      __forceinline const T* begin() const { return items; }
+      __forceinline       T* end  ()       { return items+N; }
+      __forceinline const T* end  () const { return items+N; }
+
+      /********************** Capacity ****************************/
+      __forceinline bool   empty    () const { return N == 0; }
+      __forceinline size_t size     () const { return N; }
+      __forceinline size_t max_size () const { return N; }
+
+      /******************** Element access **************************/
+      /* indices are checked with assert only; at() does not throw */
+      __forceinline       T& operator[](size_t i)       { assert(i < N); return items[i]; }
+      __forceinline const T& operator[](size_t i) const { assert(i < N); return items[i]; }
+
+      __forceinline       T& at(size_t i)       { assert(i < N); return items[i]; }
+      __forceinline const T& at(size_t i) const { assert(i < N); return items[i]; }
+
+      __forceinline       T& front()       { assert(N > 0); return items[0]; }
+      __forceinline const T& front() const { assert(N > 0); return items[0]; }
+      __forceinline       T& back ()       { assert(N > 0); return items[N-1]; }
+      __forceinline const T& back () const { assert(N > 0); return items[N-1]; }
+
+      __forceinline       T* data()       { return items; }
+      __forceinline const T* data() const { return items; }
+
+    private:
+      T items[N];
+    };
+
+  /*! array with inline storage for up to N elements and a runtime size M <= N */
+  template<typename T, size_t N>
+    class darray_t
+    {
+    public:
+      __forceinline darray_t () : M(0) {}
+
+      /*! assigns v to all N slots; the size nevertheless starts at 0 */
+      __forceinline darray_t (const T& v) : M(0) {
+        for (size_t i=0; i<N; i++) items[i] = v;
+      }
+
+      /********************** Iterators  ****************************/
+      /* const and non-const overloads: a const darray_t hands out const T* only */
+      __forceinline       T* begin()       { return items; }
+      __forceinline const T* begin() const { return items; }
+      __forceinline       T* end  ()       { return items+M; }
+      __forceinline const T* end  () const { return items+M; }
+
+      /********************** Capacity ****************************/
+      __forceinline bool   empty    () const { return M == 0; }
+      __forceinline size_t size     () const { return M; }
+      __forceinline size_t capacity () const { return N; }
+      __forceinline size_t max_size () const { return N; }
+
+      /*! changes the size without touching any element; new_size may equal the capacity */
+      void resize(size_t new_size) {
+        assert(new_size <= max_size());
+        M = new_size;
+      }
+
+      /******************** Modifiers **************************/
+      /*! appends v; allowed while the array is not yet full, so all N slots are usable */
+      __forceinline void push_back(const T& v) {
+        assert(M < max_size());
+        items[M++] = v;
+      }
+
+      __forceinline void pop_back() {
+        assert(!empty());
+        M--;
+      }
+
+      __forceinline void clear() {
+        M = 0;
+      }
+
+      /******************** Element access **************************/
+      /* indices are checked with assert only; at() does not throw */
+      __forceinline       T& operator[](size_t i)       { assert(i < M); return items[i]; }
+      __forceinline const T& operator[](size_t i) const { assert(i < M); return items[i]; }
+      __forceinline       T& at(size_t i)       { assert(i < M); return items[i]; }
+      __forceinline const T& at(size_t i) const { assert(i < M); return items[i]; }
+
+      __forceinline       T& front()       { assert(M > 0); return items[0]; }
+      __forceinline const T& front() const { assert(M > 0); return items[0]; }
+      __forceinline       T& back ()       { assert(M > 0); return items[M-1]; }
+      __forceinline const T& back () const { assert(M > 0); return items[M-1]; }
+
+      __forceinline       T* data()       { return items; }
+      __forceinline const T* data() const { return items; }
+
+    private:
+      size_t M;
+      T items[N];
+    };
+
+  /*! array of N elements that uses an in-object buffer when N*sizeof(Ty) <= max_stack_bytes and alignedMalloc otherwise */
+#define dynamic_large_stack_array(Ty,Name,N,max_stack_bytes) StackArray<Ty,max_stack_bytes> Name(N)
+  template<typename Ty, size_t max_stack_bytes>
+    struct __aligned(64) StackArray
+  {
+    __forceinline StackArray (const size_t N)
+      : N(N)
+    {
+      if (N*sizeof(Ty) <= max_stack_bytes) 
+        data = &arr[0];
+      else
+        data = (Ty*) alignedMalloc(N*sizeof(Ty),64); // NOTE(review): raw memory cast to Ty*; no Ty constructor is run in this function
+    }
+    /*! releases the buffer only if it is not the in-object one */
+    __forceinline ~StackArray () {
+      if (data != &arr[0]) alignedFree(data);
+    }
+    /*! implicit conversion to the pointer of the buffer in use */
+    __forceinline operator       Ty* ()       { return data; }
+    __forceinline operator const Ty* () const { return data; }
+    /*! element access; the index is checked against N with assert only */
+    __forceinline       Ty& operator[](const int i)       { assert(i>=0 && i<N); return data[i]; }
+    __forceinline const Ty& operator[](const int i) const { assert(i>=0 && i<N); return data[i]; }
+
+    __forceinline       Ty& operator[](const unsigned i)       { assert(i<N); return data[i]; }
+    __forceinline const Ty& operator[](const unsigned i) const { assert(i<N); return data[i]; }
+    /* the size_t overloads are compiled only for the targets named in the #if */
+#if defined(__X86_64__) || defined(__aarch64__)
+    __forceinline       Ty& operator[](const size_t i)       { assert(i<N); return data[i]; }
+    __forceinline const Ty& operator[](const size_t i) const { assert(i<N); return data[i]; }
+#endif
+
+  private:
+    Ty arr[max_stack_bytes/sizeof(Ty)]; //!< in-object buffer
+    Ty* data;                           //!< &arr[0], or the block returned by alignedMalloc
+    size_t N;                           //!< element count passed to the constructor
+    /* copy construction and copy assignment are declared with DELETED and left unimplemented */
+  private:
+    StackArray (const StackArray& other) DELETED; // do not implement
+    StackArray& operator= (const StackArray& other) DELETED; // do not implement
+
+  };
+
+  /*! array that starts in an in-object buffer of max_stack_elements and switches once to a new[] buffer of max_total_elements */
+  template<typename Ty, size_t max_stack_elements, size_t max_total_elements>
+    struct __aligned(64) DynamicStackArray
+  {
+    __forceinline DynamicStackArray ()
+      : data(&arr[0]) {}
+    /*! deletes the heap buffer if one was allocated */
+    __forceinline ~DynamicStackArray ()
+    {
+      if (!isStackAllocated())
+        delete[] data;
+    }
+    /*! true while data still points at the in-object buffer */
+    __forceinline bool isStackAllocated() const {
+      return data == &arr[0];
+    }
+    /*! capacity of the buffer currently in use, not a count of stored elements */
+    __forceinline size_t size() const
+    {
+      if (isStackAllocated()) return max_stack_elements;
+      else return max_total_elements;
+    }
+    /*! makes sure M elements fit; the switch to the heap buffer happens at most once */
+    __forceinline void resize(size_t M)
+    {
+      assert(M <= max_total_elements);
+      if (likely(M <= max_stack_elements)) return; // fits the in-object buffer
+      if (likely(!isStackAllocated())) return;     // already using the max_total_elements buffer
+
+      data = new Ty[max_total_elements];
+      /* carry over the contents of the in-object buffer */
+      for (size_t i=0; i<max_stack_elements; i++)
+        data[i] = arr[i];
+    }
+
+    __forceinline operator       Ty* ()       { return data; }
+    __forceinline operator const Ty* () const { return data; }
+    /*! mutable element access; calls resize(i+1) first, so the buffer grows on demand */
+    __forceinline       Ty& operator[](const int i)      { assert(i>=0 && i<max_total_elements); resize(i+1); return data[i]; }
+    __forceinline       Ty& operator[](const unsigned i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
+
+#if defined(__X86_64__) || defined(__aarch64__)
+    __forceinline       Ty& operator[](const size_t i) { assert(i<max_total_elements); resize(i+1); return data[i]; }
+#endif
+    /*! copies other.size() elements, i.e. the capacity of the buffer other currently uses */
+    __forceinline DynamicStackArray (const DynamicStackArray& other)
+      : data(&arr[0]) 
+    {
+      for (size_t i=0; i<other.size(); i++)
+        this->operator[] (i) = other[i];
+    }
+    /*! element-wise assignment of other.size() elements; nothing is freed or shrunk here */
+    DynamicStackArray& operator= (const DynamicStackArray& other)
+    {
+      for (size_t i=0; i<other.size(); i++)
+        this->operator[] (i) = other[i];
+
+      return *this;
+    }
+
+  private:
+    Ty arr[max_stack_elements]; //!< in-object buffer
+    Ty* data;                   //!< &arr[0], or the new[] buffer
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/sys/atomic.h b/thirdparty/embree-aarch64/common/sys/atomic.h
new file mode 100644
index 0000000000..ebfb8552c3
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/atomic.h
@@ -0,0 +1,59 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <atomic>
+#include "intrinsics.h"
+
+namespace embree
+{
+/* compiler memory barriers */
+#if defined(__INTEL_COMPILER)
+//#define __memory_barrier() __memory_barrier()
+#elif defined(__GNUC__) || defined(__clang__)
+#  define __memory_barrier() asm volatile("" ::: "memory")
+#elif  defined(_MSC_VER)
+#  define __memory_barrier() _ReadWriteBarrier()
+#endif
+
+  template <typename T>
+    struct atomic : public std::atomic<T>
+  {
+    atomic () {}
+    atomic (const T& a) : std::atomic<T>(a) {}
+
+    /*! copy construction: one load from a, then one store into *this */
+    atomic (const atomic<T>& a) {
+      const T snapshot = a.load(); std::atomic<T>::store(snapshot);
+    }
+
+    /*! copy assignment: one load from other, then one store into *this */
+    atomic& operator=(const atomic<T>& other) {
+      const T snapshot = other.load(); std::atomic<T>::store(snapshot);
+      return *this;
+    }
+  };
+
+  template<typename T>
+    __forceinline void atomic_min(std::atomic<T>& aref, const T& bref)
+  {
+    /* atomically lowers aref to bref when bref is smaller; bref is a plain T, so it is copied rather than load()ed */
+    const T b = bref;
+    for (T a = aref.load(); !(a <= b); ) {
+      /* on failure compare_exchange_strong refreshes a with the current value, so the test above is repeated */
+      if (aref.compare_exchange_strong(a,b)) break;
+    }
+  }
+
+  template<typename T>
+    __forceinline void atomic_max(std::atomic<T>& aref, const T& bref)
+  {
+    /* atomically raises aref to bref when bref is larger; bref is a plain T, so it is copied rather than load()ed */
+    const T b = bref;
+    for (T a = aref.load(); !(a >= b); ) {
+      /* on failure compare_exchange_strong refreshes a with the current value, so the test above is repeated */
+      if (aref.compare_exchange_strong(a,b)) break;
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/sys/barrier.cpp b/thirdparty/embree-aarch64/common/sys/barrier.cpp
new file mode 100644
index 0000000000..0061d18db2
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/barrier.cpp
@@ -0,0 +1,289 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "barrier.h"
+#include "condition.h"
+#include "regression.h"
+#include "thread.h"
+
+#if defined (__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  struct BarrierSysImplementation // Win32 variant: two event objects, selected by the index i that wait() flips
+  {
+    __forceinline BarrierSysImplementation (size_t N) 
+      : i(0), enterCount(0), exitCount(0), barrierSize(0) 
+    {
+      events[0] = CreateEvent(nullptr, TRUE, FALSE, nullptr); // bManualReset=TRUE, bInitialState=FALSE
+      events[1] = CreateEvent(nullptr, TRUE, FALSE, nullptr);
+      init(N);
+    }
+    /*! closes both event handles */
+    __forceinline ~BarrierSysImplementation ()
+    {
+      CloseHandle(events[0]);
+      CloseHandle(events[1]);
+    }
+    /*! stores the barrier size and sets both counters to N */
+    __forceinline void init(size_t N) 
+    {
+      barrierSize = N;
+      enterCount.store(N);
+      exitCount.store(N);
+    }
+    /*! waits on the current event until the last of barrierSize threads has entered */
+    __forceinline void wait()
+    {
+      /* every thread entering the barrier decrements this count */
+      size_t i0 = i;
+      size_t cnt0 = enterCount--;
+
+      /* all threads except the last one wait in the barrier */
+      if (cnt0 > 1) 
+      {
+        if (WaitForSingleObject(events[i0], INFINITE) != WAIT_OBJECT_0)
+          THROW_RUNTIME_ERROR("WaitForSingleObjects failed");
+      }
+      
+      /* the last thread starts all threads waiting at the barrier */
+      else 
+      {
+        i = 1-i; // later wait() calls read the other event index
+        enterCount.store(barrierSize);
+        if (SetEvent(events[i0]) == 0)
+          THROW_RUNTIME_ERROR("SetEvent failed");
+      }
+
+      /* every thread leaving the barrier decrements this count */
+      size_t cnt1 = exitCount--;
+
+      /* the last thread that left the barrier resets the event again */
+      if (cnt1 == 1) 
+      {
+        exitCount.store(barrierSize);
+        if (ResetEvent(events[i0]) == 0)
+          THROW_RUNTIME_ERROR("ResetEvent failed");
+      }
+    }
+
+  public:
+    HANDLE events[2];          //!< the two events, indexed by i
+    atomic<size_t> i;          //!< index of the event used by threads that enter now
+    atomic<size_t> enterCount; //!< counts down as threads enter
+    atomic<size_t> exitCount;  //!< counts down as threads leave
+    size_t barrierSize;        //!< value both counters are reset to
+  };
+}
+
+#else
+
+namespace embree
+{
+  struct BarrierSysImplementation
+  {
+    __forceinline BarrierSysImplementation (size_t N) 
+      : count(0), generation(0), barrierSize(0) 
+    {
+      init(N);
+    }
+    /*! sets the number of threads that have to call wait(); asserts that no thread is waiting */
+    __forceinline void init(size_t N) 
+    {
+      assert(count == 0);
+      count = 0;
+      barrierSize = N;
+    }
+    /*! blocks until barrierSize threads have called wait() in the current round */
+    __forceinline void wait()
+    {
+      mutex.lock();
+      count++;
+      /* last arrival: start the next round and release everybody */
+      if (count == barrierSize) {
+        count = 0;
+        generation++;
+        cond.notify_all();
+        mutex.unlock();
+        return;
+      }
+      /* condition variables may wake up spuriously, so leave only once the round has changed */
+      const size_t round = generation;
+      while (round == generation) cond.wait(mutex);
+      mutex.unlock();
+    }
+
+  public:
+    MutexSys mutex;
+    ConditionSys cond;
+    volatile size_t count, generation; //!< arrivals in this round / number of completed rounds
+    volatile size_t barrierSize;
+  };
+}
+
+#endif
+
+namespace embree
+{
+  BarrierSys::BarrierSys (size_t N) { // the implementation object is kept behind the opaque pointer
+    opaque = static_cast<void*>(new BarrierSysImplementation(N));
+  }
+
+  BarrierSys::~BarrierSys () { // destroys the implementation object behind the opaque pointer
+    delete static_cast<BarrierSysImplementation*>(opaque);
+  }
+
+  void BarrierSys::init(size_t count) { // forwards to the implementation's init
+    static_cast<BarrierSysImplementation*>(opaque)->init(count);
+  }
+
+  void BarrierSys::wait() { // forwards to the implementation's wait
+    static_cast<BarrierSysImplementation*>(opaque)->wait();
+  }
+
+  LinearBarrierActive::LinearBarrierActive (size_t N) 
+    : count0(nullptr), count1(nullptr), mode(0), flag0(0), flag1(0), threadCount(0)
+  { 
+    /* N == 0 means: size the barrier for getNumberOfLogicalThreads() threads */
+    init(N != 0 ? N : getNumberOfLogicalThreads());
+  }
+
+  LinearBarrierActive::~LinearBarrierActive() // releases the two arrays allocated with new[] in init()
+  {
+    delete[] count0;
+    delete[] count1;
+  }
+
+  void LinearBarrierActive::init(size_t N) 
+  {
+    /* the arrays are only reallocated when the thread count actually changes */
+    if (threadCount != N) {
+      threadCount = N;
+      delete[] count0; count0 = new unsigned char[N]; // delete[] of a null pointer is a no-op
+      delete[] count1; count1 = new unsigned char[N];
+    }
+    mode  = 0; flag0 = 0; flag1 = 0;
+    /* clear every slot of both arrays, count0 first */
+    for (size_t t=0; t!=N; ++t) count0[t] = 0;
+    for (size_t t=0; t!=N; ++t) count1[t] = 0;
+  }
+
+  void LinearBarrierActive::wait (const size_t threadIndex)
+  {
+    if (mode == 0) // phase 0: arrivals are recorded in count0, release is signalled through flag0
+    {
+      if (threadIndex == 0)
+      {
+        for (size_t i=0; i<threadCount; i++)
+          count1[i] = 0;
+        /* thread 0 spins until every other thread has set its count0 slot */
+        for (size_t i=1; i<threadCount; i++)
+        {
+          while (likely(count0[i] == 0)) 
+            pause_cpu();
+        }
+        mode  = 1;
+        flag1 = 0;
+        __memory_barrier(); // issued between the stores above and the releasing store to flag0
+        flag0 = 1;
+      }
+      else
+      {
+        count0[threadIndex] = 1; // announce arrival, then spin until thread 0 sets flag0
+        {
+          while (likely(flag0 == 0))
+            pause_cpu();
+        }
+        
+      }
+    }
+    else // phase 1: same protocol with count1/flag1 taking the roles of count0/flag0
+    {
+      if (threadIndex == 0)
+      {
+        for (size_t i=0; i<threadCount; i++)
+          count0[i] = 0;
+        /* thread 0 spins until every other thread has set its count1 slot */
+        for (size_t i=1; i<threadCount; i++)
+        {
+          while (likely(count1[i] == 0))
+            pause_cpu();
+        }
+        
+        mode  = 0;
+        flag0 = 0;
+        __memory_barrier(); // issued between the stores above and the releasing store to flag1
+        flag1 = 1;
+      }
+      else
+      {
+        count1[threadIndex] = 1; // announce arrival, then spin until thread 0 sets flag1
+        {
+          while (likely(flag1 == 0))
+            pause_cpu();
+        }
+      }
+    }
+  }
+
+  struct barrier_sys_regression_test : public RegressionTest
+  {
+    BarrierSys barrier;
+    std::atomic<size_t> threadID;
+    std::atomic<size_t> numFailed;
+    std::vector<size_t> threadResults;
+    /*! registers the test under its own name */
+    barrier_sys_regression_test() 
+      : RegressionTest("barrier_sys_regression_test"), threadID(0), numFailed(0)
+    {
+      registerRegressionTest(this);
+    }
+    /*! worker body: claims the next thread index and, 1000 times, writes it into its slot between two barrier waits */
+    static void thread_alloc(barrier_sys_regression_test* This)
+    {
+      const size_t slot = This->threadID.fetch_add(1);
+      for (size_t round=0; round!=1000; ++round)
+      {
+        This->barrier.wait();
+        This->threadResults[slot] = slot;
+        This->barrier.wait();
+      }
+    }
+    /*! runs 1000 rounds; succeeds if after every round each slot held its own index */
+    bool run ()
+    {
+      threadID = 0;
+      numFailed = 0;
+
+      const size_t workerCount = getNumberOfLogicalThreads();
+      threadResults.resize(workerCount);
+      barrier.init(workerCount+1); // the workers plus the thread executing run()
+
+      /* create threads */
+      std::vector<thread_t> workers;
+      for (size_t w=0; w<workerCount; w++)
+        workers.push_back(createThread((thread_func)thread_alloc,this));
+
+      /* run test */ 
+      for (size_t round=0; round<1000; round++)
+      {
+        for (size_t w=0; w<workerCount; w++) threadResults[w] = 0;
+        barrier.wait();
+        barrier.wait();
+        for (size_t w=0; w<workerCount; w++) numFailed += threadResults[w] != w;
+      }
+
+      /* destroy threads */
+      for (size_t w=0; w<workerCount; w++)
+        join(workers[w]);
+
+      return numFailed.load() == 0;
+    }
+  };
+
+  barrier_sys_regression_test barrier_sys_regression_test;
+}
+
+
diff --git a/thirdparty/embree-aarch64/common/sys/barrier.h b/thirdparty/embree-aarch64/common/sys/barrier.h
new file mode 100644
index 0000000000..89607b8685
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/barrier.h
@@ -0,0 +1,112 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "intrinsics.h"
+#include "sysinfo.h"
+#include "atomic.h"
+
+namespace embree
+{
+  /*! barrier that blocks through operating system primitives; the implementation is hidden behind an opaque pointer */
+  class BarrierSys
+  {
+  public:
+
+    /*! construction / destruction */
+    BarrierSys (size_t N = 0);
+    ~BarrierSys ();
+
+  private:
+    /*! class is non-copyable */
+    BarrierSys (const BarrierSys& other) DELETED; // do not implement
+    BarrierSys& operator= (const BarrierSys& other) DELETED; // do not implement
+
+  public:
+    /*! initializes the barrier with some number of threads */
+    void init(size_t count);
+
+    /*! lets calling thread wait in barrier */
+    void wait();
+
+  private:
+    void* opaque; //!< type-erased pointer to the implementation object
+  };
+
+  /*! spinning barrier built on one atomic arrival counter */
+  struct BarrierActive 
+  {
+  public:
+    BarrierActive () : cntr(0) {}
+    /*! sets the arrival counter back to zero */
+    void reset() { cntr = 0; }
+
+    /*! registers the caller, then spins until the counter equals numThreads */
+    void wait (size_t numThreads) 
+    {
+      cntr.fetch_add(1);
+      for (;;) {
+        if (cntr.load() == numThreads) break;
+        pause_cpu();
+      }
+    }
+
+  private:
+    std::atomic<size_t> cntr;
+  };
+
+  /*! spinning barrier that takes the thread count per call and counts both counters back down to zero before returning */
+  struct BarrierActiveAutoReset
+  {
+  public:
+    BarrierActiveAutoReset () : cntr0(0), cntr1(0) {}
+
+    /*! four spin phases: both counters are counted up to threadCount, then both back down to 0 */
+    void wait (size_t threadCount) 
+    {
+      ++cntr0;
+      while (cntr0.load() != threadCount) pause_cpu();
+      ++cntr1;
+      while (cntr1.load() != threadCount) pause_cpu();
+      --cntr0;
+      while (cntr0.load() != 0) pause_cpu();
+      --cntr1;
+      while (cntr1.load() != 0) pause_cpu();
+    }
+
+  private:
+    std::atomic<size_t> cntr0;
+    std::atomic<size_t> cntr1;
+  };
+
+  class LinearBarrierActive // spinning barrier in which the thread with index 0 collects the others and releases them
+  {
+  public:
+
+    /*! construction and destruction */
+    LinearBarrierActive (size_t threadCount = 0);
+    ~LinearBarrierActive();
+    
+  private:
+    /*! class is non-copyable */
+    LinearBarrierActive (const LinearBarrierActive& other) DELETED; // do not implement
+    LinearBarrierActive& operator= (const LinearBarrierActive& other) DELETED; // do not implement
+
+  public:
+    /*! initializes the barrier with some number of threads */
+    void init(size_t threadCount);
+    
+    /*! thread with threadIndex waits in the barrier */
+    void wait (const size_t threadIndex);
+    
+  private:
+    volatile unsigned char* count0; //!< per-thread arrival marks for the mode 0 phase
+    volatile unsigned char* count1; //!< per-thread arrival marks for the mode 1 phase
+    volatile unsigned int mode;     //!< selects which phase wait() executes (0 or 1)
+    volatile unsigned int flag0;    //!< set by thread 0 to end the mode 0 phase
+    volatile unsigned int flag1;    //!< set by thread 0 to end the mode 1 phase
+    volatile size_t threadCount;    //!< number of slots in count0 and count1
+  };
+}
+
diff --git a/thirdparty/embree-aarch64/common/sys/condition.cpp b/thirdparty/embree-aarch64/common/sys/condition.cpp
new file mode 100644
index 0000000000..0e7ca7af39
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/condition.cpp
@@ -0,0 +1,81 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "condition.h"
+
+#if defined(__WIN32__) && !defined(PTHREADS_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  struct ConditionImplementation // Win32 variant wrapping a CONDITION_VARIABLE
+  {
+    __forceinline ConditionImplementation () {
+      InitializeConditionVariable(&cond);
+    }
+    /* the destructor body is empty: nothing is released for cond here */
+    __forceinline ~ConditionImplementation () {
+    }
+    /*! sleeps on cond without timeout; mutex_in.mutex is passed on as a CRITICAL_SECTION pointer */
+    __forceinline void wait(MutexSys& mutex_in) {
+      SleepConditionVariableCS(&cond, (LPCRITICAL_SECTION)mutex_in.mutex, INFINITE);
+    }
+    /*! wakes every thread currently waiting on cond */
+    __forceinline void notify_all() {
+      WakeAllConditionVariable(&cond);
+    }
+
+  public:
+    CONDITION_VARIABLE cond;
+  };
+}
+#endif
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+#include <pthread.h>
+namespace embree
+{
+  struct ConditionImplementation // pthread variant wrapping a pthread_cond_t
+  {
+    __forceinline ConditionImplementation () { 
+      pthread_cond_init(&cond,nullptr); // default attributes; the return code is not checked
+    }
+    
+    __forceinline ~ConditionImplementation() { 
+      pthread_cond_destroy(&cond);
+    } 
+    /*! waits on cond; mutex.mutex is passed on as a pthread_mutex_t pointer */
+    __forceinline void wait(MutexSys& mutex) { 
+      pthread_cond_wait(&cond, (pthread_mutex_t*)mutex.mutex); 
+    }
+    /*! wakes every thread currently waiting on cond */
+    __forceinline void notify_all() { 
+      pthread_cond_broadcast(&cond); 
+    }
+    
+  public:
+    pthread_cond_t cond;
+  };
+}
+#endif
+
+namespace embree 
+{
+  ConditionSys::ConditionSys () { // the implementation object is kept behind the untyped cond pointer
+    cond = static_cast<void*>(new ConditionImplementation);
+  }
+
+  ConditionSys::~ConditionSys() { // destroys the implementation object behind cond
+    delete static_cast<ConditionImplementation*>(cond);
+  }
+
+  void ConditionSys::wait(MutexSys& mutex) { // forwards to the implementation's wait
+    static_cast<ConditionImplementation*>(cond)->wait(mutex);
+  }
+
+  void ConditionSys::notify_all() { // forwards to the implementation's notify_all
+    static_cast<ConditionImplementation*>(cond)->notify_all();
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/sys/condition.h b/thirdparty/embree-aarch64/common/sys/condition.h
new file mode 100644
index 0000000000..7a3a05aa81
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/condition.h
@@ -0,0 +1,31 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "mutex.h"
+
+namespace embree
+{
+  class ConditionSys // condition variable used together with a MutexSys
+  {
+  public:
+    ConditionSys();
+    ~ConditionSys();
+    void wait( class MutexSys& mutex ); // waits for a notification, with mutex handed to the implementation
+    void notify_all();
+    /*! keeps waiting until pred() returns true; pred is tested before the first wait and after every return from wait */
+    template<typename Predicate>
+      __forceinline void wait( class MutexSys& mutex, const Predicate& pred )
+    {
+      while (!pred()) wait(mutex);
+    }
+
+  private:
+    ConditionSys (const ConditionSys& other) DELETED; // do not implement
+    ConditionSys& operator= (const ConditionSys& other) DELETED; // do not implement
+
+  protected:
+    void* cond; //!< untyped pointer to the implementation object
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/sys/filename.cpp b/thirdparty/embree-aarch64/common/sys/filename.cpp
new file mode 100644
index 0000000000..86182c1afb
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/filename.cpp
@@ -0,0 +1,138 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "filename.h"
+#include "sysinfo.h"
+
+namespace embree
+{
+#ifdef __WIN32__
+  const char path_sep = '\\';
+#else
+  const char path_sep = '/';
+#endif
+
+  /*! creates an empty filename (the filename string is default-constructed) */
+  FileName::FileName () {}
+
+  /*! create a valid filename from a C string: both slash kinds become path_sep and trailing separators are dropped */
+  FileName::FileName (const char* in) {
+    filename = in ? in : ""; // assigning a null pointer to std::string is undefined, so treat it as empty
+    for (size_t i=0; i<filename.size(); i++)
+      if (filename[i] == '\\' || filename[i] == '/')
+        filename[i] = path_sep;
+    while (!filename.empty() && filename[filename.size()-1] == path_sep)
+      filename.resize(filename.size()-1);
+  }
+
+  /*! builds a filename from a std::string, normalizing separators and trimming trailing ones */
+  FileName::FileName (const std::string& in) {
+    filename = in;
+    for (std::string::iterator c = filename.begin(); c != filename.end(); ++c)
+      if (*c == '/' || *c == '\\') *c = path_sep;
+    /* a run of separators at the end is removed entirely */
+    while (!filename.empty() && filename.back() == path_sep)
+      filename.pop_back();
+  }
+  
+  /*! home folder taken from the environment; an empty FileName if the variable is not set */
+  FileName FileName::homeFolder() 
+  {
+#ifdef __WIN32__
+    const char* const variable = "UserProfile";
+#else
+    const char* const variable = "HOME";
+#endif
+    const char* const home = getenv(variable);
+    return home ? FileName(home) : FileName("");
+  }
+
+  /*! returns the path() part of whatever getExecutableFileName() reports */
+  FileName FileName::executableFolder() {
+    return FileName(getExecutableFileName()).path();
+  }
+
+  /*! everything before the last separator; an empty FileName when there is no separator */
+  FileName FileName::path() const {
+    const std::string::size_type lastSep = filename.rfind(path_sep);
+    return (lastSep != std::string::npos) ? FileName(filename.substr(0,lastSep))
+                                          : FileName();
+  }
+
+  /*! everything after the last separator; the whole string when there is no separator */
+  std::string FileName::base() const {
+    const std::string::size_type lastSep = filename.rfind(path_sep);
+    const std::string::size_type first = (lastSep == std::string::npos) ? 0 : lastSep+1;
+    return filename.substr(first);
+  }
+
+  /*! returns the extension of the last path component ("" if it has none) */
+  std::string FileName::ext() const {
+    const size_t sep = filename.find_last_of(path_sep), pos = filename.find_last_of('.');
+    if (pos == std::string::npos || (sep != std::string::npos && pos < sep)) return ""; // a '.' inside a directory name is not an extension
+    return filename.substr(pos+1);
+  }
+
+  /*! returns the filename without the extension of its last path component */
+  FileName FileName::dropExt() const {
+    const size_t sep = filename.find_last_of(path_sep), pos = filename.find_last_of('.');
+    if (pos == std::string::npos || (sep != std::string::npos && pos < sep)) return filename; // a '.' inside a directory name is not an extension
+    return filename.substr(0,pos);
+  }
+
+  /*! last path component with its extension (from the last '.') removed */
+  std::string FileName::name() const {
+    const size_t sep   = filename.rfind(path_sep);
+    const size_t first = (sep == std::string::npos) ? 0 : sep+1;
+    const size_t dot   = filename.rfind('.');
+    const size_t last  = (dot == std::string::npos || dot < first) ? filename.size() : dot; // a '.' before the last component does not count
+    return filename.substr(first, last - first);
+  }
+
+  /*! swaps the extension of the last path component for ext (appended verbatim); appends ext when there is none */
+  FileName FileName::setExt(const std::string& ext) const {
+    const size_t sep   = filename.rfind(path_sep);
+    const size_t first = (sep == std::string::npos) ? 0 : sep+1;
+    const size_t dot   = filename.rfind('.');
+    const bool hasExt  = dot != std::string::npos && !(dot < first);
+    return FileName((hasExt ? filename.substr(0,dot) : filename) + ext);
+  }
+
+  /*! appends ext verbatim to the filename; no '.' is inserted */
+  FileName FileName::addExt(const std::string& ext) const {
+    return FileName(filename+ext);
+  }
+
+  /*! joins this and other with path_sep; an empty left-hand side yields a copy of other */
+  FileName FileName::operator +( const FileName& other ) const {
+    if (filename.empty()) return other;
+    return FileName(filename + path_sep + other.filename);
+  }
+
+  /*! same as the FileName overload, with other converted to a FileName first */
+  FileName FileName::operator +( const std::string& other ) const {
+    return *this + FileName(other);
+  }
+
+  /*! removes the leading base plus one separator; returns *this unchanged if the filename does not start with base + path_sep */
+  FileName FileName::operator -( const FileName& base ) const {
+    const std::string& prefix = base.filename; // compared as a whole prefix, not as a set of characters
+    if (prefix.empty() || filename.compare(0,prefix.size()+1,prefix+path_sep) != 0) return *this;
+    return FileName(filename.substr(prefix.size()+1));
+  }
+
+  /*! == operator: exact comparison of the stored filename strings */
+  bool operator== (const FileName& a, const FileName& b) {
+    return a.filename == b.filename;
+  }
+  
+  /*! != operator: exact comparison of the stored filename strings */
+  bool operator!= (const FileName& a, const FileName& b) {
+    return a.filename != b.filename;
+  }
+
+  /*! output operator: writes the stored filename string and returns the stream */
+  std::ostream& operator<<(std::ostream& cout, const FileName& filename) {
+    return cout << filename.filename;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/sys/filename.h b/thirdparty/embree-aarch64/common/sys/filename.h
new file mode 100644
index 0000000000..58f881b14d
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/filename.h
@@ -0,0 +1,81 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+namespace embree
+{
+  /*! Convenience class for handling file names and paths; wraps a single std::string. */
+  class FileName
+  {
+  public:
+
+    /*! create an empty filename */
+    FileName ();
+
+    /*! create a valid filename from a C string */
+    FileName (const char* filename);
+
+    /*! create a valid filename from a std::string */
+    FileName (const std::string& filename);
+    
+    /*! returns path to home folder */
+    static FileName homeFolder();
+
+    /*! returns path to the folder of the executable */
+    static FileName executableFolder();
+
+    /*! auto convert into a string */
+    operator std::string() const { return filename; }
+
+    /*! returns a copy of the filename string */
+    const std::string str() const { return filename; }
+
+    /*! returns a c-string of the filename; it points into this object's string */
+    const char* c_str() const { return filename.c_str(); }
+
+    /*! returns the path of a filename */
+    FileName path() const;
+
+    /*! returns the last component of a filename, including its extension */
+    std::string base() const;
+
+    /*! returns the last component of a filename without extension */
+    std::string name() const;
+
+    /*! returns the file extension */
+    std::string ext() const;
+
+    /*! drops the file extension */
+    FileName dropExt() const;
+
+    /*! replaces the file extension; with the default "" the extension is removed */
+    FileName setExt(const std::string& ext = "") const;
+
+    /*! adds file extension */
+    FileName addExt(const std::string& ext = "") const;
+
+    /*! concatenates two filenames to this/other */
+    FileName operator +( const FileName& other ) const;
+
+    /*! concatenates two filenames to this/other */
+    FileName operator +( const std::string& other ) const;
+
+    /*! removes the base from a filename (if possible) */
+    FileName operator -( const FileName& base ) const;
+
+    /*! == operator */
+    friend bool operator==(const FileName& a, const FileName& b);
+
+    /*! != operator */
+    friend bool operator!=(const FileName& a, const FileName& b);
+
+    /*! output operator */
+    friend embree_ostream operator<<(embree_ostream cout, const FileName& filename);
+   
+  private:
+    std::string filename; //!< the stored name
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/sys/intrinsics.h b/thirdparty/embree-aarch64/common/sys/intrinsics.h
new file mode 100644
index 0000000000..44cdbd8f0f
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/intrinsics.h
@@ -0,0 +1,559 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+#if defined(__WIN32__)
+#include <intrin.h>
+#endif
+
+#if defined(__ARM_NEON)
+#include "../math/SSE2NEON.h"
+#if defined(NEON_AVX2_EMULATION)
+#include "../math/AVX2NEON.h"
+#endif
+#else
+#include <immintrin.h>
+#endif
+
+#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
+  #if !defined(_tzcnt_u32)
+    #define _tzcnt_u32 __tzcnt_u32
+  #endif
+  #if !defined(_tzcnt_u64)
+    #define _tzcnt_u64 __tzcnt_u64
+  #endif
+#endif
+
+#if defined(__aarch64__) // no lzcnt instruction names here: map them onto the compiler's clz builtins
+#if !defined(_lzcnt_u32)
+  #define _lzcnt_u32 __builtin_clz
+#endif
+#if !defined(_lzcnt_u64) // 64-bit variant; this guard previously re-tested _lzcnt_u32 and never fired
+  #define _lzcnt_u64 __builtin_clzll
+#endif
+#else
+#if defined(__LZCNT__)
+  #if !defined(_lzcnt_u32)
+    #define _lzcnt_u32 __lzcnt32
+  #endif
+  #if !defined(_lzcnt_u64)
+    #define _lzcnt_u64 __lzcnt64
+  #endif
+#endif
+#endif
+
+#if defined(__WIN32__)
+#  ifndef NOMINMAX
+#  define NOMINMAX
+#  endif
+#  include <windows.h>
+#endif
+
+/* normally defined in pmmintrin.h, but we always need this */
+#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
+#define _MM_DENORMALS_ZERO_ON   (0x0040)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+#endif
+
+namespace embree
+{
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+  __forceinline size_t read_tsc()
+  {
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li);
+    return (size_t)li.QuadPart;
+  }
+
+  __forceinline int bsf(int v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return _tzcnt_u32(v);
+#else
+    unsigned long r = 0; _BitScanForward(&r,v); return r;
+#endif
+  }
+
+  __forceinline unsigned bsf(unsigned v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return _tzcnt_u32(v);
+#else
+    unsigned long r = 0; _BitScanForward(&r,v); return r;
+#endif
+  }
+
+#if defined(__X86_64__)
+  __forceinline size_t bsf(size_t v) {
+#if defined(__AVX2__)
+    return _tzcnt_u64(v);
+#else
+    unsigned long r = 0; _BitScanForward64(&r,v); return r;
+#endif
+  }
+#endif
+
+  __forceinline int bscf(int& v)
+  {
+    int i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+
+  __forceinline unsigned bscf(unsigned& v)
+  {
+    unsigned i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+
+#if defined(__X86_64__)
+  __forceinline size_t bscf(size_t& v)
+  {
+    size_t i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+#endif
+
+  __forceinline int bsr(int v) {
+#if defined(__AVX2__)  && !defined(__aarch64__)
+    return 31 - _lzcnt_u32(v);
+#else
+    unsigned long r = 0; _BitScanReverse(&r,v); return r;
+#endif
+  }
+
+  __forceinline unsigned bsr(unsigned v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return 31 - _lzcnt_u32(v);
+#else
+    unsigned long r = 0; _BitScanReverse(&r,v); return r;
+#endif
+  }
+
+#if defined(__X86_64__)
+  __forceinline size_t bsr(size_t v) {
+#if defined(__AVX2__)
+    return 63 -_lzcnt_u64(v);
+#else
+    unsigned long r = 0; _BitScanReverse64(&r, v); return r;
+#endif
+  }
+#endif
+
+  __forceinline int lzcnt(const int x)
+  {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return _lzcnt_u32(x);
+#else
+    if (unlikely(x == 0)) return 32;
+    return 31 - bsr(x);
+#endif
+  }
+
+  __forceinline int btc(int v, int i) {
+    long r = v; _bittestandcomplement(&r,i); return r;
+  }
+
+  __forceinline int bts(int v, int i) {
+    long r = v; _bittestandset(&r,i); return r;
+  }
+
+  __forceinline int btr(int v, int i) {
+    long r = v; _bittestandreset(&r,i); return r;
+  }
+
+#if defined(__X86_64__)
+
+  __forceinline size_t btc(size_t v, size_t i) {
+    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
+  }
+
+  __forceinline size_t bts(size_t v, size_t i) {
+    __int64 r = v; _bittestandset64(&r,i); return r;
+  }
+
+  __forceinline size_t btr(size_t v, size_t i) {
+    __int64 r = v; _bittestandreset64(&r,i); return r;
+  }
+
+#endif
+
+  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
+    return _InterlockedCompareExchange((volatile long*)p,v,c);
+  }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#else
+
+#if defined(__i386__) && defined(__PIC__)
+
+  __forceinline void __cpuid(int out[4], int op)
+  {
+    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
+                  "cpuid\n\t"
+                  "xchg{l}\t{%%}ebx, %1\n\t"
+                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
+                  : "0"(op));
+  }
+
+  __forceinline void __cpuid_count(int out[4], int op1, int op2)
+  {
+    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
+                  "cpuid\n\t"
+                  "xchg{l}\t{%%}ebx, %1\n\t"
+                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
+                  : "0" (op1), "2" (op2));
+  }
+
+#else
+
+  __forceinline void __cpuid(int out[4], int op) {
+#if defined(__ARM_NEON)
+    if (op == 0) { // Get CPU name: every word is 0x41524d20, i.e. the ASCII bytes 'A','R','M',' '
+      out[0] = 0x41524d20;
+      out[1] = 0x41524d20;
+      out[2] = 0x41524d20;
+      out[3] = 0x41524d20;
+    } // any other op leaves out[] unmodified on this path
+#else
+    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
+#endif
+  }
+
+#if !defined(__ARM_NEON)
+  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
+    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
+  }
+#endif
+
+#endif
+
+  __forceinline uint64_t read_tsc()  {
+#if defined(__ARM_NEON)
+    return 0; // FIXME(LTE): mimic rdtsc
+#else
+    uint32_t high,low;
+    asm volatile ("rdtsc" : "=d"(high), "=a"(low));
+    return (((uint64_t)high) << 32) + (uint64_t)low;
+#endif
+  }
+
+  __forceinline int bsf(int v) {
+#if defined(__ARM_NEON)
+    return __builtin_ctz(v);
+#else
+#if defined(__AVX2__)
+    return _tzcnt_u32(v);
+#else
+    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+#endif
+#endif
+  }
+
+#if defined(__X86_64__) || defined(__aarch64__)
+  __forceinline unsigned bsf(unsigned v)
+  {
+#if defined(__ARM_NEON)
+    return __builtin_ctz(v);
+#else
+#if defined(__AVX2__)
+    return _tzcnt_u32(v);
+#else
+    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+#endif
+#endif
+  }
+#endif
+
+  __forceinline size_t bsf(size_t v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+#if defined(__X86_64__)
+    return _tzcnt_u64(v);
+#else
+    return _tzcnt_u32(v);
+#endif
+#elif defined(__ARM_NEON)
+    return __builtin_ctzl(v);
+#else
+    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+#endif
+  }
+
+  __forceinline int bscf(int& v)
+  {
+    int i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+
+#if defined(__X86_64__) || defined(__aarch64__)
+  __forceinline unsigned int bscf(unsigned int& v)
+  {
+    unsigned int i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+#endif
+
+  __forceinline size_t bscf(size_t& v)
+  {
+    size_t i = bsf(v);
+    v &= v-1;
+    return i;
+  }
+
+  __forceinline int bsr(int v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return 31 - _lzcnt_u32(v);
+#elif defined(__ARM_NEON)
+    return __builtin_clz(v)^31;
+#else
+    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+#endif
+  }
+
+#if defined(__X86_64__) || defined(__aarch64__)
+  __forceinline unsigned bsr(unsigned v) {
+#if defined(__AVX2__)
+    return 31 - _lzcnt_u32(v);
+#elif defined(__ARM_NEON)
+    return __builtin_clz(v)^31;
+#else
+    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+#endif
+  }
+#endif
+
+  __forceinline size_t bsr(size_t v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+#if defined(__X86_64__)
+    return 63 - _lzcnt_u64(v);
+#else
+    return 31 - _lzcnt_u32(v);
+#endif
+#elif defined(__aarch64__)
+    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
+#else
+    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+#endif
+  }
+
+  __forceinline int lzcnt(const int x)
+  {
+#if defined(__AVX2__) && !defined(__aarch64__)
+    return _lzcnt_u32(x);
+#else
+    if (unlikely(x == 0)) return 32;
+    return 31 - bsr(x);
+#endif
+  }
+
+  __forceinline size_t blsr(size_t v) {
+#if defined(__AVX2__) && !defined(__aarch64__)
+#if defined(__INTEL_COMPILER)
+    return _blsr_u64(v);
+#else
+#if defined(__X86_64__)
+    return __blsr_u64(v);
+#else
+    return __blsr_u32(v);
+#endif
+#endif
+#else
+    return v & (v-1);
+#endif
+  }
+
+  __forceinline int btc(int v, int i) {
+#if defined(__aarch64__)
+    // Reference semantics of _bittestandcomplement(long *a, long b):
+    //   unsigned char x = (*a >> b) & 1;
+    //   *a = *a ^ (1 << b);
+    //   return x;
+
+    // We only need the updated `*a`: return v with bit i flipped
+    return (v ^ (1 << i));
+#else
+    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
+#endif
+  }
+
+  __forceinline int bts(int v, int i) {
+#if defined(__aarch64__)
+    // Returns v with bit i set. Reference semantics of _bittestandset(long *a, long b):
+    //   unsigned char x = (*a >> b) & 1;
+    //   *a = *a | (1 << b);
+    //   return x;   -- only the updated `*a` is needed, and the mask is (1 << i), not (v << i)
+    return (v | (1 << i));
+#else
+    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#endif
+  }
+
+  __forceinline int btr(int v, int i) {
+#if defined(__aarch64__)
+    // Returns v with bit i cleared. Reference semantics of _bittestandreset(long *a, long b):
+    //   unsigned char x = (*a >> b) & 1;
+    //   *a = *a & ~(1 << b);
+    //   return x;   -- only the updated `*a` is needed, and the mask is (1 << i), not (v << i)
+    return (v & ~(1 << i));
+#else
+    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#endif
+  }
+
+  __forceinline size_t btc(size_t v, size_t i) {
+#if defined(__aarch64__)
+    return (v ^ (size_t(1) << i)); // flip bit i; the mask is built in size_t so bit indices >= 32 are valid
+#else
+    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
+#endif
+  }
+
+  __forceinline size_t bts(size_t v, size_t i) {
+#if defined(__aarch64__)
+    return (v | (size_t(1) << i)); // set bit i; the mask is a single size_t bit, not v shifted by i
+#else
+    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#endif
+  }
+
+  __forceinline size_t btr(size_t v, size_t i) {
+#if defined(__ARM_NEON)
+    return (v & ~(size_t(1) << i)); // clear bit i; the mask is a single size_t bit, not v shifted by i
+#else
+    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+#endif
+  }
+
+  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
+    return __sync_val_compare_and_swap(value, comparand, input);
+  }
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__clang__) || defined(__GNUC__)
+#if !defined(_mm_undefined_ps)
+  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
+#endif
+#if !defined(_mm_undefined_si128)
+  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
+#endif
+#if !defined(_mm256_undefined_ps) && defined(__AVX__)
+  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
+#endif
+#if !defined(_mm256_undefined_si256) && defined(__AVX__)
+  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
+#endif
+#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
+  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
+#endif
+#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
+  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
+#endif
+#endif
+
+#if defined(__SSE4_2__) || defined(__ARM_NEON)
+
+  __forceinline int popcnt(int in) {
+    return _mm_popcnt_u32(in);
+  }
+
+  __forceinline unsigned popcnt(unsigned in) {
+    return _mm_popcnt_u32(in);
+  }
+
+#if defined(__X86_64__) || defined(__ARM_NEON)
+  __forceinline size_t popcnt(size_t in) {
+    return _mm_popcnt_u64(in);
+  }
+#endif
+
+#endif
+
+  __forceinline uint64_t rdtsc()
+  {
+    int dummy[4];
+    __cpuid(dummy,0);
+    uint64_t clock = read_tsc();
+    __cpuid(dummy,0);
+    return clock;
+  }
+
+  __forceinline void pause_cpu(const size_t N = 8)
+  {
+    for (size_t i=0; i<N; i++)
+      _mm_pause();
+  }
+
+  /* prefetches */
+  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
+  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
+  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
+  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }
+  __forceinline void prefetchEX (const void* ptr) {
+#if defined(__INTEL_COMPILER)
+    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
+#else
+    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
+#endif
+  }
+
+  __forceinline void prefetchL1EX(const void* ptr) {
+    prefetchEX(ptr);
+  }
+
+  __forceinline void prefetchL2EX(const void* ptr) {
+    prefetchEX(ptr);
+  }
+#if defined(__AVX2__) && !defined(__aarch64__)
+   __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
+   __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
+#if defined(__X86_64__)
+   __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
+   __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
+#endif
+#endif
+
+#if defined(__AVX512F__)
+#if defined(__INTEL_COMPILER)
+   __forceinline float mm512_cvtss_f32(__m512 v) {
+     return _mm512_cvtss_f32(v);
+   }
+   __forceinline int mm512_mask2int(__mmask16 k1) {
+     return _mm512_mask2int(k1);
+   }
+   __forceinline __mmask16 mm512_int2mask(int mask) {
+     return _mm512_int2mask(mask);
+   }
+#else
+   __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
+     return _mm_cvtss_f32(_mm512_castps512_ps128(v));
+   }
+   __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
+     return (int)k1;
+   }
+   __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
+     return (__mmask16)mask;
+   }
+#endif
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/sys/library.cpp b/thirdparty/embree-aarch64/common/sys/library.cpp
new file mode 100644
index 0000000000..899267a1e4
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/library.cpp
@@ -0,0 +1,83 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "library.h"
+#include "sysinfo.h"
+#include "filename.h"
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  /* opens a shared library */
+  lib_t openLibrary(const std::string& file)
+  {
+    std::string fullName = file+".dll";
+    FileName executable = getExecutableFileName();
+    HANDLE handle = LoadLibrary((executable.path() + fullName).c_str());
+    return lib_t(handle);
+  }
+
+  /* returns address of a symbol from the library */
+  void* getSymbol(lib_t lib, const std::string& sym) {
+    return reinterpret_cast<void *>(GetProcAddress(HMODULE(lib),sym.c_str()));
+  }
+
+  /* closes the shared library */
+  void closeLibrary(lib_t lib) {
+    FreeLibrary(HMODULE(lib));
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <dlfcn.h>
+
+namespace embree
+{
+  /* opens shared library lib<file>.so (.dylib on macOS): first by bare file name, then from the executable's folder */
+  lib_t openLibrary(const std::string& file)
+  {
+#if defined(__MACOSX__)
+    std::string fullName = "lib"+file+".dylib";
+#else
+    std::string fullName = "lib"+file+".so";
+#endif
+    void* lib = dlopen(fullName.c_str(), RTLD_NOW);
+    if (lib) return lib_t(lib);
+    FileName executable = getExecutableFileName();
+    lib = dlopen((executable.path() + fullName).c_str(),RTLD_NOW);
+    if (lib == nullptr) {
+      const char* error = dlerror();
+      if (error) {
+        THROW_RUNTIME_ERROR(error);
+      } else {
+        THROW_RUNTIME_ERROR("could not load library "+fullName); // report the library that failed, not the executable path
+      }
+    }
+    return lib_t(lib);
+  }
+
+  /* returns address of a symbol from the library */
+  void* getSymbol(lib_t lib, const std::string& sym) {
+    return dlsym(lib,sym.c_str());
+  }
+
+  /* closes the shared library */
+  void closeLibrary(lib_t lib) {
+    dlclose(lib);
+  }
+}
+#endif
diff --git a/thirdparty/embree-aarch64/common/sys/library.h b/thirdparty/embree-aarch64/common/sys/library.h
new file mode 100644
index 0000000000..c2164e9fbe
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/library.h
@@ -0,0 +1,21 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+namespace embree
+{
+  /*! type for shared library */
+  typedef struct opaque_lib_t* lib_t;
+
+  /*! loads a shared library */
+  lib_t openLibrary(const std::string& file);
+
+  /*! returns address of a symbol from the library */
+  void* getSymbol(lib_t lib, const std::string& sym);
+
+  /*! unloads a shared library */
+  void closeLibrary(lib_t lib);
+}
diff --git a/thirdparty/embree-aarch64/common/sys/mutex.cpp b/thirdparty/embree-aarch64/common/sys/mutex.cpp
new file mode 100644
index 0000000000..11779bc9b9
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/mutex.cpp
@@ -0,0 +1,58 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "mutex.h"
+#include "regression.h"
+
+#if defined(__WIN32__) && !defined(PTHREADS_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  MutexSys::MutexSys() { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); }
+  MutexSys::~MutexSys() { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete (CRITICAL_SECTION*)mutex; }
+  void MutexSys::lock() { EnterCriticalSection((CRITICAL_SECTION*)mutex); }
+  bool MutexSys::try_lock() { return TryEnterCriticalSection((CRITICAL_SECTION*)mutex) != 0; }
+  void MutexSys::unlock() { LeaveCriticalSection((CRITICAL_SECTION*)mutex); }
+}
+#endif
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+#include <pthread.h>
+namespace embree
+{
+  /*! system mutex using pthreads */
+  MutexSys::MutexSys() 
+  { 
+    mutex = new pthread_mutex_t; 
+    if (pthread_mutex_init((pthread_mutex_t*)mutex, nullptr) != 0)
+      THROW_RUNTIME_ERROR("pthread_mutex_init failed");
+  }
+  
+  MutexSys::~MutexSys() 
+  { 
+    MAYBE_UNUSED bool ok = pthread_mutex_destroy((pthread_mutex_t*)mutex) == 0;
+    assert(ok);
+    delete (pthread_mutex_t*)mutex; 
+    mutex = nullptr;
+  }
+  
+  void MutexSys::lock() 
+  { 
+    if (pthread_mutex_lock((pthread_mutex_t*)mutex) != 0) 
+      THROW_RUNTIME_ERROR("pthread_mutex_lock failed");
+  }
+  
+  bool MutexSys::try_lock() { 
+    return pthread_mutex_trylock((pthread_mutex_t*)mutex) == 0;
+  }
+  
+  void MutexSys::unlock() 
+  { 
+    if (pthread_mutex_unlock((pthread_mutex_t*)mutex) != 0)
+      THROW_RUNTIME_ERROR("pthread_mutex_unlock failed");
+  }
+};
+#endif
diff --git a/thirdparty/embree-aarch64/common/sys/mutex.h b/thirdparty/embree-aarch64/common/sys/mutex.h
new file mode 100644
index 0000000000..1164210f23
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/mutex.h
@@ -0,0 +1,98 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "intrinsics.h"
+#include "atomic.h"
+
+namespace embree
+{
+  /*! system mutex */
+  class MutexSys {
+    friend struct ConditionImplementation;
+  public:
+    MutexSys();
+    ~MutexSys();
+
+  private:
+    MutexSys (const MutexSys& other) DELETED; // do not implement
+    MutexSys& operator= (const MutexSys& other) DELETED; // do not implement
+
+  public:
+    void lock();
+    bool try_lock();
+    void unlock();
+
+  protected:
+    void* mutex;
+  };
+
+  /*! spinning mutex */
+  class SpinLock
+  {
+  public:
+ 
+    SpinLock ()
+      : flag(false) {}
+
+    __forceinline bool isLocked() {
+      return flag.load();
+    }
+
+    __forceinline void lock()
+    {
+      while (true) 
+      {
+        while (flag.load()) 
+        {
+          _mm_pause(); 
+          _mm_pause();
+        }
+        
+        bool expected = false;
+        if (flag.compare_exchange_strong(expected,true,std::memory_order_acquire))
+          break;
+      }
+    }
+    
+    __forceinline bool try_lock()
+    {
+      bool expected = false;
+      if (flag.load() != expected) {
+        return false;
+      }
+      return flag.compare_exchange_strong(expected,true,std::memory_order_acquire);
+    }
+
+    __forceinline void unlock() {
+      flag.store(false,std::memory_order_release);
+    }
+    
+    __forceinline void wait_until_unlocked() 
+    {
+      while(flag.load())
+      {
+        _mm_pause(); 
+        _mm_pause();
+      }
+    }
+
+  public:
+    atomic<bool> flag;
+  };
+
+  /*! safe mutex lock and unlock helper */
+  template<typename Mutex> class Lock {
+  public:
+    Lock (Mutex& mutex) : mutex(mutex), locked(true) { mutex.lock(); }
+    Lock (Mutex& mutex, bool locked) : mutex(mutex), locked(locked) {}
+    ~Lock() { if (locked) mutex.unlock(); }
+    __forceinline void lock() { assert(!locked); locked = true; mutex.lock(); }
+    __forceinline bool isLocked() const { return locked; }
+  protected:
+    Mutex& mutex;
+    bool locked;
+  };
+}
diff --git a/thirdparty/embree-aarch64/common/sys/platform.h b/thirdparty/embree-aarch64/common/sys/platform.h
new file mode 100644
index 0000000000..737f14aa6e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/platform.h
@@ -0,0 +1,387 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define _CRT_SECURE_NO_WARNINGS
+
+#include <cstddef>
+#include <cassert>
+#include <cstdlib>
+#include <cstdio>
+#include <memory>
+#include <stdexcept>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <string>
+#include <cstring>
+#include <stdint.h>
+#include <functional>
+
+////////////////////////////////////////////////////////////////////////////////
+/// detect platform
+////////////////////////////////////////////////////////////////////////////////
+
+/* detect 32 or 64 platform */
+#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#define __X86_64__
+#endif
+
+/* detect Linux platform */
+#if defined(linux) || defined(__linux__) || defined(__LINUX__)
+#  if !defined(__LINUX__)
+#     define __LINUX__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect FreeBSD platform */
+#if defined(__FreeBSD__) || defined(__FREEBSD__)
+#  if !defined(__FREEBSD__)
+#     define __FREEBSD__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect Windows 95/98/NT/2000/XP/Vista/7/8/10 platform */
+#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__)
+#  if !defined(__WIN32__)
+#     define __WIN32__
+#  endif
+#endif
+
+/* detect Cygwin platform */
+#if defined(__CYGWIN__)
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect MAC OS X platform */
+#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__)
+#  if !defined(__MACOSX__)
+#     define __MACOSX__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* try to detect other Unix systems */
+#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix)
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Macros
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef __WIN32__
+#define dll_export __declspec(dllexport)
+#define dll_import __declspec(dllimport)
+#else
+#define dll_export __attribute__ ((visibility ("default")))
+#define dll_import
+#endif
+
+#ifdef __WIN32__
+#if !defined(__noinline)
+#define __noinline             __declspec(noinline)
+#endif
+//#define __forceinline        __forceinline
+//#define __restrict           __restrict
+#if defined(__INTEL_COMPILER)
+#define __restrict__           __restrict
+#else
+#define __restrict__           //__restrict // causes issues with MSVC
+#endif
+#if !defined(__thread)
+// NOTE: Require `-fms-extensions` for clang
+#define __thread               __declspec(thread)
+#endif
+#if !defined(__aligned)
+#if defined(__MINGW32__)
+#define __aligned(...)           __attribute__((aligned(__VA_ARGS__)))
+#else
+#define __aligned(...)           __declspec(align(__VA_ARGS__))
+#endif
+#endif
+//#define __FUNCTION__           __FUNCTION__
+#define debugbreak()           __debugbreak()
+
+#else
+#if !defined(__noinline)
+#define __noinline             __attribute__((noinline))
+#endif
+#if !defined(__forceinline)
+#define __forceinline          inline __attribute__((always_inline))
+#endif
+//#define __restrict             __restrict
+//#define __thread               __thread
+#if !defined(__aligned)
+#define __aligned(...)           __attribute__((aligned(__VA_ARGS__)))
+#endif
+#if !defined(__FUNCTION__)
+#define __FUNCTION__           __PRETTY_FUNCTION__
+#endif
+#define debugbreak()           asm ("int $3")
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+  #define MAYBE_UNUSED __attribute__((unused))
+#else
+  #define MAYBE_UNUSED
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900) // before VS2015 deleted functions are not supported properly
+  #define DELETED
+#else
+  #define DELETED  = delete
+#endif
+
+// -- GODOT start --
+#ifndef likely
+// -- GODOT end --
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#define   likely(expr) (expr)
+#define unlikely(expr) (expr)
+#else
+#define   likely(expr) __builtin_expect((bool)(expr),true )
+#define unlikely(expr) __builtin_expect((bool)(expr),false)
+#endif
+// -- GODOT start --
+#endif
+// -- GODOT end --
+
+////////////////////////////////////////////////////////////////////////////////
+/// Error handling and debugging
+////////////////////////////////////////////////////////////////////////////////
+
+/* debug printing macros */
+#define STRING(x) #x
+#define TOSTRING(x) STRING(x)
+#define PING embree_cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << embree_endl
+#define PRINT(x) embree_cout << STRING(x) << " = " << (x) << embree_endl
+#define PRINT2(x,y) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << embree_endl
+#define PRINT3(x,y,z) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << embree_endl
+#define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
+
+#if defined(DEBUG) // only report file and line in debug mode; the message goes to stderr, then the process aborts
+  // -- GODOT start --
+  // #define THROW_RUNTIME_ERROR(str)
+  //   throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+  #define THROW_RUNTIME_ERROR(str) \
+    fprintf(stderr, "%s (%d): %s\n", __FILE__, __LINE__, std::string(str).c_str()), abort();
+  // -- GODOT end --
+#else // release: abort without printing; the str argument is not evaluated
+  // -- GODOT start --
+  // #define THROW_RUNTIME_ERROR(str)
+  //   throw std::runtime_error(str);
+  #define THROW_RUNTIME_ERROR(str) \
+    abort();
+  // -- GODOT end --
+#endif
+
+#define FATAL(x)   THROW_RUNTIME_ERROR(x)
+#define WARNING(x) { std::cerr << "Warning: " << x << embree_endl << std::flush; }
+
+#define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented")
+
+////////////////////////////////////////////////////////////////////////////////
+/// Basic types
+////////////////////////////////////////////////////////////////////////////////
+
+/* default floating-point type */
+namespace embree {
+  typedef float real;
+}
+
+/* windows does not have ssize_t */
+#if defined(__WIN32__)
+#if defined(__X86_64__) || defined(__aarch64__)
+typedef int64_t ssize_t;
+#else
+typedef int32_t ssize_t;
+#endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Basic utility functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline std::string toString(long long value) {
+  return std::to_string(value);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Disable some compiler warnings
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__INTEL_COMPILER)
+//#pragma warning(disable:265 ) // floating-point operation result is out of range
+//#pragma warning(disable:383 ) // value copied to temporary, reference to temporary used
+//#pragma warning(disable:869 ) // parameter was never referenced
+//#pragma warning(disable:981 ) // operands are evaluated in unspecified order
+//#pragma warning(disable:1418) // external function definition with no prior declaration
+//#pragma warning(disable:1419) // external declaration in primary source file
+//#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable
+//#pragma warning(disable:94  ) // the size of an array must be greater than zero
+//#pragma warning(disable:1599) // declaration hides parameter
+//#pragma warning(disable:424 ) // extra ";" ignored
+#pragma warning(disable:2196) // routine is both "inline" and "noinline"
+//#pragma warning(disable:177 ) // label was declared but never referenced
+//#pragma warning(disable:114 ) // function was referenced but not defined
+//#pragma warning(disable:819 ) // template nesting depth does not match the previous declaration of function
+#pragma warning(disable:15335)  // was not vectorized: vectorization possible but seems inefficient
+#endif
+
+#if defined(_MSC_VER)
+//#pragma warning(disable:4200) // nonstandard extension used : zero-sized array in struct/union
+#pragma warning(disable:4800) // forcing value to bool 'true' or 'false' (performance warning)
+//#pragma warning(disable:4267) // '=' : conversion from 'size_t' to 'unsigned long', possible loss of data
+#pragma warning(disable:4244) // 'argument' : conversion from 'ssize_t' to 'unsigned int', possible loss of data
+//#pragma warning(disable:4355) // 'this' : used in base member initializer list
+//#pragma warning(disable:391 ) // '<=' : signed / unsigned mismatch
+//#pragma warning(disable:4018) // '<' : signed / unsigned mismatch
+//#pragma warning(disable:4305) // 'initializing' : truncation from 'double' to 'float'
+//#pragma warning(disable:4068) // unknown pragma
+//#pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned
+//#pragma warning(disable:4838) // conversion from 'unsigned int' to 'const int' requires a narrowing conversion)
+//#pragma warning(disable:4227) // anachronism used : qualifiers on reference are ignored
+#pragma warning(disable:4503) // decorated name length exceeded, name was truncated
+#pragma warning(disable:4180) // qualifier applied to function type has no meaning; ignored
+#pragma warning(disable:4258) // definition from the for loop is ignored; the definition from the enclosing scope is used
+
+#  if _MSC_VER < 1910 // prior to Visual studio 2017 (V141)
+#    pragma warning(disable:4101) // warning C4101: 'x': unreferenced local variable // a compiler bug issues wrong warnings
+#    pragma warning(disable:4789) // buffer '' of size 8 bytes will be overrun; 32 bytes will be written starting at offset 0
+#  endif
+
+#endif
+
+#if defined(__clang__) && !defined(__INTEL_COMPILER)
+//#pragma clang diagnostic ignored "-Wunknown-pragmas"
+//#pragma clang diagnostic ignored "-Wunused-variable"
+//#pragma clang diagnostic ignored "-Wreorder"
+//#pragma clang diagnostic ignored "-Wmicrosoft"
+//#pragma clang diagnostic ignored "-Wunused-private-field"
+//#pragma clang diagnostic ignored "-Wunused-local-typedef"
+//#pragma clang diagnostic ignored "-Wunused-function"
+//#pragma clang diagnostic ignored "-Wnarrowing"
+//#pragma clang diagnostic ignored "-Wc++11-narrowing"
+//#pragma clang diagnostic ignored "-Wdeprecated-register"
+//#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+#pragma GCC diagnostic ignored "-Wpragmas"
+//#pragma GCC diagnostic ignored "-Wnarrowing"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+//#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+//#pragma GCC diagnostic ignored "-Warray-bounds"
+#pragma GCC diagnostic ignored "-Wattributes"
+#pragma GCC diagnostic ignored "-Wmisleading-indentation"
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+
+#if defined(__clang__) && defined(__WIN32__)
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#pragma clang diagnostic ignored "-Wmicrosoft-cast"
+#pragma clang diagnostic ignored "-Wmicrosoft-enum-value"
+#pragma clang diagnostic ignored "-Wmicrosoft-include"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wunknown-pragmas"
+#endif
+
+/* macros to disable and re-enable deprecation warnings; use them only around intentional calls to deprecated Embree API functions */
+#if defined(__WIN32__) && defined(__INTEL_COMPILER)
+#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 1478)) // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING  __pragma(warning (enable:  1478)) // warning: function was declared deprecated
+#elif defined(__INTEL_COMPILER)
+#define DISABLE_DEPRECATED_WARNING _Pragma("warning (disable: 1478)") // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING  _Pragma("warning (enable : 1478)") // warning: function was declared deprecated
+#elif defined(__clang__)
+#define DISABLE_DEPRECATED_WARNING _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#define ENABLE_DEPRECATED_WARNING  _Pragma("clang diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#elif defined(__GNUC__)
+#define DISABLE_DEPRECATED_WARNING _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#define ENABLE_DEPRECATED_WARNING  _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"") // warning: xxx is deprecated
+#elif defined(_MSC_VER)
+#define DISABLE_DEPRECATED_WARNING __pragma(warning (disable: 4996)) // warning: function was declared deprecated
+#define ENABLE_DEPRECATED_WARNING  __pragma(warning (enable : 4996)) // warning: function was declared deprecated
+#endif
+
+/* embree output stream */
+#define embree_ostream std::ostream&
+#define embree_cout std::cout
+#define embree_cout_uniform std::cout
+#define embree_endl std::endl
+  
+////////////////////////////////////////////////////////////////////////////////
+/// Marker macros for static profiling with IACA (Intel Architecture Code Analyzer)
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__GNUC__)
+#define IACA_SSC_MARK( MARK_ID )						\
+__asm__ __volatile__ (									\
+					  "\n\t  movl $"#MARK_ID", %%ebx"	\
+					  "\n\t  .byte 0x64, 0x67, 0x90"	\
+					  : : : "memory" );
+
+#define IACA_UD_BYTES __asm__ __volatile__ ("\n\t .byte 0x0F, 0x0B");
+
+#else
+#define IACA_UD_BYTES {__asm _emit 0x0F \
+	__asm _emit 0x0B}
+
+#define IACA_SSC_MARK(x) {__asm  mov ebx, x\
+	__asm  _emit 0x64 \
+	__asm  _emit 0x67 \
+	__asm  _emit 0x90 }
+
+#define IACA_VC64_START __writegsbyte(111, 111);
+#define IACA_VC64_END   __writegsbyte(222, 222);
+
+#endif
+
+#define IACA_START {IACA_UD_BYTES \
+					IACA_SSC_MARK(111)}
+#define IACA_END {IACA_SSC_MARK(222) \
+					IACA_UD_BYTES}
+
+namespace embree
+{
+  template<typename Closure> // scope guard: runs the stored closure on destruction unless it was deactivated
+    struct OnScopeExitHelper
+  {
+    OnScopeExitHelper (const Closure f) : active(true), f(f) {}
+    OnScopeExitHelper (OnScopeExitHelper&& other) : active(other.active), f(other.f) { other.active = false; } // disarm the source, else f runs twice when the by-value return is not elided
+    ~OnScopeExitHelper() { if (active) f(); } // fires only while this guard is still armed
+    void deactivate() { active = false; }     // cancels the pending call
+    bool active; const Closure f;             // armed flag and the closure to run
+  };
+
+  template <typename Closure> // factory, so the closure type is deduced at the call site
+    OnScopeExitHelper<Closure> OnScopeExit(const Closure f) {
+    return OnScopeExitHelper<Closure>(f); // the returned helper calls f from its destructor while it is active
+  }
+
+#define STRING_JOIN2(arg1, arg2) DO_STRING_JOIN2(arg1, arg2) // extra level so that arguments such as __LINE__ are expanded before pasting
+#define DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2 // pastes the two tokens together
+#define ON_SCOPE_EXIT(code)                                             \
+  auto STRING_JOIN2(on_scope_exit_, __LINE__) = OnScopeExit([&](){code;}) // declares a guard named after the current line that runs `code` when destroyed
+
+  template<typename Ty> // wraps an existing raw pointer; unlike std::make_unique it does not construct an object
+    std::unique_ptr<Ty> make_unique(Ty* ptr) {
+    return std::unique_ptr<Ty>(ptr); // the returned unique_ptr takes ownership of ptr
+  }
+
+}
diff --git a/thirdparty/embree-aarch64/common/sys/ref.h b/thirdparty/embree-aarch64/common/sys/ref.h
new file mode 100644
index 0000000000..24648e6234
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/ref.h
@@ -0,0 +1,122 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "atomic.h"
+
+namespace embree
+{
+  struct NullTy { // empty tag type; Ref accepts it to construct, assign or compare against an empty reference
+  };
+
+  extern MAYBE_UNUSED NullTy null; // declaration only; the single tag object is defined in a source file
+  
+  class RefCount // base class carrying an atomic reference counter; the object deletes itself when the count drops from 1
+  {
+  public:
+    RefCount(int val = 0) : refCounter(val) {} // val is the initial reference count
+    virtual ~RefCount() {}
+  
+    virtual RefCount* refInc() { ++refCounter; return this; } // returns this so the call can be chained
+    virtual void refDec() { const size_t previous = refCounter.fetch_sub(1); if (previous == 1) delete this; } // previous == 1: the last reference was just released
+  private:
+    std::atomic<size_t> refCounter; // number of references currently held
+  };
+  
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Reference to single object
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename Type>
+  class Ref // intrusive smart pointer: holds one reference on *ptr through Type::refInc() / Type::refDec()
+  {
+  public:
+    Type* ptr; // referenced object, nullptr when the reference is empty
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Constructors, Assignment & Cast Operators
+    ////////////////////////////////////////////////////////////////////////////////
+
+    __forceinline Ref() : ptr(nullptr) {}
+    __forceinline Ref(NullTy) : ptr(nullptr) {}
+    __forceinline Ref(const Ref& input) : ptr(input.ptr) { if (ptr) ptr->refInc(); }
+    __forceinline Ref(Ref&& input) : ptr(input.ptr) { input.ptr = nullptr; } // takes over input's reference; no count change
+
+    __forceinline Ref(Type* const input) : ptr(input)
+    {
+      if (ptr)
+        ptr->refInc();
+    }
+
+    __forceinline ~Ref()
+    {
+      if (ptr)
+        ptr->refDec();
+    }
+
+    __forceinline Ref& operator =(const Ref& input)
+    {
+      Type* const incoming = input.ptr; // read up front: input may be a member of the object released below (a = a->next)
+      if (incoming) incoming->refInc(); // retain before releasing, which also keeps self-assignment safe
+      Type* const released = ptr;
+      ptr = incoming;
+      if (released) released->refDec(); // last step: this may destroy the old object together with input
+      return *this;
+    }
+
+    __forceinline Ref& operator =(Ref&& input)
+    {
+      Type* const released = ptr;
+      ptr = input.ptr;                  // adopt input's reference without touching its count
+      input.ptr = nullptr;              // on self-move this empties *this, exactly as before
+      if (released) released->refDec(); // release only after input is no longer accessed
+      return *this;
+    }
+
+    __forceinline Ref& operator =(Type* const input)
+    {
+      if (input)
+        input->refInc();                // retain first: input may be the object already held
+      Type* const released = ptr;
+      ptr = input;
+      if (released) released->refDec(); // release last, once ptr already holds the new value
+      return *this;
+    }
+
+    __forceinline Ref& operator =(NullTy)
+    {
+      Type* const released = ptr;
+      ptr = nullptr;                    // clear first so ptr is never written after the release
+      if (released) released->refDec();
+      return *this;
+    }
+
+    __forceinline operator bool() const { return ptr != nullptr; }
+
+    __forceinline const Type& operator  *() const { return *ptr; }
+    __forceinline       Type& operator  *()       { return *ptr; }
+    __forceinline const Type* operator ->() const { return  ptr; }
+    __forceinline       Type* operator ->()       { return  ptr; }
+
+    template<typename TypeOut>
+    __forceinline       Ref<TypeOut> cast()       { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); }
+    template<typename TypeOut>
+    __forceinline const Ref<TypeOut> cast() const { return Ref<TypeOut>(static_cast<TypeOut*>(ptr)); }
+
+    template<typename TypeOut>
+    __forceinline       Ref<TypeOut> dynamicCast()       { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); }
+    template<typename TypeOut>
+    __forceinline const Ref<TypeOut> dynamicCast() const { return Ref<TypeOut>(dynamic_cast<TypeOut*>(ptr)); }
+  };
+
+  template<typename Type> __forceinline bool operator < (const Ref<Type>& a, const Ref<Type>& b) { return a.ptr   <  b.ptr;   } // orders references by the address they hold
+
+  template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, NullTy            ) { return a.ptr   == nullptr; } // comparing with the null tag tests for an empty reference
+  template<typename Type> __forceinline bool operator ==(NullTy            , const Ref<Type>& b) { return nullptr == b.ptr;   }
+  template<typename Type> __forceinline bool operator ==(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr   == b.ptr;   } // equal when both hold the same address
+
+  template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, NullTy            ) { return a.ptr   != nullptr; }
+  template<typename Type> __forceinline bool operator !=(NullTy            , const Ref<Type>& b) { return nullptr != b.ptr;   }
+  template<typename Type> __forceinline bool operator !=(const Ref<Type>& a, const Ref<Type>& b) { return a.ptr   != b.ptr;   }
+}
diff --git a/thirdparty/embree-aarch64/common/sys/regression.cpp b/thirdparty/embree-aarch64/common/sys/regression.cpp
new file mode 100644
index 0000000000..d95ff8dfe0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/regression.cpp
@@ -0,0 +1,30 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "regression.h"
+
+namespace embree
+{
+  /* Returns the registry of all regression tests. registerRegressionTest
+   * is invoked from static initializers, so the registry cannot be a
+   * global static variable (its initialization order relative to those
+   * initializers is a problem); a function-local static is used instead. */
+  std::vector<RegressionTest*>& get_regression_tests()
+  {
+    static std::vector<RegressionTest*> regression_tests; // initialized the first time this function is called
+    return regression_tests;
+  } 
+
+  void registerRegressionTest(RegressionTest* test) 
+  {
+    get_regression_tests().push_back(test); // appends the raw pointer to the registry; nothing in this file deletes it
+  }
+
+  RegressionTest* getRegressionTest(size_t index)
+  {
+    /* look the registry up once; an index past the end yields nullptr
+     * rather than an out-of-range element access */
+    std::vector<RegressionTest*>& tests = get_regression_tests();
+    return (index < tests.size()) ? tests[index] : nullptr;
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/sys/regression.h b/thirdparty/embree-aarch64/common/sys/regression.h
new file mode 100644
index 0000000000..632f8d92cf
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/regression.h
@@ -0,0 +1,25 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+
+#include <vector>
+
+namespace embree
+{
+  /*! virtual interface for all regression tests */
+  struct RegressionTest 
+  { 
+    RegressionTest (std::string name) : name(name) {} // stores a copy of the given test name
+    virtual bool run() = 0; // implemented by each concrete test; NOTE(review): presumably true means the test passed - confirm
+    std::string name; // NOTE(review): the struct declares no virtual destructor, so tests should not be deleted through a RegressionTest*
+  };
+ 
+  /*! registers a regression test */
+  void registerRegressionTest(RegressionTest* test);
+
+  /*! returns the regression test registered at the given index, or nullptr if the index is out of range */
+  RegressionTest* getRegressionTest(size_t index);
+}
diff --git a/thirdparty/embree-aarch64/common/sys/string.cpp b/thirdparty/embree-aarch64/common/sys/string.cpp
new file mode 100644
index 0000000000..931244383e
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/string.cpp
@@ -0,0 +1,42 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "string.h"
+
+#include <algorithm>
+#include <ctype.h>
+
+namespace embree
+{
+  char to_lower(char c) { return char(tolower(static_cast<unsigned char>(c))); } // go through unsigned char: tolower() is undefined for negative values other than EOF
+  char to_upper(char c) { return char(toupper(static_cast<unsigned char>(c))); } // same precaution for toupper()
+  std::string toLowerCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_lower); return dst; } // returns a lower-cased copy of s
+  std::string toUpperCase(const std::string& s) { std::string dst(s); std::transform(dst.begin(), dst.end(), dst.begin(), to_upper); return dst; } // returns an upper-cased copy of s
+
+  Vec2f string_to_Vec2f ( std::string str ) // parses two floats from str; one character after the first number is skipped as separator
+  {
+    size_t next = 0;
+    const float x = std::stof(str,&next); str = str.substr(next+1); // next = characters consumed by stof; substr throws std::out_of_range if nothing follows the separator position
+    const float y = std::stof(str,&next);
+    return Vec2f(x,y);
+  }
+  
+  Vec3f string_to_Vec3f ( std::string str ) // parses three floats from str; one character after each of the first two numbers is skipped as separator
+  {
+    size_t next = 0;
+    const float x = std::stof(str,&next); str = str.substr(next+1); // next = characters consumed by stof; the remainder starts one character later
+    const float y = std::stof(str,&next); str = str.substr(next+1);
+    const float z = std::stof(str,&next); 
+    return Vec3f(x,y,z);
+  }
+  
+  Vec4f string_to_Vec4f ( std::string str ) // parses four floats from str; one character after each of the first three numbers is skipped as separator
+  {
+    size_t next = 0;
+    const float x = std::stof(str,&next); str = str.substr(next+1); // next = characters consumed by stof; the remainder starts one character later
+    const float y = std::stof(str,&next); str = str.substr(next+1);
+    const float z = std::stof(str,&next); str = str.substr(next+1);
+    const float w = std::stof(str,&next);
+    return Vec4f(x,y,z,w);
+  }
+}
diff --git a/thirdparty/embree-aarch64/common/sys/string.h b/thirdparty/embree-aarch64/common/sys/string.h
new file mode 100644
index 0000000000..2e9b0f88c3
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/string.h
@@ -0,0 +1,37 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "../math/vec2.h"
+#include "../math/vec3.h"
+#include "../math/vec4.h"
+
+namespace embree
+{
+  class IOStreamStateRestorer // scope guard that restores a stream's format flags and precision on destruction
+  {
+  public:
+    /* remembers the format flags and precision the stream has right now */
+    IOStreamStateRestorer(std::ostream& iostream)
+      : stream_(iostream), savedFlags_(stream_.flags()), savedPrecision_(stream_.precision()) {}
+
+    /* puts the remembered flags and precision back on the stream */
+    ~IOStreamStateRestorer() {
+      stream_.flags(savedFlags_);
+      stream_.precision(savedPrecision_);
+    }
+  private:
+    std::ostream& stream_;
+    std::ios::fmtflags savedFlags_;
+    std::streamsize savedPrecision_;
+  };
+
+  std::string toLowerCase(const std::string& s);
+  std::string toUpperCase(const std::string& s);
+
+  Vec2f string_to_Vec2f ( std::string str );
+  Vec3f string_to_Vec3f ( std::string str );
+  Vec4f string_to_Vec4f ( std::string str );
+}
diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.cpp b/thirdparty/embree-aarch64/common/sys/sysinfo.cpp
new file mode 100644
index 0000000000..1d11436770
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/sysinfo.cpp
@@ -0,0 +1,676 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sysinfo.h"
+#include "intrinsics.h"
+#include "string.h"
+#include "ref.h"
+#if defined(__FREEBSD__)
+#include <sys/cpuset.h>
+#include <pthread_np.h>
+typedef cpuset_t cpu_set_t;
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// All Platforms
+////////////////////////////////////////////////////////////////////////////////
+
+namespace embree
+{
+  NullTy null;
+
+  std::string getPlatformName() // compile-time label for the target OS and word size; the first matching branch wins
+  {
+#if defined(__LINUX__) && defined(__ANDROID__) && defined(__aarch64__) && defined(__ARM_NEON)
+    return "Android Linux (aarch64 / arm64)";
+#elif defined(__LINUX__) && defined(__ANDROID__) && defined(__X86_64__)
+    return "Android Linux (x64)";
+#elif defined(__LINUX__) && defined(__ANDROID__) && (defined(_X86_) || defined(__X86__) || defined(_M_IX86))
+    return "Android Linux (x86)";
+#elif defined(__LINUX__) && !defined(__X86_64__) && !defined(__aarch64__) // aarch64 is a 64-bit target, keep it out of the 32-bit branch
+    return "Linux (32bit)";
+#elif defined(__LINUX__) && (defined(__X86_64__) || defined(__aarch64__))
+    return "Linux (64bit)";
+#elif defined(__FREEBSD__) && !defined(__X86_64__) && !defined(__aarch64__)
+    return "FreeBSD (32bit)";
+#elif defined(__FREEBSD__) && (defined(__X86_64__) || defined(__aarch64__))
+    return "FreeBSD (64bit)";
+#elif defined(__CYGWIN__) && !defined(__X86_64__)
+    return "Cygwin (32bit)";
+#elif defined(__CYGWIN__) && defined(__X86_64__)
+    return "Cygwin (64bit)";
+#elif defined(__WIN32__) && !defined(__X86_64__)
+    return "Windows (32bit)";
+#elif defined(__WIN32__) && defined(__X86_64__)
+    return "Windows (64bit)";
+#elif defined(TARGET_IPHONE_SIMULATOR) && defined(__X86_64__)
+    return "iOS Simulator (x64)";
+#elif defined(TARGET_OS_IPHONE) && defined(__aarch64__) && defined(__ARM_NEON)
+    return "iOS (aarch64 / arm64)";
+#elif defined(__MACOSX__) && !defined(__X86_64__) && !defined(__aarch64__)
+    return "Mac OS X (32bit)";
+#elif defined(__MACOSX__) && (defined(__X86_64__) || defined(__aarch64__))
+    return "Mac OS X (64bit)";
+#elif defined(__UNIX__) && defined(__aarch64__)
+    return "Unix (aarch64)";
+#elif defined(__UNIX__) && !defined(__X86_64__)
+    return "Unix (32bit)";
+#elif defined(__UNIX__) && defined(__X86_64__)
+    return "Unix (64bit)";
+#else
+    return "Unknown";
+#endif
+  }
+
+  std::string getCompilerName() // name and version of the compiler this file was built with, selected by predefined macros
+  {
+#if defined(__INTEL_COMPILER)
+    int icc_mayor = __INTEL_COMPILER / 100 % 100; // hundreds and thousands digits of the macro value
+    int icc_minor = __INTEL_COMPILER % 100;       // last two decimal digits of the macro value
+    std::string version = "Intel Compiler ";
+    version += toString(icc_mayor);
+    version += "." + toString(icc_minor);
+#if defined(__INTEL_COMPILER_UPDATE)
+    version += "." + toString(__INTEL_COMPILER_UPDATE);
+#endif
+    return version;
+#elif defined(__clang__)
+    return "CLANG " __clang_version__; // adjacent string literals are concatenated at compile time
+#elif defined (__GNUC__)
+    return "GCC " __VERSION__;
+#elif defined(_MSC_VER)
+    std::string version = toString(_MSC_FULL_VER);
+    version.insert(4,".");
+    version.insert(9,".");
+    version.insert(2,"."); // inserted last so the two offsets above are not shifted by it
+    return "Visual C++ Compiler " + version;
+#else
+    return "Unknown Compiler";
+#endif
+  }
+
+  std::string getCPUVendor()
+  {
+    /* query CPUID leaf 0; the vendor text is assembled from result
+     * slots 1, 3 and 2, in that order */
+    int regs[4];
+    __cpuid (regs, 0);
+
+    /* a zero word after the three text words terminates the C string */
+    const int vendor[4] = { regs[1], regs[3], regs[2], 0 };
+    return std::string(reinterpret_cast<const char*>(vendor));
+  }
+
+  CPU getCPUModel() // maps the CPUID family/model signature to a CPU enumerator; CPU::UNKNOWN if nothing matches
+  {
+    if (getCPUVendor() != "GenuineIntel") // the signature table below is only consulted for this vendor string
+      return CPU::UNKNOWN;
+
+    int out[4];
+    __cpuid(out, 0);
+    if (out[0] < 1) return CPU::UNKNOWN; // leaf 0 reports in out[0] how many leaves exist; leaf 1 is required below
+    __cpuid(out, 1);
+
+    /* please see CPUID documentation for these formulas */
+    uint32_t family_ID          = (out[0] >>  8) & 0x0F;
+    uint32_t extended_family_ID = (out[0] >> 20) & 0xFF;
+    
+    uint32_t model_ID           = (out[0] >>  4) & 0x0F;
+    uint32_t extended_model_ID  = (out[0] >> 16) & 0x0F;
+    
+    uint32_t DisplayFamily = family_ID;
+    if (family_ID == 0x0F) // the extended family is only added for family 0x0F
+      DisplayFamily += extended_family_ID;
+    
+    uint32_t DisplayModel = model_ID;
+    if (family_ID == 0x06 || family_ID == 0x0F) // the extended model supplies the upper four model bits for these families
+      DisplayModel += extended_model_ID << 4;
+
+    uint32_t DisplayFamily_DisplayModel = (DisplayFamily << 8) + (DisplayModel << 0); // lookup key: family above bit 8, model in the low 8 bits
+
+    // Data from Intel® 64 and IA-32 Architectures, Volume 4, Chapter 2, Table 2-1 (CPUID Signature Values of DisplayFamily_DisplayModel)
+    if (DisplayFamily_DisplayModel == 0x067D) return CPU::CORE_ICE_LAKE;
+    if (DisplayFamily_DisplayModel == 0x067E) return CPU::CORE_ICE_LAKE;
+    if (DisplayFamily_DisplayModel == 0x068C) return CPU::CORE_TIGER_LAKE;
+    if (DisplayFamily_DisplayModel == 0x06A5) return CPU::CORE_COMET_LAKE;
+    if (DisplayFamily_DisplayModel == 0x06A6) return CPU::CORE_COMET_LAKE;
+    if (DisplayFamily_DisplayModel == 0x0666) return CPU::CORE_CANNON_LAKE;
+    if (DisplayFamily_DisplayModel == 0x068E) return CPU::CORE_KABY_LAKE;
+    if (DisplayFamily_DisplayModel == 0x069E) return CPU::CORE_KABY_LAKE;
+    if (DisplayFamily_DisplayModel == 0x066A) return CPU::XEON_ICE_LAKE;
+    if (DisplayFamily_DisplayModel == 0x066C) return CPU::XEON_ICE_LAKE;
+    if (DisplayFamily_DisplayModel == 0x0655) return CPU::XEON_SKY_LAKE;
+    if (DisplayFamily_DisplayModel == 0x064E) return CPU::CORE_SKY_LAKE;
+    if (DisplayFamily_DisplayModel == 0x065E) return CPU::CORE_SKY_LAKE;
+    if (DisplayFamily_DisplayModel == 0x0656) return CPU::XEON_BROADWELL;
+    if (DisplayFamily_DisplayModel == 0x064F) return CPU::XEON_BROADWELL;
+    if (DisplayFamily_DisplayModel == 0x0647) return CPU::CORE_BROADWELL;
+    if (DisplayFamily_DisplayModel == 0x063D) return CPU::CORE_BROADWELL;
+    if (DisplayFamily_DisplayModel == 0x063F) return CPU::XEON_HASWELL;
+    if (DisplayFamily_DisplayModel == 0x063C) return CPU::CORE_HASWELL;
+    if (DisplayFamily_DisplayModel == 0x0645) return CPU::CORE_HASWELL;
+    if (DisplayFamily_DisplayModel == 0x0646) return CPU::CORE_HASWELL;
+    if (DisplayFamily_DisplayModel == 0x063E) return CPU::XEON_IVY_BRIDGE;
+    if (DisplayFamily_DisplayModel == 0x063A) return CPU::CORE_IVY_BRIDGE;
+    if (DisplayFamily_DisplayModel == 0x062D) return CPU::SANDY_BRIDGE;
+    if (DisplayFamily_DisplayModel == 0x062F) return CPU::SANDY_BRIDGE;
+    if (DisplayFamily_DisplayModel == 0x062A) return CPU::SANDY_BRIDGE;
+    if (DisplayFamily_DisplayModel == 0x062E) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x0625) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x062C) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x061E) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x061F) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x061A) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x061D) return CPU::NEHALEM;
+    if (DisplayFamily_DisplayModel == 0x0617) return CPU::CORE2;
+    if (DisplayFamily_DisplayModel == 0x060F) return CPU::CORE2;
+    if (DisplayFamily_DisplayModel == 0x060E) return CPU::CORE1;
+
+    if (DisplayFamily_DisplayModel == 0x0685) return CPU::XEON_PHI_KNIGHTS_MILL;
+    if (DisplayFamily_DisplayModel == 0x0657) return CPU::XEON_PHI_KNIGHTS_LANDING;
+    
+    return CPU::UNKNOWN;
+  }
+
+  std::string stringOfCPUModel(CPU model) // display name for a CPU enumerator
+  {
+    switch (model) {
+    case CPU::XEON_ICE_LAKE           : return "Xeon Ice Lake";
+    case CPU::CORE_ICE_LAKE           : return "Core Ice Lake";
+    case CPU::CORE_TIGER_LAKE         : return "Core Tiger Lake";
+    case CPU::CORE_COMET_LAKE         : return "Core Comet Lake";
+    case CPU::CORE_CANNON_LAKE        : return "Core Cannon Lake";
+    case CPU::CORE_KABY_LAKE          : return "Core Kaby Lake";
+    case CPU::XEON_SKY_LAKE           : return "Xeon Sky Lake";
+    case CPU::CORE_SKY_LAKE           : return "Core Sky Lake";
+    case CPU::XEON_PHI_KNIGHTS_MILL   : return "Xeon Phi Knights Mill";
+    case CPU::XEON_PHI_KNIGHTS_LANDING: return "Xeon Phi Knights Landing";
+    case CPU::XEON_BROADWELL          : return "Xeon Broadwell";
+    case CPU::CORE_BROADWELL          : return "Core Broadwell";
+    case CPU::XEON_HASWELL            : return "Xeon Haswell";
+    case CPU::CORE_HASWELL            : return "Core Haswell";
+    case CPU::XEON_IVY_BRIDGE         : return "Xeon Ivy Bridge";
+    case CPU::CORE_IVY_BRIDGE         : return "Core Ivy Bridge";
+    case CPU::SANDY_BRIDGE            : return "Sandy Bridge";
+    case CPU::NEHALEM                 : return "Nehalem";
+    case CPU::CORE2                   : return "Core2";
+    case CPU::CORE1                   : return "Core";
+    case CPU::ARM                     : return "Arm";
+    case CPU::UNKNOWN                 : return "Unknown CPU";
+    }
+    return "Unknown CPU (error)"; // reached only for a value that matches none of the cases above
+  }
+
+#if !defined(__ARM_NEON)
+  /* constants to access destination registers of CPUID instruction */
+  static const int EAX = 0;
+  static const int EBX = 1;
+  static const int ECX = 2;
+  static const int EDX = 3;
+
+  /* cpuid[eax=1].ecx */
+  static const int CPU_FEATURE_BIT_SSE3   = 1 << 0;
+  static const int CPU_FEATURE_BIT_SSSE3  = 1 << 9;
+  static const int CPU_FEATURE_BIT_FMA3   = 1 << 12;
+  static const int CPU_FEATURE_BIT_SSE4_1 = 1 << 19;
+  static const int CPU_FEATURE_BIT_SSE4_2 = 1 << 20;
+  //static const int CPU_FEATURE_BIT_MOVBE  = 1 << 22;
+  static const int CPU_FEATURE_BIT_POPCNT = 1 << 23;
+  //static const int CPU_FEATURE_BIT_XSAVE  = 1 << 26;
+  static const int CPU_FEATURE_BIT_OXSAVE = 1 << 27;
+  static const int CPU_FEATURE_BIT_AVX    = 1 << 28;
+  static const int CPU_FEATURE_BIT_F16C   = 1 << 29;
+  static const int CPU_FEATURE_BIT_RDRAND = 1 << 30;
+
+  /* cpuid[eax=1].edx */
+  static const int CPU_FEATURE_BIT_SSE  = 1 << 25;
+  static const int CPU_FEATURE_BIT_SSE2 = 1 << 26;
+
+  /* cpuid[eax=0x80000001].ecx */
+  static const int CPU_FEATURE_BIT_LZCNT = 1 << 5;
+
+  /* cpuid[eax=7,ecx=0].ebx */
+  static const int CPU_FEATURE_BIT_BMI1    = 1 << 3;
+  static const int CPU_FEATURE_BIT_AVX2    = 1 << 5;
+  static const int CPU_FEATURE_BIT_BMI2    = 1 << 8;
+  static const int CPU_FEATURE_BIT_AVX512F = 1 << 16;     // AVX512F  (foundation)
+  static const int CPU_FEATURE_BIT_AVX512DQ = 1 << 17;    // AVX512DQ (doubleword and quadword instructions)
+  static const int CPU_FEATURE_BIT_AVX512PF = 1 << 26;    // AVX512PF (prefetch gather/scatter instructions)
+  static const int CPU_FEATURE_BIT_AVX512ER = 1 << 27;    // AVX512ER (exponential and reciprocal instructions)
+  static const int CPU_FEATURE_BIT_AVX512CD = 1 << 28;    // AVX512CD (conflict detection instructions)
+  static const int CPU_FEATURE_BIT_AVX512BW = 1 << 30;    // AVX512BW (byte and word instructions)
+  static const int CPU_FEATURE_BIT_AVX512VL = 1 << 31;    // AVX512VL (vector length extensions)
+  static const int CPU_FEATURE_BIT_AVX512IFMA = 1 << 21;  // AVX512IFMA (integer fused multiple-add instructions)
+
+  /* cpuid[eax=7,ecx=0].ecx */
+  static const int CPU_FEATURE_BIT_AVX512VBMI = 1 << 1;   // AVX512VBMI (vector bit manipulation instructions)
+#endif
+
+#if !defined(__ARM_NEON)
+  __noinline int64_t get_xcr0() // reads the extended control register selected by index 0 via xgetbv
+  {
+    // https://github.com/opencv/opencv/blob/master/modules/core/src/system.cpp#L466
+#if defined (__WIN32__) && defined(_XCR_XFEATURE_ENABLED_MASK)
+    int64_t xcr0 = 0; // int64_t is workaround for compiler bug under VS2013, Win32
+    xcr0 = _xgetbv(0);
+    return xcr0;
+#else
+    int xcr0 = 0;
+    __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); // ecx = 0 is the input; only eax (the low 32 bits) is kept, edx is declared clobbered
+    return xcr0;
+#endif
+  }
+#endif
+
+  int getCPUFeatures() // returns a bit mask of CPU_FEATURE_* flags for the current CPU
+  {
+#if defined(__ARM_NEON)
+      int cpu_features = CPU_FEATURE_NEON|CPU_FEATURE_SSE|CPU_FEATURE_SSE2; // NEON builds: the mask is fixed at compile time, nothing is probed at run time
+#if defined(NEON_AVX2_EMULATION)
+      cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE42;
+      cpu_features |= CPU_FEATURE_XMM_ENABLED;
+      cpu_features |= CPU_FEATURE_YMM_ENABLED;
+      cpu_features |= CPU_FEATURE_SSE41 | CPU_FEATURE_RDRAND | CPU_FEATURE_F16C;
+      cpu_features |= CPU_FEATURE_POPCNT;
+      cpu_features |= CPU_FEATURE_AVX;
+      cpu_features |= CPU_FEATURE_AVX2;
+      cpu_features |= CPU_FEATURE_FMA3;
+      cpu_features |= CPU_FEATURE_LZCNT;
+      cpu_features |= CPU_FEATURE_BMI1;
+      cpu_features |= CPU_FEATURE_BMI2;
+      cpu_features |= CPU_FEATURE_NEON_2X;
+
+
+ 
+#endif
+     return cpu_features;
+      
+#else
+    /* cache CPU features access */
+    static int cpu_features = 0;
+    if (cpu_features) // a result of 0 is never treated as cached, detection then runs again
+      return cpu_features;
+
+    /* get number of CPUID leaves */
+    int cpuid_leaf0[4];
+    __cpuid(cpuid_leaf0, 0x00000000);
+    unsigned nIds = cpuid_leaf0[EAX];
+
+    /* get number of extended CPUID leaves */
+    int cpuid_leafe[4];
+    __cpuid(cpuid_leafe, 0x80000000);
+    unsigned nExIds = cpuid_leafe[EAX];
+
+    /* get CPUID leaves for EAX = 1,7, and 0x80000001 */
+    int cpuid_leaf_1[4] = { 0,0,0,0 };
+    int cpuid_leaf_7[4] = { 0,0,0,0 };
+    int cpuid_leaf_e1[4] = { 0,0,0,0 };
+    if (nIds >= 1) __cpuid (cpuid_leaf_1,0x00000001); // leaves that are not available keep their all-zero defaults
+#if _WIN32
+#if _MSC_VER && (_MSC_FULL_VER < 160040219)
+#else
+    if (nIds >= 7) __cpuidex(cpuid_leaf_7,0x00000007,0); // skipped on the older MSVC versions excluded above, leaving leaf 7 all zero
+#endif
+#else
+    if (nIds >= 7) __cpuid_count(cpuid_leaf_7,0x00000007,0);
+#endif
+    if (nExIds >= 0x80000001) __cpuid(cpuid_leaf_e1,0x80000001);
+
+    /* detect if OS saves XMM, YMM, and ZMM states */
+    bool xmm_enabled = true; // stays true when the OXSAVE bit is not set and XCR0 is therefore not read
+    bool ymm_enabled = false;
+    bool zmm_enabled = false;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_OXSAVE) {
+      int64_t xcr0 = get_xcr0();
+      xmm_enabled = ((xcr0 & 0x02) == 0x02);                /* checks if xmm are enabled in XCR0 */
+      ymm_enabled = xmm_enabled && ((xcr0 & 0x04) == 0x04); /* checks if ymm state are enabled in XCR0 */
+      zmm_enabled = ymm_enabled && ((xcr0 & 0xE0) == 0xE0); /* checks if OPMASK state, upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled in XCR0 */
+    }
+    if (xmm_enabled) cpu_features |= CPU_FEATURE_XMM_ENABLED;
+    if (ymm_enabled) cpu_features |= CPU_FEATURE_YMM_ENABLED;
+    if (zmm_enabled) cpu_features |= CPU_FEATURE_ZMM_ENABLED;
+
+    if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE   ) cpu_features |= CPU_FEATURE_SSE; // from here on: translate raw CPUID bits into CPU_FEATURE_* flags
+    if (cpuid_leaf_1[EDX] & CPU_FEATURE_BIT_SSE2  ) cpu_features |= CPU_FEATURE_SSE2;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE3  ) cpu_features |= CPU_FEATURE_SSE3;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSSE3 ) cpu_features |= CPU_FEATURE_SSSE3;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_1) cpu_features |= CPU_FEATURE_SSE41;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_SSE4_2) cpu_features |= CPU_FEATURE_SSE42;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_POPCNT) cpu_features |= CPU_FEATURE_POPCNT;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_AVX   ) cpu_features |= CPU_FEATURE_AVX;
+
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_F16C  ) cpu_features |= CPU_FEATURE_F16C;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_RDRAND) cpu_features |= CPU_FEATURE_RDRAND;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX2  ) cpu_features |= CPU_FEATURE_AVX2;
+    if (cpuid_leaf_1[ECX] & CPU_FEATURE_BIT_FMA3  ) cpu_features |= CPU_FEATURE_FMA3;
+    if (cpuid_leaf_e1[ECX] & CPU_FEATURE_BIT_LZCNT) cpu_features |= CPU_FEATURE_LZCNT;
+    if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI1 ) cpu_features |= CPU_FEATURE_BMI1;
+    if (cpuid_leaf_7 [EBX] & CPU_FEATURE_BIT_BMI2 ) cpu_features |= CPU_FEATURE_BMI2;
+
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512F   ) cpu_features |= CPU_FEATURE_AVX512F;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512DQ  ) cpu_features |= CPU_FEATURE_AVX512DQ;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512PF  ) cpu_features |= CPU_FEATURE_AVX512PF;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512ER  ) cpu_features |= CPU_FEATURE_AVX512ER;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512CD  ) cpu_features |= CPU_FEATURE_AVX512CD;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512BW  ) cpu_features |= CPU_FEATURE_AVX512BW;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512IFMA) cpu_features |= CPU_FEATURE_AVX512IFMA;
+    if (cpuid_leaf_7[EBX] & CPU_FEATURE_BIT_AVX512VL  ) cpu_features |= CPU_FEATURE_AVX512VL;
+    if (cpuid_leaf_7[ECX] & CPU_FEATURE_BIT_AVX512VBMI) cpu_features |= CPU_FEATURE_AVX512VBMI;
+
+    return cpu_features;
+#endif
+  }
+
+  std::string stringOfCPUFeatures(int features) // lists the names of all CPU_FEATURE_* bits set in features
+  {
+    std::string str; // every appended name ends with a space, so a non-empty result has a trailing space
+    if (features & CPU_FEATURE_XMM_ENABLED) str += "XMM ";
+    if (features & CPU_FEATURE_YMM_ENABLED) str += "YMM ";
+    if (features & CPU_FEATURE_ZMM_ENABLED) str += "ZMM ";
+    if (features & CPU_FEATURE_SSE   ) str += "SSE ";
+    if (features & CPU_FEATURE_SSE2  ) str += "SSE2 ";
+    if (features & CPU_FEATURE_SSE3  ) str += "SSE3 ";
+    if (features & CPU_FEATURE_SSSE3 ) str += "SSSE3 ";
+    if (features & CPU_FEATURE_SSE41 ) str += "SSE4.1 ";
+    if (features & CPU_FEATURE_SSE42 ) str += "SSE4.2 ";
+    if (features & CPU_FEATURE_POPCNT) str += "POPCNT ";
+    if (features & CPU_FEATURE_AVX   ) str += "AVX ";
+    if (features & CPU_FEATURE_F16C  ) str += "F16C ";
+    if (features & CPU_FEATURE_RDRAND) str += "RDRAND ";
+    if (features & CPU_FEATURE_AVX2  ) str += "AVX2 ";
+    if (features & CPU_FEATURE_FMA3  ) str += "FMA3 ";
+    if (features & CPU_FEATURE_LZCNT ) str += "LZCNT ";
+    if (features & CPU_FEATURE_BMI1  ) str += "BMI1 ";
+    if (features & CPU_FEATURE_BMI2  ) str += "BMI2 ";
+    if (features & CPU_FEATURE_AVX512F) str += "AVX512F ";
+    if (features & CPU_FEATURE_AVX512DQ) str += "AVX512DQ ";
+    if (features & CPU_FEATURE_AVX512PF) str += "AVX512PF ";
+    if (features & CPU_FEATURE_AVX512ER) str += "AVX512ER ";
+    if (features & CPU_FEATURE_AVX512CD) str += "AVX512CD ";
+    if (features & CPU_FEATURE_AVX512BW) str += "AVX512BW ";
+    if (features & CPU_FEATURE_AVX512VL) str += "AVX512VL ";
+    if (features & CPU_FEATURE_AVX512IFMA) str += "AVX512IFMA ";
+    if (features & CPU_FEATURE_AVX512VBMI) str += "AVX512VBMI ";
+    if (features & CPU_FEATURE_NEON) str += "NEON ";
+    if (features & CPU_FEATURE_NEON_2X) str += "2xNEON ";
+    return str;
+  }
+
+  std::string stringOfISA (int isa)
+  {
+    /* Returns the name of the ISA whose bitvector equals 'isa' exactly, or "UNKNOWN" otherwise.
+       AVXI is part of the table so that every target reported by supportedTargetList has a name. */
+    static const struct { int mask; const char* name; } names[] = {
+      { SSE,   "SSE"    }, { SSE2,  "SSE2"   }, { SSE3, "SSE3" }, { SSSE3, "SSSE3" },
+      { SSE41, "SSE4.1" }, { SSE42, "SSE4.2" },
+      { AVX,   "AVX"    }, { AVXI,  "AVXI"   }, { AVX2, "AVX2" },
+      { AVX512KNL, "AVX512KNL" }, { AVX512SKX, "AVX512SKX" },
+      { NEON, "NEON" }, { NEON_2X, "2xNEON" },
+    };
+    for (const auto& entry : names)
+      if (isa == entry.mask) return entry.name;
+
+    return "UNKNOWN";
+  }
+
+  bool hasISA(int features, int isa) { // true iff no bit required by 'isa' is missing from 'features'
+    return (isa & ~features) == 0;
+  }
+
+  std::string supportedTargetList (int features)
+  {
+    /* ISAs in the order they are reported; a target is listed only when hasISA()
+       finds all of its required bits in 'features'. Labels carry their trailing blank. */
+    static const struct { int mask; const char* label; } targets[] = {
+      { SSE,   "SSE "    }, { SSE2,  "SSE2 "   }, { SSE3, "SSE3 " }, { SSSE3, "SSSE3 " },
+      { SSE41, "SSE4.1 " }, { SSE42, "SSE4.2 " },
+      { AVX,   "AVX "    }, { AVXI,  "AVXI "   }, { AVX2, "AVX2 " },
+      { AVX512KNL, "AVX512KNL " }, { AVX512SKX, "AVX512SKX " },
+      { NEON, "NEON " }, { NEON_2X, "2xNEON " },
+    };
+
+    std::string list;
+    for (const auto& target : targets)
+      if (hasISA(features,target.mask)) list += target.label;
+
+    return list;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <psapi.h>
+
+namespace embree
+{
+  std::string getExecutableFileName() {
+    char filename[1024]; // the ANSI variant is named explicitly because the buffer is 'char' even in UNICODE builds
+    if (!GetModuleFileNameA(nullptr, filename, sizeof(filename))) return std::string(); // the query failed
+    filename[sizeof(filename)-1] = 0; // a truncated path is not guaranteed to be terminated on every Windows version
+    return std::string(filename);
+  }
+
+  unsigned int getNumberOfLogicalThreads()
+  { // logical processors summed over all processor groups; GetSystemInfo is used when the group API is missing
+    static int nThreads = -1; // cached result, -1 means "not computed yet"
+    if (nThreads != -1) return nThreads;
+
+    typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)();
+    typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD);
+    HMODULE hlib = GetModuleHandleA("Kernel32"); // takes no module reference, so nothing has to be released
+    GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = hlib ? (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount") : nullptr;
+    GetActiveProcessorCountFunc      pGetActiveProcessorCount      = hlib ? (GetActiveProcessorCountFunc)     GetProcAddress(hlib, "GetActiveProcessorCount")      : nullptr;
+
+    if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount)
+    {
+      int groups = pGetActiveProcessorGroupCount();
+      int totalProcessors = 0;
+      for (int i = 0; i < groups; i++)
+        totalProcessors += pGetActiveProcessorCount((WORD)i);
+      nThreads = totalProcessors;
+    }
+    else
+    {
+      SYSTEM_INFO sysinfo;
+      GetSystemInfo(&sysinfo);
+      nThreads = sysinfo.dwNumberOfProcessors;
+    }
+    assert(nThreads);
+    return nThreads;
+  }
+
+  int getTerminalWidth()
+  {
+    /* console buffer width in characters; 80 is assumed when stdout is not a console (e.g. redirected) */
+    HANDLE handle = GetStdHandle(STD_OUTPUT_HANDLE);
+    if (handle == INVALID_HANDLE_VALUE || handle == nullptr) return 80;
+    CONSOLE_SCREEN_BUFFER_INFO info;
+    if (!GetConsoleScreenBufferInfo(handle, &info) || info.dwSize.X <= 0) return 80; // query failed or reported no width
+    return info.dwSize.X;
+  }
+
+  double getSeconds()
+  {
+    LARGE_INTEGER ticksPerSecond, ticks; // performance counter frequency and current counter value
+    QueryPerformanceFrequency(&ticksPerSecond);
+    QueryPerformanceCounter(&ticks);
+    return double(ticks.QuadPart) / double(ticksPerSecond.QuadPart);
+  }
+
+  void sleepSeconds(double t) {
+    Sleep(static_cast<DWORD>(1000.0*t)); // Sleep() takes milliseconds; the cast drops the fraction
+  }
+
+  size_t getVirtualMemoryBytes()
+  {
+    PROCESS_MEMORY_COUNTERS info; // NOTE(review): QuotaPeakPagedPoolUsage looks like an odd source for "virtual memory" - confirm
+    if (!GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) )) return 0; // 'info' holds nothing usable on failure
+    return (size_t)info.QuotaPeakPagedPoolUsage;
+  }
+
+  size_t getResidentMemoryBytes()
+  {
+    PROCESS_MEMORY_COUNTERS info; // only WorkingSetSize (the current working set in bytes) is reported
+    if (!GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) )) return 0; // 'info' holds nothing usable on failure
+    return (size_t)info.WorkingSetSize;
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Linux Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__LINUX__)
+
+#include <stdio.h>
+#include <unistd.h>
+
+namespace embree
+{
+  std::string getExecutableFileName()
+  {
+    /* resolve the /proc/<pid>/exe symlink; the buffer starts zeroed and one byte is
+       held back from readlink(), so the result is always terminated */
+    char path[4096] = { 0 };
+    const std::string link = "/proc/" + toString(getpid()) + "/exe";
+    const ssize_t len = readlink(link.c_str(), path, sizeof(path)-1);
+    return (len == -1) ? std::string() : std::string(path);
+  }
+
+  size_t getVirtualMemoryBytes()
+  {
+    size_t virt = 0; // first statm field, scaled by the page size below; stays 0 when the file cannot be read
+    std::ifstream buffer("/proc/self/statm");
+    buffer >> virt;
+    return virt*sysconf(_SC_PAGE_SIZE);
+  }
+
+  size_t getResidentMemoryBytes()
+  {
+    size_t virt = 0, resident = 0; // first two statm fields; both stay 0 when the file cannot be read
+    std::ifstream buffer("/proc/self/statm");
+    buffer >> virt >> resident;
+    return resident*sysconf(_SC_PAGE_SIZE);
+  }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// FreeBSD Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined (__FreeBSD__)
+
+#include <sys/sysctl.h>
+
+namespace embree
+{
+  std::string getExecutableFileName()
+  {
+    /* ask the kern.proc.pathname sysctl for the path; a failed query yields an empty string */
+    char path[4096] = { 0 };
+    size_t length = sizeof(path)-1;
+    const int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 };
+    const bool failed = sysctl(mib, 4, path, &length, 0x0, 0) == -1;
+    return failed ? std::string() : std::string(path);
+  }
+
+  /* memory statistics are not implemented in this section, both queries report 0 */
+  size_t getVirtualMemoryBytes() {
+    return 0;
+  }
+
+  size_t getResidentMemoryBytes() {
+    return 0;
+  }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Mac OS X Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__MACOSX__)
+
+#include <mach-o/dyld.h>
+
+namespace embree
+{
+  std::string getExecutableFileName()
+  {
+    /* a non-zero result of _NSGetExecutablePath is reported as an empty string */
+    char path[4096];
+    uint32_t capacity = sizeof(path);
+    const bool ok = _NSGetExecutablePath(path, &capacity) == 0;
+    return ok ? std::string(path) : std::string();
+  }
+
+  size_t getVirtualMemoryBytes() {
+    return 0; // not implemented in this section
+  }
+
+  size_t getResidentMemoryBytes() {
+    return 0; // not implemented in this section
+  }
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+namespace embree
+{
+  unsigned int getNumberOfLogicalThreads()
+  { // number of logical threads available to the process, always at least 1
+    static int nThreads = -1; // cached result, -1 means "not computed yet"
+    if (nThreads != -1) return nThreads;
+
+    int count = -1;
+#if !defined(__MACOSX__) && !defined(__ANDROID__)
+    cpu_set_t set; // preferred source: size of the affinity mask (sysconf does not work in Linux LXC container)
+    if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
+      count = CPU_COUNT(&set);
+#endif
+    if (count <= 0) count = (int)sysconf(_SC_NPROCESSORS_ONLN); // only source on MacOSX/Android, fallback elsewhere
+    if (count <= 0) count = 1; // never hand out -1 (huge once converted to unsigned) or 0 threads
+
+    nThreads = count;
+    return nThreads;
+  }
+
+  int getTerminalWidth()
+  {
+    struct winsize ws; // filled by the TIOCGWINSZ query on stdout; 80 columns are assumed when the query fails
+    const bool failed = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws) < 0;
+    return failed ? 80 : ws.ws_col;
+  }
+
+  double getSeconds() { // gettimeofday() result in seconds; tv_usec contributes the sub-second part
+    struct timeval now; gettimeofday(&now,nullptr);
+    return double(now.tv_usec)/1E6 + double(now.tv_sec);
+  }
+
+  void sleepSeconds(double t) {
+    usleep(t*1000000.0); // usleep() takes microseconds
+  }
+}
+#endif
+
diff --git a/thirdparty/embree-aarch64/common/sys/sysinfo.h b/thirdparty/embree-aarch64/common/sys/sysinfo.h
new file mode 100644
index 0000000000..8e313a59b3
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/sysinfo.h
@@ -0,0 +1,192 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define CACHELINE_SIZE 64
+
+#if !defined(PAGE_SIZE)
+  #define PAGE_SIZE 4096
+#endif
+
+#define PAGE_SIZE_2M (2*1024*1024)
+#define PAGE_SIZE_4K (4*1024)
+
+#include "platform.h"
+
+/* define isa namespace and ISA bitvector */
+#if defined (__AVX512VL__)
+#  define isa avx512skx
+#  define ISA AVX512SKX
+#  define ISA_STR "AVX512SKX"
+#elif defined (__AVX512F__)
+#  define isa avx512knl
+#  define ISA AVX512KNL
+#  define ISA_STR "AVX512KNL"
+#elif defined (__AVX2__)
+#  define isa avx2
+#  define ISA AVX2
+#  define ISA_STR "AVX2"
+#elif defined(__AVXI__)
+#  define isa avxi
+#  define ISA AVXI
+#  define ISA_STR "AVXI"
+#elif defined(__AVX__)
+#  define isa avx
+#  define ISA AVX
+#  define ISA_STR "AVX"
+#elif defined (__SSE4_2__)
+#  define isa sse42
+#  define ISA SSE42
+#  define ISA_STR "SSE4.2"
+//#elif defined (__SSE4_1__) //  we demote this to SSE2, MacOSX code compiles with SSE41 by default with XCode 11
+//#  define isa sse41
+//#  define ISA SSE41
+//#  define ISA_STR "SSE4.1"
+//#elif defined(__SSSE3__) // we demote this to SSE2, MacOSX code compiles with SSSE3 by default with ICC
+//#  define isa ssse3
+//#  define ISA SSSE3
+//#  define ISA_STR "SSSE3"
+//#elif defined(__SSE3__) // we demote this to SSE2, MacOSX code compiles with SSE3 by default with clang
+//#  define isa sse3
+//#  define ISA SSE3
+//#  define ISA_STR "SSE3"
+#elif defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__)
+#  define isa sse2
+#  define ISA SSE2
+#  define ISA_STR "SSE2"
+#elif defined(__SSE__)
+#  define isa sse
+#  define ISA SSE
+#  define ISA_STR "SSE"
+#elif defined(__ARM_NEON)
+// NOTE(LTE): NEON builds reuse the sse2 name for `isa` for compatibility at the moment.
+#define isa sse2
+#define ISA NEON
+#define ISA_STR "NEON"
+#else
+#error Unknown ISA
+#endif
+
+namespace embree
+{
+  enum class CPU // CPU models as returned by getCPUModel() and converted to text by stringOfCPUModel()
+  {
+    XEON_ICE_LAKE,
+    CORE_ICE_LAKE,
+    CORE_TIGER_LAKE,
+    CORE_COMET_LAKE,
+    CORE_CANNON_LAKE,
+    CORE_KABY_LAKE,
+    XEON_SKY_LAKE,
+    CORE_SKY_LAKE,
+    XEON_PHI_KNIGHTS_MILL,
+    XEON_PHI_KNIGHTS_LANDING,
+    XEON_BROADWELL,
+    CORE_BROADWELL,
+    XEON_HASWELL,
+    CORE_HASWELL,
+    XEON_IVY_BRIDGE,
+    CORE_IVY_BRIDGE,
+    SANDY_BRIDGE,
+    NEHALEM,
+    CORE2,
+    CORE1,
+    ARM,
+    UNKNOWN,
+  };
+  
+  /*! get the full path to the running executable */
+  std::string getExecutableFileName();
+
+  /*! return platform name */
+  std::string getPlatformName();
+
+  /*! get the full name of the compiler */
+  std::string getCompilerName();
+
+  /*! return the name of the CPU */
+  std::string getCPUVendor();
+
+  /*! get microprocessor model */
+  CPU getCPUModel(); 
+
+  /*! converts CPU model into string */
+  std::string stringOfCPUModel(CPU model);
+
+  /*! CPU features: one bit per feature, OR-ed together into the bitvector returned by getCPUFeatures() */
+  static const int CPU_FEATURE_SSE    = 1 << 0;
+  static const int CPU_FEATURE_SSE2   = 1 << 1;
+  static const int CPU_FEATURE_SSE3   = 1 << 2;
+  static const int CPU_FEATURE_SSSE3  = 1 << 3;
+  static const int CPU_FEATURE_SSE41  = 1 << 4;
+  static const int CPU_FEATURE_SSE42  = 1 << 5;
+  static const int CPU_FEATURE_POPCNT = 1 << 6;
+  static const int CPU_FEATURE_AVX    = 1 << 7;
+  static const int CPU_FEATURE_F16C   = 1 << 8;
+  static const int CPU_FEATURE_RDRAND = 1 << 9;
+  static const int CPU_FEATURE_AVX2   = 1 << 10;
+  static const int CPU_FEATURE_FMA3   = 1 << 11;
+  static const int CPU_FEATURE_LZCNT  = 1 << 12;
+  static const int CPU_FEATURE_BMI1   = 1 << 13;
+  static const int CPU_FEATURE_BMI2   = 1 << 14;
+  static const int CPU_FEATURE_AVX512F = 1 << 16; // bit 15 is not assigned to any feature
+  static const int CPU_FEATURE_AVX512DQ = 1 << 17;
+  static const int CPU_FEATURE_AVX512PF = 1 << 18;
+  static const int CPU_FEATURE_AVX512ER = 1 << 19;
+  static const int CPU_FEATURE_AVX512CD = 1 << 20;
+  static const int CPU_FEATURE_AVX512BW = 1 << 21;
+  static const int CPU_FEATURE_AVX512VL = 1 << 22;
+  static const int CPU_FEATURE_AVX512IFMA = 1 << 23;
+  static const int CPU_FEATURE_AVX512VBMI = 1 << 24;
+  static const int CPU_FEATURE_XMM_ENABLED = 1 << 25;
+  static const int CPU_FEATURE_YMM_ENABLED = 1 << 26;
+  static const int CPU_FEATURE_ZMM_ENABLED = 1 << 27;
+  static const int CPU_FEATURE_NEON = 1 << 28;
+  static const int CPU_FEATURE_NEON_2X = 1 << 29;
+
+  /*! get CPU features */
+  int getCPUFeatures();
+
+  /*! convert CPU features into a string */
+  std::string stringOfCPUFeatures(int features);
+
+  /*! creates a blank-separated string of all targets (ISAs) supported by the given CPU feature bitvector */
+  std::string supportedTargetList (int isa);
+
+  /*! ISAs: each ISA is the set of feature bits it requires, most of them extending a previous ISA; an ISA is available iff (features & isa) == isa */
+  static const int SSE    = CPU_FEATURE_SSE | CPU_FEATURE_XMM_ENABLED;
+  static const int SSE2   = SSE | CPU_FEATURE_SSE2;
+  static const int SSE3   = SSE2 | CPU_FEATURE_SSE3;
+  static const int SSSE3  = SSE3 | CPU_FEATURE_SSSE3;
+  static const int SSE41  = SSSE3 | CPU_FEATURE_SSE41;
+  static const int SSE42  = SSE41 | CPU_FEATURE_SSE42 | CPU_FEATURE_POPCNT;
+  static const int AVX    = SSE42 | CPU_FEATURE_AVX | CPU_FEATURE_YMM_ENABLED;
+  static const int AVXI   = AVX | CPU_FEATURE_F16C | CPU_FEATURE_RDRAND;
+  static const int AVX2   = AVXI | CPU_FEATURE_AVX2 | CPU_FEATURE_FMA3 | CPU_FEATURE_BMI1 | CPU_FEATURE_BMI2 | CPU_FEATURE_LZCNT;
+  static const int AVX512KNL = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512PF | CPU_FEATURE_AVX512ER | CPU_FEATURE_AVX512CD | CPU_FEATURE_ZMM_ENABLED;
+  static const int AVX512SKX = AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512DQ | CPU_FEATURE_AVX512CD | CPU_FEATURE_AVX512BW | CPU_FEATURE_AVX512VL | CPU_FEATURE_ZMM_ENABLED;
+  static const int NEON = CPU_FEATURE_NEON | CPU_FEATURE_SSE | CPU_FEATURE_SSE2; // carries the SSE and SSE2 bits, but not XMM_ENABLED
+  static const int NEON_2X = CPU_FEATURE_NEON_2X | AVX2; // carries every feature bit of AVX2
+
+  /*! converts ISA bitvector into a string */
+  std::string stringOfISA(int features);
+
+  /*! return the number of logical threads of the system */
+  unsigned int getNumberOfLogicalThreads();
+
+  /*! returns the size of the terminal window in characters */
+  int getTerminalWidth();
+
+  /*! returns performance counter in seconds */
+  double getSeconds();
+
+  /*! sleeps the specified number of seconds */
+  void sleepSeconds(double t);
+
+  /*! returns virtual address space occupied by process */
+  size_t getVirtualMemoryBytes();
+
+  /*! returns resident memory required by process */
+  size_t getResidentMemoryBytes();
+}
diff --git a/thirdparty/embree-aarch64/common/sys/thread.cpp b/thirdparty/embree-aarch64/common/sys/thread.cpp
new file mode 100644
index 0000000000..f9ea5b7d96
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/thread.cpp
@@ -0,0 +1,429 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "thread.h"
+#include "sysinfo.h"
+#include "string.h"
+
+#include <iostream>
+#if defined(__ARM_NEON)
+#include "../math/SSE2NEON.h"
+#else
+#include <xmmintrin.h>
+#endif
+
+#if defined(PTHREADS_WIN32)
+#pragma comment (lib, "pthreadVC.lib")
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace embree
+{
+  /*! set the affinity of a given thread; 'affinity' is a processor index counted across all processor groups */
+  void setAffinity(HANDLE thread, ssize_t affinity)
+  {
+    typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)();
+    typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD);
+    typedef BOOL (WINAPI *SetThreadGroupAffinityFunc)(HANDLE, const GROUP_AFFINITY *, PGROUP_AFFINITY);
+    typedef BOOL (WINAPI *SetThreadIdealProcessorExFunc)(HANDLE, PPROCESSOR_NUMBER, PPROCESSOR_NUMBER);
+    HMODULE hlib = GetModuleHandleA("Kernel32"); // takes no module reference per call; a null handle selects the fallback path below
+    GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = hlib ? (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount") : nullptr;
+    GetActiveProcessorCountFunc pGetActiveProcessorCount = hlib ? (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount") : nullptr;
+    SetThreadGroupAffinityFunc pSetThreadGroupAffinity = hlib ? (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity") : nullptr;
+    SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = hlib ? (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx") : nullptr;
+    if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx)
+    {
+      int groups = pGetActiveProcessorGroupCount();
+      int totalProcessors = 0, group = 0, number = 0; // group/number stay 0 when 'affinity' is not below the processor count
+      for (int i = 0; i<groups; i++) {
+        int processors = pGetActiveProcessorCount(i);
+        if (totalProcessors + processors > affinity) { // the requested processor lies in group i
+          group = i;
+          number = (int)affinity - totalProcessors;
+          break;
+        }
+        totalProcessors += processors;
+      }
+
+      GROUP_AFFINITY groupAffinity;
+      groupAffinity.Group = (WORD)group;
+      groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number);
+      groupAffinity.Reserved[0] = 0;
+      groupAffinity.Reserved[1] = 0;
+      groupAffinity.Reserved[2] = 0;
+      if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr))
+        WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning
+
+      PROCESSOR_NUMBER processorNumber;
+      processorNumber.Group = group;
+      processorNumber.Number = number;
+      processorNumber.Reserved = 0;
+      if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr))
+        WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning
+    }
+    else
+    { // processor-group API not available: 'affinity' is used directly as a bit index and processor number
+      if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity)))
+        WARNING("SetThreadAffinityMask failed"); // on purpose only a warning
+      if (SetThreadIdealProcessor(thread, (DWORD)affinity) == (DWORD)-1)
+        WARNING("SetThreadIdealProcessor failed"); // on purpose only a warning
+    }
+  }
+
+  /*! pins the calling thread: forwards to the HANDLE overload with the handle from GetCurrentThread() */
+  void setAffinity(ssize_t affinity) {
+    return setAffinity(GetCurrentThread(), affinity);
+  }
+
+  /* function/argument pair that createThread hands to threadStartup through the CreateThread parameter */
+  struct ThreadStartupData
+  {
+    ThreadStartupData (thread_func f, void* arg)
+      : f(f), arg(arg)
+    {}
+    thread_func f; // function the new thread executes
+    void* arg;     // argument handed to 'f'
+  };
+
+  DWORD WINAPI threadStartup(LPVOID ptr)
+  {
+    const int ftz = 1<<15, daz = 1<<6; // the bits marked FTZ and DAZ in the control register written via _mm_setcsr
+    _mm_setcsr(_mm_getcsr() | ftz | daz);
+    ThreadStartupData* data = static_cast<ThreadStartupData*>(ptr);
+    data->f(data->arg);
+    delete data; // allocated with new by createThread, released here once 'f' has returned
+    return 0;
+  }
+
+#if !defined(PTHREADS_WIN32)
+
+  /*! creates a hardware thread running on specific core (threadID < 0: no affinity is set) */
+  thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID) {
+    ThreadStartupData* startup = new ThreadStartupData(f,arg); // released by threadStartup once 'f' has returned
+    HANDLE thread = CreateThread(nullptr, stack_size, threadStartup, startup, 0, nullptr);
+    if (thread == nullptr) { delete startup; FATAL("CreateThread failed"); } // no thread exists that could release it
+    if (threadID >= 0) setAffinity(thread, threadID);
+    return thread_t(thread);
+  }
+
+  /*! the thread calling this function gets yielded (through SwitchToThread) */
+  void yield() { SwitchToThread(); }
+
+  /*! blocks until the given thread has terminated, then closes its handle */
+  void join(thread_t tid)
+  {
+    const HANDLE handle = HANDLE(tid);
+    WaitForSingleObject(handle, INFINITE);
+    CloseHandle(handle);
+  }
+
+  /*! allocates a TLS index and wraps it into the opaque tls_t handle */
+  tls_t createTls()
+  {
+    const DWORD index = TlsAlloc();
+    return reinterpret_cast<tls_t>(size_t(index));
+  }
+
+  /*! stores 'ptr' under the TLS index wrapped by 'tls' */
+  void setTls(tls_t tls, void* const ptr) {
+    TlsSetValue(static_cast<DWORD>(reinterpret_cast<size_t>(tls)), ptr);
+  }
+
+  /*! returns the pointer stored under the TLS index wrapped by 'tls' */
+  void* getTls(tls_t tls) {
+    return TlsGetValue(static_cast<DWORD>(reinterpret_cast<size_t>(tls)));
+  }
+
+  /*! releases the TLS index wrapped by 'tls' */
+  void destroyTls(tls_t tls) { TlsFree(static_cast<DWORD>(reinterpret_cast<size_t>(tls))); }
+#endif
+}
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Linux Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__LINUX__)
+
+#include <fstream>
+#include <sstream>
+#include <algorithm>
+
+#if defined(__ANDROID__)
+#include <pthread.h>
+#endif
+
+namespace embree
+{
+  static MutexSys mutex;
+  static std::vector<size_t> threadIDs;
+
+#if !defined(__ANDROID__) // TODO(LTE): Implement for Android target
+  /* changes thread ID mapping such that we first fill up all thread on one core */
+  size_t mapThreadID(size_t threadID)
+  {
+    Lock<MutexSys> lock(mutex); // guards the lazily built 'threadIDs' table
+
+    if (threadIDs.size() == 0)
+    {
+      /* parse thread/CPU topology: the siblings of cpu0, cpu1, ... are appended in the order they
+         are first seen, so the hardware threads of one core become neighbours in 'threadIDs' */
+      for (size_t cpuID=0;;cpuID++)
+      {
+        std::fstream fs;
+        std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string((long long)cpuID) + std::string("/topology/thread_siblings_list");
+        fs.open (cpu.c_str(), std::fstream::in);
+        if (fs.fail()) break;
+
+        /* the file is a comma separated list whose entries are either a single id "N" or an inclusive
+           range "A-B"; ranges are expanded, otherwise "-B" would be read as a negative id */
+        long first;
+        while (fs >> first)
+        {
+          long last = first;
+          if (fs.peek() == '-') { fs.ignore(); if (!(fs >> last)) break; }
+          for (long id=first; id>=0 && id<=last; id++) {
+            if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (size_t known) { return known == size_t(id); }))
+              threadIDs.push_back(size_t(id));
+          }
+          if (fs.peek() == ',')
+            fs.ignore();
+        }
+        fs.close();
+      }
+
+      /* verify the mapping and do not use it if the mapping has errors */
+      for (size_t i=0;i<threadIDs.size();i++) {
+        for (size_t j=0;j<threadIDs.size();j++) {
+          if (i != j && threadIDs[i] == threadIDs[j]) {
+            threadIDs.clear();
+          }
+        }
+      }
+    }
+
+    /* re-map threadIDs if mapping is available */
+    size_t ID = threadID;
+    if (threadID < threadIDs.size())
+      ID = threadIDs[threadID];
+
+    /* find correct thread to affinitize to: the ID-th CPU contained in the calling thread's affinity mask */
+    cpu_set_t set;
+    if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
+    {
+      size_t j = 0; // counts the CPUs of the mask seen so far
+      for (int i=0; i<CPU_SETSIZE; i++)
+      {
+        if (!CPU_ISSET(i,&set)) continue;
+
+        if (j == ID) { ID = i; break; }
+        j++;
+      }
+    }
+
+    return ID;
+  }
+#endif
+
+  /*! pins the calling thread to the single CPU that mapThreadID() selects for 'affinity' */
+  void setAffinity(ssize_t affinity)
+  {
+#if !defined(__ANDROID__)
+    const size_t cpu = mapThreadID(affinity);
+
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    CPU_SET(cpu, &mask);
+    pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); // the result is not checked
+#else
+    // TODO(LTE): Implement
+#endif
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// FreeBSD Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__FreeBSD__)
+
+#include <pthread_np.h>
+
+namespace embree
+{
+  /*! pins the calling thread to the CPU with index 'affinity' (the index is used as-is) */
+  void setAffinity(ssize_t affinity)
+  {
+    cpuset_t mask;
+    CPU_ZERO(&mask);
+    CPU_SET(affinity, &mask);
+    const pthread_t self = pthread_self();
+    pthread_setaffinity_np(self, sizeof(mask), &mask); // the result is not checked
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// MacOSX Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__MACOSX__)
+
+#include <mach/thread_act.h>
+#include <mach/thread_policy.h>
+#include <mach/mach_init.h>
+
+namespace embree
+{
+  /*! requests the thread affinity policy with tag 'affinity' for the calling thread */
+  void setAffinity(ssize_t affinity)
+  {
+    thread_affinity_policy policy;
+    policy.affinity_tag = affinity;
+    const kern_return_t status = thread_policy_set(mach_thread_self(),THREAD_AFFINITY_POLICY,(thread_policy_t)&policy,THREAD_AFFINITY_POLICY_COUNT);
+    if (status != KERN_SUCCESS) WARNING("setting thread affinity failed"); // on purpose only a warning
+  }
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__) || defined(PTHREADS_WIN32)
+
+#include <pthread.h>
+#include <sched.h>
+
+#if defined(__USE_NUMA__)
+#include <numa.h>
+#endif
+
+namespace embree
+{
+  /* start parameters handed to threadStartup; an 'affinity' below 0 means "do not pin" */
+  struct ThreadStartupData
+  {
+    /* 'affinity' is taken as ssize_t, the type of the member and of createThread's threadID, so it is not narrowed through int */
+    ThreadStartupData (thread_func f, void* arg, ssize_t affinity)
+      : f(f), arg(arg), affinity(affinity) {}
+    thread_func f;    // function the new thread executes
+    void* arg;        // argument handed to 'f'
+    ssize_t affinity; // requested logical thread; threadStartup only reads it on MacOSX
+  };
+
+  static void* threadStartup(ThreadStartupData* parg)
+  {
+    /* set the bits marked FTZ (15) and DAZ (6) in the control register written via _mm_setcsr */
+    const int ftz = 1<<15, daz = 1<<6;
+    _mm_setcsr(_mm_getcsr() | ftz | daz);
+
+#if defined(__MACOSX__)
+    /*! Mac OS X does not support setting affinity at thread creation time */
+    if (parg->affinity >= 0) setAffinity(parg->affinity);
+#endif
+
+    parg->f(parg->arg);
+    delete parg; // the startup data was allocated with new by createThread
+    return nullptr;
+  }
+
+  /*! creates a hardware thread running on specific core (threadID < 0: no affinity is requested) */
+  thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID)
+  {
+    /* set stack size (0 keeps the default of the attribute object) */
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    if (stack_size > 0) pthread_attr_setstacksize (&attr, stack_size);
+
+    /* create thread; on success the startup data is released by threadStartup and the handle by join() */
+    pthread_t* tid = new pthread_t;
+    ThreadStartupData* startup = new ThreadStartupData(f,arg,threadID);
+    const int err = pthread_create(tid,&attr,(void*(*)(void*))threadStartup,startup);
+    pthread_attr_destroy(&attr);
+    if (err != 0) {
+      delete startup; delete tid; // no thread was started that could release the startup data
+      FATAL("pthread_create failed");
+    }
+
+    /* set affinity */
+#if defined(__LINUX__) && !defined(__ANDROID__)
+    if (threadID >= 0) {
+      cpu_set_t cset;
+      CPU_ZERO(&cset);
+      threadID = mapThreadID(threadID);
+      CPU_SET(threadID, &cset);
+      pthread_setaffinity_np(*tid, sizeof(cset), &cset);
+    }
+#elif defined(__FreeBSD__)
+    if (threadID >= 0) {
+      cpuset_t cset;
+      CPU_ZERO(&cset);
+      CPU_SET(threadID, &cset);
+      pthread_setaffinity_np(*tid, sizeof(cset), &cset);
+    }
+#endif
+    return thread_t(tid);
+  }
+
+  /*! the thread calling this function gets yielded (through sched_yield) */
+  void yield() { sched_yield(); }
+
+  /*! waits until the given thread has terminated, then frees the pthread_t behind the handle */
+  void join(thread_t tid)
+  {
+    pthread_t* handle = (pthread_t*)tid;
+    const int err = pthread_join(*handle, nullptr);
+    if (err != 0) FATAL("pthread_join failed");
+    delete handle;
+  }
+
+  /*! creates thread local storage: a heap-allocated pthread key wrapped into the opaque tls_t handle */
+  tls_t createTls()
+  {
+    pthread_key_t* key = new pthread_key_t;
+    const bool created = pthread_key_create(key,nullptr) == 0;
+    if (!created) {
+      delete key;
+      FATAL("pthread_key_create failed");
+    }
+    return tls_t(key);
+  }
+
+  /*! return the thread local storage pointer stored under the key behind 'tls' */
+  void* getTls(tls_t tls)
+  {
+    assert(tls);
+    return pthread_getspecific(*reinterpret_cast<pthread_key_t*>(tls));
+  }
+
+  /*! set the thread local storage pointer stored under the key behind 'tls' */
+  void setTls(tls_t tls, void* const ptr)
+  {
+    assert(tls);
+    const int err = pthread_setspecific(*reinterpret_cast<pthread_key_t*>(tls), ptr);
+    if (err != 0) FATAL("pthread_setspecific failed");
+  }
+
+  /*! destroys thread local storage identifier: deletes the pthread key, then frees its heap storage */
+  void destroyTls(tls_t tls)
+  {
+    assert(tls);
+    pthread_key_t* key = reinterpret_cast<pthread_key_t*>(tls);
+    if (pthread_key_delete(*key) != 0) FATAL("pthread_key_delete failed");
+    delete key;
+  }
+}
+
+#endif
diff --git a/thirdparty/embree-aarch64/common/sys/thread.h b/thirdparty/embree-aarch64/common/sys/thread.h
new file mode 100644
index 0000000000..45da6e6a70
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/thread.h
@@ -0,0 +1,46 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "platform.h"
+#include "mutex.h"
+#include "alloc.h"
+#include "vector.h"
+#include <vector>
+
+namespace embree
+{
+  /*! opaque handle type for a thread, as returned by createThread and consumed by join */
+  typedef struct opaque_thread_t* thread_t;
+
+  /*! signature of thread start function: receives the 'arg' pointer passed to createThread */
+  typedef void (*thread_func)(void*);
+
+  /*! creates a hardware thread running on specific logical thread; defaults: stack_size = 0, threadID = -1 */
+  thread_t createThread(thread_func f, void* arg, size_t stack_size = 0, ssize_t threadID = -1);
+
+  /*! set affinity of the calling thread */
+  void setAffinity(ssize_t affinity);
+
+  /*! the thread calling this function gets yielded */
+  void yield();
+
+  /*! waits until the given thread has terminated */
+  void join(thread_t tid);
+
+  /*! opaque handle type for thread local storage, as returned by createTls */
+  typedef struct opaque_tls_t* tls_t;
+
+  /*! creates thread local storage */
+  tls_t createTls();
+
+  /*! set the thread local storage pointer */
+  void setTls(tls_t tls, void* const ptr);
+
+  /*! return the thread local storage pointer */
+  void* getTls(tls_t tls);
+
+  /*! destroys thread local storage identifier */
+  void destroyTls(tls_t tls);
+}
diff --git a/thirdparty/embree-aarch64/common/sys/vector.h b/thirdparty/embree-aarch64/common/sys/vector.h
new file mode 100644
index 0000000000..e41794de7c
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/sys/vector.h
@@ -0,0 +1,242 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "alloc.h"
+#include <algorithm>
+
+namespace embree
+{
+   /*! Dynamic array of T whose storage comes from the given allocator type
+    *  (allocate/deallocate/destroy are the only allocator members used).
+    *  resize() and push_back() grow the capacity by doubling; reserve() and
+    *  shrink_to_fit() set the capacity exactly. Bounds are checked with
+    *  assert only. */
+   template<typename T, typename allocator>
+    class vector_t
+    {
+    public:
+      typedef T value_type;
+      typedef T* iterator;
+      typedef const T* const_iterator;
+
+      /*! constructs an empty vector without allocating */
+      __forceinline vector_t ()
+        : size_active(0), size_alloced(0), items(nullptr) {}
+
+      /*! constructs a vector of sz default-constructed elements */
+      __forceinline explicit vector_t (size_t sz)
+        : size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); }
+
+      /*! constructs a vector of sz default-constructed elements, initializing the allocator from alloc */
+      template<typename M>
+      __forceinline explicit vector_t (M alloc, size_t sz)
+      : alloc(alloc), size_active(0), size_alloced(0), items(nullptr) { internal_resize_init(sz); }
+
+      /*! destroys all elements and releases the storage */
+      __forceinline ~vector_t() {
+        clear();
+      }
+
+      /*! copy constructor: allocates other's capacity through a default-constructed
+       *  allocator (other.alloc is not copied) and copy-constructs the active elements */
+      __forceinline vector_t (const vector_t& other)
+      {
+        size_active = other.size_active;
+        size_alloced = other.size_alloced;
+        items = alloc.allocate(size_alloced);
+        for (size_t i=0; i<size_active; i++)
+          ::new (&items[i]) value_type(other.items[i]);
+      }
+
+      /*! move constructor: takes over allocator and storage, leaving other empty */
+      __forceinline vector_t (vector_t&& other)
+        : alloc(std::move(other.alloc))
+      {
+        size_active = other.size_active; other.size_active = 0;
+        size_alloced = other.size_alloced; other.size_alloced = 0;
+        items = other.items; other.items = nullptr;
+      }
+
+      /*! copy assignment: resizes to other's size, then assigns element by element */
+      __forceinline vector_t& operator=(const vector_t& other)
+      {
+        resize(other.size_active);
+        for (size_t i=0; i<size_active; i++)
+          items[i] = value_type(other.items[i]);
+        return *this;
+      }
+
+      /*! move assignment: releases the current content, then takes over other's
+       *  allocator and storage, leaving other empty */
+      __forceinline vector_t& operator=(vector_t&& other)
+      {
+        clear();
+        alloc = std::move(other.alloc);
+        size_active = other.size_active; other.size_active = 0;
+        size_alloced = other.size_alloced; other.size_alloced = 0;
+        items = other.items; other.items = nullptr;
+        return *this;
+      }
+
+      /********************** Iterators  ****************************/
+
+      __forceinline       iterator begin()       { return items; }
+      __forceinline const_iterator begin() const { return items; }
+
+      __forceinline       iterator end  ()       { return items+size_active; }
+      __forceinline const_iterator end  () const { return items+size_active; }
+
+
+      /********************** Capacity ****************************/
+
+      __forceinline bool   empty    () const { return size_active == 0; }
+      __forceinline size_t size     () const { return size_active; }
+      __forceinline size_t capacity () const { return size_alloced; }
+
+
+      /*! changes the number of elements; new elements are default-constructed,
+       *  surplus elements destroyed; capacity only ever grows here (by doubling) */
+      __forceinline void resize(size_t new_size) {
+        internal_resize(new_size,internal_grow_size(new_size));
+      }
+
+      /*! makes the capacity exactly new_alloced if it is currently smaller */
+      __forceinline void reserve(size_t new_alloced)
+      {
+        /* do nothing if container already large enough */
+        if (new_alloced <= size_alloced)
+          return;
+
+        /* resize exact otherwise */
+        internal_resize(size_active,new_alloced);
+      }
+
+      /*! reduces the capacity to the current number of elements */
+      __forceinline void shrink_to_fit() {
+        internal_resize(size_active,size_active);
+      }
+
+      /******************** Element access **************************/
+
+      __forceinline       T& operator[](size_t i)       { assert(i < size_active); return items[i]; }
+      __forceinline const T& operator[](size_t i) const { assert(i < size_active); return items[i]; }
+
+      /* note: unlike std::vector::at, the index is only checked with assert */
+      __forceinline       T& at(size_t i)       { assert(i < size_active); return items[i]; }
+      __forceinline const T& at(size_t i) const { assert(i < size_active); return items[i]; }
+
+      /* note: both return a mutable reference even though they are const members */
+      __forceinline T& front() const { assert(size_active > 0); return items[0]; }
+      __forceinline T& back () const { assert(size_active > 0); return items[size_active-1]; }
+
+      __forceinline       T* data()       { return items; }
+      __forceinline const T* data() const { return items; }
+
+
+      /******************** Modifiers **************************/
+
+      /*! appends a copy of nt, growing the capacity if required */
+      __forceinline void push_back(const T& nt)
+      {
+        T v = nt; // need local copy as input reference could point to this vector
+        internal_resize(size_active,internal_grow_size(size_active+1));
+        ::new (&items[size_active++]) T(std::move(v)); // the local copy is not needed afterwards, so move instead of copying twice
+      }
+
+      /*! destroys the last element; the vector must not be empty */
+      __forceinline void pop_back()
+      {
+        assert(!empty());
+        size_active--;
+        alloc.destroy(&items[size_active]);
+      }
+
+      /*! destroys all elements and releases the storage (capacity becomes 0) */
+      __forceinline void clear()
+      {
+        /* destroy elements */
+        for (size_t i=0; i<size_active; i++)
+          alloc.destroy(&items[i]);
+
+        /* free memory; a vector that never allocated holds nullptr, which must
+           not be handed to deallocate (only pointers obtained from allocate are valid) */
+        if (items != nullptr)
+          alloc.deallocate(items,size_alloced);
+        items = nullptr;
+        size_active = size_alloced = 0;
+      }
+
+    /******************** Comparisons **************************/
+
+    /*! element-wise equality; vectors of different size are never equal */
+    friend bool operator== (const vector_t& a, const vector_t& b)
+    {
+      if (a.size() != b.size()) return false;
+      for (size_t i=0; i<a.size(); i++)
+        if (a[i] != b[i])
+          return false;
+      return true;
+    }
+
+    friend bool operator!= (const vector_t& a, const vector_t& b) {
+      return !(a==b);
+    }
+
+    private:
+
+      /*! initial sizing used by the constructors: allocates exactly new_active
+       *  default-constructed elements; allocates nothing for new_active == 0 */
+      __forceinline void internal_resize_init(size_t new_active)
+      {
+        assert(size_active == 0);
+        assert(size_alloced == 0);
+        assert(items == nullptr);
+        if (new_active == 0) return;
+        items = alloc.allocate(new_active);
+        for (size_t i=0; i<new_active; i++) ::new (&items[i]) T();
+        size_active = new_active;
+        size_alloced = new_active;
+      }
+
+      /*! sets the element count to new_active and the capacity to new_alloced.
+       *  Surplus elements are destroyed, missing ones default-constructed; if the
+       *  capacity changes, the kept elements are move-constructed into a new
+       *  allocation and the old one is released. */
+      __forceinline void internal_resize(size_t new_active, size_t new_alloced)
+      {
+        assert(new_active <= new_alloced);
+
+        /* destroy elements */
+        if (new_active < size_active)
+        {
+          for (size_t i=new_active; i<size_active; i++)
+            alloc.destroy(&items[i]);
+          size_active = new_active;
+        }
+
+        /* only reallocate if necessary */
+        if (new_alloced == size_alloced) {
+          for (size_t i=size_active; i<new_active; i++) ::new (&items[i]) T;
+          size_active = new_active;
+          return;
+        }
+
+        /* reallocate and copy items */
+        T* old_items = items;
+        items = alloc.allocate(new_alloced);
+        for (size_t i=0; i<size_active; i++) {
+          ::new (&items[i]) T(std::move(old_items[i]));
+          alloc.destroy(&old_items[i]);
+        }
+
+        for (size_t i=size_active; i<new_active; i++) {
+          ::new (&items[i]) T;
+        }
+
+        alloc.deallocate(old_items,size_alloced);
+        size_active = new_active;
+        size_alloced = new_alloced;
+      }
+
+      /*! returns the capacity to use for holding new_alloced elements: the
+       *  current capacity if it suffices, otherwise the current capacity doubled
+       *  (starting from 1) until it is large enough */
+      __forceinline size_t internal_grow_size(size_t new_alloced)
+      {
+        /* do nothing if container already large enough */
+        if (new_alloced <= size_alloced)
+          return size_alloced;
+
+        /* resize to next power of 2 otherwise */
+        size_t new_size_alloced = size_alloced;
+        while (new_size_alloced < new_alloced) {
+          new_size_alloced = std::max(size_t(1),2*new_size_alloced);
+        }
+        return new_size_alloced;
+      }
+
+    private:
+      allocator alloc;
+      size_t size_active;    // number of valid items
+      size_t size_alloced;   // number of items allocated
+      T* items;              // data array
+    };
+
+  /*! vector class that performs standard allocations */
+  template<typename T>
+    using vector = vector_t<T,std::allocator<T>>;
+
+  /*! vector class that performs aligned allocations */
+  template<typename T>
+    using avector = vector_t<T,aligned_allocator<T,std::alignment_of<T>::value> >;
+  
+  /*! vector class that performs OS allocations */
+  template<typename T>
+    using ovector = vector_t<T,os_allocator<T> >;
+}
diff --git a/thirdparty/embree-aarch64/common/tasking/taskscheduler.h b/thirdparty/embree-aarch64/common/tasking/taskscheduler.h
new file mode 100644
index 0000000000..9940e068d0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskscheduler.h
@@ -0,0 +1,17 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#if defined(TASKING_INTERNAL)
+#  include "taskschedulerinternal.h"
+#elif defined(TASKING_GCD) && defined(BUILD_IOS)
+#  include "taskschedulergcd.h"
+#elif defined(TASKING_TBB)
+#  include "taskschedulertbb.h"
+#elif defined(TASKING_PPL)
+#  include "taskschedulerppl.h"
+#else
+#  error "no tasking system enabled"
+#endif
+
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h b/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h
new file mode 100644
index 0000000000..d31f8bb478
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulergcd.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+
+#include <dispatch/dispatch.h>
+
+namespace embree
+{
+  /*! Task scheduler interface for the TASKING_GCD configuration (this header
+   *  includes <dispatch/dispatch.h>). Only a thread count and a rotating index
+   *  are kept here; both statics are defined outside this header. */
+  struct TaskScheduler
+  {
+    /*! initializes the task scheduler */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again */
+    static void destroy() {}
+
+    /* returns the ID of the current thread */
+    static __forceinline size_t threadID()
+    {
+      return threadIndex();
+    }
+
+    /* returns the index (0..threadCount-1) of the current thread */
+    /* NOTE(review): the value is not tied to the calling thread -- every call
+       advances the shared counter and returns it modulo GCDNumThreads. The
+       update is a plain read-modify-write on a static, so concurrent callers
+       can race; confirm callers tolerate that. GCDNumThreads must be non-zero
+       here (modulo by zero otherwise) -- presumably set by create(). */
+    static __forceinline size_t threadIndex()
+    {
+        currentThreadIndex = (currentThreadIndex + 1) % GCDNumThreads;
+        return currentThreadIndex;
+    }
+
+    /* returns the total number of threads */
+    static __forceinline size_t threadCount()
+    {
+        return GCDNumThreads;
+    }
+
+    private:
+      static size_t GCDNumThreads;       // value reported by threadCount()
+      static size_t currentThreadIndex;  // rotating counter advanced by threadIndex()
+
+  };
+
+};
+
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp
new file mode 100644
index 0000000000..ebf656d1a0
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp
@@ -0,0 +1,426 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "taskschedulerinternal.h"
+#include "../math/math.h"
+#include "../sys/sysinfo.h"
+#include <algorithm>
+
+namespace embree
+{
+  RTC_NAMESPACE_BEGIN
+  
+  /* file-local mutex; taken in ThreadPool::setNumThreads and TaskScheduler::instance */
+  static MutexSys g_mutex;
+  size_t TaskScheduler::g_numThreads = 0;
+  /* per-thread scheduler, created lazily by TaskScheduler::instance() */
+  __thread TaskScheduler* TaskScheduler::g_instance = nullptr;
+  /* holds a reference to every scheduler created by TaskScheduler::instance() */
+  std::vector<Ref<TaskScheduler>> g_instance_vector;
+  /* per-thread Thread structure, read by thread() and exchanged by swapThread() */
+  __thread TaskScheduler::Thread* TaskScheduler::thread_local_thread = nullptr;
+  /* worker pool shared by all schedulers; created in create(), deleted in destroy() */
+  TaskScheduler::ThreadPool* TaskScheduler::threadPool = nullptr;
+
+  /*! Keeps trying to steal tasks for `thread` for as long as pred() returns true.
+   *  After every successful steal body() is executed and both loop counters are
+   *  reset. A spinning round advances j by the current thread count up to 1024,
+   *  then the thread yields; after 32 such rounds the outer loop starts over. */
+  template<typename Predicate, typename Body>
+  __forceinline void TaskScheduler::steal_loop(Thread& thread, const Predicate& pred, const Body& body)
+  {
+    while (true)
+    {
+      /*! some rounds that yield */
+      for (size_t i=0; i<32; i++)
+      {
+        /*! some spinning rounds */
+        const size_t threadCount = thread.threadCount();
+        for (size_t j=0; j<1024; j+=threadCount)
+        {
+          /* the only exit of this function */
+          if (!pred()) return;
+          if (thread.scheduler->steal_from_other_threads(thread)) {
+            i=j=0;
+            body();
+          }
+        }
+        yield();
+      }
+    }
+  }
+
+  /*! Runs this task on `thread` unless it was already taken (state no longer
+   *  INITIALIZED), then helps with other work until all of its dependencies
+   *  have completed and finally signals the parent task. */
+  void TaskScheduler::Task::run_internal (Thread& thread) // FIXME: avoid as many dll_exports as possible
+  {
+    /* try to run if not already stolen */
+    if (try_switch_state(INITIALIZED,DONE))
+    {
+      /* make this task the current one while the closure runs, so tasks spawned by it get this task as parent */
+      Task* prevTask = thread.task;
+      thread.task = this;
+      // -- GODOT start --
+      // try {
+      // if (thread.scheduler->cancellingException == nullptr)
+          closure->execute();
+      // } catch (...) {
+      //   if (thread.scheduler->cancellingException == nullptr)
+      //     thread.scheduler->cancellingException = std::current_exception();
+      // }
+      // -- GODOT end --
+      thread.task = prevTask;
+      /* drop the initial dependency the task was constructed with */
+      add_dependencies(-1);
+    }
+
+    /* steal until all dependencies have completed */
+    steal_loop(thread,
+               [&] () { return dependencies>0; },
+               [&] () { while (thread.tasks.execute_local_internal(thread,this)); });
+
+    /* now signal our parent task that we are finished */
+    if (parent)
+      parent->add_dependencies(-1);
+  }
+
+    /*! run this task (exported entry point, forwards to run_internal) */
+  dll_export void TaskScheduler::Task::run (Thread& thread) {
+    run_internal(thread);
+  }
+
+  /*! Executes the task on the right end of this queue and pops it.
+   *  \param thread thread that executes the task
+   *  \param parent task at which execution stops (it is never run from here)
+   *  \return false if nothing was executed (queue empty or right-most task is
+   *          `parent`); otherwise true while tasks remain in the queue */
+  bool TaskScheduler::TaskQueue::execute_local_internal(Thread& thread, Task* parent)
+  {
+    /* stop if we run out of local tasks or reach the waiting task */
+    if (right == 0 || &tasks[right-1] == parent)
+      return false;
+
+    /* execute task */
+    size_t oldRight = right;
+    tasks[right-1].run_internal(thread);
+    /* the executed task has to consume everything it pushed before returning */
+    if (right != oldRight) {
+      THROW_RUNTIME_ERROR("you have to wait for spawned subtasks");
+    }
+
+    /* pop task and closure from stack */
+    right--;
+    /* size_t(-1) marks a task without closure storage of its own (stolen tasks are constructed with stackPtr -1) */
+    if (tasks[right].stackPtr != size_t(-1))
+      stackPtr = tasks[right].stackPtr;
+
+    /* also move left pointer */
+    if (left >= right) left.store(right.load());
+
+    return right != 0;
+  }
+
+  /*! exported entry point, forwards to execute_local_internal */
+  dll_export bool TaskScheduler::TaskQueue::execute_local(Thread& thread, Task* parent) {
+    return execute_local_internal(thread,parent);
+  }
+
+  /*! Tries to steal the left-most task of this queue and to place it at the
+   *  right end of the queue of the stealing `thread`.
+   *  \return true if a task was stolen */
+  bool TaskScheduler::TaskQueue::steal(Thread& thread)
+  {
+    size_t l = left;
+    size_t r = right;
+    if (l < r)
+    {
+      /* claim one slot; re-check against the snapshot of right as left may have been advanced concurrently */
+      l = left++;
+       if (l >= r)
+         return false;
+    }
+    else
+      return false;
+
+    /* NOTE(review): thread.tasks.right is not checked against TASK_STACK_SIZE here (push_right does check) -- confirm this cannot overflow */
+    if (!tasks[l].try_steal(thread.tasks.tasks[thread.tasks.right]))
+      return false;
+
+    thread.tasks.right++;
+    return true;
+  }
+
+  /* we steal from the left */
+  /*! returns the size N of the left-most task, or 0 if there is no task between left and right */
+  size_t TaskScheduler::TaskQueue::getTaskSizeAtLeft()
+  {
+    if (left >= right) return 0;
+    return tasks[left].N;
+  }
+
+  /*! start function of the pool's worker threads: takes ownership of the
+   *  heap-allocated (pool, thread index) pair and enters the pool's thread loop */
+  void threadPoolFunction(std::pair<TaskScheduler::ThreadPool*,size_t>* pair)
+  {
+    /* copy the payload out so the pair can be freed before the loop is entered */
+    const std::pair<TaskScheduler::ThreadPool*,size_t> args = *pair;
+    delete pair;
+    args.first->thread_loop(args.second);
+  }
+
+  /*! creates a pool without threads; threads are only created by setNumThreads */
+  TaskScheduler::ThreadPool::ThreadPool(bool set_affinity)
+    : numThreads(0), numThreadsRunning(0), set_affinity(set_affinity), running(false) {}
+
+  /*! starts the worker threads for the configured thread count, unless the pool is already running */
+  dll_export void TaskScheduler::ThreadPool::startThreads()
+  {
+    if (running) return;
+    setNumThreads(numThreads,true);
+  }
+
+  /*! Sets the number of threads of the pool and starts/stops worker threads
+   *  accordingly.
+   *  \param newNumThreads requested thread count (must be non-zero); clamped to
+   *                       the number of logical threads
+   *  \param startThreads  if false and the pool is not running yet, only the
+   *                       count is recorded and no thread is created */
+  void TaskScheduler::ThreadPool::setNumThreads(size_t newNumThreads, bool startThreads)
+  {
+    Lock<MutexSys> lock(g_mutex);
+    assert(newNumThreads);
+    newNumThreads = min(newNumThreads, (size_t) getNumberOfLogicalThreads());
+
+    /* Store the count exactly once. A platform-conditional store of
+       2*newNumThreads used to precede this assignment, but it was always
+       overwritten here and only exposed a transient doubled value to
+       concurrent readers of size(). */
+    numThreads = newNumThreads;
+    if (!startThreads && !running) return;
+    running = true;
+    size_t numThreadsActive = numThreadsRunning;
+
+    /* publish the new count under the pool mutex and wake waiting workers so
+       that surplus ones can leave their loop */
+    mutex.lock();
+    numThreadsRunning = newNumThreads;
+    mutex.unlock();
+    condition.notify_all();
+
+    /* start new threads */
+    for (size_t t=numThreadsActive; t<numThreads; t++)
+    {
+      /* no thread is created for index 0 */
+      if (t == 0) continue;
+      /* ownership of the pair passes to threadPoolFunction, which deletes it */
+      auto pair = new std::pair<TaskScheduler::ThreadPool*,size_t>(this,t);
+      threads.push_back(createThread((thread_func)threadPoolFunction,pair,4*1024*1024,set_affinity ? t : -1));
+    }
+
+    /* stop some threads if we reduce the number of threads */
+    for (ssize_t t=numThreadsActive-1; t>=ssize_t(numThreadsRunning); t--) {
+      if (t == 0) continue;
+      embree::join(threads.back());
+      threads.pop_back();
+    }
+  }
+
+  /*! asks all worker threads to leave their loop and waits until they have terminated */
+  TaskScheduler::ThreadPool::~ThreadPool()
+  {
+    /* a running count of zero makes every worker leave its loop */
+    mutex.lock();
+    numThreadsRunning = 0;
+    mutex.unlock();
+    condition.notify_all();
+
+    /* block until each worker has terminated */
+    for (thread_t worker : threads)
+      embree::join(worker);
+  }
+
+  /*! registers a scheduler with the pool and wakes the worker threads waiting for one */
+  dll_export void TaskScheduler::ThreadPool::add(const Ref<TaskScheduler>& scheduler)
+  {
+    {
+      /* scoped lock (as in remove()): the mutex is released on every exit path of this scope */
+      Lock<MutexSys> lock(mutex);
+      schedulers.push_back(scheduler);
+    }
+    /* notify after the lock is released, as before */
+    condition.notify_all();
+  }
+
+  /*! unregisters the first list entry equal to `scheduler`; does nothing if it is not registered */
+  dll_export void TaskScheduler::ThreadPool::remove(const Ref<TaskScheduler>& scheduler)
+  {
+    Lock<MutexSys> lock(mutex);
+    std::list<Ref<TaskScheduler> >::iterator pos = schedulers.begin();
+    while (pos != schedulers.end() && !(scheduler == *pos))
+      pos++;
+    if (pos != schedulers.end())
+      schedulers.erase(pos);
+  }
+
+  /*! Main loop of a pool worker thread. The worker sleeps until a scheduler is
+   *  registered, joins the first registered scheduler, and repeats. It leaves
+   *  the loop once its global index is no longer below numThreadsRunning. */
+  void TaskScheduler::ThreadPool::thread_loop(size_t globalThreadIndex)
+  {
+    while (globalThreadIndex < numThreadsRunning)
+    {
+      Ref<TaskScheduler> scheduler = NULL;
+      ssize_t threadIndex = -1;
+      {
+        Lock<MutexSys> lock(mutex);
+        /* woken by add(), setNumThreads() and the destructor */
+        condition.wait(mutex, [&] () { return globalThreadIndex >= numThreadsRunning || !schedulers.empty(); });
+        if (globalThreadIndex >= numThreadsRunning) break;
+        scheduler = schedulers.front();
+        /* the index is allocated while the pool mutex is still held */
+        threadIndex = scheduler->allocThreadIndex();
+      }
+      /* work for that scheduler outside of the lock */
+      scheduler->thread_loop(threadIndex);
+    }
+  }
+
+  /*! creates a scheduler without registered threads; one (initially empty)
+   *  slot is reserved per potential thread */
+  TaskScheduler::TaskScheduler()
+    : threadCounter(0), anyTasksRunning(0), hasRootTask(false)
+  {
+    threadLocal.resize(2*getNumberOfLogicalThreads()); // FIXME: this has to be 2x as in the compatibility join mode with rtcCommitScene the worker threads also join. When disallowing rtcCommitScene to join a build we can remove the 2x.
+    for (auto& slot : threadLocal)
+      slot.store(nullptr);
+  }
+
+  /*! no thread may still be registered when the scheduler is destroyed */
+  TaskScheduler::~TaskScheduler()
+  {
+    assert(threadCounter == 0);
+  }
+
+  /*! returns the index of the calling thread inside its scheduler, or 0 if the
+   *  calling thread has no thread structure installed */
+  dll_export size_t TaskScheduler::threadID()
+  {
+    const Thread* const self = TaskScheduler::thread();
+    return self ? self->threadIndex : 0;
+  }
+
+  /*! same value as threadID() */
+  dll_export size_t TaskScheduler::threadIndex()
+  {
+    const Thread* const self = TaskScheduler::thread();
+    return self ? self->threadIndex : 0;
+  }
+
+  /*! returns the thread count configured for the pool; dereferences threadPool
+   *  unconditionally, so create() has to be called first */
+  dll_export size_t TaskScheduler::threadCount() {
+    return threadPool->size();
+  }
+
+  /*! returns the scheduler of the calling thread, creating it on first use */
+  dll_export TaskScheduler* TaskScheduler::instance()
+  {
+    /* g_instance is thread local, so only the shared vector below needs the lock */
+    if (g_instance == NULL) {
+      Lock<MutexSys> lock(g_mutex);
+      g_instance = new TaskScheduler;
+      /* the vector stores a Ref to the new scheduler */
+      g_instance_vector.push_back(g_instance);
+    }
+    return g_instance;
+  }
+
+  /*! creates the shared thread pool on first use and configures its thread
+   *  count; set_affinity is only used when the pool is created by this call */
+  void TaskScheduler::create(size_t numThreads, bool set_affinity, bool start_threads)
+  {
+    if (!threadPool) threadPool = new TaskScheduler::ThreadPool(set_affinity);
+    threadPool->setNumThreads(numThreads,start_threads);
+  }
+
+  /*! deletes the shared thread pool (its destructor joins the worker threads) */
+  void TaskScheduler::destroy() {
+    delete threadPool; threadPool = nullptr;
+  }
+
+  /*! registers one more thread with this scheduler and returns its index (the
+   *  previous value of threadCounter); the index range is only checked by assert */
+  dll_export ssize_t TaskScheduler::allocThreadIndex()
+  {
+    size_t threadIndex = threadCounter++;
+    assert(threadIndex < threadLocal.size());
+    return threadIndex;
+  }
+
+  /*! lets the calling thread join this scheduler: registers it, blocks until a
+   *  root task has been spawned, then participates in thread_loop */
+  void TaskScheduler::join()
+  {
+    mutex.lock();
+    size_t threadIndex = allocThreadIndex();
+    /* hasRootTask is set (and the condition signaled) by spawn_root */
+    condition.wait(mutex, [&] () { return hasRootTask.load(); });
+    mutex.unlock();
+    // -- GODOT start --
+    // std::exception_ptr except = thread_loop(threadIndex);
+    // if (except != nullptr) std::rethrow_exception(except);
+    thread_loop(threadIndex);
+    // -- GODOT end --
+  }
+
+  /*! clears the root-task flag that join() waits on */
+  void TaskScheduler::reset() {
+    hasRootTask = false;
+  }
+
+  /*! Spins until at least threadCount-1 threads have registered with this
+   *  scheduler (threadCount includes the calling thread). Returns immediately
+   *  for threadCount <= 1. */
+  void TaskScheduler::wait_for_threads(size_t threadCount)
+  {
+    /* written as threadCounter+1 < threadCount: the equivalent
+       threadCounter < threadCount-1 wraps around for threadCount == 0 and
+       would then never terminate */
+    while (threadCounter+1 < threadCount)
+      pause_cpu();
+  }
+
+  /*! returns the thread structure installed for the calling thread (nullptr if none) */
+  dll_export TaskScheduler::Thread* TaskScheduler::thread() {
+    return thread_local_thread;
+  }
+
+  /*! installs `thread` as thread structure of the calling thread and returns
+   *  the previously installed one, so that callers can restore it later */
+  dll_export TaskScheduler::Thread* TaskScheduler::swapThread(Thread* thread)
+  {
+    Thread* old = thread_local_thread;
+    thread_local_thread = thread;
+    return old;
+  }
+
+  /*! Executes the local tasks queued on top of the currently active task of
+   *  the calling thread until that task is reached again.
+   *  \return true if the calling thread has no thread structure, otherwise
+   *          whether the scheduler has no cancelling exception recorded */
+  dll_export bool TaskScheduler::wait()
+  {
+    Thread* thread = TaskScheduler::thread();
+    if (thread == nullptr) return true;
+    while (thread->tasks.execute_local_internal(*thread,thread->task)) {};
+    return thread->scheduler->cancellingException == nullptr;
+  }
+
+// -- GODOT start --
+//   std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
+  /*! Loop of a thread that participates in this scheduler under the given
+   *  index: publishes a Thread structure in threadLocal[threadIndex], steals and
+   *  executes tasks while any task is running, then unregisters again and
+   *  waits until all other threads have left as well. */
+  void TaskScheduler::thread_loop(size_t threadIndex)
+// -- GODOT end --
+  {
+    /* allocate thread structure */
+    std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
+    Thread& thread = *mthread;
+    threadLocal[threadIndex].store(&thread);
+    Thread* oldThread = swapThread(&thread);
+
+    /* main thread loop */
+    while (anyTasksRunning)
+    {
+      steal_loop(thread,
+                 [&] () { return anyTasksRunning > 0; },
+                 [&] () {
+                   /* count this thread as running while it executes stolen work */
+                   anyTasksRunning++;
+                   while (thread.tasks.execute_local_internal(thread,nullptr));
+                   anyTasksRunning--;
+                 });
+    }
+    /* unpublish before the Thread structure is released at the end of this function */
+    threadLocal[threadIndex].store(nullptr);
+    swapThread(oldThread);
+
+    /* remember exception to throw */
+    // -- GODOT start --
+    // std::exception_ptr except = nullptr;
+    // if (cancellingException != nullptr) except = cancellingException;
+    // -- GODOT end --
+    /* wait for all threads to terminate */
+    threadCounter--;
+#if defined(__WIN32__)
+	size_t loopIndex = 1;
+#endif
+/* note: this macro stays defined for the rest of the translation unit */
+#define LOOP_YIELD_THRESHOLD (4096)
+	while (threadCounter > 0) {
+#if defined(__WIN32__)
+          /* on Windows mostly spin with a pause and only yield every LOOP_YIELD_THRESHOLD-th iteration */
+          if ((loopIndex % LOOP_YIELD_THRESHOLD) == 0)
+            yield();
+          else
+            _mm_pause();
+	  loopIndex++;
+#else
+          yield();
+#endif
+	}
+    // -- GODOT start --
+    // return except;
+    return;
+    // -- GODOT end --
+  }
+
+  /*! Tries to steal one task for `thread` from the other threads registered
+   *  with this scheduler, visiting them in index order starting after the
+   *  thread's own index (wrapping around).
+   *  \return true as soon as one steal succeeded, false if none did */
+  bool TaskScheduler::steal_from_other_threads(Thread& thread)
+  {
+    const size_t threadIndex = thread.threadIndex;
+    /* snapshot: the counter may change while we iterate */
+    const size_t threadCount = this->threadCounter;
+
+    for (size_t i=1; i<threadCount; i++)
+    {
+      pause_cpu(32);
+      size_t otherThreadIndex = threadIndex+i;
+      if (otherThreadIndex >= threadCount) otherThreadIndex -= threadCount;
+
+      /* slots are nullptr while no thread structure is published for that index */
+      Thread* othread = threadLocal[otherThreadIndex].load();
+      if (!othread)
+        continue;
+
+      if (othread->tasks.steal(thread))
+        return true;
+    }
+
+    return false;
+  }
+
+  /* The three functions below forward to the shared thread pool. They
+     dereference threadPool unconditionally, so create() has to be called first. */
+
+  /*! starts the threads of the shared pool */
+  dll_export void TaskScheduler::startThreads() {
+    threadPool->startThreads();
+  }
+
+  /*! registers a scheduler with the shared pool */
+  dll_export void TaskScheduler::addScheduler(const Ref<TaskScheduler>& scheduler) {
+    threadPool->add(scheduler);
+  }
+
+  /*! unregisters a scheduler from the shared pool */
+  dll_export void TaskScheduler::removeScheduler(const Ref<TaskScheduler>& scheduler) {
+    threadPool->remove(scheduler);
+  }
+
+  RTC_NAMESPACE_END
+}
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h
new file mode 100644
index 0000000000..8bd70b2b8c
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h
@@ -0,0 +1,386 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+#include "../sys/atomic.h"
+#include "../math/range.h"
+#include "../../include/embree3/rtcore.h"
+
+#include <list>
+
+namespace embree
+{
+
+  /* The tasking system exports some symbols to be used by the tutorials. Thus we
+     also hide it in the API namespace when requested. */
+  RTC_NAMESPACE_BEGIN
+
+  struct TaskScheduler : public RefCount
+  {
+    ALIGNED_STRUCT_(64);
+    friend class Device;
+
+    static const size_t TASK_STACK_SIZE = 4*1024;           //!< task structure stack
+    static const size_t CLOSURE_STACK_SIZE = 512*1024;    //!< stack for task closures
+
+    struct Thread;
+
+    /*! virtual interface for all tasks */
+    /* note: no virtual destructor is declared; the instances visible in this
+       file are placement-constructed on the closure stack by push_right */
+    struct TaskFunction {
+      virtual void execute() = 0;
+    };
+
+    /*! builds a task interface from a closure */
+    template<typename Closure>
+    struct ClosureTaskFunction : public TaskFunction
+    {
+      Closure closure;  //!< copy of the closure passed to the constructor
+      __forceinline ClosureTaskFunction (const Closure& closure) : closure(closure) {}
+      /*! invokes the stored closure */
+      void execute() { closure(); };
+    };
+
+    /*! One entry of a thread's task stack. A task is INITIALIZED while it waits
+     *  to be run or stolen and DONE afterwards (and by default). The dependency
+     *  counter starts at 1 and is incremented by child tasks; the parent is
+     *  signaled once the task has finished. */
+    struct __aligned(64) Task
+    {
+      /*! states a task can be in */
+      enum { DONE, INITIALIZED };
+
+      /*! switch from one state to another */
+      /* the transition is expected to succeed; this is only verified by assert */
+      __forceinline void switch_state(int from, int to)
+      {
+	__memory_barrier();
+        MAYBE_UNUSED bool success = state.compare_exchange_strong(from,to);
+	assert(success);
+      }
+
+      /*! try to switch from one state to another */
+      /* returns true if the state was `from` and has been changed to `to` */
+      __forceinline bool try_switch_state(int from, int to) {
+	__memory_barrier();
+	return state.compare_exchange_strong(from,to);
+      }
+
+       /*! increment/decrement dependency counter */
+      void add_dependencies(int n) {
+	dependencies+=n;
+      }
+
+      /*! initialize all tasks to DONE state by default */
+      /* only `state` is initialized here; the other members are set by the constructors below */
+      __forceinline Task()
+	: state(DONE) {}
+
+      /*! construction of new task */
+      /* `state` is not part of the initializer list; the switch_state call
+         expects the slot to read DONE already. Registers itself as a
+         dependency of the parent (if any). */
+      __forceinline Task (TaskFunction* closure, Task* parent, size_t stackPtr, size_t N)
+        : dependencies(1), stealable(true), closure(closure), parent(parent), stackPtr(stackPtr), N(N)
+      {
+        if (parent) parent->add_dependencies(+1);
+	switch_state(DONE,INITIALIZED);
+      }
+
+      /*! construction of stolen task, stealing thread will decrement initial dependency */
+      /* stolen tasks are not stealable again, carry no closure stack position
+         (stackPtr -1) and do not increment the parent's dependency counter */
+      __forceinline Task (TaskFunction* closure, Task* parent)
+        : dependencies(1), stealable(false), closure(closure), parent(parent), stackPtr(-1), N(1)
+      {
+	switch_state(DONE,INITIALIZED);
+      }
+
+      /*! try to steal this task */
+      /* on success this task is marked DONE and `child` is constructed in place
+         as a task that runs the same closure with this task as its parent */
+      bool try_steal(Task& child)
+      {
+        if (!stealable) return false;
+	if (!try_switch_state(INITIALIZED,DONE)) return false;
+	new (&child) Task(closure, this);
+        return true;
+      }
+
+      /*! run this task */
+      dll_export void run(Thread& thread);
+
+      /*! implementation of run() */
+      void run_internal(Thread& thread);
+
+    public:
+      std::atomic<int> state;            //!< state this task is in
+      std::atomic<int> dependencies;     //!< dependencies to wait for
+      std::atomic<bool> stealable;       //!< true if task can be stolen
+      TaskFunction* closure;             //!< the closure to execute
+      Task* parent;                      //!< parent task to signal when we are finished
+      size_t stackPtr;                   //!< stack location where closure is stored
+      size_t N;                          //!< approximative size of task
+    };
+
+    /*! Per-thread task storage: a fixed-size stack of tasks plus a fixed-size
+     *  byte stack holding their closures. The owning thread pushes and executes
+     *  at the right end, other threads steal from the left end. */
+    struct TaskQueue
+    {
+      TaskQueue ()
+      : left(0), right(0), stackPtr(0) {}
+
+      /*! allocates `bytes` bytes from the closure stack at an offset that is a
+       *  multiple of `align` (assumes align is a power of two); aborts the
+       *  process if the closure stack is exhausted */
+      __forceinline void* alloc(size_t bytes, size_t align = 64)
+      {
+        /* bytes plus the padding needed to round stackPtr up to the alignment */
+        size_t ofs = bytes + ((align - stackPtr) & (align-1));
+        if (stackPtr + ofs > CLOSURE_STACK_SIZE)
+          // -- GODOT start --
+          // throw std::runtime_error("closure stack overflow");
+          abort();
+          // -- GODOT end --
+        stackPtr += ofs;
+        return &stack[stackPtr-bytes];
+      }
+
+      /*! pushes a new task that executes a copy of `closure` onto the right end
+       *  of the task stack; the current task of `thread` becomes its parent.
+       *  Aborts the process if the task stack is full. */
+      template<typename Closure>
+      __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure)
+      {
+        if (right >= TASK_STACK_SIZE)
+          // -- GODOT start --
+          // throw std::runtime_error("task stack overflow");
+          abort();
+          // -- GODOT end --
+
+	/* allocate new task on right side of stack */
+        /* the task remembers the closure stack position from before the allocation, which is restored when the task is popped */
+        size_t oldStackPtr = stackPtr;
+        TaskFunction* func = new (alloc(sizeof(ClosureTaskFunction<Closure>))) ClosureTaskFunction<Closure>(closure);
+        /* gcc 8 or later fails to compile without explicit .load() */
+        new (&(tasks[right.load()])) Task(func,thread.task,oldStackPtr,size);
+        right++;
+
+	/* also move left pointer */
+	if (left >= right-1) left = right-1;
+      }
+
+      /*! executes and pops the right-most task; see execute_local_internal */
+      dll_export bool execute_local(Thread& thread, Task* parent);
+      bool execute_local_internal(Thread& thread, Task* parent);
+      /*! lets `thread` try to steal the left-most task of this queue */
+      bool steal(Thread& thread);
+      /*! returns the size of the left-most task (0 if there is none) */
+      size_t getTaskSizeAtLeft();
+
+      bool empty() { return right == 0; }
+
+    public:
+
+      /* task stack */
+      Task tasks[TASK_STACK_SIZE];
+      __aligned(64) std::atomic<size_t> left;   //!< threads steal from left
+      __aligned(64) std::atomic<size_t> right;  //!< new tasks are added to the right
+
+      /* closure stack */
+      __aligned(64) char stack[CLOSURE_STACK_SIZE];
+      size_t stackPtr;                          //!< number of bytes of the closure stack in use
+    };
+
+    /*! thread local structure for each thread */
+    /* contains the complete TaskQueue (task and closure stacks) by value; the
+       users in this file allocate it on the heap for that reason */
+    struct Thread
+    {
+      ALIGNED_STRUCT_(64);
+
+      Thread (size_t threadIndex, const Ref<TaskScheduler>& scheduler)
+      : threadIndex(threadIndex), task(nullptr), scheduler(scheduler) {}
+
+      /*! returns the number of threads currently registered with the scheduler */
+      __forceinline size_t threadCount() {
+        return scheduler->threadCounter;
+      }
+
+      size_t threadIndex;              //!< ID of this thread
+      TaskQueue tasks;                 //!< local task queue
+      Task* task;                      //!< current active task
+      Ref<TaskScheduler> scheduler;     //!< pointer to task scheduler
+    };
+
+    /*! pool of worker threads */
+    /* workers sleep on `condition` until a scheduler is registered through
+       add() and then join the first registered scheduler */
+    struct ThreadPool
+    {
+      ThreadPool (bool set_affinity);
+      ~ThreadPool ();
+
+      /*! starts the threads */
+      dll_export void startThreads();
+
+      /*! sets number of threads to use */
+      void setNumThreads(size_t numThreads, bool startThreads = false);
+
+      /*! adds a task scheduler object for scheduling */
+      dll_export void add(const Ref<TaskScheduler>& scheduler);
+
+      /*! remove the task scheduler object again */
+      dll_export void remove(const Ref<TaskScheduler>& scheduler);
+
+      /*! returns number of threads of the thread pool */
+      size_t size() const { return numThreads; }
+
+      /*! main loop for all threads */
+      void thread_loop(size_t threadIndex);
+
+    private:
+      std::atomic<size_t> numThreads;         //!< configured thread count, reported by size()
+      std::atomic<size_t> numThreadsRunning;  //!< workers with an index at or above this value leave their loop
+      bool set_affinity;                      //!< if true, worker t is created with t as its logical thread
+      std::atomic<bool> running;              //!< set once threads have been started
+      std::vector<thread_t> threads;          //!< handles of the created worker threads
+
+    private:
+      MutexSys mutex;                              //!< protects `schedulers`; paired with `condition`
+      ConditionSys condition;                      //!< signaled when schedulers are added or the thread count changes
+      std::list<Ref<TaskScheduler> > schedulers;   //!< registered schedulers
+    };
+
+    TaskScheduler ();
+    ~TaskScheduler ();
+
+    /*! initializes the task scheduler */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again */
+    static void destroy();
+
+    /*! lets new worker threads join the tasking system */
+    void join();
+    void reset();
+
+    /*! let a worker thread allocate a thread index */
+    dll_export ssize_t allocThreadIndex();
+
+    /*! wait for some number of threads available (threadCount includes main thread) */
+    void wait_for_threads(size_t threadCount);
+
+    /*! thread loop for all worker threads */
+    // -- GODOT start --
+    // std::exception_ptr thread_loop(size_t threadIndex);
+    void thread_loop(size_t threadIndex);
+    // -- GODOT end --
+
+    /*! steals a task from a different thread */
+    bool steal_from_other_threads(Thread& thread);
+
+    template<typename Predicate, typename Body>
+      static void steal_loop(Thread& thread, const Predicate& pred, const Body& body);
+
+    /* spawn a new task at the top of the threads task stack */
+    /*! Runs `closure` as root task of this scheduler on the calling thread and
+     *  returns once the task stack is drained and all participating threads
+     *  have left the scheduler.
+     *  \param closure       work to execute
+     *  \param size          approximative size of the task
+     *  \param useThreadPool if true, the pool threads are started and this
+     *                       scheduler is registered with the pool for the
+     *                       duration of the call
+     *  Rethrows a recorded cancelling exception, if any. */
+    template<typename Closure>
+      void spawn_root(const Closure& closure, size_t size = 1, bool useThreadPool = true)
+    {
+      if (useThreadPool) startThreads();
+
+      /* register the calling thread and publish its thread structure */
+      size_t threadIndex = allocThreadIndex();
+      std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
+      Thread& thread = *mthread;
+      assert(threadLocal[threadIndex].load() == nullptr);
+      threadLocal[threadIndex] = &thread;
+      Thread* oldThread = swapThread(&thread);
+      thread.tasks.push_right(thread,size,closure);
+      {
+        /* wakes threads blocked in join() */
+        Lock<MutexSys> lock(mutex);
+	anyTasksRunning++;
+        hasRootTask = true;
+        condition.notify_all();
+      }
+
+      if (useThreadPool) addScheduler(this);
+
+      /* execute the root task (and whatever remains on the local stack) */
+      while (thread.tasks.execute_local(thread,nullptr));
+      anyTasksRunning--;
+      if (useThreadPool) removeScheduler(this);
+
+      threadLocal[threadIndex] = nullptr;
+      swapThread(oldThread);
+
+      /* remember exception to throw */
+      std::exception_ptr except = nullptr;
+      if (cancellingException != nullptr) except = cancellingException;
+
+      /* wait for all threads to terminate */
+      threadCounter--;
+      while (threadCounter > 0) yield();
+      cancellingException = nullptr;
+
+      /* re-throw proper exception */
+      if (except != nullptr)
+        std::rethrow_exception(except);
+    }
+
+    /* spawn a new task at the top of the threads task stack */
+    /* If the calling thread already takes part in a scheduler the task is only
+       pushed onto its local stack; otherwise the closure is run as root task
+       of the calling thread's scheduler instance (spawn_root returns only
+       after it has completed). */
+    template<typename Closure>
+    static __forceinline void spawn(size_t size, const Closure& closure)
+    {
+      Thread* thread = TaskScheduler::thread();
+      if (likely(thread != nullptr)) thread->tasks.push_right(*thread,size,closure);
+      else                           instance()->spawn_root(closure,size);
+    }
+
+    /* spawn a new task at the top of the threads task stack */
+    /* same as above with a task size of 1 */
+    template<typename Closure>
+    static __forceinline void spawn(const Closure& closure) {
+      spawn(1,closure);
+    }
+
+    /* spawn a new task set  */
+    /* Processes [begin,end) by recursive halving: ranges of at most blockSize
+       elements are passed to closure as range<Index>, larger ranges are split
+       at the center into two subtasks which are waited for. The arguments are
+       captured by value. */
+    template<typename Index, typename Closure>
+    static void spawn(const Index begin, const Index end, const Index blockSize, const Closure& closure)
+    {
+      spawn(end-begin, [=]()
+        {
+	  if (end-begin <= blockSize) {
+	    return closure(range<Index>(begin,end));
+	  }
+	  const Index center = (begin+end)/2;
+	  spawn(begin,center,blockSize,closure);
+	  spawn(center,end  ,blockSize,closure);
+	  wait();
+	});
+    }
+
+    /* work on spawned subtasks and wait until all have finished */
+    dll_export static bool wait();
+
+    /* returns the ID of the current thread */
+    dll_export static size_t threadID();
+
+    /* returns the index (0..threadCount-1) of the current thread */
+    dll_export static size_t threadIndex();
+
+    /* returns the total number of threads */
+    dll_export static size_t threadCount();
+
+  private:
+
+    /* returns the thread local task list of this worker thread */
+    dll_export static Thread* thread();
+
+    /* sets the thread local task list of this worker thread */
+    dll_export static Thread* swapThread(Thread* thread);
+
+    /*! returns the taskscheduler object to be used by the master thread */
+    dll_export static TaskScheduler* instance();
+
+    /*! starts the threads */
+    dll_export static void startThreads();
+
+    /*! adds a task scheduler object for scheduling */
+    dll_export static void addScheduler(const Ref<TaskScheduler>& scheduler);
+
+    /*! remove the task scheduler object again */
+    dll_export static void removeScheduler(const Ref<TaskScheduler>& scheduler);
+
+  private:
+    std::vector<atomic<Thread*>> threadLocal;
+    std::atomic<size_t> threadCounter;
+    std::atomic<size_t> anyTasksRunning;
+    std::atomic<bool> hasRootTask;
+    std::exception_ptr cancellingException;
+    MutexSys mutex;
+    ConditionSys condition;
+
+  private:
+    static size_t g_numThreads;
+    static __thread TaskScheduler* g_instance;
+    static __thread Thread* thread_local_thread;
+    static ThreadPool* threadPool;
+  };
+
+  RTC_NAMESPACE_END
+
+#if defined(RTC_NAMESPACE)
+    using RTC_NAMESPACE::TaskScheduler;
+#endif
+}
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h
new file mode 100644
index 0000000000..776f98cdac
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerppl.h
@@ -0,0 +1,46 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+
+#if !defined(__WIN32__)
+#error PPL tasking system only available under windows
+#endif
+
+#include <ppl.h>
+
+namespace embree
+{
+  struct TaskScheduler
+  {
+    /*! initializes the task scheduler */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again */
+    static void destroy();
+
+    /* returns the ID of the current thread, as reported by GetCurrentThreadId() */
+    static __forceinline size_t threadID() {
+      return GetCurrentThreadId();
+    }
+
+    /* nominally the index (0..threadCount-1) of the current thread; this variant always returns 0 */
+    /* FIXME: threadIndex is NOT supported by PPL! */
+    static __forceinline size_t threadIndex() {
+      return 0;
+    }
+
+    /* returns the total number of threads, computed as GetMaximumProcessorCount(ALL_PROCESSOR_GROUPS) + 1 */
+    static __forceinline size_t threadCount() {
+      return GetMaximumProcessorCount(ALL_PROCESSOR_GROUPS) + 1;
+    }
+  };
+};
diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h b/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h
new file mode 100644
index 0000000000..98dba26871
--- /dev/null
+++ b/thirdparty/embree-aarch64/common/tasking/taskschedulertbb.h
@@ -0,0 +1,67 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../sys/platform.h"
+#include "../sys/alloc.h"
+#include "../sys/barrier.h"
+#include "../sys/thread.h"
+#include "../sys/mutex.h"
+#include "../sys/condition.h"
+#include "../sys/ref.h"
+
+#if defined(__WIN32__)
+#  define NOMINMAX
+#endif
+
+// We need to define these to avoid implicit linkage against
+// tbb_debug.lib under Windows. When removing these lines debug build
+// under Windows fails.
+#define __TBB_NO_IMPLICIT_LINKAGE 1
+#define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1
+#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1
+#define TBB_PREVIEW_ISOLATED_TASK_GROUP 1
+#include "tbb/tbb.h"
+#include "tbb/parallel_sort.h"
+
+namespace embree
+{
+  struct TaskScheduler
+  {
+    /*! initializes the task scheduler */
+    static void create(size_t numThreads, bool set_affinity, bool start_threads);
+
+    /*! destroys the task scheduler again */
+    static void destroy();
+
+    /* returns the ID of the current thread (in this variant simply forwards to threadIndex()) */
+    static __forceinline size_t threadID()
+    {
+      return threadIndex();
+    }
+
+    /* returns the index of the current thread as reported by TBB's current_thread_index(); 0 when TBB_INTERFACE_VERSION < 9000 */
+    static __forceinline size_t threadIndex()
+    {
+#if TBB_INTERFACE_VERSION >= 9100
+      return tbb::this_task_arena::current_thread_index();
+#elif TBB_INTERFACE_VERSION >= 9000
+      return tbb::task_arena::current_thread_index();
+#else
+      return 0;
+#endif
+    }
+
+    /* returns the total number of threads: max_concurrency() for TBB_INTERFACE_VERSION >= 9100, else TBB's default_num_threads() */
+    static __forceinline size_t threadCount() {
+#if TBB_INTERFACE_VERSION >= 9100
+      return tbb::this_task_arena::max_concurrency();
+#else
+      return tbb::task_scheduler_init::default_num_threads();
+#endif
+    }
+
+  };
+
+};
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore.h b/thirdparty/embree-aarch64/include/embree3/rtcore.h
new file mode 100644
index 0000000000..5830bb5880
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore.h
@@ -0,0 +1,14 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_config.h"
+#include "rtcore_common.h"
+#include "rtcore_device.h"
+#include "rtcore_buffer.h"
+#include "rtcore_ray.h"
+#include "rtcore_geometry.h"
+#include "rtcore_scene.h"
+#include "rtcore_builder.h"
+#include "rtcore_quaternion.h"
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_buffer.h b/thirdparty/embree-aarch64/include/embree3/rtcore_buffer.h
new file mode 100644
index 0000000000..400b604aa5
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_buffer.h
@@ -0,0 +1,51 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_device.h"
+
+RTC_NAMESPACE_BEGIN
+
+/* Types of buffers */
+enum RTCBufferType
+{
+  RTC_BUFFER_TYPE_INDEX            = 0,
+  RTC_BUFFER_TYPE_VERTEX           = 1,
+  RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE = 2,
+  RTC_BUFFER_TYPE_NORMAL           = 3,
+  RTC_BUFFER_TYPE_TANGENT          = 4,
+  RTC_BUFFER_TYPE_NORMAL_DERIVATIVE = 5,
+
+  RTC_BUFFER_TYPE_GRID                 = 8,
+
+  RTC_BUFFER_TYPE_FACE                 = 16,
+  RTC_BUFFER_TYPE_LEVEL                = 17,
+  RTC_BUFFER_TYPE_EDGE_CREASE_INDEX    = 18,
+  RTC_BUFFER_TYPE_EDGE_CREASE_WEIGHT   = 19,
+  RTC_BUFFER_TYPE_VERTEX_CREASE_INDEX  = 20,
+  RTC_BUFFER_TYPE_VERTEX_CREASE_WEIGHT = 21,
+  RTC_BUFFER_TYPE_HOLE                 = 22,
+
+  RTC_BUFFER_TYPE_FLAGS = 32
+};
+
+/* Opaque buffer type */
+typedef struct RTCBufferTy* RTCBuffer;
+
+/* Creates a new buffer. */
+RTC_API RTCBuffer rtcNewBuffer(RTCDevice device, size_t byteSize);
+
+/* Creates a new shared buffer. */
+RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice device, void* ptr, size_t byteSize);
+
+/* Returns a pointer to the buffer data. */
+RTC_API void* rtcGetBufferData(RTCBuffer buffer);
+
+/* Retains the buffer (increments the reference count). */
+RTC_API void rtcRetainBuffer(RTCBuffer buffer);
+
+/* Releases the buffer (decrements the reference count). */
+RTC_API void rtcReleaseBuffer(RTCBuffer buffer);
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_builder.h b/thirdparty/embree-aarch64/include/embree3/rtcore_builder.h
new file mode 100644
index 0000000000..d62a7f72cc
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_builder.h
@@ -0,0 +1,125 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_scene.h"
+
+RTC_NAMESPACE_BEGIN
+  
+/* Opaque BVH type */
+typedef struct RTCBVHTy* RTCBVH;
+
+/* Input build primitives for the builder */
+struct RTC_ALIGN(32) RTCBuildPrimitive
+{
+  float lower_x, lower_y, lower_z; 
+  unsigned int geomID;
+  float upper_x, upper_y, upper_z;
+  unsigned int primID;
+};
+
+/* Opaque thread local allocator type */
+typedef struct RTCThreadLocalAllocatorTy* RTCThreadLocalAllocator;
+
+/* Callback to create a node */
+typedef void* (*RTCCreateNodeFunction) (RTCThreadLocalAllocator allocator, unsigned int childCount, void* userPtr);
+
+/* Callback to set the pointer to all children */
+typedef void (*RTCSetNodeChildrenFunction) (void* nodePtr, void** children, unsigned int childCount, void* userPtr);
+
+/* Callback to set the bounds of all children */
+typedef void (*RTCSetNodeBoundsFunction) (void* nodePtr, const struct RTCBounds** bounds, unsigned int childCount, void* userPtr);
+
+/* Callback to create a leaf node */
+typedef void* (*RTCCreateLeafFunction) (RTCThreadLocalAllocator allocator, const struct RTCBuildPrimitive* primitives, size_t primitiveCount, void* userPtr);
+
+/* Callback to split a build primitive */
+typedef void (*RTCSplitPrimitiveFunction) (const struct RTCBuildPrimitive* primitive, unsigned int dimension, float position, struct RTCBounds* leftBounds, struct RTCBounds* rightBounds, void* userPtr);
+
+/* Build flags */
+enum RTCBuildFlags
+{
+  RTC_BUILD_FLAG_NONE    = 0,
+  RTC_BUILD_FLAG_DYNAMIC = (1 << 0),
+};
+
+enum RTCBuildConstants
+{
+  RTC_BUILD_MAX_PRIMITIVES_PER_LEAF = 32
+};
+
+/* Input for builders */
+struct RTCBuildArguments
+{
+  size_t byteSize;
+  
+  enum RTCBuildQuality buildQuality;
+  enum RTCBuildFlags buildFlags;
+  unsigned int maxBranchingFactor;
+  unsigned int maxDepth;
+  unsigned int sahBlockSize;
+  unsigned int minLeafSize;
+  unsigned int maxLeafSize;
+  float traversalCost;
+  float intersectionCost;
+  
+  RTCBVH bvh;
+  struct RTCBuildPrimitive* primitives;
+  size_t primitiveCount;
+  size_t primitiveArrayCapacity;
+  
+  RTCCreateNodeFunction createNode;
+  RTCSetNodeChildrenFunction setNodeChildren;
+  RTCSetNodeBoundsFunction setNodeBounds;
+  RTCCreateLeafFunction createLeaf;
+  RTCSplitPrimitiveFunction splitPrimitive;
+  RTCProgressMonitorFunction buildProgress;
+  void* userPtr;
+};
+
+/* Returns the default build settings: every field is assigned; all pointers and callbacks are NULL. */
+RTC_FORCEINLINE struct RTCBuildArguments rtcDefaultBuildArguments(void) /* (void): a real prototype in C, same meaning in C++ */
+{
+  struct RTCBuildArguments args;
+  args.byteSize = sizeof(args); /* size of this struct as compiled by the caller */
+  args.buildQuality = RTC_BUILD_QUALITY_MEDIUM;
+  args.buildFlags = RTC_BUILD_FLAG_NONE;
+  args.maxBranchingFactor = 2;
+  args.maxDepth = 32;
+  args.sahBlockSize = 1;
+  args.minLeafSize = 1;
+  args.maxLeafSize = RTC_BUILD_MAX_PRIMITIVES_PER_LEAF;
+  args.traversalCost = 1.0f;
+  args.intersectionCost = 1.0f;
+  args.bvh = NULL; /* caller must supply the BVH, primitives and callbacks below */
+  args.primitives = NULL;
+  args.primitiveCount = 0;
+  args.primitiveArrayCapacity = 0;
+  args.createNode = NULL;
+  args.setNodeChildren = NULL;
+  args.setNodeBounds = NULL;
+  args.createLeaf = NULL;
+  args.splitPrimitive = NULL;
+  args.buildProgress = NULL;
+  args.userPtr = NULL;
+  return args;
+}
+
+/* Creates a new BVH. */
+RTC_API RTCBVH rtcNewBVH(RTCDevice device);
+
+/* Builds a BVH. */
+RTC_API void* rtcBuildBVH(const struct RTCBuildArguments* args);
+
+/* Allocates memory using the thread local allocator. */
+RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator allocator, size_t bytes, size_t align);
+
+/* Retains the BVH (increments reference count). */
+RTC_API void rtcRetainBVH(RTCBVH bvh);
+
+/* Releases the BVH (decrements reference count). */
+RTC_API void rtcReleaseBVH(RTCBVH bvh);
+
+RTC_NAMESPACE_END
+
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_common.h b/thirdparty/embree-aarch64/include/embree3/rtcore_common.h
new file mode 100644
index 0000000000..890e06faa3
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_common.h
@@ -0,0 +1,326 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <stddef.h>
+#include <sys/types.h>
+#include <stdbool.h>
+
+#include "rtcore_config.h"
+
+RTC_NAMESPACE_BEGIN
+
+#if defined(_WIN32) /* declare ssize_t for Windows builds */
+#if defined(_M_X64) || defined(_M_ARM64) /* 64-bit targets (x64 and ARM64) need a 64-bit signed size type */
+typedef long long ssize_t;
+#else /* remaining (32-bit) Windows targets */
+typedef int ssize_t;
+#endif
+#endif
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+#  define RTC_ALIGN(...) __declspec(align(__VA_ARGS__))
+#else
+#  define RTC_ALIGN(...) __attribute__((aligned(__VA_ARGS__)))
+#endif
+
+#if !defined (RTC_DEPRECATED)
+#ifdef __GNUC__
+  #define RTC_DEPRECATED __attribute__((deprecated))
+#elif defined(_MSC_VER)
+  #define RTC_DEPRECATED __declspec(deprecated)
+#else
+  #define RTC_DEPRECATED
+#endif
+#endif
+
+#if defined(_WIN32)
+#  define RTC_FORCEINLINE __forceinline
+#else
+#  define RTC_FORCEINLINE inline __attribute__((always_inline))
+#endif
+
+/* Invalid geometry ID */
+#define RTC_INVALID_GEOMETRY_ID ((unsigned int)-1)
+
+/* Maximum number of time steps */
+#define RTC_MAX_TIME_STEP_COUNT 129
+
+/* Formats of buffers and other data structures */
+enum RTCFormat
+{
+  RTC_FORMAT_UNDEFINED = 0,
+
+  /* 8-bit unsigned integer */
+  RTC_FORMAT_UCHAR = 0x1001,
+  RTC_FORMAT_UCHAR2,
+  RTC_FORMAT_UCHAR3,
+  RTC_FORMAT_UCHAR4,
+
+  /* 8-bit signed integer */
+  RTC_FORMAT_CHAR = 0x2001,
+  RTC_FORMAT_CHAR2,
+  RTC_FORMAT_CHAR3,
+  RTC_FORMAT_CHAR4,
+
+  /* 16-bit unsigned integer */
+  RTC_FORMAT_USHORT = 0x3001,
+  RTC_FORMAT_USHORT2,
+  RTC_FORMAT_USHORT3,
+  RTC_FORMAT_USHORT4,
+
+  /* 16-bit signed integer */
+  RTC_FORMAT_SHORT = 0x4001,
+  RTC_FORMAT_SHORT2,
+  RTC_FORMAT_SHORT3,
+  RTC_FORMAT_SHORT4,
+
+  /* 32-bit unsigned integer */
+  RTC_FORMAT_UINT = 0x5001,
+  RTC_FORMAT_UINT2,
+  RTC_FORMAT_UINT3,
+  RTC_FORMAT_UINT4,
+
+  /* 32-bit signed integer */
+  RTC_FORMAT_INT = 0x6001,
+  RTC_FORMAT_INT2,
+  RTC_FORMAT_INT3,
+  RTC_FORMAT_INT4,
+
+  /* 64-bit unsigned integer */
+  RTC_FORMAT_ULLONG = 0x7001,
+  RTC_FORMAT_ULLONG2,
+  RTC_FORMAT_ULLONG3,
+  RTC_FORMAT_ULLONG4,
+
+  /* 64-bit signed integer */
+  RTC_FORMAT_LLONG = 0x8001,
+  RTC_FORMAT_LLONG2,
+  RTC_FORMAT_LLONG3,
+  RTC_FORMAT_LLONG4,
+
+  /* 32-bit float */
+  RTC_FORMAT_FLOAT = 0x9001,
+  RTC_FORMAT_FLOAT2,
+  RTC_FORMAT_FLOAT3,
+  RTC_FORMAT_FLOAT4,
+  RTC_FORMAT_FLOAT5,
+  RTC_FORMAT_FLOAT6,
+  RTC_FORMAT_FLOAT7,
+  RTC_FORMAT_FLOAT8,
+  RTC_FORMAT_FLOAT9,
+  RTC_FORMAT_FLOAT10,
+  RTC_FORMAT_FLOAT11,
+  RTC_FORMAT_FLOAT12,
+  RTC_FORMAT_FLOAT13,
+  RTC_FORMAT_FLOAT14,
+  RTC_FORMAT_FLOAT15,
+  RTC_FORMAT_FLOAT16,
+
+  /* 32-bit float matrix (row-major order) */
+  RTC_FORMAT_FLOAT2X2_ROW_MAJOR = 0x9122,
+  RTC_FORMAT_FLOAT2X3_ROW_MAJOR = 0x9123,
+  RTC_FORMAT_FLOAT2X4_ROW_MAJOR = 0x9124,
+  RTC_FORMAT_FLOAT3X2_ROW_MAJOR = 0x9132,
+  RTC_FORMAT_FLOAT3X3_ROW_MAJOR = 0x9133,
+  RTC_FORMAT_FLOAT3X4_ROW_MAJOR = 0x9134,
+  RTC_FORMAT_FLOAT4X2_ROW_MAJOR = 0x9142,
+  RTC_FORMAT_FLOAT4X3_ROW_MAJOR = 0x9143,
+  RTC_FORMAT_FLOAT4X4_ROW_MAJOR = 0x9144,
+
+  /* 32-bit float matrix (column-major order) */
+  RTC_FORMAT_FLOAT2X2_COLUMN_MAJOR = 0x9222,
+  RTC_FORMAT_FLOAT2X3_COLUMN_MAJOR = 0x9223,
+  RTC_FORMAT_FLOAT2X4_COLUMN_MAJOR = 0x9224,
+  RTC_FORMAT_FLOAT3X2_COLUMN_MAJOR = 0x9232,
+  RTC_FORMAT_FLOAT3X3_COLUMN_MAJOR = 0x9233,
+  RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR = 0x9234,
+  RTC_FORMAT_FLOAT4X2_COLUMN_MAJOR = 0x9242,
+  RTC_FORMAT_FLOAT4X3_COLUMN_MAJOR = 0x9243,
+  RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR = 0x9244,
+
+  /* special 12-byte format for grids */
+  RTC_FORMAT_GRID = 0xA001
+};
+
+/* Build quality levels */
+enum RTCBuildQuality
+{
+  RTC_BUILD_QUALITY_LOW    = 0,
+  RTC_BUILD_QUALITY_MEDIUM = 1,
+  RTC_BUILD_QUALITY_HIGH   = 2,
+  RTC_BUILD_QUALITY_REFIT  = 3,
+};
+
+/* Axis-aligned bounding box representation */
+struct RTC_ALIGN(16) RTCBounds
+{
+  float lower_x, lower_y, lower_z, align0;
+  float upper_x, upper_y, upper_z, align1;
+};
+
+/* Linear axis-aligned bounding box representation */
+struct RTC_ALIGN(16) RTCLinearBounds
+{
+  struct RTCBounds bounds0;
+  struct RTCBounds bounds1;
+};
+
+/* Intersection context flags */
+enum RTCIntersectContextFlags
+{
+  RTC_INTERSECT_CONTEXT_FLAG_NONE       = 0,        // no flags set; same value (0) as INCOHERENT
+  RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT = (0 << 0), // optimize for incoherent rays
+  RTC_INTERSECT_CONTEXT_FLAG_COHERENT   = (1 << 0)  // optimize for coherent rays
+};
+
+/* Arguments for RTCFilterFunctionN */
+struct RTCFilterFunctionNArguments
+{
+  int* valid;
+  void* geometryUserPtr;
+  struct RTCIntersectContext* context;
+  struct RTCRayN* ray;
+  struct RTCHitN* hit;
+  unsigned int N;
+};
+
+/* Filter callback function */
+typedef void (*RTCFilterFunctionN)(const struct RTCFilterFunctionNArguments* args);
+
+/* Intersection context passed to intersect/occluded calls */
+struct RTCIntersectContext
+{
+  enum RTCIntersectContextFlags flags;               // intersection flags
+  RTCFilterFunctionN filter;                         // filter function to execute
+  
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  unsigned int instStackSize;                        // Number of instances currently on the stack.
+#endif
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // The current stack of instance ids.
+  
+#if RTC_MIN_WIDTH
+  float minWidthDistanceFactor;                      // curve radius is set to this factor times distance to ray origin
+#endif
+};
+
+/* Initializes an intersection context: incoherent flag, no filter, all instance IDs invalid. */
+RTC_FORCEINLINE void rtcInitIntersectContext(struct RTCIntersectContext* context)
+{
+  unsigned l = 0; // index into context->instID, used by the loop below
+  context->flags = RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT;
+  context->filter = NULL;
+  
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  context->instStackSize = 0; // field only exists when more than one instance level is configured
+#endif
+  for (; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+    context->instID[l] = RTC_INVALID_GEOMETRY_ID;
+  
+#if RTC_MIN_WIDTH
+  context->minWidthDistanceFactor = 0.0f; // field only exists when RTC_MIN_WIDTH is enabled
+#endif
+}
+
+/* Point query structure for closest point query */
+struct RTC_ALIGN(16) RTCPointQuery
+{
+  float x;                // x coordinate of the query point
+  float y;                // y coordinate of the query point
+  float z;                // z coordinate of the query point
+  float time;             // time of the point query
+  float radius;           // radius of the point query
+};
+
+/* Structure of a packet of 4 query points */
+struct RTC_ALIGN(16) RTCPointQuery4
+{
+  float x[4];                // x coordinate of the query point
+  float y[4];                // y coordinate of the query point
+  float z[4];                // z coordinate of the query point
+  float time[4];             // time of the point query
+  float radius[4];           // radius of the point query
+};
+
+/* Structure of a packet of 8 query points */
+struct RTC_ALIGN(32) RTCPointQuery8
+{
+  float x[8];                // x coordinate of the query point
+  float y[8];                // y coordinate of the query point
+  float z[8];                // z coordinate of the query point
+  float time[8];             // time of the point query
+  float radius[8];           // radius of the point query
+};
+
+/* Structure of a packet of 16 query points */
+struct RTC_ALIGN(64) RTCPointQuery16
+{
+  float x[16];                // x coordinate of the query point
+  float y[16];                // y coordinate of the query point
+  float z[16];                // z coordinate of the query point
+  float time[16];             // time of the point query
+  float radius[16];           // radius of the point query
+};
+
+struct RTCPointQueryN;
+
+struct RTC_ALIGN(16) RTCPointQueryContext
+{
+  // accumulated 4x4 column major matrices from world space to instance space.
+  // undefined if size == 0.
+  float world2inst[RTC_MAX_INSTANCE_LEVEL_COUNT][16];
+
+  // accumulated 4x4 column major matrices from instance space to world space.
+  // undefined if size == 0.
+  float inst2world[RTC_MAX_INSTANCE_LEVEL_COUNT][16];
+
+  // instance ids.
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT];
+
+  // number of instances currently on the stack.
+  unsigned int instStackSize;
+};
+
+/* Initializes a point query context. */
+RTC_FORCEINLINE void rtcInitPointQueryContext(struct RTCPointQueryContext* context)
+{
+  context->instStackSize = 0;                   // no instances on the stack yet
+  context->instID[0] = RTC_INVALID_GEOMETRY_ID; // only the first instance ID slot is reset here
+}
+
+struct RTC_ALIGN(16) RTCPointQueryFunctionArguments
+{
+  // The (world space) query object that was passed as an argument of rtcPointQuery. The
+  // radius of the query can be decreased inside the callback to shrink the
+  // search domain. Increasing the radius or modifying the time or position of
+  // the query results in undefined behaviour.
+  struct RTCPointQuery* query;
+
+  // Used for user input/output data. Will not be read or modified internally.
+  void* userPtr;
+
+  // primitive and geometry ID of primitive
+  unsigned int  primID;
+  unsigned int  geomID;
+
+  // the context with transformation and instance ID stack
+  struct RTCPointQueryContext* context;
+
+  // If the current instance transform M (= context->world2inst[context->instStackSize])
+  // is a similarity matrix, i.e. there is a constant factor similarityScale such that
+  //    for all x,y: dist(Mx, My) = similarityScale * dist(x, y),
+  // this field holds that factor. The similarity scale is 0 if and only if the
+  // current instance transform is not a similarity transform. It allows computing
+  // distance information in instance space and scaling the distances into world
+  // space by dividing by the similarity scale, for example, to update the
+  // query radius. If the current instance transform is not a similarity
+  // transform (similarityScale = 0), the distance computation has to be
+  // performed in world space to ensure correctness. If there is no instance
+  // transform (context->instStackSize == 0), the similarity scale is 1.
+  float similarityScale;
+};
+
+typedef bool (*RTCPointQueryFunction)(struct RTCPointQueryFunctionArguments* args);
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_config.h b/thirdparty/embree-aarch64/include/embree3/rtcore_config.h
new file mode 100644
index 0000000000..337d4e9487
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_config.h
@@ -0,0 +1,57 @@
+
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define RTC_VERSION_MAJOR 3
+#define RTC_VERSION_MINOR 12
+#define RTC_VERSION_PATCH 1
+#define RTC_VERSION 31201
+#define RTC_VERSION_STRING "3.12.1"
+
+#define RTC_MAX_INSTANCE_LEVEL_COUNT 1
+
+#define EMBREE_MIN_WIDTH 0
+#define RTC_MIN_WIDTH EMBREE_MIN_WIDTH
+
+#define EMBREE_STATIC_LIB
+/* #undef EMBREE_API_NAMESPACE */
+
+#if defined(EMBREE_API_NAMESPACE)
+#  define RTC_NAMESPACE
+#  define RTC_NAMESPACE_BEGIN namespace  {
+#  define RTC_NAMESPACE_END }
+#  define RTC_NAMESPACE_USE using namespace ;
+#  define RTC_API_EXTERN_C
+#  undef EMBREE_API_NAMESPACE
+#else
+#  define RTC_NAMESPACE_BEGIN
+#  define RTC_NAMESPACE_END
+#  define RTC_NAMESPACE_USE
+#  if defined(__cplusplus)
+#    define RTC_API_EXTERN_C extern "C"
+#  else
+#    define RTC_API_EXTERN_C
+#  endif
+#endif
+
+#if defined(ISPC)
+#  define RTC_API_IMPORT extern "C" unmasked
+#  define RTC_API_EXPORT extern "C" unmasked
+#elif defined(EMBREE_STATIC_LIB)
+#  define RTC_API_IMPORT RTC_API_EXTERN_C
+#  define RTC_API_EXPORT RTC_API_EXTERN_C
+#elif defined(_WIN32)
+#  define RTC_API_IMPORT RTC_API_EXTERN_C __declspec(dllimport)
+#  define RTC_API_EXPORT RTC_API_EXTERN_C __declspec(dllexport)
+#else
+#  define RTC_API_IMPORT RTC_API_EXTERN_C
+#  define RTC_API_EXPORT RTC_API_EXTERN_C __attribute__ ((visibility ("default")))
+#endif
+
+#if defined(RTC_EXPORT_API)
+#  define RTC_API RTC_API_EXPORT
+#else
+#  define RTC_API RTC_API_IMPORT
+#endif
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_device.h b/thirdparty/embree-aarch64/include/embree3/rtcore_device.h
new file mode 100644
index 0000000000..594e2b755d
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_device.h
@@ -0,0 +1,87 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_common.h"
+
+RTC_NAMESPACE_BEGIN
+
+/* Opaque device type */
+typedef struct RTCDeviceTy* RTCDevice;
+
+/* Creates a new Embree device. */
+RTC_API RTCDevice rtcNewDevice(const char* config);
+
+/* Retains the Embree device (increments the reference count). */
+RTC_API void rtcRetainDevice(RTCDevice device);
+  
+/* Releases an Embree device (decrements the reference count). */
+RTC_API void rtcReleaseDevice(RTCDevice device);
+
+/* Device properties */
+enum RTCDeviceProperty
+{
+  RTC_DEVICE_PROPERTY_VERSION       = 0,
+  RTC_DEVICE_PROPERTY_VERSION_MAJOR = 1,
+  RTC_DEVICE_PROPERTY_VERSION_MINOR = 2,
+  RTC_DEVICE_PROPERTY_VERSION_PATCH = 3,
+
+  RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED  = 32,
+  RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED  = 33,
+  RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED = 34,
+  RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED   = 35,
+
+  RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED = 63,
+  RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED          = 64,
+  RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED    = 65,
+  RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED   = 66,
+  RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED = 67,
+  RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED       = 68,
+
+  RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED    = 96,
+  RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED        = 97,
+  RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED = 98,
+  RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED       = 99,
+  RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED        = 100,
+  RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED       = 101,
+
+  RTC_DEVICE_PROPERTY_TASKING_SYSTEM        = 128,
+  RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED = 129,
+  RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED = 130
+};
+
+/* Gets a device property. */
+RTC_API ssize_t rtcGetDeviceProperty(RTCDevice device, enum RTCDeviceProperty prop);
+
+/* Sets a device property. */
+RTC_API void rtcSetDeviceProperty(RTCDevice device, const enum RTCDeviceProperty prop, ssize_t value);
+  
+/* Error codes */
+enum RTCError
+{
+  RTC_ERROR_NONE              = 0,
+  RTC_ERROR_UNKNOWN           = 1,
+  RTC_ERROR_INVALID_ARGUMENT  = 2,
+  RTC_ERROR_INVALID_OPERATION = 3,
+  RTC_ERROR_OUT_OF_MEMORY     = 4,
+  RTC_ERROR_UNSUPPORTED_CPU   = 5,
+  RTC_ERROR_CANCELLED         = 6
+};
+
+/* Returns the error code. */
+RTC_API enum RTCError rtcGetDeviceError(RTCDevice device);
+
+/* Error callback function */
+typedef void (*RTCErrorFunction)(void* userPtr, enum RTCError code, const char* str);
+
+/* Sets the error callback function. */
+RTC_API void rtcSetDeviceErrorFunction(RTCDevice device, RTCErrorFunction error, void* userPtr);
+
+/* Memory monitor callback function */
+typedef bool (*RTCMemoryMonitorFunction)(void* ptr, ssize_t bytes, bool post);
+
+/* Sets the memory monitor callback function. */
+RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice device, RTCMemoryMonitorFunction memoryMonitor, void* userPtr);
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_geometry.h b/thirdparty/embree-aarch64/include/embree3/rtcore_geometry.h
new file mode 100644
index 0000000000..c70f1b0e5c
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_geometry.h
@@ -0,0 +1,383 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_buffer.h"
+#include "rtcore_quaternion.h"
+
+RTC_NAMESPACE_BEGIN
+
+/* Opaque scene type */
+typedef struct RTCSceneTy* RTCScene;
+
+/* Opaque geometry type */
+typedef struct RTCGeometryTy* RTCGeometry;
+
+/* Types of geometries */
+enum RTCGeometryType
+{
+  RTC_GEOMETRY_TYPE_TRIANGLE = 0, // triangle mesh
+  RTC_GEOMETRY_TYPE_QUAD     = 1, // quad (triangle pair) mesh
+  RTC_GEOMETRY_TYPE_GRID     = 2, // grid mesh
+
+  RTC_GEOMETRY_TYPE_SUBDIVISION = 8, // Catmull-Clark subdivision surface
+
+  RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE   = 15, // Cone linear curves - discontinuous at edge boundaries 
+  RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE  = 16, // Round (rounded cone like) linear curves 
+  RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE   = 17, // flat (ribbon-like) linear curves
+
+  RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE  = 24, // round (tube-like) Bezier curves
+  RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE   = 25, // flat (ribbon-like) Bezier curves
+  RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE  = 26, // flat normal-oriented Bezier curves
+  
+  RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE = 32, // round (tube-like) B-spline curves
+  RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE  = 33, // flat (ribbon-like) B-spline curves
+  RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE  = 34, // flat normal-oriented B-spline curves
+
+  RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE = 40, // round (tube-like) Hermite curves
+  RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE  = 41, // flat (ribbon-like) Hermite curves
+  RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE  = 42, // flat normal-oriented Hermite curves
+
+  RTC_GEOMETRY_TYPE_SPHERE_POINT = 50,
+  RTC_GEOMETRY_TYPE_DISC_POINT = 51,
+  RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT = 52,
+
+  RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE = 58, // round (tube-like) Catmull-Rom curves
+  RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE  = 59, // flat (ribbon-like) Catmull-Rom curves
+  RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE  = 60, // flat normal-oriented Catmull-Rom curves
+
+  RTC_GEOMETRY_TYPE_USER     = 120, // user-defined geometry
+  RTC_GEOMETRY_TYPE_INSTANCE = 121  // scene instance
+};
+
+/* Interpolation modes for subdivision surfaces */
+enum RTCSubdivisionMode
+{
+  RTC_SUBDIVISION_MODE_NO_BOUNDARY     = 0,
+  RTC_SUBDIVISION_MODE_SMOOTH_BOUNDARY = 1,
+  RTC_SUBDIVISION_MODE_PIN_CORNERS     = 2,
+  RTC_SUBDIVISION_MODE_PIN_BOUNDARY    = 3,
+  RTC_SUBDIVISION_MODE_PIN_ALL         = 4,
+};
+
+/* Curve segment flags */
+enum RTCCurveFlags
+{
+  RTC_CURVE_FLAG_NEIGHBOR_LEFT  = (1 << 0), // left segment exists
+  RTC_CURVE_FLAG_NEIGHBOR_RIGHT = (1 << 1)  // right segment exists
+};
+
+/* Arguments for RTCBoundsFunction */
+struct RTCBoundsFunctionArguments
+{
+  void* geometryUserPtr;
+  unsigned int primID;
+  unsigned int timeStep;
+  struct RTCBounds* bounds_o;
+};
+
+/* Bounding callback function */
+typedef void (*RTCBoundsFunction)(const struct RTCBoundsFunctionArguments* args);
+
+/* Arguments for RTCIntersectFunctionN */
+struct RTCIntersectFunctionNArguments
+{
+  int* valid;
+  void* geometryUserPtr;
+  unsigned int primID;
+  struct RTCIntersectContext* context;
+  struct RTCRayHitN* rayhit;
+  unsigned int N;
+  unsigned int geomID;
+};
+
+/* Intersection callback function */
+typedef void (*RTCIntersectFunctionN)(const struct RTCIntersectFunctionNArguments* args);
+
+/* Arguments for RTCOccludedFunctionN */
+struct RTCOccludedFunctionNArguments
+{
+  int* valid;
+  void* geometryUserPtr;
+  unsigned int primID;
+  struct RTCIntersectContext* context;
+  struct RTCRayN* ray;
+  unsigned int N;
+  unsigned int geomID;
+};
+
+/* Occlusion callback function */
+typedef void (*RTCOccludedFunctionN)(const struct RTCOccludedFunctionNArguments* args);
+
+/* Arguments for RTCDisplacementFunctionN */
+struct RTCDisplacementFunctionNArguments
+{
+  void* geometryUserPtr;
+  RTCGeometry geometry;
+  unsigned int primID;
+  unsigned int timeStep;
+  const float* u;
+  const float* v;
+  const float* Ng_x;
+  const float* Ng_y;
+  const float* Ng_z;
+  float* P_x;
+  float* P_y;
+  float* P_z;
+  unsigned int N;
+};
+
+/* Displacement mapping callback function */
+typedef void (*RTCDisplacementFunctionN)(const struct RTCDisplacementFunctionNArguments* args);
+
+/* Creates a new geometry of specified type. */
+RTC_API RTCGeometry rtcNewGeometry(RTCDevice device, enum RTCGeometryType type);
+
+/* Retains the geometry (increments the reference count). */
+RTC_API void rtcRetainGeometry(RTCGeometry geometry);
+
+/* Releases the geometry (decrements the reference count) */
+RTC_API void rtcReleaseGeometry(RTCGeometry geometry);
+
+/* Commits the geometry. */
+RTC_API void rtcCommitGeometry(RTCGeometry geometry);
+
+
+/* Enables the geometry. */
+RTC_API void rtcEnableGeometry(RTCGeometry geometry);
+
+/* Disables the geometry. */
+RTC_API void rtcDisableGeometry(RTCGeometry geometry);
+
+
+/* Sets the number of motion blur time steps of the geometry. */
+RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry geometry, unsigned int timeStepCount);
+
+/* Sets the motion blur time range of the geometry. */
+RTC_API void rtcSetGeometryTimeRange(RTCGeometry geometry, float startTime, float endTime);
+  
+/* Sets the number of vertex attributes of the geometry. */
+RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry geometry, unsigned int vertexAttributeCount);
+
+/* Sets the ray mask of the geometry. */
+RTC_API void rtcSetGeometryMask(RTCGeometry geometry, unsigned int mask);
+
+/* Sets the build quality of the geometry. */
+RTC_API void rtcSetGeometryBuildQuality(RTCGeometry geometry, enum RTCBuildQuality quality);
+
+/* Sets the maximal curve or point radius scale allowed by min-width feature. */
+RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry geometry, float maxRadiusScale);
+
+
+/* Sets a geometry buffer. */
+RTC_API void rtcSetGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, RTCBuffer buffer, size_t byteOffset, size_t byteStride, size_t itemCount);
+
+/* Sets a shared geometry buffer. */
+RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount);
+
+/* Creates and sets a new geometry buffer. */
+RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot, enum RTCFormat format, size_t byteStride, size_t itemCount);
+
+/* Returns the pointer to the data of a buffer. */
+RTC_API void* rtcGetGeometryBufferData(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot);
+
+/* Updates a geometry buffer. */
+RTC_API void rtcUpdateGeometryBuffer(RTCGeometry geometry, enum RTCBufferType type, unsigned int slot);
+
+
+/* Sets the intersection filter callback function of the geometry. */
+RTC_API void rtcSetGeometryIntersectFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter);
+
+/* Sets the occlusion filter callback function of the geometry. */
+RTC_API void rtcSetGeometryOccludedFilterFunction(RTCGeometry geometry, RTCFilterFunctionN filter);
+
+/* Sets the user-defined data pointer of the geometry. */
+RTC_API void rtcSetGeometryUserData(RTCGeometry geometry, void* ptr);
+
+/* Gets the user-defined data pointer of the geometry. */
+RTC_API void* rtcGetGeometryUserData(RTCGeometry geometry);
+
+/* Sets the point query callback function of a geometry. */
+RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry geometry, RTCPointQueryFunction pointQuery);
+
+/* Sets the number of primitives of a user geometry. */
+RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry geometry, unsigned int userPrimitiveCount);
+
+/* Sets the bounding callback function to calculate bounding boxes for user primitives. */
+RTC_API void rtcSetGeometryBoundsFunction(RTCGeometry geometry, RTCBoundsFunction bounds, void* userPtr);
+
+/* Sets the intersect callback function of a user geometry. */
+RTC_API void rtcSetGeometryIntersectFunction(RTCGeometry geometry, RTCIntersectFunctionN intersect);
+
+/* Sets the occlusion callback function of a user geometry. */
+RTC_API void rtcSetGeometryOccludedFunction(RTCGeometry geometry, RTCOccludedFunctionN occluded);
+
+/* Invokes the intersection filter from the intersection callback function. */
+RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs);
+
+/* Invokes the occlusion filter from the occlusion callback function. */
+RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* args, const struct RTCFilterFunctionNArguments* filterArgs);
+
+
+/* Sets the instanced scene of an instance geometry. */
+RTC_API void rtcSetGeometryInstancedScene(RTCGeometry geometry, RTCScene scene);
+
+/* Sets the transformation of an instance for the specified time step. */
+RTC_API void rtcSetGeometryTransform(RTCGeometry geometry, unsigned int timeStep, enum RTCFormat format, const void* xfm);
+
+/* Sets the transformation quaternion of an instance for the specified time step. */
+RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry geometry, unsigned int timeStep, const struct RTCQuaternionDecomposition* qd);
+
+/* Returns the interpolated transformation of an instance for the specified time. */
+RTC_API void rtcGetGeometryTransform(RTCGeometry geometry, float time, enum RTCFormat format, void* xfm);
+
+
+/* Sets the uniform tessellation rate of the geometry. */
+RTC_API void rtcSetGeometryTessellationRate(RTCGeometry geometry, float tessellationRate);
+
+/* Sets the number of topologies of a subdivision surface. */
+RTC_API void rtcSetGeometryTopologyCount(RTCGeometry geometry, unsigned int topologyCount);
+
+/* Sets the subdivision interpolation mode. */
+RTC_API void rtcSetGeometrySubdivisionMode(RTCGeometry geometry, unsigned int topologyID, enum RTCSubdivisionMode mode);
+
+/* Binds a vertex attribute to a topology of the geometry. */
+RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry geometry, unsigned int vertexAttributeID, unsigned int topologyID);
+
+/* Sets the displacement callback function of a subdivision surface. */
+RTC_API void rtcSetGeometryDisplacementFunction(RTCGeometry geometry, RTCDisplacementFunctionN displacement);
+
+/* Returns the first half edge of a face. */
+RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry geometry, unsigned int faceID);
+
+/* Returns the face the half edge belongs to. */
+RTC_API unsigned int rtcGetGeometryFace(RTCGeometry geometry, unsigned int edgeID);
+
+/* Returns next half edge. */
+RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry geometry, unsigned int edgeID);
+
+/* Returns previous half edge. */
+RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry geometry, unsigned int edgeID);
+
+/* Returns opposite half edge. */
+RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry geometry, unsigned int topologyID, unsigned int edgeID);
+
+
+/* Arguments for rtcInterpolate */
+struct RTCInterpolateArguments
+{
+  RTCGeometry geometry;          // geometry whose vertex data is interpolated
+  unsigned int primID;           // primitive ID
+  float u;                       // u coordinate of the location
+  float v;                       // v coordinate of the location
+  enum RTCBufferType bufferType; // type of the buffer to interpolate
+  unsigned int bufferSlot;       // slot of the buffer to interpolate
+  float* P;                      // output: interpolated data
+  float* dPdu;                   // output: first order derivative in u (rtcInterpolate0 passes NULL)
+  float* dPdv;                   // output: first order derivative in v (rtcInterpolate0 passes NULL)
+  float* ddPdudu;                // output: second order derivative (rtcInterpolate0/1 pass NULL)
+  float* ddPdvdv;                // output: second order derivative (rtcInterpolate0/1 pass NULL)
+  float* ddPdudv;                // output: second order derivative (rtcInterpolate0/1 pass NULL)
+  unsigned int valueCount;       // NOTE(review): presumably the number of floats per output array -- confirm
+};
+
+/* Interpolates vertex data to some u/v location and optionally calculates all derivatives. */
+RTC_API void rtcInterpolate(const struct RTCInterpolateArguments* args);
+
+/* Interpolates vertex data to some u/v location. */
+RTC_FORCEINLINE void rtcInterpolate0(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot, float* P, unsigned int valueCount)
+{
+  struct RTCInterpolateArguments args; // not zero-initialized; every field is assigned below
+  args.geometry = geometry;
+  args.primID = primID;
+  args.u = u;
+  args.v = v;
+  args.bufferType = bufferType;
+  args.bufferSlot = bufferSlot;
+  args.P = P;
+  args.dPdu = NULL; // derivative outputs are left NULL: this helper only interpolates P
+  args.dPdv = NULL;
+  args.ddPdudu = NULL;
+  args.ddPdvdv = NULL;
+  args.ddPdudv = NULL;
+  args.valueCount = valueCount;
+  rtcInterpolate(&args); // forward to the general entry point
+}
+
+/* Interpolates vertex data to some u/v location and calculates first order derivatives. */
+RTC_FORCEINLINE void rtcInterpolate1(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot,
+                                     float* P, float* dPdu, float* dPdv, unsigned int valueCount)
+{
+  struct RTCInterpolateArguments args; // not zero-initialized; every field is assigned below
+  args.geometry = geometry;
+  args.primID = primID;
+  args.u = u;
+  args.v = v;
+  args.bufferType = bufferType;
+  args.bufferSlot = bufferSlot;
+  args.P = P;
+  args.dPdu = dPdu; // first order derivative outputs come from the caller
+  args.dPdv = dPdv;
+  args.ddPdudu = NULL; // second order derivative outputs are left NULL
+  args.ddPdvdv = NULL;
+  args.ddPdudv = NULL;
+  args.valueCount = valueCount;
+  rtcInterpolate(&args); // forward to the general entry point
+}
+
+/* Interpolates vertex data to some u/v location and calculates first and second order derivatives. */
+RTC_FORCEINLINE void rtcInterpolate2(RTCGeometry geometry, unsigned int primID, float u, float v, enum RTCBufferType bufferType, unsigned int bufferSlot,
+                                     float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, unsigned int valueCount)
+{
+  struct RTCInterpolateArguments args; // not zero-initialized; every field is assigned below
+  args.geometry = geometry;
+  args.primID = primID;
+  args.u = u;
+  args.v = v;
+  args.bufferType = bufferType;
+  args.bufferSlot = bufferSlot;
+  args.P = P;
+  args.dPdu = dPdu; // first order derivative outputs come from the caller
+  args.dPdv = dPdv;
+  args.ddPdudu = ddPdudu; // second order derivative outputs come from the caller
+  args.ddPdvdv = ddPdvdv;
+  args.ddPdudv = ddPdudv;
+  args.valueCount = valueCount;
+  rtcInterpolate(&args); // forward to the general entry point
+}
+
+/* Arguments for rtcInterpolateN */
+struct RTCInterpolateNArguments
+{
+  RTCGeometry geometry;          // geometry whose vertex data is interpolated
+  const void* valid;             // NOTE(review): presumably an optional per-location validity mask -- confirm
+  const unsigned int* primIDs;   // array of primitive IDs
+  const float* u;                // array of u coordinates
+  const float* v;                // array of v coordinates
+  unsigned int N;                // NOTE(review): presumably the number of u/v locations -- confirm
+  enum RTCBufferType bufferType; // type of the buffer to interpolate
+  unsigned int bufferSlot;       // slot of the buffer to interpolate
+  float* P;                      // output: interpolated data
+  float* dPdu;                   // output: first order derivative in u
+  float* dPdv;                   // output: first order derivative in v
+  float* ddPdudu;                // output: second order derivative
+  float* ddPdvdv;                // output: second order derivative
+  float* ddPdudv;                // output: second order derivative
+  unsigned int valueCount;       // NOTE(review): presumably the number of floats per location -- confirm
+};
+
+/* Interpolates vertex data to an array of u/v locations. */
+RTC_API void rtcInterpolateN(const struct RTCInterpolateNArguments* args);
+
+/* RTCGrid primitive for grid mesh */
+struct RTCGrid
+{
+  unsigned int startVertexID;
+  unsigned int stride;
+  unsigned short width,height; // max is a 32k x 32k grid
+};
+
+RTC_NAMESPACE_END
+
+
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_quaternion.h b/thirdparty/embree-aarch64/include/embree3/rtcore_quaternion.h
new file mode 100644
index 0000000000..449cdedfdc
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_quaternion.h
@@ -0,0 +1,101 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_common.h"
+
+RTC_NAMESPACE_BEGIN
+
+/*
+ * Structure for transformation representation as a matrix decomposition using
+ * a quaternion
+ */
+struct RTC_ALIGN(16) RTCQuaternionDecomposition
+{
+  float scale_x;
+  float scale_y;
+  float scale_z;
+  float skew_xy;
+  float skew_xz;
+  float skew_yz;
+  float shift_x;
+  float shift_y;
+  float shift_z;
+  float quaternion_r;
+  float quaternion_i;
+  float quaternion_j;
+  float quaternion_k;
+  float translation_x;
+  float translation_y;
+  float translation_z;
+};
+
+RTC_FORCEINLINE void rtcInitQuaternionDecomposition(struct RTCQuaternionDecomposition* qdecomp)
+{
+  qdecomp->scale_x = 1.f; // scale of 1 on every axis
+  qdecomp->scale_y = 1.f;
+  qdecomp->scale_z = 1.f;
+  qdecomp->skew_xy = 0.f; // no skew
+  qdecomp->skew_xz = 0.f;
+  qdecomp->skew_yz = 0.f;
+  qdecomp->shift_x = 0.f; // no shift
+  qdecomp->shift_y = 0.f;
+  qdecomp->shift_z = 0.f;
+  qdecomp->quaternion_r = 1.f; // quaternion (r, i, j, k) = (1, 0, 0, 0)
+  qdecomp->quaternion_i = 0.f;
+  qdecomp->quaternion_j = 0.f;
+  qdecomp->quaternion_k = 0.f;
+  qdecomp->translation_x = 0.f; // no translation
+  qdecomp->translation_y = 0.f;
+  qdecomp->translation_z = 0.f;
+}
+
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetQuaternion(
+  struct RTCQuaternionDecomposition* qdecomp,
+  float r, float i, float j, float k)
+{
+  qdecomp->quaternion_r = r;
+  qdecomp->quaternion_i = i;
+  qdecomp->quaternion_j = j;
+  qdecomp->quaternion_k = k;
+}
+
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetScale(
+  struct RTCQuaternionDecomposition* qdecomp,
+  float scale_x, float scale_y, float scale_z)
+{
+  qdecomp->scale_x = scale_x;
+  qdecomp->scale_y = scale_y;
+  qdecomp->scale_z = scale_z;
+}
+
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetSkew(
+  struct RTCQuaternionDecomposition* qdecomp,
+  float skew_xy, float skew_xz, float skew_yz)
+{
+  qdecomp->skew_xy = skew_xy;
+  qdecomp->skew_xz = skew_xz;
+  qdecomp->skew_yz = skew_yz;
+}
+
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetShift(
+  struct RTCQuaternionDecomposition* qdecomp,
+  float shift_x, float shift_y, float shift_z)
+{
+  qdecomp->shift_x = shift_x;
+  qdecomp->shift_y = shift_y;
+  qdecomp->shift_z = shift_z;
+}
+
+RTC_FORCEINLINE void rtcQuaternionDecompositionSetTranslation(
+  struct RTCQuaternionDecomposition* qdecomp,
+  float translation_x, float translation_y, float translation_z)
+{
+  qdecomp->translation_x = translation_x;
+  qdecomp->translation_y = translation_y;
+  qdecomp->translation_z = translation_z;
+}
+
+RTC_NAMESPACE_END
+
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_ray.h b/thirdparty/embree-aarch64/include/embree3/rtcore_ray.h
new file mode 100644
index 0000000000..1ae3309ef1
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_ray.h
@@ -0,0 +1,378 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_common.h"
+
+RTC_NAMESPACE_BEGIN
+
+/* Ray structure for a single ray */
+struct RTC_ALIGN(16) RTCRay
+{
+  float org_x;        // x coordinate of ray origin
+  float org_y;        // y coordinate of ray origin
+  float org_z;        // z coordinate of ray origin
+  float tnear;        // start of ray segment
+
+  float dir_x;        // x coordinate of ray direction
+  float dir_y;        // y coordinate of ray direction
+  float dir_z;        // z coordinate of ray direction
+  float time;         // time of this ray for motion blur
+
+  float tfar;         // end of ray segment (set to hit distance)
+  unsigned int mask;  // ray mask
+  unsigned int id;    // ray ID
+  unsigned int flags; // ray flags
+};
+
+/* Hit structure for a single ray */
+struct RTC_ALIGN(16) RTCHit
+{
+  float Ng_x;          // x coordinate of geometry normal
+  float Ng_y;          // y coordinate of geometry normal
+  float Ng_z;          // z coordinate of geometry normal
+
+  float u;             // barycentric u coordinate of hit
+  float v;             // barycentric v coordinate of hit
+
+  unsigned int primID; // primitive ID
+  unsigned int geomID; // geometry ID
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+};
+
+/* Combined ray/hit structure for a single ray */
+struct RTCRayHit
+{
+  struct RTCRay ray;
+  struct RTCHit hit;
+};
+
+/* Ray structure for a packet of 4 rays */
+struct RTC_ALIGN(16) RTCRay4
+{
+  float org_x[4];
+  float org_y[4];
+  float org_z[4];
+  float tnear[4];
+
+  float dir_x[4];
+  float dir_y[4];
+  float dir_z[4];
+  float time[4];
+
+  float tfar[4];
+  unsigned int mask[4];
+  unsigned int id[4];
+  unsigned int flags[4];
+};
+
+/* Hit structure for a packet of 4 rays */
+struct RTC_ALIGN(16) RTCHit4
+{
+  float Ng_x[4];
+  float Ng_y[4];
+  float Ng_z[4];
+
+  float u[4];
+  float v[4];
+
+  unsigned int primID[4];
+  unsigned int geomID[4];
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][4];
+};
+
+/* Combined ray/hit structure for a packet of 4 rays */
+struct RTCRayHit4
+{
+  struct RTCRay4 ray;
+  struct RTCHit4 hit;
+};
+
+/* Ray structure for a packet of 8 rays */
+struct RTC_ALIGN(32) RTCRay8
+{
+  float org_x[8];
+  float org_y[8];
+  float org_z[8];
+  float tnear[8];
+
+  float dir_x[8];
+  float dir_y[8];
+  float dir_z[8];
+  float time[8];
+
+  float tfar[8];
+  unsigned int mask[8];
+  unsigned int id[8];
+  unsigned int flags[8];
+};
+
+/* Hit structure for a packet of 8 rays */
+struct RTC_ALIGN(32) RTCHit8
+{
+  float Ng_x[8];
+  float Ng_y[8];
+  float Ng_z[8];
+
+  float u[8];
+  float v[8];
+
+  unsigned int primID[8];
+  unsigned int geomID[8];
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][8];
+};
+
+/* Combined ray/hit structure for a packet of 8 rays */
+struct RTCRayHit8
+{
+  struct RTCRay8 ray;
+  struct RTCHit8 hit;
+};
+
+/* Ray structure for a packet of 16 rays */
+struct RTC_ALIGN(64) RTCRay16
+{
+  float org_x[16];
+  float org_y[16];
+  float org_z[16];
+  float tnear[16];
+
+  float dir_x[16];
+  float dir_y[16];
+  float dir_z[16];
+  float time[16];
+
+  float tfar[16];
+  unsigned int mask[16];
+  unsigned int id[16];
+  unsigned int flags[16];
+};
+
+/* Hit structure for a packet of 16 rays */
+struct RTC_ALIGN(64) RTCHit16
+{
+  float Ng_x[16];
+  float Ng_y[16];
+  float Ng_z[16];
+
+  float u[16];
+  float v[16];
+
+  unsigned int primID[16];
+  unsigned int geomID[16];
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][16];
+};
+
+/* Combined ray/hit structure for a packet of 16 rays */
+struct RTCRayHit16
+{
+  struct RTCRay16 ray;
+  struct RTCHit16 hit;
+};
+
+/* Ray structure for a packet/stream of N rays in pointer SOA layout */
+struct RTCRayNp
+{
+  float* org_x;
+  float* org_y;
+  float* org_z;
+  float* tnear;
+
+  float* dir_x;
+  float* dir_y;
+  float* dir_z;
+  float* time;
+
+  float* tfar;
+  unsigned int* mask;
+  unsigned int* id;
+  unsigned int* flags;
+};
+
+/* Hit structure for a packet/stream of N rays in pointer SOA layout */
+struct RTCHitNp
+{
+  float* Ng_x;
+  float* Ng_y;
+  float* Ng_z;
+
+  float* u;
+  float* v;
+
+  unsigned int* primID;
+  unsigned int* geomID;
+  unsigned int* instID[RTC_MAX_INSTANCE_LEVEL_COUNT];
+};
+
+/* Combined ray/hit structure for a packet/stream of N rays in pointer SOA layout */
+struct RTCRayHitNp
+{
+  struct RTCRayNp ray;
+  struct RTCHitNp hit;
+};
+
+struct RTCRayN;
+struct RTCHitN;
+struct RTCRayHitN;
+
+#if defined(__cplusplus)
+
+/* Helper functions to access ray packets of runtime size N */
+RTC_FORCEINLINE float& RTCRayN_org_x(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[0*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_org_y(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[1*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_org_z(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[2*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_tnear(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[3*N+i]; }
+
+RTC_FORCEINLINE float& RTCRayN_dir_x(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[4*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_dir_y(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[5*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_dir_z(RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[6*N+i]; }
+RTC_FORCEINLINE float& RTCRayN_time (RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[7*N+i]; }
+
+RTC_FORCEINLINE float&        RTCRayN_tfar (RTCRayN* ray, unsigned int N, unsigned int i) { return ((float*)ray)[8*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCRayN_mask (RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[9*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCRayN_id   (RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[10*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCRayN_flags(RTCRayN* ray, unsigned int N, unsigned int i) { return ((unsigned*)ray)[11*N+i]; }
+
+/* Helper functions to access hit packets of runtime size N */
+RTC_FORCEINLINE float& RTCHitN_Ng_x(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[0*N+i]; }
+RTC_FORCEINLINE float& RTCHitN_Ng_y(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[1*N+i]; }
+RTC_FORCEINLINE float& RTCHitN_Ng_z(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[2*N+i]; }
+
+RTC_FORCEINLINE float& RTCHitN_u(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[3*N+i]; }
+RTC_FORCEINLINE float& RTCHitN_v(RTCHitN* hit, unsigned int N, unsigned int i) { return ((float*)hit)[4*N+i]; }
+
+RTC_FORCEINLINE unsigned int& RTCHitN_primID(RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[5*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCHitN_geomID(RTCHitN* hit, unsigned int N, unsigned int i) { return ((unsigned*)hit)[6*N+i]; }
+RTC_FORCEINLINE unsigned int& RTCHitN_instID(RTCHitN* hit, unsigned int N, unsigned int i, unsigned int l) { return ((unsigned*)hit)[7*N+i+N*l]; }
+
+/* Helper functions to extract RTCRayN and RTCHitN from RTCRayHitN */
+RTC_FORCEINLINE RTCRayN* RTCRayHitN_RayN(RTCRayHitN* rayhit, unsigned int N) { return (RTCRayN*)&((float*)rayhit)[0*N]; }
+RTC_FORCEINLINE RTCHitN* RTCRayHitN_HitN(RTCRayHitN* rayhit, unsigned int N) { return (RTCHitN*)&((float*)rayhit)[12*N]; } // 12*N: skips the 12 ray fields (indices 0..11 in the RTCRayN_* accessors), N entries each
+
+/* Helper structure for a ray packet of compile-time size N */
+template<int N>
+struct RTCRayNt
+{
+  float org_x[N];
+  float org_y[N];
+  float org_z[N];
+  float tnear[N];
+
+  float dir_x[N];
+  float dir_y[N];
+  float dir_z[N];
+  float time[N];
+
+  float tfar[N];
+  unsigned int mask[N];
+  unsigned int id[N];
+  unsigned int flags[N];
+};
+
+/* Helper structure for a hit packet of compile-time size N */
+template<int N>
+struct RTCHitNt
+{
+  float Ng_x[N];
+  float Ng_y[N];
+  float Ng_z[N];
+
+  float u[N];
+  float v[N];
+
+  unsigned int primID[N];
+  unsigned int geomID[N];
+  unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT][N];
+};
+
+/* Helper structure for a combined ray/hit packet of compile-time size N */
+template<int N>
+struct RTCRayHitNt
+{
+  RTCRayNt<N> ray;
+  RTCHitNt<N> hit;
+};
+
+RTC_FORCEINLINE RTCRay rtcGetRayFromRayN(RTCRayN* rayN, unsigned int N, unsigned int i)
+{
+  RTCRay ray; // by-value copy of entry i of the size-N packet; every field is assigned below
+  ray.org_x = RTCRayN_org_x(rayN,N,i);
+  ray.org_y = RTCRayN_org_y(rayN,N,i);
+  ray.org_z = RTCRayN_org_z(rayN,N,i);
+  ray.tnear = RTCRayN_tnear(rayN,N,i);
+  ray.dir_x = RTCRayN_dir_x(rayN,N,i);
+  ray.dir_y = RTCRayN_dir_y(rayN,N,i);
+  ray.dir_z = RTCRayN_dir_z(rayN,N,i);
+  ray.time  = RTCRayN_time(rayN,N,i);
+  ray.tfar  = RTCRayN_tfar(rayN,N,i);
+  ray.mask  = RTCRayN_mask(rayN,N,i);
+  ray.id    = RTCRayN_id(rayN,N,i);
+  ray.flags = RTCRayN_flags(rayN,N,i);
+  return ray;
+}
+
+RTC_FORCEINLINE RTCHit rtcGetHitFromHitN(RTCHitN* hitN, unsigned int N, unsigned int i)
+{
+  RTCHit hit; // by-value copy of entry i of the size-N packet; every field is assigned below
+  hit.Ng_x   = RTCHitN_Ng_x(hitN,N,i);
+  hit.Ng_y   = RTCHitN_Ng_y(hitN,N,i);
+  hit.Ng_z   = RTCHitN_Ng_z(hitN,N,i);
+  hit.u      = RTCHitN_u(hitN,N,i);
+  hit.v      = RTCHitN_v(hitN,N,i);
+  hit.primID = RTCHitN_primID(hitN,N,i);
+  hit.geomID = RTCHitN_geomID(hitN,N,i);
+  for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) // one instance ID per instance level
+    hit.instID[l] = RTCHitN_instID(hitN,N,i,l);
+  return hit;
+}
+
+RTC_FORCEINLINE void rtcCopyHitToHitN(RTCHitN* hitN, const RTCHit* hit, unsigned int N, unsigned int i)
+{
+  RTCHitN_Ng_x(hitN,N,i)   = hit->Ng_x; // the accessors return references, so each line writes into entry i of hitN
+  RTCHitN_Ng_y(hitN,N,i)   = hit->Ng_y;
+  RTCHitN_Ng_z(hitN,N,i)   = hit->Ng_z;
+  RTCHitN_u(hitN,N,i)      = hit->u;
+  RTCHitN_v(hitN,N,i)      = hit->v;
+  RTCHitN_primID(hitN,N,i) = hit->primID;
+  RTCHitN_geomID(hitN,N,i) = hit->geomID;
+  for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) // one instance ID per instance level
+    RTCHitN_instID(hitN,N,i,l) = hit->instID[l];
+}
+
+RTC_FORCEINLINE RTCRayHit rtcGetRayHitFromRayHitN(RTCRayHitN* rayhitN, unsigned int N, unsigned int i)
+{
+  RTCRayHit rh; // by-value copy of entry i of the combined size-N packet
+
+  RTCRayN* ray = RTCRayHitN_RayN(rayhitN,N); // ray part: starts at the beginning of the packet
+  rh.ray.org_x = RTCRayN_org_x(ray,N,i);
+  rh.ray.org_y = RTCRayN_org_y(ray,N,i);
+  rh.ray.org_z = RTCRayN_org_z(ray,N,i);
+  rh.ray.tnear = RTCRayN_tnear(ray,N,i);
+  rh.ray.dir_x = RTCRayN_dir_x(ray,N,i);
+  rh.ray.dir_y = RTCRayN_dir_y(ray,N,i);
+  rh.ray.dir_z = RTCRayN_dir_z(ray,N,i);
+  rh.ray.time  = RTCRayN_time(ray,N,i);
+  rh.ray.tfar  = RTCRayN_tfar(ray,N,i);
+  rh.ray.mask  = RTCRayN_mask(ray,N,i);
+  rh.ray.id    = RTCRayN_id(ray,N,i);
+  rh.ray.flags = RTCRayN_flags(ray,N,i);
+
+  RTCHitN* hit  = RTCRayHitN_HitN(rayhitN,N); // hit part: starts 12*N elements into the packet
+  rh.hit.Ng_x   = RTCHitN_Ng_x(hit,N,i);
+  rh.hit.Ng_y   = RTCHitN_Ng_y(hit,N,i);
+  rh.hit.Ng_z   = RTCHitN_Ng_z(hit,N,i);
+  rh.hit.u      = RTCHitN_u(hit,N,i);
+  rh.hit.v      = RTCHitN_v(hit,N,i);
+  rh.hit.primID = RTCHitN_primID(hit,N,i);
+  rh.hit.geomID = RTCHitN_geomID(hit,N,i);
+  for (unsigned int l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; l++) // one instance ID per instance level
+    rh.hit.instID[l] = RTCHitN_instID(hit,N,i,l);
+
+  return rh;
+}
+
+#endif
+
+RTC_NAMESPACE_END
+
diff --git a/thirdparty/embree-aarch64/include/embree3/rtcore_scene.h b/thirdparty/embree-aarch64/include/embree3/rtcore_scene.h
new file mode 100644
index 0000000000..0cd6401593
--- /dev/null
+++ b/thirdparty/embree-aarch64/include/embree3/rtcore_scene.h
@@ -0,0 +1,160 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore_device.h"
+
+RTC_NAMESPACE_BEGIN
+  
+/* Forward declarations for ray structures */
+struct RTCRayHit;
+struct RTCRayHit4;
+struct RTCRayHit8;
+struct RTCRayHit16;
+struct RTCRayHitNp;
+
+/* Scene flags */
+enum RTCSceneFlags
+{
+  RTC_SCENE_FLAG_NONE                    = 0,
+  RTC_SCENE_FLAG_DYNAMIC                 = (1 << 0),
+  RTC_SCENE_FLAG_COMPACT                 = (1 << 1),
+  RTC_SCENE_FLAG_ROBUST                  = (1 << 2),
+  RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION = (1 << 3)
+};
+
+/* Creates a new scene. */
+RTC_API RTCScene rtcNewScene(RTCDevice device);
+
+/* Returns the device the scene got created in. The reference count of
+ * the device is incremented by this function. */
+RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene);
+   
+/* Retains the scene (increments the reference count). */
+RTC_API void rtcRetainScene(RTCScene scene);
+
+/* Releases the scene (decrements the reference count). */
+RTC_API void rtcReleaseScene(RTCScene scene);
+
+
+/* Attaches the geometry to a scene. */
+RTC_API unsigned int rtcAttachGeometry(RTCScene scene, RTCGeometry geometry);
+
+/* Attaches the geometry to a scene using the specified geometry ID. */
+RTC_API void rtcAttachGeometryByID(RTCScene scene, RTCGeometry geometry, unsigned int geomID);
+
+/* Detaches the geometry from the scene. */
+RTC_API void rtcDetachGeometry(RTCScene scene, unsigned int geomID);
+
+/* Gets a geometry handle from the scene. */
+RTC_API RTCGeometry rtcGetGeometry(RTCScene scene, unsigned int geomID);
+
+
+/* Commits the scene. */
+RTC_API void rtcCommitScene(RTCScene scene);
+
+/* Commits the scene from multiple threads. */
+RTC_API void rtcJoinCommitScene(RTCScene scene);
+
+
+/* Progress monitor callback function */
+typedef bool (*RTCProgressMonitorFunction)(void* ptr, double n);
+
+/* Sets the progress monitor callback function of the scene. */
+RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene scene, RTCProgressMonitorFunction progress, void* ptr);
+
+/* Sets the build quality of the scene. */
+RTC_API void rtcSetSceneBuildQuality(RTCScene scene, enum RTCBuildQuality quality);
+
+/* Sets the scene flags. */
+RTC_API void rtcSetSceneFlags(RTCScene scene, enum RTCSceneFlags flags);
+
+/* Returns the scene flags. */
+RTC_API enum RTCSceneFlags rtcGetSceneFlags(RTCScene scene);
+
+/* Returns the axis-aligned bounds of the scene. */
+RTC_API void rtcGetSceneBounds(RTCScene scene, struct RTCBounds* bounds_o);
+
+/* Returns the linear axis-aligned bounds of the scene. */
+RTC_API void rtcGetSceneLinearBounds(RTCScene scene, struct RTCLinearBounds* bounds_o);
+
+
+/* Perform a closest point query of the scene. */
+RTC_API bool rtcPointQuery(RTCScene scene, struct RTCPointQuery* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void* userPtr);
+
+/* Perform a closest point query with a packet of 4 points with the scene. */
+RTC_API bool rtcPointQuery4(const int* valid, RTCScene scene, struct RTCPointQuery4* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Perform a closest point query with a packet of 8 points with the scene. */
+RTC_API bool rtcPointQuery8(const int* valid, RTCScene scene, struct RTCPointQuery8* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Perform a closest point query with a packet of 16 points with the scene. */
+RTC_API bool rtcPointQuery16(const int* valid, RTCScene scene, struct RTCPointQuery16* query, struct RTCPointQueryContext* context, RTCPointQueryFunction queryFunc, void** userPtr);
+
+/* Intersects a single ray with the scene. */
+RTC_API void rtcIntersect1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit);
+
+/* Intersects a packet of 4 rays with the scene. */
+RTC_API void rtcIntersect4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit4* rayhit);
+
+/* Intersects a packet of 8 rays with the scene. */
+RTC_API void rtcIntersect8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit8* rayhit);
+
+/* Intersects a packet of 16 rays with the scene. */
+RTC_API void rtcIntersect16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit16* rayhit);
+
+/* Intersects a stream of M rays with the scene. */
+RTC_API void rtcIntersect1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit* rayhit, unsigned int M, size_t byteStride);
+
+/* Intersects a stream of pointers to M rays with the scene. */
+RTC_API void rtcIntersect1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHit** rayhit, unsigned int M);
+
+/* Intersects a stream of M ray packets of size N in SOA format with the scene. */
+RTC_API void rtcIntersectNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride);
+
+/* Intersects a stream of N rays in pointer SOA layout with the scene. */
+RTC_API void rtcIntersectNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayHitNp* rayhit, unsigned int N);
+
+/* Tests a single ray for occlusion with the scene. */
+RTC_API void rtcOccluded1(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray);
+
+/* Tests a packet of 4 rays for occlusion with the scene. */
+RTC_API void rtcOccluded4(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay4* ray);
+
+/* Tests a packet of 8 rays for occlusion with the scene. */
+RTC_API void rtcOccluded8(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay8* ray);
+
+/* Tests a packet of 16 rays for occlusion with the scene. */
+RTC_API void rtcOccluded16(const int* valid, RTCScene scene, struct RTCIntersectContext* context, struct RTCRay16* ray);
+
+/* Tests a stream of M rays for occlusion with the scene. */
+RTC_API void rtcOccluded1M(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay* ray, unsigned int M, size_t byteStride);
+
+/* Tests a stream of pointers to M rays for occlusion with the scene. */
+RTC_API void rtcOccluded1Mp(RTCScene scene, struct RTCIntersectContext* context, struct RTCRay** ray, unsigned int M);
+
+/* Tests a stream of M ray packets of size N in SOA format for occlusion with the scene. */
+RTC_API void rtcOccludedNM(RTCScene scene, struct RTCIntersectContext* context, struct RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride);
+
+/* Tests a stream of N rays in pointer SOA layout for occlusion with the scene. */
+RTC_API void rtcOccludedNp(RTCScene scene, struct RTCIntersectContext* context, const struct RTCRayNp* ray, unsigned int N);
+
+/*! collision callback */
+struct RTCCollision { unsigned int geomID0; unsigned int primID0; unsigned int geomID1; unsigned int primID1; };
+typedef void (*RTCCollideFunc) (void* userPtr, struct RTCCollision* collisions, unsigned int num_collisions);
+
+/*! Performs collision detection of two scenes */
+RTC_API void rtcCollide (RTCScene scene0, RTCScene scene1, RTCCollideFunc callback, void* userPtr);
+ 
+#if defined(__cplusplus)
+
+/* Helper for easily combining scene flags */
+inline RTCSceneFlags operator|(RTCSceneFlags a, RTCSceneFlags b) { // C++ only: lets callers write FLAG_A | FLAG_B without a manual cast
+  return (RTCSceneFlags)((size_t)a | (size_t)b); // OR the integer values, then cast back to the enum type
+}
+
+#endif
+
+RTC_NAMESPACE_END
+
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h
new file mode 100644
index 0000000000..755ce255fb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_hair.h
@@ -0,0 +1,411 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../geometry/primitive.h"
+#include "../builders/bvh_builder_sah.h"
+#include "../builders/heuristic_binning_array_aligned.h"
+#include "../builders/heuristic_binning_array_unaligned.h"
+#include "../builders/heuristic_strand_array.h"
+
+#define NUM_HAIR_OBJECT_BINS 32
+
+namespace embree
+{
+  namespace isa
+  {
+    struct BVHBuilderHair
+    {
+      /*! settings for the hair BVH builder; read by BuilderT through its cfg member */
+      struct Settings
+      {
+        /*! default settings: branching factor 2, depth limit 32, block size 1 (log2 = 0), leaves of 1 to 7 primitives, threshold initialized from inf */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7), finished_range_threshold(inf) {}
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        size_t finished_range_threshold;  //!< finished range threshold: recurse() marks a child for reporting when the parent range is larger than this and the child range is not
+      };
+
+      template<typename NodeRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeFunc,
+        typename SetAABBNodeFunc,
+        typename CreateOBBNodeFunc,
+        typename SetOBBNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor,
+        typename ReportFinishedRangeFunc>
+
+        class BuilderT
+        {
+          ALIGNED_CLASS_(16);
+          friend struct BVHBuilderHair;
+
+          typedef FastAllocator::CachedAllocator Allocator;
+          typedef HeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> HeuristicBinningSAH;
+          typedef UnalignedHeuristicArrayBinningSAH<PrimRef,NUM_HAIR_OBJECT_BINS> UnalignedHeuristicBinningSAH;
+          typedef HeuristicStrandSplit HeuristicStrandSplitSAH;
+
+          static const size_t MAX_BRANCHING_FACTOR =  8;         //!< maximum supported BVH branching factor
+          static const size_t MIN_LARGE_LEAF_LEVELS = 8;         //!< create balanced tree if we are that many levels before the maximum tree depth
+          static const size_t SINGLE_THREADED_THRESHOLD = 4096;  //!< threshold to switch to single threaded build
+
+          static const size_t travCostAligned = 1;
+          static const size_t travCostUnaligned = 5;
+          static const size_t intCost = 6;
+
+          BuilderT (Scene* scene,  // constructor: copies the settings, binds all callbacks and sets up the three split heuristics on prims
+                    PrimRef* prims,
+                    const CreateAllocFunc& createAlloc,  // all callbacks are kept as const references (see the member declarations), so they must outlive the builder
+                    const CreateAABBNodeFunc& createAABBNode,
+                    const SetAABBNodeFunc& setAABBNode,
+                    const CreateOBBNodeFunc& createOBBNode,
+                    const SetOBBNodeFunc& setOBBNode,
+                    const CreateLeafFunc& createLeaf,
+                    const ProgressMonitor& progressMonitor,
+                    const ReportFinishedRangeFunc& reportFinishedRange,
+                    const Settings settings)  // taken by value and copied into cfg
+
+            : cfg(settings),
+            prims(prims),
+            createAlloc(createAlloc),
+            createAABBNode(createAABBNode),
+            setAABBNode(setAABBNode),
+            createOBBNode(createOBBNode),
+            setOBBNode(setOBBNode),
+            createLeaf(createLeaf),
+            progressMonitor(progressMonitor),
+            reportFinishedRange(reportFinishedRange),
+            alignedHeuristic(prims), unalignedHeuristic(scene,prims), strandHeuristic(scene,prims) {}  // only the unaligned and strand heuristics receive the scene
+
+          /*! returns true when every primitive of the range has the same geomID (an empty range counts as uniform) */
+          __forceinline bool sameGeometry(const PrimInfoRange& range)
+          {
+            if (range.size() == 0) return true;
+            const unsigned int refGeomID = prims[range.begin()].geomID();
+            size_t idx = range.begin()+1;
+            while (idx < range.end() && prims[idx].geomID() == refGeomID)
+              idx++;
+
+            /* the scan only stops before the end of the range on a differing geomID */
+            return idx >= range.end();
+          }
+
+          /*! creates a leaf for a range that may be too large for a single BVH leaf (or may mix geometries) by recursively building AABB nodes over it */
+          NodeRef createLargeLeaf(size_t depth, const PrimInfoRange& pinfo, Allocator alloc)
+          {
+            /* this should never occur but is a fatal error */
+            if (depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* create a real leaf if the range is small enough and all primitives share one geometry */
+            if (pinfo.size() <= cfg.maxLeafSize && sameGeometry(pinfo))
+              return createLeaf(prims,pinfo,alloc);
+
+            /* fill all children by always splitting the largest one */
+            PrimInfoRange children[MAX_BRANCHING_FACTOR];
+            unsigned numChildren = 1;
+            children[0] = pinfo;
+
+            do {
+
+              /* find the splittable child with the largest number of primitives */
+              int bestChild = -1;
+              size_t bestSize = 0;
+              for (unsigned i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i]))
+                  continue;
+
+                /* remember child with largest size */
+                if (children[i].size() > bestSize) {
+                  bestSize = children[i].size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break; // every child already qualifies as a leaf
+
+              /*! split best child into left and right child: by geometry if it mixes geometries, otherwise with the fallback split */
+              __aligned(64) PrimInfoRange left, right;
+              if (!sameGeometry(children[bestChild])) {
+                alignedHeuristic.splitByGeometry(children[bestChild],left,right);
+              } else {
+                alignedHeuristic.splitFallback(children[bestChild],left,right);
+              }
+
+              /* add new children left and right: the last child moves into the freed slot, left and right take the last two slots */
+              children[bestChild] = children[numChildren-1];
+              children[numChildren-1] = left;
+              children[numChildren+0] = right;
+              numChildren++;
+
+            } while (numChildren < cfg.branchingFactor);
+
+            /* create node */
+            auto node = createAABBNode(alloc);
+
+            for (size_t i=0; i<numChildren; i++) { // recurse: a child that is still too large gets its own sub-tree
+              const NodeRef child = createLargeLeaf(depth+1,children[i],alloc);
+              setAABBNode(node,i,child,children[i].geomBounds);
+            }
+
+            return node;
+          }
+
+          /*! performs split: rates aligned binning, unaligned binning and strand splitting by SAH and applies the cheapest to pinfo; sets aligned to false when an unaligned or strand split is taken */
+          __noinline void split(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo, bool& aligned) // FIXME: not inlined as ICC otherwise uses much stack
+          {
+            /* variable to track the SAH of the best splitting approach */
+            float bestSAH = inf;
+            const size_t blocks = (pinfo.size()+(1ull<<cfg.logBlockSize)-1ull) >> cfg.logBlockSize; // primitive count rounded up to whole blocks
+            const float leafSAH = intCost*float(blocks)*halfArea(pinfo.geomBounds);
+
+            /* try standard binning in aligned space (skipped if aligned is already false on entry) */
+            float alignedObjectSAH = inf;
+            HeuristicBinningSAH::Split alignedObjectSplit;
+            if (aligned) {
+              alignedObjectSplit = alignedHeuristic.find(pinfo,cfg.logBlockSize);
+              alignedObjectSAH = travCostAligned*halfArea(pinfo.geomBounds) + intCost*alignedObjectSplit.splitSAH();
+              bestSAH = min(alignedObjectSAH,bestSAH);
+            }
+
+            /* try standard binning in unaligned space, unless the best SAH so far is at most 70% of the leaf SAH */
+            UnalignedHeuristicBinningSAH::Split unalignedObjectSplit;
+            LinearSpace3fa uspace;
+            float unalignedObjectSAH = inf;
+            if (bestSAH > 0.7f*leafSAH) {
+              uspace = unalignedHeuristic.computeAlignedSpace(pinfo);
+              const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(pinfo,uspace);
+              unalignedObjectSplit = unalignedHeuristic.find(sinfo,cfg.logBlockSize,uspace);
+              unalignedObjectSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*unalignedObjectSplit.splitSAH();
+              bestSAH = min(unalignedObjectSAH,bestSAH);
+            }
+
+            /* try splitting into two strands; same 70% gate, and only for ranges of at most 256 primitives */
+            HeuristicStrandSplitSAH::Split strandSplit;
+            float strandSAH = inf;
+            if (bestSAH > 0.7f*leafSAH && pinfo.size() <= 256) {
+              strandSplit = strandHeuristic.find(pinfo,cfg.logBlockSize);
+              strandSAH = travCostUnaligned*halfArea(pinfo.geomBounds) + intCost*strandSplit.splitSAH();
+              bestSAH = min(strandSAH,bestSAH);
+            }
+
+            /* fallback if SAH heuristics failed (bestSAH is not finite); aligned is left unchanged */
+            if (unlikely(!std::isfinite(bestSAH)))
+            {
+              alignedHeuristic.deterministic_order(pinfo);
+              alignedHeuristic.splitFallback(pinfo,linfo,rinfo);
+            }
+
+            /* perform aligned split if this is best */
+            else if (bestSAH == alignedObjectSAH) { // exact compare is intended: bestSAH was copied from one of the three SAH values by min()
+              alignedHeuristic.split(alignedObjectSplit,pinfo,linfo,rinfo);
+            }
+
+            /* perform unaligned split if this is best */
+            else if (bestSAH == unalignedObjectSAH) {
+              unalignedHeuristic.split(unalignedObjectSplit,uspace,pinfo,linfo,rinfo);
+              aligned = false;
+            }
+
+            /* perform strand split if this is best */
+            else if (bestSAH == strandSAH) {
+              strandHeuristic.split(strandSplit,pinfo,linfo,rinfo);
+              aligned = false;
+            }
+
+            /* can never happen */
+            else
+              assert(false);
+          }
+
+          /*! recursive build: splits pinfo into up to cfg.branchingFactor children, creates an AABB or OBB node for them and recurses; calls reportFinishedRange(pinfo) at the end when alloc_barrier is set */
+          NodeRef recurse(size_t depth, const PrimInfoRange& pinfo, Allocator alloc, bool toplevel, bool alloc_barrier)
+          {
+            /* get thread local allocator (the parallel paths below pass nullptr to their children) */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* call memory monitor function to signal progress, only for toplevel calls whose range is within the single threaded threshold */
+            if (toplevel && pinfo.size() <= SINGLE_THREADED_THRESHOLD)
+              progressMonitor(pinfo.size());
+
+            PrimInfoRange children[MAX_BRANCHING_FACTOR];
+
+            /* create leaf node when close to the depth limit or when the range is small enough */
+            if (depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || pinfo.size() <= cfg.minLeafSize) {
+              alignedHeuristic.deterministic_order(pinfo);
+              return createLargeLeaf(depth,pinfo,alloc);
+            }
+
+            /* fill all children by always splitting the one with the largest surface area */
+            size_t numChildren = 1;
+            children[0] = pinfo;
+            bool aligned = true; // shared by all split() calls of this node; once cleared it stays false
+
+            do {
+
+              /* find best child with largest bounding box area */
+              ssize_t bestChild = -1;
+              float bestArea = neg_inf;
+              for (size_t i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.minLeafSize)
+                  continue;
+
+                /* remember child with largest area */
+                if (area(children[i].geomBounds) > bestArea) {
+                  bestArea = area(children[i].geomBounds);
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break; // no child can be split any further
+
+              /*! split best child into left and right child */
+              PrimInfoRange left, right;
+              split(children[bestChild],left,right,aligned);
+
+              /* add new children left and right: the last child moves into the freed slot, left and right take the last two slots */
+              children[bestChild] = children[numChildren-1];
+              children[numChildren-1] = left;
+              children[numChildren+0] = right;
+              numChildren++;
+
+            } while (numChildren < cfg.branchingFactor);
+
+            NodeRef node;
+
+            /* create aligned node (no split() call cleared the aligned flag) */
+            if (aligned)
+            {
+              node = createAABBNode(alloc);
+
+              /* spawn tasks or ... */
+              if (pinfo.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold; // true where the range size first drops to the threshold or below
+                      setAABBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),children[i].geomBounds);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequentially */
+              else {
+                for (size_t i=0; i<numChildren; i++) {
+                  const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                  setAABBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),children[i].geomBounds);
+                }
+              }
+            }
+
+            /* create unaligned node; each child gets oriented bounds computed in its own aligned space */
+            else
+            {
+              node = createOBBNode(alloc);
+
+              /* spawn tasks or ... */
+              if (pinfo.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]);
+                      const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space);
+                      const OBBox3fa obounds(space,sinfo.geomBounds);
+                      const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                      setOBBNode(node,i,recurse(depth+1,children[i],nullptr,true,child_alloc_barrier),obounds);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequentially */
+              else
+              {
+                for (size_t i=0; i<numChildren; i++) {
+                  const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpace(children[i]);
+                  const PrimInfoRange sinfo = unalignedHeuristic.computePrimInfo(children[i],space);
+                  const OBBox3fa obounds(space,sinfo.geomBounds);
+                  const bool child_alloc_barrier = pinfo.size() > cfg.finished_range_threshold && children[i].size() <= cfg.finished_range_threshold;
+                  setOBBNode(node,i,recurse(depth+1,children[i],alloc,false,child_alloc_barrier),obounds);
+                }
+              }
+            }
+
+            /* reports a finished range of primrefs (only after all children of this node have been built) */
+            if (unlikely(alloc_barrier))
+              reportFinishedRange(pinfo);
+
+            return node;
+          }
+
+        private:
+          Settings cfg;
+          PrimRef* prims;
+          const CreateAllocFunc& createAlloc;
+          const CreateAABBNodeFunc& createAABBNode;
+          const SetAABBNodeFunc& setAABBNode;
+          const CreateOBBNodeFunc& createOBBNode;
+          const SetOBBNodeFunc& setOBBNode;
+          const CreateLeafFunc& createLeaf;
+          const ProgressMonitor& progressMonitor;
+          const ReportFinishedRangeFunc& reportFinishedRange;
+
+        private:
+          HeuristicBinningSAH alignedHeuristic;
+          UnalignedHeuristicBinningSAH unalignedHeuristic;
+          HeuristicStrandSplitSAH strandHeuristic;
+        };
+
+      template<typename NodeRef,  // template arguments are forwarded unchanged to BuilderT
+        typename CreateAllocFunc,
+        typename CreateAABBNodeFunc,
+        typename SetAABBNodeFunc,
+        typename CreateOBBNodeFunc,
+        typename SetOBBNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor,
+        typename ReportFinishedRangeFunc>
+
+        static NodeRef build (const CreateAllocFunc& createAlloc,  // entry point: constructs a BuilderT from the arguments and returns the root built for pinfo
+                              const CreateAABBNodeFunc& createAABBNode,
+                              const SetAABBNodeFunc& setAABBNode,
+                              const CreateOBBNodeFunc& createOBBNode,
+                              const SetOBBNodeFunc& setOBBNode,
+                              const CreateLeafFunc& createLeaf,
+                              const ProgressMonitor& progressMonitor,
+                              const ReportFinishedRangeFunc& reportFinishedRange,
+                              Scene* scene,
+                              PrimRef* prims,
+                              const PrimInfo& pinfo,
+                              const Settings settings)
+        {
+          typedef BuilderT<NodeRef,
+            CreateAllocFunc,
+            CreateAABBNodeFunc,SetAABBNodeFunc,
+            CreateOBBNodeFunc,SetOBBNodeFunc,
+            CreateLeafFunc,ProgressMonitor,
+            ReportFinishedRangeFunc> Builder;
+
+          Builder builder(scene,prims,createAlloc,
+                          createAABBNode,setAABBNode,
+                          createOBBNode,setOBBNode,
+                          createLeaf,progressMonitor,reportFinishedRange,settings);
+
+          NodeRef root = builder.recurse(1,pinfo,nullptr,true,false); // depth starts at 1, no allocator yet, toplevel call, no finished-range report for the root
+          _mm_mfence(); // to allow non-temporal stores during build
+          return root;
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h
new file mode 100644
index 0000000000..92be2f7e65
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_morton.h
@@ -0,0 +1,501 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/builder.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct BVHBuilderMorton
+    {
+      static const size_t MAX_BRANCHING_FACTOR = 8;          //!< maximum supported BVH branching factor
+      static const size_t MIN_LARGE_LEAF_LEVELS = 8;         //!< create balanced tree if we are that many levels before the maximum tree depth
+
+      /*! settings for morton builder; every constructor that accepts leaf sizes clamps minLeafSize to at most maxLeafSize */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024) {}
+
+        /*! initialize settings from API settings; fields not reported by RTC_BUILD_ARGUMENTS_HAS keep the defaults above */
+        Settings (const RTCBuildArguments& settings)
+        : branchingFactor(2), maxDepth(32), minLeafSize(1), maxLeafSize(7), singleThreadThreshold(1024)
+        {
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth          )) maxDepth        = settings.maxDepth;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize       )) minLeafSize     = settings.minLeafSize;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize       )) maxLeafSize     = settings.maxLeafSize;
+
+          minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+
+        Settings (size_t branchingFactor, size_t maxDepth, size_t minLeafSize, size_t maxLeafSize, size_t singleThreadThreshold)  // explicit values for every field
+        : branchingFactor(branchingFactor), maxDepth(maxDepth), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize), singleThreadThreshold(singleThreadThreshold)
+        {
+          /* the parameters shadow the members here, so the member has to be named through this-> or the clamp only changes the local parameter copy */
+          this->minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+      };
+
+      /*! Build primitive consisting of morton code and primitive ID; the union also exposes both fields together as the single 64-bit value t. */
+      struct __aligned(8) BuildPrim
+      {
+        union {
+          struct {
+            unsigned int code;     //!< morton code
+            unsigned int index;    //!< i'th primitive
+          };
+          uint64_t t;
+        };
+
+        /*! interface for radix sort: the sort key is the morton code */
+        __forceinline operator unsigned() const { return code; }
+
+        /*! interface for standard sort: orders by morton code only, index is not compared */
+        __forceinline bool operator<(const BuildPrim &m) const { return code < m.code; }
+      };
+
+      /*! maps bounding box to morton code, relative to the bounds passed to the constructor */
+      struct MortonCodeMapping
+      {
+        static const size_t LATTICE_BITS_PER_DIM = 10;  //!< lattice bits per dimension
+        static const size_t LATTICE_SIZE_PER_DIM = size_t(1) << LATTICE_BITS_PER_DIM;  //!< 2^LATTICE_BITS_PER_DIM
+
+        vfloat4 base;   //!< lower corner of the bounds passed to the constructor
+        vfloat4 scale;  //!< per-dimension factor applied in bin(); 0 where the extent of the bounds is not above 1E-19
+
+        __forceinline MortonCodeMapping(const BBox3fa& bounds)
+        {
+          base  = (vfloat4)bounds.lower;
+          const vfloat4 diag  = (vfloat4)bounds.upper - (vfloat4)bounds.lower;
+          scale = select(diag > vfloat4(1E-19f), rcp(diag) * vfloat4(LATTICE_SIZE_PER_DIM * 0.99f),vfloat4(0.0f)); // scales the full extent to 99% of the lattice size
+        }
+
+        __forceinline const vint4 bin (const BBox3fa& box) const  // integer lattice coordinates of the box
+        {
+          const vfloat4 lower = (vfloat4)box.lower;
+          const vfloat4 upper = (vfloat4)box.upper;
+          const vfloat4 centroid = lower+upper; // sum of the corners, not halved; NOTE(review): bounds are presumably supplied in the same doubled scale (recreateMortonCodes uses center2) - confirm
+          return vint4((centroid-base)*scale);
+        }
+
+        __forceinline unsigned int code (const BBox3fa& box) const  // bitInterleave of the x, y and z lattice coordinates
+        {
+          const vint4 binID = bin(box);
+          const unsigned int x = extract<0>(binID);
+          const unsigned int y = extract<1>(binID);
+          const unsigned int z = extract<2>(binID);
+          const unsigned int xyz = bitInterleave(x,y,z);
+          return xyz;
+        }
+      };
+
+#if defined (__AVX2__)
+
+      /*! for AVX2 there is a fast scalar bitInterleave, so each primitive is written to dest immediately */
+      struct MortonCodeGenerator
+      {
+        __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
+          : mapping(mapping), dest(dest) {}
+
+        __forceinline void operator() (const BBox3fa& b, const unsigned index)  // writes index and morton code of b to *dest, then advances dest
+        {
+          dest->index = index;
+          dest->code = mapping.code(b);
+          dest++;
+        }
+
+      public:
+        const MortonCodeMapping mapping;
+        BuildPrim* dest;
+        size_t currentID;  // NOTE(review): neither initialized nor used in this variant
+      };
+
+#else
+
+      /*! before AVX2 it is better to use the SSE version of bitInterleave; entries are buffered and written in groups of four */
+      struct MortonCodeGenerator
+      {
+        __forceinline MortonCodeGenerator(const MortonCodeMapping& mapping, BuildPrim* dest)
+          : mapping(mapping), dest(dest), currentID(0), slots(0), ax(0), ay(0), az(0), ai(0) {}
+
+        __forceinline ~MortonCodeGenerator()  // writes out the entries still buffered (fewer than four) one by one
+        {
+          if (slots != 0)
+          {
+            const vint4 code = bitInterleave(ax,ay,az);
+            for (size_t i=0; i<slots; i++) {
+              dest[currentID-slots+i].index = ai[i];
+              dest[currentID-slots+i].code = code[i];
+            }
+          }
+        }
+
+        __forceinline void operator() (const BBox3fa& b, const unsigned index)  // buffers one entry; every fourth call interleaves and stores the group
+        {
+          const vint4 binID = mapping.bin(b);
+          ax[slots] = extract<0>(binID);
+          ay[slots] = extract<1>(binID);
+          az[slots] = extract<2>(binID);
+          ai[slots] = index;
+          slots++;
+          currentID++;
+
+          if (slots == 4)
+          {
+            const vint4 code = bitInterleave(ax,ay,az);
+            vint4::storeu(&dest[currentID-4],unpacklo(code,ai)); // presumably the first two (code,index) pairs - confirm unpacklo semantics
+            vint4::storeu(&dest[currentID-2],unpackhi(code,ai)); // presumably the last two (code,index) pairs
+            slots = 0;
+          }
+        }
+
+      public:
+        const MortonCodeMapping mapping;
+        BuildPrim* dest;
+        size_t currentID;  // number of entries received so far
+        size_t slots;      // number of entries currently buffered in ax/ay/az/ai
+        vint4 ax, ay, az, ai;
+      };
+
+#endif
+
+      template<
+        typename ReductionTy,
+        typename Allocator,
+        typename CreateAllocator,
+        typename CreateNodeFunc,
+        typename SetNodeBoundsFunc,
+        typename CreateLeafFunc,
+        typename CalculateBounds,
+        typename ProgressMonitor>
+
+        class BuilderT : private Settings
+      {
+        ALIGNED_CLASS_(16);
+
+      public:
+
+        BuilderT (CreateAllocator& createAllocator,  // constructor: copies the settings into the Settings base and binds the callbacks
+                  CreateNodeFunc& createNode,
+                  SetNodeBoundsFunc& setBounds,
+                  CreateLeafFunc& createLeaf,
+                  CalculateBounds& calculateBounds,
+                  ProgressMonitor& progressMonitor,
+                  const Settings& settings)
+
+          : Settings(settings),
+          createAllocator(createAllocator),
+          createNode(createNode),
+          setBounds(setBounds),
+          createLeaf(createLeaf),
+          calculateBounds(calculateBounds),
+          progressMonitor(progressMonitor),
+          morton(nullptr) {}  // the morton array starts out as nullptr; presumably assigned before building, outside this constructor
+
+        ReductionTy createLargeLeaf(size_t depth, const range<unsigned>& current, Allocator alloc)  // builds nodes over current until every leaf holds at most maxLeafSize items
+        {
+          /* this should never occur but is a fatal error */
+          if (depth > maxDepth)
+            throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+          /* create leaf for few primitives */
+          if (current.size() <= maxLeafSize)
+            return createLeaf(current,alloc);
+
+          /* fill all children by always splitting the largest one */
+          range<unsigned> children[MAX_BRANCHING_FACTOR];
+          size_t numChildren = 1;
+          children[0] = current;
+
+          do {
+
+            /* find best child with largest number of primitives */
+            size_t bestChild = -1;  // size_t(-1) is the "nothing found" marker tested below
+            size_t bestSize = 0;
+            for (size_t i=0; i<numChildren; i++)
+            {
+              /* ignore leaves as they cannot get split */
+              if (children[i].size() <= maxLeafSize)
+                continue;
+
+              /* remember child with largest size */
+              if (children[i].size() > bestSize) {
+                bestSize = children[i].size();
+                bestChild = i;
+              }
+            }
+            if (bestChild == size_t(-1)) break;
+
+            /*! split best child into left and right child using the split() of the range itself (morton codes are not consulted here) */
+            auto split = children[bestChild].split();
+
+            /* add new children left and right: the last child moves into the freed slot, the two halves take the last two slots */
+            children[bestChild] = children[numChildren-1];
+            children[numChildren-1] = split.first;
+            children[numChildren+0] = split.second;
+            numChildren++;
+
+          } while (numChildren < branchingFactor);
+
+          /* create node */
+          auto node = createNode(alloc,numChildren);
+
+          /* recurse into each child and collect the per-child results */
+          ReductionTy bounds[MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<numChildren; i++)
+            bounds[i] = createLargeLeaf(depth+1,children[i],alloc);
+
+          return setBounds(node,bounds,numChildren);
+        }
+
+        /*! recreates morton codes when reaching a region where all codes are identical: recomputes the codes relative to the bounds of just this range and sorts the range again */
+        __noinline void recreateMortonCodes(const range<unsigned>& current) const
+        {
+          /* fast path for small ranges (fewer than 1024 items): all three steps run sequentially */
+          if (likely(current.size() < 1024))
+          {
+            /*! recalculate centroid bounds */
+            BBox3fa centBounds(empty);
+            for (size_t i=current.begin(); i<current.end(); i++)
+              centBounds.extend(center2(calculateBounds(morton[i])));
+
+            /* recalculate morton codes */
+            MortonCodeMapping mapping(centBounds);
+            for (size_t i=current.begin(); i<current.end(); i++)
+              morton[i].code = mapping.code(calculateBounds(morton[i]));
+
+            /* sort morton codes (BuildPrim::operator< compares the code) */
+            std::sort(morton+current.begin(),morton+current.end());
+          }
+          else  // large ranges: the same three steps, using parallel_reduce / parallel_for with a block size of 1024
+          {
+            /*! recalculate centroid bounds */
+            auto calculateCentBounds = [&] ( const range<unsigned>& r ) {
+              BBox3fa centBounds = empty;
+              for (size_t i=r.begin(); i<r.end(); i++)
+                centBounds.extend(center2(calculateBounds(morton[i])));
+              return centBounds;
+            };
+            const BBox3fa centBounds = parallel_reduce(current.begin(), current.end(), unsigned(1024),
+                                                       BBox3fa(empty), calculateCentBounds, BBox3fa::merge);
+
+            /* recalculate morton codes */
+            MortonCodeMapping mapping(centBounds);
+            parallel_for(current.begin(), current.end(), unsigned(1024), [&] ( const range<unsigned>& r ) {
+                for (size_t i=r.begin(); i<r.end(); i++) {
+                  morton[i].code = mapping.code(calculateBounds(morton[i]));
+                }
+              });
+
+            /*! sort morton codes: tbb::parallel_sort when TASKING_TBB is defined, radixsort32 otherwise */
+#if defined(TASKING_TBB)
+            tbb::parallel_sort(morton+current.begin(),morton+current.end());
+#else
+            radixsort32(morton+current.begin(),current.size());
+#endif
+          }
+        }
+
+        __forceinline void split(const range<unsigned>& current, range<unsigned>& left, range<unsigned>& right) const  // splits current at the topmost bit in which its first and last morton code differ
+        {
+          const unsigned int code_start = morton[current.begin()].code;
+          const unsigned int code_end   = morton[current.end()-1].code;
+          unsigned int bitpos = lzcnt(code_start^code_end);
+
+          /* if all items mapped to same morton code, then re-create new morton codes for the items */
+          if (unlikely(bitpos == 32))
+          {
+            recreateMortonCodes(current);
+            const unsigned int code_start = morton[current.begin()].code;
+            const unsigned int code_end   = morton[current.end()-1].code;
+            bitpos = lzcnt(code_start^code_end);
+
+            /* if the morton code is still the same, goto fall back split */
+            if (unlikely(bitpos == 32)) {
+              current.split(left,right);
+              return;
+            }
+          }
+
+          /* split the items at the topmost different morton code bit */
+          const unsigned int bitpos_diff = 31-bitpos;
+          const unsigned int bitmask = 1u << bitpos_diff;  // unsigned literal, so the shift stays well defined for every bit position
+
+          /* find location where bit differs using binary search; morton[begin] always has the bit clear */
+          unsigned begin = current.begin();
+          unsigned end   = current.end();
+          while (begin + 1 != end) {
+            const unsigned mid = (begin+end)/2;
+            const unsigned bit = morton[mid].code & bitmask;
+            if (bit == 0) begin = mid; else end = mid;
+          }
+          unsigned center = end;  // first index whose split bit is set; begin and end are adjacent at this point
+#if defined(DEBUG)  // check the whole partition against the bounds of current (begin/end were narrowed by the search and would cover a single item)
+          for (unsigned int i=current.begin(); i<center; i++) assert((morton[i].code & bitmask) == 0);
+          for (unsigned int i=center; i<current.end(); i++) assert((morton[i].code & bitmask) == bitmask);
+#endif
+
+          left = make_range(current.begin(),center);
+          right = make_range(center,current.end());
+        }
+
+        ReductionTy recurse(size_t depth, const range<unsigned>& current, Allocator alloc, bool toplevel) // builds the subtree over the morton range 'current' and returns its reduction value
+        {
+          /* get thread local allocator (callers pass nullptr for the root and for parallel sub-tasks) */
+          if (!alloc)
+            alloc = createAllocator();
+
+          /* signal progress once for each subtree that is small enough to be finished sequentially */
+          if (toplevel && current.size() <= singleThreadThreshold)
+            progressMonitor(current.size());
+
+          /* create leaf node when close to the depth limit or when few enough primitives remain */
+          if (unlikely(depth+MIN_LARGE_LEAF_LEVELS >= maxDepth || current.size() <= minLeafSize))
+            return createLargeLeaf(depth,current,alloc);
+
+          /* fill all children by always splitting the one with the largest number of primitives */
+          range<unsigned> children[MAX_BRANCHING_FACTOR];
+          split(current,children[0],children[1]);
+          size_t numChildren = 2;
+
+          while (numChildren < branchingFactor)
+          {
+            /* find best child with largest number of primitives */
+            int bestChild = -1; // -1 means: no splittable child found
+            unsigned bestItems = 0;
+            for (unsigned int i=0; i<numChildren; i++)
+            {
+              /* ignore leaves as they cannot get split */
+              if (children[i].size() <= minLeafSize)
+                continue;
+
+              /* remember child with largest number of primitives */
+              if (children[i].size() > bestItems) {
+                bestItems = children[i].size();
+                bestChild = i;
+              }
+            }
+            if (bestChild == -1) break;
+
+            /*! split best child into left and right child */
+            range<unsigned> left, right;
+            split(children[bestChild],left,right);
+
+            /* move the last child into the freed slot, then append left and right at the end */
+            children[bestChild] = children[numChildren-1];
+            children[numChildren-1] = left;
+            children[numChildren+0] = right;
+            numChildren++;
+          }
+
+          /* create leaf node if no split is possible */
+          if (unlikely(numChildren == 1)) // NOTE(review): numChildren starts at 2 and only grows, so this branch looks unreachable
+            return createLeaf(current,alloc);
+
+          /* allocate node */
+          auto node = createNode(alloc,numChildren);
+
+          /* process top parts of tree parallel */
+          ReductionTy bounds[MAX_BRANCHING_FACTOR];
+          if (current.size() > singleThreadThreshold)
+          {
+            /*! parallel_for is faster than spawning sub-tasks */
+            parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) {
+                for (size_t i=r.begin(); i<r.end(); i++) {
+                  bounds[i] = recurse(depth+1,children[i],nullptr,true); // nullptr: the child fetches its own allocator
+                  _mm_mfence(); // to allow non-temporal stores during build
+                }
+              });
+          }
+
+          /* finish tree sequentially */
+          else
+          {
+            for (size_t i=0; i<numChildren; i++)
+              bounds[i] = recurse(depth+1,children[i],alloc,false); // reuse this allocator; progress was already reported
+          }
+
+          return setBounds(node,bounds,numChildren); // combine the per-child results into the result for this node
+        }
+
+        /* build function: sorts the primitives in 'src' and builds the BVH over all 'numPrimitives' of them */
+        ReductionTy build(BuildPrim* src, BuildPrim* tmp, size_t numPrimitives)
+        {
+          /* sort morton codes ('tmp' is presumably scratch space for the radix sort - TODO confirm) */
+          morton = src; // recurse/split read the codes through this member
+          radix_sort_u32(src,tmp,numPrimitives,singleThreadThreshold);
+
+          /* build BVH starting at depth 1; NOTE(review): numPrimitives is narrowed to unsigned here */
+          const ReductionTy root = recurse(1, range<unsigned>(0,(unsigned)numPrimitives), nullptr, true);
+          _mm_mfence(); // to allow non-temporal stores during build
+          return root;
+        }
+
+      public:
+        CreateAllocator& createAllocator;
+        CreateNodeFunc& createNode;
+        SetNodeBoundsFunc& setBounds;
+        CreateLeafFunc& createLeaf;
+        CalculateBounds& calculateBounds;
+        ProgressMonitor& progressMonitor;
+
+      public:
+        BuildPrim* morton;
+      };
+
+
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename SetBoundsFunc,
+        typename CreateLeafFunc,
+        typename CalculateBoundsFunc,
+        typename ProgressMonitor>
+        /*! convenience entry point: instantiates BuilderT for the given callback types and forwards src, tmp and numPrimitives to its build method */
+        static ReductionTy build(CreateAllocFunc createAllocator,
+                                 CreateNodeFunc createNode,
+                                 SetBoundsFunc setBounds,
+                                 CreateLeafFunc createLeaf,
+                                 CalculateBoundsFunc calculateBounds,
+                                 ProgressMonitor progressMonitor,
+                                 BuildPrim* src,
+                                 BuildPrim* tmp,
+                                 size_t numPrimitives,
+                                 const Settings& settings)
+        {
+          typedef BuilderT<
+            ReductionTy,
+            decltype(createAllocator()), // allocator type = whatever the createAllocator callback returns
+            CreateAllocFunc,
+            CreateNodeFunc,
+            SetBoundsFunc,
+            CreateLeafFunc,
+            CalculateBoundsFunc,
+            ProgressMonitor> Builder;
+
+          Builder builder(createAllocator,
+                          createNode,
+                          setBounds,
+                          createLeaf,
+                          calculateBounds,
+                          progressMonitor,
+                          settings);
+
+          return builder.build(src,tmp,numPrimitives); // returns the reduction value of the root
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h
new file mode 100644
index 0000000000..4c138dacdb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur.h
@@ -0,0 +1,692 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#define MBLUR_NUM_TEMPORAL_BINS 2
+#define MBLUR_NUM_OBJECT_BINS   32
+
+#include "../bvh/bvh.h"
+#include "../common/primref_mb.h"
+#include "heuristic_binning_array_aligned.h"
+#include "heuristic_timesplit_array.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename T>
+      struct SharedVector // manually reference counted handle to a heap allocated T
+      {
+        /*! leaves both members uninitialized */
+        __forceinline SharedVector() {}
+        /*! takes over 'ptr' with the given initial number of owners */
+        __forceinline SharedVector(T* ptr, size_t refCount = 1)
+          : prims(ptr), refCount(refCount) {}
+
+        /*! registers one more owner */
+        __forceinline void incRef() { refCount += 1; }
+        /*! drops one owner and deletes the payload once the last owner is gone */
+        __forceinline void decRef()
+        {
+          refCount -= 1;
+          if (refCount != 0) return;
+          delete prims;
+        }
+        T* prims;        //!< the shared payload
+        size_t refCount; //!< current number of owners
+      };
+
+    template<typename BuildRecord, int MAX_BRANCHING_FACTOR>
+      struct LocalChildListT // children of the node under construction plus reference counted handles to the primitive vectors they use
+      {
+        typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector;
+
+        __forceinline LocalChildListT (const BuildRecord& record) // starts with 'record' as the only child
+          : numChildren(1), numSharedPrimVecs(1)
+        {
+          /* the local root will be freed in the ancestor where it was created (thus refCount is 2) */
+          children[0] = record;
+          primvecs[0] = new (&sharedPrimVecs[0]) SharedPrimRefVector(record.prims.prims, 2); // placement new into local storage
+        }
+
+        __forceinline ~LocalChildListT() // releases one reference per child; a vector is deleted when its count drops to zero
+        {
+          for (size_t i = 0; i < numChildren; i++)
+            primvecs[i]->decRef();
+        }
+
+        __forceinline BuildRecord& operator[] ( const size_t i ) { // access to the i-th child record (no bounds check here)
+          return children[i];
+        }
+
+        __forceinline size_t size() const { // current number of children
+          return numChildren;
+        }
+
+        __forceinline void split(ssize_t bestChild, const BuildRecord& lrecord, const BuildRecord& rrecord, std::unique_ptr<mvector<PrimRefMB>> new_vector) // replaces child 'bestChild' by lrecord and appends rrecord
+        {
+          SharedPrimRefVector* bsharedPrimVec = primvecs[bestChild]; // handle of the child that gets replaced
+          if (lrecord.prims.prims == bsharedPrimVec->prims) { // left child still uses the parent's vector: share the handle
+            primvecs[bestChild] = bsharedPrimVec;
+            bsharedPrimVec->incRef();
+          }
+          else { // left child uses a different vector: create a fresh handle with refCount 1
+            primvecs[bestChild] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(lrecord.prims.prims);
+          }
+
+          if (rrecord.prims.prims == bsharedPrimVec->prims) { // same decision for the right child
+            primvecs[numChildren] = bsharedPrimVec;
+            bsharedPrimVec->incRef();
+          }
+          else {
+            primvecs[numChildren] = new (&sharedPrimVecs[numSharedPrimVecs++]) SharedPrimRefVector(rrecord.prims.prims);
+          }
+          bsharedPrimVec->decRef(); // drop the reference that was held by the replaced child
+          new_vector.release(); // give up ownership without deleting; presumably the vector is now owned by a handle created above - verify against the split heuristics
+
+          children[bestChild] = lrecord;
+          children[numChildren] = rrecord;
+          numChildren++;
+        }
+
+      public:
+        array_t<BuildRecord,MAX_BRANCHING_FACTOR> children; //!< child build records
+        array_t<SharedPrimRefVector*,MAX_BRANCHING_FACTOR> primvecs; //!< per child: handle of the primitive vector it uses
+        size_t numChildren; //!< number of valid entries in children and primvecs
+
+        array_t<SharedPrimRefVector,2*MAX_BRANCHING_FACTOR> sharedPrimVecs; //!< storage the handles are placement-constructed into
+        size_t numSharedPrimVecs; //!< number of handles constructed so far
+      };
+
+    template<typename Mesh>
+      struct RecalculatePrimRef // functor that rebuilds a PrimRefMB (or only its linear bounds) for a given time range, using statically typed Mesh calls
+      {
+        Scene* scene; // scene the geometries are looked up in
+
+        __forceinline RecalculatePrimRef (Scene* scene)
+          : scene(scene) {}
+
+        __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const // new PrimRefMB for 'prim' restricted to 'time_range'
+        {
+          const unsigned geomID = prim.geomID();
+          const unsigned primID = prim.primID();
+          const Mesh* mesh = scene->get<Mesh>(geomID);
+          const LBBox3fa lbounds = mesh->linearBounds(primID, time_range);
+          const range<int> tbounds = mesh->timeSegmentRange(time_range);
+          return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+        }
+
+        // same as above, but bounds are computed in 'space'; __noinline is workaround for ICC16 bug under MacOSX
+        __noinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const
+        {
+          const unsigned geomID = prim.geomID();
+          const unsigned primID = prim.primID();
+          const Mesh* mesh = scene->get<Mesh>(geomID);
+          const LBBox3fa lbounds = mesh->linearBounds(space, primID, time_range);
+          const range<int> tbounds = mesh->timeSegmentRange(time_range);
+          return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+        }
+
+        __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { // only the linear bounds of 'prim' over 'time_range'
+          return scene->get<Mesh>(prim.geomID())->linearBounds(prim.primID(), time_range);
+        }
+
+        // linear bounds computed in 'space'; __noinline is workaround for ICC16 bug under MacOSX
+        __noinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const {
+          return scene->get<Mesh>(prim.geomID())->linearBounds(space, prim.primID(), time_range);
+        }
+      };
+
+    struct VirtualRecalculatePrimRef // like RecalculatePrimRef, but goes through the generic Geometry interface (vlinearBounds) instead of a concrete Mesh type
+    {
+      Scene* scene; // scene the geometries are looked up in
+      
+      __forceinline VirtualRecalculatePrimRef (Scene* scene)
+        : scene(scene) {}
+      
+      __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const // new PrimRefMB for 'prim' restricted to 'time_range'
+      {
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const Geometry* mesh = scene->get(geomID);
+        const LBBox3fa lbounds = mesh->vlinearBounds(primID, time_range);
+        const range<int> tbounds = mesh->timeSegmentRange(time_range);
+        return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+      }
+      
+      __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const // same, with bounds computed in 'space'
+      {
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const Geometry* mesh = scene->get(geomID);
+        const LBBox3fa lbounds = mesh->vlinearBounds(space, primID, time_range);
+        const range<int> tbounds = mesh->timeSegmentRange(time_range);
+        return PrimRefMB (lbounds, tbounds.size(), mesh->time_range, mesh->numTimeSegments(), geomID, primID);
+      }
+      
+      __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const { // only the linear bounds of 'prim' over 'time_range'
+        return scene->get(prim.geomID())->vlinearBounds(prim.primID(), time_range);
+      }
+      
+      __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range, const LinearSpace3fa& space) const { // linear bounds computed in 'space'
+        return scene->get(prim.geomID())->vlinearBounds(space, prim.primID(), time_range);
+      }
+    };
+
+    struct BVHBuilderMSMBlur
+    {
+      /*! settings for msmblur builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8),
+          travCost(1.0f), intCost(1.0f), singleLeafTimeSegment(false),
+          singleThreadThreshold(1024) {}
+
+        /*! custom settings: logBlockSize is bsr(sahBlockSize), minLeafSize is clamped to maxLeafSize, singleLeafTimeSegment starts out false */
+        Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold)
+        : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize),
+          travCost(travCost), intCost(intCost), singleLeafTimeSegment(false), singleThreadThreshold(singleThreadThreshold)
+        {
+          this->minLeafSize = min(minLeafSize,maxLeafSize); // 'this->' is required: the bare name refers to the constructor parameter, not the member
+        }
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        float travCost;          //!< estimated cost of one traversal step
+        float intCost;           //!< estimated cost of one primitive intersection
+        bool singleLeafTimeSegment; //!< split time to single time range
+        size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+      };
+
+      struct BuildRecord // work item of the builder: a primitive set plus the depth of the subtree root it will become
+      {
+      public:
+	__forceinline BuildRecord () {} // does not initialize depth
+
+        __forceinline BuildRecord (size_t depth) // prims gets filled in later by a split
+          : depth(depth) {}
+
+        __forceinline BuildRecord (const SetMB& prims, size_t depth)
+          : depth(depth), prims(prims) {}
+
+        __forceinline friend bool operator< (const BuildRecord& a, const BuildRecord& b) { // orders records by number of primitives
+          return a.prims.size() < b.prims.size();
+        }
+
+        __forceinline size_t size() const { // number of primitives in this record
+          return prims.size();
+        }
+
+      public:
+	size_t depth;                     //!< Depth of the root of this subtree.
+	SetMB prims;                      //!< The list of primitives.
+      };
+
+      struct BuildRecordSplit : public BuildRecord // BuildRecord that additionally carries the split chosen for it
+      {
+        __forceinline BuildRecordSplit () {} // leaves the record and the split as default constructed
+
+        __forceinline BuildRecordSplit (size_t depth) 
+          : BuildRecord(depth) {}
+
+        __forceinline BuildRecordSplit (const BuildRecord& record, const BinSplit<MBLUR_NUM_OBJECT_BINS>& split)
+          : BuildRecord(record), split(split) {}
+        
+        BinSplit<MBLUR_NUM_OBJECT_BINS> split; //!< split to apply to this record
+      };
+
+      template<
+        typename NodeRef,
+        typename RecalculatePrimRef,
+        typename Allocator,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename SetNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        class BuilderT
+        {
+          ALIGNED_CLASS_(16);
+          static const size_t MAX_BRANCHING_FACTOR = 16;       //!< maximum supported BVH branching factor	  
+          static const size_t MIN_LARGE_LEAF_LEVELS = 8;        //!< create balanced tree if we are that many levels before the maximum tree depth
+
+          typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+          typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split;
+          typedef mvector<PrimRefMB>* PrimRefVector;
+          typedef SharedVector<mvector<PrimRefMB>> SharedPrimRefVector;
+          typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList;
+          typedef LocalChildListT<BuildRecordSplit,MAX_BRANCHING_FACTOR> LocalChildListSplit;
+
+        public:
+
+          BuilderT (MemoryMonitorInterface* device, // forwarded to the temporal split heuristic
+                    const RecalculatePrimRef recalculatePrimRef,
+                    const CreateAllocFunc createAlloc,
+                    const CreateNodeFunc createNode,
+                    const SetNodeFunc setNode,
+                    const CreateLeafFunc createLeaf,
+                    const ProgressMonitor progressMonitor,
+                    const Settings& settings)
+            : cfg(settings), // settings and all callbacks are copied into the builder
+            heuristicObjectSplit(),
+            heuristicTemporalSplit(device, recalculatePrimRef),
+            recalculatePrimRef(recalculatePrimRef), createAlloc(createAlloc), createNode(createNode), setNode(setNode), createLeaf(createLeaf),
+            progressMonitor(progressMonitor)
+          {
+            if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) // the fixed-size child arrays hold at most MAX_BRANCHING_FACTOR entries
+              throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large");
+          }
+
+          /*! finds the best split: the object split, unless it is poor and a temporal split has lower SAH */
+          const Split find(const SetMB& set)
+          {
+            /* first try standard object split */
+            const Split object_split = heuristicObjectSplit.find(set,cfg.logBlockSize);
+            const float object_split_sah = object_split.splitSAH();
+
+            /* test temporal splits only when object split was bad (not below half of the leaf SAH) */
+            const float leaf_sah = set.leafSAH(cfg.logBlockSize);
+            if (object_split_sah < 0.50f*leaf_sah)
+              return object_split;
+
+            /* do temporal splits only if the time range is big enough */
+            if (set.time_range.size() > 1.01f/float(set.max_num_time_segments))
+            {
+              const Split temporal_split = heuristicTemporalSplit.find(set,cfg.logBlockSize);
+              const float temporal_split_sah = temporal_split.splitSAH();
+
+              /* take temporal split if it improved SAH */
+              if (temporal_split_sah < object_split_sah)
+                return temporal_split;
+            }
+
+            return object_split;
+          }
+
+          /*! array partitioning: applies 'split' to 'set', writing the halves to lset/rset; only the temporal split returns a (possibly non-null) vector */
+          __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            /* perform object split */
+            if (likely(split.data == Split::SPLIT_OBJECT)) {
+              heuristicObjectSplit.split(split,set,lset,rset);
+            }
+            /* perform temporal split */
+            else if (likely(split.data == Split::SPLIT_TEMPORAL)) {
+              return heuristicTemporalSplit.split(split,set,lset,rset); // hands back whatever vector the heuristic returns
+            }
+            /* perform fallback split */
+            else if (unlikely(split.data == Split::SPLIT_FALLBACK)) {
+              set.deterministic_order();
+              splitFallback(set,lset,rset);
+            }
+            /* split by geometry */
+            else if (unlikely(split.data == Split::SPLIT_GEOMID)) {
+              set.deterministic_order();
+              splitByGeometry(set,lset,rset);
+            }
+            else
+              assert(false); // unknown split kind
+
+            return std::unique_ptr<mvector<PrimRefMB>>(); // null: all non-temporal paths return no vector
+          }
+
+          /*! picks a fallback split: by geometry ID first, then an optional temporal split, otherwise a plain fallback split */
+          __noinline Split findFallback(const SetMB& set)
+          {
+            /* split if primitives are not from same geometry */
+            if (!sameGeometry(set))
+              return Split(0.0f,Split::SPLIT_GEOMID);
+            
+            /* if a leaf can only hold a single time-segment, we might have to do additional temporal splits */
+            if (cfg.singleLeafTimeSegment)
+            {
+              /* test if one primitive has more than one time segment in time range, if so split time */
+              for (size_t i=set.begin(); i<set.end(); i++)
+              {
+                const PrimRefMB& prim = (*set.prims)[i];
+                const range<int> itime_range = prim.timeSegmentRange(set.time_range);
+                const int localTimeSegments = itime_range.size();
+                assert(localTimeSegments > 0);
+                if (localTimeSegments > 1) {
+                  const int icenter = (itime_range.begin() + itime_range.end())/2; // middle time step of this primitive's segment range
+                  const float splitTime = prim.timeStep(icenter);
+                  return Split(0.0f,(unsigned)Split::SPLIT_TEMPORAL,0,splitTime); // the first such primitive decides the split time
+                }
+              }
+            }        
+
+            /* otherwise return fallback split */
+            return Split(0.0f,Split::SPLIT_FALLBACK);
+          }
+
+          /*! performs fallback split: cuts the index range of 'set' in the middle, both halves keep the same vector and time range */
+          void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            mvector<PrimRefMB>& prims = *set.prims;
+
+            const size_t begin = set.begin();
+            const size_t end   = set.end();
+            const size_t center = (begin + end)/2;
+
+            PrimInfoMB linfo = empty; // accumulated info of the left half [begin,center)
+            for (size_t i=begin; i<center; i++)
+              linfo.add_primref(prims[i]);
+
+            PrimInfoMB rinfo = empty; // accumulated info of the right half [center,end)
+            for (size_t i=center; i<end; i++)
+              rinfo.add_primref(prims[i]);
+
+            new (&lset) SetMB(linfo,set.prims,range<size_t>(begin,center),set.time_range); // constructed in place over the output arguments
+            new (&rset) SetMB(rinfo,set.prims,range<size_t>(center,end  ),set.time_range);
+          }
+
+          /*! returns true when the set is empty or every primitive in it carries the same geomID */
+          __forceinline bool sameGeometry(const SetMB& set)
+          {
+            /* an empty set trivially consists of a single geometry */
+            if (set.size() == 0)
+              return true;
+            /* compare every following primitive against the first one */
+            mvector<PrimRefMB>& refs = *set.prims;
+            size_t idx = set.begin();
+            const size_t last = set.end();
+            const unsigned int referenceID = refs[idx].geomID();
+            while (++idx < last)
+              if (refs[idx].geomID() != referenceID) return false;
+            return true;
+          }
+
+          /* split by geometry ID: partitions on 'same geomID as the first primitive'; both halves keep the same vector and time range */
+          void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            assert(set.size() > 1);
+
+            mvector<PrimRefMB>& prims = *set.prims;
+            const size_t begin = set.begin();
+            const size_t end   = set.end();
+            
+            PrimInfoMB left(empty);
+            PrimInfoMB right(empty);
+            unsigned int geomID = prims[begin].geomID(); // geometry of the first primitive selects the left side
+            size_t center = serial_partitioning(prims.data(),begin,end,left,right,
+                                                [&] ( const PrimRefMB& prim ) { return prim.geomID() == geomID; },
+                                                [ ] ( PrimInfoMB& dst, const PrimRefMB& prim ) { dst.add_primref(prim); });
+            
+            new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range); // constructed in place over the output arguments
+            new (&rset) SetMB(right,set.prims,range<size_t>(center,end  ),set.time_range);
+          }
+
+          const NodeRecordMB4D createLargeLeaf(const BuildRecord& in, Allocator alloc) // turns 'in' into a leaf, or into a subtree built with fallback splits when it does not fit into one leaf
+          {
+            /* this should never occur but is a fatal error */
+            if (in.depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* replace already found split by fallback split */
+            const BuildRecordSplit current(BuildRecord(in.prims,in.depth),findFallback(in.prims));
+
+            /* special case when directly creating leaf without any splits that could shrink time_range */
+            bool force_split = false;
+            if (current.depth == 1 && current.size() > 0) // depth 1 is the depth the entry function starts the root with
+            {
+              BBox1f c = empty;
+              BBox1f p = current.prims.time_range;
+              for (size_t i=current.prims.begin(); i<current.prims.end(); i++) {
+                mvector<PrimRefMB>& prims = *current.prims.prims;
+                c.extend(prims[i].time_range);
+              }
+              
+              force_split = c.lower > p.lower || c.upper < p.upper; // merged primitive time ranges do not cover the set's time range
+            }
+	    
+            /* create leaf for few primitives */
+            if (current.size() <= cfg.maxLeafSize && current.split.data < Split::SPLIT_ENFORCE && !force_split)
+              return createLeaf(current,alloc);
+	  
+            /* fill all children by always splitting the largest one */
+            bool hasTimeSplits = false;
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+            LocalChildListSplit children(current);
+
+            do {
+              /* find splittable child with the largest number of primitives */
+              size_t bestChild = -1; // -1 converts to the largest size_t and marks: nothing found
+              size_t bestSize = 0;
+              for (size_t i=0; i<children.size(); i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.maxLeafSize && children[i].split.data < Split::SPLIT_ENFORCE && !force_split)
+                  continue;
+
+                force_split = false; // the forced split only applies until the first child has been accepted
+                
+                /* remember child with largest size */
+                if (children[i].size() > bestSize) {
+                  bestSize = children[i].size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /* perform best found split */
+              BuildRecordSplit& brecord = children[bestChild];
+              BuildRecordSplit lrecord(current.depth+1);
+              BuildRecordSplit rrecord(current.depth+1);
+              std::unique_ptr<mvector<PrimRefMB>> new_vector = split(brecord.split,brecord.prims,lrecord.prims,rrecord.prims);
+              hasTimeSplits |= new_vector != nullptr; // a non-null vector is only returned on the temporal split path
+
+              /* find new splits */
+              lrecord.split = findFallback(lrecord.prims);
+              rrecord.split = findFallback(rrecord.prims);
+              children.split(bestChild,lrecord,rrecord,std::move(new_vector));
+
+            } while (children.size() < cfg.branchingFactor);
+
+            /* detect time_ranges that have shrunken */
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = in.prims.time_range;
+              hasTimeSplits |= c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* create node */
+            auto node = createNode(children.children.data(),children.numChildren,alloc,hasTimeSplits);
+
+            /* recurse into each child and perform reduction */
+            LBBox3fa gbounds = empty;
+            for (size_t i=0; i<children.size(); i++) {
+              values[i] = createLargeLeaf(children[i],alloc);
+              gbounds.extend(values[i].lbounds);
+            }
+
+            setNode(current,children.children.data(),node,values,children.numChildren);
+
+            /* calculate geometry bounds of this node: recomputed from the primitives after time splits, merged child bounds otherwise */
+            if (hasTimeSplits)
+              return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range);
+            else
+              return NodeRecordMB4D(node,gbounds,current.prims.time_range);
+          }
+
+          const NodeRecordMB4D recurse(const BuildRecord& current, Allocator alloc, bool toplevel) // builds the subtree for 'current' and returns its node record
+          {
+            /* get thread local allocator (callers pass nullptr for the root and for parallel sub-tasks) */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* signal progress once for each subtree that is small enough to be finished sequentially */
+            if (toplevel && current.size() <= cfg.singleThreadThreshold)
+              progressMonitor(current.size());
+
+            /*! find best split */
+            const Split csplit = find(current.prims);
+
+            /*! compute leaf and split cost */
+            const float leafSAH  = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize);
+            const float splitSAH = cfg.travCost*current.prims.halfArea()+cfg.intCost*csplit.splitSAH();
+            assert((current.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0)));
+
+            /*! create a leaf node when threshold reached or SAH tells us to stop */
+            if (current.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) {
+              current.prims.deterministic_order();
+              return createLargeLeaf(current,alloc);
+            }
+
+            /*! perform initial split */
+            SetMB lprims,rprims;
+            std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,current.prims,lprims,rprims);
+            bool hasTimeSplits = new_vector != nullptr; // a non-null vector is only returned on the temporal split path
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+            LocalChildList children(current);
+            {
+              BuildRecord lrecord(lprims,current.depth+1);
+              BuildRecord rrecord(rprims,current.depth+1);
+              children.split(0,lrecord,rrecord,std::move(new_vector));
+            }
+
+            /*! split until node is full or SAH tells us to stop */
+            while (children.size() < cfg.branchingFactor) 
+            {
+              /*! find best child to split: the one with the largest expected half area that is still above minLeafSize */
+              float bestArea = neg_inf;
+              ssize_t bestChild = -1;
+              for (size_t i=0; i<children.size(); i++)
+              {
+                if (children[i].size() <= cfg.minLeafSize) continue;
+                if (expectedApproxHalfArea(children[i].prims.geomBounds) > bestArea) {
+                  bestChild = i; bestArea = expectedApproxHalfArea(children[i].prims.geomBounds);
+                }
+              }
+              if (bestChild == -1) break;
+
+              /* perform split */
+              BuildRecord& brecord = children[bestChild];
+              BuildRecord lrecord(current.depth+1);
+              BuildRecord rrecord(current.depth+1);
+              Split csplit = find(brecord.prims); // shadows the outer csplit on purpose: split of the chosen child
+              std::unique_ptr<mvector<PrimRefMB>> new_vector = split(csplit,brecord.prims,lrecord.prims,rrecord.prims);
+              hasTimeSplits |= new_vector != nullptr;
+              children.split(bestChild,lrecord,rrecord,std::move(new_vector));
+            }
+
+            /* detect time_ranges that have shrunken */
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = current.prims.time_range;
+              hasTimeSplits |= c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* sort buildrecords for simpler shadow ray traversal */
+            //std::sort(&children[0],&children[children.size()],std::greater<BuildRecord>()); // FIXME: reduces traversal performance of bvh8.triangle4 (needs to be verified) !!
+
+            /*! create an inner node */
+            auto node = createNode(children.children.data(), children.numChildren, alloc, hasTimeSplits);
+            LBBox3fa gbounds = empty;
+
+            /* spawn tasks */
+            if (unlikely(current.size() > cfg.singleThreadThreshold))
+            {
+              /*! parallel_for is faster than spawning sub-tasks */
+              parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                  for (size_t i=r.begin(); i<r.end(); i++) {
+                    values[i] = recurse(children[i],nullptr,true); // nullptr: the child fetches its own allocator
+                    _mm_mfence(); // to allow non-temporal stores during build
+                  }
+                });
+
+              /*! merge bounding boxes (done after the parallel loop, gbounds is not written inside it) */
+              for (size_t i=0; i<children.size(); i++)
+                gbounds.extend(values[i].lbounds);
+            }
+            /* recurse into each child */
+            else
+            {
+              //for (size_t i=0; i<children.size(); i++)
+              for (ssize_t i=children.size()-1; i>=0; i--) { // children are processed from last to first
+                values[i] = recurse(children[i],alloc,false);
+                gbounds.extend(values[i].lbounds);
+              }
+            }
+
+            setNode(current,children.children.data(),node,values,children.numChildren);
+
+            /* calculate geometry bounds of this node: recomputed from the primitives after time splits, merged child bounds otherwise */
+            if (unlikely(hasTimeSplits))
+              return NodeRecordMB4D(node,current.prims.linearBounds(recalculatePrimRef),current.prims.time_range);
+            else
+              return NodeRecordMB4D(node,gbounds,current.prims.time_range);
+          }
+
+          /*! builder entry function: wraps prims/pinfo into a SetMB and builds the tree starting at depth 1 */
+          __forceinline const NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo)
+          {
+            const SetMB set(pinfo,&prims);
+            auto ret = recurse(BuildRecord(set,1),nullptr,true); // nullptr: recurse fetches an allocator itself
+            _mm_mfence(); // to allow non-temporal stores during build
+            return ret;
+          }
+
+        private:
+          Settings cfg;
+          HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> heuristicObjectSplit;
+          HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> heuristicTemporalSplit;
+          const RecalculatePrimRef recalculatePrimRef;
+          const CreateAllocFunc createAlloc;
+          const CreateNodeFunc createNode;
+          const SetNodeFunc setNode;
+          const CreateLeafFunc createLeaf;
+          const ProgressMonitor progressMonitor;
+        };
+
+      template<typename NodeRef,
+        typename RecalculatePrimRef,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename SetNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitorFunc>
+        /*! convenience entry point: instantiates BuilderT for the given callback types and runs it on prims/pinfo */
+        static const BVHNodeRecordMB4D<NodeRef> build(mvector<PrimRefMB>& prims,
+                                                      const PrimInfoMB& pinfo,
+                                                      MemoryMonitorInterface* device,
+                                                      const RecalculatePrimRef recalculatePrimRef,
+                                                      const CreateAllocFunc createAlloc,
+                                                      const CreateNodeFunc createNode,
+                                                      const SetNodeFunc setNode,
+                                                      const CreateLeafFunc createLeaf,
+                                                      const ProgressMonitorFunc progressMonitor,
+                                                      const Settings& settings)
+      {
+          typedef BuilderT<
+            NodeRef,
+            RecalculatePrimRef,
+            decltype(createAlloc()), // allocator type = whatever the createAlloc callback returns
+            CreateAllocFunc,
+            CreateNodeFunc,
+            SetNodeFunc,
+            CreateLeafFunc,
+            ProgressMonitorFunc> Builder;
+
+          Builder builder(device,
+                          recalculatePrimRef,
+                          createAlloc,
+                          createNode,
+                          setNode,
+                          createLeaf,
+                          progressMonitor,
+                          settings);
+
+
+          return builder(prims,pinfo); // returns the node record of the root
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h
new file mode 100644
index 0000000000..e477c313a3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_msmblur_hair.h
@@ -0,0 +1,526 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../geometry/primitive.h"
+#include "../builders/bvh_builder_msmblur.h"
+#include "../builders/heuristic_binning_array_aligned.h"
+#include "../builders/heuristic_binning_array_unaligned.h"
+#include "../builders/heuristic_timesplit_array.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct BVHBuilderHairMSMBlur
+    {
+      /*! settings for msmblur builder (copied into BuilderT::cfg and read by recurse/createLargeLeaf/split) */
+      struct Settings
+      {
+        /*! default settings: branching factor 2, maximum depth 32, SAH block size 2^0, leaves of 1 to 8 primitives */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(8) {}
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf (sets of at most this size are not split by SAH)
+        size_t maxLeafSize;      //!< maximum size of a leaf (larger sets are split further when creating large leaves)
+      };
+
+      struct BuildRecord // one subtree still to be built: its primitive set and the depth of its root
+      {
+      public:
+	__forceinline BuildRecord () {} // note: does not initialize depth
+        /*! record for a subtree rooted at the given depth; prims is default-constructed */
+        __forceinline BuildRecord (size_t depth)
+          : depth(depth) {}
+        /*! record for the primitive set prims whose subtree root sits at the given depth */
+        __forceinline BuildRecord (const SetMB& prims, size_t depth)
+          : depth(depth), prims(prims) {}
+        /*! number of primitives in this record (forwards to prims.size()) */
+        __forceinline size_t size() const {
+          return prims.size();
+        }
+
+      public:
+	size_t depth;       //!< depth of the root of this subtree
+	SetMB prims;        //!< the list of primitives
+      };
+
+      template<typename NodeRef,
+        typename RecalculatePrimRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeMBFunc,
+        typename SetAABBNodeMBFunc,
+        typename CreateOBBNodeMBFunc,
+        typename SetOBBNodeMBFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        class BuilderT
+        {
+          ALIGNED_CLASS_(16);
+
+          static const size_t MAX_BRANCHING_FACTOR =  8;         //!< maximum supported BVH branching factor
+          static const size_t MIN_LARGE_LEAF_LEVELS = 8;         //!< create balanced tree if we are that many levels before the maximum tree depth
+          static const size_t SINGLE_THREADED_THRESHOLD = 4096;  //!< threshold to switch to single threaded build
+
+          typedef BVHNodeRecordMB<NodeRef> NodeRecordMB;
+          typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+
+          typedef FastAllocator::CachedAllocator Allocator;
+          typedef LocalChildListT<BuildRecord,MAX_BRANCHING_FACTOR> LocalChildList;
+
+          typedef HeuristicMBlurTemporalSplit<PrimRefMB,RecalculatePrimRef,MBLUR_NUM_TEMPORAL_BINS> HeuristicTemporal;
+          typedef HeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> HeuristicBinning;
+          typedef UnalignedHeuristicArrayBinningMB<PrimRefMB,MBLUR_NUM_OBJECT_BINS> UnalignedHeuristicBinning;
+
+        public:
+
+          BuilderT (Scene* scene,
+                    const RecalculatePrimRef& recalculatePrimRef,
+                    const CreateAllocFunc& createAlloc,
+                    const CreateAABBNodeMBFunc& createAABBNodeMB,
+                    const SetAABBNodeMBFunc& setAABBNodeMB,
+                    const CreateOBBNodeMBFunc& createOBBNodeMB,
+                    const SetOBBNodeMBFunc& setOBBNodeMB,
+                    const CreateLeafFunc& createLeaf,
+                    const ProgressMonitor& progressMonitor,
+                    const Settings settings)
+            /* the callbacks are kept as references in the members below, so the caller's objects must outlive the builder */
+            : cfg(settings),
+            scene(scene),
+            recalculatePrimRef(recalculatePrimRef),
+            createAlloc(createAlloc),
+            createAABBNodeMB(createAABBNodeMB), setAABBNodeMB(setAABBNodeMB),
+            createOBBNodeMB(createOBBNodeMB), setOBBNodeMB(setOBBNodeMB),
+            createLeaf(createLeaf),
+            progressMonitor(progressMonitor),
+            unalignedHeuristic(scene), // alignedHeuristic is default-constructed
+            temporalSplitHeuristic(scene->device,recalculatePrimRef) {} // dereferences scene, which therefore must not be null
+
+        private:
+
+          /*! checks if all primitives are from the same geometry; an empty set is reported as uniform */
+          __forceinline bool sameGeometry(const SetMB& set)
+          {
+            if (set.size() == 0) return true; // nothing to compare, and prims[set.begin()] must not be read for an empty set
+            mvector<PrimRefMB>& prims = *set.prims;
+            const unsigned int firstGeomID = prims[set.begin()].geomID();
+            for (size_t i=set.begin()+1; i<set.end(); i++)
+              if (prims[i].geomID() != firstGeomID)
+                return false; // found a primitive that belongs to a different geometry
+
+            return true;
+          }
+          
+          /*! splits the set at its middle index; used when the SAH approaches produce no usable split */
+          void splitFallback(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            const size_t first = set.begin();
+            const size_t last  = set.end();
+            const size_t mid   = (first + last)/2;
+            mvector<PrimRefMB>& refs = *set.prims;
+
+            /* accumulate the primitive info of both halves in a single pass over [first,last) */
+            PrimInfoMB leftInfo(empty);
+            PrimInfoMB rightInfo(empty);
+            for (size_t k=first; k<last; k++) {
+              if (k < mid) leftInfo.add_primref(refs[k]);
+              else         rightInfo.add_primref(refs[k]);
+            }
+
+            /* construct both output sets in place; they share the primitive array and time range of the input */
+            new (&lset) SetMB(leftInfo ,set.prims,range<size_t>(first,mid),set.time_range);
+            new (&rset) SetMB(rightInfo,set.prims,range<size_t>(mid,last ),set.time_range);
+          }
+
+          void splitByGeometry(const SetMB& set, SetMB& lset, SetMB& rset)
+          {
+            /* split on geomID: the predicate selects the primitives with the same geomID as the first one */
+            assert(set.size() > 1);
+            const size_t first = set.begin(), last = set.end();
+            const unsigned int pivotGeomID = (*set.prims)[first].geomID();
+            const auto isLeft = [&] ( const PrimRefMB& prim ) { return prim.geomID() == pivotGeomID; };
+            const auto addRef = [ ] ( PrimInfoMB& info, const PrimRefMB& ref ) { info.add_primref(ref); };
+
+            PrimInfoMB leftInfo(empty), rightInfo(empty);
+            const size_t mid = serial_partitioning(set.prims->data(),first,last,leftInfo,rightInfo,isLeft,addRef);
+
+            new (&lset) SetMB(leftInfo ,set.prims,range<size_t>(first,mid),set.time_range);
+            new (&rset) SetMB(rightInfo,set.prims,range<size_t>(mid,last ),set.time_range);
+          }
+
+          /*! creates a large leaf that could be larger than supported by the BVH: such sets are split (by geometry, else at the middle index) and handled recursively */
+          NodeRecordMB4D createLargeLeaf(BuildRecord& current, Allocator alloc)
+          {
+            /* this should never occur but is a fatal error */
+            if (current.depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* special case when directly creating leaf without any splits that could shrink time_range */
+            bool force_split = false;
+            if (current.depth == 1 && current.size() > 0)
+            {
+              BBox1f c = empty;
+              BBox1f p = current.prims.time_range;
+              for (size_t i=current.prims.begin(); i<current.prims.end(); i++) {
+                mvector<PrimRefMB>& prims = *current.prims.prims;
+                c.extend(prims[i].time_range);
+              }
+              /* c is the union of the primitives' time ranges; force a split if it is narrower than the set's time range */
+              force_split = c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* create leaf for few primitives */
+            if (current.size() <= cfg.maxLeafSize && sameGeometry(current.prims) && !force_split)
+              return createLeaf(current.prims,alloc);
+
+            /* fill all children by always splitting the largest one */
+            LocalChildList children(current);
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+
+            do {
+
+              /* find the splittable child that references the most primitives */
+              int bestChild = -1;
+              size_t bestSize = 0;
+              for (unsigned i=0; i<children.size(); i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.maxLeafSize && sameGeometry(children[i].prims) && !force_split)
+                  continue;
+                /* force_split only overrides the leaf test once; it is cleared as soon as one child gets past that test */
+                force_split = false;
+
+                /* remember child with largest size */
+                if (children[i].size() > bestSize) {
+                  bestSize = children[i].size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child */
+              BuildRecord left(current.depth+1);
+              BuildRecord right(current.depth+1);
+              if (!sameGeometry(children[bestChild].prims)) {
+                splitByGeometry(children[bestChild].prims,left.prims,right.prims);
+              } else {
+                splitFallback(children[bestChild].prims,left.prims,right.prims);
+              }
+              children.split(bestChild,left,right,std::unique_ptr<mvector<PrimRefMB>>());
+
+            } while (children.size() < cfg.branchingFactor);
+
+
+            /* detect time_ranges that have shrunken */
+            bool timesplit = false;
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = current.prims.time_range;
+              timesplit |= c.lower > p.lower || c.upper < p.upper;
+            }
+            
+            /* create node */
+            NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,timesplit);
+
+            LBBox3fa bounds = empty;
+            for (size_t i=0; i<children.size(); i++) {
+              values[i] = createLargeLeaf(children[i],alloc);
+              bounds.extend(values[i].lbounds);
+            }
+
+            setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+            /* if a child covers a shorter time range, replace the merged child bounds by bounds recomputed for the whole set */
+            if (timesplit)
+              bounds = current.prims.linearBounds(recalculatePrimRef);
+              
+            return NodeRecordMB4D(node,bounds,current.prims.time_range);
+          }
+
+          /*! performs split: applies the cheapest of aligned object, unaligned object and temporal split (middle-index fallback if no SAH is finite); sets aligned=false for an unaligned split and timesplit=true for a temporal split */
+          std::unique_ptr<mvector<PrimRefMB>> split(const BuildRecord& current, BuildRecord& lrecord, BuildRecord& rrecord, bool& aligned, bool& timesplit)
+          {
+            /* variable to track the SAH of the best splitting approach */
+            float bestSAH = inf;
+            const float leafSAH = current.prims.leafSAH(cfg.logBlockSize);
+
+            /* perform standard binning in aligned space */
+            HeuristicBinning::Split alignedObjectSplit = alignedHeuristic.find(current.prims,cfg.logBlockSize);
+            float alignedObjectSAH = alignedObjectSplit.splitSAH();
+            bestSAH = min(alignedObjectSAH,bestSAH);
+
+            /* perform standard binning in unaligned space, only if the aligned split is not clearly better than a leaf */
+            UnalignedHeuristicBinning::Split unalignedObjectSplit;
+            LinearSpace3fa uspace;
+            float unalignedObjectSAH = inf;
+            if (alignedObjectSAH > 0.7f*leafSAH) {
+              uspace = unalignedHeuristic.computeAlignedSpaceMB(scene,current.prims);
+              const SetMB sset = current.prims.primInfo(recalculatePrimRef,uspace);
+              unalignedObjectSplit = unalignedHeuristic.find(sset,cfg.logBlockSize,uspace);
+              unalignedObjectSAH = 1.3f*unalignedObjectSplit.splitSAH(); // makes unaligned splits more expensive
+              bestSAH = min(unalignedObjectSAH,bestSAH);
+            }
+
+            /* do temporal splits only if previous approaches failed to produce good SAH and the time range is large enough */
+            float temporal_split_sah = inf;
+            typename HeuristicTemporal::Split temporal_split;
+            if (bestSAH > 0.5f*leafSAH) {
+              if (current.prims.time_range.size() > 1.01f/float(current.prims.max_num_time_segments)) {
+                temporal_split = temporalSplitHeuristic.find(current.prims,cfg.logBlockSize);
+                temporal_split_sah = temporal_split.splitSAH();
+                bestSAH = min(temporal_split_sah,bestSAH);
+              }
+            }
+
+            /* perform fallback split if SAH heuristics failed */
+            if (unlikely(!std::isfinite(bestSAH))) {
+              current.prims.deterministic_order();
+              splitFallback(current.prims,lrecord.prims,rrecord.prims);
+            }
+            /* perform aligned split if this is best */
+            else if (likely(bestSAH == alignedObjectSAH)) {
+              alignedHeuristic.split(alignedObjectSplit,current.prims,lrecord.prims,rrecord.prims);
+            }
+            /* perform unaligned split if this is best */
+            else if (likely(bestSAH == unalignedObjectSAH)) {
+              unalignedHeuristic.split(unalignedObjectSplit,uspace,current.prims,lrecord.prims,rrecord.prims);
+              aligned = false;
+            }
+            /* perform temporal split if this is best */
+            else if (likely(bestSAH == temporal_split_sah)) {
+              timesplit = true;
+              return temporalSplitHeuristic.split(temporal_split,current.prims,lrecord.prims,rrecord.prims);
+            }
+            else
+              assert(false);
+            /* all non-temporal splits return an empty pointer */
+            return std::unique_ptr<mvector<PrimRefMB>>();
+          }
+
+          /*! recursive build: splits current into up to cfg.branchingFactor children, creates the matching node type and recurses (in parallel for large sets) */
+          NodeRecordMB4D recurse(BuildRecord& current, Allocator alloc, bool toplevel)
+          {
+            /* get thread local allocator */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* call memory monitor function to signal progress */
+            if (toplevel && current.size() <= SINGLE_THREADED_THRESHOLD)
+              progressMonitor(current.size());
+
+            /* create leaf node */
+            if (current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || current.size() <= cfg.minLeafSize) {
+              current.prims.deterministic_order();
+              return createLargeLeaf(current,alloc);
+            }
+
+            /* fill all children by always splitting the one with the largest surface area */
+            NodeRecordMB4D values[MAX_BRANCHING_FACTOR];
+            LocalChildList children(current);
+            bool aligned = true;
+            bool timesplit = false;
+
+            do {
+
+              /* find best child with largest bounding box area */
+              ssize_t bestChild = -1;
+              float bestArea = neg_inf;
+              for (size_t i=0; i<children.size(); i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].size() <= cfg.minLeafSize)
+                  continue;
+
+                /* remember child with largest area */
+                const float A = children[i].prims.halfArea();
+                if (A > bestArea) {
+                  bestArea = A; // reuse the value computed above instead of calling halfArea() a second time
+                  bestChild = i;
+                }
+              }
+              if (bestChild == -1) break;
+
+              /*! split best child into left and right child */
+              BuildRecord left(current.depth+1);
+              BuildRecord right(current.depth+1);
+              std::unique_ptr<mvector<PrimRefMB>> new_vector = split(children[bestChild],left,right,aligned,timesplit);
+              children.split(bestChild,left,right,std::move(new_vector));
+
+            } while (children.size() < cfg.branchingFactor);
+
+            /* detect time_ranges that have shrunken */
+            for (size_t i=0; i<children.size(); i++) {
+              const BBox1f c = children[i].prims.time_range;
+              const BBox1f p = current.prims.time_range;
+              timesplit |= c.lower > p.lower || c.upper < p.upper;
+            }
+
+            /* create time split node */
+            if (timesplit)
+            {
+              const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true);
+
+              /* spawn tasks or ... */
+              if (current.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      values[i] = recurse(children[i],nullptr,true);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequential */
+              else {
+                for (size_t i=0; i<children.size(); i++) {
+                  values[i] = recurse(children[i],alloc,false);
+                }
+              }
+
+              setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+
+              const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef);
+              return NodeRecordMB4D(node,bounds,current.prims.time_range);
+            }
+
+            /* create aligned node */
+            else if (aligned)
+            {
+              const NodeRef node = createAABBNodeMB(children.children.data(),children.numChildren,alloc,true);
+
+              /* spawn tasks or ... */
+              if (current.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                LBBox3fa cbounds[MAX_BRANCHING_FACTOR];
+                parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      values[i] = recurse(children[i],nullptr,true);
+                      cbounds[i] = values[i].lbounds;
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+
+                LBBox3fa bounds = empty;
+                for (size_t i=0; i<children.size(); i++)
+                  bounds.extend(cbounds[i]);
+                setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+                return NodeRecordMB4D(node,bounds,current.prims.time_range);
+              }
+              /* ... continue sequentially */
+              else
+              {
+                LBBox3fa bounds = empty;
+                for (size_t i=0; i<children.size(); i++) {
+                  values[i] = recurse(children[i],alloc,false);
+                  bounds.extend(values[i].lbounds);
+                }
+                setAABBNodeMB(current,children.children.data(),node,values,children.numChildren);
+                return NodeRecordMB4D(node,bounds,current.prims.time_range);
+              }
+            }
+
+            /* create unaligned node */
+            else
+            {
+              const NodeRef node = createOBBNodeMB(alloc);
+
+              /* spawn tasks or ... */
+              if (current.size() > SINGLE_THREADED_THRESHOLD)
+              {
+                parallel_for(size_t(0), children.size(), [&] (const range<size_t>& r) {
+                    for (size_t i=r.begin(); i<r.end(); i++) {
+                      const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims);
+                      const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space);
+                      const auto child = recurse(children[i],nullptr,true);
+                      setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range);
+                      _mm_mfence(); // to allow non-temporal stores during build
+                    }
+                  });
+              }
+              /* ... continue sequentially */
+              else
+              {
+                for (size_t i=0; i<children.size(); i++) {
+                  const LinearSpace3fa space = unalignedHeuristic.computeAlignedSpaceMB(scene,children[i].prims);
+                  const LBBox3fa lbounds = children[i].prims.linearBounds(recalculatePrimRef,space);
+                  const auto child = recurse(children[i],alloc,false);
+                  setOBBNodeMB(node,i,child.ref,space,lbounds,children[i].prims.time_range);
+                }
+              }
+
+              const LBBox3fa bounds = current.prims.linearBounds(recalculatePrimRef);
+              return NodeRecordMB4D(node,bounds,current.prims.time_range);
+            }
+          }
+
+        public:
+
+          /*! entry point into builder: builds the tree over prims (summarized by pinfo) and returns the root record */
+          NodeRecordMB4D operator() (mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo)
+          {
+            BuildRecord rootRecord(SetMB(pinfo,&prims),1); // the root subtree starts at depth 1
+            NodeRecordMB4D rootNode = recurse(rootRecord,nullptr,true);
+            _mm_mfence(); // to allow non-temporal stores during build
+            return rootNode;
+          }
+
+        private:
+          Settings cfg;
+          Scene* scene;
+          const RecalculatePrimRef& recalculatePrimRef;
+          const CreateAllocFunc& createAlloc;
+          const CreateAABBNodeMBFunc& createAABBNodeMB;
+          const SetAABBNodeMBFunc& setAABBNodeMB;
+          const CreateOBBNodeMBFunc& createOBBNodeMB;
+          const SetOBBNodeMBFunc& setOBBNodeMB;
+          const CreateLeafFunc& createLeaf;
+          const ProgressMonitor& progressMonitor;
+
+        private:
+          HeuristicBinning alignedHeuristic;
+          UnalignedHeuristicBinning unalignedHeuristic;
+          HeuristicTemporal temporalSplitHeuristic;
+        };
+
+      template<typename NodeRef,
+        typename RecalculatePrimRef,
+        typename CreateAllocFunc,
+        typename CreateAABBNodeMBFunc,
+        typename SetAABBNodeMBFunc,
+        typename CreateOBBNodeMBFunc,
+        typename SetOBBNodeMBFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+        /*! convenience wrapper: instantiates BuilderT for the given callback types and runs it on prims */
+        static BVHNodeRecordMB4D<NodeRef> build (Scene* scene, mvector<PrimRefMB>& prims, const PrimInfoMB& pinfo,
+                                               const RecalculatePrimRef& recalculatePrimRef,
+                                               const CreateAllocFunc& createAlloc,
+                                               const CreateAABBNodeMBFunc& createAABBNodeMB,
+                                               const SetAABBNodeMBFunc& setAABBNodeMB,
+                                               const CreateOBBNodeMBFunc& createOBBNodeMB,
+                                               const SetOBBNodeMBFunc& setOBBNodeMB,
+                                               const CreateLeafFunc& createLeaf,
+                                               const ProgressMonitor& progressMonitor,
+                                               const Settings settings)
+        {
+          /* concrete builder type for this combination of callbacks */
+          using HairBuilder = BuilderT<NodeRef,RecalculatePrimRef,CreateAllocFunc,
+                                       CreateAABBNodeMBFunc,SetAABBNodeMBFunc,
+                                       CreateOBBNodeMBFunc,SetOBBNodeMBFunc,
+                                       CreateLeafFunc,ProgressMonitor>;
+          HairBuilder instance(scene,recalculatePrimRef,createAlloc,
+                               createAABBNodeMB,setAABBNodeMB,
+                               createOBBNodeMB,setOBBNodeMB,
+                               createLeaf,progressMonitor,settings);
+
+          return instance(prims,pinfo);
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h
new file mode 100644
index 0000000000..3f7e678a10
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/bvh_builder_sah.h
@@ -0,0 +1,669 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning_array_aligned.h"
+#include "heuristic_spatial_array.h"
+#include "heuristic_openmerge_array.h"
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+#  define NUM_OBJECT_BINS 16
+#  define NUM_SPATIAL_BINS 16
+#else
+#  define NUM_OBJECT_BINS 32
+#  define NUM_SPATIAL_BINS 16
+#endif
+
+namespace embree
+{
+  namespace isa
+  {
+    MAYBE_UNUSED static const float travCost = 1.0f;
+    MAYBE_UNUSED static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024;
+
+    struct GeneralBVHBuilder
+    {
+      static const size_t MAX_BRANCHING_FACTOR = 16;       //!< maximum supported BVH branching factor
+      static const size_t MIN_LARGE_LEAF_LEVELS = 8;       //!< create balanced tree if we are that many levels before the maximum tree depth
+      
+
+      /*! settings for SAH builder */
+      struct Settings
+      {
+        /*! default settings */
+        Settings ()
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7),
+          travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf) {}
+
+        /*! initialize settings from API settings: starts from the defaults and overrides every field present in the arguments */
+        Settings (const RTCBuildArguments& settings)
+        : branchingFactor(2), maxDepth(32), logBlockSize(0), minLeafSize(1), maxLeafSize(7),
+          travCost(1.0f), intCost(1.0f), singleThreadThreshold(1024), primrefarrayalloc(inf)
+        {
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxBranchingFactor)) branchingFactor = settings.maxBranchingFactor;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxDepth          )) maxDepth        = settings.maxDepth;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,sahBlockSize      )) logBlockSize    = bsr(static_cast<size_t>(settings.sahBlockSize));
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,minLeafSize       )) minLeafSize     = settings.minLeafSize;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,maxLeafSize       )) maxLeafSize     = settings.maxLeafSize;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,traversalCost     )) travCost        = settings.traversalCost;
+          if (RTC_BUILD_ARGUMENTS_HAS(settings,intersectionCost  )) intCost         = settings.intersectionCost;
+
+          minLeafSize = min(minLeafSize,maxLeafSize);
+        }
+        /*! explicit settings; branching factor and maximum depth keep their defaults, minLeafSize is clamped to maxLeafSize */
+        Settings (size_t sahBlockSize, size_t minLeafSize, size_t maxLeafSize, float travCost, float intCost, size_t singleThreadThreshold, size_t primrefarrayalloc = inf)
+        : branchingFactor(2), maxDepth(32), logBlockSize(bsr(sahBlockSize)), minLeafSize(minLeafSize), maxLeafSize(maxLeafSize),
+          travCost(travCost), intCost(intCost), singleThreadThreshold(singleThreadThreshold), primrefarrayalloc(primrefarrayalloc)
+        {
+          this->minLeafSize = min(minLeafSize,maxLeafSize); // the parameter shadows the member: assign through this-> so the member is clamped
+        }
+
+      public:
+        size_t branchingFactor;  //!< branching factor of BVH to build
+        size_t maxDepth;         //!< maximum depth of BVH to build
+        size_t logBlockSize;     //!< log2 of blocksize for SAH heuristic
+        size_t minLeafSize;      //!< minimum size of a leaf
+        size_t maxLeafSize;      //!< maximum size of a leaf
+        float travCost;          //!< estimated cost of one traversal step
+        float intCost;           //!< estimated cost of one primitive intersection
+        size_t singleThreadThreshold; //!< threshold when we switch to single threaded build
+        size_t primrefarrayalloc;  //!< builder uses prim ref array to allocate nodes and leaves when a subtree of that size is finished
+      };
+
+      /*! recursive state of builder: the primitive set of one subtree, the depth of its root and an allocation-barrier flag */
+      template<typename Set, typename Split>
+        struct BuildRecordT
+        {
+        public:
+          __forceinline BuildRecordT () {} // note: does not initialize depth and alloc_barrier
+          /*! record with an empty primitive set at the given depth */
+          __forceinline BuildRecordT (size_t depth)
+            : depth(depth), alloc_barrier(false), prims(empty) {}
+          /*! record for the primitive set prims at the given depth */
+          __forceinline BuildRecordT (size_t depth, const Set& prims)
+            : depth(depth), alloc_barrier(false), prims(prims) {}
+          /*! geometry bounds stored in the primitive set */
+          __forceinline BBox3fa bounds() const { return prims.geomBounds; }
+          /*! records are ordered by their number of primitives */
+          __forceinline friend bool operator< (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() < b.prims.size(); }
+          __forceinline friend bool operator> (const BuildRecordT& a, const BuildRecordT& b) { return a.prims.size() > b.prims.size();  }
+          /*! number of primitives in this record */
+          __forceinline size_t size() const { return prims.size(); }
+
+        public:
+          size_t depth;       //!< Depth of the root of this subtree.
+          bool alloc_barrier; //!< barrier used to reuse primref-array blocks to allocate nodes
+          Set prims;          //!< The list of primitives.
+        };
+
+      template<typename PrimRef, typename Set>
+      struct DefaultCanCreateLeafFunc // default leaf predicate: accepts every primitive set
+      {
+        __forceinline bool operator()(const PrimRef*, const Set&) const { return true; } // arguments are ignored
+      };
+
+      template<typename PrimRef, typename Set>
+      struct DefaultCanCreateLeafSplitFunc // default split callback: does nothing
+      {
+        __forceinline void operator()(PrimRef*, const Set&, Set&, Set&) const { } // leaves both output sets untouched
+      };
+
+      template<typename BuildRecord,
+        typename Heuristic,
+        typename Set,
+        typename PrimRef,
+        typename ReductionTy,
+        typename Allocator,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename CanCreateLeafFunc,
+        typename CanCreateLeafSplitFunc,
+        typename ProgressMonitor>
+
+        class BuilderT
+        {
+          friend struct GeneralBVHBuilder;
+
+          BuilderT (PrimRef* prims, // constructor is private; GeneralBVHBuilder is declared a friend above
+                    Heuristic& heuristic,
+                    const CreateAllocFunc& createAlloc,
+                    const CreateNodeFunc& createNode,
+                    const UpdateNodeFunc& updateNode,
+                    const CreateLeafFunc& createLeaf,
+                    const CanCreateLeafFunc& canCreateLeaf,
+                    const CanCreateLeafSplitFunc& canCreateLeafSplit,
+                    const ProgressMonitor& progressMonitor,
+                    const Settings& settings) :
+                    cfg(settings),
+                    prims(prims),
+                    heuristic(heuristic),
+                    createAlloc(createAlloc),
+                    createNode(createNode),
+                    updateNode(updateNode),
+                    createLeaf(createLeaf),
+                    canCreateLeaf(canCreateLeaf),
+                    canCreateLeafSplit(canCreateLeafSplit),
+                    progressMonitor(progressMonitor)
+          {
+            if (cfg.branchingFactor > MAX_BRANCHING_FACTOR) // the fixed-size child arrays in createLargeLeaf hold MAX_BRANCHING_FACTOR entries
+              throw_RTCError(RTC_ERROR_UNKNOWN,"bvh_builder: branching factor too large");
+          }
+
+          const ReductionTy createLargeLeaf(const BuildRecord& current, Allocator alloc) // builds the subtree for current without SAH: one leaf if allowed, else splits and recurses
+          {
+            /* this should never occur but is a fatal error */
+            if (current.depth > cfg.maxDepth)
+              throw_RTCError(RTC_ERROR_UNKNOWN,"depth limit reached");
+
+            /* create leaf for few primitives */
+            if (current.prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,current.prims))
+              return createLeaf(prims,current.prims,alloc);
+
+            /* fill all children by always splitting the largest one */
+            ReductionTy values[MAX_BRANCHING_FACTOR];
+            BuildRecord children[MAX_BRANCHING_FACTOR];
+            size_t numChildren = 1;
+            children[0] = current;
+            do {
+
+              /* find the splittable child that references the most primitives ((size_t)-1 means none found) */
+              size_t bestChild = -1;
+              size_t bestSize = 0;
+              for (size_t i=0; i<numChildren; i++)
+              {
+                /* ignore leaves as they cannot get split */
+                if (children[i].prims.size() <= cfg.maxLeafSize && canCreateLeaf(prims,children[i].prims))
+                  continue;
+
+                /* remember child with largest size */
+                if (children[i].prims.size() > bestSize) {
+                  bestSize = children[i].prims.size();
+                  bestChild = i;
+                }
+              }
+              if (bestChild == (size_t)-1) break;
+
+              /*! split best child into left and right child */
+              BuildRecord left(current.depth+1);
+              BuildRecord right(current.depth+1);
+              if (!canCreateLeaf(prims,children[bestChild].prims)) {
+                canCreateLeafSplit(prims,children[bestChild].prims,left.prims,right.prims);
+              } else {
+                heuristic.splitFallback(children[bestChild].prims,left.prims,right.prims);
+              }
+
+              /* add new children left and right: the last child moves into the freed slot, left and right go to the end */
+              children[bestChild] = children[numChildren-1];
+              children[numChildren-1] = left;
+              children[numChildren+0] = right;
+              numChildren++;
+
+            } while (numChildren < cfg.branchingFactor);
+
+            /* set barrier for primrefarrayalloc */
+            if (unlikely(current.size() > cfg.primrefarrayalloc))
+              for (size_t i=0; i<numChildren; i++)
+                children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc;
+
+            /* create node */
+            auto node = createNode(children,numChildren,alloc);
+
+            /* recurse into each child  and perform reduction */
+            for (size_t i=0; i<numChildren; i++)
+              values[i] = createLargeLeaf(children[i],alloc);
+
+            /* perform reduction */
+            return updateNode(current,children,node,values,numChildren);
+          }
+
+          const ReductionTy recurse(BuildRecord& current, Allocator alloc, bool toplevel) // SAH build step: leaf subtree or inner node with up to cfg.branchingFactor children
+          {
+            /* get thread local allocator if the caller did not pass one */
+            if (!alloc)
+              alloc = createAlloc();
+
+            /* signal progress once for each top-level subtree that is small enough to be built without spawning tasks */
+            if (toplevel && current.size() <= cfg.singleThreadThreshold)
+              progressMonitor(current.size());
+
+            /*! find best split */
+            auto split = heuristic.find(current.prims,cfg.logBlockSize);
+
+            /*! compute leaf and split cost */
+            const float leafSAH  = cfg.intCost*current.prims.leafSAH(cfg.logBlockSize);
+            const float splitSAH = cfg.travCost*halfArea(current.prims.geomBounds)+cfg.intCost*split.splitSAH();
+            assert((current.prims.size() == 0) || ((leafSAH >= 0) && (splitSAH >= 0)));
+
+            /*! create a leaf node when threshold reached or SAH tells us to stop */
+            if (current.prims.size() <= cfg.minLeafSize || current.depth+MIN_LARGE_LEAF_LEVELS >= cfg.maxDepth || (current.prims.size() <= cfg.maxLeafSize && leafSAH <= splitSAH)) {
+              heuristic.deterministic_order(current.prims);
+              return createLargeLeaf(current,alloc);
+            }
+
+            /*! perform initial split */
+            Set lprims,rprims;
+            heuristic.split(split,current.prims,lprims,rprims);
+	    
+            /*! initialize child list with initial split */
+            ReductionTy values[MAX_BRANCHING_FACTOR];
+            BuildRecord children[MAX_BRANCHING_FACTOR];
+            children[0] = BuildRecord(current.depth+1,lprims);
+            children[1] = BuildRecord(current.depth+1,rprims);
+            size_t numChildren = 2;
+
+            /*! split until node is full or no child is left that may be split */
+            while (numChildren < cfg.branchingFactor)
+            {
+              /*! find best child to split */
+              float bestArea = neg_inf;
+              ssize_t bestChild = -1;
+              for (size_t i=0; i<numChildren; i++)
+              {
+                /* skip children at or below the minimum leaf size */
+                if (children[i].prims.size() <= cfg.minLeafSize) continue;
+
+                /* find child with largest surface area */
+                if (halfArea(children[i].prims.geomBounds) > bestArea) {
+                  bestChild = i;
+                  bestArea = halfArea(children[i].prims.geomBounds);
+                }
+              }
+              if (bestChild == -1) break; // no child left to split
+
+              /* split that child: the left part stays in its slot, the right part is appended */
+              BuildRecord& brecord = children[bestChild];
+              BuildRecord lrecord(current.depth+1);
+              BuildRecord rrecord(current.depth+1);
+              auto split = heuristic.find(brecord.prims,cfg.logBlockSize);
+              heuristic.split(split,brecord.prims,lrecord.prims,rrecord.prims);
+              children[bestChild  ] = lrecord;
+              children[numChildren] = rrecord;
+              numChildren++;
+            }
+
+            /* set barrier for primrefarrayalloc: flag children at or below the threshold when this record is above it */
+            if (unlikely(current.size() > cfg.primrefarrayalloc))
+              for (size_t i=0; i<numChildren; i++)
+                children[i].alloc_barrier = children[i].size() <= cfg.primrefarrayalloc;
+
+            /* sort buildrecords for faster shadow ray traversal */
+            std::sort(&children[0],&children[numChildren],std::greater<BuildRecord>());
+
+            /*! create an inner node */
+            auto node = createNode(children,numChildren,alloc);
+
+            /* build children in parallel when this record is above the single thread threshold */
+            if (current.size() > cfg.singleThreadThreshold)
+            {
+              /*! parallel_for is faster than spawning sub-tasks */
+              parallel_for(size_t(0), numChildren, [&] (const range<size_t>& r) { // FIXME: no range here
+                  for (size_t i=r.begin(); i<r.end(); i++) {
+                    values[i] = recurse(children[i],nullptr,true); // nullptr: the callee creates its own allocator
+                    _mm_mfence(); // to allow non-temporal stores during build
+                  }
+                });
+
+              return updateNode(current,children,node,values,numChildren);
+            }
+            /* recurse into each child sequentially, sharing this allocator */
+            else
+            {
+              for (size_t i=0; i<numChildren; i++)
+                values[i] = recurse(children[i],alloc,false);
+
+              return updateNode(current,children,node,values,numChildren);
+            }
+          }
+
+        private:
+          Settings cfg;
+          PrimRef* prims;
+          Heuristic& heuristic;
+          const CreateAllocFunc& createAlloc;
+          const CreateNodeFunc& createNode;
+          const UpdateNodeFunc& updateNode;
+          const CreateLeafFunc& createLeaf;
+          const CanCreateLeafFunc& canCreateLeaf;
+          const CanCreateLeafSplitFunc& canCreateLeafSplit;
+          const ProgressMonitor& progressMonitor;
+        };
+
+      template<
+      typename ReductionTy,
+        typename Heuristic,
+        typename Set,
+        typename PrimRef,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+        /*! builds the hierarchy over 'set' using DefaultCanCreateLeafFunc / DefaultCanCreateLeafSplitFunc and returns the reduction value of the root */
+        __noinline static ReductionTy build(Heuristic& heuristic,
+                                            PrimRef* prims,
+                                            const Set& set,
+                                            CreateAllocFunc createAlloc,
+                                            CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                            const CreateLeafFunc& createLeaf,
+                                            const ProgressMonitor& progressMonitor,
+                                            const Settings& settings)
+      {
+        typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+
+        typedef BuilderT<
+          BuildRecord,
+          Heuristic,
+          Set,
+          PrimRef,
+          ReductionTy,
+          decltype(createAlloc()),
+          CreateAllocFunc,
+          CreateNodeFunc,
+          UpdateNodeFunc,
+          CreateLeafFunc,
+          DefaultCanCreateLeafFunc<PrimRef, Set>,
+          DefaultCanCreateLeafSplitFunc<PrimRef, Set>,
+          ProgressMonitor> Builder;
+
+        /* instantiate builder with default constructed leaf predicates */
+        Builder builder(prims,
+                        heuristic,
+                        createAlloc,
+                        createNode,
+                        updateNode,
+                        createLeaf,
+                        DefaultCanCreateLeafFunc<PrimRef, Set>(),
+                        DefaultCanCreateLeafSplitFunc<PrimRef, Set>(),
+                        progressMonitor,
+                        settings);
+
+        /* build hierarchy starting from a root record at depth 1 */
+        BuildRecord record(1,set);
+        const ReductionTy root = builder.recurse(record,nullptr,true);
+        _mm_mfence(); // to allow non-temporal stores during build
+        return root;
+      }
+
+      template<
+      typename ReductionTy,
+        typename Heuristic,
+        typename Set,
+        typename PrimRef,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename CanCreateLeafFunc,
+        typename CanCreateLeafSplitFunc,
+        typename ProgressMonitor>
+        /*! builds the hierarchy over 'set' using the caller supplied canCreateLeaf / canCreateLeafSplit and returns the reduction value of the root */
+        __noinline static ReductionTy build(Heuristic& heuristic,
+                                            PrimRef* prims,
+                                            const Set& set,
+                                            CreateAllocFunc createAlloc,
+                                            CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                            const CreateLeafFunc& createLeaf,
+                                            const CanCreateLeafFunc& canCreateLeaf,
+                                            const CanCreateLeafSplitFunc& canCreateLeafSplit,
+                                            const ProgressMonitor& progressMonitor,
+                                            const Settings& settings)
+      {
+        typedef BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+
+        typedef BuilderT<
+          BuildRecord,
+          Heuristic,
+          Set,
+          PrimRef,
+          ReductionTy,
+          decltype(createAlloc()),
+          CreateAllocFunc,
+          CreateNodeFunc,
+          UpdateNodeFunc,
+          CreateLeafFunc,
+          CanCreateLeafFunc,
+          CanCreateLeafSplitFunc,
+          ProgressMonitor> Builder;
+
+        /* instantiate builder */
+        Builder builder(prims,
+                        heuristic,
+                        createAlloc,
+                        createNode,
+                        updateNode,
+                        createLeaf,
+                        canCreateLeaf,
+                        canCreateLeafSplit,
+                        progressMonitor,
+                        settings);
+
+        /* build hierarchy starting from a root record at depth 1 */
+        BuildRecord record(1,set);
+        const ReductionTy root = builder.recurse(record,nullptr,true);
+        _mm_mfence(); // to allow non-temporal stores during build
+        return root;
+      }
+    };
+
+    /* SAH builder that operates on an array of BuildRecords; front end of GeneralBVHBuilder using HeuristicArrayBinningSAH over a PrimInfoRange */
+    struct BVHBuilderBinnedSAH
+    {
+      typedef PrimInfoRange Set;
+      typedef HeuristicArrayBinningSAH<PrimRef,NUM_OBJECT_BINS> Heuristic;
+      typedef GeneralBVHBuilder::BuildRecordT<Set,typename Heuristic::Split> BuildRecord;
+      typedef GeneralBVHBuilder::Settings Settings;
+
+      /*! special builder that propagates reduction over the tree; forwards to the GeneralBVHBuilder::build overload without leaf predicates */
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename ProgressMonitor>
+
+        static ReductionTy build(CreateAllocFunc createAlloc,
+                                 CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                 const CreateLeafFunc& createLeaf,
+                                 const ProgressMonitor& progressMonitor,
+                                 PrimRef* prims, const PrimInfo& pinfo,
+                                 const Settings& settings)
+      {
+        Heuristic heuristic(prims);
+        return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+          heuristic,
+          prims,
+          PrimInfoRange(0,pinfo.size(),pinfo), // root set covers prims[0, pinfo.size())
+          createAlloc,
+          createNode,
+          updateNode,
+          createLeaf,
+          progressMonitor,
+          settings);
+      }
+
+      /*! special builder that propagates reduction over the tree; additionally forwards caller supplied canCreateLeaf and canCreateLeafSplit */
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename CanCreateLeafFunc,
+        typename CanCreateLeafSplitFunc,
+        typename ProgressMonitor>
+
+        static ReductionTy build(CreateAllocFunc createAlloc,
+                                 CreateNodeFunc createNode, UpdateNodeFunc updateNode,
+                                 const CreateLeafFunc& createLeaf,
+                                 const CanCreateLeafFunc& canCreateLeaf,
+                                 const CanCreateLeafSplitFunc& canCreateLeafSplit,
+                                 const ProgressMonitor& progressMonitor,
+                                 PrimRef* prims, const PrimInfo& pinfo,
+                                 const Settings& settings)
+      {
+        Heuristic heuristic(prims);
+        return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+          heuristic,
+          prims,
+          PrimInfoRange(0,pinfo.size(),pinfo), // root set covers prims[0, pinfo.size())
+          createAlloc,
+          createNode,
+          updateNode,
+          createLeaf,
+          canCreateLeaf,
+          canCreateLeafSplit,
+          progressMonitor,
+          settings);
+      }
+    };
+
+    /* Spatial SAH builder that operates on a double-buffered array of BuildRecords */
+    struct BVHBuilderBinnedFastSpatialSAH
+    {
+      typedef PrimInfoExtRange Set;
+      typedef Split2<BinSplit<NUM_OBJECT_BINS>,SpatialBinSplit<NUM_SPATIAL_BINS> > Split;
+      typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord;
+      typedef GeneralBVHBuilder::Settings Settings;
+
+      static const unsigned int GEOMID_MASK = 0xFFFFFFFF >>     RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS; //!< all bits below the reserved top bits
+      static const unsigned int SPLITS_MASK = 0xFFFFFFFF << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); //!< the reserved top bits that hold the split budget
+
+      template<typename ReductionTy, typename UserCreateLeaf>
+      struct CreateLeafExt
+      {
+        __forceinline CreateLeafExt (const UserCreateLeaf userCreateLeaf)
+          : userCreateLeaf(userCreateLeaf) {}
+        // masks lower.u of every primitive in range with GEOMID_MASK, then forwards to userCreateLeaf
+        // __noinline is workaround for ICC2016 compiler bug
+        template<typename Allocator>
+        __noinline ReductionTy operator() (PrimRef* prims, const range<size_t>& range, Allocator alloc) const
+        {
+          for (size_t i=range.begin(); i<range.end(); i++)
+            prims[i].lower.u &= GEOMID_MASK;
+
+          return userCreateLeaf(prims,range,alloc);
+        }
+
+        const UserCreateLeaf userCreateLeaf;
+      };
+
+      /*! special builder that propagates reduction over the tree; tags each primitive with a split budget before building */
+      template<
+      typename ReductionTy,
+        typename CreateAllocFunc,
+        typename CreateNodeFunc,
+        typename UpdateNodeFunc,
+        typename CreateLeafFunc,
+        typename SplitPrimitiveFunc,
+        typename ProgressMonitor>
+
+        static ReductionTy build(CreateAllocFunc createAlloc,
+                                 CreateNodeFunc createNode,
+                                 UpdateNodeFunc updateNode,
+                                 const CreateLeafFunc& createLeaf,
+                                 SplitPrimitiveFunc splitPrimitive,
+                                 ProgressMonitor progressMonitor,
+                                 PrimRef* prims,
+                                 const size_t extSize,
+                                 const PrimInfo& pinfo,
+                                 const Settings& settings)
+        {
+          typedef HeuristicArraySpatialSAH<SplitPrimitiveFunc,PrimRef,NUM_OBJECT_BINS,NUM_SPATIAL_BINS> Heuristic;
+          Heuristic heuristic(splitPrimitive,prims,pinfo);
+
+          /* calculate total surface area */ // FIXME: this sum is not deterministic
+          const float A = (float) parallel_reduce(size_t(0),pinfo.size(),0.0, [&] (const range<size_t>& r) -> double {
+
+              double A = 0.0f; // partial sum in double, shadows the outer A
+              for (size_t i=r.begin(); i<r.end(); i++)
+              {
+                PrimRef& prim = prims[i];
+                A += area(prim.bounds());
+              }
+              return A;
+            },std::plus<double>());
+
+
+          /* calculate maximum number of spatial splits per primitive */
+          const unsigned int maxSplits = ((size_t)1 << RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)-1;
+          const float f = 10.0f;
+
+          const float invA = 1.0f / A; // NOTE(review): A is 0 if every primitive has zero-area bounds (or there are none) - confirm callers exclude that
+          parallel_for( size_t(0), pinfo.size(), [&](const range<size_t>& r) {
+
+              for (size_t i=r.begin(); i<r.end(); i++)
+              {
+                PrimRef& prim = prims[i];
+                assert((prim.geomID() & SPLITS_MASK) == 0);
+                // FIXME: is there a better general heuristic ?
+                const float nf = ceilf(f*pinfo.size()*area(prim.bounds()) * invA); // f times the primitive's area relative to the average area
+                unsigned int n = 4+min((int)maxSplits-4, max(1, (int)(nf))); // budget capped so that n does not exceed maxSplits
+                prim.lower.u |= n << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS); // store budget in the reserved top bits
+              }
+            });
+
+          return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,PrimRef>(
+            heuristic,
+            prims,
+            PrimInfoExtRange(0,pinfo.size(),extSize,pinfo),
+            createAlloc,
+            createNode,
+            updateNode,
+            CreateLeafExt<ReductionTy,CreateLeafFunc>(createLeaf), // strips the budget bits again before the user's createLeaf runs
+            progressMonitor,
+            settings);
+        }
+    };
+
+    /* Open/Merge SAH builder that operates on an array of BuildRecords */
+    struct BVHBuilderBinnedOpenMergeSAH
+    {
+      static const size_t NUM_OBJECT_BINS_HQ = 32; //!< bin count used for Split and for the open/merge heuristic
+      typedef PrimInfoExtRange Set;
+      typedef BinSplit<NUM_OBJECT_BINS_HQ> Split;
+      typedef GeneralBVHBuilder::BuildRecordT<Set,Split> BuildRecord;
+      typedef GeneralBVHBuilder::Settings Settings;
+      
+      /*! special builder that propagates reduction over the tree; runs GeneralBVHBuilder with HeuristicArrayOpenMergeSAH over BuildRef entries */
+      template<
+        typename ReductionTy, 
+        typename BuildRef,
+        typename CreateAllocFunc, 
+        typename CreateNodeFunc, 
+        typename UpdateNodeFunc, 
+        typename CreateLeafFunc, 
+        typename NodeOpenerFunc, 
+        typename ProgressMonitor>
+        
+        static ReductionTy build(CreateAllocFunc createAlloc, 
+                                 CreateNodeFunc createNode, 
+                                 UpdateNodeFunc updateNode, 
+                                 const CreateLeafFunc& createLeaf, 
+                                 NodeOpenerFunc nodeOpenerFunc,
+                                 ProgressMonitor progressMonitor,
+                                 BuildRef* prims, 
+                                 const size_t extSize,
+                                 const PrimInfo& pinfo, 
+                                 const Settings& settings)
+      {
+        typedef HeuristicArrayOpenMergeSAH<NodeOpenerFunc,BuildRef,NUM_OBJECT_BINS_HQ> Heuristic;
+        Heuristic heuristic(nodeOpenerFunc,prims,settings.branchingFactor);
+
+        return GeneralBVHBuilder::build<ReductionTy,Heuristic,Set,BuildRef>(
+          heuristic,
+          prims,
+          PrimInfoExtRange(0,pinfo.size(),extSize,pinfo),
+          createAlloc,
+          createNode,
+          updateNode,
+          createLeaf,
+          progressMonitor,
+          settings);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h
new file mode 100644
index 0000000000..a4d3b68e46
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning.h
@@ -0,0 +1,972 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "priminfo.h"
+#include "../../common/algorithms/parallel_reduce.h"
+#include "../../common/algorithms/parallel_partition.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! linear mapping of positions into bin indices, one scale/offset per dimension */
+    template<size_t BINS>
+      struct BinMapping
+      {
+      public:
+        __forceinline BinMapping() {}
+        
+        /*! calculates the mapping for N primitives: min(BINS, 4 + 0.05*N) bins over centBounds */
+        __forceinline BinMapping(size_t N, const BBox3fa& centBounds) 
+        {
+          num = min(BINS,size_t(4.0f + 0.05f*N));
+          assert(num >= 1);
+          const vfloat4 eps = 1E-34f;
+          const vfloat4 diag = max(eps, (vfloat4) centBounds.size());
+          scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f)); // scale is 0 where the extent does not exceed eps, see invalid()
+          ofs  = (vfloat4) centBounds.lower;
+        }
+
+        /*! calculates the mapping using all BINS bins over centBounds */
+        __forceinline BinMapping(const BBox3fa& centBounds) 
+        {
+          num = BINS;
+          const vfloat4 eps = 1E-34f;
+          const vfloat4 diag = max(eps, (vfloat4) centBounds.size());
+          scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+          ofs  = (vfloat4) centBounds.lower;
+        }
+
+        /*! calculates the mapping from pinfo.size() and pinfo.centBounds */
+        template<typename PrimInfo>
+        __forceinline BinMapping(const PrimInfo& pinfo) 
+        {
+          const vfloat4 eps = 1E-34f;
+          num = min(BINS,size_t(4.0f + 0.05f*pinfo.size()));
+          const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size());
+          scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+          ofs  = (vfloat4) pinfo.centBounds.lower;
+        }
+
+        /*! returns number of bins */
+        __forceinline size_t size() const { return num; }
+        
+        /*! binning that checks the bin range via assert only; the clamping variant below is compiled out */
+        __forceinline Vec3ia bin(const Vec3fa& p) const 
+        {
+          const vint4 i = floori((vfloat4(p)-ofs)*scale);
+#if 1
+          assert(i[0] >= 0 && (size_t)i[0] < num); 
+          assert(i[1] >= 0 && (size_t)i[1] < num);
+          assert(i[2] >= 0 && (size_t)i[2] < num);
+          return Vec3ia(i);
+#else
+          return Vec3ia(clamp(i,vint4(0),vint4(num-1)));
+#endif
+        }
+
+        /*! faster but unsafe binning: no range check at all */
+        __forceinline Vec3ia bin_unsafe(const Vec3fa& p) const {
+          return Vec3ia(floori((vfloat4(p)-ofs)*scale));
+        }
+
+        /*! faster but unsafe binning of a primitive's binCenter() */
+        template<typename PrimRef>
+        __forceinline Vec3ia bin_unsafe(const PrimRef& p) const {
+          return bin_unsafe(p.binCenter());
+        }
+
+        /*! faster but unsafe binning of the center computed by binBoundsAndCenter */
+        template<typename PrimRef, typename BinBoundsAndCenter>
+        __forceinline Vec3ia bin_unsafe(const PrimRef& p, const BinBoundsAndCenter& binBoundsAndCenter) const {
+          return bin_unsafe(binBoundsAndCenter.binCenter(p));
+        }
+
+        template<typename PrimRef>
+        __forceinline bool bin_unsafe(const PrimRef& ref,
+                                      const vint4&   vSplitPos,
+                                      const vbool4&  splitDimMask) const // FIXME: rename to isLeft; true if the bin of center2(ref.bounds()) is below vSplitPos in a masked dimension
+        {
+          return any(((vint4)bin_unsafe(center2(ref.bounds())) < vSplitPos) & splitDimMask);
+        }
+        /*! calculates left spatial position of bin */
+        __forceinline float pos(const size_t bin, const size_t dim) const {
+          return madd(float(bin),1.0f / scale[dim],ofs[dim]);
+        }
+
+        /*! returns true if the mapping is invalid in the given dimension, i.e. its scale is 0 */
+        __forceinline bool invalid(const size_t dim) const {
+          return scale[dim] == 0.0f;
+        }
+        
+        /*! stream output */
+        friend embree_ostream operator<<(embree_ostream cout, const BinMapping& mapping) {
+          return cout << "BinMapping { num = " << mapping.num << ", ofs = " << mapping.ofs << ", scale = " << mapping.scale << "}";
+        }
+        
+      public:
+        size_t num;               //!< number of bins in use
+        vfloat4 ofs,scale;        //!< linear function that maps to bin ID
+      };
+    
+    /*! stores all information to perform some split */
+    template<size_t BINS>
+      struct BinSplit
+      {
+        enum
+        {
+          SPLIT_OBJECT   = 0,
+          SPLIT_FALLBACK = 1,
+          SPLIT_ENFORCE  = 2, // splits with larger ID are enforced in createLargeLeaf even if we could create a leaf already
+          SPLIT_TEMPORAL = 2, // same value as SPLIT_ENFORCE
+          SPLIT_GEOMID   = 3,
+        };
+
+        /*! construct an invalid split by default (dim == -1, infinite SAH) */
+        __forceinline BinSplit()
+          : sah(inf), dim(-1), pos(0), data(0) {}
+
+        __forceinline BinSplit(float sah, unsigned data, int dim = 0, float fpos = 0) // split with extra data and a float position; mapping is not set here
+          : sah(sah), dim(dim), fpos(fpos), data(data) {}
+        
+        /*! constructs specified split at bin index pos of the given mapping */
+        __forceinline BinSplit(float sah, int dim, int pos, const BinMapping<BINS>& mapping)
+          : sah(sah), dim(dim), pos(pos), data(0), mapping(mapping) {}
+        
+        /*! tests if this split is valid */
+        __forceinline bool valid() const { return dim != -1; }
+        
+        /*! returns the stored SAH cost of performing the split */
+        __forceinline float splitSAH() const { return sah; }
+        
+        /*! stream output */
+        friend embree_ostream operator<<(embree_ostream cout, const BinSplit& split) {
+          return cout << "BinSplit { sah = " << split.sah << ", dim = " << split.dim << ", pos = " << split.pos << "}";
+        }
+        
+      public:
+        float sah;                //!< SAH cost of the split
+        int dim;                  //!< split dimension
+        union { int pos; float fpos; };                  //!< bin index for splitting (pos), sharing storage with a float position (fpos)
+        unsigned int data;        //!< extra optional split data
+        BinMapping<BINS> mapping; //!< mapping into bins
+      };
+    
+    /*! stores extended information about the split: primitive count and bounds of the left and right side */
+    template<typename BBox>
+      struct SplitInfoT
+    {
+
+      __forceinline SplitInfoT () {}
+      
+      __forceinline SplitInfoT (size_t leftCount, const BBox& leftBounds, size_t rightCount, const BBox& rightBounds)
+	: leftCount(leftCount), rightCount(rightCount), leftBounds(leftBounds), rightBounds(rightBounds) {}
+      
+    public:
+      size_t leftCount,rightCount; //!< number of primitives on each side
+      BBox leftBounds,rightBounds; //!< bounds of each side
+    };
+
+    typedef SplitInfoT<BBox3fa> SplitInfo;
+    typedef SplitInfoT<LBBox3fa> SplitInfo2;
+    
+    /*! stores all binning information */
+    template<size_t BINS, typename PrimRef, typename BBox>
+      struct __aligned(64) BinInfoT
+    {		  
+      typedef BinSplit<BINS> Split;
+      typedef vbool4 vbool;
+      typedef vint4 vint;
+      typedef vfloat4 vfloat;
+      
+      __forceinline BinInfoT() { // does not call clear()
+      }
+      
+      __forceinline BinInfoT(EmptyTy) { // starts with all bins cleared
+	clear();
+      }
+
+      /*! bin access functions: bounds and counts per (bin, dimension), plus all counts of a bin as one vector */
+      __forceinline BBox &bounds(const size_t binID, const size_t dimID)             { return _bounds[binID][dimID]; }
+      __forceinline const BBox &bounds(const size_t binID, const size_t dimID) const { return _bounds[binID][dimID]; }
+
+      __forceinline unsigned int &counts(const size_t binID, const size_t dimID)             { return _counts[binID][dimID]; }
+      __forceinline const unsigned int &counts(const size_t binID, const size_t dimID) const { return _counts[binID][dimID]; }
+
+      __forceinline vuint4 &counts(const size_t binID)             { return _counts[binID]; }
+      __forceinline const vuint4 &counts(const size_t binID) const { return _counts[binID]; }
+
+      /*! clears the bin info: empty bounds and zero counts for all BINS bins */
+      __forceinline void clear() 
+      {
+	for (size_t i=0; i<BINS; i++) {
+	  bounds(i,0) = bounds(i,1) = bounds(i,2) = empty;
+	  counts(i) = vuint4(zero);
+	}
+      }
+      
+      /*! bins an array of N primitives, two per loop iteration; each primitive extends the bounds and count of the bin of its center in every dimension */
+      __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping)
+      {
+	if (unlikely(N == 0)) return;
+	size_t i; 
+	for (i=0; i<N-1; i+=2) // N >= 1 here, so N-1 cannot wrap around
+        {
+          /*! map even and odd primitive to bin */
+          BBox prim0; Vec3fa center0;
+          prims[i+0].binBoundsAndCenter(prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          
+          BBox prim1; Vec3fa center1;
+          prims[i+1].binBoundsAndCenter(prim1,center1); 
+          const vint4 bin1 = (vint4)mapping.bin(center1); 
+          
+          /*! increase bounds and counts of bins for even primitive */
+          const unsigned int b00 = extract<0>(bin0); bounds(b00,0).extend(prim0); 
+          const unsigned int b01 = extract<1>(bin0); bounds(b01,1).extend(prim0); 
+          const unsigned int b02 = extract<2>(bin0); bounds(b02,2).extend(prim0); 
+          const unsigned int s0 = (unsigned int)prims[i+0].size();
+          counts(b00,0)+=s0;
+          counts(b01,1)+=s0;
+          counts(b02,2)+=s0;
+
+          /*! increase bounds and counts of bins for odd primitive */
+          const unsigned int b10 = extract<0>(bin1);  bounds(b10,0).extend(prim1); 
+          const unsigned int b11 = extract<1>(bin1);  bounds(b11,1).extend(prim1); 
+          const unsigned int b12 = extract<2>(bin1);  bounds(b12,2).extend(prim1); 
+          const unsigned int s1 = (unsigned int)prims[i+1].size();
+          counts(b10,0)+=s1;
+          counts(b11,1)+=s1;
+          counts(b12,2)+=s1;
+        }
+	/*! handle the remaining last primitive when N is odd */
+	if (i < N)
+        {
+          /*! map primitive to bin */
+          BBox prim0; Vec3fa center0;
+          prims[i].binBoundsAndCenter(prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          
+          /*! increase bounds and counts of bins */
+          const unsigned int s0 = (unsigned int)prims[i].size();
+          const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+          const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+          const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+        }
+      }
+
+      /*! bins an array of N primitives, taking bounds and center of each primitive from the binBoundsAndCenter functor */
+      template<typename BinBoundsAndCenter>
+        __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+      {
+	if (N == 0) return;
+        
+	size_t i; 
+	for (i=0; i<N-1; i+=2) // N >= 1 here, so N-1 cannot wrap around
+        {
+          /*! map even and odd primitive to bin */
+          BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          BBox prim1; Vec3fa center1; binBoundsAndCenter.binBoundsAndCenter(prims[i+1],prim1,center1); 
+          const vint4 bin1 = (vint4)mapping.bin(center1); 
+          
+          /*! increase bounds and counts of bins for even primitive */
+          const unsigned int s0 = prims[i+0].size();
+          const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+          const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+          const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+          
+          /*! increase bounds and counts of bins for odd primitive */
+          const unsigned int s1 = prims[i+1].size();
+          const int b10 = extract<0>(bin1); counts(b10,0)+=s1; bounds(b10,0).extend(prim1);
+          const int b11 = extract<1>(bin1); counts(b11,1)+=s1; bounds(b11,1).extend(prim1);
+          const int b12 = extract<2>(bin1); counts(b12,2)+=s1; bounds(b12,2).extend(prim1);
+        }
+	
+	/*! handle the remaining last primitive when N is odd */
+	if (i < N)
+        {
+          /*! map primitive to bin */
+          BBox prim0; Vec3fa center0; binBoundsAndCenter.binBoundsAndCenter(prims[i+0],prim0,center0); 
+          const vint4 bin0 = (vint4)mapping.bin(center0); 
+          
+          /*! increase bounds and counts of bins */
+          const unsigned int s0 = prims[i+0].size();
+          const int b00 = extract<0>(bin0); counts(b00,0)+=s0; bounds(b00,0).extend(prim0);
+          const int b01 = extract<1>(bin0); counts(b01,1)+=s0; bounds(b01,1).extend(prim0);
+          const int b02 = extract<2>(bin0); counts(b02,2)+=s0; bounds(b02,2).extend(prim0);
+        }
+      }
+      
+      __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping) {
+        const PrimRef* const first = prims+begin; const size_t numPrims = end-begin; bin(first,numPrims,mapping); // forwards the range [begin,end) as pointer + count
+      }
+
+      template<typename BinBoundsAndCenter>
+        __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<BINS>& mapping, const BinBoundsAndCenter& binBoundsAndCenter) {
+        const PrimRef* const first = prims+begin; const size_t numPrims = end-begin; bin<BinBoundsAndCenter>(first,numPrims,mapping,binBoundsAndCenter); // forwards the range [begin,end) as pointer + count
+      }
+
+      /*! accumulates the first numBins bins of other into this binner */
+      __forceinline void merge (const BinInfoT& other, size_t numBins)
+      {
+        for (size_t binIdx=0; binIdx<numBins; ++binIdx)
+        {
+          /* add up the counts of this bin */
+          counts(binIdx) += other.counts(binIdx);
+          /* grow the bounds of this bin in each of the three dimensions */
+          for (size_t dim=0; dim<3; ++dim)
+            bounds(binIdx,dim).extend(other.bounds(binIdx,dim));
+        }
+      }
+
+      /*! combines the first numBins bins of a and b into a new binner and returns it */
+      static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b, const size_t numBins = BINS)
+      {
+        BinInfoT result; // constructed without arguments; only bins below numBins are assigned here
+        for (size_t binIdx=0; binIdx<numBins; ++binIdx)
+        {
+          result.counts(binIdx) = a.counts(binIdx)+b.counts(binIdx);
+          /* merge the bounds of both inputs for each of the three dimensions */
+          for (size_t dim=0; dim<3; ++dim)
+            result.bounds(binIdx,dim) = embree::merge(a.bounds(binIdx,dim),b.bounds(binIdx,dim));
+        }
+        return result;
+      }
+      
+      /*! finds the best split: evaluates area*count cost for every split position in all three dimensions and returns the cheapest as Split (dim stays -1 if none qualifies) */
+      __forceinline Split best(const BinMapping<BINS>& mapping, const size_t blocks_shift) const
+      {
+	/* sweep from right to left: rCounts[i]/rAreas[i] describe the bins i..size()-1 merged together (index 0 is never written) */
+	vfloat4 rAreas[BINS];
+	vuint4 rCounts[BINS];
+	vuint4 count = 0; BBox bx = empty; BBox by = empty; BBox bz = empty;
+	for (size_t i=mapping.size()-1; i>0; i--)
+        {
+          count += counts(i);
+          rCounts[i] = count;
+          bx.extend(bounds(i,0)); rAreas[i][0] = expectedApproxHalfArea(bx);
+          by.extend(bounds(i,1)); rAreas[i][1] = expectedApproxHalfArea(by);
+          bz.extend(bounds(i,2)); rAreas[i][2] = expectedApproxHalfArea(bz);
+          rAreas[i][3] = 0.0f;
+        }
+	/* sweep from left to right and compute SAH; position i splits into bins [0,i) and [i,size()) */
+	vuint4 blocks_add = (1 << blocks_shift)-1; // added before the shift so counts are rounded up to whole blocks
+	vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; 
+	count = 0; bx = empty; by = empty; bz = empty;
+	for (size_t i=1; i<mapping.size(); i++, ii+=1)
+        {
+          count += counts(i-1);
+          bx.extend(bounds(i-1,0)); float Ax = expectedApproxHalfArea(bx);
+          by.extend(bounds(i-1,1)); float Ay = expectedApproxHalfArea(by);
+          bz.extend(bounds(i-1,2)); float Az = expectedApproxHalfArea(bz);
+          const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az); // fourth lane repeats Az; only lanes 0..2 are read below
+          const vfloat4 rArea = rAreas[i];
+          const vuint4 lCount = (count     +blocks_add) >> (unsigned int)(blocks_shift); // if blocks_shift >=1 then lCount < 4B and could be represented with an vint4, which would allow for faster vfloat4 conversions.
+          const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift);
+          const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount)); // presumably lArea*lCount + rArea*rCount -- confirm madd operand order
+          //const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount)));
+
+          vbestPos = select(sah < vbestSAH,ii ,vbestPos); // per lane: remember position and cost when strictly cheaper
+          vbestSAH = select(sah < vbestSAH,sah,vbestSAH);
+        }
+	
+	/* find best dimension among the three lanes */
+	float bestSAH = inf;
+	int   bestDim = -1;
+	int   bestPos = 0;
+	for (int dim=0; dim<3; dim++) 
+        {
+          /* ignore zero sized dimensions */
+          if (unlikely(mapping.invalid(dim)))
+            continue;
+          
+          /* test if this is a better dimension; a position of 0 means no split was recorded for this lane */
+          if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) {
+            bestDim = dim;
+            bestPos = vbestPos[dim];
+            bestSAH = vbestSAH[dim];
+          }
+        }
+	return Split(bestSAH,bestDim,bestPos,mapping);
+      }
+      
+      /*! computes primitive counts and merged bounds on both sides of the split and stores them in info */
+      __forceinline void getSplitInfo(const BinMapping<BINS>& mapping, const Split& split, SplitInfoT<BBox>& info) const 
+      {
+        /* no split dimension: report two empty sides */
+        if (split.dim == -1) {
+          new (&info) SplitInfoT<BBox>(0,empty,0,empty);
+          return;
+        }
+        const size_t splitPos = (size_t)split.pos;
+        const size_t numBins  = mapping.size();
+
+        /* bins [0,splitPos) form the left side, bins [splitPos,numBins) the right side */
+        size_t numLeft = 0, numRight = 0;
+        BBox boxLeft = empty, boxRight = empty;
+        for (size_t b=0; b<splitPos; ++b) {
+          numLeft += counts(b,split.dim); boxLeft.extend(bounds(b,split.dim));
+        }
+        for (size_t b=splitPos; b<numBins; ++b) {
+          numRight += counts(b,split.dim); boxRight.extend(bounds(b,split.dim));
+        }
+        new (&info) SplitInfoT<BBox>(numLeft,boxLeft,numRight,boxRight);
+      }
+
+      /*! sums the counts of the bins [0,split.pos) along the split dimension */
+      __forceinline size_t getLeftCount(const BinMapping<BINS>& mapping, const Split& split) const
+      {
+        /* no split dimension: -1 is returned converted to size_t */
+        if (unlikely(split.dim == -1)) return -1;
+        const size_t splitPos = (size_t)split.pos;
+        size_t total = 0;
+        for (size_t b = 0; b < splitPos; ++b)
+          total += counts(b, split.dim);
+        return total;
+      }
+
+      /*! sums the counts of the bins [split.pos,mapping.size()) along the split dimension */
+      __forceinline size_t getRightCount(const BinMapping<BINS>& mapping, const Split& split) const
+      {
+        /* no split dimension: -1 is returned converted to size_t */
+        if (unlikely(split.dim == -1)) return -1;
+        const size_t numBins = mapping.size();
+        size_t total = 0;
+        for (size_t b = (size_t)split.pos; b < numBins; ++b)
+          total += counts(b, split.dim);
+        return total;
+      }
+
+    private:
+      BBox _bounds[BINS][3]; //!< geometry bounds for each bin in each dimension
+      vuint4   _counts[BINS];    //!< counts number of primitives that map into the bins
+    };
+
+#if defined(__AVX512ER__) // KNL
+
+   /*! 16-bin specialization of the bin mapping; stores the linear mapping in 4-wide and 16-wide form */
+   template<>
+     struct BinMapping<16>
+   {
+   public:
+     __forceinline BinMapping() {}
+      
+     /*! derives offset and scale of the mapping from pinfo.centBounds */
+     template<typename PrimInfo>
+     __forceinline BinMapping(const PrimInfo& pinfo)
+     {
+       num = 16;
+       ofs = (vfloat4) pinfo.centBounds.lower;
+       /* the extent is clamped to eps from below; dimensions that do not exceed eps get scale 0 */
+       const vfloat4 eps = 1E-34f;
+       const vfloat4 diag = max(eps,(vfloat4) pinfo.centBounds.size());
+       scale = select(diag > eps,vfloat4(0.99f*num)/diag,vfloat4(0.0f));
+       ofs16 = ofs; scale16 = scale;
+     }
+
+     /*! returns number of bins */
+     __forceinline size_t size() const { return num; }
+     /*! applies the 4-wide mapping to p and returns the floored result widened to 16 lanes */
+     __forceinline vint16 bin16(const Vec3fa& p) const {
+       const vfloat4 scaled = (vfloat4(p)-ofs)*scale; return vint16(vint4(floori(scaled)));
+     }
+     /*! applies the 16-wide mapping to p and returns the floored result */
+     __forceinline vint16 bin16(const vfloat16& p) const {
+       const vfloat16 scaled = (p-ofs16)*scale16; return floori(scaled);
+     }
+
+     __forceinline int bin_unsafe(const PrimRef& ref,
+                                  const vint16&  vSplitPos,
+                                  const vbool16& splitDimMask) const // FIXME: rename to isLeft
+     {
+       /* lower+upper of ref is mapped with ofs16/scale16, then compared against vSplitPos under splitDimMask */
+       const vfloat16 refLower(*(vfloat4*)&ref.lower);
+       const vfloat16 refUpper(*(vfloat4*)&ref.upper);
+       const vint16 binIndex = floori(((refLower + refUpper)-ofs16)*scale16);
+       return lt(splitDimMask,binIndex,vSplitPos);
+     }
+
+     /*! returns true if the scale of dimension dim is zero */
+     __forceinline bool invalid(const size_t dim) const {
+       return scale[dim] == 0.0f;
+     }
+        
+    public:
+      size_t num;                //!< number of bins, set to 16 by the PrimInfo constructor
+      vfloat4 ofs,scale;         //!< 4-wide offset and scale of the linear function that maps to bin ID
+      vfloat16 ofs16,scale16;    //!< 16-wide copies of ofs and scale
+    };
+
+    /* 16 bins in-register binner */
+    template<typename PrimRef>
+      struct __aligned(64) BinInfoT<16,PrimRef,BBox3fa>
+    {
+      typedef BinSplit<16> Split;
+      typedef vbool16 vbool;
+      typedef vint16 vint;
+      typedef vfloat16 vfloat;
+      
+      __forceinline BinInfoT() { // empty body: lower/upper/count are not assigned here
+      }
+      
+      __forceinline BinInfoT(EmptyTy) { // tag constructor: starts from the state set up by clear()
+	clear();
+      }
+      
+      /*! resets all three dimensions: lower bounds to pos_inf, upper bounds to neg_inf, counts to 0 */
+      __forceinline void clear() 
+      {
+        for (size_t dim=0; dim<3; dim++) {
+          lower[dim] = pos_inf; upper[dim] = neg_inf; count[dim] = 0;
+        }
+      }
+
+
+      static __forceinline vfloat16 prefix_area_rl(const vfloat16 min_x,
+                                                   const vfloat16 min_y,
+                                                   const vfloat16 min_z,
+                                                   const vfloat16 max_x,
+                                                   const vfloat16 max_y,
+                                                   const vfloat16 max_z)
+      {
+        /* per-lane extents of the box spanned by the reverse prefix min/max of the inputs */
+        const vfloat16 extent_x = reverse_prefix_max(max_x)
+                                - reverse_prefix_min(min_x);
+        const vfloat16 extent_y = reverse_prefix_max(max_y)
+                                - reverse_prefix_min(min_y);
+        const vfloat16 extent_z = reverse_prefix_max(max_z)
+                                - reverse_prefix_min(min_z);
+
+        /* area term: x*y + (x*z + y*z), evaluated per lane */
+        /* (operand order of the madd calls is kept so the result stays bit-identical) */
+        return madd(extent_x,extent_y,madd(extent_x,extent_z,extent_y*extent_z));
+      }
+
+      static __forceinline vfloat16 prefix_area_lr(const vfloat16 min_x,
+                                                   const vfloat16 min_y,
+                                                   const vfloat16 min_z,
+                                                   const vfloat16 max_x,
+                                                   const vfloat16 max_y,
+                                                   const vfloat16 max_z)
+      {
+        /* per-lane extents of the box spanned by the prefix min/max of the inputs */
+        const vfloat16 extent_x = prefix_max(max_x)
+                                - prefix_min(min_x);
+        const vfloat16 extent_y = prefix_max(max_y)
+                                - prefix_min(min_y);
+        const vfloat16 extent_z = prefix_max(max_z)
+                                - prefix_min(min_z);
+
+        /* area term: x*y + (x*z + y*z), evaluated per lane */
+        /* (operand order of the madd calls is kept so the result stays bit-identical) */
+        return madd(extent_x,extent_y,madd(extent_x,extent_z,extent_y*extent_z));
+      }
+
+
+      /*! bins N primitives, two per loop iteration; the result is assigned to lower/upper/count and replaces their previous contents */
+      __forceinline void bin (const PrimRef* prims, size_t N, const BinMapping<16>& mapping)
+      {
+        if (unlikely(N == 0)) return; // nothing to bin; also keeps N-1 in the loop below from wrapping around
+
+        const vfloat16 init_min(pos_inf);
+        const vfloat16 init_max(neg_inf);
+        /* running bounds and counts; the suffix 0/1/2 names the dimension whose bin index selects the updated lane */
+        vfloat16 min_x0,min_x1,min_x2;
+        vfloat16 min_y0,min_y1,min_y2;
+        vfloat16 min_z0,min_z1,min_z2;
+        vfloat16 max_x0,max_x1,max_x2;
+        vfloat16 max_y0,max_y1,max_y2;
+        vfloat16 max_z0,max_z1,max_z2;
+        vuint16 count0,count1,count2;
+
+        min_x0 = init_min;
+        min_x1 = init_min;
+        min_x2 = init_min;
+        min_y0 = init_min;
+        min_y1 = init_min;
+        min_y2 = init_min;
+        min_z0 = init_min;
+        min_z1 = init_min;
+        min_z2 = init_min;
+
+        max_x0 = init_max;
+        max_x1 = init_max;
+        max_x2 = init_max;
+        max_y0 = init_max;
+        max_y1 = init_max;
+        max_y2 = init_max;
+        max_z0 = init_max;
+        max_z1 = init_max;
+        max_z2 = init_max;
+
+        count0 = zero;
+        count1 = zero;
+        count2 = zero;
+
+        const vint16 step16(step); // compared against the bin index below; presumably the lane indices 0..15 -- confirm 'step'
+        size_t i;
+	for (i=0; i<N-1; i+=2)
+        {
+          /*! map even and odd primitive to bin; the mapped point is lower+upper of the primitive bounds */
+          const BBox3fa primA = prims[i+0].bounds();
+          const vfloat16 centerA = vfloat16((vfloat4)primA.lower) + vfloat16((vfloat4)primA.upper);
+          const vint16 binA = mapping.bin16(centerA);
+
+          const BBox3fa primB = prims[i+1].bounds();
+          const vfloat16 centerB = vfloat16((vfloat4)primB.lower) + vfloat16((vfloat4)primB.upper); 
+          const vint16 binB = mapping.bin16(centerB);
+
+          /* A: even primitive */
+          {
+            const vfloat16 b_min_x = prims[i+0].lower.x;
+            const vfloat16 b_min_y = prims[i+0].lower.y;
+            const vfloat16 b_min_z = prims[i+0].lower.z;
+            const vfloat16 b_max_x = prims[i+0].upper.x;
+            const vfloat16 b_max_y = prims[i+0].upper.y;
+            const vfloat16 b_max_z = prims[i+0].upper.z;
+
+            const vint16 bin0 = shuffle<0>(binA); // presumably broadcasts the x bin index to all lanes -- confirm shuffle<>
+            const vint16 bin1 = shuffle<1>(binA);
+            const vint16 bin2 = shuffle<2>(binA);
+
+            const vbool16 m_update_x = step16 == bin0;
+            const vbool16 m_update_y = step16 == bin1;
+            const vbool16 m_update_z = step16 == bin2;
+
+            assert(popcnt((size_t)m_update_x) == 1); // exactly one lane (bin) must be selected per dimension
+            assert(popcnt((size_t)m_update_y) == 1);
+            assert(popcnt((size_t)m_update_z) == 1);
+
+            min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
+            min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
+            min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
+            // ------------------------------------------------------------------------      
+            max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
+            max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
+            max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
+            // ------------------------------------------------------------------------
+            min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
+            min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
+            min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);      
+            // ------------------------------------------------------------------------      
+            max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
+            max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
+            max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
+            // ------------------------------------------------------------------------
+            min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
+            min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
+            min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
+            // ------------------------------------------------------------------------      
+            max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
+            max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
+            max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
+            // ------------------------------------------------------------------------
+            count0 = mask_add(m_update_x,count0,count0,vuint16(1));
+            count1 = mask_add(m_update_y,count1,count1,vuint16(1));
+            count2 = mask_add(m_update_z,count2,count2,vuint16(1));      
+          }
+
+
+          /* B: odd primitive, same update sequence as A */
+          {
+            const vfloat16 b_min_x = prims[i+1].lower.x;
+            const vfloat16 b_min_y = prims[i+1].lower.y;
+            const vfloat16 b_min_z = prims[i+1].lower.z;
+            const vfloat16 b_max_x = prims[i+1].upper.x;
+            const vfloat16 b_max_y = prims[i+1].upper.y;
+            const vfloat16 b_max_z = prims[i+1].upper.z;
+
+            const vint16 bin0 = shuffle<0>(binB);
+            const vint16 bin1 = shuffle<1>(binB);
+            const vint16 bin2 = shuffle<2>(binB);
+
+            const vbool16 m_update_x = step16 == bin0;
+            const vbool16 m_update_y = step16 == bin1;
+            const vbool16 m_update_z = step16 == bin2;
+
+            assert(popcnt((size_t)m_update_x) == 1);
+            assert(popcnt((size_t)m_update_y) == 1);
+            assert(popcnt((size_t)m_update_z) == 1);
+
+            min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
+            min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
+            min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
+            // ------------------------------------------------------------------------      
+            max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
+            max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
+            max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
+            // ------------------------------------------------------------------------
+            min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
+            min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
+            min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);      
+            // ------------------------------------------------------------------------      
+            max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
+            max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
+            max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
+            // ------------------------------------------------------------------------
+            min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
+            min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
+            min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
+            // ------------------------------------------------------------------------      
+            max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
+            max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
+            max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
+            // ------------------------------------------------------------------------
+            count0 = mask_add(m_update_x,count0,count0,vuint16(1));
+            count1 = mask_add(m_update_y,count1,count1,vuint16(1));
+            count2 = mask_add(m_update_z,count2,count2,vuint16(1));      
+          }
+
+        }
+        /* for an odd N one primitive is left over; it gets the same update sequence */
+        if (i < N)
+        {
+          const BBox3fa prim0 = prims[i].bounds();
+          const vfloat16 center0 = vfloat16((vfloat4)prim0.lower) + vfloat16((vfloat4)prim0.upper); 
+          const vint16 bin = mapping.bin16(center0);
+
+          const vfloat16 b_min_x = prims[i].lower.x;
+          const vfloat16 b_min_y = prims[i].lower.y;
+          const vfloat16 b_min_z = prims[i].lower.z;
+          const vfloat16 b_max_x = prims[i].upper.x;
+          const vfloat16 b_max_y = prims[i].upper.y;
+          const vfloat16 b_max_z = prims[i].upper.z;
+
+          const vint16 bin0 = shuffle<0>(bin);
+          const vint16 bin1 = shuffle<1>(bin);
+          const vint16 bin2 = shuffle<2>(bin);
+
+          const vbool16 m_update_x = step16 == bin0;
+          const vbool16 m_update_y = step16 == bin1;
+          const vbool16 m_update_z = step16 == bin2;
+
+          assert(popcnt((size_t)m_update_x) == 1);
+          assert(popcnt((size_t)m_update_y) == 1);
+          assert(popcnt((size_t)m_update_z) == 1);
+
+          min_x0 = mask_min(m_update_x,min_x0,min_x0,b_min_x);
+          min_y0 = mask_min(m_update_x,min_y0,min_y0,b_min_y);
+          min_z0 = mask_min(m_update_x,min_z0,min_z0,b_min_z);
+          // ------------------------------------------------------------------------      
+          max_x0 = mask_max(m_update_x,max_x0,max_x0,b_max_x);
+          max_y0 = mask_max(m_update_x,max_y0,max_y0,b_max_y);
+          max_z0 = mask_max(m_update_x,max_z0,max_z0,b_max_z);
+          // ------------------------------------------------------------------------
+          min_x1 = mask_min(m_update_y,min_x1,min_x1,b_min_x);
+          min_y1 = mask_min(m_update_y,min_y1,min_y1,b_min_y);
+          min_z1 = mask_min(m_update_y,min_z1,min_z1,b_min_z);      
+          // ------------------------------------------------------------------------      
+          max_x1 = mask_max(m_update_y,max_x1,max_x1,b_max_x);
+          max_y1 = mask_max(m_update_y,max_y1,max_y1,b_max_y);
+          max_z1 = mask_max(m_update_y,max_z1,max_z1,b_max_z);
+          // ------------------------------------------------------------------------
+          min_x2 = mask_min(m_update_z,min_x2,min_x2,b_min_x);
+          min_y2 = mask_min(m_update_z,min_y2,min_y2,b_min_y);
+          min_z2 = mask_min(m_update_z,min_z2,min_z2,b_min_z);
+          // ------------------------------------------------------------------------      
+          max_x2 = mask_max(m_update_z,max_x2,max_x2,b_max_x);
+          max_y2 = mask_max(m_update_z,max_y2,max_y2,b_max_y);
+          max_z2 = mask_max(m_update_z,max_z2,max_z2,b_max_z);
+          // ------------------------------------------------------------------------
+          count0 = mask_add(m_update_x,count0,count0,vuint16(1));
+          count1 = mask_add(m_update_y,count1,count1,vuint16(1));
+          count2 = mask_add(m_update_z,count2,count2,vuint16(1));      
+        }
+        /* store the accumulated registers; these are plain assignments, not merges with the previous members */
+        lower[0] = Vec3vf16( min_x0, min_y0, min_z0 );
+        lower[1] = Vec3vf16( min_x1, min_y1, min_z1 );
+        lower[2] = Vec3vf16( min_x2, min_y2, min_z2 );
+
+        upper[0] = Vec3vf16( max_x0, max_y0, max_z0 );
+        upper[1] = Vec3vf16( max_x1, max_y1, max_z1 );
+        upper[2] = Vec3vf16( max_x2, max_y2, max_z2 );
+
+        count[0] = count0;
+        count[1] = count1;
+        count[2] = count2;
+      }
+
+      __forceinline void bin(const PrimRef* prims, size_t begin, size_t end, const BinMapping<16>& mapping) {
+        const PrimRef* const first = prims+begin; const size_t numPrims = end-begin; bin(first,numPrims,mapping); // forwards the range [begin,end) as pointer + count
+      }
+
+      /*! folds other into this binner for all three dimensions; numBins is not used by this specialization */
+      __forceinline void merge (const BinInfoT& other, size_t numBins)
+      {
+        for (size_t dim=0; dim<3; ++dim)
+        {
+          count[dim] += other.count[dim];
+          upper[dim]  = max(upper[dim],other.upper[dim]);
+          lower[dim]  = min(lower[dim],other.lower[dim]);
+        }
+      }
+
+      /*! reduces binning information: returns a new binner holding the per-dimension sum of counts and min/max of the bounds of a and b */
+      static __forceinline const BinInfoT reduce (const BinInfoT& a, const BinInfoT& b)
+      {
+        BinInfoT c; // every member is assigned in the loop below
+        for (size_t i=0; i<3; i++) 
+        {
+          c.count[i]  = a.count[i] + b.count[i]; // the member of this specialization is 'count'; 'counts' does not exist here
+          c.lower[i]  = min(a.lower[i],b.lower[i]);
+          c.upper[i]  = max(a.upper[i],b.upper[i]);
+        }
+        return c;
+      }
+
+      /*! finds the best split: evaluates the cost of all split positions of each valid dimension 16 lanes at a time and returns the cheapest as Split (dim stays -1 if none is found) */
+      __forceinline Split best(const BinMapping<16>& mapping, const size_t blocks_shift) const
+      {
+	/* find best dimension */
+	float bestSAH = inf;
+	int   bestDim = -1;
+	int   bestPos = 0;
+	const vuint16 blocks_add = (1 << blocks_shift)-1; // added before the shift so counts are rounded up to whole blocks
+        const vfloat16 inf(pos_inf); // from here on 'inf' names this 16-wide local
+	for (size_t dim=0; dim<3; dim++) 
+        {
+          /* ignore zero sized dimensions */
+          if (unlikely(mapping.invalid(dim)))
+            continue;
+
+          const vfloat16 rArea16 = prefix_area_rl(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z);
+          const vfloat16 lArea16 = prefix_area_lr(lower[dim].x,lower[dim].y,lower[dim].z, upper[dim].x,upper[dim].y,upper[dim].z);
+          const vuint16  lCount16 = prefix_sum(count[dim]);
+          const vuint16  rCount16 = reverse_prefix_sum(count[dim]); 
+
+          /* compute best split in this dimension; presumably align_shift_right<1> moves lane i+1 into lane i so that left and right of the same split line up -- confirm */
+          const vfloat16 leftArea  = lArea16;
+          const vfloat16 rightArea = align_shift_right<1>(zero,rArea16);
+          const vuint16 lC = lCount16;
+          const vuint16 rC = align_shift_right<1>(zero,rCount16);
+          const vuint16 leftCount  = ( lC + blocks_add) >> blocks_shift;
+          const vuint16 rightCount = ( rC + blocks_add) >> blocks_shift;
+          const vbool16 valid = (leftArea < inf) & (rightArea < inf) & vbool16(0x7fff); // handles inf entries; 0x7fff also masks out lane 15
+          const vfloat16 sah = select(valid,madd(leftArea,vfloat16(leftCount),rightArea*vfloat16(rightCount)),vfloat16(pos_inf));
+          /* test if this is a better dimension */
+          if (any(sah < vfloat16(bestSAH))) 
+          {
+            const size_t index = select_min(sah);            
+            assert(index < 15);
+            assert(sah[index] < bestSAH);
+            bestDim = dim;
+            bestPos = index+1; // lane index plus one is stored as the split position
+            bestSAH = sah[index];
+          }
+        }
+	
+	return Split(bestSAH,bestDim,bestPos,mapping);
+
+      }
+
+      /*! computes primitive counts and merged bounds on both sides of the split and stores them in info */
+      __forceinline void getSplitInfo(const BinMapping<16>& mapping, const Split& split, SplitInfo& info) const 
+      {
+        /* no split dimension: report two empty sides */
+        if (split.dim == -1) {
+          new (&info) SplitInfo(0,empty,0,empty);
+          return;
+        }
+        // FIXME: horizontal reduction!
+        /* per-bin data of the split dimension */
+        const Vec3vf16& binLower = lower[split.dim];
+        const Vec3vf16& binUpper = upper[split.dim];
+        const vuint16&  binCount = count[split.dim];
+        const size_t splitPos = (size_t)split.pos;
+
+        size_t numLeft = 0, numRight = 0;
+        BBox3fa boxLeft = empty, boxRight = empty;
+        for (size_t b=0; b<splitPos; b++) { /* bins [0,splitPos) form the left side */
+          numLeft += binCount[b];
+          boxLeft.extend(BBox3fa(Vec3fa(binLower.x[b],binLower.y[b],binLower.z[b]),Vec3fa(binUpper.x[b],binUpper.y[b],binUpper.z[b])));
+        }
+        for (size_t b=splitPos; b<mapping.size(); b++) { /* bins [splitPos,mapping.size()) form the right side */
+          numRight += binCount[b];
+          boxRight.extend(BBox3fa(Vec3fa(binLower.x[b],binLower.y[b],binLower.z[b]),Vec3fa(binUpper.x[b],binUpper.y[b],binUpper.z[b])));
+        }
+        new (&info) SplitInfo(numLeft,boxLeft,numRight,boxRight);
+      }
+
+      /*! sums the counts of the bins [0,split.pos) of the split dimension */
+      __forceinline size_t getLeftCount(const BinMapping<16>& mapping, const Split& split) const
+      {
+        /* no split dimension: -1 is returned converted to size_t */
+        if (unlikely(split.dim == -1)) return -1;
+        const size_t splitPos = (size_t)split.pos;
+        size_t total = 0;
+        for (size_t b = 0; b < splitPos; ++b)
+          total += count[split.dim][b];
+        return total;
+      }
+
+      /*! sums the counts of the bins [split.pos,mapping.size()) of the split dimension */
+      __forceinline size_t getRightCount(const BinMapping<16>& mapping, const Split& split) const
+      {
+        /* no split dimension: -1 is returned converted to size_t */
+        if (unlikely(split.dim == -1)) return -1;
+        const size_t numBins = mapping.size();
+        size_t total = 0;
+        for (size_t b = (size_t)split.pos; b < numBins; ++b)
+          total += count[split.dim][b];
+        return total;
+      }
+            
+    private:
+      Vec3vf16 lower[3];
+      Vec3vf16 upper[3];
+      vuint16   count[3];
+    };
+#endif
+  }
+
+  template<typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping)
+  {
+    /* ranges shorter than parallelThreshold are binned directly into binner */
+    if (likely(end-begin < parallelThreshold)) { binner.bin(prims,begin,end,mapping); return; }
+
+    /* otherwise each block is binned into a fresh empty binner and the partial results are merged */
+    auto binBlock  = [&](const range<size_t>& r) -> BinInfoT { BinInfoT local(empty); local.bin(prims + r.begin(), r.size(), mapping); return local; };
+    auto mergeBins = [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT merged = b0; merged.merge(b1, mapping.size()); return merged; };
+    binner = parallel_reduce(begin,end,blockSize,binner,binBlock,mergeBins);
+  }
+
+  template<typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+  {
+    /* ranges shorter than parallelThreshold are binned directly into binner */
+    if (likely(end-begin < parallelThreshold)) { binner.bin(prims,begin,end,mapping,binBoundsAndCenter); return; }
+
+    /* otherwise each block is binned into a fresh empty binner and the partial results are merged */
+    auto binBlock  = [&](const range<size_t>& r) -> BinInfoT { BinInfoT local(empty); local.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return local; };
+    auto mergeBins = [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT merged = b0; merged.merge(b1, mapping.size()); return merged; };
+    binner = parallel_reduce(begin,end,blockSize,binner,binBlock,mergeBins);
+  }
+
+  template<bool parallel, typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping)
+  {
+    /* the template flag selects the path: without it the range is binned directly into binner */
+    if (!parallel) { binner.bin(prims,begin,end,mapping); return; }
+
+    /* otherwise each block is binned into a fresh empty binner and the partial results are merged */
+    auto binBlock  = [&](const range<size_t>& r) -> BinInfoT { BinInfoT local(empty); local.bin(prims + r.begin(), r.size(), mapping); return local; };
+    auto mergeBins = [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT merged = b0; merged.merge(b1, mapping.size()); return merged; };
+    binner = parallel_reduce(begin,end,blockSize,binner,binBlock,mergeBins);
+  }
+
+  template<bool parallel, typename BinBoundsAndCenter, typename BinInfoT, typename BinMapping, typename PrimRef>
+  __forceinline void bin_serial_or_parallel(BinInfoT& binner, const PrimRef* prims, size_t begin, size_t end, size_t blockSize, const BinMapping& mapping, const BinBoundsAndCenter& binBoundsAndCenter)
+  {
+    /* the template flag selects the path: without it the range is binned directly into binner */
+    if (!parallel) { binner.bin(prims,begin,end,mapping,binBoundsAndCenter); return; }
+
+    /* otherwise each block is binned into a fresh empty binner and the partial results are merged */
+    auto binBlock  = [&](const range<size_t>& r) -> BinInfoT { BinInfoT local(empty); local.bin(prims + r.begin(), r.size(), mapping, binBoundsAndCenter); return local; };
+    auto mergeBins = [&](const BinInfoT& b0, const BinInfoT& b1) -> BinInfoT { BinInfoT merged = b0; merged.merge(b1, mapping.size()); return merged; };
+    binner = parallel_reduce(begin,end,blockSize,binner,binBlock,mergeBins);
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h
new file mode 100644
index 0000000000..a4c272f015
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_aligned.h
@@ -0,0 +1,205 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct PrimInfoRange : public CentGeomBBox3fa, public range<size_t> // centroid/geometry bounds combined with the [begin,end) index range they belong to
+    {
+      __forceinline PrimInfoRange () { // both bases are default-constructed
+      }
+
+      __forceinline PrimInfoRange(const PrimInfo& pinfo) // takes the bounds and the begin/end indices from pinfo
+        : CentGeomBBox3fa(pinfo), range<size_t>(pinfo.begin,pinfo.end) {}
+
+      __forceinline PrimInfoRange(EmptyTy) // bounds built from EmptyTy and the empty index range [0,0)
+        : CentGeomBBox3fa(EmptyTy()), range<size_t>(0,0) {}
+
+      __forceinline PrimInfoRange (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds)
+        : CentGeomBBox3fa(centGeomBounds), range<size_t>(begin,end) {}
+      /*! leaf cost: expectedApproxHalfArea of the geometry bounds times the number of primitives in the range */
+      __forceinline float leafSAH() const { 
+	return expectedApproxHalfArea(geomBounds)*float(size()); 
+      }
+      /*! leaf cost counted in blocks of 2^block_shift primitives; the primitive count is rounded up to whole blocks */
+      __forceinline float leafSAH(size_t block_shift) const { 
+	return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+      }
+    };
+    
+    /*! Performs standard object binning */
+    template<typename PrimRef, size_t BINS>
+      struct HeuristicArrayBinningSAH
+      {
+        typedef BinSplit<BINS> Split;
+        typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner;
+        typedef range<size_t> Set;
+
+#if defined(__AVX512ER__) // KNL
+        static const size_t PARALLEL_THRESHOLD = 4*768; 
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 768;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 768;
+#else
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+#endif
+        __forceinline HeuristicArrayBinningSAH () // creates a heuristic without a primitive array (prims is nullptr)
+          : prims(nullptr) {}
+
+        /*! remembers the prim array; find() bins it and split() partitions it in place */
+        __forceinline HeuristicArrayBinningSAH (PrimRef* prims)
+          : prims(prims) {}
+
+        /*! finds the best split; ranges with PARALLEL_THRESHOLD or more primitives are binned in parallel */
+        __noinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize)
+        {
+          /* large ranges are the uncommon case, everything else takes the serial instantiation */
+          if (unlikely(pinfo.size() >= PARALLEL_THRESHOLD))
+            return find_template<true>(pinfo,logBlockSize);
+          return find_template<false>(pinfo,logBlockSize);
+        }
+
+        template<bool parallel> // parallel is forwarded to bin_serial_or_parallel
+        __forceinline const Split find_template(const PrimInfoRange& pinfo, const size_t logBlockSize)
+        {
+          const BinMapping<BINS> binMapping(pinfo);
+          Binner bins(empty);
+          bin_serial_or_parallel<parallel>(bins,prims,pinfo.begin(),pinfo.end(),PARALLEL_FIND_BLOCK_SIZE,binMapping); // bin every primitive of the range
+          return bins.best(binMapping,logBlockSize); // let the filled bins pick the split
+        }
+
+        /*! array partitioning; ranges with PARALLEL_THRESHOLD or more primitives are partitioned in parallel */
+        __forceinline void split(const Split& split, const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+        {
+          if (unlikely(pinfo.size() >= PARALLEL_THRESHOLD))
+            split_template<true>(split,pinfo,linfo,rinfo);
+          else
+            split_template<false>(split,pinfo,linfo,rinfo);
+        }
+
+        template<bool parallel> // parallel selects parallel_partitioning over serial_partitioning at compile time
+        __forceinline void split_template(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          if (!split.valid()) { // no usable binning split: sort the range and halve it by index instead
+            deterministic_order(set);
+            return splitFallback(set,lset,rset);
+          }
+          /* partition prims[begin,end) in place around the split while accumulating the bounds of both sides */
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          CentGeomBBox3fa local_left(empty);
+          CentGeomBBox3fa local_right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim; // single bit selecting the split dimension
+
+          const typename Binner::vint vSplitPos(splitPos);
+          const typename Binner::vbool vSplitMask(splitDimMask);
+          auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); }; // predicate handed to the partitioning routines
+
+          size_t center = 0; // becomes the index where the right side starts
+          if (!parallel)
+            center = serial_partitioning(prims,begin,end,local_left,local_right,isLeft,
+                                         [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); });
+          else
+            center = parallel_partitioning(
+              prims,begin,end,EmptyTy(),local_left,local_right,isLeft,
+              [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); },
+              [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); },
+              PARALLEL_PARTITION_BLOCK_SIZE);
+          /* construct the two child ranges in place in the caller-provided objects */
+          new (&lset) PrimInfoRange(begin,center,local_left);
+          new (&rset) PrimInfoRange(center,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+        }
+
+        void deterministic_order(const PrimInfoRange& pinfo)
+        {
+          /* parallel partitioning destroys the original primitive order, so the range is sorted to get a reproducible order */
+          std::sort(prims + pinfo.begin(), prims + pinfo.end());
+        }
+
+        void splitFallback(const PrimInfoRange& pinfo, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+        {
+          /* median split by index: [first,mid) goes left, [mid,last) goes right */
+          const size_t first = pinfo.begin();
+          const size_t last  = pinfo.end();
+          const size_t mid   = (first + last)/2;
+
+          CentGeomBBox3fa lbounds(empty);
+          CentGeomBBox3fa rbounds(empty);
+          for (size_t k=first; k<mid;  k++) lbounds.extend_center2(prims[k]);
+          for (size_t k=mid;   k<last; k++) rbounds.extend_center2(prims[k]);
+
+          /* both results are constructed in place in the caller-provided objects */
+          new (&linfo) PrimInfoRange(first,mid,lbounds);
+          new (&rinfo) PrimInfoRange(mid,last,rbounds);
+        }
+
+        void splitByGeometry(const range<size_t>& range, PrimInfoRange& linfo, PrimInfoRange& rinfo)
+        {
+          assert(range.size() > 1);
+          /* primitives sharing the geomID of the first primitive go left, all others go right */
+          unsigned int firstGeomID = prims[range.begin()].geomID();
+          auto sameGeom = [&] ( const PrimRef& prim ) { return prim.geomID() == firstGeomID; };
+          auto grow     = [ ] ( CentGeomBBox3fa& a, const PrimRef& ref ) { a.extend_center2(ref); };
+          CentGeomBBox3fa lbounds(empty);
+          CentGeomBBox3fa rbounds(empty);
+          const size_t center = serial_partitioning(prims,range.begin(),range.end(),lbounds,rbounds,sameGeom,grow);
+          /* both results are constructed in place in the caller-provided objects */
+          new (&linfo) PrimInfoRange(range.begin(),center,lbounds);
+          new (&rinfo) PrimInfoRange(center,range.end(),rbounds);
+        }
+
+      private:
+        PrimRef* const prims;
+      };
+
+    /*! Performs standard object binning on SetMB primitive sets (PrimRefMB references together with a time range) */
+    template<typename PrimRefMB, size_t BINS>
+      struct HeuristicArrayBinningMB
+      {
+        typedef BinSplit<BINS> Split;
+        typedef typename PrimRefMB::BBox BBox;
+        typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner;
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        /*! finds the best split; the returned split is marked SPLIT_FALLBACK when binning finds no valid one */
+        const Split find(const SetMB& set, const size_t logBlockSize)
+        {
+          ObjectBinner binner(empty);
+          const BinMapping<BINS> mapping(set.size(),set.centBounds);
+          bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping);
+          Split osplit = binner.best(mapping,logBlockSize);
+          osplit.sah *= set.time_range.size(); // scale the cost by the size of the set's time range
+          if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split
+          return osplit;
+        }
+        
+        /*! array partitioning: reorders set.prims in place and constructs lset/rset over the two halves */
+        __forceinline void split(const Split& split, const SetMB& set, SetMB& lset, SetMB& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfoMB left = empty;
+          PrimInfoMB right = empty;
+          const vint4 vSplitPos(split.pos);
+          const vbool4 vSplitMask(1 << split.dim); // single bit selecting the split dimension
+          auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref) < vSplitPos) & vSplitMask); }; // left when the bin index in the split dimension is below split.pos
+          auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); };
+          auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); };
+          size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD);
+          new (&lset) SetMB(left, set.prims,range<size_t>(begin,center),set.time_range); // both children keep the parent's prim array and time range
+          new (&rset) SetMB(right,set.prims,range<size_t>(center,end  ),set.time_range);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h
new file mode 100644
index 0000000000..1370244586
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_binning_array_unaligned.h
@@ -0,0 +1,302 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Performs standard object binning with primitive bounds evaluated in a caller-supplied linear space */
+    template<typename PrimRef, size_t BINS>
+      struct UnalignedHeuristicArrayBinningSAH
+      {
+        typedef BinSplit<BINS> Split;
+        typedef BinInfoT<BINS,PrimRef,BBox3fa> Binner;
+        typedef range<size_t> Set;
+
+        __forceinline UnalignedHeuristicArrayBinningSAH () // FIXME: required?
+          : scene(nullptr), prims(nullptr) {}
+        
+        /*! remembers the scene (used to look up geometries by geomID) and the prim array */
+        __forceinline UnalignedHeuristicArrayBinningSAH (Scene* scene, PrimRef* prims)
+          : scene(scene), prims(prims) {}
+
+        const LinearSpace3fa computeAlignedSpace(const range<size_t>& set) // returns the transposed frame built around the direction of one primitive of the set
+        {
+          Vec3fa axis(0,0,1); // used unchanged when no primitive yields a usable direction
+          uint64_t bestGeomPrimID = -1; // converts to the maximum uint64_t value, so the first smaller ID is accepted
+
+          /*! find curve with minimum ID that defines valid direction */
+          for (size_t i=set.begin(); i<set.end(); i++)
+          {
+            const unsigned int geomID = prims[i].geomID();
+            const unsigned int primID = prims[i].primID();
+            const uint64_t geomprimID = prims[i].ID64();
+            if (geomprimID >= bestGeomPrimID) continue; // only a smaller ID may replace the current choice
+            const Vec3fa axis1 = scene->get(geomID)->computeDirection(primID);
+            if (sqr_length(axis1) > 1E-18f) { // directions with (almost) zero length are rejected before normalizing
+              axis = normalize(axis1);
+              bestGeomPrimID = geomprimID;
+            }
+          }
+          return frame(axis).transposed();
+        }
+        
+        const PrimInfo computePrimInfo(const range<size_t>& set, const LinearSpace3fa& space) // bounds of all primitives of the set, evaluated in the given space
+        {
+          auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa // bounds of one sub-range, used as the per-block body of the reduction
+            {
+              CentGeomBBox3fa bounds(empty);
+              for (size_t i=r.begin(); i<r.end(); i++) {
+                Geometry* mesh = scene->get(prims[i].geomID());
+                bounds.extend(mesh->vbounds(space,prims[i].primID())); // bounds are queried from the geometry, not taken from the PrimRef
+              }
+              return bounds;
+            };
+          
+          const CentGeomBBox3fa bounds = parallel_reduce(set.begin(), set.end(), size_t(1024), size_t(4096), 
+                                                         CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2);
+
+          return PrimInfo(set.begin(),set.end(),bounds);
+        }
+
+        struct BinBoundsAndCenter // supplies bounds and centers for binning by querying the geometry's vbounds in a fixed space
+        {
+          __forceinline BinBoundsAndCenter(Scene* scene, const LinearSpace3fa& space)
+            : scene(scene), space(space) {}
+          
+          /*! returns center for binning: center2 of the primitive's vbounds in the stored space */
+          __forceinline Vec3fa binCenter(const PrimRef& ref) const
+          {
+            Geometry* mesh = (Geometry*) scene->get(ref.geomID());
+            BBox3fa bounds = mesh->vbounds(space,ref.primID());
+            return embree::center2(bounds);
+          }
+          
+          /*! returns bounds and centroid used for binning, both derived from the same vbounds query */
+          __forceinline void binBoundsAndCenter(const PrimRef& ref, BBox3fa& bounds_o, Vec3fa& center_o) const
+          {
+            Geometry* mesh = (Geometry*) scene->get(ref.geomID());
+            BBox3fa bounds = mesh->vbounds(space,ref.primID());
+            bounds_o = bounds;
+            center_o = embree::center2(bounds);
+          }
+
+        private:
+          Scene* scene;
+          const LinearSpace3fa space; // stored by value (copied from the constructor argument)
+        };
+        
+        /*! finds the best split; ranges with 10000 or more primitives are binned in parallel */
+        __forceinline const Split find(const PrimInfoRange& pinfo, const size_t logBlockSize, const LinearSpace3fa& space)
+        {
+          /* large ranges are the uncommon case, everything else takes the serial instantiation */
+          if (unlikely(pinfo.size() >= 10000))
+            return find_template<true>(pinfo,logBlockSize,space);
+          return find_template<false>(pinfo,logBlockSize,space);
+        }
+
+        /*! bins the range with bounds evaluated in the given space and returns the best split of the bins */
+        template<bool parallel>
+        const Split find_template(const PrimInfoRange& set, const size_t logBlockSize, const LinearSpace3fa& space)
+        {
+          BinBoundsAndCenter boundsInSpace(scene,space);
+          const BinMapping<BINS> binMapping(set);
+          Binner bins(empty);
+          bin_serial_or_parallel<parallel>(bins,prims,set.begin(),set.end(),size_t(4096),binMapping,boundsInSpace);
+          return bins.best(binMapping,logBlockSize);
+        }
+        
+        /*! array partitioning; sets with 10000 or more primitives take the parallel instantiation */
+        __forceinline void split(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          if (unlikely(set.size() >= 10000))
+            split_template<true>(split,space,set,lset,rset);
+          else
+            split_template<false>(split,space,set,lset,rset);
+        }
+
+        /*! array partitioning: reorders prims[begin,end) in place around the split and constructs lset/rset; parallel selects the partitioning routine at compile time */
+        template<bool parallel>
+        __forceinline void split_template(const Split& split, const LinearSpace3fa& space, const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          if (!split.valid()) { // no usable binning split: sort the range and halve it by index instead
+            deterministic_order(set);
+            return splitFallback(set,lset,rset);
+          }
+          
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          CentGeomBBox3fa local_left(empty);
+          CentGeomBBox3fa local_right(empty);
+          const int splitPos = split.pos;
+          const int splitDim = split.dim;
+          BinBoundsAndCenter binBoundsAndCenter(scene,space); // bins are computed from bounds evaluated in the given space
+
+          size_t center = 0; // becomes the index where the right side starts
+          if (!parallel) // use the template parameter (as the aligned heuristic does) instead of re-testing the set size at run time
+            center = serial_partitioning(prims,begin,end,local_left,local_right,
+                                         [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; },
+                                         [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); });
+          else
+            center = parallel_partitioning(prims,begin,end,EmptyTy(),local_left,local_right,
+                                           [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,binBoundsAndCenter)[splitDim] < splitPos; },
+                                           [] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend_center2(ref); },
+                                           [] (CentGeomBBox3fa& pinfo0,const CentGeomBBox3fa& pinfo1) { pinfo0.merge(pinfo1); },
+                                           128);
+          /* construct the two child ranges in place in the caller-provided objects */
+          new (&lset) PrimInfoRange(begin,center,local_left);
+          new (&rset) PrimInfoRange(center,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+        }
+        
+        void deterministic_order(const range<size_t>& set) 
+        {
+          /* parallel partitioning destroys the original primitive order, so the range is sorted to get a reproducible order */
+          std::sort(prims + set.begin(), prims + set.end());
+        }
+        
+        void splitFallback(const range<size_t>& set, PrimInfoRange& lset, PrimInfoRange& rset)
+        {
+          /* median split by index: [first,mid) goes left, [mid,last) goes right */
+          const size_t first = set.begin();
+          const size_t last  = set.end();
+          const size_t mid   = (first + last)/2;
+
+          CentGeomBBox3fa lbounds(empty);
+          CentGeomBBox3fa rbounds(empty);
+          for (size_t k=first; k<mid;  k++) lbounds.extend_center2(prims[k]);
+          for (size_t k=mid;   k<last; k++) rbounds.extend_center2(prims[k]);
+
+          /* both results are constructed in place in the caller-provided objects */
+          new (&lset) PrimInfoRange(first,mid,lbounds);
+          new (&rset) PrimInfoRange(mid,last,rbounds);
+        }
+        
+      private:
+        Scene* const scene;
+        PrimRef* const prims;
+      };
+
+    /*! Performs standard object binning on SetMB primitive sets, with bounds evaluated in a caller-supplied linear space */
+    template<typename PrimRefMB, size_t BINS>
+      struct UnalignedHeuristicArrayBinningMB
+      {
+        typedef BinSplit<BINS> Split;
+        typedef typename PrimRefMB::BBox BBox;
+        typedef BinInfoT<BINS,PrimRefMB,BBox> ObjectBinner;
+        
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        UnalignedHeuristicArrayBinningMB(Scene* scene) // remembers the scene used to look up geometries by geomID
+        : scene(scene) {}
+
+        const LinearSpace3fa computeAlignedSpaceMB(Scene* scene, const SetMB& set) // NOTE(review): the scene parameter shadows the scene member; only the parameter is used here
+        {
+          Vec3fa axis0(0,0,1); // used unchanged when no primitive yields a usable direction
+          uint64_t bestGeomPrimID = -1; // converts to the maximum uint64_t value, so the first smaller ID is accepted
+
+          /*! find curve with minimum ID that defines valid direction */
+          for (size_t i=set.begin(); i<set.end(); i++)
+          {
+            const PrimRefMB& prim = (*set.prims)[i];
+            const unsigned int geomID = prim.geomID();
+            const unsigned int primID = prim.primID();
+            const uint64_t geomprimID = prim.ID64();
+            if (geomprimID >= bestGeomPrimID) continue; // only a smaller ID may replace the current choice
+            
+            const Geometry* mesh = scene->get(geomID);
+            const range<int> tbounds = mesh->timeSegmentRange(set.time_range);
+            if (tbounds.size() == 0) continue; // no time segment available for the set's time range
+
+            const size_t t = (tbounds.begin()+tbounds.end())/2; // midpoint of the time segment range
+            const Vec3fa axis1 = mesh->computeDirection(primID,t);
+            if (sqr_length(axis1) > 1E-18f) { // directions with (almost) zero length are rejected before normalizing
+              axis0 = normalize(axis1);
+              bestGeomPrimID = geomprimID;
+            }
+          }
+
+          return frame(axis0).transposed();
+        }
+
+        struct BinBoundsAndCenter // supplies bounds and centers for binning from the geometry's vlinearBounds over a fixed time range and space
+        {
+          __forceinline BinBoundsAndCenter(Scene* scene, BBox1f time_range, const LinearSpace3fa& space)
+            : scene(scene), time_range(time_range), space(space) {}
+          
+          /*! returns center for binning: center2 of the linear bounds interpolated at 0.5 */
+          template<typename PrimRef>
+          __forceinline Vec3fa binCenter(const PrimRef& ref) const
+          {
+            Geometry* mesh = scene->get(ref.geomID());
+            LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+            return center2(lbounds.interpolate(0.5f));
+          }
+
+          /*! returns bounds and centroid used for binning; the bounds are the linear bounds interpolated at 0.5 */
+          __noinline void binBoundsAndCenter (const PrimRefMB& ref, BBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX
+          {
+            Geometry* mesh = scene->get(ref.geomID());
+            LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+            bounds_o = lbounds.interpolate(0.5f);
+            center_o = center2(bounds_o);
+          }
+
+          /*! returns bounds and centroid used for binning; this overload returns the full linear bounds */
+          __noinline void binBoundsAndCenter (const PrimRefMB& ref, LBBox3fa& bounds_o, Vec3fa& center_o) const // __noinline is workaround for ICC16 bug under MacOSX
+          {
+            Geometry* mesh = scene->get(ref.geomID());
+            LBBox3fa lbounds = mesh->vlinearBounds(space,ref.primID(),time_range);
+            bounds_o = lbounds;
+            center_o = center2(lbounds.interpolate(0.5f));
+          }
+          
+        private:
+          Scene* scene;
+          BBox1f time_range;
+          const LinearSpace3fa space; // stored by value (copied from the constructor argument)
+        };
+
+        /*! finds the best split with bounds evaluated in the given space; the result is marked SPLIT_FALLBACK when binning finds no valid split */
+        const Split find(const SetMB& set, const size_t logBlockSize, const LinearSpace3fa& space)
+        {
+          BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space);
+          ObjectBinner binner(empty);
+          const BinMapping<BINS> mapping(set.size(),set.centBounds);
+          bin_parallel(binner,set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,mapping,binBoundsAndCenter);
+          Split osplit = binner.best(mapping,logBlockSize);
+          osplit.sah *= set.time_range.size(); // scale the cost by the size of the set's time range
+          if (!osplit.valid()) osplit.data = Split::SPLIT_FALLBACK; // use fallback split
+          return osplit;
+        }
+        
+        /*! array partitioning: reorders set.prims in place and constructs lset/rset over the two halves */
+        __forceinline void split(const Split& split, const LinearSpace3fa& space, const SetMB& set, SetMB& lset, SetMB& rset)
+        {
+          BinBoundsAndCenter binBoundsAndCenter(scene,set.time_range,space);
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfoMB left = empty;
+          PrimInfoMB right = empty;
+          const vint4 vSplitPos(split.pos);
+          const vbool4 vSplitMask(1 << split.dim); // single bit selecting the split dimension
+          auto isLeft = [&] (const PrimRefMB &ref) { return any(((vint4)split.mapping.bin_unsafe(ref,binBoundsAndCenter) < vSplitPos) & vSplitMask); }; // left when the bin index in the split dimension is below split.pos
+          auto reduction = [] (PrimInfoMB& pinfo, const PrimRefMB& ref) { pinfo.add_primref(ref); };
+          auto reduction2 = [] (PrimInfoMB& pinfo0,const PrimInfoMB& pinfo1) { pinfo0.merge(pinfo1); };
+          size_t center = parallel_partitioning(set.prims->data(),begin,end,EmptyTy(),left,right,isLeft,reduction,reduction2,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD);
+          new (&lset) SetMB(left,set.prims,range<size_t>(begin,center),set.time_range); // both children keep the parent's prim array and time range
+          new (&rset) SetMB(right,set.prims,range<size_t>(center,end ),set.time_range);
+        }
+
+      private:
+        Scene* scene;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h
new file mode 100644
index 0000000000..21f18c0208
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_openmerge_array.h
@@ -0,0 +1,443 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+// TODO: 
+//       - adjust parallel build thresholds
+//       - openNodesBasedOnExtend should consider max extended size
+  
+#pragma once
+
+#include "heuristic_binning.h"
+#include "heuristic_spatial.h"
+
+/* stop opening if all bref.geomIDs are the same */
+#define EQUAL_GEOMID_STOP_CRITERIA 1
+
+/* 10% spatial extend threshold */
+#define MAX_EXTEND_THRESHOLD   0.1f
+
+/* maximum is 8 children */
+#define MAX_OPENED_CHILD_NODES 8
+
+/* open until all build refs are below threshold size in one step */
+#define USE_LOOP_OPENING 0
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Performs object binning on build references that can first be opened (replaced by their children) into an extended range */
+    template<typename NodeOpenerFunc, typename PrimRef, size_t OBJECT_BINS>
+      struct HeuristicArrayOpenMergeSAH
+      {
+        typedef BinSplit<OBJECT_BINS> Split;
+        typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> Binner;
+        
+        static const size_t PARALLEL_THRESHOLD = 1024;
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 512;
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        static const size_t MOVE_STEP_SIZE = 64;
+        static const size_t CREATE_SPLITS_STEP_SIZE = 128;
+
+        __forceinline HeuristicArrayOpenMergeSAH () // NOTE(review): only prims0 is initialized here; max_open_size is not in this initializer list - confirm it is never read on a default-constructed object
+          : prims0(nullptr) {}
+        
+        /*! remember prim array and node opener; max_open_size must not exceed MAX_OPENED_CHILD_NODES (checked by assert only) */
+        __forceinline HeuristicArrayOpenMergeSAH (const NodeOpenerFunc& nodeOpenerFunc, PrimRef* prims0, size_t max_open_size)
+          : prims0(prims0), nodeOpenerFunc(nodeOpenerFunc), max_open_size(max_open_size) 
+        {
+          assert(max_open_size <= MAX_OPENED_CHILD_NODES);
+        }
+
+        struct OpenHeuristic // predicate deciding whether a build reference should be opened
+        {
+          __forceinline OpenHeuristic( const PrimInfoExtRange& pinfo )
+          {
+            const Vec3fa diag = pinfo.geomBounds.size();
+            dim = maxDim(diag); // dimension in which the set's geometry bounds are largest
+            assert(diag[dim] > 0.0f);
+            inv_max_extend = 1.0f / diag[dim]; // NOTE(review): a zero extent is only caught by the assert above
+          }
+
+          __forceinline bool operator () ( PrimRef& prim ) const { // true for non-leaf references whose extent in dim exceeds MAX_EXTEND_THRESHOLD of the set's extent
+            return !prim.node.isLeaf() && prim.bounds().size()[dim] * inv_max_extend > MAX_EXTEND_THRESHOLD;
+          }
+
+        private:
+          size_t dim;
+          float inv_max_extend;
+        };
+
+        /*! compute extended ranges: distributes the extended range of set between lset and rset in proportion to lweight and rweight */
+        __forceinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight)
+        {
+          assert(set.ext_range_size() > 0);
+          const float left_factor           = (float)lweight / (lweight + rweight); // NOTE(review): presumably callers guarantee lweight + rweight > 0 - confirm
+          const size_t ext_range_size       = set.ext_range_size();
+          const size_t left_ext_range_size  = min((size_t)(floorf(left_factor * ext_range_size)),ext_range_size); // clamped so the left share never exceeds the total
+          const size_t right_ext_range_size = ext_range_size - left_ext_range_size; // the right side receives the remainder
+          lset.set_ext_range(lset.end() + left_ext_range_size);
+          rset.set_ext_range(rset.end() + right_ext_range_size);
+        }
+
+        /*! move ranges: shifts the right range to the right by the size of the left child's extended range */
+        __forceinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t left_ext_range_size = lset.ext_range_size();
+          const size_t right_size = rset.size();
+
+          /* has the left child an extended range? */
+          if (left_ext_range_size > 0)
+          {
+            /* left extended range smaller than right range ? */
+            if (left_ext_range_size < right_size)
+            {
+              /* only move a small part of the beginning of the right range to the end */
+              parallel_for( rset.begin(), rset.begin()+left_ext_range_size, MOVE_STEP_SIZE, [&](const range<size_t>& r) {                  
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+right_size] = prims0[i]; // element i lands right_size slots further, i.e. just behind the old end of the right range
+                });
+            }
+            else
+            {
+              /* no overlap, move entire right range to new location, can be made fully parallel */
+              parallel_for( rset.begin(), rset.end(), MOVE_STEP_SIZE,  [&](const range<size_t>& r) {
+                  for (size_t i=r.begin(); i<r.end(); i++)
+                    prims0[i+left_ext_range_size] = prims0[i]; // the shift is at least right_size here, so source and destination do not overlap
+                });
+            }
+            /* update right range */
+            assert(rset.ext_end() + left_ext_range_size == set.ext_end());
+            rset.move_right(left_ext_range_size);
+          }
+        }
+
+        /* returns (estimate of the extra elements required when opening, true if all primitives have the geomID of the first primitive) */
+        __noinline std::pair<size_t,bool> getProperties(const PrimInfoExtRange& set)
+        {
+          const OpenHeuristic heuristic(set);
+          const unsigned int geomID = prims0[set.begin()].geomID();
+          
+          auto body = [&] (const range<size_t>& r) -> std::pair<size_t,bool> { // per-block partial result
+            bool commonGeomID = true;
+            size_t opens = 0;
+            for (size_t i=r.begin(); i<r.end(); i++) {
+              commonGeomID &= prims0[i].geomID() == geomID; 
+              if (heuristic(prims0[i]))
+                opens += prims0[i].node.getN()-1; // coarse approximation
+            }
+            return std::pair<size_t,bool>(opens,commonGeomID); 
+          };
+          auto reduction = [&] (const std::pair<size_t,bool>& b0, const std::pair<size_t,bool>& b1) -> std::pair<size_t,bool> { // counts are summed, flags are and-ed
+            return std::pair<size_t,bool>(b0.first+b1.first,b0.second && b1.second); 
+          };
+          return parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,std::pair<size_t,bool>(0,true),body,reduction);
+        }
+
+        // FIXME: should consider maximum available extended size (the capacity of the extended range is only checked by asserts here)
+        __noinline void openNodesBasedOnExtend(PrimInfoExtRange& set)
+        {
+          const OpenHeuristic heuristic(set);
+          const size_t ext_range_start = set.end(); // newly created references are appended from here on
+
+          if (false && set.size() < PARALLEL_THRESHOLD) // serial path, disabled by the constant false: the parallel path below is always taken
+          {
+            size_t extra_elements = 0;
+            for (size_t i=set.begin(); i<set.end(); i++)
+            {
+              if (heuristic(prims0[i]))
+              {
+                PrimRef tmp[MAX_OPENED_CHILD_NODES];
+                const size_t n = nodeOpenerFunc(prims0[i],tmp);
+                assert(extra_elements + n-1 <= set.ext_range_size());
+                for (size_t j=0; j<n; j++)
+                  set.extend_center2(tmp[j]);
+
+                prims0[i] = tmp[0];
+                for (size_t j=1; j<n; j++)
+                  prims0[ext_range_start+extra_elements+j-1] = tmp[j]; 
+                extra_elements += n-1;
+              }
+            }
+            set._end += extra_elements;
+          }
+          else 
+          {
+            std::atomic<size_t> ext_elements; // number of slots of the extended range handed out so far
+            ext_elements.store(0);
+            PrimInfo info = parallel_reduce( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, PrimInfo(empty), [&](const range<size_t>& r) -> PrimInfo {
+                PrimInfo info(empty);
+                for (size_t i=r.begin(); i<r.end(); i++)
+                  if (heuristic(prims0[i]))
+                  {
+                    PrimRef tmp[MAX_OPENED_CHILD_NODES];
+                    const size_t n = nodeOpenerFunc(prims0[i],tmp); // NOTE(review): assumes n >= 1, otherwise n-1 wraps around - confirm against the opener
+                    const size_t ID = ext_elements.fetch_add(n-1); // reserves n-1 consecutive slots for this reference
+                    assert(ID + n-1 <= set.ext_range_size());
+
+                    for (size_t j=0; j<n; j++)
+                      info.extend_center2(tmp[j]);
+
+                    prims0[i] = tmp[0]; // the first child replaces the opened reference in place
+                    for (size_t j=1; j<n; j++)
+                      prims0[ext_range_start+ID+j-1] = tmp[j]; // the remaining children go into the reserved slots
+                  }
+                return info;
+              }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });
+            set.centBounds.extend(info.centBounds); // only the centroid bounds of the set are updated on this path
+            assert(ext_elements.load() <= set.ext_range_size());
+            set._end += ext_elements.load(); // the appended references become part of the regular range
+          }
+        } 
+
+        __noinline void openNodesBasedOnExtendLoop(PrimInfoExtRange& set, const size_t est_new_elements) // serial variant that keeps opening in passes while the estimated growth fits into the extended range
+        {
+          const OpenHeuristic heuristic(set);
+          size_t next_iteration_extra_elements = est_new_elements;          
+          
+          while (next_iteration_extra_elements <= set.ext_range_size()) 
+          {
+            next_iteration_extra_elements = 0;
+            size_t extra_elements = 0;
+            const size_t ext_range_start = set.end(); // newly created references of this pass are appended from here on
+
+            for (size_t i=set.begin(); i<set.end(); i++)
+            {
+              if (heuristic(prims0[i]))
+              {
+                PrimRef tmp[MAX_OPENED_CHILD_NODES];
+                const size_t n = nodeOpenerFunc(prims0[i],tmp);
+                assert(extra_elements + n-1 <= set.ext_range_size());
+                for (size_t j=0;j<n;j++)
+                  set.extend_center2(tmp[j]);
+                  
+                prims0[i] = tmp[0]; // the first child replaces the opened reference in place
+                for (size_t j=1;j<n;j++)
+                  prims0[ext_range_start+extra_elements+j-1] = tmp[j]; 
+                extra_elements += n-1;
+
+                for (size_t j=0; j<n; j++) // estimate how much the next pass would add
+                  if (heuristic(tmp[j]))
+                    next_iteration_extra_elements += tmp[j].node.getN()-1; // coarse approximation
+
+              }
+            }
+            assert( extra_elements <= set.ext_range_size());
+            set._end += extra_elements; // the appended references become part of the regular range
+
+            for (size_t i=set.begin();i<set.end();i++)
+              assert(prims0[i].numPrimitives() > 0);
+
+            if (unlikely(next_iteration_extra_elements == 0)) break; // nothing left to open
+          }
+        } 
+
+        /*! Prepares the set for splitting and returns the best object split.
+         *
+         *  As long as the set still has an extended range, references may be opened
+         *  into it before binning. Opening is disabled (by collapsing the extended
+         *  range to set.end()) when a small set has no overlapping bounds, when
+         *  EQUAL_GEOMID_STOP_CRITERIA is enabled and getProperties() reports its
+         *  flag, or when less than max_open_size-1 slots remain.
+         *
+         *  \param set          range to split; may grow and lose its extended range
+         *  \param logBlockSize forwarded to the object split search
+         *  \return the split found by object_find(), or a default constructed Split
+         *          for sets with at most one element
+         */
+        __noinline const Split find(PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          /* single element */
+          if (set.size() <= 1)
+            return Split();
+
+          /* disable opening if there is no overlap */
+          const size_t D = 4;
+          if (unlikely(set.has_ext_range() && set.size() <= D))
+          {
+            bool disjoint = true;
+            /* Test every unordered pair exactly once. The inner loop has to start at
+             * j+1: starting at set.begin()+1 compares a primitive with itself as soon
+             * as the set has three elements, which presumably always reports overlap
+             * for non-empty bounds and so never allowed opening to be disabled. */
+            for (size_t j=set.begin(); disjoint && j<set.end()-1; j++) {
+              for (size_t i=j+1; i<set.end(); i++) {
+                if (conjoint(prims0[j].bounds(),prims0[i].bounds())) {
+                  disjoint = false; break;
+                }
+              }
+            }
+            if (disjoint) set.set_ext_range(set.end()); /* disables opening */
+          }
+
+          std::pair<size_t,bool> p(0,false);
+
+          /* disable opening when all primitives are from same geometry */
+          if (unlikely(set.has_ext_range()))
+          {
+            p =  getProperties(set);
+#if EQUAL_GEOMID_STOP_CRITERIA == 1
+            if (p.second) set.set_ext_range(set.end()); /* disable opening */
+#endif
+          }
+
+          /* open nodes when we have sufficient space available */
+          if (unlikely(set.has_ext_range()))
+          {
+#if USE_LOOP_OPENING == 1
+            openNodesBasedOnExtendLoop(set,p.first);
+#else
+            if (p.first <= set.ext_range_size())
+              openNodesBasedOnExtend(set);
+#endif
+
+            /* disable opening when insufficient space for opening a node is available */
+            if (set.ext_range_size() < max_open_size-1)
+              set.set_ext_range(set.end()); /* disable opening */
+          }
+
+          /* find best split */
+          return object_find(set,logBlockSize);
+        }
+
+
+        /*! Selects the object split search by set size: sets smaller than
+         *  PARALLEL_THRESHOLD use the sequential search, all others the parallel one. */
+        __forceinline const Split object_find(const PrimInfoExtRange& set,const size_t logBlockSize)
+        {
+          const bool useParallel = !(set.size() < PARALLEL_THRESHOLD);
+          return useParallel ? parallel_object_find  (set,logBlockSize)
+                             : sequential_object_find(set,logBlockSize);
+        }
+
+        /*! Bins prims0[set.begin(), set.end()) with a single binner and returns its best split. */
+        __noinline const Split sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          /* the bin mapping is derived from the centroid bounds of the set */
+          const BinMapping<OBJECT_BINS> binMapping(set.centBounds);
+
+          Binner bins(empty);
+          bins.bin(prims0,set.begin(),set.end(),binMapping);
+          return bins.best(binMapping,logBlockSize);
+        }
+
+        /*! Bins the set through parallel_reduce (passing PARALLEL_FIND_BLOCK_SIZE),
+         *  merging the per-range binners, and returns the best split of the result. */
+        __noinline const Split parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          const BinMapping<OBJECT_BINS> mapping(set.centBounds);
+          const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+
+          /* bins one sub-range into a fresh binner */
+          auto binRange = [&] (const range<size_t>& r) -> Binner {
+            Binner partial(empty);
+            partial.bin(prims0+r.begin(),r.size(),_mapping);
+            return partial;
+          };
+
+          /* combines two partial results into a new binner */
+          auto mergeBins = [&] (const Binner& b0, const Binner& b1) -> Binner {
+            Binner merged = b0;
+            merged.merge(b1,_mapping.size());
+            return merged;
+          };
+
+          Binner identity(empty);
+          Binner total = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,identity,binRange,mergeBins);
+          return total.best(mapping,logBlockSize);
+        }
+        
+        /*! Partitions the set according to the given split.
+         *
+         *  An invalid split falls back to sorting the range and halving it. Otherwise
+         *  the range is partitioned sequentially or in parallel depending on its size,
+         *  and, if the set has an extended range, that range is distributed to the
+         *  children afterwards.
+         *
+         *  \param split split to perform
+         *  \param set_i range to partition (copied, the argument itself is not modified)
+         *  \param lset  receives the left child range
+         *  \param rset  receives the right child range
+         */
+        __noinline void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          PrimInfoExtRange set = set_i;
+
+          /* invalid split: fall back to a deterministic median split */
+          if (unlikely(!split.valid())) {
+            deterministic_order(set);
+            splitFallback(set,lset,rset);
+            return;
+          }
+
+          /* object split, small sets are partitioned on the calling thread */
+          const std::pair<size_t,size_t> ext_weights =
+            likely(set.size() < PARALLEL_THRESHOLD) ? sequential_object_split(split,set,lset,rset)
+                                                    : parallel_object_split  (split,set,lset,rset);
+
+          /* without an extended range there is nothing left to distribute */
+          if (likely(!set.has_ext_range()))
+            return;
+
+          /* set extended child ranges and move right split range */
+          setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second);
+          moveExtentedRange(set,lset,rset);
+        }
+
+        /*! Partitions prims0[set.begin(), set.end()) on the calling thread.
+         *
+         *  \param split split whose mapping decides the side of each reference
+         *  \param set   range to partition
+         *  \param lset  constructed in place as [begin,center) without extended range
+         *  \param rset  constructed in place as [center,end) without extended range
+         *  \return the sizes of the left and right primitive infos
+         */
+        std::pair<size_t,size_t> sequential_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t first = set.begin();
+          const size_t last  = set.end();
+
+          /* split position and dimension mask in the form expected by bin_unsafe */
+          const unsigned int pos     = split.pos;
+          const unsigned int dim     = split.dim;
+          const unsigned int dimMask = (unsigned int)1 << dim;
+          const vint4 vSplitPos(pos);
+          const vbool4 vSplitMask( (int)dimMask );
+
+          auto isLeft     = [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+          auto accumulate = [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); };
+
+          PrimInfo leftInfo(empty);
+          PrimInfo rightInfo(empty);
+          const size_t center = serial_partitioning(prims0,first,last,leftInfo,rightInfo,isLeft,accumulate);
+
+          new (&lset) PrimInfoExtRange(first,center,center,leftInfo);
+          new (&rset) PrimInfoExtRange(center,last,last,rightInfo);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(leftInfo.size(),rightInfo.size());
+        }
+
+        /*! Partitions prims0[set.begin(), set.end()) through parallel_partitioning.
+         *
+         *  \param split split whose mapping decides the side of each reference
+         *  \param set   range to partition
+         *  \param lset  constructed in place as [begin,center) without extended range
+         *  \param rset  constructed in place as [center,end) without extended range
+         *  \return the sizes of the left and right primitive infos
+         */
+        __noinline std::pair<size_t,size_t> parallel_object_split(const Split& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t first = set.begin();
+          const size_t last  = set.end();
+
+          /* split position and dimension mask in the form expected by bin_unsafe */
+          const unsigned int pos     = split.pos;
+          const unsigned int dim     = split.dim;
+          const unsigned int dimMask = (unsigned int)1 << dim;
+          const vint4 vSplitPos(pos);
+          const vbool4 vSplitMask( (int)dimMask );
+          auto isLeft = [&] (const PrimRef& ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+
+          PrimInfo leftInfo(empty);
+          PrimInfo rightInfo(empty);
+          const size_t center = parallel_partitioning(
+            prims0,first,last,EmptyTy(),leftInfo,rightInfo,isLeft,
+            [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref); },
+            [] (PrimInfo& pinfo0,const PrimInfo& pinfo1) { pinfo0.merge(pinfo1); },
+            PARALLEL_PARTITION_BLOCK_SIZE);
+
+          new (&lset) PrimInfoExtRange(first,center,center,leftInfo);
+          new (&rset) PrimInfoExtRange(center,last,last,rightInfo);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+
+          return std::pair<size_t,size_t>(leftInfo.size(),rightInfo.size());
+        }
+
+        /*! Sorts prims0[set.begin(), set.end()) with std::sort; required because
+         *  parallel partitioning destroys the original primitive order. */
+        void deterministic_order(const extended_range<size_t>& set)
+        {
+          PrimRef* const first = prims0 + set.begin();
+          PrimRef* const last  = prims0 + set.end();
+          std::sort(first,last);
+        }
+
+        /*! Splits the range in the middle without consulting any heuristic.
+         *
+         *  \param set  range to split
+         *  \param lset constructed in place as [begin,center) without extended range
+         *  \param rset constructed in place as [center,end) without extended range
+         *
+         *  If the set has an extended range, it is distributed to the children using
+         *  the end counters of the two accumulated primitive infos as weights.
+         */
+        __forceinline void splitFallback(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t first  = set.begin();
+          const size_t last   = set.end();
+          const size_t middle = (first + last)/2;
+
+          /* accumulate the primitive info of the left half */
+          PrimInfo leftInfo(empty);
+          for (size_t i=first; i<middle; i++)
+            leftInfo.add_center2(prims0[i]);
+
+          /* accumulate the primitive info of the right half */
+          PrimInfo rightInfo(empty);
+          for (size_t i=middle; i<last; i++)
+            rightInfo.add_center2(prims0[i]);
+
+          /* the weights are taken before the children are constructed */
+          const size_t lweight = leftInfo.end;
+          const size_t rweight = rightInfo.end;
+
+          new (&lset) PrimInfoExtRange(first,middle,middle,leftInfo);
+          new (&rset) PrimInfoExtRange(middle,last,last,rightInfo);
+
+          /* nothing more to do without an extended range */
+          if (!set.has_ext_range())
+            return;
+
+          setExtentedRanges(set,lset,rset,lweight,rweight);
+          moveExtentedRange(set,lset,rset);
+        }
+        
+      private:
+        PrimRef* const prims0;
+        const NodeOpenerFunc& nodeOpenerFunc;
+        size_t max_open_size;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h
new file mode 100644
index 0000000000..d8ca6cb92c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial.h
@@ -0,0 +1,414 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "priminfo.h"
+
+namespace embree
+{
+  /*! Number of upper bits of a 32 bit geomID that SpatialBinInfo::bin shifts out and reads as
+   *  the per-primitive `splits` value. */
+  static const unsigned int RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS = 5;
+
+  namespace isa
+  {
+
+    /*! Linear mapping from positions to bin indices in the range [0,BINS-1] */
+    template<size_t BINS>
+      struct SpatialBinMapping
+      {
+      public:
+        /*! performs no explicit initialization of the mapping */
+        __forceinline SpatialBinMapping() {}
+
+        /*! derives the mapping from the geometry bounds of pinfo */
+        __forceinline SpatialBinMapping(const CentGeomBBox3fa& pinfo)
+        {
+          const vfloat4 lo = (vfloat4) pinfo.geomBounds.lower;
+          const vfloat4 hi = (vfloat4) pinfo.geomBounds.upper;
+
+          /* extents at or below this tolerance get a zero scale, which marks the dimension invalid */
+          const vfloat4 tolerance = 128.0f*vfloat4(ulp)*max(abs(lo),abs(hi));
+          const vfloat4 extent = max(tolerance,(vfloat4) pinfo.geomBounds.size());
+          const vbool4 degenerate = hi-lo <= tolerance;
+
+          scale = select(degenerate,vfloat4(0.0f),vfloat4(BINS)/extent);
+          ofs  = lo;
+          inv_scale = 1.0f / scale;
+        }
+
+        /*! slower but safe binning: the bin index of a point, clamped to the valid range */
+        __forceinline vint4 bin(const Vec3fa& p) const
+        {
+          return clamp(floori((vfloat4(p)-ofs)*scale),vint4(0),vint4(BINS-1));
+        }
+
+        /*! clamped bin indices of the lower (first) and upper (second) corner of a box */
+        __forceinline std::pair<vint4,vint4> bin(const BBox3fa& b) const
+        {
+#if defined(__AVX__)
+          /* both corners are loaded and mapped with a single 8-wide operation */
+          const vfloat8 ofs8(ofs);
+          const vfloat8 scale8(scale);
+          const vint8 raw     = floori((vfloat8::loadu(&b)-ofs8)*scale8);
+          const vint8 clamped = clamp(raw,vint8(zero),vint8(BINS-1));
+          return std::pair<vint4,vint4>(extract4<0>(clamped),extract4<1>(clamped));
+#else
+          const vint4 rawLower = floori((vfloat4(b.lower)-ofs)*scale);
+          const vint4 rawUpper = floori((vfloat4(b.upper)-ofs)*scale);
+          return std::pair<vint4,vint4>(clamp(rawLower,vint4(0),vint4(BINS-1)),
+                                        clamp(rawUpper,vint4(0),vint4(BINS-1)));
+#endif
+        }
+
+        /*! calculates left spatial position of bin */
+        __forceinline float pos(const size_t bin, const size_t dim) const {
+          return madd(float(bin),inv_scale[dim],ofs[dim]);
+        }
+
+        /*! calculates left spatial position of N bins at once */
+        template<size_t N>
+        __forceinline vfloat<N> posN(const vfloat<N> bin, const size_t dim) const {
+          return madd(bin,vfloat<N>(inv_scale[dim]),vfloat<N>(ofs[dim]));
+        }
+
+        /*! returns true if the mapping has a zero scale in the given dimension */
+        __forceinline bool invalid(const size_t dim) const {
+          return scale[dim] == 0.0f;
+        }
+
+      public:
+        vfloat4 ofs,scale,inv_scale;  //!< linear function that maps to bin ID
+      };
+
+    /*! Describes a spatial split: cost, dimension, bin position, element counts and bin mapping */
+    template<size_t BINS>
+      struct SpatialBinSplit
+      {
+        /*! construct an invalid split by default (dim == -1, infinite cost) */
+        __forceinline SpatialBinSplit()
+          : sah(inf), dim(-1), pos(0), left(-1), right(-1), factor(1.0f) {}
+
+        /*! constructs specified split with unknown element counts (-1) and factor 1 */
+        __forceinline SpatialBinSplit(float sah, int dim, int pos, const SpatialBinMapping<BINS>& mapping)
+          : SpatialBinSplit(sah,dim,pos,-1,-1,1.0f,mapping) {}
+
+        /*! constructs fully specified split */
+        __forceinline SpatialBinSplit(float sah, int dim, int pos, int left, int right, float factor, const SpatialBinMapping<BINS>& mapping)
+          : sah(sah), dim(dim), pos(pos), left(left), right(right), factor(factor), mapping(mapping) {}
+
+        /*! tests if this split is valid */
+        __forceinline bool valid() const { return dim != -1; }
+
+        /*! returns the stored surface area heuristic cost of the split */
+        __forceinline float splitSAH() const { return sah; }
+
+        /*! stream output */
+        friend embree_ostream operator<<(embree_ostream cout, const SpatialBinSplit& split) {
+          cout << "SpatialBinSplit { sah = " << split.sah;
+          cout << ", dim = " << split.dim;
+          cout << ", pos = " << split.pos;
+          cout << ", left = " << split.left;
+          cout << ", right = " << split.right;
+          cout << ", factor = " << split.factor;
+          return cout << "}";
+        }
+
+      public:
+        float sah;                 //!< SAH cost of the split
+        int   dim;                 //!< split dimension
+        int   pos;                 //!< split position
+        int   left;                //!< number of elements on the left side
+        int   right;               //!< number of elements on the right side
+        float factor;              //!< factor splitting the extended range
+        SpatialBinMapping<BINS> mapping; //!< mapping into bins
+      };
+    
+    /*! stores all binning information */
+    template<size_t BINS, typename PrimRef>
+      struct __aligned(64) SpatialBinInfo
+    {
+      /*! performs no explicit initialization of the bins */
+      SpatialBinInfo() {}
+
+      /*! creates binning information with all bins cleared */
+      __forceinline SpatialBinInfo(EmptyTy) { clear(); }
+
+      /*! resets every bin: empty bounds in all three dimensions and zero begin/end counters */
+      __forceinline void clear()
+      {
+        for (size_t binID=0; binID<BINS; binID++)
+        {
+          for (size_t dim=0; dim<3; dim++)
+            bounds[binID][dim] = empty;
+
+          numEnd  [binID] = 0;
+          numBegin[binID] = 0;
+        }
+      }
+      
+      /*! Adds binning data for one dimension.
+       *
+       *  \param dim     dimension the data belongs to
+       *  \param beginID bin whose begin counter is increased
+       *  \param endID   bin whose end counter is increased
+       *  \param binID   bin whose bounds are extended by b
+       *  \param b       bounds to merge into bin binID
+       *  \param n       amount added to both counters (defaults to 1)
+       */
+      __forceinline void add(const size_t dim,
+                             const size_t beginID,
+                             const size_t endID,
+                             const size_t binID,
+                             const BBox3fa &b,
+                             const size_t n = 1)
+      {
+        assert(beginID < BINS);
+        assert(endID < BINS);
+        assert(binID < BINS);
+
+        const unsigned int amount = (unsigned int)n;
+        bounds  [binID][dim].extend(b);
+        numBegin[beginID][dim] += amount;
+        numEnd  [endID][dim] += amount;
+      }
+
+      /*! merges b into the bounds of bin binID for dimension dim; counters stay untouched */
+      __forceinline void extend(const size_t dim,
+                                const size_t binID,
+                                const BBox3fa &b)
+      {
+        assert(binID < BINS);
+        BBox3fa& binBounds = bounds[binID][dim];
+        binBounds.extend(b);
+      }
+      
+      /*! Bins an array of primitive references, splitting references that span several bins.
+       *
+       *  \param splitPrimitive callable invoked as splitPrimitive(prim,dim,pos,left,right) to cut
+       *                        a reference at position pos of dimension dim
+       *  \param prims          references to bin
+       *  \param N              number of references
+       *  \param mapping        mapping from positions to bins
+       */
+      template<typename SplitPrimitive>
+        __forceinline void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t N, const SpatialBinMapping<BINS>& mapping)
+      {
+        for (size_t i=0; i<N; i++)
+        {
+          const PrimRef prim = prims[i];
+          /* the upper RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS bits of the geomID hold the split value */
+          unsigned splits = prim.geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+
+          /* a split value of 1: bin the whole reference by the center of its bounds, without splitting */
+          if (unlikely(splits == 1))
+          {
+            const vint4 bin = mapping.bin(center(prim.bounds()));
+            for (size_t dim=0; dim<3; dim++) 
+            {
+              assert(bin[dim] >= (int)0 && bin[dim] < (int)BINS);
+              numBegin[bin[dim]][dim]++;
+              numEnd  [bin[dim]][dim]++;
+              bounds  [bin[dim]][dim].extend(prim.bounds());
+            }
+          } 
+          else
+          {
+            /* bins of the lower and upper corner of the reference */
+            const vint4 bin0 = mapping.bin(prim.bounds().lower);
+            const vint4 bin1 = mapping.bin(prim.bounds().upper);
+            
+            for (size_t dim=0; dim<3; dim++) 
+            {
+              size_t bin;
+              PrimRef rest = prim;
+              /* l and r become the bins that receive the begin and end count */
+              size_t l = bin0[dim];
+              size_t r = bin1[dim];
+
+              // same bin optimization
+              if (likely(l == r)) 
+              {
+                numBegin[l][dim]++;
+                numEnd  [l][dim]++;
+                bounds  [l][dim].extend(prim.bounds());
+                continue;
+              }
+
+              /* cut the reference at every bin boundary it crosses, left to right */
+              for (bin=(size_t)bin0[dim]; bin<(size_t)bin1[dim]; bin++) 
+              {
+                /* right border of the current bin */
+                const float pos = mapping.pos(bin+1,dim);
+                
+                PrimRef left,right;
+                splitPrimitive(rest,(int)dim,pos,left,right);
+                /* an empty left piece moves the begin bin one to the right */
+                if (unlikely(left.bounds().empty())) l++;                
+                bounds[bin][dim].extend(left.bounds());
+                rest = right;
+              }
+              /* an empty remainder moves the end bin one to the left */
+              if (unlikely(rest.bounds().empty())) r--;
+              numBegin[l][dim]++;
+              numEnd  [r][dim]++;
+              /* after the loop bin equals bin1[dim], which receives the remainder */
+              bounds  [bin][dim].extend(rest.bounds());
+            }
+          }
+        }
+      }
+      
+      /*! bins the references prims[begin,end) by forwarding to the pointer/count overload */
+      template<typename SplitPrimitive>
+        void bin(const SplitPrimitive& splitPrimitive, const PrimRef* prims, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping) {
+        const PrimRef* const first = prims+begin;
+        const size_t count = end-begin;
+        bin(splitPrimitive,first,count,mapping);
+      }
+
+      /*! Bins the references source[begin,end), splitting the bounds of references that span
+       *  several bins.
+       *
+       *  \param splitterFactory callable returning, for a reference, a splitter that is invoked as
+       *                         splitter(bounds,dim,pos,left,right)
+       *  \param source          references to bin
+       *  \param begin           index of the first reference
+       *  \param end             index one past the last reference
+       *  \param mapping         mapping from positions to bins; invalid dimensions are skipped
+       */
+      template<typename PrimitiveSplitterFactory>
+        __forceinline void bin2(const PrimitiveSplitterFactory& splitterFactory, const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
+      {
+        for (size_t i=begin; i<end; i++)
+        {
+          const PrimRef &prim = source[i];
+          /* bins of the lower and upper corner of the reference */
+          const vint4 bin0 = mapping.bin(prim.bounds().lower);
+          const vint4 bin1 = mapping.bin(prim.bounds().upper);
+          
+          for (size_t dim=0; dim<3; dim++) 
+          {
+            if (unlikely(mapping.invalid(dim))) 
+              continue;
+            
+            size_t bin;
+            /* l and r become the bins that receive the begin and end count */
+            size_t l = bin0[dim];
+            size_t r = bin1[dim];
+            
+            // same bin optimization
+            if (likely(l == r)) 
+            {
+              add(dim,l,l,l,prim.bounds());
+              continue;
+            }
+            const size_t bin_start = bin0[dim];
+            const size_t bin_end   = bin1[dim];
+            BBox3fa rest = prim.bounds();
+            const auto splitter = splitterFactory(prim);
+            /* cut the bounds at every bin boundary they cross, left to right */
+            for (bin=bin_start; bin<bin_end; bin++) 
+            {
+              /* right border of the current bin */
+              const float pos = mapping.pos(bin+1,dim);
+              BBox3fa left,right;
+              splitter(rest,dim,pos,left,right);
+              /* an empty left piece moves the begin bin one to the right */
+              if (unlikely(left.empty())) l++;                
+              extend(dim,bin,left);
+              rest = right;
+            }
+            /* an empty remainder moves the end bin one to the left */
+            if (unlikely(rest.empty())) r--;
+            /* counts are added once; bin equals bin_end here and receives the remainder */
+            add(dim,l,r,bin,rest);
+          }
+        }              
+      }
+
+
+
+      /*! Bins the references source[begin,end) without splitting them.
+       *
+       *  A reference spanning the bins [l,r] contributes its full bounds to each of
+       *  these bins, while its count is added once to the begin counter of bin l and
+       *  once to the end counter of bin r. The count of a reference is taken from
+       *  prim.primID().
+       *
+       *  The previous implementation called add() once per covered bin, which added
+       *  the count (r-l) times to both counters and never extended the bounds of
+       *  bin r, although the reference is counted as ending there.
+       *
+       *  \param source  references to bin
+       *  \param begin   index of the first reference
+       *  \param end     index one past the last reference
+       *  \param mapping mapping from positions to bins; invalid dimensions are skipped
+       */
+      __forceinline void binSubTreeRefs(const PrimRef* source, size_t begin, size_t end, const SpatialBinMapping<BINS>& mapping)
+      {
+        for (size_t i=begin; i<end; i++)
+        {
+          const PrimRef &prim = source[i];
+          const vint4 bin0 = mapping.bin(prim.bounds().lower);
+          const vint4 bin1 = mapping.bin(prim.bounds().upper);
+
+          for (size_t dim=0; dim<3; dim++)
+          {
+            if (unlikely(mapping.invalid(dim)))
+              continue;
+
+            const size_t l = bin0[dim];
+            const size_t r = bin1[dim];
+
+            const unsigned int n  = prim.primID();
+
+            /* bins l..r-1 only receive the bounds */
+            for (size_t bin=l; bin<r; bin++)
+              extend(dim,bin,prim.bounds());
+
+            /* bin r receives the bounds and the counters are updated exactly once;
+             * for l == r this is the plain single bin case */
+            add(dim,l,r,r,prim.bounds(),n);
+          }
+        }
+      }
+      
+      /*! accumulates the counters and bounds of other into this binning information */
+      void merge (const SpatialBinInfo& other)
+      {
+        for (size_t binID=0; binID<BINS; binID++)
+        {
+          numBegin[binID] += other.numBegin[binID];
+          numEnd  [binID] += other.numEnd  [binID];
+
+          for (size_t dim=0; dim<3; dim++)
+            bounds[binID][dim].extend(other.bounds[binID][dim]);
+        }
+      }
+
+      /*! returns new binning information holding the summed counters and merged bounds of a and b */
+      static __forceinline const SpatialBinInfo reduce (const SpatialBinInfo& a, const SpatialBinInfo& b)
+      {
+        SpatialBinInfo combined(empty);
+        for (size_t binID=0; binID<BINS; binID++)
+        {
+          combined.numBegin[binID] += a.numBegin[binID]+b.numBegin[binID];
+          combined.numEnd  [binID] += a.numEnd  [binID]+b.numEnd  [binID];
+
+          for (size_t dim=0; dim<3; dim++)
+            combined.bounds[binID][dim] = embree::merge(a.bounds[binID][dim],b.bounds[binID][dim]);
+        }
+        return combined;
+      }
+      
+      /*! Finds the best split by scanning the binning information.
+       *
+       *  \param mapping      mapping stored in the returned split; dimensions it reports as
+       *                      invalid are ignored
+       *  \param blocks_shift counts are rounded up to multiples of (1 << blocks_shift) and
+       *                      shifted right by blocks_shift before entering the cost
+       *  \return the split with the lowest cost over all dimensions, or an invalid split
+       *          (dim == -1) if no dimension yields a candidate
+       */
+      SpatialBinSplit<BINS> best(const SpatialBinMapping<BINS>& mapping, const size_t blocks_shift) const 
+      {
+        /* sweep from right to left and compute parallel prefix of merged bounds */
+        vfloat4 rAreas[BINS];
+        vuint4 rCounts[BINS];
+        vuint4 count = 0; BBox3fa bx = empty; BBox3fa by = empty; BBox3fa bz = empty;
+        /* entry 0 of rAreas/rCounts is never written; the second sweep only reads entries 1..BINS-1 */
+        for (size_t i=BINS-1; i>0; i--)
+        {
+          /* the right side counts references by the bin they end in */
+          count += numEnd[i];
+          rCounts[i] = count;
+          bx.extend(bounds[i][0]); rAreas[i][0] = halfArea(bx);
+          by.extend(bounds[i][1]); rAreas[i][1] = halfArea(by);
+          bz.extend(bounds[i][2]); rAreas[i][2] = halfArea(bz);
+          rAreas[i][3] = 0.0f;
+        }
+        
+        /* sweep from left to right and compute SAH */
+        vuint4 blocks_add = (1 << blocks_shift)-1;
+        vuint4 ii = 1; vfloat4 vbestSAH = pos_inf; vuint4 vbestPos = 0; vuint4 vbestlCount = 0; vuint4 vbestrCount = 0;
+        count = 0; bx = empty; by = empty; bz = empty;
+        /* candidate i separates bins [0,i) from bins [i,BINS); ii carries i in every lane */
+        for (size_t i=1; i<BINS; i++, ii+=1)
+        {
+          /* the left side counts references by the bin they begin in */
+          count += numBegin[i-1];
+          bx.extend(bounds[i-1][0]); float Ax = halfArea(bx);
+          by.extend(bounds[i-1][1]); float Ay = halfArea(by);
+          bz.extend(bounds[i-1][2]); float Az = halfArea(bz);
+          const vfloat4 lArea = vfloat4(Ax,Ay,Az,Az);
+          const vfloat4 rArea = rAreas[i];
+          const vuint4 lCount = (count     +blocks_add) >> (unsigned int)(blocks_shift);
+          const vuint4 rCount = (rCounts[i]+blocks_add) >> (unsigned int)(blocks_shift);
+          /* cost of the candidate: left area * left blocks + right area * right blocks, per dimension */
+          const vfloat4 sah = madd(lArea,vfloat4(lCount),rArea*vfloat4(rCount));
+          // const vfloat4 sah = madd(lArea,vfloat4(vint4(lCount)),rArea*vfloat4(vint4(rCount)));
+          /* keep position, cost and unrounded counts wherever this candidate is strictly cheaper */
+          const vbool4 mask = sah < vbestSAH;
+          vbestPos      = select(mask,ii ,vbestPos);
+          vbestSAH      = select(mask,sah,vbestSAH);
+          vbestlCount   = select(mask,count,vbestlCount);
+          vbestrCount   = select(mask,rCounts[i],vbestrCount);
+        }
+        
+        /* find best dimension */
+        float bestSAH = inf;
+        int   bestDim = -1;
+        int   bestPos = 0;
+        unsigned int   bestlCount = 0;
+        unsigned int   bestrCount = 0;
+        for (int dim=0; dim<3; dim++) 
+        {
+          /* ignore zero sized dimensions */
+          if (unlikely(mapping.invalid(dim)))
+            continue;
+          
+          /* test if this is a better dimension */
+          /* a position of 0 means no candidate was selected for this dimension */
+          if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) {
+            bestDim = dim;
+            bestPos = vbestPos[dim];
+            bestSAH = vbestSAH[dim];
+            bestlCount = vbestlCount[dim];
+            bestrCount = vbestrCount[dim];
+          }
+        }
+        assert(bestSAH >= 0.0f);
+        
+        /* return invalid split if no split found */
+        if (bestDim == -1) 
+          return SpatialBinSplit<BINS>(inf,-1,0,mapping);
+        
+        /* return best found split */
+        return SpatialBinSplit<BINS>(bestSAH,bestDim,bestPos,bestlCount,bestrCount,1.0f,mapping);
+      }
+      
+    private:
+      BBox3fa bounds[BINS][3];  //!< geometry bounds for each bin in each dimension
+      vuint4    numBegin[BINS];   //!< number of primitives starting in bin
+      vuint4    numEnd[BINS];     //!< number of primitives ending in bin
+    };
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h
new file mode 100644
index 0000000000..911dcf950c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_spatial_array.h
@@ -0,0 +1,552 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "heuristic_binning.h"
+#include "heuristic_spatial.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+#if 0 // alternative tuning, compiled out
+#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.2f
+#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.95f
+#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.0f
+#else
+#define SPATIAL_ASPLIT_OVERLAP_THRESHOLD 0.1f // find(): overlap area of the object split children must reach this fraction of the set's area
+#define SPATIAL_ASPLIT_SAH_THRESHOLD 0.99f // find(): spatial split SAH must be below this fraction of the object split SAH
+#define SPATIAL_ASPLIT_AREA_THRESHOLD 0.000005f // find(): overlap area of the object split children must reach this fraction of the root's area
+#endif
+
+    /*! Bounds of a primitive set (CentGeomBBox3fa) combined with an extended index range. */
+    struct PrimInfoExtRange : public CentGeomBBox3fa, public extended_range<size_t>
+    {
+      /*! default constructor without explicit member initialization */
+      __forceinline PrimInfoExtRange() {}
+      /*! empty bounds together with the all-zero range */
+      __forceinline PrimInfoExtRange(EmptyTy)
+        : CentGeomBBox3fa(EmptyTy()), extended_range<size_t>(0,0,0) {}
+      /*! copies the given bounds and covers the range (begin,end,ext_end) */
+      __forceinline PrimInfoExtRange(size_t begin, size_t end, size_t ext_end, const CentGeomBBox3fa& centGeomBounds)
+        : CentGeomBBox3fa(centGeomBounds), extended_range<size_t>(begin,end,ext_end) {}
+      /*! leaf cost: approximate half area of the geometry bounds times the primitive count */
+      __forceinline float leafSAH() const { return expectedApproxHalfArea(geomBounds)*float(size()); }
+      /*! leaf cost with the primitive count rounded up to whole blocks of 2^block_shift primitives */
+      __forceinline float leafSAH(size_t block_shift) const {
+        const auto numBlocks = (size()+(size_t(1)<<block_shift)-1) >> block_shift;
+        return expectedApproxHalfArea(geomBounds)*float(numBlocks);
+      }
+    };
+
+    /*! Holds either an object split or a spatial split in one raw buffer, tagged by the spatial flag, plus the SAH cost. */
+    template<typename ObjectSplit, typename SpatialSplit>
+      struct Split2
+      {
+        /*! leaves payload, tag and cost uninitialized */
+        __forceinline Split2 () {}
+        /*! copies tag and cost and copy-constructs the payload selected by other.spatial */
+        __forceinline Split2 (const Split2& other)
+          : spatial(other.spatial), sah(other.sah)
+        {
+          /* no payload object lives in data yet, so construct it in place instead of assigning to it */
+          if (spatial) new (data) SpatialSplit(other.spatialSplit());
+          else         new (data) ObjectSplit (other.objectSplit());
+        }
+        /*! re-creates the payload from other; the stored type may change, so the payload is constructed anew (the old one is not destroyed, as everywhere in this struct) */
+        __forceinline Split2& operator= (const Split2& other)
+        {
+          if (this == &other) return *this; // constructing the payload from itself would be invalid
+          spatial = other.spatial;
+          sah = other.sah;
+          if (spatial) new (data) SpatialSplit(other.spatialSplit());
+          else         new (data) ObjectSplit (other.objectSplit());
+          return *this;
+        }
+
+        /*! payload accessors reinterpreting the raw buffer; the caller has to pick the one matching the spatial flag */
+        __forceinline       ObjectSplit&  objectSplit()        { return *(      ObjectSplit*)data; }
+        __forceinline const ObjectSplit&  objectSplit() const  { return *(const ObjectSplit*)data; }
+        __forceinline       SpatialSplit& spatialSplit()       { return *(      SpatialSplit*)data; }
+        __forceinline const SpatialSplit& spatialSplit() const { return *(const SpatialSplit*)data; }
+        /*! wraps an object split together with its SAH cost */
+        __forceinline Split2 (const ObjectSplit& objectSplit, float sah)
+          : spatial(false), sah(sah)
+        {
+          new (data) ObjectSplit(objectSplit);
+        }
+        /*! wraps a spatial split together with its SAH cost */
+        __forceinline Split2 (const SpatialSplit& spatialSplit, float sah)
+          : spatial(true), sah(sah)
+        {
+          new (data) SpatialSplit(spatialSplit);
+        }
+
+        /*! SAH cost stored for the split */
+        __forceinline float splitSAH() const { return sah; }
+        /*! a split counts as valid when its cost is below +inf */
+        __forceinline bool valid() const { return sah < float(inf); }
+
+      public:
+        __aligned(64) char data[sizeof(ObjectSplit) > sizeof(SpatialSplit) ? sizeof(ObjectSplit) : sizeof(SpatialSplit)];
+        bool spatial;
+        float sah;
+      };
+    
+    /*! Performs SAH binning on a primitive array, choosing between object splits and spatial splits */
+    template<typename PrimitiveSplitterFactory, typename PrimRef, size_t OBJECT_BINS, size_t SPATIAL_BINS>
+      struct HeuristicArraySpatialSAH
+      {
+        typedef BinSplit<OBJECT_BINS> ObjectSplit;
+        typedef BinInfoT<OBJECT_BINS,PrimRef,BBox3fa> ObjectBinner;
+
+        typedef SpatialBinSplit<SPATIAL_BINS> SpatialSplit;
+        typedef SpatialBinInfo<SPATIAL_BINS,PrimRef> SpatialBinner;
+
+        //typedef extended_range<size_t> Set;
+        typedef Split2<ObjectSplit,SpatialSplit> Split;
+        
+#if defined(__AVX512ER__) // KNL
+        static const size_t PARALLEL_THRESHOLD = 3*1024; // sets with at least this many primitives take the parallel find/split paths
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 768; // block size handed to parallel_reduce while binning
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; // block size handed to parallel_partitioning
+#else
+        static const size_t PARALLEL_THRESHOLD = 3*1024; // sets with at least this many primitives take the parallel find/split paths
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024; // block size handed to parallel_reduce while binning
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128; // block size handed to parallel_partitioning
+#endif
+
+        static const size_t MOVE_STEP_SIZE = 64; // block size of the parallel_for copy in moveExtentedRange
+        static const size_t CREATE_SPLITS_STEP_SIZE = 64; // block size of the parallel_for in create_spatial_splits
+
+        __forceinline HeuristicArraySpatialSAH () // NOTE(review): splitterFactory and root_info are reference members and are not initialized here - confirm this constructor is never instantiated
+          : prims0(nullptr) {}
+        
+        /*! remember prim array; splitterFactory and root_info are kept as references, so both have to outlive this object */
+        __forceinline HeuristicArraySpatialSAH (const PrimitiveSplitterFactory& splitterFactory, PrimRef* prims0, const CentGeomBBox3fa& root_info)
+          : prims0(prims0), splitterFactory(splitterFactory), root_info(root_info) {}
+
+
+        /*! distributes the spare slots of set among both children proportionally to lweight and rweight */
+        __noinline void setExtentedRanges(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset, const size_t lweight, const size_t rweight)
+        {
+          assert(set.ext_range_size() > 0);
+          const size_t numSpare = set.ext_range_size();
+          /* NOTE(review): the share is NaN if both weights are zero - callers presumably guarantee a non-zero sum, confirm */
+          const float leftShare = float(lweight) / float(lweight + rweight);
+          const size_t numSpareLeft = min(size_t(floorf(leftShare * float(numSpare))),numSpare);
+          lset.set_ext_range(lset.end() + numSpareLeft);
+          rset.set_ext_range(rset.end() + (numSpare - numSpareLeft));
+        }
+
+        /*! shifts primitives of the right child towards the end so that the left child's extended range fits in between */
+        __noinline void moveExtentedRange(const PrimInfoExtRange& set, const PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t numLeftSpare = lset.ext_range_size();
+          const size_t numRight     = rset.size();
+
+          /* the right range only has to move if the left child owns spare slots */
+          if (numLeftSpare == 0)
+            return;
+
+          size_t srcEnd, shift;
+          if (numLeftSpare < numRight)
+          {
+            /* gap smaller than the right range: relocating its first numLeftSpare primitives behind its end is enough */
+            srcEnd = rset.begin()+numLeftSpare;
+            shift  = numRight;
+          }
+          else
+          {
+            /* gap at least as large as the right range: shift the entire range by the gap */
+            srcEnd = rset.end();
+            shift  = numLeftSpare;
+          }
+          /* source and destination never overlap in either case, so the copy can run fully parallel */
+          parallel_for( rset.begin(), srcEnd, MOVE_STEP_SIZE, [&](const range<size_t>& r) {
+              for (size_t i=r.begin(); i<r.end(); i++)
+                prims0[i+shift] = prims0[i];
+            });
+          assert(rset.ext_end() + numLeftSpare == set.ext_end());
+          rset.move_right(numLeftSpare);
+        }
+
+        /*! finds the best split: the best object split, unless the set owns spare slots and a spatial split is clearly cheaper */
+        const Split find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          SplitInfo objectInfo;
+          const ObjectSplit objectBest = object_find(set,logBlockSize,objectInfo);
+          const float objectSAH = objectBest.splitSAH();
+
+          /* spatial splits are only considered for sets that own an extended range */
+          if (likely(!set.has_ext_range()))
+            return Split(objectBest,objectSAH);
+
+          /* do only spatial splits if the child bounds overlap enough, relative to the root and to this set */
+          const BBox3fa overlap = intersect(objectInfo.leftBounds, objectInfo.rightBounds);
+          const auto overlapArea = safeArea(overlap);
+          const bool largeOverlap = overlapArea >= SPATIAL_ASPLIT_AREA_THRESHOLD*safeArea(root_info.geomBounds) &&
+                                    overlapArea >= SPATIAL_ASPLIT_OVERLAP_THRESHOLD*safeArea(set.geomBounds);
+          if (!largeOverlap)
+            return Split(objectBest,objectSAH);
+
+          const SpatialSplit spatialBest = spatial_find(set, logBlockSize);
+          const float spatialSAH = spatialBest.splitSAH();
+
+          /* valid spatial split, better SAH and number of splits do not exceed extended range */
+          const bool spatialWins = spatialSAH < SPATIAL_ASPLIT_SAH_THRESHOLD*objectSAH &&
+                                   spatialBest.left + spatialBest.right - set.size() <= set.ext_range_size();
+
+          if (spatialWins) return Split(spatialBest,spatialSAH);
+          return Split(objectBest,objectSAH);
+        }
+
+        /*! finds the best object split, sequentially for sets below PARALLEL_THRESHOLD and in parallel otherwise */
+        __forceinline const ObjectSplit object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+        {
+          const bool runSequential = set.size() < PARALLEL_THRESHOLD;
+          return runSequential ? sequential_object_find(set,logBlockSize,info) : parallel_object_find(set,logBlockSize,info);
+        }
+
+        /*! bins all primitives of the set on the calling thread, returns the best object split and fills info for it */
+        __noinline const ObjectSplit sequential_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+        {
+          const BinMapping<OBJECT_BINS> binMapping(set);
+          ObjectBinner objectBins(empty);
+          objectBins.bin(prims0,set.begin(),set.end(),binMapping);
+          ObjectSplit bestSplit = objectBins.best(binMapping,logBlockSize);
+          objectBins.getSplitInfo(binMapping, bestSplit, info);
+          return bestSplit;
+        }
+
+        /*! parallel object split search: blocks of PARALLEL_FIND_BLOCK_SIZE primitives are binned independently and the per-block bins are merged */
+        __noinline const ObjectSplit parallel_object_find(const PrimInfoExtRange& set, const size_t logBlockSize, SplitInfo &info)
+        {
+          const BinMapping<OBJECT_BINS> mapping(set);
+          const BinMapping<OBJECT_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+          auto binBlock  = [&] (const range<size_t>& r) -> ObjectBinner { ObjectBinner local(empty); local.bin(prims0+r.begin(),r.size(),_mapping); return local; };
+          auto mergeBins = [&] (const ObjectBinner& b0, const ObjectBinner& b1) -> ObjectBinner { ObjectBinner merged = b0; merged.merge(b1,_mapping.size()); return merged; };
+          ObjectBinner allBins(empty);
+          allBins = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,allBins,binBlock,mergeBins);
+          ObjectSplit bestSplit = allBins.best(mapping,logBlockSize);
+          allBins.getSplitInfo(mapping, bestSplit, info);
+          return bestSplit;
+        }
+
+        /*! finds the best spatial split, sequentially for sets below PARALLEL_THRESHOLD and in parallel otherwise */
+        __forceinline const SpatialSplit spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          const bool runSequential = set.size() < PARALLEL_THRESHOLD;
+          return runSequential ? sequential_spatial_find(set, logBlockSize) : parallel_spatial_find(set, logBlockSize);
+        }
+
+        /*! spatially bins all primitives of the set on the calling thread and returns the best spatial split */
+        __noinline const SpatialSplit sequential_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          const SpatialBinMapping<SPATIAL_BINS> binMapping(set);
+          SpatialBinner spatialBins(empty);
+          spatialBins.bin2(splitterFactory,prims0,set.begin(),set.end(),binMapping);
+          /* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/
+          return spatialBins.best(binMapping,logBlockSize); //,set.ext_size());
+        }
+
+        /*! parallel spatial split search: blocks of PARALLEL_FIND_BLOCK_SIZE primitives are binned independently and reduced with SpatialBinner::reduce */
+        __noinline const SpatialSplit parallel_spatial_find(const PrimInfoExtRange& set, const size_t logBlockSize)
+        {
+          const SpatialBinMapping<SPATIAL_BINS> mapping(set);
+          const SpatialBinMapping<SPATIAL_BINS>& _mapping = mapping; // CLANG 3.4 parser bug workaround
+          auto binBlock = [&] (const range<size_t>& r) -> SpatialBinner {
+            SpatialBinner local(empty); local.bin2(splitterFactory,prims0,r.begin(),r.end(),_mapping); return local;
+          };
+          auto reduceBins = [&] (const SpatialBinner& b0, const SpatialBinner& b1) -> SpatialBinner { return SpatialBinner::reduce(b0,b1); };
+          SpatialBinner allBins(empty);
+          allBins = parallel_reduce(set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,allBins,binBlock,reduceBins);
+          /* todo: best spatial split not exceeding the extended range does not provide any benefit ?*/
+          return allBins.best(mapping,logBlockSize); //,set.ext_size());
+        }
+
+
+        /*! subdivides primitives based on a spatial split: every primitive of set straddling the split plane is cut in two, the left part replaces the original and the right part is appended behind set.end(); set is grown by the number of appended parts. NOTE(review): the mapping parameter is not referenced in the body, the plane comes from split.mapping. */
+        __noinline void create_spatial_splits(PrimInfoExtRange& set, const SpatialSplit& split, const SpatialBinMapping<SPATIAL_BINS> &mapping)
+        {
+          assert(set.has_ext_range());
+          const size_t max_ext_range_size = set.ext_range_size();
+          const size_t ext_range_start = set.end();
+
+          /* atomic counter for number of primref splits */
+          std::atomic<size_t> ext_elements;
+          ext_elements.store(0);
+          /* position of the split plane along split.dim, obtained from the split's own mapping */
+          const float fpos = split.mapping.pos(split.pos,split.dim);
+          /* keeps all bits of a 32 bit word except the top RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS ones */
+          const unsigned int mask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+
+          parallel_for( set.begin(), set.end(), CREATE_SPLITS_STEP_SIZE, [&](const range<size_t>& r) {
+              for (size_t i=r.begin();i<r.end();i++)
+              {
+                const unsigned int splits = prims0[i].geomID() >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS);
+                /* splits is the counter kept in the top reserved bits of the geomID; primitives with a counter <= 1 are left untouched */
+                if (likely(splits <= 1)) continue; /* todo: does this ever happen ? */
+
+                //int bin0 = split.mapping.bin(prims0[i].lower)[split.dim];
+                //int bin1 = split.mapping.bin(prims0[i].upper)[split.dim];
+                //if (unlikely(bin0 < split.pos && bin1 >= split.pos))
+                if (unlikely(prims0[i].lower[split.dim] < fpos && prims0[i].upper[split.dim] > fpos))
+                {
+                  assert(splits > 1);
+                  /* let the splitter created for this primitive cut it at fpos along split.dim */
+                  PrimRef left,right;
+                  const auto splitter = splitterFactory(prims0[i]);
+                  splitter(prims0[i],split.dim,fpos,left,right);
+                
+                  // no empty splits
+                  if (unlikely(left.bounds().empty() || right.bounds().empty())) continue;
+                  /* both halves store the counter decremented by one in the top bits of lower.u (presumably the word geomID() reads - confirm) */
+                  left.lower.u  = (left.lower.u  & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+                  right.lower.u = (right.lower.u & mask) | ((splits-1) << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+                  /* reserve one slot of the extended range; the counter is incremented even when the request is rejected below */
+                  const size_t ID = ext_elements.fetch_add(1);
+
+                  /* break if the number of subdivided elements are greater than the maximum allowed size */
+                  if (unlikely(ID >= max_ext_range_size)) 
+                    break;
+
+                  /* only write within the correct bounds */
+                  assert(ID < max_ext_range_size);
+                  prims0[i] = left;
+                  prims0[ext_range_start+ID] = right;     
+                }
+              }
+            });
+          /* rejected requests may have pushed the counter past the capacity, hence the clamp */
+          const size_t numExtElements = min(max_ext_range_size,ext_elements.load());          
+          assert(set.end()+numExtElements<=set.ext_end());
+          set._end += numExtElements;
+        }
+        
+        /*! partitions the primitives of set_i according to split into lset and rset and hands the spare slots on to both children */
+        void split(const Split& split, const PrimInfoExtRange& set_i, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          PrimInfoExtRange set = set_i;
+
+          /* an invalid split falls back to halving the deterministically ordered set */
+          if (unlikely(!split.valid())) {
+            deterministic_order(set);
+            splitFallback(set,lset,rset);
+            return;
+          }
+
+          std::pair<size_t,size_t> ext_weights(0,0);
+          if (unlikely(split.spatial))
+          {
+            /* spatial split: first cut the primitives straddling the plane, which grows the local set */
+            const SpatialSplit& spatial = split.spatialSplit();
+            create_spatial_splits(set,spatial,spatial.mapping);
+
+            /* the size is read after the cut, so it already includes the appended primitives */
+            ext_weights = likely(set.size() < PARALLEL_THRESHOLD)
+              ? sequential_spatial_split(spatial,set,lset,rset)
+              : parallel_spatial_split(spatial,set,lset,rset);
+          }
+          else
+          {
+            /* object split */
+            const ObjectSplit& object = split.objectSplit();
+            ext_weights = likely(set.size() < PARALLEL_THRESHOLD)
+              ? sequential_object_split(object,set,lset,rset)
+              : parallel_object_split(object,set,lset,rset);
+          }
+
+          /* if we have an extended range, set extended child ranges and move right split range */
+          if (unlikely(set.has_ext_range())) {
+            setExtentedRanges(set,lset,rset,ext_weights.first,ext_weights.second);
+            moveExtentedRange(set,lset,rset);
+          }
+        }
+
+        /*! array partitioning for an object split on the calling thread: reorders prims0[set.begin(),set.end()) so that the left primitives come first, writes both child sets to lset/rset and returns the (left,right) weights */
+        std::pair<size_t,size_t> sequential_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo local_left(empty);
+          PrimInfo local_right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim; 
+          /* SIMD forms of the split position and of the one-bit dimension mask, consumed by bin_unsafe (presumably true for primitives left of the split - confirm in BinMapping) */
+          const typename ObjectBinner::vint vSplitPos(splitPos);
+          const typename ObjectBinner::vbool vSplitMask(splitDimMask);
+          size_t center = serial_partitioning(prims0,
+                                              begin,end,local_left,local_right,
+                                              [&] (const PrimRef& ref) { 
+                                                return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask);
+                                              },
+                                              [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); });          
+          const size_t left_weight  = local_left.end; // PrimInfo::end presumably holds the sum of the weights given to add_center2 (the top bits of lower.u) - confirm
+          const size_t right_weight = local_right.end;
+          /* both children start without extended range (ext_end equals end); only the CentGeomBBox3fa part of the PrimInfo is taken over */
+          new (&lset) PrimInfoExtRange(begin,center,center,local_left);
+          new (&rset) PrimInfoExtRange(center,end,end,local_right);
+
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(left_weight,right_weight);
+        }
+
+
+        /*! array partitioning for a spatial split on the calling thread: reorders prims0[set.begin(),set.end()) so that the left primitives come first, writes both child sets to lset/rset and returns the (left,right) weights */
+        __noinline std::pair<size_t,size_t> sequential_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset) 
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo local_left(empty);
+          PrimInfo local_right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim; 
+
+          /* init spatial mapping */
+          const SpatialBinMapping<SPATIAL_BINS> &mapping = split.mapping;
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+          /* a primitive goes left when the bin of its bounds center lies below splitPos in the dimension selected by the mask */
+          size_t center = serial_partitioning(prims0,
+                                              begin,end,local_left,local_right,
+                                              [&] (const PrimRef& ref) {
+                                                const Vec3fa c = ref.bounds().center();
+                                                return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); 
+                                              },
+                                              [] (PrimInfo& pinfo,const PrimRef& ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); });
+
+          const size_t left_weight  = local_left.end; // PrimInfo::end presumably holds the sum of the weights given to add_center2 (the top bits of lower.u) - confirm
+          const size_t right_weight = local_right.end;
+          /* both children start without extended range (ext_end equals end); only the CentGeomBBox3fa part of the PrimInfo is taken over */
+          new (&lset) PrimInfoExtRange(begin,center,center,local_left);
+          new (&rset) PrimInfoExtRange(center,end,end,local_right);
+          assert(area(lset.geomBounds) >= 0.0f);
+          assert(area(rset.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(left_weight,right_weight);
+        }
+
+
+        
+        /*! array partitioning for an object split using parallel_partitioning: reorders prims0[set.begin(),set.end()) so that the left primitives come first, writes both child sets to lset/rset and returns the (left,right) weights */
+        __noinline std::pair<size_t,size_t> parallel_object_split(const ObjectSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo left(empty);
+          PrimInfo right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+          /* SIMD forms of the split position and of the one-bit dimension mask, consumed by bin_unsafe (presumably true for primitives left of the split - confirm in BinMapping) */
+          const typename ObjectBinner::vint vSplitPos(splitPos);
+          const typename ObjectBinner::vbool vSplitMask(splitDimMask);
+          auto isLeft = [&] (const PrimRef &ref) { return split.mapping.bin_unsafe(ref,vSplitPos,vSplitMask); };
+          /* per-primitive accumulation into a PrimInfo, merging of partial PrimInfos, work distributed in blocks of PARALLEL_PARTITION_BLOCK_SIZE */
+          const size_t center = parallel_partitioning(
+            prims0,begin,end,EmptyTy(),left,right,isLeft,
+            [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); },
+            [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); },
+            PARALLEL_PARTITION_BLOCK_SIZE);
+          /* the weights are saved first because left.end and right.end are overwritten right below */
+          const size_t left_weight  = left.end;
+          const size_t right_weight = right.end;
+          
+          left.begin  = begin;  left.end  = center; 
+          right.begin = center; right.end = end;
+          /* the children take over only the CentGeomBBox3fa part of left/right and start without extended range (ext_end equals end) */
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+
+          assert(area(left.geomBounds) >= 0.0f);
+          assert(area(right.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(left_weight,right_weight);
+        }
+
+        /*! array partitioning for a spatial split using parallel_partitioning: reorders prims0[set.begin(),set.end()) so that the left primitives come first, writes both child sets to lset/rset and returns the (left,right) weights */
+        __noinline std::pair<size_t,size_t> parallel_spatial_split(const SpatialSplit& split, const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t begin = set.begin();
+          const size_t end   = set.end();
+          PrimInfo left(empty);
+          PrimInfo right(empty);
+          const unsigned int splitPos = split.pos;
+          const unsigned int splitDim = split.dim;
+          const unsigned int splitDimMask = (unsigned int)1 << splitDim;
+
+          /* init spatial mapping */
+          const SpatialBinMapping<SPATIAL_BINS>& mapping = split.mapping;
+          const vint4 vSplitPos(splitPos);
+          const vbool4 vSplitMask( (int)splitDimMask );
+          /* a primitive goes left when the bin of its bounds center lies below splitPos in the dimension selected by the mask */
+          auto isLeft = [&] (const PrimRef &ref) { 
+            const Vec3fa c = ref.bounds().center();
+            return any(((vint4)mapping.bin(c) < vSplitPos) & vSplitMask); };
+          /* per-primitive accumulation into a PrimInfo, merging of partial PrimInfos, work distributed in blocks of PARALLEL_PARTITION_BLOCK_SIZE */
+          const size_t center = parallel_partitioning(
+            prims0,begin,end,EmptyTy(),left,right,isLeft,
+            [] (PrimInfo &pinfo,const PrimRef &ref) { pinfo.add_center2(ref,ref.lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)); },
+            [] (PrimInfo &pinfo0,const PrimInfo &pinfo1) { pinfo0.merge(pinfo1); },
+            PARALLEL_PARTITION_BLOCK_SIZE);
+          /* the weights are saved first because left.end and right.end are overwritten right below */
+          const size_t left_weight  = left.end;
+          const size_t right_weight = right.end;
+          
+          left.begin  = begin;  left.end  = center; 
+          right.begin = center; right.end = end;
+          /* the children take over only the CentGeomBBox3fa part of left/right and start without extended range (ext_end equals end) */
+          new (&lset) PrimInfoExtRange(begin,center,center,left);
+          new (&rset) PrimInfoExtRange(center,end,end,right);
+
+          assert(area(left.geomBounds) >= 0.0f);
+          assert(area(right.geomBounds) >= 0.0f);
+          return std::pair<size_t,size_t>(left_weight,right_weight);
+        }
+
+        /*! sorts the primitives of the set; required as parallel partition destroys original primitive order */
+        void deterministic_order(const PrimInfoExtRange& set) 
+        {
+          std::sort(prims0 + set.begin(), prims0 + set.end());
+        }
+
+        /*! fallback split: halves the set at its middle index without reordering primitives and hands the spare slots on to both children */
+        void splitFallback(const PrimInfoExtRange& set, PrimInfoExtRange& lset, PrimInfoExtRange& rset)
+        {
+          const size_t first  = set.begin();
+          const size_t last   = set.end();
+          const size_t middle = (first + last)/2;
+
+          /* accumulates bounds and weights (top bits of lower.u) of prims0[from,to) into info */
+          auto gather = [&] (PrimInfo& info, const size_t from, const size_t to) {
+            for (size_t i=from; i<to; i++)
+              info.add_center2(prims0[i],prims0[i].lower.u >> (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS));
+          };
+
+          PrimInfo left(empty);
+          gather(left,first,middle);
+          PrimInfo right(empty);
+          gather(right,middle,last);
+
+          /* both children start without extended range (ext_end equals end) */
+          new (&lset) PrimInfoExtRange(first,middle,middle,left);
+          new (&rset) PrimInfoExtRange(middle,last,last,right);
+
+          /* if we have an extended range, share it using the accumulated weights kept in left.end and right.end */
+          if (set.has_ext_range())
+          {
+            setExtentedRanges(set,lset,rset,left.end,right.end);
+            moveExtentedRange(set,lset,rset);
+          }
+        }
+        
+      private:
+        PrimRef* const prims0;
+        const PrimitiveSplitterFactory& splitterFactory;
+        const CentGeomBBox3fa& root_info;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h
new file mode 100644
index 0000000000..ede0d04c78
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_strand_array.h
@@ -0,0 +1,188 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "priminfo.h"
+#include "../../common/algorithms/parallel_reduce.h"
+#include "../../common/algorithms/parallel_partition.h"
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Partitions primitives into two strands according to the direction each primitive is best aligned with */
+    struct HeuristicStrandSplit
+    {
+      typedef range<size_t> Set;
+  
+      static const size_t PARALLEL_THRESHOLD = 10000;
+      static const size_t PARALLEL_FIND_BLOCK_SIZE = 4096;
+      static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 64;
+
+      /*! result of the strand heuristic: SAH cost plus the two axes the strands are aligned to */
+      struct Split
+      {
+        /*! construct an invalid split by default (infinite cost, zero axes) */
+        __forceinline Split()
+          : sah(inf), axis0(zero), axis1(zero) {}
+
+        /*! constructs a split from its cost and both strand axes */
+        __forceinline Split(const float sah, const Vec3fa& axis0, const Vec3fa& axis1)
+          : sah(sah), axis0(axis0), axis1(axis1) {}
+
+        /*! SAH cost stored for the split */
+        __forceinline float splitSAH() const { return sah; }
+
+        /*! a split is valid unless its cost equals +inf */
+        __forceinline bool valid() const { return !(sah == float(inf)); }
+
+      public:
+        float sah;             //!< SAH cost of the split
+        Vec3fa axis0, axis1;   //!< axis the two strands are aligned into
+      };
+
+      __forceinline HeuristicStrandSplit () // FIXME: required? (leaves scene and prims null)
+        : scene(nullptr), prims(nullptr) {}
+      
+      /*! remember prim array and the scene that is queried for the direction and bounds of each primitive */
+      __forceinline HeuristicStrandSplit (Scene* scene, PrimRef* prims)
+        : scene(scene), prims(prims) {}
+      
+      /*! direction of the primitive as computed by the geometry it belongs to */
+      __forceinline const Vec3fa direction(const PrimRef& prim)
+      { auto geom = scene->get(prim.geomID()); return geom->computeDirection(prim.primID()); }
+      
+      /*! bounds of the primitive as reported by the geometry it belongs to */
+      __forceinline const BBox3fa bounds(const PrimRef& prim)
+      { auto geom = scene->get(prim.geomID()); return geom->vbounds(prim.primID()); }
+
+      /*! bounds of the primitive as reported by its geometry for the given space */
+      __forceinline const BBox3fa bounds(const LinearSpace3fa& space, const PrimRef& prim)
+      { auto geom = scene->get(prim.geomID()); return geom->vbounds(space,prim.primID()); }
+
+      /*! finds the best split: picks two representative axes, assigns each primitive to the axis it is better aligned with and returns the block based SAH of that partition; the split is invalid (sah == inf) if one side stays empty */
+      const Split find(const range<size_t>& set, size_t logBlockSize)
+      {
+        Vec3fa axis0(0,0,1);
+        uint64_t bestGeomPrimID = -1;
+        /* -1 converts to the largest uint64_t, so the first primitive with a usable direction is accepted; (0,0,1) stays if there is none */
+        /* curve with minimum ID determines first axis */
+        for (size_t i=set.begin(); i<set.end(); i++)
+        {
+          const uint64_t geomprimID = prims[i].ID64();
+          if (geomprimID >= bestGeomPrimID) continue;
+          const Vec3fa axis = direction(prims[i]);
+          if (sqr_length(axis) > 1E-18f) {
+            axis0 = normalize(axis);
+            bestGeomPrimID = geomprimID;
+          }
+        }
+      
+        /* find 2nd axis that is most misaligned with first axis and has minimum ID */
+        float bestCos = 1.0f;
+        Vec3fa axis1 = axis0;
+        bestGeomPrimID = -1;
+        for (size_t i=set.begin(); i<set.end(); i++) 
+        {
+          const uint64_t geomprimID = prims[i].ID64();
+          Vec3fa axisi = direction(prims[i]);
+          float leni = length(axisi);
+          if (leni == 0.0f) continue;
+          axisi /= leni;
+          float cos = abs(dot(axisi,axis0));
+          if ((cos == bestCos && (geomprimID < bestGeomPrimID)) || cos < bestCos) {
+            bestCos = cos; axis1 = axisi;
+            bestGeomPrimID = geomprimID;
+          }
+        }
+      
+        /* partition the two strands */
+        size_t lnum = 0, rnum = 0;
+        BBox3fa lbounds = empty, rbounds = empty;
+        const LinearSpace3fa space0 = frame(axis0).transposed();
+        const LinearSpace3fa space1 = frame(axis1).transposed();
+        /* NOTE(review): directions are normalized here without the length checks used above; a zero direction presumably fails cos0 > cos1 and lands on the right side - confirm */
+        for (size_t i=set.begin(); i<set.end(); i++)
+        {
+          PrimRef& prim = prims[i];
+          const Vec3fa axisi = normalize(direction(prim));
+          const float cos0 = abs(dot(axisi,axis0));
+          const float cos1 = abs(dot(axisi,axis1));
+          
+          if (cos0 > cos1) { lnum++; lbounds.extend(bounds(space0,prim)); }
+          else             { rnum++; rbounds.extend(bounds(space1,prim)); }
+        }
+      
+        /*! return an invalid split if we do not partition */
+        if (lnum == 0 || rnum == 0) 
+          return Split(inf,axis0,axis1);
+      
+        /*! calculate sah for the split: primitive counts are rounded up to blocks of 2^logBlockSize and weighted with the half area of each side's bounds */
+        const size_t lblocks = (lnum+(1ull<<logBlockSize)-1ull) >> logBlockSize;
+        const size_t rblocks = (rnum+(1ull<<logBlockSize)-1ull) >> logBlockSize;
+        const float sah = madd(float(lblocks),halfArea(lbounds),float(rblocks)*halfArea(rbounds));
+        return Split(sah,axis0,axis1);
+      }
+
+      /*! partitions the primitives of set into lset (better aligned with split.axis0) and rset (all others) */
+      void split(const Split& split, const PrimInfoRange& set, PrimInfoRange& lset, PrimInfoRange& rset) 
+      {
+        /* an invalid split falls back to halving the deterministically ordered set */
+        if (!split.valid()) {
+          deterministic_order(set);
+          splitFallback(set,lset,rset);
+          return;
+        }
+
+        /* a primitive goes left when its direction is strictly closer to axis0 than to axis1 */
+        auto alignedWithAxis0 = [&] (const PrimRef& prim) -> bool {
+          const Vec3fa dir = normalize(direction(prim));
+          return abs(dot(dir,split.axis0)) > abs(dot(dir,split.axis1));
+        };
+
+        /* grows the bounds info of a side by the bounds of one primitive */
+        auto growBounds = [this] (CentGeomBBox3fa& pinfo,const PrimRef& ref) { pinfo.extend(bounds(ref)); };
+
+        const size_t first = set.begin();
+        const size_t last  = set.end();
+        CentGeomBBox3fa leftInfo(empty);
+        CentGeomBBox3fa rightInfo(empty);
+        const size_t middle = serial_partitioning(prims,first,last,leftInfo,rightInfo,alignedWithAxis0,growBounds);
+
+        new (&lset) PrimInfoRange(first,middle,leftInfo);
+        new (&rset) PrimInfoRange(middle,last,rightInfo);
+
+        assert(area(lset.geomBounds) >= 0.0f);
+        assert(area(rset.geomBounds) >= 0.0f);
+      }
+
+      /*! sorts the primitives of the set; required as parallel partition destroys original primitive order */
+      void deterministic_order(const Set& set) 
+      {
+        std::sort(prims + set.begin(), prims + set.end());
+      }
+      
+      /*! fallback split: halves the set at its middle index without reordering and recomputes the bounds of both halves */
+      void splitFallback(const Set& set, PrimInfoRange& lset, PrimInfoRange& rset)
+      {
+        const size_t first  = set.begin();
+        const size_t last   = set.end();
+        const size_t middle = (first + last)/2;
+        auto gather = [this] (CentGeomBBox3fa& info, const size_t from, const size_t to) {
+          for (size_t i=from; i<to; i++) info.extend(bounds(prims[i]));
+        };
+        CentGeomBBox3fa left(empty);
+        gather(left,first,middle);
+        new (&lset) PrimInfoRange(first,middle,left);
+        CentGeomBBox3fa right(empty);
+        gather(right,middle,last);
+        new (&rset) PrimInfoRange(middle,last,right);
+      }
+      
+    private:
+      Scene* const scene;
+      PrimRef* const prims;
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h b/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h
new file mode 100644
index 0000000000..c999941a11
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/heuristic_timesplit_array.h
@@ -0,0 +1,237 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/primref_mb.h"
+#include "../../common/algorithms/parallel_filter.h"
+
+#define MBLUR_TIME_SPLIT_THRESHOLD 1.25f
+
+namespace embree
+{
+  namespace isa
+  { 
+    /*! Temporal split heuristic for motion blur. Bins the primitives of a set
+     *  at BINS-1 candidate split times inside the set's time range, selects
+     *  the candidate with the lowest cost, and splits the set into one set
+     *  for the earlier and one set for the later part of the time range. */
+    template<typename PrimRefMB, typename RecalculatePrimRef, size_t BINS>
+      struct HeuristicMBlurTemporalSplit
+      {
+        typedef BinSplit<MBLUR_NUM_OBJECT_BINS> Split;
+        typedef mvector<PrimRefMB>* PrimRefVector;
+        typedef typename PrimRefMB::BBox BBox; 
+
+        /* find() bins serially while the set has fewer than PARALLEL_THRESHOLD primitives */
+        static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+        /* block size handed to parallel_reduce when binning in find() */
+        static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+        /* block size handed to parallel_reduce when recalculating primrefs in split() */
+        static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+        /*! Stores the device (used when allocating the new primref vector in
+         *  split()) and a copy of the primref recalculation functor. */
+        HeuristicMBlurTemporalSplit (MemoryMonitorInterface* device, const RecalculatePrimRef& recalculatePrimRef)
+          : device(device), recalculatePrimRef(recalculatePrimRef) {}
+
+        /*! Per-candidate binning data: for each of the BINS-1 candidate split
+         *  times, a count and bounds for the time range before (0) and
+         *  after (1) the candidate. */
+        struct TemporalBinInfo
+        {
+          /*! leaves counts and bounds uninitialized */
+          __forceinline TemporalBinInfo () {
+          }
+
+          /*! zeroes all counts and sets all bounds to empty */
+          __forceinline TemporalBinInfo (EmptyTy)
+          {
+            for (size_t i=0; i<BINS-1; i++)
+            {
+              count0[i] = count1[i] = 0;
+              bounds0[i] = bounds1[i] = empty;
+            }
+          }
+          
+          /*! Bins prims[begin..end) for every candidate split time.
+           *  Candidate b lies at fraction (b+1)/BINS of time_range and is
+           *  snapped with set.align_time(). */
+          void bin(const PrimRefMB* prims, size_t begin, size_t end, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef)
+          {
+            for (int b=0; b<BINS-1; b++)
+            {
+              const float t = float(b+1)/float(BINS);
+              const float ct = lerp(time_range.lower,time_range.upper,t);
+              const float center_time = set.align_time(ct);
+              /* candidates that snap onto (or beyond) an end of the time range are skipped; their bin stays untouched */
+              if (center_time <= time_range.lower) continue;
+              if (center_time >= time_range.upper) continue;
+              const BBox1f dt0(time_range.lower,center_time);
+              const BBox1f dt1(center_time,time_range.upper);
+              
+              /* find linear bounds for both time segments */
+              for (size_t i=begin; i<end; i++) 
+              {
+                /* a primitive can contribute to both sides if its time range overlaps both */
+                if (prims[i].time_range_overlap(dt0))
+                {
+                  const LBBox3fa bn0 = recalculatePrimRef.linearBounds(prims[i],dt0);
+#if MBLUR_BIN_LBBOX
+                  bounds0[b].extend(bn0);
+#else
+                  bounds0[b].extend(bn0.interpolate(0.5f));
+#endif
+                  count0[b] += prims[i].timeSegmentRange(dt0).size();
+                }
+
+                if (prims[i].time_range_overlap(dt1))
+                {
+                  const LBBox3fa bn1 = recalculatePrimRef.linearBounds(prims[i],dt1);
+#if MBLUR_BIN_LBBOX
+                  bounds1[b].extend(bn1);
+#else
+                  bounds1[b].extend(bn1.interpolate(0.5f));
+#endif
+                  count1[b] += prims[i].timeSegmentRange(dt1).size();
+                }
+              }
+            }
+          }
+
+          /*! Bins serially when the range holds fewer than parallelThreshold
+           *  primitives; otherwise bins blocks via parallel_reduce and
+           *  overwrites *this with the merged result. */
+          __forceinline void bin_parallel(const PrimRefMB* prims, size_t begin, size_t end, size_t blockSize, size_t parallelThreshold, BBox1f time_range, const SetMB& set, const RecalculatePrimRef& recalculatePrimRef) 
+          {
+            if (likely(end-begin < parallelThreshold)) {
+              bin(prims,begin,end,time_range,set,recalculatePrimRef);
+            } 
+            else 
+            {
+              /* note: this local lambda shadows the member function bin() inside this branch */
+              auto bin = [&](const range<size_t>& r) -> TemporalBinInfo { 
+                TemporalBinInfo binner(empty); binner.bin(prims, r.begin(), r.end(), time_range, set, recalculatePrimRef); return binner; 
+              };
+              *this = parallel_reduce(begin,end,blockSize,TemporalBinInfo(empty),bin,merge2);
+            }
+          }
+          
+          /*! merges in other binning information */
+          __forceinline void merge (const TemporalBinInfo& other)
+          {
+            for (size_t i=0; i<BINS-1; i++) 
+            {
+              count0[i] += other.count0[i];
+              count1[i] += other.count1[i];
+              bounds0[i].extend(other.bounds0[i]);
+              bounds1[i].extend(other.bounds1[i]);
+            }
+          }
+
+          /*! returns the merge of a and b without modifying either (reduction operator for parallel_reduce) */
+          static __forceinline const TemporalBinInfo merge2(const TemporalBinInfo& a, const TemporalBinInfo& b) {
+            TemporalBinInfo r = a; r.merge(b); return r;
+          }
+                    
+          /*! Evaluates the cost of every candidate split time and returns a
+           *  temporal Split for the cheapest one. If no candidate is usable,
+           *  bestSAH stays inf and bestPos stays 0. */
+          Split best(int logBlockSize, BBox1f time_range, const SetMB& set)
+          {
+            float bestSAH = inf;
+            float bestPos = 0.0f;
+            for (int b=0; b<BINS-1; b++)
+            {
+              /* recompute the same candidate time that bin() used for this bin */
+              float t = float(b+1)/float(BINS);
+              float ct = lerp(time_range.lower,time_range.upper,t);
+              const float center_time = set.align_time(ct);
+              if (center_time <= time_range.lower) continue;
+              if (center_time >= time_range.upper) continue;
+              const BBox1f dt0(time_range.lower,center_time);
+              const BBox1f dt1(center_time,time_range.upper);
+              
+              /* calculate sah */
+              /* counts are rounded up to whole blocks of 2^logBlockSize */
+              const size_t lCount = (count0[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize);
+              const size_t rCount = (count1[b]+(size_t(1) << logBlockSize)-1) >> int(logBlockSize);
+              float sah0 = expectedApproxHalfArea(bounds0[b])*float(lCount)*dt0.size();
+              float sah1 = expectedApproxHalfArea(bounds1[b])*float(rCount)*dt1.size();
+              if (unlikely(lCount == 0)) sah0 = 0.0f; // happens for initial splits when objects not alive over entire shutter time
+              if (unlikely(rCount == 0)) sah1 = 0.0f;
+              const float sah = sah0+sah1;
+              if (sah < bestSAH) {
+                bestSAH = sah;
+                bestPos = center_time;
+              }
+            }
+            /* the cost is scaled by MBLUR_TIME_SPLIT_THRESHOLD; presumably this biases against temporal splits when the caller compares split kinds -- TODO confirm */
+            return Split(bestSAH*MBLUR_TIME_SPLIT_THRESHOLD,(unsigned)Split::SPLIT_TEMPORAL,0,bestPos);
+          }
+          
+        public:
+          size_t count0[BINS-1];   // per candidate: accumulated timeSegmentRange(dt0).size() of overlapping primitives
+          size_t count1[BINS-1];   // per candidate: accumulated timeSegmentRange(dt1).size() of overlapping primitives
+          BBox bounds0[BINS-1];    // per candidate: merged bounds for the time range before the candidate
+          BBox bounds1[BINS-1];    // per candidate: merged bounds for the time range after the candidate
+        };
+        
+        /*! finds the best split */
+        /* Bins the set (in parallel for large sets) and returns the best
+         * temporal split; if that split is not valid, its data field is set
+         * to Split::SPLIT_FALLBACK. */
+        const Split find(const SetMB& set, const size_t logBlockSize)
+        {
+          assert(set.size() > 0);
+          TemporalBinInfo binner(empty);
+          binner.bin_parallel(set.prims->data(),set.begin(),set.end(),PARALLEL_FIND_BLOCK_SIZE,PARALLEL_THRESHOLD,set.time_range,set,recalculatePrimRef);
+          Split tsplit = binner.best((int)logBlockSize,set.time_range,set);
+          if (!tsplit.valid()) tsplit.data = Split::SPLIT_FALLBACK; // use fallback split
+          return tsplit;
+        }
+
+        /*! Splits the set at time tsplit.fpos. lset receives recalculated
+         *  primrefs for the earlier time range, stored in a newly allocated
+         *  vector; rset reuses the set's own vector, whose entries are
+         *  overwritten in place for the later time range. The returned
+         *  unique_ptr owns the new vector that lset.prims points to, so the
+         *  caller has to keep it alive for as long as lset is used. */
+        __forceinline std::unique_ptr<mvector<PrimRefMB>> split(const Split& tsplit, const SetMB& set, SetMB& lset, SetMB& rset)
+        {
+          assert(tsplit.sah != float(inf));
+          assert(tsplit.fpos > set.time_range.lower);
+          assert(tsplit.fpos < set.time_range.upper);
+
+          float center_time = tsplit.fpos;
+          const BBox1f time_range0(set.time_range.lower,center_time);
+          const BBox1f time_range1(center_time,set.time_range.upper);
+          mvector<PrimRefMB>& prims = *set.prims;
+          
+          /* calculate primrefs for first time range */
+          std::unique_ptr<mvector<PrimRefMB>> new_vector(new mvector<PrimRefMB>(device, set.size()));
+          PrimRefVector lprims = new_vector.get();
+          
+          auto reduction_func0 = [&] (const range<size_t>& r) {
+            PrimInfoMB pinfo = empty;
+            for (size_t i=r.begin(); i<r.end(); i++) 
+            {
+              if (likely(prims[i].time_range_overlap(time_range0)))
+              {
+                const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range0);
+                (*lprims)[i-set.begin()] = prim;
+                pinfo.add_primref(prim);
+              }
+              else
+              {
+                /* copied unchanged and not counted; the filter below removes it */
+                (*lprims)[i-set.begin()] = prims[i];
+              }
+            }
+            return pinfo;
+          };        
+          PrimInfoMB linfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func0,PrimInfoMB::merge2);
+
+          /* primrefs for first time range are in lprims[0 .. set.size()) */
+          /* some primitives may need to be filtered out */
+          /* NOTE(review): parallel_filter is expected to compact the kept elements to the front and return the new end index -- confirm */
+          if (linfo.size() != set.size())
+            linfo.object_range._end = parallel_filter(lprims->data(), size_t(0), set.size(), size_t(1024),
+                                                      [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range0); });
+                      
+          lset = SetMB(linfo,lprims,time_range0);
+
+          /* calculate primrefs for second time range */
+          auto reduction_func1 = [&] (const range<size_t>& r) {
+            PrimInfoMB pinfo = empty;
+            for (size_t i=r.begin(); i<r.end(); i++) 
+            {
+              if (likely(prims[i].time_range_overlap(time_range1)))
+              {
+                const PrimRefMB& prim = recalculatePrimRef(prims[i],time_range1);
+                prims[i] = prim;   // overwrites the source vector in place
+                pinfo.add_primref(prim);
+              }
+            }
+            return pinfo;
+          };        
+          PrimInfoMB rinfo = parallel_reduce(set.object_range,PARALLEL_PARTITION_BLOCK_SIZE,PARALLEL_THRESHOLD,PrimInfoMB(empty),reduction_func1,PrimInfoMB::merge2);
+          /* the reduction only counted primitives (range starts at 0); rebase the range onto set.begin() */
+          rinfo.object_range = range<size_t>(set.begin(), set.begin() + rinfo.size());
+
+          /* primrefs for second time range are in prims[set.begin() .. set.end()) */
+          /* some primitives may need to be filtered out */
+          if (rinfo.size() != set.size())
+            rinfo.object_range._end = parallel_filter(prims.data(), set.begin(), set.end(), size_t(1024),
+                                                      [&](const PrimRefMB& prim) { return prim.time_range_overlap(time_range1); });
+        
+          rset = SetMB(rinfo,&prims,time_range1);
+
+          return new_vector;
+        }
+
+      private:
+        MemoryMonitorInterface* device;              // device to report memory usage to
+        const RecalculatePrimRef recalculatePrimRef; // functor recalculating a primref (and its linear bounds) for a time range
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/priminfo.h b/thirdparty/embree-aarch64/kernels/builders/priminfo.h
new file mode 100644
index 0000000000..06c1388742
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/priminfo.h
@@ -0,0 +1,362 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+
+namespace embree
+{
+  // FIXME: maybe there's a better place for this util fct
+  /*! Returns |n.x| + |n.y| + |n.z| for n = cross(v1-v0, v2-v0), i.e. the sum
+   *  of the absolute components of the cross product of the two triangle
+   *  edges that start at v0. */
+  __forceinline float areaProjectedTriangle(const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2)
+  {
+    const Vec3fa n = cross(v1-v0,v2-v0);
+    /* auto keeps whatever type fabs() yields, so the summation is unchanged */
+    const auto ax = fabs(n.x);
+    const auto ay = fabs(n.y);
+    const auto az = fabs(n.z);
+    return ax + ay + az;
+  }
+
+  //namespace isa
+  //{
+    /*! Pair of bounds describing a set of primitives: the bounds of the
+     *  geometry (type BBox) and the bounds of the primitive centers. */
+    template<typename BBox>
+      class CentGeom
+    {
+    public:
+      /*! default construction, performs no explicit initialization */
+      __forceinline CentGeom () {}
+
+      /*! both bounds start out empty */
+      __forceinline CentGeom (EmptyTy) 
+        : geomBounds(empty), centBounds(empty) {}
+      
+      /*! takes over the given geometry and centroid bounds */
+      __forceinline CentGeom (const BBox& geomBounds, const BBox3fa& centBounds) 
+        : geomBounds(geomBounds), centBounds(centBounds) {}
+      
+      /*! grows the bounds by the binning bounds and center reported by the primitive reference */
+      template<typename PrimRef> 
+        __forceinline void extend_primref(const PrimRef& prim) 
+      {
+        BBox primBounds; Vec3fa primCenter;
+        prim.binBoundsAndCenter(primBounds,primCenter);
+        geomBounds.extend(primBounds);
+        centBounds.extend(primCenter);
+      }
+
+      /*! grows the bounds by prim.bounds() and by the center2() of those bounds */
+      template<typename PrimRef> 
+        __forceinline void extend_center2(const PrimRef& prim) 
+      {
+        BBox3fa box = prim.bounds();
+        geomBounds.extend(box);
+        centBounds.extend(box.center2());
+      }
+
+      /*! grows the bounds by a box and by center2() of that box */
+      __forceinline void extend(const BBox& geomBounds_) {
+        geomBounds.extend(geomBounds_);
+        centBounds.extend(center2(geomBounds_));
+      }
+
+      /*! grows the bounds by the bounds of another CentGeom */
+      __forceinline void merge(const CentGeom& other) 
+      {
+        geomBounds.extend(other.geomBounds);
+        centBounds.extend(other.centBounds);
+      }
+
+      /*! returns the merge of a and b, leaving both inputs untouched */
+      static __forceinline const CentGeom merge2(const CentGeom& a, const CentGeom& b) {
+        CentGeom merged(a);
+        merged.merge(b);
+        return merged;
+      }
+
+    public:
+      BBox geomBounds;      //!< geometry bounds of primitives
+      BBox3fa centBounds;   //!< centroid bounds of primitives
+    };
+
+    typedef CentGeom<BBox3fa> CentGeomBBox3fa;
+
+    /*! Stores an index range [begin,end) together with the geometry and
+     *  centroid bounds of a set of primitives. */
+    template<typename BBox>
+      class PrimInfoT : public CentGeom<BBox>
+    {
+    public:
+      using CentGeom<BBox>::geomBounds;
+      using CentGeom<BBox>::centBounds;
+
+      /*! default construction, begin and end are not initialized */
+      __forceinline PrimInfoT () {}
+
+      /*! empty bounds and the empty range [0,0) */
+      __forceinline PrimInfoT (EmptyTy) 
+        : CentGeom<BBox>(empty), begin(0), end(0) {}
+
+      /*! range [begin,end) with the given geometry/centroid bounds */
+      __forceinline PrimInfoT (size_t begin, size_t end, const CentGeomBBox3fa& centGeomBounds) 
+        : CentGeom<BBox>(centGeomBounds), begin(begin), end(end) {}
+
+      /*! extends the bounds by the primitive's binning bounds/center and grows the range by one */
+      template<typename PrimRef> 
+        __forceinline void add_primref(const PrimRef& prim) 
+      {
+        CentGeom<BBox>::extend_primref(prim);
+        end++;
+      }
+
+      /*! extends the bounds via extend_center2 and grows the range by one */
+      template<typename PrimRef> 
+        __forceinline void add_center2(const PrimRef& prim) {
+        CentGeom<BBox>::extend_center2(prim);
+        end++;
+      }
+
+      /*! extends the bounds via extend_center2 and grows the range by i */
+      template<typename PrimRef> 
+        __forceinline void add_center2(const PrimRef& prim, const size_t i) {
+        CentGeom<BBox>::extend_center2(prim);
+        end+=i;
+      }
+
+      /*__forceinline void add(const BBox& geomBounds_) {
+        CentGeom<BBox>::extend(geomBounds_);
+        end++;
+      }
+
+      __forceinline void add(const BBox& geomBounds_, const size_t i) {
+        CentGeom<BBox>::extend(geomBounds_);
+        end+=i;
+        }*/
+
+      /*! Merges the bounds and ADDS the other range's begin and end to this
+       *  one. For infos that were built up from PrimInfoT(empty), where
+       *  begin stays 0, this sums the primitive counts. */
+      __forceinline void merge(const PrimInfoT& other) 
+      {
+        CentGeom<BBox>::merge(other);
+        begin += other.begin;
+        end += other.end;
+      }
+
+      /*! returns the merge of a and b without modifying either */
+      static __forceinline const PrimInfoT merge(const PrimInfoT& a, const PrimInfoT& b) {
+        PrimInfoT r = a; r.merge(b); return r;
+      }
+      
+      /*! returns the number of primitives */
+      __forceinline size_t size() const { 
+        return end-begin; 
+      }
+
+      /*! Returns expectedApproxHalfArea of the geometry bounds. Declared
+       *  const (like leafSAH) so it can be called on const objects. */
+      __forceinline float halfArea() const {
+        return expectedApproxHalfArea(geomBounds);
+      }
+
+      /*! expectedApproxHalfArea of the geometry bounds times the number of primitives */
+      __forceinline float leafSAH() const { 
+        return expectedApproxHalfArea(geomBounds)*float(size()); 
+        //return halfArea(geomBounds)*blocks(num); 
+      }
+      
+      /*! like leafSAH(), but with the primitive count rounded up to whole blocks of 2^block_shift */
+      __forceinline float leafSAH(size_t block_shift) const { 
+        return expectedApproxHalfArea(geomBounds)*float((size()+(size_t(1)<<block_shift)-1) >> block_shift);
+        //return halfArea(geomBounds)*float((num+3) >> 2);
+        //return halfArea(geomBounds)*blocks(num); 
+      }
+      
+      /*! stream output */
+      friend embree_ostream operator<<(embree_ostream cout, const PrimInfoT& pinfo) {
+        return cout << "PrimInfo { begin = " << pinfo.begin << ", end = " << pinfo.end << ", geomBounds = " << pinfo.geomBounds << ", centBounds = " << pinfo.centBounds << "}";
+      }
+      
+    public:
+      size_t begin,end;          //!< index range [begin,end) of the primitives; size() == end-begin
+    };
+
+    typedef PrimInfoT<BBox3fa> PrimInfo;
+    //typedef PrimInfoT<LBBox3fa> PrimInfoMB;
+
+    /*! Stores bounding information for a set of motion blur primitives:
+     *  geometry/centroid bounds, the object index range, time segment
+     *  counts and the covered time range. */
+    template<typename BBox>
+      class PrimInfoMBT : public CentGeom<BBox>
+    {
+    public:
+      using CentGeom<BBox>::geomBounds;
+      using CentGeom<BBox>::centBounds;
+
+      /*! default construction, members are not explicitly initialized */
+      __forceinline PrimInfoMBT () {
+      } 
+
+      /*! Empty info: empty bounds, range [0,0), no time segments. time_range
+       *  starts as the inverted interval (1,0), presumably acting as the
+       *  empty interval for extend() -- TODO confirm. */
+      __forceinline PrimInfoMBT (EmptyTy)
+        : CentGeom<BBox>(empty), object_range(0,0), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {}
+
+      /*! like the empty constructor, but with the object range preset to [begin,end) */
+      __forceinline PrimInfoMBT (size_t begin, size_t end)
+        : CentGeom<BBox>(empty), object_range(begin,end), num_time_segments(0), max_num_time_segments(0), max_time_range(0.0f,1.0f), time_range(1.0f,0.0f) {}
+
+      /*! Adds one primitive reference: extends bounds and time range, grows
+       *  the object range by one, accumulates prim.size() time segments and
+       *  tracks the primitive with the most total time segments. */
+      template<typename PrimRef> 
+        __forceinline void add_primref(const PrimRef& prim) 
+      {
+        CentGeom<BBox>::extend_primref(prim);
+        time_range.extend(prim.time_range);
+        object_range._end++;
+        num_time_segments += prim.size();
+        if (max_num_time_segments < prim.totalTimeSegments()) {
+          max_num_time_segments = prim.totalTimeSegments();
+          max_time_range = prim.time_range;
+        }
+      }
+
+      /*! Merges another info into this one. The other object range's begin
+       *  and end are ADDED to this one's, which sums the counts when both
+       *  ranges start at 0 (as they do when built up from an empty info). */
+      __forceinline void merge(const PrimInfoMBT& other)
+      {
+        CentGeom<BBox>::merge(other);
+        time_range.extend(other.time_range);
+        object_range._begin += other.object_range.begin();
+        object_range._end += other.object_range.end();
+        num_time_segments += other.num_time_segments;
+        if (max_num_time_segments < other.max_num_time_segments) {
+          max_num_time_segments = other.max_num_time_segments;
+          max_time_range = other.max_time_range;
+        }
+      }
+
+      /*! returns the merge of a and b without modifying either */
+      static __forceinline const PrimInfoMBT merge2(const PrimInfoMBT& a, const PrimInfoMBT& b) {
+        PrimInfoMBT r = a; r.merge(b); return r;
+      }
+
+      /*! first index of the object range */
+      __forceinline size_t begin() const {
+        return object_range.begin();
+      }
+
+      /*! one-past-last index of the object range */
+      __forceinline size_t end() const {
+        return object_range.end();
+      }
+      
+      /*! returns the number of primitives */
+      __forceinline size_t size() const { 
+	return object_range.size(); 
+      }
+
+      /*! expectedApproxHalfArea of the geometry bounds weighted by the length of the time range */
+      __forceinline float halfArea() const {
+        return time_range.size()*expectedApproxHalfArea(geomBounds);
+      }
+
+      /*! halfArea() times the total number of time segments */
+      __forceinline float leafSAH() const { 
+	return time_range.size()*expectedApproxHalfArea(geomBounds)*float(num_time_segments); 
+      }
+      
+      /*! like leafSAH(), but with the time segment count rounded up to whole blocks of 2^block_shift */
+      __forceinline float leafSAH(size_t block_shift) const { 
+	return time_range.size()*expectedApproxHalfArea(geomBounds)*float((num_time_segments+(size_t(1)<<block_shift)-1) >> block_shift);
+      }
+
+      /*! Snaps the time ct onto the time segment grid of the primitive with
+       *  the most time segments: ct is normalized into max_time_range,
+       *  rounded to the nearest multiple of 1/max_num_time_segments and
+       *  mapped back. Divides by max_num_time_segments and by
+       *  max_time_range.size(), so both are expected to be non-zero. */
+      __forceinline float align_time(float ct) const
+      {
+        //return roundf(ct * float(numTimeSegments)) / float(numTimeSegments);
+        float t0 = (ct-max_time_range.lower)/max_time_range.size();
+        float t1 = roundf(t0 * float(max_num_time_segments)) / float(max_num_time_segments);
+        return t1*max_time_range.size()+max_time_range.lower;
+      }
+      
+      /*! stream output */
+      friend embree_ostream operator<<(embree_ostream cout, const PrimInfoMBT& pinfo) 
+      {
+	return cout << "PrimInfo { " << 
+          "object_range = " << pinfo.object_range << 
+          ", time_range = " << pinfo.time_range << 
+          ", time_segments = " << pinfo.num_time_segments << 
+          ", geomBounds = " << pinfo.geomBounds << 
+          ", centBounds = " << pinfo.centBounds << 
+          "}";
+      }
+      
+    public:
+      range<size_t> object_range; //!< primitive range
+      size_t num_time_segments;  //!< total number of time segments of all added primrefs
+      size_t max_num_time_segments; //!< maximum number of time segments of a primitive
+      BBox1f max_time_range; //!< time range of primitive with max_num_time_segments
+      BBox1f time_range; //!< merged time range of primitives when merging prims, or additionally clipped with build time range when used in SetMB
+    };
+
+    typedef PrimInfoMBT<typename PrimRefMB::BBox> PrimInfoMB;
+
+    /*! A PrimInfoMB combined with a pointer to the primref vector that its
+     *  object range indexes into. The vector is referenced, not owned. */
+    struct SetMB : public PrimInfoMB
+    {
+      /* thresholds and block sizes passed to parallel_reduce by the member functions below */
+      static const size_t PARALLEL_THRESHOLD = 3 * 1024;
+      static const size_t PARALLEL_FIND_BLOCK_SIZE = 1024;
+      static const size_t PARALLEL_PARTITION_BLOCK_SIZE = 128;
+
+      typedef mvector<PrimRefMB>* PrimRefVector;
+
+      /*! default construction, prims is not initialized */
+      __forceinline SetMB() {}
+
+       /*! takes the info as is and references the given primref vector */
+       __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims)
+         : PrimInfoMB(pinfo_i), prims(prims) {}
+
+      /*! copies the info, then replaces its object range and clips its time range against time_range_in */
+      __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, range<size_t> object_range_in, BBox1f time_range_in)
+        : PrimInfoMB(pinfo_i), prims(prims)
+      {
+        object_range = object_range_in;
+        time_range = intersect(time_range,time_range_in);
+      }
+      
+      /*! copies the info and clips its time range against time_range_in */
+      __forceinline SetMB(const PrimInfoMB& pinfo_i, PrimRefVector prims, BBox1f time_range_in)
+        : PrimInfoMB(pinfo_i), prims(prims)
+      {
+        time_range = intersect(time_range,time_range_in);
+      }
+
+      /*! sorts the primrefs of the object range in place; const only with respect to the set itself, the referenced vector is modified */
+      void deterministic_order() const 
+      {
+        /* required as parallel partition destroys original primitive order */
+        PrimRefMB* prim = prims->data();
+        std::sort(&prim[object_range.begin()],&prim[object_range.end()]);
+      }
+
+      /*! returns the merge of recalculatePrimRef.linearBounds(ref,time_range) over all primrefs of the object range */
+      template<typename RecalculatePrimRef>
+      __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef) const
+      {
+        /* per-block reduction: merged linear bounds of prims[r.begin() .. r.end()) */
+        auto reduce = [&](const range<size_t>& r) -> LBBox3fa
+        {
+          LBBox3fa cbounds(empty);
+          for (size_t j = r.begin(); j < r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range);
+            cbounds.extend(bn);
+          };
+          return cbounds;
+        };
+        
+        return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
+                               reduce,
+                               [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
+      }
+
+      /*! same as linearBounds(recalculatePrimRef), but the linear bounds are requested for the given space */
+      template<typename RecalculatePrimRef>
+        __forceinline LBBox3fa linearBounds(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const
+      {
+        auto reduce = [&](const range<size_t>& r) -> LBBox3fa
+        {
+          LBBox3fa cbounds(empty);
+          for (size_t j = r.begin(); j < r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            const LBBox3fa bn = recalculatePrimRef.linearBounds(ref, time_range, space);
+            cbounds.extend(bn);
+          };
+          return cbounds;
+        };
+        
+        return parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, LBBox3fa(empty),
+                               reduce,
+                               [&](const LBBox3fa& b0, const LBBox3fa& b1) -> LBBox3fa { return embree::merge(b0, b1); });
+      }
+
+      /*! Recomputes the primitive info from primrefs recalculated for this
+       *  set's time range in the given space. The stored primrefs are not
+       *  modified. The result references the same vector and object range;
+       *  its time range is the recomputed one clipped against this set's. */
+      template<typename RecalculatePrimRef>
+        const SetMB primInfo(const RecalculatePrimRef& recalculatePrimRef, const LinearSpace3fa& space) const
+      {
+        auto computePrimInfo = [&](const range<size_t>& r) -> PrimInfoMB
+        {
+          PrimInfoMB pinfo(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            PrimRefMB& ref = (*prims)[j];
+            PrimRefMB ref1 = recalculatePrimRef(ref,time_range,space);
+            pinfo.add_primref(ref1);
+          };
+          return pinfo;
+        };
+        
+        const PrimInfoMB pinfo = parallel_reduce(object_range.begin(), object_range.end(), PARALLEL_FIND_BLOCK_SIZE, PARALLEL_THRESHOLD, 
+                                                 PrimInfoMB(empty), computePrimInfo, PrimInfoMB::merge2);
+
+        return SetMB(pinfo,prims,object_range,time_range);
+      }
+      
+    public:
+      PrimRefVector prims;   // non-owning pointer to the primref vector indexed by object_range
+    };
+//}
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp b/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp
new file mode 100644
index 0000000000..e23de3df28
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen.cpp
@@ -0,0 +1,244 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "primrefgen.h"
+#include "primrefgen_presplit.h"
+
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Fills prims with the primitive references of a single geometry and
+     *  returns the merged PrimInfo. A first pass writes every block at its
+     *  own start index; if fewer references than prims.size() were produced,
+     *  a second pass writes every block at base.size() instead. */
+    PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      ParallelPrefixSumState<PrimInfo> pstate;
+
+      /* reduction shared by both passes */
+      auto mergeInfo = [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); };
+
+      /* first try: destination index is the start of the block */
+      auto fillAtBlockStart = [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+        return geometry->createPrimRefArray(prims,r,r.begin(),geomID);
+      };
+      progressMonitor(0);
+      PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), fillAtBlockStart, mergeInfo);
+
+      /* nothing was filtered out, the first pass is final */
+      if (pinfo.size() == prims.size())
+        return pinfo;
+
+      /* run again: destination index is the size of the prefix info (base) */
+      auto fillAtPrefix = [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo {
+        return geometry->createPrimRefArray(prims,r,base.size(),geomID);
+      };
+      progressMonitor(0);
+      pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), size_t(1024), PrimInfo(empty), fillAtPrefix, mergeInfo);
+      return pinfo;
+    }
+
+    /*! Fills prims with the primitive references of all scene geometries
+     *  selected by types/mblur and returns the merged PrimInfo. A first pass
+     *  writes every block at the index k supplied by the iteration; if fewer
+     *  references than prims.size() were produced, a second pass writes
+     *  every block at base.size() instead. */
+    PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,mblur);
+
+      /* reduction shared by both passes */
+      auto mergeInfo = [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); };
+
+      /* first try */
+      auto fillAtK = [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+        return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID);
+      };
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), fillAtK, mergeInfo);
+
+      /* nothing was filtered out, the first pass is final */
+      if (pinfo.size() == prims.size())
+        return pinfo;
+
+      /* run again, placing each block at the size of its prefix info (base) */
+      auto fillAtPrefix = [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+        return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID);
+      };
+      progressMonitor(0);
+      pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), fillAtPrefix, mergeInfo);
+      return pinfo;
+    }
+
+    /*! Fills prims with the primitive references of all motion blur
+     *  geometries selected by types, created via createPrimRefArrayMB for
+     *  time step itime, and returns the merged PrimInfo. Uses the same two
+     *  pass scheme as createPrimRefArray: the second pass only runs if the
+     *  first one produced fewer references than prims.size(). */
+    PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime)
+    {
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,true);
+
+      /* reduction shared by both passes */
+      auto mergeInfo = [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); };
+
+      /* first try */
+      auto fillAtK = [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+        return mesh->createPrimRefArrayMB(prims,itime,r,k,(unsigned)geomID);
+      };
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), fillAtK, mergeInfo);
+
+      /* nothing was filtered out, the first pass is final */
+      if (pinfo.size() == prims.size())
+        return pinfo;
+
+      /* run again, placing each block at the size of its prefix info (base) */
+      auto fillAtPrefix = [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+        return mesh->createPrimRefArrayMB(prims,itime,r,base.size(),(unsigned)geomID);
+      };
+      progressMonitor(0);
+      pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), fillAtPrefix, mergeInfo);
+      return pinfo;
+    }
+
+    /*! Fills prims with PrimRefMB references of all motion blur geometries
+     *  selected by types, created via createPrimRefMBArray for the time
+     *  range t0t1, and returns the merged PrimInfoMB. A second pass runs
+     *  only if the first one produced fewer references than prims.size().
+     *  The time range of the result is always set to t0t1. */
+    PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1)
+    {
+      ParallelForForPrefixSumState<PrimInfoMB> pstate;
+      Scene::Iterator2 iter(scene,types,true);
+
+      /* reduction shared by both passes */
+      auto mergeInfo = [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); };
+
+      /* first try */
+      auto fillAtK = [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfoMB {
+        return mesh->createPrimRefMBArray(prims,t0t1,r,k,(unsigned)geomID);
+      };
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfoMB pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), fillAtK, mergeInfo);
+
+      /* if we need to filter out geometry, run again */
+      const bool needsSecondPass = pinfo.size() != prims.size();
+      if (needsSecondPass)
+      {
+        auto fillAtPrefix = [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB {
+          return mesh->createPrimRefMBArray(prims,t0t1,r,base.size(),(unsigned)geomID);
+        };
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), fillAtPrefix, mergeInfo);
+      }
+
+      /* the BVH starts with that time range, even though primitives might have smaller/larger time range */
+      pinfo.time_range = t0t1;
+      return pinfo;
+    }
+
+    /*! Generates morton codes for the primitives of a mesh into the morton
+     *  array and returns the number of primitives for which
+     *  mesh->buildBounds() succeeded. The codes are generated relative to
+     *  the bounds of the center2() points of all valid primitives.
+     *  progressMonitor is not used by this function. */
+    template<typename Mesh>
+    size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor)
+    {
+      /* the number of primitives is taken from the size of the output array */
+      size_t numPrimitives = morton.size();
+
+      /* compute scene bounds */
+      /* reduction value: (number of valid primitives, bounds of their center2 points) */
+      std::pair<size_t,BBox3fa> cb_empty(0,empty);
+      auto cb = parallel_reduce 
+        ( size_t(0), numPrimitives, size_t(1024), cb_empty, [&](const range<size_t>& r) -> std::pair<size_t,BBox3fa>
+          {
+            size_t num = 0;
+            BBox3fa bounds = empty;
+            
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              BBox3fa prim_bounds = empty;
+              /* primitives for which buildBounds fails are skipped and not counted */
+              if (unlikely(!mesh->buildBounds(j,&prim_bounds))) continue;
+              bounds.extend(center2(prim_bounds));
+              num++;
+            }
+            return std::make_pair(num,bounds);
+          }, [] (const std::pair<size_t,BBox3fa>& a, const std::pair<size_t,BBox3fa>& b) {
+          return std::make_pair(a.first + b.first,merge(a.second,b.second)); 
+        });
+      
+      
+      size_t numPrimitivesGen = cb.first;
+      const BBox3fa centBounds = cb.second;
+      
+      /* compute morton codes */
+      if (likely(numPrimitivesGen == numPrimitives))
+      {
+        /* fast path if all primitives were valid */
+        BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+        parallel_for( size_t(0), numPrimitives, size_t(1024), [&](const range<size_t>& r) -> void {
+            /* each block writes its codes starting at its own start index */
+            BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]);
+            for (size_t j=r.begin(); j<r.end(); j++)
+              generator(mesh->bounds(j),unsigned(j));
+          });
+      }
+      else
+      {
+        /* slow path, fallback in case some primitives were invalid */
+        ParallelPrefixSumState<size_t> pstate;
+        BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+        /* first pass: every block generates at its own start index and returns its number of valid primitives */
+        parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t {
+            size_t num = 0;
+            BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[r.begin()]);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              BBox3fa bounds = empty;
+              if (unlikely(!mesh->buildBounds(j,&bounds))) continue;
+              generator(bounds,unsigned(j));
+              num++;
+            }
+            return num;
+          }, std::plus<size_t>());
+        
+        /* second pass: every block regenerates its codes starting at index base, */
+        /* presumably the number of valid primitives in all preceding blocks, so */
+        /* that the valid codes end up contiguous -- TODO confirm */
+        parallel_prefix_sum( pstate, size_t(0), numPrimitives, size_t(1024), size_t(0), [&](const range<size_t>& r, const size_t base) -> size_t {
+            size_t num = 0;
+            BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton.data()[base]);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              BBox3fa bounds = empty;
+              if (!mesh->buildBounds(j,&bounds)) continue;
+              generator(bounds,unsigned(j));
+              num++;
+            }
+            return num;
+          }, std::plus<size_t>());          
+      }
+      return numPrimitivesGen;
+    }
+
+    // ====================================================================================================
+    // ====================================================================================================
+    // ====================================================================================================
+
+    // template specialization for grid meshes (currently compiled out by the #if 0 below)
+
+#if 0
+    template<>
+    PrimInfo createPrimRefArray<GridMesh,false>(Scene* scene, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      PING;
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator<GridMesh,false> iter(scene);
+      
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k) -> PrimInfo
+      {
+        PrimInfo pinfo(empty);
+        for (size_t j=r.begin(); j<r.end(); j++)
+        {
+          BBox3fa bounds = empty;
+          if (!mesh->buildBounds(j,&bounds)) continue;
+          const PrimRef prim(bounds,mesh->geomID,unsigned(j));
+          pinfo.add_center2(prim);
+          prims[k++] = prim;
+        }
+        return pinfo;
+      }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != prims.size())
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, const PrimInfo& base) -> PrimInfo
+        {
+          k = base.size();
+          PrimInfo pinfo(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            BBox3fa bounds = empty;
+            if (!mesh->buildBounds(j,&bounds)) continue;
+            const PrimRef prim(bounds,mesh->geomID,unsigned(j));
+            pinfo.add_center2(prim);
+            prims[k++] = prim;
+          }
+          return pinfo;
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+      return pinfo;
+    }
+#endif
+
+    // ====================================================================================================
+    // ====================================================================================================
+    // ====================================================================================================
+
+    /* Explicit instantiations of createMortonCodeArray, one per mesh type, each
+     * wrapped in the IF_ENABLED_* macro of its geometry type. COMMA is used in
+     * place of ',' presumably so the parameter list survives being passed as a
+     * single macro argument -- TODO confirm against the macro definitions. */
+    IF_ENABLED_TRIS (template size_t createMortonCodeArray<TriangleMesh>(TriangleMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+    IF_ENABLED_QUADS(template size_t createMortonCodeArray<QuadMesh>(QuadMesh* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+    IF_ENABLED_USER (template size_t createMortonCodeArray<UserGeometry>(UserGeometry* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+    IF_ENABLED_INSTANCE (template size_t createMortonCodeArray<Instance>(Instance* mesh COMMA mvector<BVHBuilderMorton::BuildPrim>& morton COMMA BuildProgressMonitor& progressMonitor));
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen.h
new file mode 100644
index 0000000000..9919c945c3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen.h
@@ -0,0 +1,28 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+#include "priminfo.h"
+#include "bvh_builder_morton.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    PrimInfo createPrimRefArray(Geometry* geometry, unsigned int geomID, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor);
+   
+    PrimInfo createPrimRefArray(Scene* scene, Geometry::GTypeMask types, bool mblur, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor);
+   
+    PrimInfo createPrimRefArrayMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime = 0);
+
+    PrimInfoMB createPrimRefArrayMSMBlur(Scene* scene, Geometry::GTypeMask types, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f));
+
+    template<typename Mesh>
+      size_t createMortonCodeArray(Mesh* mesh, mvector<BVHBuilderMorton::BuildPrim>& morton, BuildProgressMonitor& progressMonitor);
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h b/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h
new file mode 100644
index 0000000000..8bdb38b955
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/primrefgen_presplit.h
@@ -0,0 +1,371 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../builders/primrefgen.h"
+#include "../builders/heuristic_spatial.h"
+#include "../builders/splitter.h"
+
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+#define DBG_PRESPLIT(x)   
+#define CHECK_PRESPLIT(x) 
+
+#define GRID_SIZE 1024
+#define MAX_PRESPLITS_PER_PRIMITIVE_LOG 5
+#define MAX_PRESPLITS_PER_PRIMITIVE (1<<MAX_PRESPLITS_PER_PRIMITIVE_LOG)
+#define PRIORITY_CUTOFF_THRESHOLD 1.0f
+#define PRIORITY_SPLIT_POS_WEIGHT 1.5f
+
+namespace embree
+{  
+  namespace isa
+  {
+
+    /* Work item of the pre-split pass: 'index' is a slot in the PrimRef array; the 32-bit payload
+     * is accessed either as the float 'priority' or as the raw integer 'data'. */
+    struct PresplitItem
+    {
+      union {
+        float priority;
+        unsigned int data;
+      };
+      unsigned int index;
+      /* bit pattern of 'priority' reinterpreted as an unsigned integer */
+      __forceinline operator unsigned() const {
+        return reinterpret_cast<const unsigned&>(priority);
+      }
+      /* items compare by priority */
+      __forceinline bool operator < (const PresplitItem& item) const {
+        return priority < item.priority;
+      }
+
+      /* Priority = ((AABB area - projected primitive area) * PRIORITY_SPLIT_POS_WEIGHT^b)^(1/4),
+       * with b = 31 - lzcnt(mc.x ^ mc.y), i.e. derived from the XOR of the two morton codes. */
+      template<typename Mesh>
+      __forceinline static float compute_priority(const PrimRef &ref, Scene *scene, const Vec2i &mc)
+      {
+        Mesh* const mesh = (Mesh*)scene->get(ref.geomID());
+        const float boxArea  = area(ref.bounds());
+        const float primArea = mesh->projectedPrimitiveArea(ref.primID());
+        assert(primArea <= boxArea);
+        const unsigned int b = 31 - lzcnt(mc.x^mc.y);
+        const float weighted = (boxArea - primArea) * powf(PRIORITY_SPLIT_POS_WEIGHT,(float)b);
+        const float result = sqrtf(sqrtf(weighted));
+        assert(result >= 0.0f && result < FLT_LARGE);
+        return result;
+      }
+    };
+
+    inline std::ostream& operator<<(std::ostream& os, const PresplitItem& item) { // debug output
+      return os << "index " << item.index << " priority " << item.priority;
+    }
+
+    /* Recursively splits 'prim' at grid planes, at most 'split_level' levels deep, and appends the resulting pieces to subPrims[numSubPrims...]. */
+    template<typename SplitterFactory>
+      void splitPrimitive(SplitterFactory &Splitter,
+                          const PrimRef &prim,
+                          const unsigned int geomID,
+                          const unsigned int primID,
+                          const unsigned int split_level,
+                          const Vec3fa &grid_base,
+                          const float grid_scale,
+                          const float grid_extend,
+                          PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE],
+                          unsigned int& numSubPrims)
+    {
+      assert(split_level <= MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+      if (split_level == 0) // no levels left: emit the primitive unchanged
+      {
+        assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE);
+        subPrims[numSubPrims++] = prim;
+      }
+      else
+      {
+        const Vec3fa lower = prim.lower;
+        const Vec3fa upper = prim.upper;
+        const Vec3fa glower = (lower-grid_base)*Vec3fa(grid_scale)+Vec3fa(0.2f); // grid-space box, pulled inwards by 0.2 cells on each side
+        const Vec3fa gupper = (upper-grid_base)*Vec3fa(grid_scale)-Vec3fa(0.2f);
+        Vec3ia ilower(floor(glower));
+        Vec3ia iupper(floor(gupper));
+
+        /* this ignores dimensions that are empty */
+        iupper = (Vec3ia)(select(vint4(glower) >= vint4(gupper),vint4(ilower),vint4(iupper)));
+
+        /* compute a morton code for the lower and upper grid coordinates. */
+        const unsigned int lower_code = bitInterleave(ilower.x,ilower.y,ilower.z);
+        const unsigned int upper_code = bitInterleave(iupper.x,iupper.y,iupper.z);
+
+        /* if all bits are equal then we cannot split */
+        if(unlikely(lower_code == upper_code))
+        {
+          assert(numSubPrims < MAX_PRESPLITS_PER_PRIMITIVE);
+          subPrims[numSubPrims++] = prim;
+          return;
+        }
+
+        /* compute octree level and dimension to perform the split in */
+        const unsigned int diff = 31 - lzcnt(lower_code^upper_code); // presumably the highest bit in which both codes differ -- TODO confirm lzcnt semantics
+        const unsigned int level = diff / 3; // NOTE(review): assumes bitInterleave packs one bit per dimension, i.e. three code bits per level
+        const unsigned int dim   = diff % 3;
+
+        /* now we compute the grid position of the split */
+        const unsigned int isplit = iupper[dim] & ~((1<<level)-1); // upper cell coordinate with its lowest 'level' bits cleared
+
+        /* compute world space position of split */
+        const float inv_grid_size = 1.0f / GRID_SIZE;
+        const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend;
+
+        assert(prim.lower[dim] <= fsplit &&
+               prim.upper[dim] >= fsplit);
+
+        /* split primitive */
+        const auto splitter = Splitter(prim);
+        BBox3fa left,right;
+        splitter(prim.bounds(),dim,fsplit,left,right);
+        assert(!left.empty());
+        assert(!right.empty());
+        /* recurse into both halves with one level less; both keep the original geomID/primID */
+        splitPrimitive(Splitter,PrimRef(left ,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+        splitPrimitive(Splitter,PrimRef(right,geomID,primID),geomID,primID,split_level-1,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+      }
+    }
+    
+    
+    /* Fills 'prims' with the primitive references of a single geometry and returns their PrimInfo. */
+    template<typename Mesh, typename SplitterFactory>
+      PrimInfo createPrimRefArray_presplit(Geometry* geometry, unsigned int geomID, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      ParallelPrefixSumState<PrimInfo> pstate;
+      const size_t blockSize = 1024;
+      const auto mergeInfo = [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); };
+
+      /* first try: every range writes its references starting at its own begin index */
+      progressMonitor(0);
+      PrimInfo pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), blockSize, PrimInfo(empty),
+        [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { return geometry->createPrimRefArray(prims,r,r.begin(),geomID); }, mergeInfo);
+
+      /* if we need to filter out geometry (count differs from numPrimRefs), run again writing at the prefix-summed offset */
+      if (pinfo.size() != numPrimRefs) {
+        progressMonitor(0);
+        pinfo = parallel_prefix_sum( pstate, size_t(0), geometry->size(), blockSize, PrimInfo(empty),
+          [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo { return geometry->createPrimRefArray(prims,r,base.size(),geomID); }, mergeInfo);
+      }
+      return pinfo;
+    }
+    
+    /* Maps the bounds of 'ref' onto the split grid and returns the morton codes of the lower (x)
+     * and upper (y) grid cell; callers treat equal codes as "cannot split". */
+    __forceinline Vec2i computeMC(const Vec3fa &grid_base, const float grid_scale, const PrimRef &ref)
+    {
+      const Vec3fa boxMin = ref.lower;
+      const Vec3fa boxMax = ref.upper;
+      const Vec3fa scale(grid_scale), shrink(0.2f);
+      /* grid-space corners, pulled inwards by 0.2 cells on each side */
+      const Vec3fa gridMin = (boxMin-grid_base)*scale+shrink;
+      const Vec3fa gridMax = (boxMax-grid_base)*scale-shrink;
+      const Vec3ia cellMin(floor(gridMin));
+      /* this ignores dimensions that are empty */
+      const Vec3ia cellMax = (Vec3ia)select(vint4(gridMin) >= vint4(gridMax),vint4(cellMin),vint4(Vec3ia(floor(gridMax))));
+      const unsigned int lowerCode = bitInterleave(cellMin.x,cellMin.y,cellMin.z);
+      const unsigned int upperCode = bitInterleave(cellMax.x,cellMax.y,cellMax.z);
+      return Vec2i(lowerCode,upperCode);
+    }
+
+    /* Creates the PrimRef array for all geometries selected by 'types'/'mblur', then uses the unused tail of 'prims'
+     * (prims.size() minus the references created) as a budget for pre-splitting primitives. Returns the final PrimInfo. */
+    template<typename Mesh, typename SplitterFactory>
+      PrimInfo createPrimRefArray_presplit(Scene* scene, Geometry::GTypeMask types, bool mblur, size_t numPrimRefs, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor)
+    {
+      static const size_t MIN_STEP_SIZE = 128;
+      ParallelForForPrefixSumState<PrimInfo> pstate;
+      Scene::Iterator2 iter(scene,types,mblur);
+
+      /* first try */
+      progressMonitor(0);
+      pstate.init(iter,size_t(1024));
+      PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+          return mesh->createPrimRefArray(prims,r,k,(unsigned)geomID);
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+      /* if we need to filter out geometry, run again */
+      if (pinfo.size() != numPrimRefs)
+      {
+        progressMonitor(0);
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](Geometry* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+            return mesh->createPrimRefArray(prims,r,base.size(),(unsigned)geomID);
+          }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      }
+
+      /* use correct number of primitives */
+      size_t numPrimitives = pinfo.size();
+      const size_t alloc_numPrimitives = prims.size();
+      const size_t numSplitPrimitivesBudget = alloc_numPrimitives - numPrimitives; // free slots behind the generated references
+
+      /* set up primitive splitter */
+      SplitterFactory Splitter(scene);
+
+      DBG_PRESPLIT(
+        const size_t org_numPrimitives = pinfo.size();
+        PRINT(numPrimitives);
+        PRINT(alloc_numPrimitives);
+        PRINT(numSplitPrimitivesBudget);
+        );
+
+      /* allocate double buffer presplit items */
+      const size_t presplit_allocation_size = sizeof(PresplitItem)*alloc_numPrimitives;
+      PresplitItem *presplitItem     = (PresplitItem*)alignedMalloc(presplit_allocation_size,64);
+      PresplitItem *tmp_presplitItem = (PresplitItem*)alignedMalloc(presplit_allocation_size,64);
+
+      /* compute grid */
+      const Vec3fa grid_base    = pinfo.geomBounds.lower;
+      const Vec3fa grid_diag    = pinfo.geomBounds.size();
+      const float grid_extend   = max(grid_diag.x,max(grid_diag.y,grid_diag.z));
+      const float grid_scale    = grid_extend == 0.0f ? 0.0f : GRID_SIZE / grid_extend;
+
+      /* init presplit items and get total sum */
+      const float psum = parallel_reduce( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), 0.0f, [&](const range<size_t>& r) -> float {
+          float sum = 0.0f;
+          for (size_t i=r.begin(); i<r.end(); i++)
+          {
+            presplitItem[i].index = (unsigned int)i;
+            const Vec2i mc = computeMC(grid_base,grid_scale,prims[i]);
+            /* if all bits are equal then we cannot split */
+            presplitItem[i].priority = (mc.x != mc.y) ? PresplitItem::compute_priority<Mesh>(prims[i],scene,mc) : 0.0f;
+            /* FIXME: sum undeterministic */
+            sum += presplitItem[i].priority;
+          }
+          return sum;
+        },[](const float& a, const float& b) -> float { return a+b; });
+
+      /* compute number of splits per primitive */
+      const float inv_psum = 1.0f / psum; // infinite when psum == 0, but only used for items with priority > 0
+      parallel_for( size_t(0), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void {
+          for (size_t i=r.begin(); i<r.end(); i++)
+          {
+            if (presplitItem[i].priority > 0.0f)
+            {
+              const float rel_p = (float)numSplitPrimitivesBudget * presplitItem[i].priority * inv_psum;
+              if (rel_p >= PRIORITY_CUTOFF_THRESHOLD) // need at least a split budget that generates two sub-prims
+              {
+                presplitItem[i].priority = max(min(ceilf(logf(rel_p)/logf(2.0f)),(float)MAX_PRESPLITS_PER_PRIMITIVE_LOG),1.0f);
+                /* from here on 'priority' holds the number of split levels: ceil(log2(rel_p)) clamped to [1,MAX_PRESPLITS_PER_PRIMITIVE_LOG] */
+                assert(presplitItem[i].priority >= 0.0f && presplitItem[i].priority <= (float)MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+              }
+              else
+                presplitItem[i].priority = 0.0f;
+            }
+          }
+        });
+
+      auto isLeft = [&] (const PresplitItem &ref) { return ref.priority < PRIORITY_CUTOFF_THRESHOLD; };
+      size_t center = parallel_partitioning(presplitItem,0,numPrimitives,isLeft,1024);
+
+      /* anything to split ? */
+      if (center < numPrimitives)
+      {
+        size_t numPrimitivesToSplit = numPrimitives - center;
+        assert(presplitItem[center].priority >= 1.0f);
+        /* sort presplit items in ascending order */
+        radix_sort_u32(presplitItem + center,tmp_presplitItem + center,numPrimitivesToSplit,1024);
+
+        CHECK_PRESPLIT(
+          parallel_for( size_t(center+1), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& r) -> void {
+              for (size_t i=r.begin(); i<r.end(); i++)
+                assert(presplitItem[i-1].priority <= presplitItem[i].priority);
+            });
+          );
+        unsigned int *      primOffset0 = (unsigned int*)tmp_presplitItem;                        // per-item sub-primitive counts
+        unsigned int *const primOffset1 = (unsigned int*)tmp_presplitItem + numPrimitivesToSplit; // prefix sums of those counts
+
+        /* compute actual number of sub-primitives generated within the [center;numPrimitives-1] range */
+        const size_t totalNumSubPrims = parallel_reduce( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), size_t(0), [&](const range<size_t>& t) -> size_t {
+            size_t sum = 0;
+            for (size_t i=t.begin(); i<t.end(); i++)
+            {
+              PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];
+              assert(presplitItem[i].priority >= 1.0f);
+              const unsigned int  primrefID = presplitItem[i].index;
+              const float prio              = presplitItem[i].priority;
+              const unsigned int   geomID   = prims[primrefID].geomID();
+              const unsigned int   primID   = prims[primrefID].primID();
+              const unsigned int split_levels = (unsigned int)prio;
+              unsigned int numSubPrims = 0;
+              splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+              assert(numSubPrims);
+              numSubPrims--; // can reuse slot
+              sum+=numSubPrims;
+              presplitItem[i].data = (numSubPrims << MAX_PRESPLITS_PER_PRIMITIVE_LOG) | split_levels;
+              primOffset0[i-center] = numSubPrims;
+            }
+            return sum;
+          },[](const size_t& a, const size_t& b) -> size_t { return a+b; });
+
+        /* if we are over budget, need to shrink the range */
+        if (totalNumSubPrims > numSplitPrimitivesBudget)
+        {
+          size_t new_center = numPrimitives-1;
+          size_t sum = 0;
+          for (;new_center>=center;new_center--)
+          {
+            const unsigned int numSubPrims = presplitItem[new_center].data >> MAX_PRESPLITS_PER_PRIMITIVE_LOG;
+            if (unlikely(sum + numSubPrims >= numSplitPrimitivesBudget)) break;
+            sum += numSubPrims;
+          }
+          new_center++;
+          /* primOffset0[k] was filled for item (center+k): move the count array and the item count along with the new
+           * center, otherwise the prefix sum below still covers the dropped items and primOffset1[j-center] is misaligned */
+          primOffset0 += new_center - center;
+          numPrimitivesToSplit -= new_center - center;
+          center = new_center;
+          assert(numPrimitivesToSplit == numPrimitives - center);
+        }
+
+        /* parallel prefix sum to compute offsets for storing sub-primitives */
+        const unsigned int offset = parallel_prefix_sum(primOffset0,primOffset1,numPrimitivesToSplit,(unsigned int)0,std::plus<unsigned int>());
+
+        /* iterate over range, and split primitives into sub primitives and append them to prims array */
+        parallel_for( size_t(center), numPrimitives, size_t(MIN_STEP_SIZE), [&](const range<size_t>& rn) -> void {
+            for (size_t j=rn.begin(); j<rn.end(); j++)
+            {
+              PrimRef subPrims[MAX_PRESPLITS_PER_PRIMITIVE];
+              const unsigned int  primrefID = presplitItem[j].index;
+              const unsigned int   geomID   = prims[primrefID].geomID();
+              const unsigned int   primID   = prims[primrefID].primID();
+              const unsigned int split_levels = presplitItem[j].data & ((unsigned int)(1 << MAX_PRESPLITS_PER_PRIMITIVE_LOG)-1);
+              assert(split_levels);
+              assert(split_levels <= MAX_PRESPLITS_PER_PRIMITIVE_LOG);
+              unsigned int numSubPrims = 0;
+              splitPrimitive(Splitter,prims[primrefID],geomID,primID,split_levels,grid_base,grid_scale,grid_extend,subPrims,numSubPrims);
+              const size_t newID = numPrimitives + primOffset1[j-center];
+              assert(newID+numSubPrims <= alloc_numPrimitives);
+              prims[primrefID] = subPrims[0]; // first piece reuses the original slot, the rest is appended
+              for (size_t i=1;i<numSubPrims;i++)
+                prims[newID+i-1] = subPrims[i];
+            }
+          });
+        numPrimitives += offset;
+        DBG_PRESPLIT(
+          PRINT(pinfo.size());
+          PRINT(numPrimitives);
+          PRINT((float)numPrimitives/org_numPrimitives));
+      }
+
+      /* recompute centroid bounding boxes */
+      pinfo = parallel_reduce(size_t(0),numPrimitives,size_t(MIN_STEP_SIZE),PrimInfo(empty),[&] (const range<size_t>& r) -> PrimInfo {
+          PrimInfo p(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+            p.add_center2(prims[j]);
+          return p;
+        }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+      assert(pinfo.size() == numPrimitives);
+
+      /* free double buffer presplit items */
+      alignedFree(tmp_presplitItem);
+      alignedFree(presplitItem);
+      return pinfo;
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/builders/splitter.h b/thirdparty/embree-aarch64/kernels/builders/splitter.h
new file mode 100644
index 0000000000..dbd6cf07c7
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/builders/splitter.h
@@ -0,0 +1,169 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/primref.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /* Splits the polygon given by the N edges v[i] -> v[i+1] at v[dim] == pos and returns the bounds of both
+     * sides, clipped against 'bounds'. inv_length[i] is expected to hold the reciprocal of v[i+1]-v[i]. */
+    template<size_t N>
+    __forceinline void splitPolygon(const BBox3fa& bounds,
+                                    const size_t dim,
+                                    const float pos,
+                                    const Vec3fa (&v)[N+1],
+                                    const Vec3fa (&inv_length)[N],
+                                    BBox3fa& left_o,
+                                    BBox3fa& right_o)
+    {
+      BBox3fa lbox = empty, rbox = empty;
+      /* walk all N edges (a -> b) of the polygon */
+      for (size_t e=0; e<N; e++)
+      {
+        const Vec3fa& a = v[e];
+        const Vec3fa& b = v[e+1];
+        const float ad = a[dim];
+        const float bd = b[dim];
+        /* the start vertex joins every side it touches (both sides when it lies exactly at pos) */
+        if (ad <= pos) lbox.extend(a);
+        if (ad >= pos) rbox.extend(a);
+        /* only an edge that strictly crosses the splitting location contributes an extra point */
+        const bool crossing = (ad < pos && pos < bd) || (bd < pos && pos < ad);
+        if (!crossing) continue;
+        assert((bd-ad) != 0.0f);
+        const Vec3fa hit = madd(Vec3fa((pos-ad)*inv_length[e][dim]),b-a,a);
+        lbox.extend(hit);
+        rbox.extend(hit);
+      }
+      /* clip against current bounds */
+      left_o  = intersect(lbox,bounds);
+      right_o = intersect(rbox,bounds);
+    }
+    
+    /* PrimRef flavour of the polygon split: the edge parameter is computed from the vertices
+     * themselves and both results inherit geomID/primID from 'prim'. */
+    template<size_t N>
+      __forceinline void splitPolygon(const PrimRef& prim,
+                                      const size_t dim,
+                                      const float pos,
+                                      const Vec3fa (&v)[N+1],
+                                      PrimRef& left_o,
+                                      PrimRef& right_o)
+    {
+      BBox3fa lbox = empty, rbox = empty;
+      for (size_t e=0; e<N; e++)
+      {
+        const Vec3fa& a = v[e];
+        const Vec3fa& b = v[e+1];
+        const float ad = a[dim];
+        const float bd = b[dim];
+        /* the start vertex joins every side it touches (both sides when it lies exactly at pos) */
+        if (ad <= pos) lbox.extend(a);
+        if (ad >= pos) rbox.extend(a);
+        const bool crossing = (ad < pos && pos < bd) || (bd < pos && pos < ad);
+        if (!crossing) continue; // only strictly crossing edges add an intersection point
+        assert((bd-ad) != 0.0f);
+        const float inv = 1.0f/(bd-ad);
+        const Vec3fa hit = madd(Vec3fa((pos-ad)*inv),b-a,a);
+        lbox.extend(hit);
+        rbox.extend(hit);
+      }
+
+      /* clip against current bounds */
+      new (&left_o ) PrimRef(intersect(lbox,prim.bounds()),prim.geomID(), prim.primID());
+      new (&right_o) PrimRef(intersect(rbox,prim.bounds()),prim.geomID(), prim.primID());
+    }
+    
+    /* Splitter for one triangle: caches the closed vertex loop and the per-edge reciprocal extents */
+    struct TriangleSplitter
+    {
+      __forceinline TriangleSplitter(const Scene* scene, const PrimRef& prim)
+      {
+        /* mask off the upper RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS bits of the geomID before the lookup */
+        const unsigned int geomMask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+        const TriangleMesh* const mesh = (const TriangleMesh*) scene->get(prim.geomID() & geomMask);
+        const TriangleMesh::Triangle tri = mesh->triangle(prim.primID());
+        /* v[3] repeats v[0] so that edge i always runs from v[i] to v[i+1] */
+        for (size_t i=0; i<4; i++)
+          v[i] = mesh->vertex(tri.v[i%3]);
+        for (size_t i=0; i<3; i++)
+          inv_length[i] = Vec3fa(1.0f) / (v[i+1]-v[i]);
+      }
+
+      /* splits 'prim' at 'pos' along dimension 'dim' into two PrimRefs */
+      __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const {
+        splitPolygon<3>(prim,dim,pos,v,left_o,right_o);
+      }
+      /* same split on plain bounds, using the cached reciprocal edge extents */
+      __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const {
+        splitPolygon<3>(prim,dim,pos,v,inv_length,left_o,right_o);
+      }
+    private:
+      Vec3fa v[4];
+      Vec3fa inv_length[3];
+    };
+    
+    /* Factory handing out one TriangleSplitter per primitive of a scene */
+    struct TriangleSplitterFactory
+    {
+      __forceinline TriangleSplitterFactory(const Scene* scene) : scene(scene) {}
+
+      /* creates the splitter for 'prim' */
+      __forceinline TriangleSplitter operator() (const PrimRef& prim) const {
+        return TriangleSplitter(scene,prim);
+      }
+    private:
+      const Scene* scene; // stored only to be forwarded to each splitter
+    };
+    
+    /* Splitter for one quad: caches the closed vertex loop and the per-edge reciprocal extents */
+    struct QuadSplitter
+    {
+      __forceinline QuadSplitter(const Scene* scene, const PrimRef& prim)
+      {
+        /* mask off the upper RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS bits of the geomID before the lookup */
+        const unsigned int geomMask = 0xFFFFFFFF >> RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS;
+        const QuadMesh* const mesh = (const QuadMesh*) scene->get(prim.geomID() & geomMask);
+        const QuadMesh::Quad quad = mesh->quad(prim.primID());
+        /* v[4] repeats v[0] so that edge i always runs from v[i] to v[i+1] */
+        for (size_t i=0; i<5; i++)
+          v[i] = mesh->vertex(quad.v[i%4]);
+        for (size_t i=0; i<4; i++)
+          inv_length[i] = Vec3fa(1.0f) / (v[i+1]-v[i]);
+      }
+
+      /* splits 'prim' at 'pos' along dimension 'dim' into two PrimRefs */
+      __forceinline void operator() (const PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const {
+        splitPolygon<4>(prim,dim,pos,v,left_o,right_o);
+      }
+
+      /* same split on plain bounds, using the cached reciprocal edge extents */
+      __forceinline void operator() (const BBox3fa& prim, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const {
+        splitPolygon<4>(prim,dim,pos,v,inv_length,left_o,right_o);
+      }
+
+    private:
+      Vec3fa v[5];
+      Vec3fa inv_length[4];
+    };
+    
+    /* Factory handing out one QuadSplitter per primitive of a scene */
+    struct QuadSplitterFactory
+    {
+      __forceinline QuadSplitterFactory(const Scene* scene) : scene(scene) {}
+
+      /* creates the splitter for 'prim' */
+      __forceinline QuadSplitter operator() (const PrimRef& prim) const {
+        return QuadSplitter(scene,prim);
+      }
+    private:
+      const Scene* scene; // stored only to be forwarded to each splitter
+    };
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp
new file mode 100644
index 0000000000..bd102bd6ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh.cpp
@@ -0,0 +1,190 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_statistics.h"
+
+namespace embree
+{
+  /* Creates an empty BVH (root = emptyNode, zero primitives/vertices) tied to 'scene' and its device */
+  template<int N>
+  BVHN<N>::BVHN (const PrimitiveType& primTy, Scene* scene)
+    : AccelData((N==4) ? AccelData::TY_BVH4 : ((N==8) ? AccelData::TY_BVH8 : AccelData::TY_UNKNOWN)),
+      primTy(&primTy), device(scene->device), scene(scene), root(emptyNode),
+      alloc(scene->device,scene->isStaticAccel()), numPrimitives(0), numVertices(0)
+  {}
+
+  /* Deletes every entry stored in 'objects' */
+  template<int N>
+  BVHN<N>::~BVHN ()
+  {
+    for (size_t k=0; k<objects.size(); ++k) delete objects[k];
+  }
+
+  /* Resets root, bounds and primitive count to the empty state, then clears the allocator */
+  template<int N>
+  void BVHN<N>::clear() {
+    this->set(BVHN::emptyNode,empty,0);
+    this->alloc.clear();
+  }
+
+  /* Installs a new root together with its bounds and primitive count */
+  template<int N>
+  void BVHN<N>::set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives) {
+    this->numPrimitives = numPrimitives;
+    this->bounds = bounds;
+    this->root = root;
+  }
+
+  /* Clears the barrier bit of the first barrier node found on every path below 'node' */
+  template<int N>
+  void BVHN<N>::clearBarrier(NodeRef& node)
+  {
+    /* a barrier ends the descent; leaves have no children to visit */
+    if (node.isBarrier()) { node.clearBarrier(); return; }
+    if (node.isLeaf()) return;
+
+    BaseNode* const inner = node.baseNode(); // FIXME: flags should be stored in BVH
+    for (size_t c=0; c<N; c++) clearBarrier(inner->child(c));
+  }
+
+  /* Marks up to 'num' of the largest-area subtrees with a barrier and re-allocates the AABB nodes above them via
+   * layoutLargeNodesRecursion. The body is compiled out unless __X86_64__ or __aarch64__ is defined. */
+  template<int N>
+  void BVHN<N>::layoutLargeNodes(size_t num)
+  {
+#if defined(__X86_64__) || defined(__aarch64__) // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues
+    /* heap entry: node reference keyed by the area of its bounds (leaves get -inf and therefore sort last) */
+    struct NodeArea
+    {
+      __forceinline NodeArea() {}
+      __forceinline NodeArea(NodeRef& node, const BBox3fa& bounds)
+        : node(&node), A(node.isLeaf() ? float(neg_inf) : area(bounds)) {}
+      __forceinline bool operator< (const NodeArea& other) const { return this->A < other.A; }
+      NodeRef* node;
+      float A;
+    };
+    std::vector<NodeArea> lst;
+    lst.reserve(num);
+    lst.push_back(NodeArea(root,empty));
+
+    /* repeatedly replace the largest entry by its non-empty children; the emptiness check keeps
+     * pop_heap()/back() off an empty vector in case an expanded node had only empty children */
+    while (!lst.empty() && lst.size() < num)
+    {
+      std::pop_heap(lst.begin(), lst.end());
+      NodeArea n = lst.back(); lst.pop_back();
+      if (!n.node->isAABBNode()) break; // the largest remaining entry cannot be expanded
+      AABBNode* node = n.node->getAABBNode();
+      for (size_t i=0; i<N; i++) {
+        if (node->child(i) == BVHN::emptyNode) continue;
+        lst.push_back(NodeArea(node->child(i),node->bounds(i)));
+        std::push_heap(lst.begin(), lst.end());
+      }
+    }
+
+    /* everything still in the list becomes a barrier for the recursion below */
+    for (size_t i=0; i<lst.size(); i++)
+      lst[i].node->setBarrier();
+    root = layoutLargeNodesRecursion(root,alloc.getCachedAllocator());
+#endif
+  }
+  
+  /* Copies the AABB nodes above the barriers into memory obtained from 'allocator' and returns the new
+   * reference; barrier nodes (barrier cleared) and all other node types are returned as they are. */
+  template<int N>
+  typename BVHN<N>::NodeRef BVHN<N>::layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator)
+  {
+    /* a barrier stops the descent */
+    if (node.isBarrier()) { node.clearBarrier(); return node; }
+    if (!node.isAABBNode()) return node;
+
+    AABBNode* const src = node.getAABBNode();
+    AABBNode* const dst = (BVHN::AABBNode*) allocator.malloc0(sizeof(BVHN::AABBNode),byteNodeAlignment);
+    *dst = *src;
+
+    /* point the copy at the results of the recursion on the old children */
+    for (size_t c=0; c<N; c++)
+      dst->child(c) = layoutLargeNodesRecursion(src->child(c),allocator);
+    return encodeNode(dst);
+  }
+
+  /* Announces the build when device->verbosity(2) holds and returns the start time: inf for an empty
+   * builder name, getSeconds() when benchmarking or verbose, 0.0 otherwise. */
+  template<int N>
+  double BVHN<N>::preBuild(const std::string& builderName)
+  {
+    if (builderName.empty()) return inf; // postBuild() returns early on this value
+
+    if (device->verbosity(2)) {
+      const char* const mbTag = (builderName.find("MBlur") != std::string::npos) ? "MB" : "";
+      Lock<MutexSys> lock(g_printMutex);
+      std::cout << "building BVH" << N << mbTag << "<" << primTy->name() << "> using " << builderName << " ..." << std::endl << std::flush;
+    }
+
+    if (device->benchmark || device->verbosity(2)) return getSeconds();
+    return 0.0;
+  }
+
+  /* Counterpart of preBuild(): prints build time, throughput and statistics depending on the device's
+   * verbosity/benchmark settings. Returns immediately when t0 is the inf value. */
+  template<int N>
+  void BVHN<N>::postBuild(double t0)
+  {
+    if (t0 == double(inf)) return;
+
+    /* elapsed time since t0, only measured when one of the outputs below is enabled */
+    const double dt = (device->benchmark || device->verbosity(2)) ? getSeconds()-t0 : 0.0;
+    /* tree statistics, created on first use and shared by both outputs below */
+    std::unique_ptr<BVHNStatistics<N>> treeStats;
+
+    /* print statistics */
+    if (device->verbosity(2))
+    {
+      if (!treeStats) treeStats.reset(new BVHNStatistics<N>(this));
+      const size_t usedBytes = alloc.getUsedBytes();
+      Lock<MutexSys> lock(g_printMutex);
+      std::cout << "finished BVH" << N << "<" << primTy->name() << "> : " << 1000.0f*dt << "ms, " << 1E-6*double(numPrimitives)/dt << " Mprim/s, " << 1E-9*double(usedBytes)/dt << " GB/s" << std::endl;
+
+      if (device->verbosity(2))
+        std::cout << treeStats->str();
+      /* allocator statistics of this BVH plus all non-null entries of 'objects' */
+      if (device->verbosity(2))
+      {
+        FastAllocator::AllStatistics allocStats(&alloc);
+        for (size_t i=0; i<objects.size(); i++) {
+          if (!objects[i]) continue;
+          allocStats = allocStats + FastAllocator::AllStatistics(&objects[i]->alloc);
+        }
+        allocStats.print(numPrimitives);
+      }
+
+      if (device->verbosity(3))
+      {
+        alloc.print_blocks();
+        for (size_t i=0; i<objects.size(); i++) {
+          if (objects[i]) objects[i]->alloc.print_blocks();
+        }
+      }
+
+      std::cout << std::flush;
+    }
+
+    /* benchmark mode */
+    if (device->benchmark)
+    {
+      if (!treeStats) treeStats.reset(new BVHNStatistics<N>(this));
+      Lock<MutexSys> lock(g_printMutex);
+      std::cout << "BENCHMARK_BUILD " << dt << " " << double(numPrimitives)/dt << " " << treeStats->sah() << " " << treeStats->bytesUsed() << " BVH" << N << "<" << primTy->name() << ">" << std::endl << std::flush;
+    }
+  }
+
+#if defined(__AVX__)
+  template class BVHN<8>;
+#endif
+
+#if !defined(__AVX__) || !defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42) || defined(__aarch64__)
+  template class BVHN<4>;
+#endif
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh.h b/thirdparty/embree-aarch64/kernels/bvh/bvh.h
new file mode 100644
index 0000000000..8fdf912e52
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh.h
@@ -0,0 +1,235 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+/* include all node types */
+#include "bvh_node_aabb.h"
+#include "bvh_node_aabb_mb.h"
+#include "bvh_node_aabb_mb4d.h"
+#include "bvh_node_obb.h"
+#include "bvh_node_obb_mb.h"
+#include "bvh_node_qaabb.h"
+
+namespace embree
+{
+  /*! Bit flags selecting which node types an intersector has to handle. */
+  enum BVHNodeFlags
+  {
+    BVH_FLAG_ALIGNED_NODE      = 1 << 0,   // 0x0000001
+    BVH_FLAG_ALIGNED_NODE_MB   = 1 << 4,   // 0x0000010
+    BVH_FLAG_UNALIGNED_NODE    = 1 << 8,   // 0x0000100
+    BVH_FLAG_UNALIGNED_NODE_MB = 1 << 12,  // 0x0001000
+    BVH_FLAG_QUANTIZED_NODE    = 1 << 20,  // 0x0100000
+    BVH_FLAG_ALIGNED_NODE_MB4D = 1 << 24,  // 0x1000000
+    
+    /* abbreviated names and frequently used combinations of the flags above */
+    BVH_AN1          = BVH_FLAG_ALIGNED_NODE,
+    BVH_AN2          = BVH_FLAG_ALIGNED_NODE_MB,
+    BVH_AN2_AN4D     = BVH_AN2 | BVH_FLAG_ALIGNED_NODE_MB4D,
+    BVH_UN1          = BVH_FLAG_UNALIGNED_NODE,
+    BVH_UN2          = BVH_FLAG_UNALIGNED_NODE_MB,
+    BVH_MB           = BVH_AN2_AN4D | BVH_UN2,
+    BVH_AN1_UN1      = BVH_AN1 | BVH_UN1,
+    BVH_AN2_UN2      = BVH_AN2 | BVH_UN2,
+    BVH_AN2_AN4D_UN2 = BVH_AN2_AN4D | BVH_UN2,
+    BVH_QN1          = BVH_FLAG_QUANTIZED_NODE
+  };
+  
+  /*! Multi BVH with N children. Each node stores the bounding box of
+   * its N children as well as N child references. */
+  template<int N>
+    class BVHN : public AccelData
+  {
+    ALIGNED_CLASS_(16);
+  public:
+    
+    /*! node reference type and the node types instantiated for this N */
+    typedef NodeRefPtr<N> NodeRef;
+    typedef BaseNode_t<NodeRef,N> BaseNode;
+    typedef AABBNode_t<NodeRef,N> AABBNode;
+    typedef AABBNodeMB_t<NodeRef,N> AABBNodeMB;
+    typedef AABBNodeMB4D_t<NodeRef,N> AABBNodeMB4D;
+    typedef OBBNode_t<NodeRef,N> OBBNode;
+    typedef OBBNodeMB_t<NodeRef,N> OBBNodeMB;
+    typedef QuantizedBaseNode_t<N> QuantizedBaseNode;
+    typedef QuantizedBaseNodeMB_t<N> QuantizedBaseNodeMB;
+    typedef QuantizedNode_t<NodeRef,N> QuantizedNode;
+    
+    /*! Number of bytes the nodes and primitives are minimally aligned to.*/
+    static const size_t byteAlignment = 16;
+    static const size_t byteNodeAlignment = 4*N; //!< scales with the branching factor: 4 bytes per child
+    
+    /*! Empty node (value taken from NodeRef) */
+    static const size_t emptyNode = NodeRef::emptyNode;
+    
+    /*! Invalid node, used as marker in traversal (values taken from NodeRef) */
+    static const size_t invalidNode = NodeRef::invalidNode;
+    static const size_t popRay      = NodeRef::popRay;
+    
+    /*! Maximum depth of the BVH. */
+    static const size_t maxBuildDepth = 32;
+    static const size_t maxBuildDepthLeaf = maxBuildDepth+8;
+    static const size_t maxDepth = 2*maxBuildDepthLeaf; // 2x because of two level builder
+    
+    /*! Maximum number of primitive blocks in a leaf. */
+    static const size_t maxLeafBlocks = NodeRef::maxLeafBlocks;
+    
+  public:
+    
+    /*! Builder interface to create allocator: a FastAllocator::Create bound to bvh->alloc */
+    struct CreateAlloc : public FastAllocator::Create {
+      __forceinline CreateAlloc (BVHN* bvh) : FastAllocator::Create(&bvh->alloc) {}
+    };
+
+    typedef BVHNodeRecord<NodeRef>     NodeRecord;
+    typedef BVHNodeRecordMB<NodeRef>   NodeRecordMB;
+    typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+    
+  public:
+    
+    /*! BVHN constructor; takes the primitive type stored in the BVH and the owning scene. */
+    BVHN (const PrimitiveType& primTy, Scene* scene);
+    
+    /*! BVHN destruction */
+    ~BVHN ();
+    
+    /*! clears the acceleration structure */
+    void clear();
+    
+    /*! sets BVH members after build */
+    void set (NodeRef root, const LBBox3fa& bounds, size_t numPrimitives);
+    
+    /*! Clears the barrier bits of a subtree. */
+    void clearBarrier(NodeRef& node);
+    
+    /*! lays out num large nodes of the BVH */
+    void layoutLargeNodes(size_t num);
+    NodeRef layoutLargeNodesRecursion(NodeRef& node, const FastAllocator::CachedAllocator& allocator);
+    
+    /*! called by all builders before build starts; NOTE(review): the returned double is presumably the start time to hand to postBuild() - confirm */
+    double preBuild(const std::string& builderName);
+    
+    /*! called by all builders after build ended; t0 is the build start time, the build duration is computed as getSeconds()-t0 */
+    void postBuild(double t0);
+    
+    /*! allocator functor: operator() allocates the requested number of bytes from bvh->alloc via its thread-local allocator */
+    struct Allocator {
+      BVHN* bvh;
+      Allocator (BVHN* bvh) : bvh(bvh) {}
+      __forceinline void* operator() (size_t bytes) const { 
+        return bvh->alloc._threadLocal()->malloc(&bvh->alloc,bytes); 
+      }
+    };
+    
+    /*! post build cleanup; forwards to the allocator's cleanup() */
+    void cleanup() {
+      alloc.cleanup();
+    }
+    
+  public:
+    
+    /*! Encodes a node or leaf pointer as a NodeRef; every overload forwards to the matching NodeRef static function */
+    static __forceinline NodeRef encodeNode(AABBNode* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeNode(AABBNodeMB* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeNode(AABBNodeMB4D* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeNode(OBBNode* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeNode(OBBNodeMB* node) { return NodeRef::encodeNode(node); }
+    static __forceinline NodeRef encodeLeaf(void* tri, size_t num) { return NodeRef::encodeLeaf(tri,num); }
+    static __forceinline NodeRef encodeTypedLeaf(void* ptr, size_t ty) { return NodeRef::encodeTypedLeaf(ptr,ty); }
+    
+  public:
+    /*! Prefetches the node this reference points to: touches consecutive 64-byte blocks starting at ref.ptr.
+     *  The number of blocks depends on N and on the BVHNodeFlags value passed in types (prefetchL2 with __AVX512PF__, prefetchL1 otherwise). */
+    __forceinline static void prefetch(const NodeRef ref, int types=0)
+    {
+#if defined(__AVX512PF__) // MIC
+      if (types != BVH_FLAG_QUANTIZED_NODE) {
+        prefetchL2(((char*)ref.ptr)+0*64);
+        prefetchL2(((char*)ref.ptr)+1*64);
+        if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) {
+          prefetchL2(((char*)ref.ptr)+2*64);
+          prefetchL2(((char*)ref.ptr)+3*64);
+        }
+        if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) {
+          /* KNL still needs L2 prefetches for large nodes */
+          prefetchL2(((char*)ref.ptr)+4*64);
+          prefetchL2(((char*)ref.ptr)+5*64);
+          prefetchL2(((char*)ref.ptr)+6*64);
+          prefetchL2(((char*)ref.ptr)+7*64);
+        }
+      }
+      else
+      {
+        /* quantized nodes: always three blocks; todo: reduce if 32bit offsets are enabled */
+        prefetchL2(((char*)ref.ptr)+0*64);
+        prefetchL2(((char*)ref.ptr)+1*64);
+        prefetchL2(((char*)ref.ptr)+2*64);
+      }
+#else
+      if (types != BVH_FLAG_QUANTIZED_NODE) {
+        prefetchL1(((char*)ref.ptr)+0*64);
+        prefetchL1(((char*)ref.ptr)+1*64);
+        if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) {
+          prefetchL1(((char*)ref.ptr)+2*64);
+          prefetchL1(((char*)ref.ptr)+3*64);
+        }
+        if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) {
+          /* intentionally empty: deactivated for large nodes on Xeon, as it introduces regressions */
+          //prefetchL1(((char*)ref.ptr)+4*64);
+          //prefetchL1(((char*)ref.ptr)+5*64);
+          //prefetchL1(((char*)ref.ptr)+6*64);
+          //prefetchL1(((char*)ref.ptr)+7*64);
+        }
+      }
+      else
+      {
+        /* quantized nodes: always three blocks; todo: reduce if 32bit offsets are enabled */
+        prefetchL1(((char*)ref.ptr)+0*64);
+        prefetchL1(((char*)ref.ptr)+1*64);
+        prefetchL1(((char*)ref.ptr)+2*64);
+      }
+#endif
+    }
+    /*! Prefetches via prefetchEX: blocks 0-1 always, blocks 2-3 and 4-7 depending on N and types; no quantized-node special case. NOTE(review): presumably a prefetch for writing - confirm */
+    __forceinline static void prefetchW(const NodeRef ref, int types=0)
+    {
+      embree::prefetchEX(((char*)ref.ptr)+0*64);
+      embree::prefetchEX(((char*)ref.ptr)+1*64);
+      if ((N >= 8) || (types > BVH_FLAG_ALIGNED_NODE)) {
+        embree::prefetchEX(((char*)ref.ptr)+2*64);
+        embree::prefetchEX(((char*)ref.ptr)+3*64);
+      }
+      if ((N >= 8) && (types > BVH_FLAG_ALIGNED_NODE)) {
+        embree::prefetchEX(((char*)ref.ptr)+4*64);
+        embree::prefetchEX(((char*)ref.ptr)+5*64);
+        embree::prefetchEX(((char*)ref.ptr)+6*64);
+        embree::prefetchEX(((char*)ref.ptr)+7*64);
+      }
+    }
+    
+    /*! bvh type information */
+  public:
+    const PrimitiveType* primTy;       //!< primitive type stored in the BVH
+    
+    /*! bvh data */
+  public:
+    Device* device;                    //!< device pointer
+    Scene* scene;                      //!< scene pointer
+    NodeRef root;                      //!< root node
+    FastAllocator alloc;               //!< allocator used to allocate nodes
+    
+    /*! statistics data */
+  public:
+    size_t numPrimitives;              //!< number of primitives the BVH is built over
+    size_t numVertices;                //!< number of vertices the BVH references
+    
+    /*! data arrays for special builders */
+  public:
+    std::vector<BVHN*> objects; //!< further BVHs; postBuild() skips null entries when gathering allocator statistics (NOTE(review): presumably one per object of the two-level builder - confirm)
+    vector_t<char,aligned_allocator<char,32>> subdiv_patches; //!< raw char buffer; NOTE(review): presumably holds subdivision patch data - confirm
+  };
+  
+  typedef BVHN<4> BVH4; //!< BVH with 4 children per node
+  typedef BVHN<8> BVH8; //!< BVH with 8 children per node
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp
new file mode 100644
index 0000000000..23f4f63d45
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.cpp
@@ -0,0 +1,1325 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh4_factory.h"
+#include "../bvh/bvh.h"
+
+#include "../geometry/curveNv.h"
+#include "../geometry/curveNi.h"
+#include "../geometry/curveNi_mb.h"
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/subdivpatch1.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+#include "../common/accelinstance.h"
+
+namespace embree
+{
+  DECLARE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom);
+  /* NOTE(review): DECLARE_SYMBOL2 and DECLARE_ISA_FUNCTION are defined outside this file; presumably they declare the per-ISA variants that the SELECT_SYMBOL_* calls in BVH4Factory choose between - confirm */
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4i,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8i,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4v,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector4iMB,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void);
+  /* Accel::Intersector1 symbols */
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1);
+  
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker);
+  /* Accel::Intersector4 symbols */
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4);
+  
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker);
+  /* Accel::Intersector8 symbols */
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8);
+  
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker);
+  /* Accel::Intersector16 symbols */
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16);
+  
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker);
+  /* Accel::IntersectorN symbols: packet fallback plus the stream intersectors */
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4IntersectorStreamPacketFallback);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4IntersectorStreamMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4vIntersectorStreamPluecker);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Triangle4iIntersectorStreamPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4vIntersectorStreamPluecker);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4Quad4iIntersectorStreamPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool); // two-level builders (this group)
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+  /* curve builders (OBB variants) */
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+  /* SAH scene builders: triangles */
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  /* SAH scene builders: quads */
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  /* FastSpatialSAH scene builders */
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  /* virtual (selected under IF_ENABLED_USER), instance, grid and subdivision-patch scene builders */
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+  
+  DECLARE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t);
+  /* mesh refit builders: take a single mesh/geometry pointer and an unsigned int instead of a Scene */
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+
+  BVH4Factory::BVH4Factory(int bfeatures, int ifeatures) // bfeatures is passed to selectBuilders, ifeatures to selectIntersectors
+  {
+    SELECT_SYMBOL_DEFAULT_AVX_AVX2(ifeatures,BVH4ColliderUserGeom); // the collider is selected with the intersector feature value
+    /* select the remaining symbols; NOTE(review): both arguments are presumably ISA feature bit masks - confirm */
+    selectBuilders(bfeatures);
+    selectIntersectors(ifeatures);
+  }
+
+  void BVH4Factory::selectBuilders(int features) // features is handed unchanged to every SELECT_SYMBOL_* invocation below
+  {
+    IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4MeshSAH)); // two-level builders (this group)
+    IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4iMeshSAH));
+    IF_ENABLED_TRIS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelTriangle4vMeshSAH));
+    IF_ENABLED_QUADS (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelQuadMeshSAH));
+    IF_ENABLED_USER (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelVirtualSAH));
+    IF_ENABLED_INSTANCE (SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4BuilderTwoLevelInstanceSAH));
+    /* curve builders; NOTE(review): the two 8-wide ones use SELECT_SYMBOL_INIT_AVX rather than a DEFAULT macro, presumably because no baseline variant exists - confirm */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4vBuilder_OBB_New));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Curve4iBuilder_OBB_New));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4OBBCurve4iMBBuilder_OBB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4Curve8iBuilder_OBB_New));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH4OBBCurve8iMBBuilder_OBB));
+    /* triangle scene builders */
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4SceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4vSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Triangle4iSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iMBSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vMBSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedTriangle4iSceneBuilderSAH));
+    /* quad scene builders */
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Quad4vSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4Quad4iSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4iMBSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4QuantizedQuad4iSceneBuilderSAH));
+    /* FastSpatialSAH scene builders */
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4SceneBuilderFastSpatialSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4vSceneBuilderFastSpatialSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Triangle4iSceneBuilderFastSpatialSAH));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4Quad4vSceneBuilderFastSpatialSAH));
+    /* virtual (user), instance, grid and subdivision builders, each under its own IF_ENABLED guard */
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4VirtualSceneBuilderSAH));
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4VirtualMBSceneBuilderSAH));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4InstanceSceneBuilderSAH));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4InstanceMBSceneBuilderSAH));
+    
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridSceneBuilderSAH));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_AVX(features,BVH4GridMBSceneBuilderSAH));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4SubdivPatch1BuilderSAH));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,BVH4SubdivPatch1MBBuilderSAH));
+  }
+
+  void BVH4Factory::selectIntersectors(int features)
+  {
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4i));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8i));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4v));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8v));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector4iMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8iMB));
+    
+    /* select intersectors1 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector1));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector1MB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust1));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust1MB));
+    
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4iIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4vIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,BVH4Triangle4iIntersector1Pluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector1Pluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector1Pluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector1Pluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector1Pluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector1Moeller));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,QBVH4Triangle4iIntersector1Pluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,QBVH4Quad4iIntersector1Pluecker));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector1));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector1));
+    
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector1));
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector1));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector1));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector1));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector1Moeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector1Moeller))
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector1Pluecker));
+
+#if defined (EMBREE_RAY_PACKETS)
+
+    /* select intersectors4 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector4Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector4HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust4Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust4HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector4HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vIntersector4HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector4HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector4HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector4HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector4HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector4HybridPluecker));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector4));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector4));
+    
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector4Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector4Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector4Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector4Chunk));
+    
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector4HybridMoeller));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector4HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector4HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector4HybridPluecker));
+
+    /* select intersectors8 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector8Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersector8HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust8Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust8HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4Intersector8HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vIntersector8HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iIntersector8HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4vMBIntersector8HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Triangle4iMBIntersector8HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4vIntersector8HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iIntersector8HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4Quad4iMBIntersector8HybridPluecker));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1Intersector8));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4SubdivPatch1MBIntersector8));
+    
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4VirtualIntersector8Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4VirtualMBIntersector8Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4InstanceIntersector8Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4InstanceMBIntersector8Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector8HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridMBIntersector8HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH4GridIntersector8HybridPluecker));
+
+    /* select intersectors16 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersector16Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersector16HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust16Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4OBBVirtualCurveIntersectorRobust16HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4Intersector16HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vIntersector16HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersector16HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vMBIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iMBIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4vMBIntersector16HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Triangle4iMBIntersector16HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersector16HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersector16HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iMBIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4Quad4iMBIntersector16HybridPluecker));
+
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4SubdivPatch1Intersector16));
+    IF_ENABLED_SUBDIV(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4SubdivPatch1MBIntersector16));
+    
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4VirtualIntersector16Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4VirtualMBIntersector16Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4InstanceIntersector16Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4InstanceMBIntersector16Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridIntersector16HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridMBIntersector16HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH4GridIntersector16HybridPluecker));
+
+    /* select stream intersectors */
+    SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4IntersectorStreamPacketFallback);
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4IntersectorStreamMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4IntersectorStreamMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersectorStreamMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4vIntersectorStreamPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Triangle4iIntersectorStreamPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersectorStreamMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4vIntersectorStreamPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4Quad4iIntersectorStreamPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4VirtualIntersectorStream));
+    
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH4InstanceIntersectorStream));
+
+#endif
+  }
+
+  /* Assembles the intersector table for the BVH4OBBVirtualCurve symbols.
+   *
+   * bvh             - stored in the table's ptr field
+   * leafIntersector - stored in the table's leafIntersector field
+   * ivariant        - FAST selects the plain symbols, ROBUST the "Robust" ones
+   *
+   * Any other variant trips an assert and, when asserts are compiled out,
+   * returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.leafIntersector = leafIntersector;
+      table.intersector1 = BVH4OBBVirtualCurveIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+      /* packet slots use the *Hybrid symbols, the N slot uses the packet fallback */
+      table.intersector4 = BVH4OBBVirtualCurveIntersector4Hybrid();
+      table.intersector8 = BVH4OBBVirtualCurveIntersector8Hybrid();
+      table.intersector16 = BVH4OBBVirtualCurveIntersector16Hybrid();
+      table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.leafIntersector = leafIntersector;
+      table.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4Hybrid();
+      table.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8Hybrid();
+      table.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16Hybrid();
+      table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    /* neither FAST nor ROBUST */
+    assert(false);
+    return Accel::Intersectors();
+  }
+
+  /* Assembles the intersector table for the BVH4OBBVirtualCurve "MB" symbols.
+   *
+   * bvh             - stored in the table's ptr field
+   * leafIntersector - stored in the table's leafIntersector field
+   * ivariant        - FAST selects the plain MB symbols, ROBUST the "Robust" MB ones
+   *
+   * Any other variant trips an assert and, when asserts are compiled out,
+   * returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.leafIntersector = leafIntersector;
+      table.intersector1 = BVH4OBBVirtualCurveIntersector1MB();
+#if defined (EMBREE_RAY_PACKETS)
+      /* packet slots use the *HybridMB symbols, the N slot uses the packet fallback */
+      table.intersector4 = BVH4OBBVirtualCurveIntersector4HybridMB();
+      table.intersector8 = BVH4OBBVirtualCurveIntersector8HybridMB();
+      table.intersector16 = BVH4OBBVirtualCurveIntersector16HybridMB();
+      table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.leafIntersector = leafIntersector;
+      table.intersector1 = BVH4OBBVirtualCurveIntersectorRobust1MB();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4 = BVH4OBBVirtualCurveIntersectorRobust4HybridMB();
+      table.intersector8 = BVH4OBBVirtualCurveIntersectorRobust8HybridMB();
+      table.intersector16 = BVH4OBBVirtualCurveIntersectorRobust16HybridMB();
+      table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    /* neither FAST nor ROBUST */
+    assert(false);
+    return Accel::Intersectors();
+  }
+
+  /* Assembles the intersector table from the BVH4Triangle4 Moeller symbols.
+   *
+   * bvh      - stored in the table's ptr field
+   * ivariant - must be IntersectVariant::FAST (checked by assert only)
+   *
+   * With EMBREE_RAY_PACKETS the 4/8/16/N slots each get a separate
+   * filter and no-filter entry. */
+  Accel::Intersectors BVH4Factory::BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    assert(ivariant == IntersectVariant::FAST);
+
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1 = BVH4Triangle4Intersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+    /* 4-slot */
+    table.intersector4_filter = BVH4Triangle4Intersector4HybridMoeller();
+    table.intersector4_nofilter = BVH4Triangle4Intersector4HybridMoellerNoFilter();
+    /* 8-slot */
+    table.intersector8_filter = BVH4Triangle4Intersector8HybridMoeller();
+    table.intersector8_nofilter = BVH4Triangle4Intersector8HybridMoellerNoFilter();
+    /* 16-slot */
+    table.intersector16_filter = BVH4Triangle4Intersector16HybridMoeller();
+    table.intersector16_nofilter = BVH4Triangle4Intersector16HybridMoellerNoFilter();
+    /* N-slot (stream symbols) */
+    table.intersectorN_filter = BVH4Triangle4IntersectorStreamMoeller();
+    table.intersectorN_nofilter = BVH4Triangle4IntersectorStreamMoellerNoFilter();
+#endif
+    return table;
+  }
+
+  /* Assembles the intersector table from the BVH4Triangle4v Pluecker symbols.
+   *
+   * bvh      - stored in the table's ptr field
+   * ivariant - must be IntersectVariant::ROBUST (checked by assert only) */
+  Accel::Intersectors BVH4Factory::BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    assert(ivariant == IntersectVariant::ROBUST);
+
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1 = BVH4Triangle4vIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+    /* packet slots use the *HybridPluecker symbols, the N slot the stream symbol */
+    table.intersector4 = BVH4Triangle4vIntersector4HybridPluecker();
+    table.intersector8 = BVH4Triangle4vIntersector8HybridPluecker();
+    table.intersector16 = BVH4Triangle4vIntersector16HybridPluecker();
+    table.intersectorN = BVH4Triangle4vIntersectorStreamPluecker();
+#endif
+    return table;
+  }
+
+  /* Assembles the intersector table from the BVH4Triangle4i symbols.
+   *
+   * bvh      - stored in the table's ptr field
+   * ivariant - FAST selects the Moeller symbols, ROBUST the Pluecker symbols
+   *
+   * Any other variant returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.intersector1 = BVH4Triangle4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4 = BVH4Triangle4iIntersector4HybridMoeller();
+      table.intersector8 = BVH4Triangle4iIntersector8HybridMoeller();
+      table.intersector16 = BVH4Triangle4iIntersector16HybridMoeller();
+      table.intersectorN = BVH4Triangle4iIntersectorStreamMoeller();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.intersector1 = BVH4Triangle4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4 = BVH4Triangle4iIntersector4HybridPluecker();
+      table.intersector8 = BVH4Triangle4iIntersector8HybridPluecker();
+      table.intersector16 = BVH4Triangle4iIntersector16HybridPluecker();
+      table.intersectorN = BVH4Triangle4iIntersectorStreamPluecker();
+#endif
+      return table;
+    }
+
+    /* any other variant */
+    return Accel::Intersectors();
+  }
+
+  /* Assembles the intersector table from the BVH4Triangle4vMB symbols.
+   *
+   * bvh      - stored in the table's ptr field
+   * ivariant - FAST selects the Moeller symbols, ROBUST the Pluecker symbols
+   *
+   * In both cases the N slot is BVH4IntersectorStreamPacketFallback.
+   * Any other variant returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.intersector1 = BVH4Triangle4vMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4 = BVH4Triangle4vMBIntersector4HybridMoeller();
+      table.intersector8 = BVH4Triangle4vMBIntersector8HybridMoeller();
+      table.intersector16 = BVH4Triangle4vMBIntersector16HybridMoeller();
+      table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.intersector1 = BVH4Triangle4vMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4 = BVH4Triangle4vMBIntersector4HybridPluecker();
+      table.intersector8 = BVH4Triangle4vMBIntersector8HybridPluecker();
+      table.intersector16 = BVH4Triangle4vMBIntersector16HybridPluecker();
+      table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    /* any other variant */
+    return Accel::Intersectors();
+  }
+
+  /* Assembles the intersector table from the BVH4Triangle4iMB symbols.
+   *
+   * bvh      - stored in the table's ptr field
+   * ivariant - FAST selects the Moeller symbols, ROBUST the Pluecker symbols
+   *
+   * In both cases the N slot is BVH4IntersectorStreamPacketFallback.
+   * Any other variant returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.intersector1 = BVH4Triangle4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4 = BVH4Triangle4iMBIntersector4HybridMoeller();
+      table.intersector8 = BVH4Triangle4iMBIntersector8HybridMoeller();
+      table.intersector16 = BVH4Triangle4iMBIntersector16HybridMoeller();
+      table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.intersector1 = BVH4Triangle4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4 = BVH4Triangle4iMBIntersector4HybridPluecker();
+      table.intersector8 = BVH4Triangle4iMBIntersector8HybridPluecker();
+      table.intersector16 = BVH4Triangle4iMBIntersector16HybridPluecker();
+      table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    /* any other variant */
+    return Accel::Intersectors();
+  }
+
+  /* Assembles the intersector table from the BVH4Quad4v symbols.
+   *
+   * bvh      - stored in the table's ptr field
+   * ivariant - FAST selects the Moeller symbols and fills the separate
+   *            filter / no-filter slots; ROBUST selects the Pluecker
+   *            symbols and fills the plain slots
+   *
+   * Any other variant returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.intersector1 = BVH4Quad4vIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      /* 4-slot */
+      table.intersector4_filter = BVH4Quad4vIntersector4HybridMoeller();
+      table.intersector4_nofilter = BVH4Quad4vIntersector4HybridMoellerNoFilter();
+      /* 8-slot */
+      table.intersector8_filter = BVH4Quad4vIntersector8HybridMoeller();
+      table.intersector8_nofilter = BVH4Quad4vIntersector8HybridMoellerNoFilter();
+      /* 16-slot */
+      table.intersector16_filter = BVH4Quad4vIntersector16HybridMoeller();
+      table.intersector16_nofilter = BVH4Quad4vIntersector16HybridMoellerNoFilter();
+      /* N-slot (stream symbols) */
+      table.intersectorN_filter = BVH4Quad4vIntersectorStreamMoeller();
+      table.intersectorN_nofilter = BVH4Quad4vIntersectorStreamMoellerNoFilter();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.intersector1 = BVH4Quad4vIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4 = BVH4Quad4vIntersector4HybridPluecker();
+      table.intersector8 = BVH4Quad4vIntersector8HybridPluecker();
+      table.intersector16 = BVH4Quad4vIntersector16HybridPluecker();
+      table.intersectorN = BVH4Quad4vIntersectorStreamPluecker();
+#endif
+      return table;
+    }
+
+    /* any other variant */
+    return Accel::Intersectors();
+  }
+
+  /* Assembles the intersector table from the BVH4Quad4i symbols.
+   *
+   * bvh      - stored in the table's ptr field
+   * ivariant - FAST selects the Moeller symbols, ROBUST the Pluecker symbols
+   *
+   * Any other variant returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.intersector1 = BVH4Quad4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4 = BVH4Quad4iIntersector4HybridMoeller();
+      table.intersector8 = BVH4Quad4iIntersector8HybridMoeller();
+      table.intersector16 = BVH4Quad4iIntersector16HybridMoeller();
+      table.intersectorN = BVH4Quad4iIntersectorStreamMoeller();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.intersector1 = BVH4Quad4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4 = BVH4Quad4iIntersector4HybridPluecker();
+      table.intersector8 = BVH4Quad4iIntersector8HybridPluecker();
+      table.intersector16 = BVH4Quad4iIntersector16HybridPluecker();
+      table.intersectorN = BVH4Quad4iIntersectorStreamPluecker();
+#endif
+      return table;
+    }
+
+    /* any other variant */
+    return Accel::Intersectors();
+  }
+
+  /* Assembles the intersector table from the BVH4Quad4iMB symbols.
+   *
+   * bvh      - stored in the table's ptr field
+   * ivariant - FAST selects the Moeller symbols, ROBUST the Pluecker symbols
+   *
+   * In both cases the N slot is BVH4IntersectorStreamPacketFallback.
+   * Any other variant returns a default-constructed table. */
+  Accel::Intersectors BVH4Factory::BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.intersector1 = BVH4Quad4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4 = BVH4Quad4iMBIntersector4HybridMoeller();
+      table.intersector8 = BVH4Quad4iMBIntersector8HybridMoeller();
+      table.intersector16 = BVH4Quad4iMBIntersector16HybridMoeller();
+      table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors table;
+      table.ptr = bvh;
+      table.intersector1 = BVH4Quad4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      table.intersector4 = BVH4Quad4iMBIntersector4HybridPluecker();
+      table.intersector8 = BVH4Quad4iMBIntersector8HybridPluecker();
+      table.intersector16 = BVH4Quad4iMBIntersector16HybridPluecker();
+      table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+      return table;
+    }
+
+    /* any other variant */
+    return Accel::Intersectors();
+  }
+
+  /* Assembles the intersector table for QBVH4Triangle4i. Only the
+   * single-ray slot is filled (QBVH4Triangle4iIntersector1Pluecker);
+   * every other slot keeps its default-constructed value. */
+  Accel::Intersectors BVH4Factory::QBVH4Triangle4iIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1 = QBVH4Triangle4iIntersector1Pluecker();
+    return table;
+  }
+
+  /* Assembles the intersector table for QBVH4Quad4i. Only the
+   * single-ray slot is filled (QBVH4Quad4iIntersector1Pluecker);
+   * every other slot keeps its default-constructed value. */
+  Accel::Intersectors BVH4Factory::QBVH4Quad4iIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1 = QBVH4Quad4iIntersector1Pluecker();
+    return table;
+  }
+
+  /* Assembles the intersector table from the BVH4Virtual symbols.
+   * The collider slot (BVH4ColliderUserGeom) is set regardless of
+   * EMBREE_RAY_PACKETS; the 4/8/16/N slots only when it is defined. */
+  Accel::Intersectors BVH4Factory::BVH4UserGeometryIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1 = BVH4VirtualIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    /* packet slots use the *Chunk symbols, the N slot the stream symbol */
+    table.intersector4 = BVH4VirtualIntersector4Chunk();
+    table.intersector8 = BVH4VirtualIntersector8Chunk();
+    table.intersector16 = BVH4VirtualIntersector16Chunk();
+    table.intersectorN = BVH4VirtualIntersectorStream();
+#endif
+    table.collider = BVH4ColliderUserGeom();
+    return table;
+  }
+
+  /* Assembles the intersector table from the BVH4VirtualMB symbols.
+   * The N slot is BVH4IntersectorStreamPacketFallback; no collider is set. */
+  Accel::Intersectors BVH4Factory::BVH4UserGeometryMBIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1 = BVH4VirtualMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersector4 = BVH4VirtualMBIntersector4Chunk();
+    table.intersector8 = BVH4VirtualMBIntersector8Chunk();
+    table.intersector16 = BVH4VirtualMBIntersector16Chunk();
+    table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+    return table;
+  }
+
+  /* Assembles the intersector table from the BVH4Instance symbols.
+   * The N slot uses BVH4InstanceIntersectorStream. */
+  Accel::Intersectors BVH4Factory::BVH4InstanceIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1 = BVH4InstanceIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersector4 = BVH4InstanceIntersector4Chunk();
+    table.intersector8 = BVH4InstanceIntersector8Chunk();
+    table.intersector16 = BVH4InstanceIntersector16Chunk();
+    table.intersectorN = BVH4InstanceIntersectorStream();
+#endif
+    return table;
+  }
+
+  /* Assembles the intersector table from the BVH4InstanceMB symbols.
+   * The N slot is BVH4IntersectorStreamPacketFallback. */
+  Accel::Intersectors BVH4Factory::BVH4InstanceMBIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1 = BVH4InstanceMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersector4 = BVH4InstanceMBIntersector4Chunk();
+    table.intersector8 = BVH4InstanceMBIntersector8Chunk();
+    table.intersector16 = BVH4InstanceMBIntersector16Chunk();
+    table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+    return table;
+  }
+  
+  /* Assembles the intersector table from the BVH4SubdivPatch1 symbols.
+   * The N slot is BVH4IntersectorStreamPacketFallback. */
+  Accel::Intersectors BVH4Factory::BVH4SubdivPatch1Intersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1 = BVH4SubdivPatch1Intersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersector4 = BVH4SubdivPatch1Intersector4();
+    table.intersector8 = BVH4SubdivPatch1Intersector8();
+    table.intersector16 = BVH4SubdivPatch1Intersector16();
+    table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+    return table;
+  }
+
+  /* Assembles the intersector table from the BVH4SubdivPatch1MB symbols.
+   * The N slot is BVH4IntersectorStreamPacketFallback. */
+  Accel::Intersectors BVH4Factory::BVH4SubdivPatch1MBIntersectors(BVH4* bvh)
+  {
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1 = BVH4SubdivPatch1MBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersector4 = BVH4SubdivPatch1MBIntersector4();
+    table.intersector8 = BVH4SubdivPatch1MBIntersector8();
+    table.intersector16 = BVH4SubdivPatch1MBIntersector16();
+    table.intersectorN = BVH4IntersectorStreamPacketFallback();
+#endif
+    return table;
+  }
+
+  /* Creates a BVH4 over Curve4i::type together with its intersector table and
+   * builder, and returns them wrapped in a new AccelInstance.
+   *
+   * scene    - passed to the BVH4 and the builder; scene->device->hair_builder
+   *            names the builder to use
+   * ivariant - forwarded to BVH4OBBVirtualCurveIntersectors
+   *
+   * The builder names "default" and "sah" are accepted and both select
+   * BVH4Curve4iBuilder_OBB_New. Any other name is reported through
+   * throw_RTCError with RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH4Factory::BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant)
+  {
+    /* Validate the builder name before allocating anything: the BVH4 below is
+     * held by a raw pointer, so reporting the error after 'new' would leak it. */
+    const auto& builderName = scene->device->hair_builder;
+    if (!(builderName == "default" || builderName == "sah")) {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4i>");
+    }
+
+    BVH4* accel = new BVH4(Curve4i::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4i(),ivariant);
+
+    /* "default" and "sah" map to the same builder */
+    Builder* builder = BVH4Curve4iBuilder_OBB_New(accel,scene,0);
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+#if defined(EMBREE_TARGET_SIMD8)
+  /* Creates a BVH4 over Curve8i::type together with its intersector table and
+   * builder, and returns them wrapped in a new AccelInstance.
+   *
+   * scene    - passed to the BVH4 and the builder; scene->device->hair_builder
+   *            names the builder to use
+   * ivariant - forwarded to BVH4OBBVirtualCurveIntersectors
+   *
+   * The builder names "default" and "sah" are accepted and both select
+   * BVH4Curve8iBuilder_OBB_New. Any other name is reported through
+   * throw_RTCError with RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH4Factory::BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant)
+  {
+    /* Validate the builder name before allocating anything: the BVH4 below is
+     * held by a raw pointer, so reporting the error after 'new' would leak it. */
+    const auto& builderName = scene->device->hair_builder;
+    if (!(builderName == "default" || builderName == "sah")) {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve8i>");
+    }
+
+    BVH4* accel = new BVH4(Curve8i::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector8i(),ivariant);
+
+    /* "default" and "sah" map to the same builder */
+    Builder* builder = BVH4Curve8iBuilder_OBB_New(accel,scene,0);
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+#endif
+
+  /* Creates a BVH4 over Curve4v::type together with its intersector table and
+   * builder, and returns them wrapped in a new AccelInstance.
+   *
+   * scene    - passed to the BVH4 and the builder; scene->device->hair_builder
+   *            names the builder to use
+   * ivariant - forwarded to BVH4OBBVirtualCurveIntersectors
+   *
+   * The builder names "default" and "sah" are accepted and both select
+   * BVH4Curve4vBuilder_OBB_New. Any other name is reported through
+   * throw_RTCError with RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH4Factory::BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant)
+  {
+    /* Validate the builder name before allocating anything: the BVH4 below is
+     * held by a raw pointer, so reporting the error after 'new' would leak it. */
+    const auto& builderName = scene->device->hair_builder;
+    if (!(builderName == "default" || builderName == "sah")) {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4v>");
+    }
+
+    BVH4* accel = new BVH4(Curve4v::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectors(accel,VirtualCurveIntersector4v(),ivariant);
+
+    /* "default" and "sah" map to the same builder */
+    Builder* builder = BVH4Curve4vBuilder_OBB_New(accel,scene,0);
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  /* Creates a BVH4 over Curve4iMB::type together with its intersector table
+   * and builder, and returns them wrapped in a new AccelInstance.
+   *
+   * scene    - passed to the BVH4 and the builder; scene->device->hair_builder
+   *            names the builder to use
+   * ivariant - forwarded to BVH4OBBVirtualCurveIntersectorsMB
+   *
+   * The builder names "default" and "sah" are accepted and both select
+   * BVH4OBBCurve4iMBBuilder_OBB. Any other name is reported through
+   * throw_RTCError with RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH4Factory::BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant)
+  {
+    /* Validate the builder name before allocating anything: the BVH4 below is
+     * held by a raw pointer, so reporting the error after 'new' would leak it. */
+    const auto& builderName = scene->device->hair_builder;
+    if (!(builderName == "default" || builderName == "sah")) {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve4iMB>");
+    }
+
+    BVH4* accel = new BVH4(Curve4iMB::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector4iMB(),ivariant);
+
+    /* "default" and "sah" map to the same builder */
+    Builder* builder = BVH4OBBCurve4iMBBuilder_OBB(accel,scene,0);
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+#if defined(EMBREE_TARGET_SIMD8)
+  /* Creates a BVH4 over Curve8iMB::type together with its intersector table
+   * and builder, and returns them wrapped in a new AccelInstance.
+   *
+   * scene    - passed to the BVH4 and the builder; scene->device->hair_builder
+   *            names the builder to use
+   * ivariant - forwarded to BVH4OBBVirtualCurveIntersectorsMB
+   *
+   * The builder names "default" and "sah" are accepted and both select
+   * BVH4OBBCurve8iMBBuilder_OBB. Any other name is reported through
+   * throw_RTCError with RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH4Factory::BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant)
+  {
+    /* Validate the builder name before allocating anything: the BVH4 below is
+     * held by a raw pointer, so reporting the error after 'new' would leak it. */
+    const auto& builderName = scene->device->hair_builder;
+    if (!(builderName == "default" || builderName == "sah")) {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->hair_builder+" for BVH4OBB<VirtualCurve8iMB>");
+    }
+
+    BVH4* accel = new BVH4(Curve8iMB::type,scene);
+    Accel::Intersectors intersectors = BVH4OBBVirtualCurveIntersectorsMB(accel,VirtualCurveIntersector8iMB(), ivariant);
+
+    /* "default" and "sah" map to the same builder */
+    Builder* builder = BVH4OBBCurve8iMBBuilder_OBB(accel,scene,0);
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+#endif
+  
+  Accel* BVH4Factory::BVH4Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    const auto& traverser = scene->device->tri_traverser;
+    const auto& name      = scene->device->tri_builder;
+    BVH4* accel = new BVH4(Triangle4::type,scene);
+
+    /* traverser "fast" forces the FAST kernels, "default" keeps the requested variant */
+    Accel::Intersectors intersectors;
+    if (traverser == "default" || traverser == "fast")
+      intersectors = BVH4Triangle4Intersectors(accel,traverser == "fast" ? IntersectVariant::FAST : ivariant);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+traverser+" for BVH4<Triangle4>");
+
+    /* builder "default" follows the build variant, any other accepted name picks a builder directly */
+    Builder* builder = nullptr;
+    if (name == "default") {
+      if      (bvariant == BuildVariant::STATIC      ) builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0);
+      else if (bvariant == BuildVariant::DYNAMIC     ) builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,false);
+      else if (bvariant == BuildVariant::HIGH_QUALITY) builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0);
+    }
+    else if (name == "sah"             ) builder = BVH4Triangle4SceneBuilderSAH(accel,scene,0);
+    else if (name == "sah_fast_spatial") builder = BVH4Triangle4SceneBuilderFastSpatialSAH(accel,scene,0);
+    else if (name == "sah_presplit"    ) builder = BVH4Triangle4SceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY);
+    else if (name == "dynamic" || name == "morton") builder = BVH4BuilderTwoLevelTriangle4MeshSAH(accel,scene,name == "morton");
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH4<Triangle4>");
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Triangle4v::type,scene);
+
+    /* tri_traverser may force FAST or ROBUST; "default" honours ivariant. Errors name Triangle4v. */
+    Accel::Intersectors intersectors;
+    if      (scene->device->tri_traverser == "default") intersectors = BVH4Triangle4vIntersectors(accel,ivariant);
+    else if (scene->device->tri_traverser == "fast"   ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::FAST);
+    else if (scene->device->tri_traverser == "robust" ) intersectors = BVH4Triangle4vIntersectors(accel,IntersectVariant::ROBUST);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+scene->device->tri_traverser+" for BVH4<Triangle4v>");
+
+    /* tri_builder "default" follows the build variant; explicit names pick a builder directly */
+    Builder* builder = nullptr;
+    if (scene->device->tri_builder == "default") {
+      if      (bvariant == BuildVariant::STATIC      ) builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0);
+      else if (bvariant == BuildVariant::DYNAMIC     ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false);
+      else if (bvariant == BuildVariant::HIGH_QUALITY) builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0);
+    }
+    else if (scene->device->tri_builder == "sah"         ) builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,0);
+    else if (scene->device->tri_builder == "sah_fast_spatial" ) builder = BVH4Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0);
+    else if (scene->device->tri_builder == "sah_presplit") builder = BVH4Triangle4vSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY);
+    else if (scene->device->tri_builder == "dynamic"     ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false);
+    else if (scene->device->tri_builder == "morton"      ) builder = BVH4BuilderTwoLevelTriangle4vMeshSAH(accel,scene,true);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH4<Triangle4v>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Triangle4i::type,scene);
+    const auto& traverser = scene->device->tri_traverser;
+    const auto& name      = scene->device->tri_builder;
+
+    /* resolve the effective variant first: "fast"/"robust" override, "default" keeps ivariant */
+    IntersectVariant variant = ivariant;
+    if      (traverser == "fast"   ) variant = IntersectVariant::FAST;
+    else if (traverser == "robust" ) variant = IntersectVariant::ROBUST;
+    else if (traverser != "default") throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+traverser+" for BVH4<Triangle4i>");
+    Accel::Intersectors intersectors = BVH4Triangle4iIntersectors(accel,variant);
+
+    /* builder "default" follows the build variant, any other accepted name picks a builder directly */
+    Builder* builder = nullptr;
+    if (name == "default") {
+      if      (bvariant == BuildVariant::STATIC      ) builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0);
+      else if (bvariant == BuildVariant::DYNAMIC     ) builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,false);
+      else if (bvariant == BuildVariant::HIGH_QUALITY) builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0);
+    }
+    else if (name == "sah"             ) builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,0);
+    else if (name == "sah_fast_spatial") builder = BVH4Triangle4iSceneBuilderFastSpatialSAH(accel,scene,0);
+    else if (name == "sah_presplit"    ) builder = BVH4Triangle4iSceneBuilderSAH(accel,scene,MODE_HIGH_QUALITY);
+    else if (name == "dynamic" || name == "morton") builder = BVH4BuilderTwoLevelTriangle4iMeshSAH(accel,scene,name == "morton");
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH4<Triangle4i>");
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Triangle4i::type,scene);
+    const auto& traverser = scene->device->tri_traverser_mb;
+    const auto& name      = scene->device->tri_builder_mb;
+
+    /* resolve the effective variant first: "fast"/"robust" override, "default" keeps ivariant */
+    IntersectVariant variant = ivariant;
+    if      (traverser == "fast"   ) variant = IntersectVariant::FAST;
+    else if (traverser == "robust" ) variant = IntersectVariant::ROBUST;
+    else if (traverser != "default") throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+traverser+" for BVH4<Triangle4iMB>");
+    Accel::Intersectors intersectors = BVH4Triangle4iMBIntersectors(accel,variant);
+
+    /* "default" only implements STATIC; the other variants assert and leave the builder null */
+    Builder* builder = nullptr;
+    if (name == "default") {
+      if (bvariant == BuildVariant::STATIC) builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0);
+      else if (bvariant == BuildVariant::DYNAMIC || bvariant == BuildVariant::HIGH_QUALITY) assert(false); // FIXME: implement
+    }
+    else if (name == "internal_time_splits") builder = BVH4Triangle4iMBSceneBuilderSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH4<Triangle4iMB>");
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH4* accel = new BVH4(Triangle4vMB::type,scene);
+    const auto& traverser = scene->device->tri_traverser_mb;
+    const auto& name      = scene->device->tri_builder_mb;
+
+    /* resolve the effective variant first: "fast"/"robust" override, "default" keeps ivariant */
+    IntersectVariant variant = ivariant;
+    if      (traverser == "fast"   ) variant = IntersectVariant::FAST;
+    else if (traverser == "robust" ) variant = IntersectVariant::ROBUST;
+    else if (traverser != "default") throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown traverser "+traverser+" for BVH4<Triangle4vMB>");
+    Accel::Intersectors intersectors = BVH4Triangle4vMBIntersectors(accel,variant);
+
+    /* "default" only implements STATIC; the other variants assert and leave the builder null */
+    Builder* builder = nullptr;
+    if (name == "default") {
+      if (bvariant == BuildVariant::STATIC) builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0);
+      else if (bvariant == BuildVariant::DYNAMIC || bvariant == BuildVariant::HIGH_QUALITY) assert(false); // FIXME: implement
+    }
+    else if (name == "internal_time_splits") builder = BVH4Triangle4vMBSceneBuilderSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH4<Triangle4vMB>");
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    /* intersectors follow ivariant as given; the builder comes from quad_builder / bvariant */
+    BVH4* accel = new BVH4(Quad4v::type,scene);
+    Accel::Intersectors intersectors = BVH4Quad4vIntersectors(accel,ivariant);
+    const auto& name = scene->device->quad_builder;
+
+    Builder* builder = nullptr;
+    if (name == "default") {
+      if      (bvariant == BuildVariant::STATIC      ) builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0);
+      else if (bvariant == BuildVariant::DYNAMIC     ) builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false);
+      else if (bvariant == BuildVariant::HIGH_QUALITY) builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0);
+    }
+    else if (name == "sah"             ) builder = BVH4Quad4vSceneBuilderSAH(accel,scene,0);
+    else if (name == "sah_fast_spatial") builder = BVH4Quad4vSceneBuilderFastSpatialSAH(accel,scene,0);
+    else if (name == "dynamic"         ) builder = BVH4BuilderTwoLevelQuadMeshSAH(accel,scene,false);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH4<Quad4v>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    /* Only the SAH scene builder is wired up here: with builder "default" the DYNAMIC and
+     * HIGH_QUALITY variants assert (debug) and otherwise leave the builder null. */
+    BVH4* accel = new BVH4(Quad4i::type,scene);
+    Accel::Intersectors intersectors = BVH4Quad4iIntersectors(accel,ivariant);
+    const auto& name = scene->device->quad_builder;
+
+    Builder* builder = nullptr;
+    if (name == "default") {
+      if (bvariant == BuildVariant::STATIC) builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0);
+      else if (bvariant == BuildVariant::DYNAMIC || bvariant == BuildVariant::HIGH_QUALITY) assert(false); // FIXME: implement
+    }
+    else if (name == "sah") builder = BVH4Quad4iSceneBuilderSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH4<Quad4i>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    /* Only the SAH scene builder is wired up here: with builder "default" the DYNAMIC and
+     * HIGH_QUALITY variants assert (debug) and otherwise leave the builder null. */
+    BVH4* accel = new BVH4(Quad4i::type,scene);
+    Accel::Intersectors intersectors = BVH4Quad4iMBIntersectors(accel,ivariant);
+    const auto& name = scene->device->quad_builder_mb;
+
+    Builder* builder = nullptr;
+    if (name == "default") {
+      if (bvariant == BuildVariant::STATIC) builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0);
+      else if (bvariant == BuildVariant::DYNAMIC || bvariant == BuildVariant::HIGH_QUALITY) assert(false); // FIXME: implement
+    }
+    else if (name == "sah") builder = BVH4Quad4iMBSceneBuilderSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH4<Quad4iMB>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4QuantizedQuad4i(Scene* scene)
+  {
+    /* no device setting is consulted: builder and intersectors are fixed for this layout */
+    BVH4* bvh = new BVH4(Quad4i::type,scene);
+    Accel::Intersectors isects = QBVH4Quad4iIntersectors(bvh);
+    return new AccelInstance(bvh,BVH4QuantizedQuad4iSceneBuilderSAH(bvh,scene,0),isects);
+  }
+
+  Accel* BVH4Factory::BVH4QuantizedTriangle4i(Scene* scene)
+  {
+    /* no device setting is consulted: builder and intersectors are fixed for this layout */
+    BVH4* bvh = new BVH4(Triangle4i::type,scene);
+    Accel::Intersectors isects = QBVH4Triangle4iIntersectors(bvh);
+    return new AccelInstance(bvh,BVH4QuantizedTriangle4iSceneBuilderSAH(bvh,scene,0),isects);
+  }
+
+  Accel* BVH4Factory::BVH4SubdivPatch1(Scene* scene)
+  {
+    /* fixed pairing of the SubdivPatch1 intersectors with the SAH patch builder */
+    BVH4* bvh = new BVH4(SubdivPatch1::type,scene);
+    Accel::Intersectors isects = BVH4SubdivPatch1Intersectors(bvh);
+    return new AccelInstance(bvh,BVH4SubdivPatch1BuilderSAH(bvh,scene,0),isects);
+  }
+
+  Accel* BVH4Factory::BVH4SubdivPatch1MB(Scene* scene)
+  {
+    /* fixed pairing of the SubdivPatch1MB intersectors with the SAH MB patch builder */
+    BVH4* bvh = new BVH4(SubdivPatch1::type,scene);
+    Accel::Intersectors isects = BVH4SubdivPatch1MBIntersectors(bvh);
+    return new AccelInstance(bvh,BVH4SubdivPatch1MBBuilderSAH(bvh,scene,0),isects);
+  }
+
+  Accel* BVH4Factory::BVH4UserGeometry(Scene* scene, BuildVariant bvariant)
+  {
+    /* builder comes from object_builder; "default" + HIGH_QUALITY asserts and stays null */
+    BVH4* accel = new BVH4(Object::type,scene);
+    Accel::Intersectors intersectors = BVH4UserGeometryIntersectors(accel);
+    const auto& name = scene->device->object_builder;
+
+    Builder* builder = nullptr;
+    if (name == "default") {
+      if      (bvariant == BuildVariant::STATIC      ) builder = BVH4VirtualSceneBuilderSAH(accel,scene,0);
+      else if (bvariant == BuildVariant::DYNAMIC     ) builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false);
+      else if (bvariant == BuildVariant::HIGH_QUALITY) assert(false);
+    }
+    else if (name == "sah"    ) builder = BVH4VirtualSceneBuilderSAH(accel,scene,0);
+    else if (name == "dynamic") builder = BVH4BuilderTwoLevelVirtualSAH(accel,scene,false);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH4<Object>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4UserGeometryMB(Scene* scene)
+  {
+    /* fixed pairing of the user-geometry MB intersectors with the virtual MB SAH builder */
+    BVH4* bvh = new BVH4(Object::type,scene);
+    Accel::Intersectors isects = BVH4UserGeometryMBIntersectors(bvh);
+    return new AccelInstance(bvh,BVH4VirtualMBSceneBuilderSAH(bvh,scene,0),isects);
+  }
+
+  Accel* BVH4Factory::BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant)
+  {
+    /* isExpensive only chooses which Geometry::MTY_INSTANCE_* value the builders receive;
+     * builder selection is driven by object_builder (hence the <Object> tag in the error) */
+    BVH4* accel = new BVH4(InstancePrimitive::type,scene);
+    Accel::Intersectors intersectors = BVH4InstanceIntersectors(accel);
+    auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP;
+    const auto& name = scene->device->object_builder;
+
+    Builder* builder = nullptr;
+    if (name == "default") {
+      if      (bvariant == BuildVariant::STATIC      ) builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype);
+      else if (bvariant == BuildVariant::DYNAMIC     ) builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false);
+      else if (bvariant == BuildVariant::HIGH_QUALITY) assert(false);
+    }
+    else if (name == "sah"    ) builder = BVH4InstanceSceneBuilderSAH(accel,scene,gtype);
+    else if (name == "dynamic") builder = BVH4BuilderTwoLevelInstanceSAH(accel,scene,gtype,false);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH4<Object>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4InstanceMB(Scene* scene, bool isExpensive)
+  {
+    /* isExpensive picks the Geometry::MTY_INSTANCE_* value passed to the MB SAH builder */
+    BVH4* bvh = new BVH4(InstancePrimitive::type,scene);
+    Accel::Intersectors isects = BVH4InstanceMBIntersectors(bvh);
+    auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE_CHEAP;
+    return new AccelInstance(bvh,BVH4InstanceMBSceneBuilderSAH(bvh,scene,gtype),isects);
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    /* FAST maps to the Moeller kernels; every other variant (i.e. ROBUST) falls through
+     * to the Pluecker kernels. The stream entry point is the same in both cases. */
+    const bool fast = ivariant == IntersectVariant::FAST;
+
+    Accel::Intersectors isects;
+    isects.ptr = bvh;
+    if (fast) isects.intersector1 = BVH4GridIntersector1Moeller();
+    else      isects.intersector1 = BVH4GridIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+    if (fast) {
+      isects.intersector4  = BVH4GridIntersector4HybridMoeller();
+      isects.intersector8  = BVH4GridIntersector8HybridMoeller();
+      isects.intersector16 = BVH4GridIntersector16HybridMoeller();
+    }
+    else {
+      isects.intersector4  = BVH4GridIntersector4HybridPluecker();
+      isects.intersector8  = BVH4GridIntersector8HybridPluecker();
+      isects.intersector16 = BVH4GridIntersector16HybridPluecker();
+    }
+    isects.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+
+    return isects;
+  }
+
+  Accel::Intersectors BVH4Factory::BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant)
+  {
+    Accel::Intersectors isects; // ivariant is not consulted: only the Moeller kernels are assigned
+    isects.ptr = bvh;
+    isects.intersector1  = BVH4GridMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS) // packet and stream entry points are not assigned otherwise
+    isects.intersector4  = BVH4GridMBIntersector4HybridMoeller();
+    isects.intersector8  = BVH4GridMBIntersector8HybridMoeller();
+    isects.intersector16 = BVH4GridMBIntersector16HybridMoeller();
+    isects.intersectorN  = BVH4IntersectorStreamPacketFallback();
+#endif
+    return isects;
+  }
+
+  Accel* BVH4Factory::BVH4Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    /* Select on grid_builder, the setting the error message reports. Testing object_builder
+     * instead ignored a bad grid_builder and rejected a custom object_builder with a
+     * misleading message. Checked before allocating so the throw cannot leak the BVH4. */
+    const auto& name = scene->device->grid_builder;
+    if (name != "default")
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH4<GridMesh>");
+
+    BVH4* accel = new BVH4(SubGridQBVH4::type,scene);
+    Accel::Intersectors intersectors = BVH4GridIntersectors(accel,ivariant);
+    return new AccelInstance(accel,BVH4GridSceneBuilderSAH(accel,scene,0),intersectors);
+  }
+
+  Accel* BVH4Factory::BVH4GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    /* Test the setting the error reports (grid_builder, not object_builder), and do so before
+     * allocating so the throw leaks nothing. NOTE(review): confirm no MB-specific setting applies. */
+    const auto& name = scene->device->grid_builder;
+    if (name != "default")
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH4MB<GridMesh>");
+    BVH4* accel = new BVH4(SubGridQBVH4::type,scene);
+    Accel::Intersectors intersectors = BVH4GridMBIntersectors(accel,ivariant);
+    return new AccelInstance(accel,BVH4GridMBSceneBuilderSAH(accel,scene,0),intersectors);
+  }
+
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h
new file mode 100644
index 0000000000..a68227b41f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh4_factory.h
@@ -0,0 +1,316 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_factory.h"
+
+namespace embree
+{
+  /*! BVH4 instantiations */
+  class BVH4Factory : public BVHFactory
+  {
+  public:
+    BVH4Factory(int bfeatures, int ifeatures);
+
+  public:
+    Accel* BVH4OBBVirtualCurve4i(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH4OBBVirtualCurve4v(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH4OBBVirtualCurve8i(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH4OBBVirtualCurve4iMB(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH4OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4i);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8i);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4v);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector4iMB);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB);
+        
+    Accel* BVH4Triangle4   (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Triangle4v  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::ROBUST);
+    Accel* BVH4Triangle4i  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+    Accel* BVH4Quad4v  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Quad4i  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+    Accel* BVH4QuantizedTriangle4i(Scene* scene);
+    Accel* BVH4QuantizedQuad4i(Scene* scene);
+ 
+    Accel* BVH4SubdivPatch1(Scene* scene);
+    Accel* BVH4SubdivPatch1MB(Scene* scene);
+
+    Accel* BVH4UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH4UserGeometryMB(Scene* scene);
+
+    Accel* BVH4Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH4InstanceMB(Scene* scene, bool isExpensive);
+
+    Accel* BVH4Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH4GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+  private:
+    void selectBuilders(int features);
+    void selectIntersectors(int features);
+    
+  private:
+    Accel::Intersectors BVH4OBBVirtualCurveIntersectors(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+    Accel::Intersectors BVH4OBBVirtualCurveIntersectorsMB(BVH4* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+    
+    Accel::Intersectors BVH4Triangle4Intersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Triangle4vIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Triangle4iIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Triangle4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Triangle4vMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+
+    Accel::Intersectors BVH4Quad4vIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Quad4iIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4Quad4iMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+
+    Accel::Intersectors QBVH4Quad4iIntersectors(BVH4* bvh);
+    Accel::Intersectors QBVH4Triangle4iIntersectors(BVH4* bvh);
+
+    Accel::Intersectors BVH4UserGeometryIntersectors(BVH4* bvh);
+    Accel::Intersectors BVH4UserGeometryMBIntersectors(BVH4* bvh);
+
+    Accel::Intersectors BVH4InstanceIntersectors(BVH4* bvh);
+    Accel::Intersectors BVH4InstanceMBIntersectors(BVH4* bvh);
+    
+    Accel::Intersectors BVH4SubdivPatch1Intersectors(BVH4* bvh);
+    Accel::Intersectors BVH4SubdivPatch1MBIntersectors(BVH4* bvh);
+
+    Accel::Intersectors BVH4GridIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH4GridMBIntersectors(BVH4* bvh, IntersectVariant ivariant);
+    
+  private:
+
+    DEFINE_SYMBOL2(Accel::Collider,BVH4ColliderUserGeom);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersector1MB);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4OBBVirtualCurveIntersectorRobust1MB);
+    
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4Intersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4vMBIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Triangle4iMBIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4vIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4Quad4iMBIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Triangle4iIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH4Quad4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1Intersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4SubdivPatch1MBIntersector1);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4VirtualMBIntersector1);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4InstanceMBIntersector1);
+        
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH4GridIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersector4HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4OBBVirtualCurveIntersectorRobust4HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4Intersector4HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4vMBIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Triangle4iMBIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4vIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4Quad4iMBIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1Intersector4);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4SubdivPatch1MBIntersector4);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4VirtualMBIntersector4Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4InstanceMBIntersector4Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH4GridIntersector4HybridPluecker);
+
+    // ==============
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersector8HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4OBBVirtualCurveIntersectorRobust8HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4Intersector8HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4vMBIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Triangle4iMBIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4vIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4Quad4iMBIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1Intersector8);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4SubdivPatch1MBIntersector8);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4VirtualMBIntersector8Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4InstanceMBIntersector8Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH4GridIntersector8HybridPluecker);
+
+    // ==============
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersector16HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4OBBVirtualCurveIntersectorRobust16HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4Intersector16HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4vMBIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Triangle4iMBIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4vIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4Quad4iMBIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1Intersector16);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4SubdivPatch1MBIntersector16);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4VirtualMBIntersector16Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4InstanceMBIntersector16Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH4GridIntersector16HybridPluecker);
+
+    // ==============
+
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4IntersectorStreamPacketFallback);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4IntersectorStreamMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4vIntersectorStreamPluecker);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Triangle4iIntersectorStreamPluecker);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4vIntersectorStreamPluecker);
+    DEFINE_SYMBOL2(Accel::IntersectorN, BVH4Quad4iIntersectorStreamPluecker);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH4VirtualIntersectorStream);
+    
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH4InstanceIntersectorStream);
+       
+    // SAH scene builders
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Curve4iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve4iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Curve8iBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    
+    DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1BuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4SubdivPatch1MBBuilderSAH,void* COMMA Scene* COMMA size_t);
+    
+    DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH4GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+    // spatial scene builders
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Triangle4iSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    
+    // two-level scene builders
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH4BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp
new file mode 100644
index 0000000000..9fe057c392
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.cpp
@@ -0,0 +1,1165 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "../common/isa.h" // to define EMBREE_TARGET_SIMD8
+
+#if defined (EMBREE_TARGET_SIMD8)
+
+#include "bvh8_factory.h"
+#include "../bvh/bvh.h"
+
+#include "../geometry/curveNv.h"
+#include "../geometry/curveNi.h"
+#include "../geometry/curveNi_mb.h"
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/subdivpatch1.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+#include "../common/accelinstance.h"
+
+namespace embree
+{
+  DECLARE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom); // initialised in the BVH8Factory constructor below
+  /* NOTE(review): presumably per-ISA functions returning VirtualCurveIntersector* and taking no arguments -- confirm against DECLARE_ISA_FUNCTION; both are selected at the top of selectIntersectors() */
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8v,void);
+  DECLARE_ISA_FUNCTION(VirtualCurveIntersector*,VirtualCurveIntersector8iMB,void);
+  /* Accel::Intersector1 symbols; every one of them is initialised under "select intersectors1" in selectIntersectors() */
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker);
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1);
+
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller);
+  DECLARE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker);
+  /* Accel::Intersector4 symbols; initialised under "select intersectors4" in selectIntersectors(), inside its #if defined (EMBREE_RAY_PACKETS) section */
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker);
+  /* Accel::Intersector8 symbols; initialised under "select intersectors8" in selectIntersectors(), inside its #if defined (EMBREE_RAY_PACKETS) section */
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker);
+  /* Accel::Intersector16 symbols; their initialisation starts under "select intersectors16" in selectIntersectors() */
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk);
+
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller);
+  DECLARE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker);
+  /* Accel::IntersectorN symbols: the packet fallback plus the *IntersectorStream* variants */
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker);
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream);
+
+  DECLARE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream);
+  /* Builder* entry points; every one of them is initialised in BVH8Factory::selectBuilders() below */
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+
+  BVH8Factory::BVH8Factory(int bfeatures, int ifeatures)
+  { /* Sets up the factory's symbols: bfeatures is forwarded to selectBuilders() and
+     * ifeatures to selectIntersectors().
+     * NOTE(review): presumably both are CPU ISA feature bitmasks -- confirm against the caller. */
+    SELECT_SYMBOL_INIT_AVX(ifeatures,BVH8ColliderUserGeom); // the collider is chosen from the intersector feature set, not bfeatures
+    
+    selectBuilders(bfeatures);
+    selectIntersectors(ifeatures);
+  }
+
+  void BVH8Factory::selectBuilders(int features)
+  { /* Initialises every BVH8 Builder* entry point declared earlier in this file from `features`.
+     * Each statement is wrapped in an IF_ENABLED_* macro and uses either SELECT_SYMBOL_INIT_AVX
+     * or SELECT_SYMBOL_INIT_AVX_AVX512KNL.
+     * NOTE(review): presumably IF_ENABLED_* drops the statement when that geometry type is
+     * compiled out, and the SELECT_* suffix lists the ISA variants considered -- confirm
+     * against the macro definitions. */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8Curve8vBuilder_OBB_New));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX(features,BVH8OBBCurve8iMBBuilder_OBB));
+    /* triangle scene builders (SAH), including the quantized ones */
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4SceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4iSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4iMBSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vMBSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4iSceneBuilderSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedTriangle4SceneBuilderSAH));
+    /* quad scene builders (SAH), including the quantized one */
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4vSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4iSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4iMBSceneBuilderSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX(features,BVH8QuantizedQuad4iSceneBuilderSAH));
+    /* IF_ENABLED_USER: the Virtual* scene builders */
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualSceneBuilderSAH));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX(features,BVH8VirtualMBSceneBuilderSAH));
+    /* instance scene builders */
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceSceneBuilderSAH));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX(features,BVH8InstanceMBSceneBuilderSAH));
+    /* grid scene builders */
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridSceneBuilderSAH));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX(features,BVH8GridMBSceneBuilderSAH));
+    /* FastSpatialSAH scene builders */
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4SceneBuilderFastSpatialSAH));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Triangle4vSceneBuilderFastSpatialSAH));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8Quad4vSceneBuilderFastSpatialSAH));
+    /* two-level builders */
+    IF_ENABLED_TRIS  (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4MeshSAH));
+    IF_ENABLED_TRIS  (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4vMeshSAH));
+    IF_ENABLED_TRIS  (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelTriangle4iMeshSAH));
+    IF_ENABLED_QUADS (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelQuadMeshSAH));
+    IF_ENABLED_USER  (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelVirtualSAH));
+    IF_ENABLED_INSTANCE (SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,BVH8BuilderTwoLevelInstanceSAH));
+  }
+
+  void BVH8Factory::selectIntersectors(int features)
+  {
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8v));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,VirtualCurveIntersector8iMB));
+    
+    /* select intersectors1 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector1));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector1MB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust1));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust1MB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector1Pluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector1Woop));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector1Moeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector1Pluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector1Pluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector1Pluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector1Pluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Triangle4iIntersector1Pluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Triangle4Intersector1Moeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,QBVH8Quad4iIntersector1Pluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersector1));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualMBIntersector1));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersector1));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceMBIntersector1));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector1Moeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridMBIntersector1Moeller))
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector1Pluecker));
+
+#if defined (EMBREE_RAY_PACKETS)
+
+    /* select intersectors4 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector4Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector4HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust4Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust4HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector4HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vIntersector4HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector4HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector4HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector4HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector4HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector4HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector4HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector4HybridPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualIntersector4Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualMBIntersector4Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceIntersector4Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceMBIntersector4Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector4HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector4HybridPluecker));
+
+    /* select intersectors8 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector8Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersector8HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust8Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust8HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4Intersector8HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vIntersector8HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iIntersector8HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector8HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4vMBIntersector8HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Triangle4iMBIntersector8HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4vIntersector8HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8Quad4iIntersector8HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2(features,BVH8Quad4iMBIntersector8HybridPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualIntersector8Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8VirtualMBIntersector8Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceIntersector8Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8InstanceMBIntersector8Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector8HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,BVH8GridIntersector8HybridPluecker));
+
+    /* select intersectors16 */
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector16Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersector16HybridMB));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust16Hybrid));
+    IF_ENABLED_CURVES_OR_POINTS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8OBBVirtualCurveIntersectorRobust16HybridMB));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4Intersector16HybridMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersector16HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersector16HybridPluecker));
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector16HybridMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4vMBIntersector16HybridPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Triangle4iMBIntersector16HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersector16HybridPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersector16HybridPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector16HybridMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8Quad4iMBIntersector16HybridPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersector16Chunk));
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8VirtualMBIntersector16Chunk));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersector16Chunk));
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8InstanceMBIntersector16Chunk));
+
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8GridIntersector16HybridMoeller));
+    IF_ENABLED_GRIDS(SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,BVH8GridIntersector16HybridPluecker));
+
+    /* select stream intersectors */
+
+    SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8IntersectorStreamPacketFallback);
+
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4IntersectorStreamMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4IntersectorStreamMoellerNoFilter));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersectorStreamMoeller));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4vIntersectorStreamPluecker));
+    IF_ENABLED_TRIS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Triangle4iIntersectorStreamPluecker));
+
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamMoellerNoFilter));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersectorStreamMoeller));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4vIntersectorStreamPluecker));
+    IF_ENABLED_QUADS(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8Quad4iIntersectorStreamPluecker));
+
+    IF_ENABLED_USER(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8VirtualIntersectorStream));
+
+    IF_ENABLED_INSTANCE(SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,BVH8InstanceIntersectorStream));
+
+#endif
+  }
+
+  /*! Collects the BVH8OBBVirtualCurve traversal kernels for the requested
+   *  intersect variant. bvh and leafIntersector are stored in the result
+   *  unchanged; packet and stream kernels are only set when
+   *  EMBREE_RAY_PACKETS is defined. */
+  Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.leafIntersector = leafIntersector;
+      result.intersector1  = BVH8OBBVirtualCurveIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8OBBVirtualCurveIntersector4Hybrid();
+      result.intersector8  = BVH8OBBVirtualCurveIntersector8Hybrid();
+      result.intersector16 = BVH8OBBVirtualCurveIntersector16Hybrid();
+      result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return result;
+    }
+    else if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.leafIntersector = leafIntersector;
+      result.intersector1  = BVH8OBBVirtualCurveIntersectorRobust1();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8OBBVirtualCurveIntersectorRobust4Hybrid();
+      result.intersector8  = BVH8OBBVirtualCurveIntersectorRobust8Hybrid();
+      result.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16Hybrid();
+      result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return result;
+    }
+    else
+    {
+      /* any other variant is a programming error */
+      assert(false);
+    }
+    /* reached only for an unhandled variant when asserts are disabled */
+    return Accel::Intersectors();
+  }
+
+  /*! Collects the "MB" BVH8OBBVirtualCurve traversal kernels for the
+   *  requested intersect variant. bvh and leafIntersector are stored in the
+   *  result unchanged; packet and stream kernels are only set when
+   *  EMBREE_RAY_PACKETS is defined. */
+  Accel::Intersectors BVH8Factory::BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.leafIntersector = leafIntersector;
+      result.intersector1  = BVH8OBBVirtualCurveIntersector1MB();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8OBBVirtualCurveIntersector4HybridMB();
+      result.intersector8  = BVH8OBBVirtualCurveIntersector8HybridMB();
+      result.intersector16 = BVH8OBBVirtualCurveIntersector16HybridMB();
+      result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return result;
+    }
+    else if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.leafIntersector = leafIntersector;
+      result.intersector1  = BVH8OBBVirtualCurveIntersectorRobust1MB();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8OBBVirtualCurveIntersectorRobust4HybridMB();
+      result.intersector8  = BVH8OBBVirtualCurveIntersectorRobust8HybridMB();
+      result.intersector16 = BVH8OBBVirtualCurveIntersectorRobust16HybridMB();
+      result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return result;
+    }
+    else
+    {
+      /* any other variant is a programming error */
+      assert(false);
+    }
+    /* reached only for an unhandled variant when asserts are disabled */
+    return Accel::Intersectors();
+  }
+
+  /*! Collects the Moeller traversal kernels for a BVH8 of Triangle4
+   *  primitives. Packet and stream kernels come in a filter and a nofilter
+   *  flavour and are only set when EMBREE_RAY_PACKETS is defined. */
+  Accel::Intersectors BVH8Factory::BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    /* only the FAST variant is expected here */
+    assert(ivariant == IntersectVariant::FAST);
+
+    Accel::Intersectors result;
+    result.ptr = bvh;
+    result.intersector1           = BVH8Triangle4Intersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+    result.intersector4_filter    = BVH8Triangle4Intersector4HybridMoeller();
+    result.intersector4_nofilter  = BVH8Triangle4Intersector4HybridMoellerNoFilter();
+    result.intersector8_filter    = BVH8Triangle4Intersector8HybridMoeller();
+    result.intersector8_nofilter  = BVH8Triangle4Intersector8HybridMoellerNoFilter();
+    result.intersector16_filter   = BVH8Triangle4Intersector16HybridMoeller();
+    result.intersector16_nofilter = BVH8Triangle4Intersector16HybridMoellerNoFilter();
+    result.intersectorN_filter    = BVH8Triangle4IntersectorStreamMoeller();
+    result.intersectorN_nofilter  = BVH8Triangle4IntersectorStreamMoellerNoFilter();
+#endif
+    return result;
+  }
+
+  /*! Collects the traversal kernels for a BVH8 of Triangle4v primitives.
+   *  ivariant is not consulted. The single-ray kernel is the Pluecker one
+   *  unless ENABLE_WOOP_TEST is changed to a non-zero value, which switches
+   *  it to the Woop kernel. */
+  Accel::Intersectors BVH8Factory::BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+#define ENABLE_WOOP_TEST 0
+    Accel::Intersectors result;
+    result.ptr = bvh;
+#if ENABLE_WOOP_TEST != 0
+    result.intersector1    = BVH8Triangle4vIntersector1Woop();
+#else
+    //assert(ivariant == IntersectVariant::ROBUST);
+    result.intersector1    = BVH8Triangle4vIntersector1Pluecker();
+#endif
+
+#if defined (EMBREE_RAY_PACKETS)
+    result.intersector4    = BVH8Triangle4vIntersector4HybridPluecker();
+    result.intersector8    = BVH8Triangle4vIntersector8HybridPluecker();
+    result.intersector16   = BVH8Triangle4vIntersector16HybridPluecker();
+    result.intersectorN    = BVH8Triangle4vIntersectorStreamPluecker();
+#endif
+    return result;
+  }
+
+  /*! Collects the traversal kernels for a BVH8 of Triangle4i primitives:
+   *  the Moeller kernels for FAST, the Pluecker kernels for ROBUST. Any
+   *  other variant yields default-constructed intersectors. */
+  Accel::Intersectors BVH8Factory::BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.intersector1  = BVH8Triangle4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8Triangle4iIntersector4HybridMoeller();
+      result.intersector8  = BVH8Triangle4iIntersector8HybridMoeller();
+      result.intersector16 = BVH8Triangle4iIntersector16HybridMoeller();
+      result.intersectorN  = BVH8Triangle4iIntersectorStreamMoeller();
+#endif
+      return result;
+    }
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.intersector1  = BVH8Triangle4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8Triangle4iIntersector4HybridPluecker();
+      result.intersector8  = BVH8Triangle4iIntersector8HybridPluecker();
+      result.intersector16 = BVH8Triangle4iIntersector16HybridPluecker();
+      result.intersectorN  = BVH8Triangle4iIntersectorStreamPluecker();
+#endif
+      return result;
+    }
+    /* unhandled variant */
+    return Accel::Intersectors();
+  }
+
+  /*! Collects the traversal kernels for a BVH8 of Triangle4vMB primitives:
+   *  the Moeller kernels for FAST, the Pluecker kernels for ROBUST. The
+   *  stream kernel is always the packet fallback. Any other variant yields
+   *  default-constructed intersectors. */
+  Accel::Intersectors BVH8Factory::BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.intersector1  = BVH8Triangle4vMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8Triangle4vMBIntersector4HybridMoeller();
+      result.intersector8  = BVH8Triangle4vMBIntersector8HybridMoeller();
+      result.intersector16 = BVH8Triangle4vMBIntersector16HybridMoeller();
+      result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return result;
+    }
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.intersector1  = BVH8Triangle4vMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8Triangle4vMBIntersector4HybridPluecker();
+      result.intersector8  = BVH8Triangle4vMBIntersector8HybridPluecker();
+      result.intersector16 = BVH8Triangle4vMBIntersector16HybridPluecker();
+      result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return result;
+    }
+    /* unhandled variant */
+    return Accel::Intersectors();
+  }
+
+  /*! Collects the traversal kernels for a BVH8 of Triangle4iMB primitives:
+   *  the Moeller kernels for FAST, the Pluecker kernels for ROBUST. The
+   *  stream kernel is always the packet fallback. Any other variant yields
+   *  default-constructed intersectors. */
+  Accel::Intersectors BVH8Factory::BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.intersector1  = BVH8Triangle4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8Triangle4iMBIntersector4HybridMoeller();
+      result.intersector8  = BVH8Triangle4iMBIntersector8HybridMoeller();
+      result.intersector16 = BVH8Triangle4iMBIntersector16HybridMoeller();
+      result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return result;
+    }
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.intersector1  = BVH8Triangle4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8Triangle4iMBIntersector4HybridPluecker();
+      result.intersector8  = BVH8Triangle4iMBIntersector8HybridPluecker();
+      result.intersector16 = BVH8Triangle4iMBIntersector16HybridPluecker();
+      result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return result;
+    }
+    /* unhandled variant */
+    return Accel::Intersectors();
+  }
+
+  /*! Collects the traversal kernels for a BVH8 of Quad4v primitives. FAST
+   *  uses the Moeller kernels, with separate filter and nofilter packet and
+   *  stream kernels; ROBUST uses the Pluecker kernels. Any other variant
+   *  yields default-constructed intersectors. */
+  Accel::Intersectors BVH8Factory::BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.intersector1           = BVH8Quad4vIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4_filter    = BVH8Quad4vIntersector4HybridMoeller();
+      result.intersector4_nofilter  = BVH8Quad4vIntersector4HybridMoellerNoFilter();
+      result.intersector8_filter    = BVH8Quad4vIntersector8HybridMoeller();
+      result.intersector8_nofilter  = BVH8Quad4vIntersector8HybridMoellerNoFilter();
+      result.intersector16_filter   = BVH8Quad4vIntersector16HybridMoeller();
+      result.intersector16_nofilter = BVH8Quad4vIntersector16HybridMoellerNoFilter();
+      result.intersectorN_filter    = BVH8Quad4vIntersectorStreamMoeller();
+      result.intersectorN_nofilter  = BVH8Quad4vIntersectorStreamMoellerNoFilter();
+#endif
+      return result;
+    }
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.intersector1  = BVH8Quad4vIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8Quad4vIntersector4HybridPluecker();
+      result.intersector8  = BVH8Quad4vIntersector8HybridPluecker();
+      result.intersector16 = BVH8Quad4vIntersector16HybridPluecker();
+      result.intersectorN  = BVH8Quad4vIntersectorStreamPluecker();
+#endif
+      return result;
+    }
+    /* unhandled variant */
+    return Accel::Intersectors();
+  }
+
+  /*! Collects the traversal kernels for a BVH8 of Quad4i primitives: the
+   *  Moeller kernels for FAST, the Pluecker kernels for ROBUST. Any other
+   *  variant yields default-constructed intersectors. */
+  Accel::Intersectors BVH8Factory::BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.intersector1  = BVH8Quad4iIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8Quad4iIntersector4HybridMoeller();
+      result.intersector8  = BVH8Quad4iIntersector8HybridMoeller();
+      result.intersector16 = BVH8Quad4iIntersector16HybridMoeller();
+      result.intersectorN  = BVH8Quad4iIntersectorStreamMoeller();
+#endif
+      return result;
+    }
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.intersector1  = BVH8Quad4iIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8Quad4iIntersector4HybridPluecker();
+      result.intersector8  = BVH8Quad4iIntersector8HybridPluecker();
+      result.intersector16 = BVH8Quad4iIntersector16HybridPluecker();
+      result.intersectorN  = BVH8Quad4iIntersectorStreamPluecker();
+#endif
+      return result;
+    }
+    /* unhandled variant */
+    return Accel::Intersectors();
+  }
+
+  /*! Collects the traversal kernels for a BVH8 of Quad4iMB primitives: the
+   *  Moeller kernels for FAST, the Pluecker kernels for ROBUST. The stream
+   *  kernel is always the packet fallback. Any other variant yields
+   *  default-constructed intersectors. */
+  Accel::Intersectors BVH8Factory::BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    if (ivariant == IntersectVariant::FAST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.intersector1  = BVH8Quad4iMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8Quad4iMBIntersector4HybridMoeller();
+      result.intersector8  = BVH8Quad4iMBIntersector8HybridMoeller();
+      result.intersector16 = BVH8Quad4iMBIntersector16HybridMoeller();
+      result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return result;
+    }
+    if (ivariant == IntersectVariant::ROBUST)
+    {
+      Accel::Intersectors result;
+      result.ptr = bvh;
+      result.intersector1  = BVH8Quad4iMBIntersector1Pluecker();
+#if defined (EMBREE_RAY_PACKETS)
+      result.intersector4  = BVH8Quad4iMBIntersector4HybridPluecker();
+      result.intersector8  = BVH8Quad4iMBIntersector8HybridPluecker();
+      result.intersector16 = BVH8Quad4iMBIntersector16HybridPluecker();
+      result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+      return result;
+    }
+    /* unhandled variant */
+    return Accel::Intersectors();
+  }
+
+  /*! Intersectors for the QBVH8 over Triangle4i: only the single-ray
+   *  Pluecker kernel is set. */
+  Accel::Intersectors BVH8Factory::QBVH8Triangle4iIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors result;
+    result.ptr = bvh;
+    result.intersector1 = QBVH8Triangle4iIntersector1Pluecker();
+    return result;
+  }
+
+  /*! Intersectors for the QBVH8 over Triangle4: only the single-ray
+   *  Moeller kernel is set. */
+  Accel::Intersectors BVH8Factory::QBVH8Triangle4Intersectors(BVH8* bvh)
+  {
+    Accel::Intersectors result;
+    result.ptr = bvh;
+    result.intersector1 = QBVH8Triangle4Intersector1Moeller();
+    return result;
+  }
+
+  /*! Intersectors for the QBVH8 over Quad4i: only the single-ray
+   *  Pluecker kernel is set. */
+  Accel::Intersectors BVH8Factory::QBVH8Quad4iIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors result;
+    result.ptr = bvh;
+    result.intersector1 = QBVH8Quad4iIntersector1Pluecker();
+    return result;
+  }
+
+  /*! Collects the BVH8Virtual traversal kernels for user geometry. The
+   *  collider is set regardless of EMBREE_RAY_PACKETS; the packet and stream
+   *  kernels are only set when it is defined. */
+  Accel::Intersectors BVH8Factory::BVH8UserGeometryIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors result;
+    result.ptr = bvh;
+    result.intersector1  = BVH8VirtualIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    result.intersector4  = BVH8VirtualIntersector4Chunk();
+    result.intersector8  = BVH8VirtualIntersector8Chunk();
+    result.intersector16 = BVH8VirtualIntersector16Chunk();
+    result.intersectorN  = BVH8VirtualIntersectorStream();
+#endif
+    result.collider      = BVH8ColliderUserGeom();
+    return result;
+  }
+
+  /*! Collects the BVH8VirtualMB traversal kernels for user geometry. The
+   *  stream kernel is the packet fallback. */
+  Accel::Intersectors BVH8Factory::BVH8UserGeometryMBIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors result;
+    result.ptr = bvh;
+    result.intersector1  = BVH8VirtualMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    result.intersector4  = BVH8VirtualMBIntersector4Chunk();
+    result.intersector8  = BVH8VirtualMBIntersector8Chunk();
+    result.intersector16 = BVH8VirtualMBIntersector16Chunk();
+    result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+    return result;
+  }
+
+  /*! Collects the BVH8Instance traversal kernels, including the dedicated
+   *  instance stream kernel. */
+  Accel::Intersectors BVH8Factory::BVH8InstanceIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors result;
+    result.ptr = bvh;
+    result.intersector1  = BVH8InstanceIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    result.intersector4  = BVH8InstanceIntersector4Chunk();
+    result.intersector8  = BVH8InstanceIntersector8Chunk();
+    result.intersector16 = BVH8InstanceIntersector16Chunk();
+    result.intersectorN  = BVH8InstanceIntersectorStream();
+#endif
+    return result;
+  }
+
+  /*! Collects the BVH8InstanceMB traversal kernels. The stream kernel is
+   *  the packet fallback. */
+  Accel::Intersectors BVH8Factory::BVH8InstanceMBIntersectors(BVH8* bvh)
+  {
+    Accel::Intersectors result;
+    result.ptr = bvh;
+    result.intersector1  = BVH8InstanceMBIntersector1();
+#if defined (EMBREE_RAY_PACKETS)
+    result.intersector4  = BVH8InstanceMBIntersector4Chunk();
+    result.intersector8  = BVH8InstanceMBIntersector8Chunk();
+    result.intersector16 = BVH8InstanceMBIntersector16Chunk();
+    result.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+    return result;
+  }
+
+  /*! Creates a BVH8 of Curve8v primitives and wraps it, its OBB builder and
+   *  the curve intersectors selected by ivariant in a new AccelInstance. */
+  Accel* BVH8Factory::BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Curve8v::type,scene);
+    Accel::Intersectors kernels = BVH8OBBVirtualCurveIntersectors(bvh,VirtualCurveIntersector8v(),ivariant);
+    Builder* bld = BVH8Curve8vBuilder_OBB_New(bvh,scene,0);
+    return new AccelInstance(bvh,bld,kernels);
+  }
+
+  /*! Creates a BVH8 of Curve8iMB primitives and wraps it, its OBB builder
+   *  and the MB curve intersectors selected by ivariant in a new
+   *  AccelInstance. */
+  Accel* BVH8Factory::BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Curve8iMB::type,scene);
+    Accel::Intersectors kernels = BVH8OBBVirtualCurveIntersectorsMB(bvh,VirtualCurveIntersector8iMB(),ivariant);
+    Builder* bld = BVH8OBBCurve8iMBBuilder_OBB(bvh,scene,0);
+    return new AccelInstance(bvh,bld,kernels);
+  }
+
+  /*! Creates a BVH8 of Triangle4 primitives. The builder is chosen from the
+   *  device's tri_builder setting; with "default" it follows bvariant. An
+   *  unrecognised builder name raises RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH8Factory::BVH8Triangle4(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Triangle4::type,scene);
+    Accel::Intersectors kernels = BVH8Triangle4Intersectors(bvh,ivariant);
+
+    /* builder name configured on the device */
+    const auto& name = scene->device->tri_builder;
+
+    Builder* bld = nullptr;
+    if (name == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : bld = BVH8Triangle4SceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : bld = BVH8BuilderTwoLevelTriangle4MeshSAH(bvh,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: bld = BVH8Triangle4SceneBuilderFastSpatialSAH(bvh,scene,0); break;
+      }
+    }
+    else if (name == "sah"             ) bld = BVH8Triangle4SceneBuilderSAH(bvh,scene,0);
+    else if (name == "sah_fast_spatial") bld = BVH8Triangle4SceneBuilderFastSpatialSAH(bvh,scene,0);
+    else if (name == "sah_presplit"    ) bld = BVH8Triangle4SceneBuilderSAH(bvh,scene,MODE_HIGH_QUALITY);
+    else if (name == "dynamic"         ) bld = BVH8BuilderTwoLevelTriangle4MeshSAH(bvh,scene,false);
+    else if (name == "morton"          ) bld = BVH8BuilderTwoLevelTriangle4MeshSAH(bvh,scene,true);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH8<Triangle4>");
+
+    return new AccelInstance(bvh,bld,kernels);
+  }
+
+  /*! Creates a BVH8 of Triangle4v primitives.
+   *
+   *  With the "default" tri_builder the builder follows bvariant. The
+   *  "sah_fast_spatial" setting forces the fast spatial SAH builder. Any
+   *  other builder name raises RTC_ERROR_INVALID_ARGUMENT.
+   *
+   *  \param scene    scene the BVH is built over; its device supplies tri_builder
+   *  \param bvariant build variant, used only when tri_builder is "default"
+   *  \param ivariant forwarded to BVH8Triangle4vIntersectors
+   *  \return a newly allocated AccelInstance wrapping BVH, builder and intersectors */
+  Accel* BVH8Factory::BVH8Triangle4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Triangle4v::type,scene);
+    Accel::Intersectors intersectors= BVH8Triangle4vIntersectors(accel,ivariant);
+    Builder* builder = nullptr;
+    if (scene->device->tri_builder == "default")  {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8Triangle4vSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : builder = BVH8BuilderTwoLevelTriangle4vMeshSAH(accel,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: builder = BVH8Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0); break;
+      }
+    }
+    // The BVH is typed Triangle4v and traversed with Triangle4v kernels, so the
+    // builder must be the Triangle4v one as well (same as the HIGH_QUALITY case).
+    else if (scene->device->tri_builder == "sah_fast_spatial")  builder = BVH8Triangle4vSceneBuilderFastSpatialSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->tri_builder+" for BVH8<Triangle4v>");
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  /*! Creates a BVH8 of Triangle4i primitives. Only the "default"
+   *  tri_builder is accepted; any other name raises
+   *  RTC_ERROR_INVALID_ARGUMENT. HIGH_QUALITY is not implemented and leaves
+   *  the builder null when asserts are disabled. */
+  Accel* BVH8Factory::BVH8Triangle4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Triangle4i::type,scene);
+    Accel::Intersectors kernels = BVH8Triangle4iIntersectors(bvh,ivariant);
+
+    /* builder name configured on the device */
+    const auto& name = scene->device->tri_builder;
+
+    Builder* bld = nullptr;
+    if (name == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : bld = BVH8Triangle4iSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : bld = BVH8BuilderTwoLevelTriangle4iMeshSAH(bvh,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement
+      }
+    }
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH8<Triangle4i>");
+
+    return new AccelInstance(bvh,bld,kernels);
+  }
+
+  /*! Creates a BVH8 for Triangle4iMB, constructed with Triangle4i::type.
+   *  The builder is chosen from the device's tri_builder_mb setting; both
+   *  "default" (STATIC only) and "internal_time_splits" use the
+   *  Triangle4iMB SAH scene builder. Any other name raises
+   *  RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH8Factory::BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Triangle4i::type,scene);
+    Accel::Intersectors kernels = BVH8Triangle4iMBIntersectors(bvh,ivariant);
+
+    /* builder name configured on the device */
+    const auto& name = scene->device->tri_builder_mb;
+
+    Builder* bld = nullptr;
+    if (name == "default") { // FIXME: implement
+      switch (bvariant) {
+      case BuildVariant::STATIC      : bld = BVH8Triangle4iMBSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (name == "internal_time_splits") bld = BVH8Triangle4iMBSceneBuilderSAH(bvh,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH8<Triangle4iMB>");
+
+    return new AccelInstance(bvh,bld,kernels);
+  }
+
+  /*! Creates a BVH8 of Triangle4vMB primitives. The builder is chosen from
+   *  the device's tri_builder_mb setting; both "default" (STATIC only) and
+   *  "internal_time_splits" use the Triangle4vMB SAH scene builder. Any
+   *  other name raises RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH8Factory::BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Triangle4vMB::type,scene);
+    Accel::Intersectors kernels = BVH8Triangle4vMBIntersectors(bvh,ivariant);
+
+    /* builder name configured on the device */
+    const auto& name = scene->device->tri_builder_mb;
+
+    Builder* bld = nullptr;
+    if (name == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : bld = BVH8Triangle4vMBSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    else if (name == "internal_time_splits") bld = BVH8Triangle4vMBSceneBuilderSAH(bvh,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH8<Triangle4vMB>");
+
+    return new AccelInstance(bvh,bld,kernels);
+  }
+
+  /*! Creates a BVH8 of Triangle4i primitives paired with the quantized
+   *  Triangle4i SAH scene builder and the QBVH8 Triangle4i intersectors. */
+  Accel* BVH8Factory::BVH8QuantizedTriangle4i(Scene* scene)
+  {
+    BVH8* bvh = new BVH8(Triangle4i::type,scene);
+    Accel::Intersectors kernels = QBVH8Triangle4iIntersectors(bvh);
+    Builder* bld = BVH8QuantizedTriangle4iSceneBuilderSAH(bvh,scene,0);
+    return new AccelInstance(bvh,bld,kernels);
+  }
+
+  /*! Creates a BVH8 of Triangle4 primitives paired with the quantized
+   *  Triangle4 SAH scene builder and the QBVH8 Triangle4 intersectors. */
+  Accel* BVH8Factory::BVH8QuantizedTriangle4(Scene* scene)
+  {
+    BVH8* bvh = new BVH8(Triangle4::type,scene);
+    Accel::Intersectors kernels = QBVH8Triangle4Intersectors(bvh);
+    Builder* bld = BVH8QuantizedTriangle4SceneBuilderSAH(bvh,scene,0);
+    return new AccelInstance(bvh,bld,kernels);
+  }
+
+  /*! Creates a BVH8 of Quad4v primitives. The builder is chosen from the
+   *  device's quad_builder setting; with "default" it follows bvariant. An
+   *  unrecognised builder name raises RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH8Factory::BVH8Quad4v(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Quad4v::type,scene);
+    Accel::Intersectors kernels = BVH8Quad4vIntersectors(bvh,ivariant);
+
+    /* builder name configured on the device */
+    const auto& name = scene->device->quad_builder;
+
+    Builder* bld = nullptr;
+    if (name == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : bld = BVH8Quad4vSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : bld = BVH8BuilderTwoLevelQuadMeshSAH(bvh,scene,false); break;
+      case BuildVariant::HIGH_QUALITY: bld = BVH8Quad4vSceneBuilderFastSpatialSAH(bvh,scene,0); break;
+      }
+    }
+    else if (name == "dynamic"         ) bld = BVH8BuilderTwoLevelQuadMeshSAH(bvh,scene,false);
+    else if (name == "morton"          ) bld = BVH8BuilderTwoLevelQuadMeshSAH(bvh,scene,true);
+    else if (name == "sah_fast_spatial") bld = BVH8Quad4vSceneBuilderFastSpatialSAH(bvh,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH8<Quad4v>");
+
+    return new AccelInstance(bvh,bld,kernels);
+  }
+
+  /*! Creates a BVH8 of Quad4i primitives. Only the "default" quad_builder
+   *  is accepted, and only the STATIC build variant is implemented; any
+   *  other builder name raises RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH8Factory::BVH8Quad4i(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* bvh = new BVH8(Quad4i::type,scene);
+    Accel::Intersectors kernels = BVH8Quad4iIntersectors(bvh,ivariant);
+
+    /* builder name configured on the device */
+    const auto& name = scene->device->quad_builder;
+
+    Builder* bld = nullptr;
+    if (name == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : bld = BVH8Quad4iSceneBuilderSAH(bvh,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break; // FIXME: implement
+      }
+    }
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for BVH8<Quad4i>");
+
+    return new AccelInstance(bvh,bld,kernels);
+  }
+
+  /*! Creates a BVH8 for Quad4iMB, constructed with Quad4i::type.
+   *
+   *  Only the "default" quad_builder_mb is accepted, and only the STATIC
+   *  build variant is implemented. Any other builder name raises
+   *  RTC_ERROR_INVALID_ARGUMENT.
+   *
+   *  \param scene    scene the BVH is built over; its device supplies quad_builder_mb
+   *  \param bvariant build variant, used only when quad_builder_mb is "default"
+   *  \param ivariant forwarded to BVH8Quad4iMBIntersectors
+   *  \return a newly allocated AccelInstance wrapping BVH, builder and intersectors */
+  Accel* BVH8Factory::BVH8Quad4iMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(Quad4i::type,scene);
+    Accel::Intersectors intersectors = BVH8Quad4iMBIntersectors(accel,ivariant);
+
+    Builder* builder = nullptr;
+    if (scene->device->quad_builder_mb == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8Quad4iMBSceneBuilderSAH(accel,scene,0); break;
+      case BuildVariant::DYNAMIC     : assert(false); break; // FIXME: implement
+      case BuildVariant::HIGH_QUALITY: assert(false); break;
+      }
+    }
+    // name the MB factory in the message so it is distinguishable from BVH8Quad4i's error
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->quad_builder_mb+" for BVH8<Quad4iMB>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  /*! Creates a BVH8 of Quad4i primitives paired with the quantized Quad4i
+   *  SAH scene builder and the QBVH8 Quad4i intersectors. Only the
+   *  "default" quad_builder is accepted; any other name raises
+   *  RTC_ERROR_INVALID_ARGUMENT. */
+  Accel* BVH8Factory::BVH8QuantizedQuad4i(Scene* scene)
+  {
+    BVH8* bvh = new BVH8(Quad4i::type,scene);
+    Accel::Intersectors kernels = QBVH8Quad4iIntersectors(bvh);
+
+    /* builder name configured on the device */
+    const auto& name = scene->device->quad_builder;
+
+    Builder* bld = nullptr;
+    if (name == "default") bld = BVH8QuantizedQuad4iSceneBuilderSAH(bvh,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+name+" for QBVH8<Quad4i>");
+    return new AccelInstance(bvh,bld,kernels);
+  }
+
+  /*! Creates the BVH8 over Object primitives. The device's object_builder option
+   *  picks the builder: "default" (then by \p bvariant), "sah" or "dynamic". */
+  Accel* BVH8Factory::BVH8UserGeometry(Scene* scene, BuildVariant bvariant)
+  {
+    BVH8* bvh = new BVH8(Object::type,scene);
+    Accel::Intersectors isects = BVH8UserGeometryIntersectors(bvh);
+    Builder* sceneBuilder = nullptr;
+    const auto& builderName = scene->device->object_builder;
+
+    if (builderName == "default") {
+      if      (bvariant == BuildVariant::STATIC      ) sceneBuilder = BVH8VirtualSceneBuilderSAH(bvh,scene,0);
+      else if (bvariant == BuildVariant::DYNAMIC     ) sceneBuilder = BVH8BuilderTwoLevelVirtualSAH(bvh,scene,false);
+      else if (bvariant == BuildVariant::HIGH_QUALITY) assert(false);
+    }
+    else if (builderName == "sah"    ) sceneBuilder = BVH8VirtualSceneBuilderSAH(bvh,scene,0);
+    else if (builderName == "dynamic") sceneBuilder = BVH8BuilderTwoLevelVirtualSAH(bvh,scene,false);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+builderName+" for BVH8<Object>");
+    return new AccelInstance(bvh,sceneBuilder,isects);
+  }
+
+  Accel* BVH8Factory::BVH8UserGeometryMB(Scene* scene)
+  { /* BVH8 over Object primitives, always built with BVH8VirtualMBSceneBuilderSAH */
+    BVH8* bvh = new BVH8(Object::type,scene);
+    Accel::Intersectors isects = BVH8UserGeometryMBIntersectors(bvh);
+    Builder* sceneBuilder = BVH8VirtualMBSceneBuilderSAH(bvh,scene,0);
+    return new AccelInstance(bvh,sceneBuilder,isects);
+  }
+
+  /*! Creates the BVH8 over instance primitives. \p isExpensive selects the MTY_INSTANCE_EXPENSIVE mask instead
+   *  of MTY_INSTANCE; the builder follows the object_builder option ("default" by \p bvariant, "sah", "dynamic"). */
+  Accel* BVH8Factory::BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant)
+  {
+    BVH8* accel = new BVH8(InstancePrimitive::type,scene);
+    Accel::Intersectors intersectors = BVH8InstanceIntersectors(accel);
+    auto gtype = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE;
+
+    Builder* builder = nullptr;
+    if (scene->device->object_builder == "default") {
+      switch (bvariant) {
+      case BuildVariant::STATIC      : builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype); break;
+      case BuildVariant::DYNAMIC     : builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false); break;
+      case BuildVariant::HIGH_QUALITY: assert(false); break; // no builder is selected for this variant
+      }
+    }
+    else if (scene->device->object_builder == "sah") builder = BVH8InstanceSceneBuilderSAH(accel,scene,gtype);
+    else if (scene->device->object_builder == "dynamic") builder = BVH8BuilderTwoLevelInstanceSAH(accel,scene,gtype,false);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->object_builder+" for BVH8<Object>");
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  Accel* BVH8Factory::BVH8InstanceMB(Scene* scene, bool isExpensive)
+  { /* BVH8 over instance primitives, always built with BVH8InstanceMBSceneBuilderSAH */
+    BVH8* bvh = new BVH8(InstancePrimitive::type,scene);
+    Accel::Intersectors isects = BVH8InstanceMBIntersectors(bvh);
+    const auto mask = isExpensive ? Geometry::MTY_INSTANCE_EXPENSIVE : Geometry::MTY_INSTANCE;
+    Builder* sceneBuilder = BVH8InstanceMBSceneBuilderSAH(bvh,scene,mask);
+    return new AccelInstance(bvh,sceneBuilder,isects);
+  }
+
+  /*! Fills the intersector table for a BVH8 over grid primitives.
+   *
+   *  IntersectVariant::FAST selects the Moeller intersectors; every other
+   *  value (i.e. ROBUST) selects the Pluecker ones. The packet entries are
+   *  only assigned when EMBREE_RAY_PACKETS is defined, and the intersectorN
+   *  entry is the stream packet fallback for both variants. */
+  Accel::Intersectors BVH8Factory::BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  {
+    const bool fast = (ivariant == IntersectVariant::FAST);
+
+    Accel::Intersectors table;
+    table.ptr = bvh;
+
+    /* intersector1 is assigned regardless of EMBREE_RAY_PACKETS */
+    table.intersector1  = fast ? BVH8GridIntersector1Moeller() : BVH8GridIntersector1Pluecker();
+
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersector4  = fast ? BVH8GridIntersector4HybridMoeller()  : BVH8GridIntersector4HybridPluecker();
+    table.intersector8  = fast ? BVH8GridIntersector8HybridMoeller()  : BVH8GridIntersector8HybridPluecker();
+    table.intersector16 = fast ? BVH8GridIntersector16HybridMoeller() : BVH8GridIntersector16HybridPluecker();
+    /* identical for FAST and ROBUST */
+    table.intersectorN  = BVH8IntersectorStreamPacketFallback();
+#endif
+
+    return table;
+  }
+
+  Accel::Intersectors BVH8Factory::BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant)
+  { /* only intersector1 is provided for the grid MB BVH; ivariant is not consulted */
+    Accel::Intersectors table;
+    table.ptr = bvh;
+    table.intersector1  = BVH8GridMBIntersector1Moeller();
+#if defined (EMBREE_RAY_PACKETS)
+    table.intersectorN  = nullptr;
+    table.intersector16 = nullptr;
+    table.intersector8  = nullptr;
+    table.intersector4  = nullptr;
+#endif
+    return table;
+  }
+
+  /*! Creates the BVH8 over SubGridQBVH8 primitives. Only the "default" grid builder is
+   *  accepted; \p bvariant is not consulted. The error names the option actually tested. */
+  Accel* BVH8Factory::BVH8Grid(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(SubGridQBVH8::type,scene);
+    Accel::Intersectors intersectors = BVH8GridIntersectors(accel,ivariant);
+    Builder* builder = nullptr;
+    if (scene->device->grid_builder == "default") builder = BVH8GridSceneBuilderSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder+" for BVH8<GridMesh>");
+
+    return new AccelInstance(accel,builder,intersectors);
+  }
+
+  /*! Creates the BVH8 over SubGridQBVH8 primitives with the grid MB intersectors/builder. Only the "default"
+   *  grid_builder_mb is accepted; \p bvariant is not consulted. The error names the option actually tested. */
+  Accel* BVH8Factory::BVH8GridMB(Scene* scene, BuildVariant bvariant, IntersectVariant ivariant)
+  {
+    BVH8* accel = new BVH8(SubGridQBVH8::type,scene);
+    Accel::Intersectors intersectors = BVH8GridMBIntersectors(accel,ivariant);
+    Builder* builder = nullptr;
+    if (scene->device->grid_builder_mb == "default") builder = BVH8GridMBSceneBuilderSAH(accel,scene,0);
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown builder "+scene->device->grid_builder_mb+" for BVH8MB<GridMesh>");
+    return new AccelInstance(accel,builder,intersectors);
+  }
+}
+
+#endif
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h
new file mode 100644
index 0000000000..b92188e7d3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh8_factory.h
@@ -0,0 +1,280 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_factory.h"
+
+namespace embree
+{
+  /*! Factory for BVH8 acceleration structures: Accel* creators per primitive layout, plus the intersector and builder symbols they are assembled from. */
+  class BVH8Factory : public BVHFactory
+  {
+  public:
+    BVH8Factory(int bfeatures, int ifeatures);
+
+  public: // creators; each returns an Accel for the given scene
+    Accel* BVH8OBBVirtualCurve8v(Scene* scene, IntersectVariant ivariant);
+    Accel* BVH8OBBVirtualCurve8iMB(Scene* scene, IntersectVariant ivariant);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8v);
+    DEFINE_SYMBOL2(VirtualCurveIntersector*,VirtualCurveIntersector8iMB);
+    
+    Accel* BVH8Triangle4   (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Triangle4v  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Triangle4i  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Triangle4vMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Triangle4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+    Accel* BVH8Quad4v  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Quad4i  (Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8Quad4iMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+    Accel* BVH8QuantizedTriangle4i(Scene* scene);
+    Accel* BVH8QuantizedTriangle4(Scene* scene);
+    Accel* BVH8QuantizedQuad4i(Scene* scene);
+
+    Accel* BVH8UserGeometry(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH8UserGeometryMB(Scene* scene);
+
+    Accel* BVH8Instance(Scene* scene, bool isExpensive, BuildVariant bvariant = BuildVariant::STATIC);
+    Accel* BVH8InstanceMB(Scene* scene, bool isExpensive);
+
+    Accel* BVH8Grid(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+    Accel* BVH8GridMB(Scene* scene, BuildVariant bvariant = BuildVariant::STATIC, IntersectVariant ivariant = IntersectVariant::FAST);
+
+  private:
+    void selectBuilders(int features);
+    void selectIntersectors(int features);
+
+  private: // helpers that assemble the Accel::Intersectors table for each layout
+    Accel::Intersectors BVH8OBBVirtualCurveIntersectors(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+    Accel::Intersectors BVH8OBBVirtualCurveIntersectorsMB(BVH8* bvh, VirtualCurveIntersector* leafIntersector, IntersectVariant ivariant);
+    
+    Accel::Intersectors BVH8Triangle4Intersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Triangle4vIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Triangle4iIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Triangle4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Triangle4vMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+
+    Accel::Intersectors BVH8Quad4vIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Quad4iIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8Quad4iMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+
+    Accel::Intersectors QBVH8Triangle4iIntersectors(BVH8* bvh);
+    Accel::Intersectors QBVH8Triangle4Intersectors(BVH8* bvh);
+    Accel::Intersectors QBVH8Quad4iIntersectors(BVH8* bvh);
+
+    Accel::Intersectors BVH8UserGeometryIntersectors(BVH8* bvh);
+    Accel::Intersectors BVH8UserGeometryMBIntersectors(BVH8* bvh);
+
+    Accel::Intersectors BVH8InstanceIntersectors(BVH8* bvh);
+    Accel::Intersectors BVH8InstanceMBIntersectors(BVH8* bvh);
+
+    Accel::Intersectors BVH8GridIntersectors(BVH8* bvh, IntersectVariant ivariant);
+    Accel::Intersectors BVH8GridMBIntersectors(BVH8* bvh, IntersectVariant ivariant);
+
+  private: // intersector symbols, grouped by interface type (Intersector1/4/8/16/N)
+    DEFINE_SYMBOL2(Accel::Collider,BVH8ColliderUserGeom);
+    // Accel::Intersector1 symbols
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersector1MB);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8OBBVirtualCurveIntersectorRobust1MB);
+    
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4Intersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vMBIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4iMBIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Triangle4vIntersector1Woop);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4vIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8Quad4iMBIntersector1Pluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4iIntersector1Pluecker);
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Triangle4Intersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,QBVH8Quad4iIntersector1Pluecker);
+    
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8VirtualMBIntersector1);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceIntersector1);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8InstanceMBIntersector1);
+
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridMBIntersector1Moeller);
+    DEFINE_SYMBOL2(Accel::Intersector1,BVH8GridIntersector1Pluecker);
+    // Accel::Intersector4 symbols
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersector4HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8OBBVirtualCurveIntersectorRobust4HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4Intersector4HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4vMBIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Triangle4iMBIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4vIntersector4HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8Quad4iMBIntersector4HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8VirtualMBIntersector4Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceIntersector4Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8InstanceMBIntersector4Chunk);
+    
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector4,BVH8GridIntersector4HybridPluecker);
+    // Accel::Intersector8 symbols
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersector8HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8OBBVirtualCurveIntersectorRobust8HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4Intersector8HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4vMBIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Triangle4iMBIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4vIntersector8HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8Quad4iMBIntersector8HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8VirtualMBIntersector8Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceIntersector8Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8InstanceMBIntersector8Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector8,BVH8GridIntersector8HybridPluecker);
+    // Accel::Intersector16 symbols
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersector16HybridMB);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16Hybrid);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8OBBVirtualCurveIntersectorRobust16HybridMB);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4Intersector16HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4vMBIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Triangle4iMBIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4vIntersector16HybridPluecker);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8Quad4iMBIntersector16HybridPluecker);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8VirtualMBIntersector16Chunk);
+
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceIntersector16Chunk);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8InstanceMBIntersector16Chunk);
+   
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridMoeller);
+    DEFINE_SYMBOL2(Accel::Intersector16,BVH8GridIntersector16HybridPluecker);
+    // Accel::IntersectorN symbols
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8IntersectorStreamPacketFallback);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4IntersectorStreamMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4vIntersectorStreamPluecker);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Triangle4iIntersectorStreamPluecker);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamMoellerNoFilter);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamMoeller);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4vIntersectorStreamPluecker);
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8Quad4iIntersectorStreamPluecker);
+
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8VirtualIntersectorStream);
+    
+    DEFINE_SYMBOL2(Accel::IntersectorN,BVH8InstanceIntersectorStream);
+
+    // SAH scene builders (DEFINE_ISA_FUNCTION arguments: return type, name, COMMA-separated parameter types)
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Curve8vBuilder_OBB_New,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8OBBCurve8iMBBuilder_OBB,void* COMMA Scene* COMMA size_t);
+ 
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedTriangle4SceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+ 
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4iMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8QuantizedQuad4iSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    
+    DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8VirtualMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8InstanceMBSceneBuilderSAH,void* COMMA Scene* COMMA Geometry::GTypeMask);
+
+    DEFINE_ISA_FUNCTION(Builder*,BVH8GridSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8GridMBSceneBuilderSAH,void* COMMA Scene* COMMA size_t);
+
+    // SAH spatial scene builders
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4SceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Triangle4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8Quad4vSceneBuilderFastSpatialSAH,void* COMMA Scene* COMMA size_t);
+
+    // twolevel scene builders
+  private:
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4MeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4vMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelTriangle4iMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelQuadMeshSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelVirtualSAH,void* COMMA Scene* COMMA bool);
+    DEFINE_ISA_FUNCTION(Builder*,BVH8BuilderTwoLevelInstanceSAH,void* COMMA Scene* COMMA Geometry::GTypeMask COMMA bool);
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp
new file mode 100644
index 0000000000..e832537ec5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.cpp
@@ -0,0 +1,60 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_builder.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Binned-SAH build with branching factor N and AABB nodes; leaves come from the virtual createLeaf(). */
+    template<int N>
+    typename BVHN<N>::NodeRef BVHNBuilderVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings)
+    {
+      settings.branchingFactor = N;
+      settings.maxDepth = BVH::maxBuildDepthLeaf;
+      auto leafHook = [this] (const PrimRef* leafPrims, const range<size_t>& leafRange, const Allocator& leafAlloc) -> NodeRef {
+        return createLeaf(leafPrims,leafRange,leafAlloc);
+      };
+      return BVHBuilderBinnedSAH::build<NodeRef>
+        (FastAllocator::Create(allocator),typename BVH::AABBNode::Create2(),typename BVH::AABBNode::Set3(allocator,prims),leafHook,progressFunc,prims,pinfo,settings);
+    }
+
+
+    /*! Binned-SAH build with branching factor N and quantized nodes; leaves come from the virtual createLeaf(). */
+    template<int N>
+    typename BVHN<N>::NodeRef BVHNBuilderQuantizedVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings)
+    {
+      settings.branchingFactor = N;
+      settings.maxDepth = BVH::maxBuildDepthLeaf;
+      auto leafHook = [this] (const PrimRef* leafPrims, const range<size_t>& leafRange, const Allocator& leafAlloc) -> NodeRef {
+        return createLeaf(leafPrims,leafRange,leafAlloc);
+      };
+      return BVHBuilderBinnedSAH::build<NodeRef>
+        (FastAllocator::Create(allocator),typename BVH::QuantizedNode::Create2(),typename BVH::QuantizedNode::Set2(),leafHook,progressFunc,prims,pinfo,settings);
+    }
+
+    /*! Binned-SAH build with branching factor N and AABBNodeMB nodes stamped with \p timeRange; leaves come from createLeaf(). */
+    template<int N>
+    typename BVHN<N>::NodeRecordMB BVHNBuilderMblurVirtual<N>::BVHNBuilderV::build(FastAllocator* allocator, BuildProgressMonitor& progressFunc, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange)
+    {
+      settings.branchingFactor = N;
+      settings.maxDepth = BVH::maxBuildDepthLeaf;
+      auto leafHook = [this] (const PrimRef* leafPrims, const range<size_t>& leafRange, const Allocator& leafAlloc) -> NodeRecordMB {
+        return createLeaf(leafPrims,leafRange,leafAlloc);
+      };
+      return BVHBuilderBinnedSAH::build<NodeRecordMB>
+        (FastAllocator::Create(allocator),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::SetTimeRange(timeRange),leafHook,progressFunc,prims,pinfo,settings);
+    }
+
+    template struct BVHNBuilderVirtual<4>; // explicit instantiations for N = 4, compiled unconditionally
+    template struct BVHNBuilderQuantizedVirtual<4>;
+    template struct BVHNBuilderMblurVirtual<4>;    
+    // the N = 8 instantiations below are only compiled when __AVX__ is defined
+#if defined(__AVX__)
+    template struct BVHNBuilderVirtual<8>;
+    template struct BVHNBuilderQuantizedVirtual<8>;
+    template struct BVHNBuilderMblurVirtual<8>;
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h
new file mode 100644
index 0000000000..1b86bb45ad
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder.h
@@ -0,0 +1,114 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "../builders/bvh_builder_sah.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Binned-SAH builder front end with a virtual leaf hook. BVHNBuilderV::build is
+     *  defined and explicitly instantiated in bvh_builder.cpp; BVHNBuilderT adapts an
+     *  arbitrary CreateLeafFunc to the createLeaf() hook that build() calls.
+     *  NOTE(review): this header shows no include guard / #pragma once - confirm it is
+     *  never included twice in one translation unit. */
+    template<int N>
+      struct BVHNBuilderVirtual
+      {
+        typedef BVHN<N> BVH;
+        typedef typename BVH::NodeRef NodeRef;
+        typedef FastAllocator::CachedAllocator Allocator;
+        /* part that does not depend on the callback type: build entry plus the leaf hook */
+        struct BVHNBuilderV {
+          NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings);
+          virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0;
+        };
+
+        template<typename CreateLeafFunc>
+        struct BVHNBuilderT : public BVHNBuilderV
+        {
+          BVHNBuilderT (CreateLeafFunc createLeafFunc)
+            : createLeafFunc(createLeafFunc) {}
+
+          NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) override {
+            return createLeafFunc(prims,set,alloc);
+          }
+
+        private:
+          CreateLeafFunc createLeafFunc;
+        };
+        /* wraps \p createLeaf in a temporary BVHNBuilderT and runs its build() */
+        template<typename CreateLeafFunc>
+        static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) {
+          return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings);
+        }
+      };
+
+    /*! Builder front end whose build() (bvh_builder.cpp) creates BVH::QuantizedNode nodes; leaves go through the virtual createLeaf() hook. */
+    template<int N>
+      struct BVHNBuilderQuantizedVirtual
+      {
+        typedef BVHN<N> BVH;
+        typedef typename BVH::NodeRef NodeRef;
+        typedef FastAllocator::CachedAllocator Allocator;
+        /* part that does not depend on the callback type: build entry plus the leaf hook */
+        struct BVHNBuilderV {
+          NodeRef build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings);
+          virtual NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0;
+        };
+
+        template<typename CreateLeafFunc>
+        struct BVHNBuilderT : public BVHNBuilderV
+        {
+          BVHNBuilderT (CreateLeafFunc createLeafFunc) : createLeafFunc(createLeafFunc) {}
+
+          NodeRef createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) override {
+            return createLeafFunc(prims,set,alloc);
+          }
+
+        private:
+          CreateLeafFunc createLeafFunc;
+        };
+        /* wraps \p createLeaf in a temporary BVHNBuilderT and runs its build() */
+        template<typename CreateLeafFunc>
+        static NodeRef build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings) {
+          return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings);
+        }
+      };
+
+    /*! Builder front end whose build() (bvh_builder.cpp) takes a time range and returns a NodeRecordMB; leaves go through createLeaf(). */
+    template<int N>
+      struct BVHNBuilderMblurVirtual
+      {
+        typedef BVHN<N> BVH;
+        typedef typename BVH::AABBNodeMB AABBNodeMB;
+        typedef typename BVH::NodeRef NodeRef;
+        typedef typename BVH::NodeRecordMB NodeRecordMB;
+        typedef FastAllocator::CachedAllocator Allocator;
+        /* part that does not depend on the callback type: build entry plus the leaf hook */
+        struct BVHNBuilderV {
+          NodeRecordMB build(FastAllocator* allocator, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange);
+          virtual NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) = 0;
+        };
+
+        template<typename CreateLeafFunc>
+        struct BVHNBuilderT : public BVHNBuilderV
+        {
+          BVHNBuilderT (CreateLeafFunc createLeafFunc) : createLeafFunc(createLeafFunc) {}
+
+          NodeRecordMB createLeaf (const PrimRef* prims, const range<size_t>& set, const Allocator& alloc) override {
+            return createLeafFunc(prims,set,alloc);
+          }
+
+        private:
+          CreateLeafFunc createLeafFunc;
+        };
+        /* wraps \p createLeaf in a temporary BVHNBuilderT and runs its build() */
+        template<typename CreateLeafFunc>
+        static NodeRecordMB build(FastAllocator* allocator, CreateLeafFunc createLeaf, BuildProgressMonitor& progress, PrimRef* prims, const PrimInfo& pinfo, GeneralBVHBuilder::Settings settings, const BBox1f& timeRange) {
+          return BVHNBuilderT<CreateLeafFunc>(createLeaf).build(allocator,progress,prims,pinfo,settings,timeRange);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp
new file mode 100644
index 0000000000..64759c1294
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_morton.cpp
@@ -0,0 +1,531 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_statistics.h"
+#include "bvh_rotate.h"
+#include "../common/profile.h"
+#include "../../common/algorithms/parallel_prefix_sum.h"
+
+#include "../builders/primrefgen.h"
+#include "../builders/bvh_builder_morton.h"
+
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+
+#if defined(__X86_64__) || defined(__aarch64__)
+#  define ROTATE_TREE 1 // specifies number of tree rotation rounds to perform
+#else
+#  define ROTATE_TREE 0 // do not use tree rotations on 32 bit platforms, barrier bit in NodeRef will cause issues
+#endif
+
+namespace embree 
+{
+  namespace isa
+  {
+    template<int N>
+    struct SetBVHNBounds // functor: fills an inner AABB node from its children's records and returns the node's own record
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+      typedef typename BVH::AABBNode AABBNode;
+
+      BVH* bvh; // stored by the constructor; not read by operator() below
+      __forceinline SetBVHNBounds (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline NodeRecord operator() (NodeRef ref, const NodeRecord* children, size_t num) // ref: node to fill; children/num: its child records
+      {
+        AABBNode* node = ref.getAABBNode();
+
+        BBox3fa res = empty; // becomes the union of all child bounds
+        for (size_t i=0; i<num; i++) {
+          const BBox3fa b = children[i].bounds;
+          res.extend(b);
+          node->setRef(i,children[i].ref);
+          node->setBounds(i,b);
+        }
+
+        BBox3fx result = (BBox3fx&)res; // reinterpret the merged box so that lower.a can be written below
+#if ROTATE_TREE
+        if (N == 4) // the rotation bookkeeping is only done for 4-wide trees
+        {
+          size_t n = 0; // sum of the counters the children carry in bounds.lower.a (the leaf creators in this file store their item count there)
+          for (size_t i=0; i<num; i++)
+            n += children[i].bounds.lower.a;
+
+          if (n >= 4096) { // this subtree reached the 4096 threshold ...
+            for (size_t i=0; i<num; i++) {
+              if (children[i].bounds.lower.a < 4096) { // ... so rotate every child that is itself still below it
+                for (int j=0; j<ROTATE_TREE; j++)
+                  BVHNRotate<N>::rotate(node->child(i));
+                node->child(i).setBarrier(); // NOTE(review): presumably stops later rotation passes at this child - confirm in BVHNRotate
+              }
+            }
+          }
+          result.lower.a = unsigned(n); // hand the accumulated counter up to the parent
+        }
+#endif
+
+        return NodeRecord(ref,result);
+      }
+    };
+
+    template<int N, typename Primitive>
+    struct CreateMortonLeaf; // primary template is declared only; one partial specialization per primitive type follows
+
+    template<int N>
+    struct CreateMortonLeaf<N,Triangle4> // leaf functor: packs up to 4 triangles of one mesh into a single Triangle4 block
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) // current: index range into the morton array
+      {
+        vfloat4 lower(pos_inf); // running min of all vertices
+        vfloat4 upper(neg_inf); // running max of all vertices
+        size_t items = current.size();
+        size_t start = current.begin();
+        assert(items<=4); // one Triangle4 block has four lanes
+        
+        /* allocate leaf node */
+        Triangle4* accel = (Triangle4*) alloc.malloc1(sizeof(Triangle4),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,1); // leaf consists of exactly one block
+        vuint4 vgeomID = -1, vprimID = -1; // lanes that are not filled keep ID -1
+        Vec3vf4 v0 = zero, v1 = zero, v2 = zero; // lanes that are not filled keep zero vertices
+        const TriangleMesh* __restrict__ const mesh = this->mesh;
+
+        for (size_t i=0; i<items; i++) // lane i receives the i-th primitive of the range
+        {
+          const unsigned int primID = morton[start+i].index;
+          const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+          const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+          const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+          const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+          lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          vgeomID [i] = geomID_;
+          vprimID [i] = primID;
+          v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+          v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+          v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+        }
+
+        Triangle4::store_nt(accel,Triangle4(v0,v1,v2,vgeomID,vprimID));
+        BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper);
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = unsigned(current.size()); // item count travels in lower.a; SetBVHNBounds sums it up per subtree
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    
+    private:
+      TriangleMesh* mesh; // source geometry
+      BVHBuilderMorton::BuildPrim* morton; // array indexed by the ranges passed to operator()
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); // geometry ID written into every filled lane
+    };
+    
+    template<int N>
+    struct CreateMortonLeaf<N,Triangle4v> // leaf functor: packs up to 4 triangles of one mesh into a single Triangle4v block
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+      
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) // current: index range into the morton array
+      {
+        vfloat4 lower(pos_inf); // running min of all vertices
+        vfloat4 upper(neg_inf); // running max of all vertices
+        size_t items = current.size();
+        size_t start = current.begin();
+        assert(items<=4); // one Triangle4v block has four lanes
+        
+        /* allocate leaf node */
+        Triangle4v* accel = (Triangle4v*) alloc.malloc1(sizeof(Triangle4v),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,1); // leaf consists of exactly one block
+        vuint4 vgeomID = -1, vprimID = -1; // lanes that are not filled keep ID -1
+        Vec3vf4 v0 = zero, v1 = zero, v2 = zero; // lanes that are not filled keep zero vertices
+        const TriangleMesh* __restrict__ mesh = this->mesh;
+
+        for (size_t i=0; i<items; i++) // lane i receives the i-th primitive of the range
+        {
+          const unsigned int primID = morton[start+i].index;
+          const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+          const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+          const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+          const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+          lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          vgeomID [i] = geomID_;
+          vprimID [i] = primID;
+          v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+          v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+          v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+        }
+        Triangle4v::store_nt(accel,Triangle4v(v0,v1,v2,vgeomID,vprimID));
+        BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper);
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size(); // item count travels in lower.a; SetBVHNBounds sums it up per subtree
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      TriangleMesh* mesh; // source geometry
+      BVHBuilderMorton::BuildPrim* morton; // array indexed by the ranges passed to operator()
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); // geometry ID written into every filled lane
+    };
+
+    template<int N>
+    struct CreateMortonLeaf<N,Triangle4i> // leaf functor: packs up to 4 triangles into a Triangle4i block that stores vertex offsets instead of positions
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (TriangleMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+      
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) // current: index range into the morton array
+      {
+        vfloat4 lower(pos_inf); // running min of all vertices
+        vfloat4 upper(neg_inf); // running max of all vertices
+        size_t items = current.size();
+        size_t start = current.begin();
+        assert(items<=4); // one Triangle4i block has four lanes
+        
+        /* allocate leaf node */
+        Triangle4i* accel = (Triangle4i*) alloc.malloc1(sizeof(Triangle4i),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,1); // leaf consists of exactly one block
+        
+        vuint4 v0 = zero, v1 = zero, v2 = zero; // per-lane vertex offsets (index * stride/4)
+        vuint4 vgeomID = -1, vprimID = -1;
+        const TriangleMesh* __restrict__ const mesh = this->mesh;
+        
+        for (size_t i=0; i<items; i++) // lane i receives the i-th primitive of the range
+        {
+          const unsigned int primID = morton[start+i].index;
+          const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+          const Vec3fa& p0 = mesh->vertex(tri.v[0]); // positions are read only to compute the leaf bounds
+          const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+          const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+          lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2);
+          vgeomID[i] = geomID_;
+          vprimID[i] = primID;
+          unsigned int int_stride = mesh->vertices0.getStride()/4; // loop-invariant; presumably the byte stride converted to 4-byte units - confirm getStride()
+          v0[i] = tri.v[0] * int_stride; 
+          v1[i] = tri.v[1] * int_stride;
+          v2[i] = tri.v[2] * int_stride;
+        }
+        
+        for (size_t i=items; i<4; i++) // pad the remaining lanes explicitly
+        {
+          vgeomID[i] = vgeomID[0]; // padding lanes reuse lane 0's geometry ID ...
+          vprimID[i] = -1; // ... but get primID -1
+          v0[i] = 0;
+          v1[i] = 0; 
+          v2[i] = 0;
+        }
+        Triangle4i::store_nt(accel,Triangle4i(v0,v1,v2,vgeomID,vprimID));
+        BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper);
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size(); // item count travels in lower.a; SetBVHNBounds sums it up per subtree
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      TriangleMesh* mesh; // source geometry
+      BVHBuilderMorton::BuildPrim* morton; // array indexed by the ranges passed to operator()
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); // geometry ID written into every filled lane
+    };
+
+    template<int N>
+    struct CreateMortonLeaf<N,Quad4v> // leaf functor: packs up to 4 quads of one mesh into a single Quad4v block
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (QuadMesh* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+      
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) // current: index range into the morton array
+      {
+        vfloat4 lower(pos_inf); // running min of all vertices
+        vfloat4 upper(neg_inf); // running max of all vertices
+        size_t items = current.size();
+        size_t start = current.begin();
+        assert(items<=4); // one Quad4v block has four lanes
+        
+        /* allocate leaf node */
+        Quad4v* accel = (Quad4v*) alloc.malloc1(sizeof(Quad4v),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,1); // leaf consists of exactly one block
+        
+        vuint4 vgeomID = -1, vprimID = -1; // lanes that are not filled keep ID -1
+        Vec3vf4 v0 = zero, v1 = zero, v2 = zero, v3 = zero; // lanes that are not filled keep zero vertices
+        const QuadMesh* __restrict__ mesh = this->mesh;
+
+        for (size_t i=0; i<items; i++) // lane i receives the i-th primitive of the range
+        {
+          const unsigned int primID = morton[start+i].index;
+          const QuadMesh::Quad& tri = mesh->quad(primID); // despite the name, 'tri' is a quad with four vertex indices
+          const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+          const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+          const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+          const Vec3fa& p3 = mesh->vertex(tri.v[3]);
+          lower = min(lower,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3);
+          upper = max(upper,(vfloat4)p0,(vfloat4)p1,(vfloat4)p2,(vfloat4)p3);
+          vgeomID [i] = geomID_;
+          vprimID [i] = primID;
+          v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+          v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+          v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+          v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z;
+        }
+        Quad4v::store_nt(accel,Quad4v(v0,v1,v2,v3,vgeomID,vprimID));
+        BBox3fx box_o = BBox3fx((Vec3fx)lower,(Vec3fx)upper);
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size(); // item count travels in lower.a; SetBVHNBounds sums it up per subtree
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      QuadMesh* mesh; // source geometry
+      BVHBuilderMorton::BuildPrim* morton; // array indexed by the ranges passed to operator()
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); // geometry ID written into every filled lane
+    };
+
+    template<int N>
+    struct CreateMortonLeaf<N,Object> // leaf functor: emits one Object entry per user-geometry primitive in the range
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (UserGeometry* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+      
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) // current: index range into the morton array
+      {
+        vfloat4 lower(pos_inf); // not used in this specialization (bounds are accumulated in 'bounds' below)
+        vfloat4 upper(neg_inf); // not used in this specialization
+        size_t items = current.size();
+        size_t start = current.begin();
+        
+        /* allocate leaf node */
+        Object* accel = (Object*) alloc.malloc1(items*sizeof(Object),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,items); // unlike the triangle/quad leaves, the block count equals the item count
+        const UserGeometry* mesh = this->mesh;
+        
+        BBox3fa bounds = empty; // union of the bounds reported by the geometry for each primitive
+        for (size_t i=0; i<items; i++)
+        {
+          const unsigned int index = morton[start+i].index;
+          const unsigned int primID = index; 
+          bounds.extend(mesh->bounds(primID));
+          new (&accel[i]) Object(geomID_,primID); // construct in place inside the leaf allocation
+        }
+
+        BBox3fx box_o = (BBox3fx&)bounds; // reinterpret so that lower.a can be written below
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size(); // item count travels in lower.a; SetBVHNBounds sums it up per subtree
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      UserGeometry* mesh; // source geometry
+      BVHBuilderMorton::BuildPrim* morton; // array indexed by the ranges passed to operator()
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); // geometry ID stored in every Object entry
+    };
+
+    template<int N>
+    struct CreateMortonLeaf<N,InstancePrimitive> // leaf functor: emits an InstancePrimitive leaf for an Instance geometry
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+      __forceinline CreateMortonLeaf (Instance* mesh, unsigned int geomID, BVHBuilderMorton::BuildPrim* morton)
+        : mesh(mesh), morton(morton), geomID_(geomID) {}
+      
+      __noinline NodeRecord operator() (const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) // current: index range into the morton array
+      {
+        vfloat4 lower(pos_inf); // not used in this specialization (bounds are accumulated in 'bounds' below)
+        vfloat4 upper(neg_inf); // not used in this specialization
+        size_t items = current.size();
+        size_t start = current.begin();
+        assert(items <= 1); // a leaf is expected to hold at most one instance
+        
+        /* allocate leaf node */
+        InstancePrimitive* accel = (InstancePrimitive*) alloc.malloc1(items*sizeof(InstancePrimitive),BVH::byteAlignment);
+        NodeRef ref = BVH::encodeLeaf((char*)accel,items);
+        const Instance* instance = this->mesh;
+        
+        BBox3fa bounds = empty;
+        for (size_t i=0; i<items; i++)
+        {
+          const unsigned int primID = morton[start+i].index; // only used to query the bounds
+          bounds.extend(instance->bounds(primID));
+          new (&accel[i]) InstancePrimitive(instance, geomID_); // the primitive records the instance pointer and geomID_, not primID
+        }
+
+        BBox3fx box_o = (BBox3fx&)bounds; // reinterpret so that lower.a can be written below
+#if ROTATE_TREE
+        if (N == 4)
+          box_o.lower.a = current.size(); // item count travels in lower.a; SetBVHNBounds sums it up per subtree
+#endif
+        return NodeRecord(ref,box_o);
+      }
+    private:
+      Instance* mesh; // the instance geometry this leaf refers to
+      BVHBuilderMorton::BuildPrim* morton; // array indexed by the ranges passed to operator()
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); // geometry ID stored in every InstancePrimitive
+    };
+
+    template<typename Mesh>
+    struct CalculateMeshBounds // functor: maps a Morton build primitive to the bounds of the mesh primitive it indexes
+    {
+      __forceinline CalculateMeshBounds (Mesh* mesh)
+        : mesh(mesh) {}
+      
+      __forceinline const BBox3fa operator() (const BVHBuilderMorton::BuildPrim& morton) { // returns mesh->bounds() for morton.index
+        return mesh->bounds(morton.index);
+      }
+      
+    private:
+      Mesh* mesh; // geometry queried for bounds; not owned (never freed here)
+    };        
+
+    template<int N, typename Mesh, typename Primitive>
+    class BVHNMeshBuilderMorton : public Builder // Builder that constructs a BVHN<N> over a single mesh using BVHBuilderMorton
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecord NodeRecord;
+
+    public:
+      
+      BVHNMeshBuilderMorton (BVH* bvh, Mesh* mesh, unsigned int geomID, const size_t minLeafSize, const size_t maxLeafSize, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD) // note the argument order: geomID comes directly after mesh
+        : bvh(bvh), mesh(mesh), morton(bvh->device,0), settings(N,BVH::maxBuildDepth,minLeafSize,min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks),singleThreadThreshold), geomID_(geomID) {} // maxLeafSize is clamped to what one leaf can encode
+      
+      /* build function */
+      void build() 
+      {
+        /* we reset the allocator when the mesh size changed */
+        if (mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+          morton.clear();
+        }
+        size_t numPrimitives = mesh->size();
+        numPreviousPrimitives = numPrimitives; // implicit size_t -> unsigned int conversion
+        
+        /* skip build for empty scene */
+        if (numPrimitives == 0) {
+          bvh->set(BVH::emptyNode,empty,0);
+          return;
+        }
+        
+        /* preallocate arrays */
+        morton.resize(numPrimitives);
+        size_t bytesEstimated = numPrimitives*sizeof(AABBNode)/(4*N) + size_t(1.2f*Primitive::blocks(numPrimitives)*sizeof(Primitive)); // node estimate plus leaf bytes with a 1.2x margin
+        size_t bytesMortonCodes = numPrimitives*sizeof(BVHBuilderMorton::BuildPrim);
+        bytesEstimated = max(bytesEstimated,bytesMortonCodes); // the first allocation block is reused to sort the morton codes
+        bvh->alloc.init(bytesMortonCodes,bytesMortonCodes,bytesEstimated);
+
+        /* create morton code array */
+        BVHBuilderMorton::BuildPrim* dest = (BVHBuilderMorton::BuildPrim*) bvh->alloc.specialAlloc(bytesMortonCodes); // second BuildPrim buffer handed to the builder alongside morton.data()
+        size_t numPrimitivesGen = createMortonCodeArray<Mesh>(mesh,morton,bvh->scene->progressInterface); // count returned by the generator is what the builder is run on
+
+        /* create BVH */
+        SetBVHNBounds<N> setBounds(bvh);
+        CreateMortonLeaf<N,Primitive> createLeaf(mesh,geomID_,morton.data());
+        CalculateMeshBounds<Mesh> calculateBounds(mesh);
+        auto root = BVHBuilderMorton::build<NodeRecord>(
+          typename BVH::CreateAlloc(bvh), 
+          typename BVH::AABBNode::Create(),
+          setBounds,createLeaf,calculateBounds,bvh->scene->progressInterface,
+          morton.data(),dest,numPrimitivesGen,settings);
+        
+        bvh->set(root.ref,LBBox3fa(root.bounds),numPrimitives); // the BVH is registered with the mesh size, not numPrimitivesGen
+        
+#if ROTATE_TREE
+        if (N == 4) // rotations are only applied to 4-wide trees
+        {
+          for (int i=0; i<ROTATE_TREE; i++)
+            BVHNRotate<N>::rotate(bvh->root);
+          bvh->clearBarrier(bvh->root); // removes the barriers set by SetBVHNBounds during construction
+        }
+#endif
+
+        /* clear temporary data for static geometry */
+        if (bvh->scene->isStaticAccel()) {
+          morton.clear();
+        }
+        bvh->cleanup();
+      }
+      
+      void clear() { // drops the Morton array; the BVH itself is left untouched
+        morton.clear();
+      }
+      
+    private:
+      BVH* bvh; // target BVH; also provides device, scene and allocator
+      Mesh* mesh; // geometry being built
+      mvector<BVHBuilderMorton::BuildPrim> morton; // Morton build primitives, kept between builds unless cleared
+      BVHBuilderMorton::Settings settings;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max(); // geometry ID forwarded to the leaf creator
+      unsigned int numPreviousPrimitives = 0; // primitive count of the previous build, used to detect size changes
+    };
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE) // factories for triangle meshes: min and max leaf size are both 4
+    Builder* BVH4Triangle4MeshBuilderMortonGeneral  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4> ((BVH4*)bvh,mesh,geomID,4,4); }
+    Builder* BVH4Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4v>((BVH4*)bvh,mesh,geomID,4,4); }
+    Builder* BVH4Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,TriangleMesh,Triangle4i>((BVH4*)bvh,mesh,geomID,4,4); }
+#if defined(__AVX__)
+    Builder* BVH8Triangle4MeshBuilderMortonGeneral  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4> ((BVH8*)bvh,mesh,geomID,4,4); }
+    Builder* BVH8Triangle4vMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4v>((BVH8*)bvh,mesh,geomID,4,4); }
+    Builder* BVH8Triangle4iMeshBuilderMortonGeneral (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,TriangleMesh,Triangle4i>((BVH8*)bvh,mesh,geomID,4,4); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD) // factories for quad meshes: min and max leaf size are both 4
+    Builder* BVH4Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,QuadMesh,Quad4v>((BVH4*)bvh,mesh,geomID,4,4); }
+#if defined(__AVX__)
+    Builder* BVH8Quad4vMeshBuilderMortonGeneral (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,QuadMesh,Quad4v>((BVH8*)bvh,mesh,geomID,4,4); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER) // factories for user geometry: leaves hold between 1 and maxLeafBlocks objects
+    Builder* BVH4VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,UserGeometry,Object>((BVH4*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); }
+#if defined(__AVX__) // the 8-wide variant takes its leaf limit from BVH8, the type it actually builds
+    Builder* BVH8VirtualMeshBuilderMortonGeneral (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,UserGeometry,Object>((BVH8*)bvh,mesh,geomID,1,BVH8::maxLeafBlocks); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE) // gtype stays in the signature for callers but must NOT be forwarded: the builder constructor is (bvh,mesh,geomID,minLeafSize,maxLeafSize[,singleThreadThreshold]), so passing gtype first silently stored the type mask as geomID and shifted every following argument by one
+    Builder* BVH4InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<4,Instance,InstancePrimitive>((BVH4*)bvh,mesh,geomID,1,BVH4::maxLeafBlocks); }
+#if defined(__AVX__) // the 8-wide variant takes its leaf limit from BVH8, the type it actually builds
+    Builder* BVH8InstanceMeshBuilderMortonGeneral (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new class BVHNMeshBuilderMorton<8,Instance,InstancePrimitive>((BVH8*)bvh,mesh,geomID,1,BVH8::maxLeafBlocks); }
+#endif
+#endif
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp
new file mode 100644
index 0000000000..cf5b2eb47f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah.cpp
@@ -0,0 +1,640 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_builder.h"
+#include "../builders/primrefgen.h"
+#include "../builders/splitter.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+
+#include "../common/state.h"
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+#define PROFILE 0
+#define PROFILE_RUNS 20
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, typename Primitive>
+    struct CreateLeaf // leaf functor for the SAH builder: allocates Primitive blocks for a PrimRef range and fills them
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+
+      __forceinline CreateLeaf (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const // set: range of prims[] that goes into this leaf
+      {
+        size_t n = set.size();
+        size_t items = Primitive::blocks(n); // number of Primitive blocks needed for n primitives
+        size_t start = set.begin();
+        Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment);
+        typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items);
+        for (size_t i=0; i<items; i++) {
+          accel[i].fill(prims,start,set.end(),bvh->scene); // NOTE(review): presumably fill() advances 'start' by reference so each block continues where the last stopped - confirm
+        }
+        return node;
+      }
+
+      BVH* bvh; // supplies the scene passed to Primitive::fill
+    };
+
+
+    template<int N, typename Primitive>
+    struct CreateLeafQuantized // leaf functor used by BVHNBuilderSAHQuantized; the body is the same as CreateLeaf's
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+
+      __forceinline CreateLeafQuantized (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const // set: range of prims[] that goes into this leaf
+      {
+        size_t n = set.size();
+        size_t items = Primitive::blocks(n); // number of Primitive blocks needed for n primitives
+        size_t start = set.begin();
+        Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment);
+        typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,items);
+        for (size_t i=0; i<items; i++) {
+          accel[i].fill(prims,start,set.end(),bvh->scene); // NOTE(review): presumably fill() advances 'start' by reference so each block continues where the last stopped - confirm
+        }
+        return node;
+      }
+
+      BVH* bvh; // supplies the scene passed to Primitive::fill
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+    template<int N, typename Primitive>
+    struct BVHNBuilderSAH : public Builder // Builder that constructs a BVHN<N> either over a whole scene or over a single geometry
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+
+      BVH* bvh;
+      Scene* scene; // set in scene mode, nullptr in single-geometry mode
+      Geometry* mesh; // set in single-geometry mode, nullptr in scene mode
+      mvector<PrimRef> prims; // primitive reference array handed to the builder
+      GeneralBVHBuilder::Settings settings;
+      Geometry::GTypeMask gtype_; // geometry types collected in scene mode
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max ();
+      bool primrefarrayalloc; // when true, build() may enable settings.primrefarrayalloc for large inputs
+      unsigned int numPreviousPrimitives = 0; // primitive count of the previous build, used to detect size changes
+
+      BVHNBuilderSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, // scene-mode constructor
+                      const Geometry::GTypeMask gtype, bool primrefarrayalloc = false)
+        : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0),
+          settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), primrefarrayalloc(primrefarrayalloc) {}
+
+      BVHNBuilderSAH (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype) // single-geometry constructor
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID), primrefarrayalloc(false) {}
+
+      // FIXME: shrink bvh->alloc in destructor here and in other builders too
+
+      void build()
+      {
+        /* we reset the allocator when the mesh size changed */
+        if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+        }
+
+        /* if we use the primrefarray for allocations we have to take it back from the BVH */
+        if (settings.primrefarrayalloc != size_t(inf))
+          bvh->alloc.unshare(prims);
+
+	/* skip build for empty scene */
+        const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false);
+        numPreviousPrimitives = numPrimitives; // implicit size_t -> unsigned int conversion
+        if (numPrimitives == 0) {
+          bvh->clear();
+          prims.clear();
+          return;
+        }
+
+        double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH"); // an empty name is passed in single-geometry mode
+
+#if PROFILE
+        profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) {
+#endif
+
+            /* create primref array */
+            if (primrefarrayalloc) {
+              settings.primrefarrayalloc = numPrimitives/1000;
+              if (settings.primrefarrayalloc < 1000) // i.e. fewer than 1,000,000 primitives: disable the feature
+                settings.primrefarrayalloc = inf;
+            }
+
+            /* enable os_malloc for two level build */
+            if (mesh)
+              bvh->alloc.setOSallocation(true);
+
+            /* initialize allocator */
+            const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N);
+            const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive)); // leaf bytes with a 1.2x margin
+            bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+            settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes);
+            prims.resize(numPrimitives); 
+
+            PrimInfo pinfo = mesh ?
+              createPrimRefArray(mesh,geomID_,prims,bvh->scene->progressInterface) :
+              createPrimRefArray(scene,gtype_,false,prims,bvh->scene->progressInterface);
+
+            /* pinfo may have zero size due to invalid geometry */
+            if (unlikely(pinfo.size() == 0))
+            {
+              bvh->clear();
+              prims.clear();
+              return; // NOTE(review): with PROFILE==0 this leaves build() after preBuild() without reaching share()/cleanup()/postBuild(t0) - confirm this is intended
+            }
+
+            /* call BVH builder */
+            NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeaf<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings);
+            bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+            bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f));
+
+#if PROFILE
+          });
+#endif
+
+        /* if we allocated using the primrefarray we have to keep it alive */
+        if (settings.primrefarrayalloc != size_t(inf))
+          bvh->alloc.share(prims);
+
+        /* for static geometries we can do some cleanups */
+        else if (scene && scene->isStaticAccel()) {
+          prims.clear();
+        }
+	bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+      void clear() { // drops the primitive reference array; the BVH itself is left untouched
+        prims.clear();
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+    template<int N, typename Primitive>
+    struct BVHNBuilderSAHQuantized : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+
+      BVH* bvh;
+      Scene* scene;
+      Geometry* mesh;
+      mvector<PrimRef> prims;
+      GeneralBVHBuilder::Settings settings;
+      Geometry::GTypeMask gtype_;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+      unsigned int numPreviousPrimitives = 0;
+
+      BVHNBuilderSAHQuantized (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+        : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype) {}
+
+      BVHNBuilderSAHQuantized (BVH* bvh, Geometry* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), gtype_(gtype), geomID_(geomID) {}
+
+      // FIXME: shrink bvh->alloc in destructor here and in other builders too
+
+      void build()
+      {
+        /* single-mesh rebuilds: drop the BVH allocator's memory whenever the primitive count differs from the previous build */
+        if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+        }
+
+	/* count the primitives of the mesh, or of all scene geometry matching gtype_; an empty input only resets prims and the BVH */
+        const size_t numPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(gtype_,false);
+        numPreviousPrimitives = numPrimitives;
+        if (numPrimitives == 0) {
+          prims.clear();
+          bvh->clear();
+          return;
+        }
+
+        double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::QBVH" + toString(N) + "BuilderSAH");
+
+#if PROFILE
+        profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) {
+#endif
+            /* fill the primref array from the single mesh when one is set, otherwise from the scene */
+            prims.resize(numPrimitives);
+            PrimInfo pinfo = mesh ?
+              createPrimRefArray(mesh,geomID_,prims,bvh->scene->progressInterface) :
+              createPrimRefArray(scene,gtype_,false,prims,bvh->scene->progressInterface);
+
+            /* enable os_malloc for two level build */
+            if (mesh)
+              bvh->alloc.setOSallocation(true);
+
+            /* size the allocator from a node/leaf byte estimate, run the quantized builder, then install the root */
+            const size_t node_bytes = numPrimitives*sizeof(typename BVH::QuantizedNode)/(4*N);
+            const size_t leaf_bytes = size_t(1.2*Primitive::blocks(numPrimitives)*sizeof(Primitive));
+            bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+            settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes);
+            NodeRef root = BVHNBuilderQuantizedVirtual<N>::build(&bvh->alloc,CreateLeafQuantized<N,Primitive>(bvh),bvh->scene->progressInterface,prims.data(),pinfo,settings);
+            bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+            //bvh->layoutLargeNodes(pinfo.size()*0.005f); // FIXME: COPY LAYOUT FOR LARGE NODES !!!
+#if PROFILE
+          });
+#endif
+
+	/* scene builds over a static accel do not keep the primref array */
+	if (scene && scene->isStaticAccel()) {
+          prims.clear();
+        }
+	bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+      void clear() { // releases the primref array held by this builder
+        prims.clear();
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+
+    template<int N, typename Primitive>
+    struct CreateLeafGrid
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+
+      __forceinline CreateLeafGrid (BVH* bvh, const SubGridBuildData * const sgrids) : bvh(bvh),sgrids(sgrids) {}
+
+      __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        /* [first, first+count) is the slice of prims turned into this leaf; each primID indexes sgrids */
+        const size_t first = set.begin();
+        const size_t count = set.size();
+        assert(count <= N);
+
+        /* gather the distinct geomIDs in first-seen order; the first primitive always seeds the list */
+        unsigned int uniqueIDs[N];
+        unsigned int numUnique = 1;
+        uniqueIDs[0] = prims[first].geomID();
+        for (size_t i=1; i<count; i++)
+        {
+          const unsigned int candidate = prims[first+i].geomID();
+          size_t j = 0;
+          while (j < numUnique && uniqueIDs[j] != candidate) j++;
+          if (j == numUnique) uniqueIDs[numUnique++] = candidate;
+        }
+
+        /* one SubGridQBVHN per distinct geometry, all taken from a single allocation */
+        SubGridQBVHN<N>* leaves = (SubGridQBVHN<N>*) alloc.malloc1(numUnique*sizeof(SubGridQBVHN<N>),BVH::byteAlignment);
+        NodeRef leafRef = BVH::encodeLeaf((char*)leaves,numUnique);
+
+        for (size_t g=0; g<numUnique; g++)
+        {
+          /* staging arrays for the subgrids that belong to geometry uniqueIDs[g] */
+          unsigned int sx[N];
+          unsigned int sy[N];
+          unsigned int gridPrimID[N];
+          BBox3fa box[N];
+          unsigned int filled = 0;
+          for (size_t i=0; i<count; i++)
+          {
+            const PrimRef& ref = prims[first+i];
+            if (unlikely(ref.geomID() != uniqueIDs[g])) continue;
+
+            const SubGridBuildData& data = sgrids[ref.primID()];
+            sx[filled] = data.sx;
+            sy[filled] = data.sy;
+            gridPrimID[filled] = data.primID;
+            box[filled] = ref.bounds();
+            filled++;
+          }
+          assert(filled <= N);
+          /* construct the leaf for this geometry in place */
+          new (&leaves[g]) SubGridQBVHN<N>(sx,sy,gridPrimID,box,uniqueIDs[g],filled);
+        }
+
+        return leafRef;
+      }
+
+      BVH* bvh;
+      const SubGridBuildData * const sgrids;
+    };
+
+
+    template<int N>
+    struct BVHNBuilderSAHGrid : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+      
+      BVH* bvh;
+      Scene* scene;
+      GridMesh* mesh;
+      mvector<PrimRef> prims;
+      mvector<SubGridBuildData> sgrids;
+      GeneralBVHBuilder::Settings settings;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+      unsigned int numPreviousPrimitives = 0;
+
+      BVHNBuilderSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+        : bvh(bvh), scene(scene), mesh(nullptr), prims(scene->device,0), sgrids(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD) {} // scene-wide build: mesh stays nullptr; the 'mode' argument is accepted but not used here
+
+      BVHNBuilderSAHGrid (BVH* bvh, GridMesh* mesh, unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims(bvh->device,0), sgrids(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD), geomID_(geomID) {} // single-mesh build: the scene member is nullptr in this overload, so both vectors take the device from bvh (scene->device would dereference null); 'mode' is unused
+
+      void build() // builds the subgrid BVH for the single mesh when one is set, otherwise for all grid meshes of the scene
+      {
+        /* we reset the allocator when the mesh size changed */
+        if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+        }
+
+        /* if we use the primrefarray for allocations we have to take it back from the BVH */
+        if (settings.primrefarrayalloc != size_t(inf))
+          bvh->alloc.unshare(prims);
+
+        const size_t numGridPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(GridMesh::geom_type,false);
+        numPreviousPrimitives = numGridPrimitives;
+
+        PrimInfo pinfo(empty);
+        size_t numPrimitives = 0;
+
+        if (!mesh)
+        {
+          /* first run to get #primitives */
+
+          ParallelForForPrefixSumState<PrimInfo> pstate;
+          Scene::Iterator<GridMesh,false> iter(scene);
+
+          pstate.init(iter,size_t(1024));
+
+          /* iterate over all meshes in the scene */
+          pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID) -> PrimInfo {
+              PrimInfo pinfo(empty);
+              for (size_t j=r.begin(); j<r.end(); j++)
+              {
+                if (!mesh->valid(j)) continue;
+                /* placeholder primref with empty bounds: this first pass only accumulates the subgrid count */
+                BBox3fa bounds = empty;
+                const PrimRef prim(bounds,(unsigned)geomID,(unsigned)j);
+                pinfo.add_center2(prim,mesh->getNumSubGrids(j));
+              }
+              return pinfo;
+            }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+          numPrimitives = pinfo.size();
+
+          /* resize arrays */
+          sgrids.resize(numPrimitives);
+          prims.resize(numPrimitives);
+
+          /* second run to fill primrefs and SubGridBuildData arrays */
+          pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t k, size_t geomID, const PrimInfo& base) -> PrimInfo {
+              k = base.size();
+              size_t p_index = k;
+              PrimInfo pinfo(empty);
+              for (size_t j=r.begin(); j<r.end(); j++)
+              {
+                if (!mesh->valid(j)) continue;
+                const GridMesh::Grid &g = mesh->grid(j);
+                for (unsigned int y=0; y<g.resY-1u; y+=2)
+                  for (unsigned int x=0; x<g.resX-1u; x+=2)
+                  {
+                    BBox3fa bounds = empty;
+                    if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid
+                    const PrimRef prim(bounds,(unsigned)geomID,(unsigned)p_index);
+                    pinfo.add_center2(prim);
+                    sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+                    prims[p_index++] = prim;
+                  }
+              }
+              return pinfo;
+            }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+          assert(pinfo.size() == numPrimitives);
+        }
+        else
+        {
+          ParallelPrefixSumState<PrimInfo> pstate;
+          /* iterate over all grids in a single mesh */
+          pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo
+                                       {
+                                         PrimInfo pinfo(empty);
+                                         for (size_t j=r.begin(); j<r.end(); j++)
+                                         {
+                                           if (!mesh->valid(j)) continue;
+                                           BBox3fa bounds = empty;
+                                           const PrimRef prim(bounds,geomID_,unsigned(j));
+                                           pinfo.add_center2(prim,mesh->getNumSubGrids(j));
+                                         }
+                                         return pinfo;
+                                       }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+          numPrimitives = pinfo.size();
+          /* resize arrays */
+          sgrids.resize(numPrimitives);
+          prims.resize(numPrimitives);
+
+          /* second run to fill primrefs and SubGridBuildData arrays */
+          pinfo = parallel_prefix_sum( pstate, size_t(0), mesh->size(), size_t(1024), PrimInfo(empty), [&](const range<size_t>& r, const PrimInfo& base) -> PrimInfo
+                                       {
+
+                                         size_t p_index = base.size();
+                                         PrimInfo pinfo(empty);
+                                         for (size_t j=r.begin(); j<r.end(); j++)
+                                         {
+                                           if (!mesh->valid(j)) continue;
+                                           const GridMesh::Grid &g = mesh->grid(j);
+                                           for (unsigned int y=0; y<g.resY-1u; y+=2)
+                                             for (unsigned int x=0; x<g.resX-1u; x+=2)
+                                             {
+                                               BBox3fa bounds = empty;
+                                               if (!mesh->buildBounds(g,x,y,bounds)) continue; // get bounds of subgrid
+                                               const PrimRef prim(bounds,geomID_,unsigned(p_index));
+                                               pinfo.add_center2(prim);
+                                               sgrids[p_index] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+                                               prims[p_index++] = prim;
+                                             }
+                                         }
+                                         return pinfo;
+                                       }, [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); });
+
+        }
+
+        /* no primitives */
+        if (numPrimitives == 0) {
+          bvh->clear();
+          prims.clear();
+          sgrids.clear();
+          return;
+        }
+
+        double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + "BuilderSAH");
+
+        /* allocate from the primref array only when numPrimitives/1000 reaches 1000; otherwise disable that path (inf) */
+        settings.primrefarrayalloc = numPrimitives/1000;
+        if (settings.primrefarrayalloc < 1000)
+          settings.primrefarrayalloc = inf;
+
+        /* enable os_malloc for two level build */
+        if (mesh)
+          bvh->alloc.setOSallocation(true);
+
+        /* initialize allocator */
+        const size_t node_bytes = numPrimitives*sizeof(typename BVH::AABBNodeMB)/(4*N);
+        const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>));
+
+        bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,numPrimitives,node_bytes+leaf_bytes);
+
+        /* pinfo might have zero size due to invalid geometry; NOTE(review): this exit skips bvh->postBuild(t0) - confirm that is intended */
+        if (unlikely(pinfo.size() == 0))
+        {
+          bvh->clear();
+          sgrids.clear();
+          prims.clear();
+          return;
+        }
+
+        /* call BVH builder */
+        NodeRef root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafGrid<N,SubGridQBVHN<N>>(bvh,sgrids.data()),bvh->scene->progressInterface,prims.data(),pinfo,settings);
+        bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+        bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f));
+
+        /* clear temporary array */
+        sgrids.clear();
+
+        /* if we allocated using the primrefarray we have to keep it alive */
+        if (settings.primrefarrayalloc != size_t(inf))
+          bvh->alloc.share(prims);
+
+        /* for static geometries we can do some cleanups */
+        else if (scene && scene->isStaticAccel()) {
+          prims.clear();
+        }
+        bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+      void clear() { // releases only prims; build() already clears sgrids on each of its exit paths
+        prims.clear();
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE) // SAH builder factories for triangle meshes; 'mode' is accepted but not forwarded
+    Builder* BVH4Triangle4MeshBuilderSAH  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+
+    Builder* BVH4Triangle4SceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH4Triangle4vSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4v>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH4Triangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); }
+
+    // quantized builder arguments: (bvh, scene, sahBlockSize=4, intCost=1.0f, minLeafSize=4, maxLeafSize=inf, gtype)
+    Builder* BVH4QuantizedTriangle4iSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Triangle4i>((BVH4*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+#if defined(__AVX__) // 8-wide (BVH8) variants
+    Builder* BVH8Triangle4MeshBuilderSAH  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,TriangleMesh::geom_type); }
+
+    Builder* BVH8Triangle4SceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8Triangle4vSceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4v>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8Triangle4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type,true); }
+    Builder* BVH8QuantizedTriangle4iSceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4i>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+    Builder* BVH8QuantizedTriangle4SceneBuilderSAH  (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Triangle4>((BVH8*)bvh,scene,4,1.0f,4,inf,TriangleMesh::geom_type); }
+
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_TRIANGLE
+
+#if defined(EMBREE_GEOMETRY_QUAD) // SAH builder factories for quad meshes; 'mode' is accepted but not forwarded
+    Builder* BVH4Quad4vMeshBuilderSAH     (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode)     { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH4Quad4iMeshBuilderSAH     (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode)     { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH4Quad4vSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH4Quad4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); }
+    Builder* BVH4QuantizedQuad4vSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4v>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH4QuantizedQuad4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<4,Quad4i>((BVH4*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+
+#if defined(__AVX__) // 8-wide (BVH8) variants
+    Builder* BVH8Quad4vSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH8Quad4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAH<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type,true); }
+    Builder* BVH8QuantizedQuad4vSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4v>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH8QuantizedQuad4iSceneBuilderSAH     (void* bvh, Scene* scene, size_t mode) { return new BVHNBuilderSAHQuantized<8,Quad4i>((BVH8*)bvh,scene,4,1.0f,4,inf,QuadMesh::geom_type); }
+    Builder* BVH8Quad4vMeshBuilderSAH     (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode)     { return new BVHNBuilderSAH<8,Quad4v>((BVH8*)bvh,mesh,geomID,4,1.0f,4,inf,QuadMesh::geom_type); }
+
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_QUAD
+
+#if defined(EMBREE_GEOMETRY_USER) // user-geometry ("Virtual") builder factories; scene variants read their leaf-size limits from the device
+
+    Builder* BVH4VirtualSceneBuilderSAH    (void* bvh, Scene* scene, size_t mode) {
+      int minLeafSize = scene->device->object_accel_min_leaf_size;
+      int maxLeafSize = scene->device->object_accel_max_leaf_size;
+      return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,scene,4,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type);
+    }
+
+    Builder* BVH4VirtualMeshBuilderSAH    (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<4,Object>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,UserGeometry::geom_type);
+    }
+#if defined(__AVX__) // 8-wide (BVH8) variants
+
+    Builder* BVH8VirtualSceneBuilderSAH    (void* bvh, Scene* scene, size_t mode) {
+      int minLeafSize = scene->device->object_accel_min_leaf_size;
+      int maxLeafSize = scene->device->object_accel_max_leaf_size;
+      return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,scene,8,1.0f,minLeafSize,maxLeafSize,UserGeometry::geom_type);
+    }
+
+    Builder* BVH8VirtualMeshBuilderSAH    (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<8,Object>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,UserGeometry::geom_type);
+    }
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_USER
+
+#if defined(EMBREE_GEOMETRY_INSTANCE) // instance builder factories: the geometry type mask is supplied by the caller and passed through
+    Builder* BVH4InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,scene,4,1.0f,1,1,gtype); }
+    Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<4,InstancePrimitive>((BVH4*)bvh,mesh,geomID,4,1.0f,1,inf,gtype);
+    }
+#if defined(__AVX__) // 8-wide (BVH8) variants
+    Builder* BVH8InstanceSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) { return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,scene,8,1.0f,1,1,gtype); }
+    Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) {
+      return new BVHNBuilderSAH<8,InstancePrimitive>((BVH8*)bvh,mesh,geomID,8,1.0f,1,inf,gtype);
+    }
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_INSTANCE
+
+#if defined(EMBREE_GEOMETRY_GRID) // grid builder factories; these pass 'mode' on to the BVHNBuilderSAHGrid constructor
+    Builder* BVH4GridMeshBuilderSAH  (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,mesh,geomID,4,1.0f,4,4,mode); }
+    Builder* BVH4GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode)   { return new BVHNBuilderSAHGrid<4>((BVH4*)bvh,scene,4,1.0f,4,4,mode); } // FIXME: check whether cost factors are correct
+
+#if defined(__AVX__) // 8-wide (BVH8) variants
+    Builder* BVH8GridMeshBuilderSAH  (void* bvh, GridMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,mesh,geomID,8,1.0f,8,8,mode); }
+    Builder* BVH8GridSceneBuilderSAH (void* bvh, Scene* scene, size_t mode)   { return new BVHNBuilderSAHGrid<8>((BVH8*)bvh,scene,8,1.0f,8,8,mode); } // FIXME: check whether cost factors are correct
+#endif // __AVX__
+#endif // EMBREE_GEOMETRY_GRID
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp
new file mode 100644
index 0000000000..9c01553ec6
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_mb.cpp
@@ -0,0 +1,705 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_builder.h"
+#include "../builders/bvh_builder_msmblur.h"
+
+#include "../builders/primrefgen.h"
+#include "../builders/splitter.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+
+#include "../common/state.h"
+
+// FIXME: remove after removing BVHNBuilderMBlurRootTimeSplitsSAH
+#include "../../common/algorithms/parallel_for_for.h"
+#include "../../common/algorithms/parallel_for_for_prefix_sum.h"
+
+
+namespace embree
+{
+  namespace isa
+  {
+
+#if 0 // disabled legacy leaf creator; as written its assert refers to 'end' and 'current', which are not declared in operator()
+    template<int N, typename Primitive>
+    struct CreateMBlurLeaf
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecordMB NodeRecordMB;
+
+      __forceinline CreateMBlurLeaf (BVH* bvh, PrimRef* prims, size_t time) : bvh(bvh), prims(prims), time(time) {}
+
+      __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        size_t items = Primitive::blocks(set.size());
+        size_t start = set.begin();
+        for (size_t i=start; i<end; i++) assert((*current.prims.prims)[start].geomID() == (*current.prims.prims)[i].geomID()); // assert that all geomIDs are identical
+        Primitive* accel = (Primitive*) alloc.malloc1(items*sizeof(Primitive),BVH::byteAlignment);
+        NodeRef node = bvh->encodeLeaf((char*)accel,items);
+
+        LBBox3fa allBounds = empty;
+        for (size_t i=0; i<items; i++)
+          allBounds.extend(accel[i].fillMB(prims, start, set.end(), bvh->scene, time));
+
+        return NodeRecordMB(node,allBounds);
+      }
+
+      BVH* bvh;
+      PrimRef* prims;
+      size_t time;
+    };
+#endif // disabled CreateMBlurLeaf
+
+    template<int N, typename Mesh, typename Primitive>
+    struct CreateMSMBlurLeaf
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecordMB4D NodeRecordMB4D;
+
+      __forceinline CreateMSMBlurLeaf (BVH* bvh) : bvh(bvh) {}
+      /* builds one leaf from a build record: allocates the primitive blocks, fills them via fillMB and returns the leaf with its merged bounds */
+      __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const
+      {
+        const auto& set = current.prims;
+        size_t first = set.begin();
+        const size_t last = set.end();
+        for (size_t k=first; k<last; k++) assert((*set.prims)[first].geomID() == (*set.prims)[k].geomID()); // all geomIDs must be identical
+        const size_t numBlocks = Primitive::blocks(set.size());
+        Primitive* leaf = (Primitive*) alloc.malloc1(numBlocks*sizeof(Primitive),BVH::byteNodeAlignment);
+        NodeRef ref = bvh->encodeLeaf((char*)leaf,numBlocks);
+        LBBox3fa merged = empty;
+        for (size_t b=0; b<numBlocks; b++) merged.extend(leaf[b].fillMB(set.prims->data(), first, last, bvh->scene, set.time_range));
+        return NodeRecordMB4D(ref,merged,set.time_range);
+      }
+
+      BVH* bvh;
+    };
+
+    /* Motion blur BVH with 4D nodes and internal time splits */
+    template<int N, typename Mesh, typename Primitive>
+    struct BVHNBuilderMBlurSAH : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+      typedef typename BVHN<N>::NodeRecordMB NodeRecordMB;
+      typedef typename BVHN<N>::AABBNodeMB AABBNodeMB;
+
+      BVH* bvh;
+      Scene* scene;
+      const size_t sahBlockSize;
+      const float intCost;
+      const size_t minLeafSize;
+      const size_t maxLeafSize;
+      const Geometry::GTypeMask gtype_;
+
+      BVHNBuilderMBlurSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const Geometry::GTypeMask gtype)
+        : bvh(bvh), scene(scene), sahBlockSize(sahBlockSize), intCost(intCost), minLeafSize(minLeafSize), maxLeafSize(min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks)), gtype_(gtype) {} // stores the build parameters; maxLeafSize is capped at Primitive::max_size()*BVH::maxLeafBlocks
+
+      void build()
+      {
+	/* skip the build (and reset the BVH) when the scene reports no primitives for gtype_ */
+        const size_t numPrimitives = scene->getNumPrimitives(gtype_,true);
+        if (numPrimitives == 0) { bvh->clear(); return; }
+
+        double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAH");
+
+#if PROFILE
+        profile(2,PROFILE_RUNS,numPrimitives,[&] (ProfileTimer& timer) {
+#endif
+            /* the single-segment path below is commented out: every build goes through buildMultiSegment */
+            //const size_t numTimeSteps = scene->getNumTimeSteps<typename Mesh::type_t,true>();
+            //const size_t numTimeSegments = numTimeSteps-1; assert(numTimeSteps > 1);
+
+            /*if (numTimeSegments == 1)
+              buildSingleSegment(numPrimitives);
+              else*/
+              buildMultiSegment(numPrimitives);
+
+#if PROFILE
+          });
+#endif
+
+	/* finish the build: let the BVH clean up, then hand the start time t0 to postBuild */
+	bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+#if 0 // No longer compatible when time_ranges are present for geometries. Would have to create temporal nodes sometimes, and put only a single geometry into leaf.
+      void buildSingleSegment(size_t numPrimitives)
+      {
+        /* create primref array (local to this call) */
+        mvector<PrimRef> prims(scene->device,numPrimitives);
+        const PrimInfo pinfo = createPrimRefArrayMBlur(scene,gtype_,prims,bvh->scene->progressInterface,0);
+        /* early out if no valid primitives */
+        if (pinfo.size() == 0) { bvh->clear(); return; }
+        /* estimate acceleration structure size */
+        const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N);
+        const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive));
+        bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+        /* settings for BVH build */
+        GeneralBVHBuilder::Settings settings;
+        settings.branchingFactor = N;
+        settings.maxDepth = BVH::maxBuildDepthLeaf;
+        settings.logBlockSize = bsr(sahBlockSize);
+        settings.minLeafSize = min(minLeafSize,maxLeafSize);
+        settings.maxLeafSize = maxLeafSize;
+        settings.travCost = travCost;
+        settings.intCost = intCost;
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+        /* build hierarchy (uses CreateMBlurLeaf, which is likewise compiled out with #if 0 earlier in this file) */
+        auto root = BVHBuilderBinnedSAH::build<NodeRecordMB>
+          (typename BVH::CreateAlloc(bvh),typename BVH::AABBNodeMB::Create(),typename BVH::AABBNodeMB::Set(),
+           CreateMBlurLeaf<N,Primitive>(bvh,prims.data(),0),bvh->scene->progressInterface,
+           prims.data(),pinfo,settings);
+
+        bvh->set(root.ref,root.lbounds,pinfo.size());
+      }
+#endif // disabled buildSingleSegment
+
+      void buildMultiSegment(size_t numPrimitives)
+      {
+        /* create the motion-blur primref array (local to this call) for the scene geometry matching gtype_ */
+        mvector<PrimRefMB> prims(scene->device,numPrimitives);
+        PrimInfoMB pinfo = createPrimRefArrayMSMBlur(scene,gtype_,prims,bvh->scene->progressInterface);
+
+        /* early out if no valid primitives; build() still runs cleanup/postBuild afterwards */
+        if (pinfo.size() == 0) { bvh->clear(); return; }
+
+        /* estimate acceleration structure size from the number of time segments (not the primitive count) */
+        const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N);
+        const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(Primitive));
+        bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+        /* settings for BVH build; minLeafSize is clamped so it never exceeds maxLeafSize */
+        BVHBuilderMSMBlur::Settings settings;
+        settings.branchingFactor = N;
+        settings.maxDepth = BVH::maxDepth;
+        settings.logBlockSize = bsr(sahBlockSize);
+        settings.minLeafSize = min(minLeafSize,maxLeafSize);
+        settings.maxLeafSize = maxLeafSize;
+        settings.travCost = travCost;
+        settings.intCost = intCost;
+        settings.singleLeafTimeSegment = Primitive::singleTimeSegment;
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+        
+        /* build hierarchy with 4D motion-blur nodes (AABBNodeMB4D) and CreateMSMBlurLeaf leaves */
+        auto root =
+          BVHBuilderMSMBlur::build<NodeRef>(prims,pinfo,scene->device,
+                                            RecalculatePrimRef<Mesh>(scene),
+                                            typename BVH::CreateAlloc(bvh),
+                                            typename BVH::AABBNodeMB4D::Create(),
+                                            typename BVH::AABBNodeMB4D::Set(),
+                                            CreateMSMBlurLeaf<N,Mesh,Primitive>(bvh),
+                                            bvh->scene->progressInterface,
+                                            settings);
+
+        bvh->set(root.ref,root.lbounds,pinfo.num_time_segments);
+      }
+
+      void clear() { // intentionally empty: the primref array is a local of buildMultiSegment, so nothing is held between builds
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+    struct GridRecalculatePrimRef
+    {
+      /* scene gives access to the grid meshes; sgrids is indexed by the primID stored in each PrimRefMB */
+      Scene* scene;
+      const SubGridBuildData * const sgrids;
+
+      __forceinline GridRecalculatePrimRef (Scene* scene, const SubGridBuildData * const sgrids)
+        : scene(scene), sgrids(sgrids) {}
+
+        /* Rebuilds the PrimRefMB of a subgrid for time_range. The incoming geomID and
+           primID (the index into sgrids) are carried over unchanged. */
+        __forceinline PrimRefMB operator() (const PrimRefMB& prim, const BBox1f time_range) const
+        {
+          const LBBox3fa lbounds = linearBounds(prim,time_range);
+          const unsigned int geomID = prim.geomID();
+          const GridMesh* mesh = scene->get<GridMesh>(geomID);
+          /* segment counts are read from the owning mesh */
+          const unsigned totalSegments = mesh->numTimeSegments();
+          const range<int> activeSegments = mesh->timeSegmentRange(time_range);
+          return PrimRefMB (lbounds, activeSegments.size(), mesh->time_range, totalSegments, geomID, prim.primID());
+        }
+
+        /* Linear bounds over time_range of the subgrid that prim refers to: the build
+           record selects the grid (primID) and the subgrid position (x,y) inside it. */
+        __forceinline LBBox3fa linearBounds(const PrimRefMB& prim, const BBox1f time_range) const
+        {
+          const SubGridBuildData& record = sgrids[prim.primID()];
+          const GridMesh* mesh = scene->get<GridMesh>(prim.geomID());
+          const unsigned int gridID = record.primID;
+          const size_t sx = record.x();
+          const size_t sy = record.y();
+          return mesh->linearBounds(mesh->grid(gridID),sx,sy,time_range);
+        }
+
+    };
+
+    /* Leaf creation functor for the grid motion blur builder. The
+     * subgrids of a build record are grouped by geomID and one
+     * SubGridMBQBVHN<N> block is emitted per distinct geomID. */
+    template<int N>
+    struct CreateMSMBlurLeafGrid
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecordMB4D NodeRecordMB4D;
+
+      __forceinline CreateMSMBlurLeafGrid (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids)
+        : scene(scene), bvh(bvh), sgrids(sgrids) {}
+
+      __forceinline const NodeRecordMB4D operator() (const BVHBuilderMSMBlur::BuildRecord& current, const FastAllocator::CachedAllocator& alloc) const
+      {
+        const size_t count = current.prims.size();
+        const size_t first = current.prims.begin();
+        const PrimRefMB* prims = current.prims.prims->data();
+        assert(count <= N);
+
+        /* gather the distinct geomIDs referenced by this leaf */
+        unsigned int uniqueGeomIDs[N];
+        unsigned int numUnique = 0;
+        uniqueGeomIDs[numUnique++] = prims[first].geomID();
+
+        for (size_t i=1; i<count; i++)
+        {
+          const unsigned int candidate = prims[first+i].geomID();
+          size_t j = 0;
+          while (j < numUnique && uniqueGeomIDs[j] != candidate) j++;
+          if (j == numUnique)
+            uniqueGeomIDs[numUnique++] = candidate;
+        }
+
+        /* a single allocation holds the blocks of all geometries */
+        SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) alloc.malloc1(numUnique*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment);
+        typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,numUnique);
+
+        LBBox3fa leafBounds = empty;
+
+        for (size_t g=0; g<numUnique; g++)
+        {
+          const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(uniqueGeomIDs[g]);
+
+          /* per-subgrid data of the block built for this geometry */
+          unsigned int subX[N];
+          unsigned int subY[N];
+          unsigned int subPrimID[N];
+          BBox3fa bounds0[N];
+          BBox3fa bounds1[N];
+          unsigned int filled = 0;
+
+          for (size_t i=0; i<count; i++)
+          {
+            /* only the subgrids belonging to the current geometry */
+            if (unlikely(prims[first+i].geomID() != uniqueGeomIDs[g])) continue;
+
+            const SubGridBuildData& sg = sgrids[prims[first+i].primID()];
+            subX[filled] = sg.sx;
+            subY[filled] = sg.sy;
+            subPrimID[filled] = sg.primID;
+
+            const size_t gx = sg.x();
+            const size_t gy = sg.y();
+            LBBox3fa subBounds = mesh->linearBounds(mesh->grid(sg.primID),gx,gy,current.prims.time_range);
+            leafBounds.extend(subBounds);
+            bounds0[filled] = subBounds.bounds0;
+            bounds1[filled] = subBounds.bounds1;
+            filled++;
+          }
+          assert(filled <= N);
+          new (&accel[g]) SubGridMBQBVHN<N>(subX,subY,subPrimID,bounds0,bounds1,uniqueGeomIDs[g],current.prims.time_range.lower,1.0f/current.prims.time_range.size(),filled);
+        }
+        return NodeRecordMB4D(node,leafBounds,current.prims.time_range);
+      }
+
+      Scene *scene;
+      BVH* bvh;
+      const SubGridBuildData * const sgrids;
+    };
+
+#if 0
+    /* NOTE: compiled out. Leaf creation functor for a single-segment
+     * motion blur grid build; its only user in this file is the
+     * buildSingleSegment() method of BVHNBuilderMBlurSAHGrid, which is
+     * disabled with "#if 0" as well. Bounds are taken at time steps 0
+     * and 1 and the leaf blocks are created for the time range [0,1]. */
+    template<int N>
+    struct CreateLeafGridMB
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::NodeRecordMB NodeRecordMB;
+
+      __forceinline CreateLeafGridMB (Scene* scene, BVH* bvh, const SubGridBuildData * const sgrids) 
+		  : scene(scene), bvh(bvh), sgrids(sgrids) {}
+
+      __forceinline NodeRecordMB operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        const size_t items = set.size(); 
+        const size_t start = set.begin();
+
+        /* collect all subsets with unique geomIDs */
+        assert(items <= N);
+        unsigned int geomIDs[N];
+        unsigned int num_geomIDs = 1;
+        geomIDs[0] = prims[start].geomID();
+
+        for (size_t i=1;i<items;i++)
+        {
+          bool found = false;
+          const unsigned int new_geomID = prims[start+i].geomID();
+          for (size_t j=0;j<num_geomIDs;j++)
+            if (new_geomID == geomIDs[j])
+            { found = true; break; }
+          if (!found) 
+            geomIDs[num_geomIDs++] = new_geomID;
+        }
+
+        /* allocate all leaf memory in one single block */
+        SubGridMBQBVHN<N>* accel = (SubGridMBQBVHN<N>*) alloc.malloc1(num_geomIDs*sizeof(SubGridMBQBVHN<N>),BVH::byteAlignment);
+        typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,num_geomIDs);
+
+        LBBox3fa allBounds = empty;
+
+        /* build one leaf block per distinct geomID */
+        for (size_t g=0;g<num_geomIDs;g++)
+        {
+          const GridMesh* __restrict__ const mesh = scene->get<GridMesh>(geomIDs[g]);
+
+          unsigned int x[N];
+          unsigned int y[N];
+          unsigned int primID[N];
+          BBox3fa bounds0[N];
+          BBox3fa bounds1[N];
+          unsigned int pos = 0;
+          for (size_t i=0;i<items;i++)
+          {
+            if (unlikely(prims[start+i].geomID() != geomIDs[g])) continue;
+
+            const SubGridBuildData  &sgrid_bd = sgrids[prims[start+i].primID()];                      
+            x[pos] = sgrid_bd.sx;
+            y[pos] = sgrid_bd.sy;
+            primID[pos] = sgrid_bd.primID;
+            /* from here on x and y name the scalars below and shadow the arrays above */
+            const size_t x = sgrid_bd.x();
+            const size_t y = sgrid_bd.y();
+            bool MAYBE_UNUSED valid0 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,0,bounds0[pos]);
+            bool MAYBE_UNUSED valid1 = mesh->buildBounds(mesh->grid(sgrid_bd.primID),x,y,1,bounds1[pos]);
+            assert(valid0);
+            assert(valid1);
+            allBounds.extend(LBBox3fa(bounds0[pos],bounds1[pos]));
+            pos++;
+          }
+          new (&accel[g]) SubGridMBQBVHN<N>(x,y,primID,bounds0,bounds1,geomIDs[g],0.0f,1.0f,pos);
+        }
+        return NodeRecordMB(node,allBounds);
+      }
+
+      Scene *scene;
+      BVH* bvh;
+      const SubGridBuildData * const sgrids;
+    };
+#endif
+
+
+    /* Motion blur BVH builder for grid meshes; creates 4D nodes with internal time splits */
+    template<int N>
+    struct BVHNBuilderMBlurSAHGrid : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVHN<N>::NodeRef NodeRef;
+      typedef typename BVHN<N>::NodeRecordMB NodeRecordMB;
+      typedef typename BVHN<N>::AABBNodeMB AABBNodeMB;
+
+      BVH* bvh;
+      Scene* scene;
+      const size_t sahBlockSize;
+      const float intCost;
+      const size_t minLeafSize;
+      const size_t maxLeafSize;
+      mvector<SubGridBuildData> sgrids; // per-subgrid build data, indexed by the primID stored in the primrefs
+
+      BVHNBuilderMBlurSAHGrid (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize)
+        : bvh(bvh),
+          scene(scene),
+          sahBlockSize(sahBlockSize),
+          intCost(intCost),
+          minLeafSize(minLeafSize),
+          maxLeafSize(min(maxLeafSize,BVH::maxLeafBlocks)),
+          sgrids(scene->device,0) {}
+
+      /* Creates one PrimRef and one SubGridBuildData entry per subgrid,
+       * using the bounds of time step itime. When no primitive is
+       * counted in the first pass, prims and sgrids are not resized. */
+      PrimInfo createPrimRefArrayMBlurGrid(Scene* scene, mvector<PrimRef>& prims, BuildProgressMonitor& progressMonitor, size_t itime)
+      {
+        ParallelForForPrefixSumState<PrimInfo> pstate;
+        Scene::Iterator<GridMesh,true> iter(scene);
+        pstate.init(iter,size_t(1024));
+
+        /* reduction shared by both passes */
+        const auto mergeInfo = [](const PrimInfo& a, const PrimInfo& b) -> PrimInfo { return PrimInfo::merge(a,b); };
+
+        /* pass 1: count the subgrids of all valid grids */
+        PrimInfo pinfo = parallel_for_for_prefix_sum0( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t /*k*/, size_t geomID) -> PrimInfo {
+
+            PrimInfo local(empty);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              if (mesh->valid(j,range<size_t>(0,1)))
+              {
+                BBox3fa bounds = empty;
+                const PrimRef prim(bounds,unsigned(geomID),unsigned(j));
+                local.add_center2(prim,mesh->getNumSubGrids(j));
+              }
+            }
+            return local;
+          }, mergeInfo);
+
+        const size_t numPrimitives = pinfo.size();
+        if (numPrimitives == 0) return pinfo;
+
+        /* make room for every subgrid */
+        sgrids.resize(numPrimitives);
+        prims.resize(numPrimitives);
+
+        /* pass 2: write the primrefs and the SubGridBuildData entries */
+        pinfo = parallel_for_for_prefix_sum1( pstate, iter, PrimInfo(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t /*k*/, size_t geomID, const PrimInfo& base) -> PrimInfo {
+
+            /* output slot of this task starts at the prefix sum */
+            size_t slot = base.size();
+            PrimInfo local(empty);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              const GridMesh::Grid &g = mesh->grid(j);
+              if (!mesh->valid(j,range<size_t>(0,1))) continue;
+
+              for (unsigned int y=0; y<g.resY-1u; y+=2)
+                for (unsigned int x=0; x<g.resX-1u; x+=2)
+                {
+                  BBox3fa bounds = empty;
+                  if (!mesh->buildBounds(g,x,y,itime,bounds)) continue; // bounds of this subgrid
+                  const PrimRef prim(bounds,unsigned(geomID),unsigned(slot));
+                  local.add_center2(prim);
+                  sgrids[slot] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+                  prims[slot] = prim;
+                  slot++;
+                }
+            }
+            return local;
+          }, mergeInfo);
+
+        assert(pinfo.size() == numPrimitives);
+        return pinfo;
+      }
+
+      /* Creates one PrimRefMB and one SubGridBuildData entry per subgrid,
+       * with linear bounds over the time range t0t1. The time_range of
+       * the returned PrimInfoMB is set to t0t1 unless the first pass
+       * counts no primitives, in which case its result is returned as is. */
+      PrimInfoMB createPrimRefArrayMSMBlurGrid(Scene* scene, mvector<PrimRefMB>& prims, BuildProgressMonitor& progressMonitor, BBox1f t0t1 = BBox1f(0.0f,1.0f))
+      {
+        ParallelForForPrefixSumState<PrimInfoMB> pstate;
+        Scene::Iterator<GridMesh,true> iter(scene);
+        pstate.init(iter,size_t(1024));
+
+        /* reduction shared by both passes */
+        const auto mergeInfo = [](const PrimInfoMB& a, const PrimInfoMB& b) -> PrimInfoMB { return PrimInfoMB::merge2(a,b); };
+
+        /* pass 1: count the subgrids of all valid grids */
+        PrimInfoMB pinfoMB = parallel_for_for_prefix_sum0( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t /*k*/, size_t /*geomID*/) -> PrimInfoMB {
+
+            PrimInfoMB local(empty);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              if (!mesh->valid(j, mesh->timeSegmentRange(t0t1))) continue;
+              PrimInfoMB gridMB(0,mesh->getNumSubGrids(j));
+              local.merge(gridMB);
+            }
+            return local;
+          }, mergeInfo);
+
+        const size_t numPrimitives = pinfoMB.size();
+        if (numPrimitives == 0) return pinfoMB;
+
+        /* make room for every subgrid */
+        sgrids.resize(numPrimitives);
+        prims.resize(numPrimitives);
+
+        /* pass 2: write the primrefs and the SubGridBuildData entries */
+        pinfoMB = parallel_for_for_prefix_sum1( pstate, iter, PrimInfoMB(empty), [&](GridMesh* mesh, const range<size_t>& r, size_t /*k*/, size_t geomID, const PrimInfoMB& base) -> PrimInfoMB {
+
+            /* output slot of this task starts at the prefix sum */
+            size_t slot = base.size();
+            PrimInfoMB local(empty);
+            for (size_t j=r.begin(); j<r.end(); j++)
+            {
+              if (!mesh->valid(j, mesh->timeSegmentRange(t0t1))) continue;
+              const GridMesh::Grid &g = mesh->grid(j);
+
+              for (unsigned int y=0; y<g.resY-1u; y+=2)
+                for (unsigned int x=0; x<g.resX-1u; x+=2)
+                {
+                  const PrimRefMB prim(mesh->linearBounds(g,x,y,t0t1),mesh->numTimeSegments(),mesh->time_range,mesh->numTimeSegments(),unsigned(geomID),unsigned(slot));
+                  local.add_primref(prim);
+                  sgrids[slot] = SubGridBuildData(x | g.get3x3FlagsX(x), y | g.get3x3FlagsY(y), unsigned(j));
+                  prims[slot] = prim;
+                  slot++;
+                }
+            }
+            return local;
+          }, mergeInfo);
+
+        assert(pinfoMB.size() == numPrimitives);
+        pinfoMB.time_range = t0t1;
+        return pinfoMB;
+      }
+
+      /* Builder entry point: clears the BVH for an empty scene, otherwise
+       * runs the multi-segment build between preBuild and postBuild. */
+      void build()
+      {
+        /* skip build for empty scene */
+        const size_t numPrimitives = scene->getNumPrimitives(GridMesh::geom_type,true);
+        if (numPrimitives == 0) {
+          bvh->clear();
+          return;
+        }
+
+        double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderMBlurSAHGrid");
+
+        /* the single-segment special case is disabled (see the "#if 0"
+         * block below), so the multi-segment builder is always used */
+        buildMultiSegment(numPrimitives);
+
+        /* clear temporary data for static geometry */
+        bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+#if 0
+      void buildSingleSegment(size_t numPrimitives)
+      {
+        /* create primref array */
+        mvector<PrimRef> prims(scene->device,numPrimitives);
+        const PrimInfo pinfo = createPrimRefArrayMBlurGrid(scene,prims,bvh->scene->progressInterface,0);
+        /* early out if no valid primitives */
+        if (pinfo.size() == 0) { bvh->clear(); return; }
+
+        /* estimate acceleration structure size */
+        const size_t node_bytes = pinfo.size()*sizeof(AABBNodeMB)/(4*N);
+        //TODO: check leaf_bytes
+        const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>));
+        bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+
+        /* settings for BVH build */
+        GeneralBVHBuilder::Settings settings;
+        settings.branchingFactor = N;
+        settings.maxDepth = BVH::maxBuildDepthLeaf;
+        settings.logBlockSize = bsr(sahBlockSize);
+        settings.minLeafSize = min(minLeafSize,maxLeafSize);
+        settings.maxLeafSize = maxLeafSize;
+        settings.travCost = travCost;
+        settings.intCost = intCost;
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+        /* build hierarchy */
+        auto root = BVHBuilderBinnedSAH::build<NodeRecordMB>
+          (typename BVH::CreateAlloc(bvh),
+           typename BVH::AABBNodeMB::Create(),
+           typename BVH::AABBNodeMB::Set(),
+           CreateLeafGridMB<N>(scene,bvh,sgrids.data()),
+           bvh->scene->progressInterface,
+           prims.data(),pinfo,settings);
+
+        bvh->set(root.ref,root.lbounds,pinfo.size());
+      }
+#endif
+
+      /* Builds the 4D hierarchy over all subgrids; numPrimitives is used
+       * to size the primref array and to estimate the leaf memory. */
+      void buildMultiSegment(size_t numPrimitives)
+      {
+        /* primref array, filled together with sgrids */
+        mvector<PrimRefMB> prims(scene->device,numPrimitives);
+        PrimInfoMB pinfo = createPrimRefArrayMSMBlurGrid(scene,prims,bvh->scene->progressInterface);
+
+        /* nothing valid to build from */
+        if (pinfo.size() == 0) {
+          bvh->clear();
+          return;
+        }
+
+        GridRecalculatePrimRef recalculatePrimRef(scene,sgrids.data());
+
+        /* rough size estimate for the allocator */
+        const size_t node_bytes = pinfo.num_time_segments*sizeof(AABBNodeMB)/(4*N);
+        //FIXME: check leaf_bytes
+        //const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.num_time_segments)*sizeof(SubGridQBVHN<N>));
+        const size_t leaf_bytes = size_t(1.2*(float)numPrimitives/N * sizeof(SubGridQBVHN<N>));
+        const size_t estimated_bytes = node_bytes+leaf_bytes;
+
+        bvh->alloc.init_estimate(estimated_bytes);
+
+        /* builder configuration */
+        BVHBuilderMSMBlur::Settings settings;
+        settings.branchingFactor = N;
+        settings.maxDepth = BVH::maxDepth;
+        settings.logBlockSize = bsr(sahBlockSize);
+        settings.minLeafSize = min(minLeafSize,maxLeafSize);
+        settings.maxLeafSize = maxLeafSize;
+        settings.travCost = travCost;
+        settings.intCost = intCost;
+        settings.singleLeafTimeSegment = false;
+        settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),estimated_bytes);
+
+        /* run the builder and install the resulting root */
+        auto root =
+          BVHBuilderMSMBlur::build<NodeRef>(prims,pinfo,scene->device,
+                                            recalculatePrimRef,
+                                            typename BVH::CreateAlloc(bvh),
+                                            typename BVH::AABBNodeMB4D::Create(),
+                                            typename BVH::AABBNodeMB4D::Set(),
+                                            CreateMSMBlurLeafGrid<N>(scene,bvh,sgrids.data()),
+                                            bvh->scene->progressInterface,
+                                            settings);
+        bvh->set(root.ref,root.lbounds,pinfo.num_time_segments);
+      }
+
+      /* intentionally a no-op as written */
+      void clear() {}
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    /* factories for the motion blur SAH builders over triangle meshes */
+    Builder* BVH4Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4i> BuilderTy;
+      return new BuilderTy(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH);
+    }
+    Builder* BVH4Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderMBlurSAH<4,TriangleMesh,Triangle4vMB> BuilderTy;
+      return new BuilderTy(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH);
+    }
+#if defined(__AVX__)
+    Builder* BVH8Triangle4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4i> BuilderTy;
+      return new BuilderTy(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH);
+    }
+    Builder* BVH8Triangle4vMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderMBlurSAH<8,TriangleMesh,Triangle4vMB> BuilderTy;
+      return new BuilderTy(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,Geometry::MTY_TRIANGLE_MESH);
+    }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+    /* factories for the motion blur SAH builders over quad meshes */
+    Builder* BVH4Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderMBlurSAH<4,QuadMesh,Quad4i> BuilderTy;
+      return new BuilderTy(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH);
+    }
+#if defined(__AVX__)
+    Builder* BVH8Quad4iMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderMBlurSAH<8,QuadMesh,Quad4i> BuilderTy;
+      return new BuilderTy(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,Geometry::MTY_QUAD_MESH);
+    }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+    /* factories for the motion blur SAH builders over user geometries;
+     * the leaf size limits are read from the device */
+    Builder* BVH4VirtualMBSceneBuilderSAH    (void* bvh, Scene* scene, size_t mode)
+    {
+      typedef BVHNBuilderMBlurSAH<4,UserGeometry,Object> BuilderTy;
+      int minLeafSize = scene->device->object_accel_mb_min_leaf_size;
+      int maxLeafSize = scene->device->object_accel_mb_max_leaf_size;
+      return new BuilderTy(static_cast<BVH4*>(bvh),scene,4,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY);
+    }
+#if defined(__AVX__)
+    Builder* BVH8VirtualMBSceneBuilderSAH    (void* bvh, Scene* scene, size_t mode)
+    {
+      typedef BVHNBuilderMBlurSAH<8,UserGeometry,Object> BuilderTy;
+      int minLeafSize = scene->device->object_accel_mb_min_leaf_size;
+      int maxLeafSize = scene->device->object_accel_mb_max_leaf_size;
+      return new BuilderTy(static_cast<BVH8*>(bvh),scene,8,1.0f,minLeafSize,maxLeafSize,Geometry::MTY_USER_GEOMETRY);
+    }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    /* factories for the motion blur SAH builders over instances; the
+     * geometry type mask is forwarded and both leaf size limits are 1 */
+    Builder* BVH4InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) {
+      typedef BVHNBuilderMBlurSAH<4,Instance,InstancePrimitive> BuilderTy;
+      return new BuilderTy(static_cast<BVH4*>(bvh),scene,4,1.0f,1,1,gtype);
+    }
+#if defined(__AVX__)
+    Builder* BVH8InstanceMBSceneBuilderSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype) {
+      typedef BVHNBuilderMBlurSAH<8,Instance,InstancePrimitive> BuilderTy;
+      return new BuilderTy(static_cast<BVH8*>(bvh),scene,8,1.0f,1,1,gtype);
+    }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_GRID)
+    /* factories for the motion blur SAH builders over grid meshes */
+    Builder* BVH4GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderMBlurSAHGrid<4> BuilderTy;
+      return new BuilderTy(static_cast<BVH4*>(bvh),scene,4,1.0f,4,4);
+    }
+#if defined(__AVX__)
+    Builder* BVH8GridMBSceneBuilderSAH (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderMBlurSAHGrid<8> BuilderTy;
+      return new BuilderTy(static_cast<BVH8*>(bvh),scene,8,1.0f,8,8);
+    }
+#endif
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp
new file mode 100644
index 0000000000..285b38c39d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_sah_spatial.cpp
@@ -0,0 +1,201 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh.h"
+#include "bvh_builder.h"
+
+#include "../builders/primrefgen.h"
+#include "../builders/primrefgen_presplit.h"
+#include "../builders/splitter.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglev_mb.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+#include "../geometry/subgrid.h"
+
+#include "../common/state.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /* Leaf creation functor of the spatial split builder: packs the
+     * primrefs of a range into Primitive blocks and returns the encoded
+     * leaf reference. */
+    template<int N, typename Primitive>
+    struct CreateLeafSpatial
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+
+      __forceinline CreateLeafSpatial (BVH* bvh) : bvh(bvh) {}
+
+      __forceinline NodeRef operator() (const PrimRef* prims, const range<size_t>& set, const FastAllocator::CachedAllocator& alloc) const
+      {
+        /* number of Primitive blocks required for the range */
+        const size_t numBlocks = Primitive::blocks(set.size());
+
+        /* all blocks of the leaf live in one allocation */
+        Primitive* accel = (Primitive*) alloc.malloc1(numBlocks*sizeof(Primitive),BVH::byteAlignment);
+        typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,numBlocks);
+
+        /* every block is filled from the running position up to the end of the range */
+        size_t cursor = set.begin();
+        for (size_t block=0; block<numBlocks; block++)
+          accel[block].fill(prims,cursor,set.end(),bvh->scene);
+
+        return node;
+      }
+
+      BVH* bvh;
+    };
+
+    /* SAH BVH builder with spatial splits. Depending on the device
+     * setting and on the largest geomID it either pre-splits the
+     * primitives and runs the regular builder, or runs the spatial split
+     * builder directly. The builder works either on a whole scene
+     * (mesh == nullptr) or on a single mesh (scene == nullptr). */
+    template<int N, typename Mesh, typename Primitive, typename Splitter>
+    struct BVHNBuilderFastSpatialSAH : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      BVH* bvh;
+      Scene* scene;      // nullptr when building for a single mesh
+      Mesh* mesh;        // nullptr when building for a whole scene
+      mvector<PrimRef> prims0;
+      GeneralBVHBuilder::Settings settings;
+      const float splitFactor;
+      unsigned int geomID_ = std::numeric_limits<unsigned int>::max();
+      unsigned int numPreviousPrimitives = 0;
+
+      /* scene constructor */
+      BVHNBuilderFastSpatialSAH (BVH* bvh, Scene* scene, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+        : bvh(bvh), scene(scene), mesh(nullptr), prims0(scene->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD),
+          splitFactor(scene->device->max_spatial_split_replications) {}
+
+      /* Single mesh constructor. There is no constructor parameter named
+       * scene here, so "scene" in the initializer list names the member,
+       * which has just been set to nullptr. The device settings are
+       * therefore read through bvh->device (already used for prims0)
+       * instead of dereferencing the null scene member. */
+      BVHNBuilderFastSpatialSAH (BVH* bvh, Mesh* mesh, const unsigned int geomID, const size_t sahBlockSize, const float intCost, const size_t minLeafSize, const size_t maxLeafSize, const size_t mode)
+        : bvh(bvh), scene(nullptr), mesh(mesh), prims0(bvh->device,0), settings(sahBlockSize, minLeafSize, min(maxLeafSize,Primitive::max_size()*BVH::maxLeafBlocks), travCost, intCost, DEFAULT_SINGLE_THREAD_THRESHOLD),
+          splitFactor(bvh->device->max_spatial_split_replications), geomID_(geomID) {}
+
+      // FIXME: shrink bvh->alloc in destructor here and in other builders too
+
+      /* Builds the BVH for the scene or the mesh this builder was created for. */
+      void build()
+      {
+        /* we reset the allocator when the mesh size changed */
+        if (mesh && mesh->numPrimitives != numPreviousPrimitives) {
+          bvh->alloc.clear();
+        }
+
+        /* skip build for empty scene */
+        const size_t numOriginalPrimitives = mesh ? mesh->size() : scene->getNumPrimitives(Mesh::geom_type,false);
+        numPreviousPrimitives = (unsigned int)numOriginalPrimitives;
+        if (numOriginalPrimitives == 0) {
+          prims0.clear();
+          bvh->clear();
+          return;
+        }
+
+        const unsigned int maxGeomID = mesh ? geomID_ : scene->getMaxGeomID<Mesh,false>();
+
+        /* scene is nullptr in single mesh mode, so the device is reached through the BVH */
+        const bool usePreSplits = bvh->device->useSpatialPreSplits || (maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS)));
+        double t0 = bvh->preBuild(mesh ? "" : TOSTRING(isa) "::BVH" + toString(N) + (usePreSplits ? "BuilderFastSpatialPresplitSAH" : "BuilderFastSpatialSAH"));
+
+        /* create primref array, with room for the split replications */
+        const size_t numSplitPrimitives = max(numOriginalPrimitives,size_t(splitFactor*numOriginalPrimitives));
+        prims0.resize(numSplitPrimitives);
+
+        /* enable os_malloc for two level build */
+        if (mesh)
+          bvh->alloc.setOSallocation(true);
+
+        NodeRef root(0);
+        PrimInfo pinfo;
+
+        if (likely(usePreSplits))
+        {
+          /* spatial presplit SAH BVH builder */
+          pinfo = mesh ?
+            createPrimRefArray_presplit<Mesh,Splitter>(mesh,maxGeomID,numOriginalPrimitives,prims0,bvh->scene->progressInterface) :
+            createPrimRefArray_presplit<Mesh,Splitter>(scene,Mesh::geom_type,false,numOriginalPrimitives,prims0,bvh->scene->progressInterface);
+
+          const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N);
+          const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive));
+          bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+          settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+          settings.branchingFactor = N;
+          settings.maxDepth = BVH::maxBuildDepthLeaf;
+
+          /* call BVH builder */
+          root = BVHNBuilderVirtual<N>::build(&bvh->alloc,CreateLeafSpatial<N,Primitive>(bvh),bvh->scene->progressInterface,prims0.data(),pinfo,settings);
+        }
+        else
+        {
+          /* standard spatial split SAH BVH builder */
+          pinfo = mesh ?
+            createPrimRefArray(mesh,geomID_,/*numSplitPrimitives,*/prims0,bvh->scene->progressInterface) :
+            createPrimRefArray(scene,Mesh::geom_type,false,/*numSplitPrimitives,*/prims0,bvh->scene->progressInterface);
+
+          /* The splitter must not be handed the null scene member of the
+           * single mesh mode; fall back to the scene of the BVH, which
+           * this function already relies on for progressInterface.
+           * NOTE(review): assumes bvh->scene is the scene owning the mesh - confirm. */
+          Splitter splitter(scene ? scene : bvh->scene);
+
+          const size_t node_bytes = pinfo.size()*sizeof(typename BVH::AABBNode)/(4*N);
+          const size_t leaf_bytes = size_t(1.2*Primitive::blocks(pinfo.size())*sizeof(Primitive));
+          bvh->alloc.init_estimate(node_bytes+leaf_bytes);
+          settings.singleThreadThreshold = bvh->alloc.fixSingleThreadThreshold(N,DEFAULT_SINGLE_THREAD_THRESHOLD,pinfo.size(),node_bytes+leaf_bytes);
+
+          settings.branchingFactor = N;
+          settings.maxDepth = BVH::maxBuildDepthLeaf;
+
+          /* call BVH builder */
+          root = BVHBuilderBinnedFastSpatialSAH::build<NodeRef>(
+            typename BVH::CreateAlloc(bvh),
+            typename BVH::AABBNode::Create2(),
+            typename BVH::AABBNode::Set2(),
+            CreateLeafSpatial<N,Primitive>(bvh),
+            splitter,
+            bvh->scene->progressInterface,
+            prims0.data(),
+            numSplitPrimitives,
+            pinfo,settings);
+        }
+
+        bvh->set(root,LBBox3fa(pinfo.geomBounds),pinfo.size());
+        bvh->layoutLargeNodes(size_t(pinfo.size()*0.005f));
+
+        /* clear temporary data for static geometry */
+        if (scene && scene->isStaticAccel()) {
+          prims0.clear();
+        }
+        bvh->cleanup();
+        bvh->postBuild(t0);
+      }
+
+      /* releases the primref array */
+      void clear() {
+        prims0.clear();
+      }
+    };
+
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+    /************************************************************************************/
+
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+
+    /* factories for the spatial split SAH builders over triangle meshes */
+    Builder* BVH4Triangle4SceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4,TriangleSplitterFactory> BuilderTy;
+      return new BuilderTy(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,mode);
+    }
+    Builder* BVH4Triangle4vSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4v,TriangleSplitterFactory> BuilderTy;
+      return new BuilderTy(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,mode);
+    }
+    Builder* BVH4Triangle4iSceneBuilderFastSpatialSAH (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderFastSpatialSAH<4,TriangleMesh,Triangle4i,TriangleSplitterFactory> BuilderTy;
+      return new BuilderTy(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,mode);
+    }
+
+#if defined(__AVX__)
+    Builder* BVH8Triangle4SceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4,TriangleSplitterFactory> BuilderTy;
+      return new BuilderTy(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,mode);
+    }
+    Builder* BVH8Triangle4vSceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderFastSpatialSAH<8,TriangleMesh,Triangle4v,TriangleSplitterFactory> BuilderTy;
+      return new BuilderTy(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,mode);
+    }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD)
+    /* factories for the spatial split SAH builders over quad meshes */
+    Builder* BVH4Quad4vSceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderFastSpatialSAH<4,QuadMesh,Quad4v,QuadSplitterFactory> BuilderTy;
+      return new BuilderTy(static_cast<BVH4*>(bvh),scene,4,1.0f,4,inf,mode);
+    }
+
+#if defined(__AVX__)
+    Builder* BVH8Quad4vSceneBuilderFastSpatialSAH  (void* bvh, Scene* scene, size_t mode) {
+      typedef BVHNBuilderFastSpatialSAH<8,QuadMesh,Quad4v,QuadSplitterFactory> BuilderTy;
+      return new BuilderTy(static_cast<BVH8*>(bvh),scene,4,1.0f,4,inf,mode);
+    }
+#endif
+
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp
new file mode 100644
index 0000000000..1a78f347ac
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.cpp
@@ -0,0 +1,377 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_builder_twolevel.h"
+#include "bvh_statistics.h"
+#include "../builders/bvh_builder_sah.h"
+#include "../common/scene_line_segments.h"
+#include "../common/scene_triangle_mesh.h"
+#include "../common/scene_quad_mesh.h"
+
+#define PROFILE 0
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, typename Mesh, typename Primitive> // Constructor: stores bvh and scene as raw pointers plus the build options; refs and prims are created on scene->device with initial size argument 0.
+    BVHNBuilderTwoLevel<N,Mesh,Primitive>::BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder, const size_t singleThreadThreshold)
+      : bvh(bvh), scene(scene), refs(scene->device,0), prims(scene->device,0), nextRef(0), singleThreadThreshold(singleThreadThreshold), gtype(gtype), useMortonBuilder_(useMortonBuilder) {} // nextRef(0): a default-constructed std::atomic<int> would stay indeterminate until build() stores 0
+    
+    template<int N, typename Mesh, typename Primitive>
+    BVHNBuilderTwoLevel<N,Mesh,Primitive>::~BVHNBuilderTwoLevel () { // empty body: no explicit cleanup is done here; bvh and scene are raw pointers and are not deleted by this destructor
+    }
+
+    // ===========================================================================
+    // ===========================================================================
+    // ===========================================================================
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::build()
+    {
+      /* Builder entry point: sets up the per-mesh ref builders, collects their BuildRefs and builds the top-level BVH over them. First step: delete the ref builders and object BVHs whose ID lies beyond the current scene size. */
+      size_t num = scene->size();
+      if (num < bvh->objects.size()) {
+        parallel_for(num, bvh->objects.size(), [&] (const range<size_t>& r) {
+            for (size_t i=r.begin(); i<r.end(); i++) {
+              builders[i].reset();
+              delete bvh->objects[i]; bvh->objects[i] = nullptr;
+            }
+          });
+      }
+      /* with PROFILE set, the braces below become the body of an endless while(1) loop (left only by the empty-scene return) */
+#if PROFILE
+      while(1) 
+#endif
+      {
+      /* reset memory allocator */
+      bvh->alloc.reset();
+      
+      /* skip build for empty scene */
+      const size_t numPrimitives = scene->getNumPrimitives(gtype,false);
+
+      if (numPrimitives == 0) {
+        prims.resize(0);
+        bvh->set(BVH::emptyNode,empty,0);
+        return;
+      }
+
+      /* calculate the size of the entire BVH (only an estimate handed to the allocator via init_estimate) */
+      const size_t numLeafBlocks = Primitive::blocks(numPrimitives);
+      const size_t node_bytes = 2*numLeafBlocks*sizeof(typename BVH::AABBNode)/N;
+      const size_t leaf_bytes = size_t(1.2*numLeafBlocks*sizeof(Primitive));
+      bvh->alloc.init_estimate(node_bytes+leaf_bytes); 
+
+      double t0 = bvh->preBuild(TOSTRING(isa) "::BVH" + toString(N) + "BuilderTwoLevel");
+
+      /* resize object array if scene got larger; builders is kept at the same length as bvh->objects */
+      if (bvh->objects.size()  < num) bvh->objects.resize(num);
+      if (builders.size() < num) builders.resize(num);
+      resizeRefsList ();    // make refs large enough for every ref the passes below may append
+      nextRef.store(0);     // index of the next free slot in refs, advanced by attachBuildRefs
+      
+      /* create acceleration structures: pick (or replace) the ref builder of every supported mesh */
+      parallel_for(size_t(0), num, [&] (const range<size_t>& r)
+      {
+        for (size_t objectID=r.begin(); objectID<r.end(); objectID++)
+        {
+          Mesh* mesh = scene->getSafe<Mesh>(objectID);
+      
+          /* ignore meshes we do not support */
+          if (mesh == nullptr || mesh->numTimeSteps != 1)
+            continue;
+          
+          if (isSmallGeometry(mesh)) {
+             setupSmallBuildRefBuilder (objectID, mesh);
+          } else {
+            setupLargeBuildRefBuilder (objectID, mesh);
+          }
+        }
+      });
+
+      /* parallel build of acceleration structures */
+      parallel_for(size_t(0), num, [&] (const range<size_t>& r)
+      {
+        for (size_t objectID=r.begin(); objectID<r.end(); objectID++)
+        {
+          /* ignore if no triangle mesh or not enabled */
+          Mesh* mesh = scene->getSafe<Mesh>(objectID);
+          if (mesh == nullptr || !mesh->isEnabled() || mesh->numTimeSteps != 1) // unlike the setup pass above, disabled meshes are skipped here
+            continue;
+
+          builders[objectID]->attachBuildRefs (this); // appends the BuildRef(s) of this mesh at refs[nextRef++]
+        }
+      });
+
+
+#if PROFILE
+      double d0 = getSeconds();
+#endif
+      /* fast path for single geometry scenes */
+      if (nextRef == 1) { 
+        bvh->set(refs[0].node,LBBox3fa(refs[0].bounds()),numPrimitives); // the node of the only ref becomes the top-level root
+      }
+
+      else
+      {     
+        /* open all large nodes */
+        refs.resize(nextRef); // shrink refs to the number of refs actually appended
+
+        /* this probably needs some more tuning */
+        const size_t extSize = max(max((size_t)SPLIT_MIN_EXT_SPACE,refs.size()*SPLIT_MEMORY_RESERVE_SCALE),size_t((float)numPrimitives / SPLIT_MEMORY_RESERVE_FACTOR));
+ 
+#if !ENABLE_DIRECT_SAH_MERGE_BUILDER
+
+#if ENABLE_OPEN_SEQUENTIAL
+        open_sequential(extSize); 
+#endif
+        /* compute PrimRefs */
+        prims.resize(refs.size());
+#endif
+        
+#if defined(TASKING_TBB) && defined(__AVX512ER__) && USE_TASK_ARENA // KNL
+        tbb::task_arena limited(min(32,(int)TaskScheduler::threadCount()));
+        limited.execute([&]
+#endif
+        {
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+
+          const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(),  PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo {
+
+              PrimInfo pinfo(empty);
+              for (size_t i=r.begin(); i<r.end(); i++) {
+                pinfo.add_center2(refs[i]);
+              }
+              return pinfo;
+            }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });
+          
+#else
+          const PrimInfo pinfo = parallel_reduce(size_t(0), refs.size(),  PrimInfo(empty), [&] (const range<size_t>& r) -> PrimInfo {
+
+              PrimInfo pinfo(empty);
+              for (size_t i=r.begin(); i<r.end(); i++) {
+                pinfo.add_center2(refs[i]);
+                prims[i] = PrimRef(refs[i].bounds(),(size_t)refs[i].node);
+              }
+              return pinfo;
+            }, [] (const PrimInfo& a, const PrimInfo& b) { return PrimInfo::merge(a,b); });
+#endif   
+       
+          /* skip if all objects were empty */
+          if (pinfo.size() == 0)
+            bvh->set(BVH::emptyNode,empty,0);
+        
+          /* otherwise build toplevel hierarchy */
+          else
+          {
+            /* settings for BVH build */
+            GeneralBVHBuilder::Settings settings;
+            settings.branchingFactor = N;
+            settings.maxDepth = BVH::maxBuildDepthLeaf;
+            settings.logBlockSize = bsr(N);
+            settings.minLeafSize = 1; // min == max == 1: the leaf callbacks below assert range.size() == 1
+            settings.maxLeafSize = 1;
+            settings.travCost = 1.0f;
+            settings.intCost = 1.0f;
+            settings.singleThreadThreshold = singleThreadThreshold;
+      
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+            
+            refs.resize(extSize); // NOTE(review): presumably the extra slots are scratch space for refs produced by openBuildRef - confirm in BVHBuilderBinnedOpenMergeSAH
+         
+            NodeRef root = BVHBuilderBinnedOpenMergeSAH::build<NodeRef,BuildRef>(
+              typename BVH::CreateAlloc(bvh),
+              typename BVH::AABBNode::Create2(),
+              typename BVH::AABBNode::Set2(),
+              
+              [&] (const BuildRef* refs, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef  {
+                assert(range.size() == 1);
+                return (NodeRef) refs[range.begin()].node;
+              },
+              [&] (BuildRef &bref, BuildRef *refs) -> size_t { 
+                return openBuildRef(bref,refs);
+              },              
+              [&] (size_t dn) { bvh->scene->progressMonitor(0); }, // NOTE(review): dn is ignored, progress is always reported as 0
+              refs.data(),extSize,pinfo,settings);
+#else
+            NodeRef root = BVHBuilderBinnedSAH::build<NodeRef>(
+              typename BVH::CreateAlloc(bvh),
+              typename BVH::AABBNode::Create2(),
+              typename BVH::AABBNode::Set2(),
+              
+              [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> NodeRef {
+                assert(range.size() == 1);
+                return (NodeRef) prims[range.begin()].ID();
+              },
+              [&] (size_t dn) { bvh->scene->progressMonitor(0); },
+              prims.data(),pinfo,settings);
+#endif
+
+            
+            bvh->set(root,LBBox3fa(pinfo.geomBounds),numPrimitives);
+          }
+        }
+#if defined(TASKING_TBB) && defined(__AVX512ER__) && USE_TASK_ARENA // KNL
+          );
+#endif
+
+      }  
+        
+      bvh->alloc.cleanup();
+      bvh->postBuild(t0);
+#if PROFILE
+      double d1 = getSeconds();
+      std::cout << "TOP_LEVEL OPENING/REBUILD TIME " << 1000.0*(d1-d0) << " ms" << std::endl;
+#endif
+      }
+
+    }
+    
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::deleteGeometry(size_t geomID)
+    { /* releases the ref builder and the object BVH of one geometry; IDs beyond the object array are ignored */
+      if (bvh->objects.size() <= geomID) return;
+      builders[geomID].reset();   // resetting an already empty slot does nothing
+      delete bvh->objects[geomID]; bvh->objects[geomID] = nullptr;
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::clear()
+    {
+      /* clear every object BVH that exists; the BVH objects themselves stay allocated */
+      for (size_t objectID=0; objectID<bvh->objects.size(); objectID++) {
+        if (bvh->objects[objectID] != nullptr) bvh->objects[objectID]->clear();
+      }
+      /* drop all per-mesh ref builders (reset() on an empty slot does nothing), then the ref list */
+      for (auto& builder : builders) builder.reset();
+      refs.clear();
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::open_sequential(const size_t extSize)
+    {
+      if (refs.size() == 0)
+	return;
+      /* Grows refs towards extSize entries by replacing inner-node refs with refs to their children. build() only calls this when ENABLE_DIRECT_SAH_MERGE_BUILDER is 0 and ENABLE_OPEN_SEQUENTIAL is 1. */
+      refs.reserve(extSize);
+      /* prefetch all inner nodes referenced so far */
+#if 1
+      for (size_t i=0;i<refs.size();i++)
+      {
+        NodeRef ref = refs[i].node;
+        if (ref.isAABBNode())
+          BVH::prefetch(ref);
+      }
+#endif
+      /* refs is used as a max-heap under BuildRef::operator<, i.e. ordered by bounds_area; loop while one more opening (at most N-1 extra refs) still fits into extSize */
+      std::make_heap(refs.begin(),refs.end());
+      while (refs.size()+N-1 <= extSize)
+      {
+        std::pop_heap (refs.begin(),refs.end()); 
+        NodeRef ref = refs.back().node;
+        if (ref.isLeaf()) break; // leaves carry bounds_area 0, so a leaf on top of the heap means no inner node is left to open
+        refs.pop_back();    
+        /* replace the removed ref by one ref per non-empty child */
+        AABBNode* node = ref.getAABBNode();
+        for (size_t i=0; i<N; i++) {
+          if (node->child(i) == BVH::emptyNode) continue;
+          refs.push_back(BuildRef(node->bounds(i),node->child(i)));
+          /* prefetch the child if it is an inner node, then restore the heap order */
+#if 1
+          NodeRef ref_pre = node->child(i);
+          if (ref_pre.isAABBNode())
+            ref_pre.prefetch();
+#endif
+          std::push_heap (refs.begin(),refs.end()); 
+        }
+      }
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupSmallBuildRefBuilder (size_t objectID, Mesh const * const /*mesh*/)
+    {
+      /* Keep an existing RefBuilderSmall. An empty slot (new mesh) or a builder of another kind (size change
+         resulted in large->small change) is replaced; dynamic_cast of a null pointer yields null as well. */
+      auto& slot = builders[objectID];
+      if (dynamic_cast<RefBuilderSmall*>(slot.get()) != nullptr) return;
+      slot.reset (new RefBuilderSmall(objectID));
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNBuilderTwoLevel<N,Mesh,Primitive>::setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh)
+    { /* (re)creates the object BVH and its RefBuilderLarge for objectID whenever the cached pair cannot be reused */
+      if (bvh->objects[objectID] == nullptr || builders[objectID] == nullptr || // new mesh, or builder dropped by clear() while the object BVH was kept (guards the dereferences below)
+          builders[objectID]->meshQualityChanged (mesh->quality) ||             // changed build quality
+          dynamic_cast<RefBuilderLarge*>(builders[objectID].get()) == nullptr)  // size change resulted in small->large change
+      {
+        Builder* builder = nullptr;
+        delete bvh->objects[objectID]; bvh->objects[objectID] = nullptr;        // no dangling pointer is left behind if the allocation in createMeshAccel throws
+        createMeshAccel(objectID, builder);                                     // stores the new BVH in bvh->objects[objectID] and returns its builder
+        builders[objectID].reset (new RefBuilderLarge(objectID, builder, mesh->quality));
+      }
+    }
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE) // BVH4 factories: each casts bvh to BVH4* and returns a newly allocated BVHNBuilderTwoLevel as a raw Builder*
+    Builder* BVH4BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+    Builder* BVH4BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4v>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+    Builder* BVH4BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<4,TriangleMesh,Triangle4i>((BVH4*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+#endif
+    /* quad meshes with Quad4v primitives */
+#if defined(EMBREE_GEOMETRY_QUAD)
+    Builder* BVH4BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+    return new BVHNBuilderTwoLevel<4,QuadMesh,Quad4v>((BVH4*)bvh,scene,QuadMesh::geom_type,useMortonBuilder);
+    }
+#endif
+    /* user geometries with the Object primitive */
+#if defined(EMBREE_GEOMETRY_USER)
+    Builder* BVH4BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+    return new BVHNBuilderTwoLevel<4,UserGeometry,Object>((BVH4*)bvh,scene,UserGeometry::geom_type,useMortonBuilder);
+    }
+#endif
+    /* instances: the geometry type mask is supplied by the caller instead of a fixed geom_type */
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    Builder* BVH4BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<4,Instance,InstancePrimitive>((BVH4*)bvh,scene,gtype,useMortonBuilder);
+    }
+#endif
+
+#if defined(__AVX__) // BVH8 factories, compiled only when __AVX__ is defined: each casts bvh to BVH8* and returns a newly allocated BVHNBuilderTwoLevel as a raw Builder*
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    Builder* BVH8BuilderTwoLevelTriangle4MeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+    Builder* BVH8BuilderTwoLevelTriangle4vMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4v>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+    Builder* BVH8BuilderTwoLevelTriangle4iMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,TriangleMesh,Triangle4i>((BVH8*)bvh,scene,TriangleMesh::geom_type,useMortonBuilder);
+    }
+#endif
+    /* quad meshes with Quad4v primitives */
+#if defined(EMBREE_GEOMETRY_QUAD)
+    Builder* BVH8BuilderTwoLevelQuadMeshSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,QuadMesh,Quad4v>((BVH8*)bvh,scene,QuadMesh::geom_type,useMortonBuilder);
+    }
+#endif
+    /* user geometries with the Object primitive */
+#if defined(EMBREE_GEOMETRY_USER)
+    Builder* BVH8BuilderTwoLevelVirtualSAH (void* bvh, Scene* scene, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,UserGeometry,Object>((BVH8*)bvh,scene,UserGeometry::geom_type,useMortonBuilder);
+    }
+#endif
+    /* instances: the geometry type mask is supplied by the caller instead of a fixed geom_type */
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    Builder* BVH8BuilderTwoLevelInstanceSAH (void* bvh, Scene* scene, Geometry::GTypeMask gtype, bool useMortonBuilder) {
+      return new BVHNBuilderTwoLevel<8,Instance,InstancePrimitive>((BVH8*)bvh,scene,gtype,useMortonBuilder);
+    }
+#endif
+
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h
new file mode 100644
index 0000000000..8f57c3b406
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel.h
@@ -0,0 +1,263 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <type_traits>
+
+#include "bvh_builder_twolevel_internal.h"
+#include "bvh.h"
+#include "../common/primref.h"
+#include "../builders/priminfo.h"
+#include "../builders/primrefgen.h"
+
+/* new open/merge builder: compile-time switches and sizing constants read by BVHNBuilderTwoLevel::build() */
+#define ENABLE_DIRECT_SAH_MERGE_BUILDER 1 /* 1: top level is built with BVHBuilderBinnedOpenMergeSAH directly on the BuildRefs; 0: with BVHBuilderBinnedSAH on PrimRefs */
+#define ENABLE_OPEN_SEQUENTIAL 0 /* only read when the merge builder is disabled: 1 runs open_sequential() before the top-level build */
+#define SPLIT_MEMORY_RESERVE_FACTOR 1000 /* extSize is at least numPrimitives divided by this factor */
+#define SPLIT_MEMORY_RESERVE_SCALE 2 /* extSize is at least refs.size() times this scale */
+#define SPLIT_MIN_EXT_SPACE 1000 /* absolute lower bound of extSize */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, typename Mesh, typename Primitive>
+    class BVHNBuilderTwoLevel : public Builder
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::NodeRef NodeRef;
+
+      __forceinline static bool isSmallGeometry(Mesh* mesh) { // true when mesh->size() is at most 4; build() gives such meshes a RefBuilderSmall, all others a RefBuilderLarge
+        return mesh->size() <= 4;
+      }
+
+    public:
+
+      typedef void (*createMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+
+      struct BuildRef : public PrimRef
+      {
+      public:
+        /* PrimRef extended by the sub-tree root it stands for (node) and by the key BuildRefs are
+         * ordered by (bounds_area): leaves get key 0, inner nodes the area of their bounds. */
+        __forceinline BuildRef () {}
+
+        /* reference built from bounds and a node; the node cast to size_t is forwarded to PrimRef */
+        __forceinline BuildRef (const BBox3fa& bounds, NodeRef node)
+          : PrimRef(bounds,(size_t)node), node(node)
+        {
+          bounds_area = node.isLeaf() ? 0.0f : area(this->bounds());
+        }
+
+        /* used by the open/merge bvh builder: geomID and numPrimitives are forwarded to PrimRef */
+        __forceinline BuildRef (const BBox3fa& bounds, NodeRef node, const unsigned int geomID, const unsigned int numPrimitives)
+          : PrimRef(bounds,geomID,numPrimitives), node(node)
+        {
+          /* important for relative buildref ordering */
+          bounds_area = node.isLeaf() ? 0.0f : area(this->bounds());
+        }
+
+        /* both accessors return the value kept in the primID slot of the PrimRef */
+        __forceinline unsigned int numPrimitives() const { return primID(); }
+
+        __forceinline size_t size() const {
+          return primID();
+        }
+
+        /* ordering by bounds_area only */
+        friend bool operator< (const BuildRef& lhs, const BuildRef& rhs) {
+          return lhs.bounds_area < rhs.bounds_area;
+        }
+
+        /* stream output of all fields */
+        friend __forceinline embree_ostream operator<<(embree_ostream cout, const BuildRef& ref) {
+          return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", center2 = " << ref.center2() << ", geomID = " << ref.geomID() << ", numPrimitives = " << ref.numPrimitives() << ", bounds_area = " << ref.bounds_area << " }";
+        }
+
+      public:
+        NodeRef node;
+        float bounds_area;
+      };
+
+
+      __forceinline size_t openBuildRef(BuildRef &bref, BuildRef *const refs) { // opens bref: writes the refs that replace it to refs[0..n) and returns n
+        if (bref.node.isLeaf())
+        {
+          refs[0] = bref; // a leaf cannot be opened: pass it through unchanged
+          return 1;
+        }
+        NodeRef ref = bref.node;
+        unsigned int geomID   = bref.geomID();
+        unsigned int numPrims = max((unsigned int)bref.numPrimitives() / N,(unsigned int)1); // every child inherits an N-th of the count, at least 1
+        AABBNode* node = ref.getAABBNode();
+        size_t n = 0;
+        for (size_t i=0; i<N; i++) {
+          if (node->child(i) == BVH::emptyNode) continue;
+          /* store at n, not i: the returned range refs[0..n) must stay dense even if an empty child precedes a valid one */
+          refs[n++] = BuildRef(node->bounds(i),node->child(i),geomID,numPrims);
+        }
+        assert(n > 1);
+        return n;        
+      }
+      
+      /*! Constructor. */
+      BVHNBuilderTwoLevel (BVH* bvh, Scene* scene, Geometry::GTypeMask gtype = Mesh::geom_type, bool useMortonBuilder = false, const size_t singleThreadThreshold = DEFAULT_SINGLE_THREAD_THRESHOLD);
+      
+      /*! Destructor */
+      ~BVHNBuilderTwoLevel ();
+      
+      /*! builder entry point */
+      void build();
+      void deleteGeometry(size_t geomID);
+      void clear();
+
+      void open_sequential(const size_t extSize);
+      
+    private:
+
+      class RefBuilderBase { // interface of the per-mesh helpers kept in builders; implemented by RefBuilderSmall and RefBuilderLarge
+      public:
+        virtual ~RefBuilderBase () {}
+        virtual void attachBuildRefs (BVHNBuilderTwoLevel* builder) = 0;   // appends the BuildRef(s) of one mesh at builder->refs[builder->nextRef++]
+        virtual bool meshQualityChanged (RTCBuildQuality currQuality) = 0; // queried by setupLargeBuildRefBuilder; true makes it recreate the object BVH and its builder
+      };
+
+      class RefBuilderSmall : public RefBuilderBase {
+      public:
+        /* Ref builder for small meshes: no separate BVH is built; the primitives are packed into leaf blocks allocated from the top-level BVH and every block becomes one BuildRef. */
+        RefBuilderSmall (size_t objectID)
+          : objectID_ (objectID) {}
+
+        void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder) {
+          /* gather the PrimRefs of the mesh into a temporary array */
+          Mesh* mesh = topBuilder->scene->template getSafe<Mesh>(objectID_);
+          size_t meshSize = mesh->size();
+          assert(isSmallGeometry(mesh));
+          
+          mvector<PrimRef> prefs(topBuilder->scene->device, meshSize);
+          auto pinfo = createPrimRefArray(mesh,objectID_,prefs,topBuilder->bvh->scene->progressInterface);
+          /* NOTE(review): the loop only ends if fill() advances begin (presumably taken by reference) - its declaration is not visible here */
+          size_t begin=0;
+          while (begin < pinfo.size())
+          {
+            Primitive* accel = (Primitive*) topBuilder->bvh->alloc.getCachedAllocator().malloc1(sizeof(Primitive),BVH::byteAlignment);
+            typename BVH::NodeRef node = BVH::encodeLeaf((char*)accel,1);
+            accel->fill(prefs.data(),begin,pinfo.size(),topBuilder->bvh->scene);
+            
+            /* create build primitive (every block of the mesh is registered with the bounds of the whole mesh, pinfo.geomBounds) */
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+            topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node,(unsigned int)objectID_,1);
+#else
+            topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(pinfo.geomBounds,node);
+#endif
+          }
+          assert(begin == pinfo.size());
+        }
+        /* always false: this builder stores no build quality, so a quality change never invalidates it */
+        bool meshQualityChanged (RTCBuildQuality /*currQuality*/) {
+          return false;
+        }
+        
+        size_t  objectID_;
+      };
+
+      class RefBuilderLarge : public RefBuilderBase {
+      public:
+        /* Ref builder for large meshes: the mesh owns a separate BVH (bvh->objects[objectID_]) that builder_ builds; its root becomes a single BuildRef. */
+        RefBuilderLarge (size_t objectID, const Ref<Builder>& builder, RTCBuildQuality quality)
+        : objectID_ (objectID), builder_ (builder), quality_ (quality) {}
+
+        void attachBuildRefs (BVHNBuilderTwoLevel* topBuilder)
+        {
+          BVH* object  = topBuilder->getBVH(objectID_); assert(object);
+          
+          /* build object if it got modified */
+          if (topBuilder->isGeometryModified(objectID_))
+            builder_->build();
+
+          /* create build primitive (objects with empty bounds contribute no ref) */
+          if (!object->getBounds().empty())
+          {
+#if ENABLE_DIRECT_SAH_MERGE_BUILDER
+            Mesh* mesh = topBuilder->getMesh(objectID_);
+            topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root,(unsigned int)objectID_,(unsigned int)mesh->size());
+#else
+            topBuilder->refs[topBuilder->nextRef++] = BVHNBuilderTwoLevel::BuildRef(object->getBounds(),object->root);
+#endif
+          }
+        }
+        /* true when currQuality differs from the quality this builder was created with */
+        bool meshQualityChanged (RTCBuildQuality currQuality) {
+          return currQuality != quality_;
+        }
+
+      private:
+        size_t          objectID_;
+        Ref<Builder>    builder_;
+        RTCBuildQuality quality_;
+      };
+
+      void setupLargeBuildRefBuilder (size_t objectID, Mesh const * const mesh);
+      void setupSmallBuildRefBuilder (size_t objectID, Mesh const * const mesh);
+
+      BVH*  getBVH (size_t objectID) { // object BVH stored for objectID; no bounds check is done here
+        return this->bvh->objects[objectID];
+      }
+      Mesh* getMesh (size_t objectID) { // forwards to scene->getSafe<Mesh>; build() treats a nullptr result as "not a mesh of this type"
+        return this->scene->template getSafe<Mesh>(objectID);
+      }
+      bool  isGeometryModified (size_t objectID) { // forwards to Scene::isGeometryModified
+        return this->scene->isGeometryModified(objectID);
+      }
+
+      void resizeRefsList ()
+      {
+        /* upper bound of the refs one build can append: a small mesh needs one ref per
+           Primitive block, any other supported mesh exactly one, unsupported meshes none */
+        const size_t required = parallel_reduce (size_t(0), scene->size(), size_t(0),
+          [this](const range<size_t>& r)->size_t {
+            size_t count = 0;
+            for (auto objectID=r.begin(); objectID<r.end(); ++objectID) {
+              Mesh* mesh = scene->getSafe<Mesh>(objectID);
+              if (mesh != nullptr && mesh->numTimeSteps == 1) {
+                if (isSmallGeometry(mesh)) count += Primitive::blocks(mesh->size());
+                else                       count += 1;
+              }
+            }
+            return count;
+          },
+          std::plus<size_t>());
+
+        /* the list is only ever grown here; a larger array left from an earlier build is kept */
+        if (required > refs.size()) refs.resize(required);
+      }
+
+      void createMeshAccel (size_t geomID, Builder*& builder)
+      {
+        /* the new object BVH is stored in bvh->objects before the mesh lookup, so the slot holds it even when the lookup fails */
+        BVH* accel = new BVH(Primitive::type,scene);
+        bvh->objects[geomID] = accel;
+        Mesh* mesh = scene->getSafe<Mesh>(geomID);
+        if (mesh == nullptr) {
+          throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"geomID does not return correct type");
+          return;
+        }
+        __internal_two_level_builder__::MeshBuilder<N,Mesh,Primitive>()(accel, mesh, geomID, this->gtype, this->useMortonBuilder_, builder);
+      }      
+
+      using BuilderList = std::vector<std::unique_ptr<RefBuilderBase>>;
+
+      BuilderList         builders;
+      BVH*                bvh;
+      Scene*              scene;      
+      mvector<BuildRef>   refs;
+      mvector<PrimRef>    prims;
+      std::atomic<int>    nextRef;
+      const size_t        singleThreadThreshold;
+      Geometry::GTypeMask gtype;
+      bool                useMortonBuilder_ = false;
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h
new file mode 100644
index 0000000000..1c1ae8d6a7
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_builder_twolevel_internal.h
@@ -0,0 +1,267 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/quadi.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+
+namespace embree
+{
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); // BVH4 factories; invoked below as (bvh, mesh, geomID, 0)
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t); // instance factories additionally take the geometry type mask
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH4InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) // NOTE(review): no trailing ';' unlike the neighbouring lines - presumably harmless, confirm against the macro definition
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t); // BVH8 factories, same argument lists as the BVH4 ones
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4MeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4vMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderMortonGeneral,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshBuilderSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Triangle4iMeshRefitSAH,void* COMMA TriangleMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderMortonGeneral,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshBuilderSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8Quad4vMeshRefitSAH,void* COMMA QuadMesh* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderMortonGeneral,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshBuilderSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8VirtualMeshRefitSAH,void* COMMA UserGeometry* COMMA unsigned int COMMA size_t); 
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderMortonGeneral,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshBuilderSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t);
+  DECLARE_ISA_FUNCTION(Builder*,BVH8InstanceMeshRefitSAH,void* COMMA Instance* COMMA Geometry::GTypeMask COMMA unsigned int COMMA size_t) // NOTE(review): no trailing ';' here either - confirm against the macro definition
+  
+  namespace isa
+  {
+
+    namespace __internal_two_level_builder__ {
+
+      template<int N, typename Mesh, typename Primitive>
+      struct MortonBuilder {}; // primary template is empty; only the specializations below provide operator()
+      template<> // BVH4 variants: each functor forwards to the matching *MeshBuilderMortonGeneral factory with a final argument of 0
+      struct MortonBuilder<4,TriangleMesh,Triangle4> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<4,TriangleMesh,Triangle4v> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<4,TriangleMesh,Triangle4i> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<4,QuadMesh,Quad4v> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<4,UserGeometry,Object> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<> // Instance is the only mesh type whose factory also receives gtype
+      struct MortonBuilder<4,Instance,InstancePrimitive> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);}
+      };
+      template<> // BVH8 variants
+      struct MortonBuilder<8,TriangleMesh,Triangle4> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,TriangleMesh,Triangle4v> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,TriangleMesh,Triangle4i> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,QuadMesh,Quad4v> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,UserGeometry,Object> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderMortonGeneral(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct MortonBuilder<8,Instance,InstancePrimitive> {
+        MortonBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderMortonGeneral(bvh,mesh,gtype,geomID,0);}
+      };
+
+      template<int N, typename Mesh, typename Primitive>
+      struct SAHBuilder {}; // primary template is empty; only the specializations below provide operator()
+      template<> // BVH4 variants: each functor forwards to the matching *MeshBuilderSAH factory with a final argument of 0
+      struct SAHBuilder<4,TriangleMesh,Triangle4> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<4,TriangleMesh,Triangle4v> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<4,TriangleMesh,Triangle4i> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<4,QuadMesh,Quad4v> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<4,UserGeometry,Object> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<> // Instance is the only mesh type whose factory also receives gtype
+      struct SAHBuilder<4,Instance,InstancePrimitive> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);}
+      };
+      template<> // BVH8 variants
+      struct SAHBuilder<8,TriangleMesh,Triangle4> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,TriangleMesh,Triangle4v> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,TriangleMesh,Triangle4i> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,QuadMesh,Quad4v> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,UserGeometry,Object> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshBuilderSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct SAHBuilder<8,Instance,InstancePrimitive> {
+        SAHBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshBuilderSAH(bvh,mesh,gtype,geomID,0);}
+      };
+
+      template<int N, typename Mesh, typename Primitive>
+      struct RefitBuilder {}; // primary template is empty; only the specializations below provide operator()
+      template<> // BVH4 variants: each functor forwards to the matching *MeshRefitSAH factory with a final argument of 0
+      struct RefitBuilder<4,TriangleMesh,Triangle4> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4MeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<4,TriangleMesh,Triangle4v> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<4,TriangleMesh,Triangle4i> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<4,QuadMesh,Quad4v> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4Quad4vMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<4,UserGeometry,Object> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH4VirtualMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<> // Instance is the only mesh type whose factory also receives gtype
+      struct RefitBuilder<4,Instance,InstancePrimitive> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH4InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);}
+      };
+      template<> // BVH8 variants
+      struct RefitBuilder<8,TriangleMesh,Triangle4> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4MeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,TriangleMesh,Triangle4v> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4vMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,TriangleMesh,Triangle4i> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, TriangleMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Triangle4iMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,QuadMesh,Quad4v> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, QuadMesh* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8Quad4vMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,UserGeometry,Object> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, UserGeometry* mesh, size_t geomID, Geometry::GTypeMask /*gtype*/) { return BVH8VirtualMeshRefitSAH(bvh,mesh,geomID,0);}
+      };
+      template<>
+      struct RefitBuilder<8,Instance,InstancePrimitive> {
+        RefitBuilder () {}
+        Builder* operator () (void* bvh, Instance* mesh, size_t geomID, Geometry::GTypeMask gtype) { return BVH8InstanceMeshRefitSAH(bvh,mesh,gtype,geomID,0);}
+      };
+      
+      template<int N, typename Mesh, typename Primitive>
+      struct MeshBuilder {
+        MeshBuilder () {}
+        /* Selects the builder for 'mesh' and returns it through 'builder'. */
+        void operator () (void* bvh, Mesh* mesh, size_t geomID, Geometry::GTypeMask gtype, bool useMortonBuilder, Builder*& builder) {
+          /* the explicit override wins; mesh->quality is only consulted without it */
+          if (useMortonBuilder || mesh->quality == RTC_BUILD_QUALITY_LOW) {
+            builder = MortonBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype);
+          } else if (mesh->quality == RTC_BUILD_QUALITY_MEDIUM || mesh->quality == RTC_BUILD_QUALITY_HIGH) {
+            builder = SAHBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype);
+          } else if (mesh->quality == RTC_BUILD_QUALITY_REFIT) {
+            builder = RefitBuilder<N,Mesh,Primitive>()(bvh,mesh,geomID,gtype);
+          } else {
+            throw_RTCError(RTC_ERROR_UNKNOWN,"invalid build quality");
+          }
+        }
+      };
+    }
+  }
+}
+\ No newline at end of file
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp
new file mode 100644
index 0000000000..a27be8bae8
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.cpp
@@ -0,0 +1,375 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_collider.h"
+#include "../geometry/triangle_triangle_intersector.h"
+
+namespace embree
+{ 
+  namespace isa
+  {
+#define CSTAT(x) /* expands to nothing, so every CSTAT(...) statistics statement in this file is compiled out */
+
+    size_t parallel_depth_threshold = 3; // in the visible code only read by the parallel recursion variants disabled with #if 0
+    CSTAT(std::atomic<size_t> bvh_collide_traversal_steps(0));     // incremented once per collide_recurse call
+    CSTAT(std::atomic<size_t> bvh_collide_leaf_pairs(0));          // incremented once per leaf/leaf pair
+    CSTAT(std::atomic<size_t> bvh_collide_leaf_iterations(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections1(0)); // counters 1-3 mark the culling stages of intersect_triangle_triangle
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections2(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections3(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections4(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections5(0));
+    CSTAT(std::atomic<size_t> bvh_collide_prim_intersections(0));
+
+    struct Collision
+    {
+      __forceinline Collision() {} // leaves all four IDs uninitialized
+
+      __forceinline Collision (unsigned g0, unsigned p0, unsigned g1, unsigned p1)
+        : geomID0(g0), primID0(p0), geomID1(g1), primID1(p1) {}
+
+      /* Field order must not change: processLeaf passes arrays of this struct
+       * to the user callback through an RTCCollision* cast. */
+      unsigned geomID0, primID0;
+      unsigned geomID1, primID1;
+    };
+    
+    template<int N>
+    __forceinline size_t overlap(const BBox3fa& box0, const typename BVHN<N>::AABBNode& node1)
+    {
+      /* Per lane, box0 is tested against one child box of node1: two boxes overlap
+       * iff the larger lower bound does not exceed the smaller upper bound on every axis. */
+      const auto hit_x = max(vfloat<N>(box0.lower.x),node1.lower_x) <= min(vfloat<N>(box0.upper.x),node1.upper_x);
+      const auto hit_y = max(vfloat<N>(box0.lower.y),node1.lower_y) <= min(vfloat<N>(box0.upper.y),node1.upper_y);
+      const auto hit_z = max(vfloat<N>(box0.lower.z),node1.lower_z) <= min(vfloat<N>(box0.upper.z),node1.upper_z);
+
+      return movemask(hit_x & hit_y & hit_z);
+    }
+
+    template<int N>
+    __forceinline size_t overlap(const BBox3fa& box0, const BBox<Vec3<vfloat<N>>>& box1)
+    {
+      /* Per lane, the scalar box0 is tested against one of the N boxes packed in box1: two boxes
+       * overlap iff the larger lower bound does not exceed the smaller upper bound on every axis. */
+      const auto hit_x = max(vfloat<N>(box0.lower.x),box1.lower.x) <= min(vfloat<N>(box0.upper.x),box1.upper.x);
+      const auto hit_y = max(vfloat<N>(box0.lower.y),box1.lower.y) <= min(vfloat<N>(box0.upper.y),box1.upper.y);
+      const auto hit_z = max(vfloat<N>(box0.lower.z),box1.lower.z) <= min(vfloat<N>(box0.upper.z),box1.upper.z);
+
+      return movemask(hit_x & hit_y & hit_z);
+    }
+
+    template<int N>
+    __forceinline size_t overlap(const BBox<Vec3<vfloat<N>>>& box0, size_t i, const BBox<Vec3<vfloat<N>>>& box1)
+    {
+      /* Lane i of box0 is broadcast and tested against each of the N boxes packed in box1: two boxes
+       * overlap iff the larger lower bound does not exceed the smaller upper bound on every axis. */
+      const auto hit_x = max(vfloat<N>(box0.lower.x[i]),box1.lower.x) <= min(vfloat<N>(box0.upper.x[i]),box1.upper.x);
+      const auto hit_y = max(vfloat<N>(box0.lower.y[i]),box1.lower.y) <= min(vfloat<N>(box0.upper.y[i]),box1.upper.y);
+      const auto hit_z = max(vfloat<N>(box0.lower.z[i]),box1.lower.z) <= min(vfloat<N>(box0.upper.z[i]),box1.upper.z);
+
+      return movemask(hit_x & hit_y & hit_z);
+    }
+
+    bool intersect_triangle_triangle (Scene* scene0, unsigned geomID0, unsigned primID0, Scene* scene1, unsigned geomID1, unsigned primID1)
+    {
+      CSTAT(bvh_collide_prim_intersections1++);
+      const TriangleMesh* mesh0 = scene0->get<TriangleMesh>(geomID0);
+      const TriangleMesh* mesh1 = scene1->get<TriangleMesh>(geomID1);
+      const TriangleMesh::Triangle& tri0 = mesh0->triangle(primID0);
+      const TriangleMesh::Triangle& tri1 = mesh1->triangle(primID1);
+
+      /* the culling below only applies when both triangles come from the same mesh of the same scene */
+      const bool sameMesh = scene0 == scene1 && geomID0 == geomID1;
+
+      /* a triangle is never reported as colliding with itself */
+      if (sameMesh && primID0 == primID1)
+        return false;
+      CSTAT(bvh_collide_prim_intersections2++);
+
+      if (sameMesh)
+      {
+        /* triangles sharing at least one vertex index are topological neighbors and are skipped */
+        const vint4 t0(tri0.v[0],tri0.v[1],tri0.v[2],tri0.v[2]);
+        for (size_t k=0; k<3; k++)
+          if (any(vint4(tri1.v[k]) == t0)) return false;
+      }
+      CSTAT(bvh_collide_prim_intersections3++);
+
+      /* gather the vertex positions of both triangles */
+      const Vec3fa a0 = mesh0->vertex(tri0.v[0]);
+      const Vec3fa a1 = mesh0->vertex(tri0.v[1]);
+      const Vec3fa a2 = mesh0->vertex(tri0.v[2]);
+      const Vec3fa b0 = mesh1->vertex(tri1.v[0]);
+      const Vec3fa b1 = mesh1->vertex(tri1.v[1]);
+      const Vec3fa b2 = mesh1->vertex(tri1.v[2]);
+
+      /* geometric triangle/triangle test on the six vertices */
+      return TriangleTriangleIntersector::intersect_triangle_triangle(a0,a1,a2,b0,b1,b2);
+    }
+    
+    template<int N>
+    __forceinline void BVHNColliderUserGeom<N>::processLeaf(NodeRef node0, NodeRef node1)
+    {
+      /* candidate pairs are buffered and reported to the user callback in batches of at most 16 */
+      const size_t batchSize = 16;
+      Collision batch[batchSize];
+      size_t count = 0;
+
+      size_t num0; Object* prims0 = (Object*) node0.leaf(num0);
+      size_t num1; Object* prims1 = (Object*) node1.leaf(num1);
+      for (size_t i=0; i<num0; i++) {
+        for (size_t j=0; j<num1; j++) {
+          const unsigned geomID0 = prims0[i].geomID(), primID0 = prims0[i].primID();
+          const unsigned geomID1 = prims1[j].geomID(), primID1 = prims1[j].primID();
+          /* within a single scene a primitive is never paired with itself */
+          if (this->scene0 == this->scene1 && geomID0 == geomID1 && primID0 == primID1) continue;
+          batch[count++] = Collision(geomID0,primID0,geomID1,primID1);
+          if (count < batchSize) continue;
+          this->callback(this->userPtr,(RTCCollision*)&batch,count);
+          count = 0;
+        }
+      }
+      if (count != 0)
+        this->callback(this->userPtr,(RTCCollision*)&batch,count);
+    }
+
+    template<int N>
+    void BVHNCollider<N>::collide_recurse(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1, size_t depth0, size_t depth1)
+    {
+      CSTAT(bvh_collide_traversal_steps++);
+      if (unlikely(ref0.isLeaf())) {
+        if (unlikely(ref1.isLeaf())) {
+          CSTAT(bvh_collide_leaf_pairs++);
+          processLeaf(ref0,ref1); // both are leaves: hand the pair to the subclass
+          return;
+        } else goto recurse_node1; // ref0 is a leaf, so only ref1 can be opened
+        
+      } else {
+        if (unlikely(ref1.isLeaf())) {
+          goto recurse_node0; // ref1 is a leaf, so only ref0 can be opened
+        } else {
+          if (area(bounds0) > area(bounds1)) { // both are inner nodes: open the one with the larger bounds area
+            goto recurse_node0;
+          }
+          else {
+            goto recurse_node1;
+          }
+        }
+      }
+
+      {
+      recurse_node0:
+        AABBNode* node0 = ref0.getAABBNode();
+        size_t mask = overlap<N>(bounds1,*node0); // one bit per child of node0 whose box overlaps bounds1
+        //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+        //for (size_t i=0; i<N; i++) {
+#if 0 // disabled parallel variant
+        if (depth0 < parallel_depth_threshold) 
+        {
+          parallel_for(size_t(N), [&] ( size_t i ) {
+              if (mask & ( 1 << i)) {
+                BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE);
+                collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1);
+              }
+            });
+        } 
+        else
+#endif
+        {
+          for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { // visits the children selected by mask
+            BVHN<N>::prefetch(node0->child(i),BVH_FLAG_ALIGNED_NODE);
+            collide_recurse(node0->child(i),node0->bounds(i),ref1,bounds1,depth0+1,depth1); // ref1 side is kept unchanged
+          }
+        }
+        return;
+      }
+      
+      {
+      recurse_node1:
+        AABBNode* node1 = ref1.getAABBNode();
+        size_t mask = overlap<N>(bounds0,*node1); // one bit per child of node1 whose box overlaps bounds0
+        //for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) {
+        //for (size_t i=0; i<N; i++) {
+#if 0 // disabled parallel variant
+        if (depth1 < parallel_depth_threshold) 
+        {
+          parallel_for(size_t(N), [&] ( size_t i ) {
+              if (mask & ( 1 << i)) {
+                BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE);
+                collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1);
+              }
+            });
+        }
+        else
+#endif
+        {
+          for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { // visits the children selected by mask
+            BVHN<N>::prefetch(node1->child(i),BVH_FLAG_ALIGNED_NODE);
+            collide_recurse(ref0,bounds0,node1->child(i),node1->bounds(i),depth0,depth1+1); // ref0 side is kept unchanged
+          }
+        }
+        return;
+      }
+    }
+
+    template<int N>
+    void BVHNCollider<N>::split(const CollideJob& job, jobvector& jobs) // appends the result of opening 'job' by one level to 'jobs'
+    {
+      if (unlikely(job.ref0.isLeaf())) {
+        if (unlikely(job.ref1.isLeaf())) {
+          jobs.push_back(job); // a leaf/leaf pair cannot be split further and is passed on unchanged
+          return;
+        } else goto recurse_node1; // ref0 is a leaf, so only ref1 can be opened
+      } else {
+        if (unlikely(job.ref1.isLeaf())) {
+          goto recurse_node0; // ref1 is a leaf, so only ref0 can be opened
+        } else {
+          if (area(job.bounds0) > area(job.bounds1)) { // both are inner nodes: open the one with the larger bounds area
+            goto recurse_node0;
+          }
+          else {
+            goto recurse_node1;
+          }
+        }
+      }
+      
+      {
+      recurse_node0:
+        const AABBNode* node0 = job.ref0.getAABBNode();
+        size_t mask = overlap<N>(job.bounds1,*node0); // one bit per child of node0 whose box overlaps bounds1
+        for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { // one new job per selected child; non-overlapping children are dropped
+          jobs.push_back(CollideJob(node0->child(i),node0->bounds(i),job.depth0+1,job.ref1,job.bounds1,job.depth1));
+        }
+        return;
+      }
+      
+      {
+      recurse_node1:
+        const AABBNode* node1 = job.ref1.getAABBNode();
+        size_t mask = overlap<N>(job.bounds0,*node1); // one bit per child of node1 whose box overlaps bounds0
+        for (size_t m=mask, i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) { // one new job per selected child; non-overlapping children are dropped
+          jobs.push_back(CollideJob(job.ref0,job.bounds0,job.depth0,node1->child(i),node1->bounds(i),job.depth1+1));
+        }
+        return;
+      }
+    }
+    
+    template<int N>
+    void BVHNCollider<N>::collide_recurse_entry(NodeRef ref0, const BBox3fa& bounds0, NodeRef ref1, const BBox3fa& bounds1)
+    { // entry point: splits the root pair breadth-first into up to M jobs, then traverses all jobs in parallel
+      CSTAT(bvh_collide_traversal_steps = 0);
+      CSTAT(bvh_collide_leaf_pairs = 0);
+      CSTAT(bvh_collide_leaf_iterations = 0);
+      CSTAT(bvh_collide_prim_intersections1 = 0);
+      CSTAT(bvh_collide_prim_intersections2 = 0);
+      CSTAT(bvh_collide_prim_intersections3 = 0);
+      CSTAT(bvh_collide_prim_intersections4 = 0);
+      CSTAT(bvh_collide_prim_intersections5 = 0);
+      CSTAT(bvh_collide_prim_intersections = 0);
+#if 0 // disabled purely sequential variant
+      collide_recurse(ref0,bounds0,ref1,bounds1,0,0);
+#else
+      const size_t M = 2048; // job list capacity; size_t so the size() comparisons below do not mix signed and unsigned
+      jobvector jobs[2];
+      jobs[0].reserve(M);
+      jobs[1].reserve(M);
+      jobs[0].push_back(CollideJob(ref0,bounds0,0,ref1,bounds1,0));
+      int source = 0;
+      int target = 1;
+      /* jobs[source] is the current work list, jobs[target] receives the result of one splitting pass */
+      /* try to split job until job list is full (the +8 is presumably headroom for the children one split adds - confirm) */
+      while (jobs[source].size()+8 <= M)
+      {
+        for (size_t i=0; i<jobs[source].size(); i++)
+        {
+          const CollideJob& job = jobs[source][i];
+          size_t remaining = jobs[source].size()-i;
+          if (jobs[target].size()+remaining+8 > M) {
+            jobs[target].push_back(job); // no room left to split: copy this and all remaining jobs unchanged
+          } else {
+            split(job,jobs[target]);
+          }
+        }
+
+        /* stop splitting jobs if we reached only leaves and cannot make progress anymore */
+        if (jobs[target].size() == jobs[source].size())
+          break; // jobs[source] still holds a complete job list and is the one processed below
+
+        jobs[source].resize(0);
+        std::swap(source,target);
+      }
+
+      /* parallel processing of all jobs */
+      parallel_for(size_t(jobs[source].size()), [&] ( size_t i ) {
+          const CollideJob& j = jobs[source][i];
+          collide_recurse(j.ref0,j.bounds0,j.ref1,j.bounds1,j.depth0,j.depth1);
+        });
+
+
+#endif
+      CSTAT(PRINT(bvh_collide_traversal_steps));
+      CSTAT(PRINT(bvh_collide_leaf_pairs));
+      CSTAT(PRINT(bvh_collide_leaf_iterations));
+      CSTAT(PRINT(bvh_collide_prim_intersections1));
+      CSTAT(PRINT(bvh_collide_prim_intersections2));
+      CSTAT(PRINT(bvh_collide_prim_intersections3));
+      CSTAT(PRINT(bvh_collide_prim_intersections4));
+      CSTAT(PRINT(bvh_collide_prim_intersections5));
+      CSTAT(PRINT(bvh_collide_prim_intersections));
+    }
+   
+    template<int N>
+    void BVHNColliderUserGeom<N>::collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr)
+    {
+      BVHNColliderUserGeom<N> collider(bvh0->scene,bvh1->scene,callback,userPtr); // binds both scenes to the user callback
+      collider.collide_recurse_entry(bvh0->root,bvh0->bounds.bounds(),bvh1->root,bvh1->bounds.bounds());
+    }
+
+#if defined (EMBREE_LOWEST_ISA)
+    struct collision_regression_test : public RegressionTest // fixed triangle pairs with known intersect/no-intersect results
+    {
+      collision_regression_test(const char* name) : RegressionTest(name) {
+        registerRegressionTest(this); // the test registers itself on construction
+      }
+    
+      bool run () // returns true only if every check below yields the expected result
+      {
+        bool passed = true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(-0.008815f, 0.041848f, -2.49875e-06f), Vec3fa(-0.008276f, 0.053318f, -2.49875e-06f), Vec3fa(0.003023f, 0.048969f, -2.49875e-06f),
+                                                                            Vec3fa(0.00245f, 0.037612f, -2.49875e-06f), Vec3fa(0.01434f, 0.042634f, -2.49875e-06f), Vec3fa(0.013499f, 0.031309f, -2.49875e-06f)) == false;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; // identical triangles
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,1),Vec3fa(0,1,1)) == false; // same triangle shifted by 1 along z
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,1),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,-0.1f),Vec3fa(1,0,1),Vec3fa(0,1,1)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0)) == true; // NOTE(review): same inputs as the identical-triangles check above
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0,0,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0,0),Vec3fa(0,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(0.1f,-0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), Vec3fa(-0.1f,0.1f,0),Vec3fa(0.5f,0.1f,0),Vec3fa(0.1f,0.5f,0)) == true;
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), 
+                                               Vec3fa(-1,1,0) + Vec3fa(0,0,0),Vec3fa(-1,1,0) + Vec3fa(0.1f,0,0),Vec3fa(-1,1,0) + Vec3fa(0,0.1f,0)) == false; // small triangle in the same z=0 plane, offset by (-1,1,0)
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), 
+                                               Vec3fa( 2,0.5f,0) + Vec3fa(0,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0.1f,0,0),Vec3fa( 2,0.5f,0) + Vec3fa(0,0.1f,0)) == false; // same small triangle, offset by (2,0.5,0)
+        passed &= TriangleTriangleIntersector::intersect_triangle_triangle (Vec3fa(0,0,0),Vec3fa(1,0,0),Vec3fa(0,1,0), 
+                                               Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0.1f,0,0),Vec3fa(0.5f,-2.0f,0) + Vec3fa(0,0.1f,0)) == false; // same small triangle, offset by (0.5,-2,0)
+        return passed;
+      }
+    };
+
+    collision_regression_test collision_regression("collision_regression_test"); // namespace-scope instance; its constructor performs the registration
+#endif
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// Collider Definitions
+    ////////////////////////////////////////////////////////////////////////////////
+
+    DEFINE_COLLIDER(BVH4ColliderUserGeom,BVHNColliderUserGeom<4>); // collider for 4-wide BVHs
+
+#if defined(__AVX__)
+    DEFINE_COLLIDER(BVH8ColliderUserGeom,BVHNColliderUserGeom<8>); // 8-wide variant, only defined when __AVX__ is set
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h
new file mode 100644
index 0000000000..ac4f99c96a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_collider.h
@@ -0,0 +1,72 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/object.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N> // N is forwarded to BVHN<N> below
+      class BVHNCollider // abstract base: processLeaf() is pure virtual and must be supplied by a subclass
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::AABBNode AABBNode;
+
+      struct CollideJob // one pending (node, bounds, depth) pair, one entry per side (suffix 0 / suffix 1)
+      {
+        CollideJob () {} // leaves all members uninitialized
+        
+        CollideJob (NodeRef ref0, const BBox3fa& bounds0, size_t depth0,
+                    NodeRef ref1, const BBox3fa& bounds1, size_t depth1)
+        : ref0(ref0), bounds0(bounds0), depth0(depth0), ref1(ref1), bounds1(bounds1), depth1(depth1) {}
+        
+        NodeRef ref0;    // node reference of side 0
+        BBox3fa bounds0; // bounding box stored for ref0
+        size_t depth0;   // depth value stored for ref0
+        NodeRef ref1;    // node reference of side 1
+        BBox3fa bounds1; // bounding box stored for ref1
+        size_t depth1;   // depth value stored for ref1
+      };
+
+      typedef vector_t<CollideJob, aligned_allocator<CollideJob,16>> jobvector; // job list using aligned_allocator<CollideJob,16>
+
+      void split(const CollideJob& job, jobvector& jobs); // private helper, defined out of line; presumably expands 'job' into 'jobs' - confirm in bvh_collider.cpp
+      
+    public:
+      __forceinline BVHNCollider (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr)
+        : scene0(scene0), scene1(scene1), callback(callback), userPtr(userPtr) {} // only stores the four arguments
+
+    public:
+      virtual void processLeaf(NodeRef leaf0, NodeRef leaf1) = 0; // hook invoked with a pair of leaf references; implemented by subclasses
+      void collide_recurse(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1, size_t depth0, size_t depth1); // defined out of line
+      void collide_recurse_entry(NodeRef node0, const BBox3fa& bounds0, NodeRef node1, const BBox3fa& bounds1); // defined out of line; same arguments minus the depths
+    
+    protected:
+      Scene* scene0;           // first scene passed to the constructor
+      Scene* scene1;           // second scene passed to the constructor
+      RTCCollideFunc callback; // user collision callback passed to the constructor
+      void* userPtr;           // opaque user pointer stored next to the callback
+    };
+
+    template<int N> // N is forwarded to BVHNCollider<N> and BVHN<N>
+      class BVHNColliderUserGeom : public BVHNCollider<N> // concrete collider that overrides processLeaf()
+    {
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::AABBNode AABBNode;
+
+      __forceinline BVHNColliderUserGeom (Scene* scene0, Scene* scene1, RTCCollideFunc callback, void* userPtr)
+        : BVHNCollider<N>(scene0,scene1,callback,userPtr) {} // private (class default access); forwards everything to the base constructor
+
+      virtual void processLeaf(NodeRef leaf0, NodeRef leaf1); // private override of the base-class hook; defined out of line
+    public:
+      static void collide(BVH* __restrict__ bvh0, BVH* __restrict__ bvh1, RTCCollideFunc callback, void* userPtr); // only public member: static entry point taking the two BVHs
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h
new file mode 100644
index 0000000000..54021ca6eb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_factory.h
@@ -0,0 +1,21 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+#include "../common/isa.h"
+#include "../common/accel.h"
+#include "../common/scene.h"
+#include "../geometry/curve_intersector_virtual.h"
+
+namespace embree
+{
+  /*! BVH instantiations: in this header the class only declares the two variant enums below */
+  class BVHFactory
+  {
+  public:
+    enum class BuildVariant     { STATIC, DYNAMIC, HIGH_QUALITY }; // selector for the BVH build variant
+    enum class IntersectVariant { FAST, ROBUST };                  // selector for the intersector variant
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp
new file mode 100644
index 0000000000..ea6adc2717
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.cpp
@@ -0,0 +1,330 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_intersector1.h"
+#include "node_intersector1.h"
+#include "bvh_traverser1.h"
+
+#include "../geometry/intersector_iterators.h"
+#include "../geometry/triangle_intersector.h"
+#include "../geometry/trianglev_intersector.h"
+#include "../geometry/trianglev_mb_intersector.h"
+#include "../geometry/trianglei_intersector.h"
+#include "../geometry/quadv_intersector.h"
+#include "../geometry/quadi_intersector.h"
+#include "../geometry/curveNv_intersector.h"
+#include "../geometry/curveNi_intersector.h"
+#include "../geometry/curveNi_mb_intersector.h"
+#include "../geometry/linei_intersector.h"
+#include "../geometry/subdivpatch1_intersector.h"
+#include "../geometry/object_intersector.h"
+#include "../geometry/instance_intersector.h"
+#include "../geometry/subgrid_intersector.h"
+#include "../geometry/subgrid_mb_intersector.h"
+#include "../geometry/curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, int types, bool robust, typename PrimitiveIntersector1> // single-ray traversal that reports hits through 'ray' (RayHit)
+    void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::intersect(const Accel::Intersectors* __restrict__ This,
+                                                                              RayHit& __restrict__ ray,
+                                                                              IntersectContext* __restrict__ context)
+    {
+      const BVH* __restrict__ bvh = (const BVH*)This->ptr; // the acceleration structure is stored type-erased in This->ptr
+      
+      /* we may traverse an empty BVH in case all geometry was invalid */
+      if (bvh->root == BVH::emptyNode)
+        return;
+      
+      /* perform per ray precalculations required by the primitive intersector */
+      Precalculations pre(ray, bvh);
+
+      /* stack state */
+      StackItemT<NodeRef> stack[stackSize];    // stack of nodes
+      StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer
+      StackItemT<NodeRef>* stackEnd = stack+stackSize;
+      stack[0].ptr  = bvh->root; // traversal starts with the root as the only stack entry
+      stack[0].dist = neg_inf;
+      
+      /* the empty-root case already returned at the top of this function and
+         bvh->root is not modified in between, so no second check is needed */
+      
+      /* filter out invalid rays */
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+      if (!ray.valid()) return;
+#endif
+      /* verify correct input */
+      assert(ray.valid());
+      assert(ray.tnear() >= 0.0f);
+      assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f));
+
+      /* load the ray into SIMD registers */
+      TravRay<N,Nx,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f)); // tnear/tfar are clamped to be non-negative
+
+      /* initialize the node traverser */
+      BVHNNodeTraverser1Hit<N, Nx, types> nodeTraverser;
+
+      /* pop loop */
+      while (true) pop: // 'pop' labels the loop body, so 'goto pop' below restarts it with the next stack entry
+      {
+        /* pop next node */
+        if (unlikely(stackPtr == stack)) break; // stack exhausted: traversal finished
+        stackPtr--;
+        NodeRef cur = NodeRef(stackPtr->ptr);
+
+        /* if popped node is too far, pop next one */
+#if defined(__AVX512ER__)
+        /* much faster on KNL */
+        if (unlikely(any(vfloat<Nx>(*(float*)&stackPtr->dist) > tray.tfar)))
+          continue;
+#else
+        if (unlikely(*(float*)&stackPtr->dist > ray.tfar)) // the stored dist is reinterpreted as a float for the comparison
+          continue;
+#endif
+
+        /* downtraversal loop */
+        while (true)
+        {
+          /* intersect node */
+          size_t mask; vfloat<Nx> tNear;
+          STAT3(normal.trav_nodes,1,1,1);
+          bool nodeIntersected = BVHNNodeIntersector1<N, Nx, types, robust>::intersect(cur, tray, ray.time(), tNear, mask);
+          if (unlikely(!nodeIntersected)) { STAT3(normal.trav_nodes,-1,-1,-1); break; } // statistic is rolled back; 'cur' is handled as a leaf below
+
+          /* if no child is hit, pop next node */
+          if (unlikely(mask == 0))
+            goto pop;
+
+          /* select next child and push other children */
+          nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd);
+        }
+
+        /* this is a leaf node */
+        assert(cur != BVH::emptyNode);
+        STAT3(normal.trav_leaves,1,1,1);
+        size_t num; Primitive* prim = (Primitive*)cur.leaf(num); // 'num' is filled in by leaf()
+        size_t lazy_node = 0;
+        PrimitiveIntersector1::intersect(This, pre, ray, context, prim, num, tray, lazy_node);
+        tray.tfar = ray.tfar; // re-sync the traversal ray with ray.tfar, which the primitive intersector may have updated
+
+        /* push lazy node onto stack */
+        if (unlikely(lazy_node)) {
+          stackPtr->ptr = lazy_node;
+          stackPtr->dist = neg_inf;
+          stackPtr++;
+        }
+      }
+    }
+
+    template<int N, int types, bool robust, typename PrimitiveIntersector1> // single-ray any-hit traversal; occlusion is reported by setting ray.tfar to neg_inf
+    void BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::occluded(const Accel::Intersectors* __restrict__ This,
+                                                                             Ray& __restrict__ ray,
+                                                                             IntersectContext* __restrict__ context)
+    {
+      const BVH* __restrict__ bvh = (const BVH*)This->ptr; // the acceleration structure is stored type-erased in This->ptr
+      
+      /* we may traverse an empty BVH in case all geometry was invalid */
+      if (bvh->root == BVH::emptyNode)
+        return;
+       
+      /* early out for already occluded rays */
+      if (unlikely(ray.tfar < 0.0f)) // a negative tfar is the same marker this function writes when it finds an occluder
+        return;
+
+      /* perform per ray precalculations required by the primitive intersector */
+      Precalculations pre(ray, bvh);
+
+      /* stack state */
+      NodeRef stack[stackSize];    // stack of nodes that still need to get traversed
+      NodeRef* stackPtr = stack+1; // current stack pointer
+      NodeRef* stackEnd = stack+stackSize;
+      stack[0] = bvh->root; // unlike intersect(), the stack holds plain node references without a distance
+
+      /* filter out invalid rays */
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+      if (!ray.valid()) return;
+#endif
+
+      /* verify correct input */
+      assert(ray.valid());
+      assert(ray.tnear() >= 0.0f);
+      assert(!(types & BVH_MB) || (ray.time() >= 0.0f && ray.time() <= 1.0f));
+
+      /* load the ray into SIMD registers */
+      TravRay<N,Nx,robust> tray(ray.org, ray.dir, max(ray.tnear(), 0.0f), max(ray.tfar, 0.0f)); // tnear/tfar are clamped to be non-negative
+
+      /* initialize the node traverser */
+      BVHNNodeTraverser1Hit<N, Nx, types> nodeTraverser;
+
+      /* pop loop */
+      while (true) pop: // 'pop' labels the loop body, so 'goto pop' below restarts it with the next stack entry
+      {
+        /* pop next node */
+        if (unlikely(stackPtr == stack)) break; // stack exhausted: no occluder found
+        stackPtr--;
+        NodeRef cur = (NodeRef)*stackPtr;
+
+        /* downtraversal loop */
+        while (true)
+        {
+          /* intersect node */
+          size_t mask; vfloat<Nx> tNear;
+          STAT3(shadow.trav_nodes,1,1,1);
+          bool nodeIntersected = BVHNNodeIntersector1<N, Nx, types, robust>::intersect(cur, tray, ray.time(), tNear, mask);
+          if (unlikely(!nodeIntersected)) { STAT3(shadow.trav_nodes,-1,-1,-1); break; } // statistic is rolled back; 'cur' is handled as a leaf below
+
+          /* if no child is hit, pop next node */
+          if (unlikely(mask == 0))
+            goto pop;
+
+          /* select next child and push other children */
+          nodeTraverser.traverseAnyHit(cur, mask, tNear, stackPtr, stackEnd);
+        }
+
+        /* this is a leaf node */
+        assert(cur != BVH::emptyNode);
+        STAT3(shadow.trav_leaves,1,1,1);
+        size_t num; Primitive* prim = (Primitive*)cur.leaf(num); // 'num' is filled in by leaf()
+        size_t lazy_node = 0;
+        if (PrimitiveIntersector1::occluded(This, pre, ray, context, prim, num, tray, lazy_node)) {
+          ray.tfar = neg_inf; // mark the ray as occluded for the caller
+          break;              // any hit is sufficient, stop traversing
+        }
+
+        /* push lazy node onto stack */
+        if (unlikely(lazy_node)) {
+          *stackPtr = (NodeRef)lazy_node;
+          stackPtr++;
+        }
+      }
+    }
+
+    template<int N, int types, bool robust, typename PrimitiveIntersector1> // primary template; the specializations after it return false for unsupported primitive intersectors
+    struct PointQueryDispatch
+    {
+      typedef typename PrimitiveIntersector1::Precalculations Precalculations;
+      typedef typename PrimitiveIntersector1::Primitive Primitive;
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::AABBNodeMB4D AABBNodeMB4D;
+
+      static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store
+
+      /* right now AVX512KNL SIMD extension only for standard node types */
+      static const size_t Nx = (types == BVH_AN1 || types == BVH_QN1) ? vextend<N>::size : N; // note: pointQuery() below uses N, not Nx
+
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) // returns true if any primitive query reported a change
+      {
+        const BVH* __restrict__ bvh = (const BVH*)This->ptr; // the acceleration structure is stored type-erased in This->ptr
+        
+        /* we may traverse an empty BVH in case all geometry was invalid */
+        if (bvh->root == BVH::emptyNode)
+          return false;
+        
+        /* stack state */
+        StackItemT<NodeRef> stack[stackSize];    // stack of nodes
+        StackItemT<NodeRef>* stackPtr = stack+1; // current stack pointer
+        StackItemT<NodeRef>* stackEnd = stack+stackSize;
+        stack[0].ptr  = bvh->root; // traversal starts with the root as the only stack entry
+        stack[0].dist = neg_inf;
+        
+        /* verify correct input */
+        assert(!(types & BVH_MB) || (query->time >= 0.0f && query->time <= 1.0f));
+
+        /* load the point query into SIMD registers */
+        TravPointQuery<N> tquery(query->p, context->query_radius);
+
+        /* initialize the node traverser */
+        BVHNNodeTraverser1Hit<N, N, types> nodeTraverser;
+
+        bool changed = false; // becomes true once a primitive query returns true; this is the return value
+        float cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE // squared radius for sphere queries, otherwise squared length of query_radius
+                          ? query->radius * query->radius
+                          : dot(context->query_radius, context->query_radius);
+
+        /* pop loop */
+        while (true) pop: // 'pop' labels the loop body, so 'goto pop' below restarts it with the next stack entry
+        {
+          /* pop next node */
+          if (unlikely(stackPtr == stack)) break; // stack exhausted: traversal finished
+          stackPtr--;
+          NodeRef cur = NodeRef(stackPtr->ptr);
+
+          /* if popped node is too far, pop next one */
+          if (unlikely(*(float*)&stackPtr->dist > cull_radius)) // the stored dist is reinterpreted as a float and compared with the squared cull radius
+            continue;
+
+          /* downtraversal loop */
+          while (true)
+          {
+            /* intersect node */
+            size_t mask; vfloat<N> tNear;
+            STAT3(point_query.trav_nodes,1,1,1);
+            bool nodeIntersected;
+            if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) { // the node test is chosen per query type: sphere or AABB
+              nodeIntersected = BVHNNodePointQuerySphere1<N, types>::pointQuery(cur, tquery, query->time, tNear, mask);
+            } else {
+              nodeIntersected = BVHNNodePointQueryAABB1  <N, types>::pointQuery(cur, tquery, query->time, tNear, mask);
+            }
+            if (unlikely(!nodeIntersected)) { STAT3(point_query.trav_nodes,-1,-1,-1); break; } // statistic is rolled back; 'cur' is handled as a leaf below
+
+            /* if no child is hit, pop next node */
+            if (unlikely(mask == 0))
+              goto pop;
+
+            /* select next child and push other children */
+            nodeTraverser.traverseClosestHit(cur, mask, tNear, stackPtr, stackEnd);
+          }
+
+          /* this is a leaf node */
+          assert(cur != BVH::emptyNode);
+          STAT3(point_query.trav_leaves,1,1,1);
+          size_t num; Primitive* prim = (Primitive*)cur.leaf(num); // 'num' is filled in by leaf()
+          size_t lazy_node = 0;
+          if (PrimitiveIntersector1::pointQuery(This, query, context, prim, num, tquery, lazy_node))
+          {
+            changed = true;
+            tquery.rad = context->query_radius; // reload the radius from the context after the primitive query reported a change
+            cull_radius = context->query_type == POINT_QUERY_TYPE_SPHERE // recompute the cull radius with the same formula as above
+                        ? query->radius * query->radius
+                        : dot(context->query_radius, context->query_radius);
+          }
+
+          /* push lazy node onto stack */
+          if (unlikely(lazy_node)) {
+            stackPtr->ptr = lazy_node;
+            stackPtr->dist = neg_inf;
+            stackPtr++;
+          }
+        }
+        return changed;
+      }
+    };
+
+    /* disable point queries for not yet supported geometry types: each partial specialization below returns false without traversing */
+    template<int N, int types, bool robust>
+    struct PointQueryDispatch<N, types, robust, VirtualCurveIntersector1> { // virtual curve intersector
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; }
+    };
+    
+    template<int N, int types, bool robust>
+    struct PointQueryDispatch<N, types, robust, SubdivPatch1Intersector1> { // subdivision patch intersector
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; }
+    };
+    
+    template<int N, int types, bool robust>
+    struct PointQueryDispatch<N, types, robust, SubdivPatch1MBIntersector1> { // subdivision patch MB intersector
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context) { return false; }
+    };
+
+    template<int N, int types, bool robust, typename PrimitiveIntersector1> // thin forwarder: the work is done by PointQueryDispatch
+    bool BVHNIntersector1<N, types, robust, PrimitiveIntersector1>::pointQuery(
+      const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context)
+    {
+      return PointQueryDispatch<N, types, robust, PrimitiveIntersector1>::pointQuery(This, query, context); // dispatch selects the generic traversal or a 'return false' specialization
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h
new file mode 100644
index 0000000000..1a269c319a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1.h
@@ -0,0 +1,37 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../common/ray.h"
+#include "../common/point_query.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! BVH single ray intersector. All entry points are static; the member definitions live in bvh_intersector1.cpp. */
+    template<int N, int types, bool robust, typename PrimitiveIntersector1>
+    class BVHNIntersector1
+    {
+      /* shortcuts for frequently used types */
+      typedef typename PrimitiveIntersector1::Precalculations Precalculations;
+      typedef typename PrimitiveIntersector1::Primitive Primitive;
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::AABBNodeMB4D AABBNodeMB4D;
+
+      static const size_t stackSize = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store
+
+      /* right now AVX512KNL SIMD extension only for standard node types */
+      static const size_t Nx = (types == BVH_AN1 || types == BVH_QN1) ? vextend<N>::size : N; // vextend<N>::size for BVH_AN1/BVH_QN1, otherwise N
+
+    public:
+      static void intersect (const Accel::Intersectors* This, RayHit& ray, IntersectContext* context); // takes a RayHit
+      static void occluded  (const Accel::Intersectors* This, Ray& ray, IntersectContext* context); // takes a plain Ray
+      static bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context); // bool result comes from PointQueryDispatch
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp
new file mode 100644
index 0000000000..989f7354fd
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector1_bvh4.cpp
@@ -0,0 +1,61 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_intersector1.cpp"
+
+namespace embree
+{
+  namespace isa
+  {
+    int getISA() { // per-ISA accessor inside namespace isa
+      return VerifyMultiTargetLinking::getISA(); // simply forwards to VerifyMultiTargetLinking::getISA()
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    /// BVH4Intersector1 Definitions
+    ////////////////////////////////////////////////////////////////////////////////
+
+    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA false COMMA VirtualCurveIntersector1 >));
+    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersector1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA false COMMA VirtualCurveIntersector1 >));
+
+    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1,BVHNIntersector1<4 COMMA BVH_AN1_UN1 COMMA true COMMA VirtualCurveIntersector1 >));
+    IF_ENABLED_CURVES_OR_POINTS(DEFINE_INTERSECTOR1(BVH4OBBVirtualCurveIntersectorRobust1MB,BVHNIntersector1<4 COMMA BVH_AN2_AN4D_UN2 COMMA true COMMA VirtualCurveIntersector1 >));
+
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4Intersector1Moeller,  BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMIntersector1Moeller  <SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Moeller <SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersector1<TriangleMvIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMvMBIntersector1Moeller <SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<TriangleMiMBIntersector1Moeller <SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4vMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true  COMMA ArrayIntersector1<TriangleMvMBIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(BVH4Triangle4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true  COMMA ArrayIntersector1<TriangleMiMBIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMvIntersector1Moeller <4 COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Moeller <4 COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4vIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersector1<QuadMvIntersector1Pluecker<4 COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true  COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >));
+
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Moeller, BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<QuadMiMBIntersector1Moeller <4 COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(BVH4Quad4iMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true  COMMA ArrayIntersector1<QuadMiMBIntersector1Pluecker<4 COMMA true> > >));
+
+    IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1Intersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubdivPatch1Intersector1>));
+    IF_ENABLED_SUBDIV(DEFINE_INTERSECTOR1(BVH4SubdivPatch1MBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubdivPatch1MBIntersector1>));
+    
+    IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<ObjectIntersector1<false>> >));
+    IF_ENABLED_USER(DEFINE_INTERSECTOR1(BVH4VirtualMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<ObjectIntersector1<true>> >));
+
+    IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceIntersector1,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA ArrayIntersector1<InstanceIntersector1> >));
+    IF_ENABLED_INSTANCE(DEFINE_INTERSECTOR1(BVH4InstanceMBIntersector1,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA ArrayIntersector1<InstanceIntersector1MB> >));
+
+    IF_ENABLED_TRIS(DEFINE_INTERSECTOR1(QBVH4Triangle4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<TriangleMiIntersector1Pluecker<SIMD_MODE(4) COMMA true> > >));
+    IF_ENABLED_QUADS(DEFINE_INTERSECTOR1(QBVH4Quad4iIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_QN1 COMMA false COMMA ArrayIntersector1<QuadMiIntersector1Pluecker<4 COMMA true> > >));
+
+    IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN1 COMMA false COMMA SubGridIntersector1Moeller<4 COMMA true> >));
+    IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Moeller,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA true COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >)); // NOTE(review): registered under a "Moeller" name but instantiates the Pluecker MB intersector with robust=true - confirm this is intended
+
+    IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN1 COMMA true COMMA SubGridIntersector1Pluecker<4 COMMA true> >));
+    //IF_ENABLED_GRIDS(DEFINE_INTERSECTOR1(BVH4GridMBIntersector1Pluecker,BVHNIntersector1<4 COMMA BVH_AN2_AN4D COMMA false COMMA SubGridMBIntersector1Pluecker<4 COMMA true> >));
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h
new file mode 100644
index 0000000000..d764cc928d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_hybrid.h
@@ -0,0 +1,61 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../common/ray.h"
+#include "../common/stack_item.h"
+#include "node_intersector_frustum.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    template<int K, bool robust>
+    struct TravRayK;
+
+    /*! BVH hybrid packet intersector. Switches between packet and single ray traversal (optional). */
+    template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK, bool single = true> // 'single' defaults to true; BVHNIntersectorKChunk instantiates this class with false
+    class BVHNIntersectorKHybrid
+    {
+      /* right now AVX512KNL SIMD extension only for standard node types */
+      static const size_t Nx = types == BVH_AN1 ? vextend<N>::size : N; // vextend<N>::size only for BVH_AN1, otherwise N
+
+      /* shortcuts for frequently used types */
+      typedef typename PrimitiveIntersectorK::Precalculations Precalculations;
+      typedef typename PrimitiveIntersectorK::Primitive Primitive;
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::BaseNode BaseNode;
+      typedef typename BVH::AABBNode AABBNode;
+      
+      static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth+3; // +3 due to 16-wide store
+      static const size_t stackSizeChunk = 1+(N-1)*BVH::maxDepth; // same formula without the +3 padding
+
+      static const size_t switchThresholdIncoherent = \
+      (K==4)  ? 3 : // value depends on K (and on N when K==8); presumably the threshold for switching to single-ray traversal - confirm in the implementation
+      (K==8)  ? ((N==4) ? 5 : 7) :
+      (K==16) ? 14 : // 14 seems to work best for KNL due to better ordered chunk traversal
+      0;
+
+    private:
+      static void intersect1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre,
+                             RayHitK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context); // presumably traverses only ray 'k' of the packet - confirm in the implementation
+      static bool occluded1(Accel::Intersectors* This, const BVH* bvh, NodeRef root, size_t k, Precalculations& pre,
+                            RayK<K>& ray, const TravRayK<K, robust>& tray, IntersectContext* context); // occlusion counterpart of intersect1; returns bool
+
+    public:
+      static void intersect(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context); // 'valid' is a vint<K>, presumably a per-ray activity mask
+      static void occluded (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context);
+
+      static void intersectCoherent(vint<K>* valid, Accel::Intersectors* This, RayHitK<K>& ray, IntersectContext* context); // same signature as intersect()
+      static void occludedCoherent (vint<K>* valid, Accel::Intersectors* This, RayK<K>& ray, IntersectContext* context); // same signature as occluded()
+
+    };
+
+    /*! BVH packet intersector: BVHNIntersectorKHybrid instantiated with single = false; adds no members of its own. */
+    template<int N, int K, int types, bool robust, typename PrimitiveIntersectorK>
+    class BVHNIntersectorKChunk : public BVHNIntersectorKHybrid<N, K, types, robust, PrimitiveIntersectorK, false> {};
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h
new file mode 100644
index 0000000000..83d1fb4d3d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream.h
@@ -0,0 +1,295 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector_packet_stream.h"
+#include "node_intersector_frustum.h"
+#include "bvh_traverser_stream.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    /*! BVH ray stream intersector. */
+    template<int N, int Nx, int types, bool robust, typename PrimitiveIntersector>
+    class BVHNIntersectorStream
+    {
+      static const int Nxd = (Nx == N) ? N : Nx/2;
+
+      /* shortcuts for frequently used types */
+      template<int K> using PrimitiveIntersectorK = typename PrimitiveIntersector::template Type<K>;
+      template<int K> using PrimitiveK = typename PrimitiveIntersectorK<K>::PrimitiveK;
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::BaseNode BaseNode;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::AABBNodeMB AABBNodeMB;
+      /*! Builds the per-packet traversal rays in packets[] and one frustum bounding all valid rays; returns the active-ray bit mask (lane l of packet i is bit i*K+l) and sets commonOctant when all valid rays agree in the sign of rdir on every axis. */
+      template<int K>
+      __forceinline static size_t initPacketsAndFrustum(RayK<K>** inputPackets, size_t numOctantRays,
+                                                        TravRayKStream<K, robust>* packets, Frustum<robust>& frustum, bool& commonOctant)
+      {
+        const size_t numPackets = (numOctantRays+K-1)/K; // number of K-wide packets, rounded up
+        /* running min/max accumulators, seeded with +inf/-inf so the first valid lane replaces them */
+        Vec3vf<K> tmp_min_rdir(pos_inf);
+        Vec3vf<K> tmp_max_rdir(neg_inf);
+        Vec3vf<K> tmp_min_org(pos_inf);
+        Vec3vf<K> tmp_max_org(neg_inf);
+        vfloat<K> tmp_min_dist(pos_inf);
+        vfloat<K> tmp_max_dist(neg_inf);
+
+        size_t m_active = 0;
+        for (size_t i = 0; i < numPackets; i++)
+        {
+          const vfloat<K> tnear = inputPackets[i]->tnear();
+          const vfloat<K> tfar  = inputPackets[i]->tfar;
+          vbool<K> m_valid = (tnear <= tfar) & (tnear >= 0.0f); // lane is valid when 0 <= tnear <= tfar
+          /* optionally also require the packet's own valid() test to pass */
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+          m_valid &= inputPackets[i]->valid();
+#endif
+          /* store the valid lanes of packet i at bits [i*K, i*K+K); assumes numPackets*K fits into size_t -- TODO confirm with callers */
+          m_active |= (size_t)movemask(m_valid) << (i*K);
+          /* tnear is clamped to zero; invalid lanes get tfar = -inf */
+          vfloat<K> packet_min_dist = max(tnear, 0.0f);
+          vfloat<K> packet_max_dist = select(m_valid, tfar, neg_inf);
+          tmp_min_dist = min(tmp_min_dist, packet_min_dist);
+          tmp_max_dist = max(tmp_max_dist, packet_max_dist);
+
+          const Vec3vf<K>& org = inputPackets[i]->org;
+          const Vec3vf<K>& dir = inputPackets[i]->dir;
+          /* construct the traversal packet in place in the caller-provided array */
+          new (&packets[i]) TravRayKStream<K, robust>(org, dir, packet_min_dist, packet_max_dist);
+          /* accumulate rdir and origin bounds over the valid lanes only */
+          tmp_min_rdir = min(tmp_min_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(pos_inf)));
+          tmp_max_rdir = max(tmp_max_rdir, select(m_valid, packets[i].rdir, Vec3vf<K>(neg_inf)));
+          tmp_min_org  = min(tmp_min_org , select(m_valid,org , Vec3vf<K>(pos_inf)));
+          tmp_max_org  = max(tmp_max_org , select(m_valid,org , Vec3vf<K>(neg_inf)));
+        }
+        /* drop bits beyond numOctantRays; the full-width case is handled separately so the shift count never equals the bit width of size_t */
+        m_active &= (numOctantRays == (8 * sizeof(size_t))) ? (size_t)-1 : (((size_t)1 << numOctantRays)-1);
+
+        /* reduce the K-wide accumulators to scalars */
+        const Vec3fa reduced_min_rdir(reduce_min(tmp_min_rdir.x),
+                                      reduce_min(tmp_min_rdir.y),
+                                      reduce_min(tmp_min_rdir.z));
+
+        const Vec3fa reduced_max_rdir(reduce_max(tmp_max_rdir.x),
+                                      reduce_max(tmp_max_rdir.y),
+                                      reduce_max(tmp_max_rdir.z));
+
+        const Vec3fa reduced_min_origin(reduce_min(tmp_min_org.x),
+                                        reduce_min(tmp_min_org.y),
+                                        reduce_min(tmp_min_org.z));
+
+        const Vec3fa reduced_max_origin(reduce_max(tmp_max_org.x),
+                                        reduce_max(tmp_max_org.y),
+                                        reduce_max(tmp_max_org.z));
+        /* common octant: on each axis either every rdir is negative or every rdir is non-negative */
+        commonOctant =
+          (reduced_max_rdir.x < 0.0f || reduced_min_rdir.x >= 0.0f) &&
+          (reduced_max_rdir.y < 0.0f || reduced_min_rdir.y >= 0.0f) &&
+          (reduced_max_rdir.z < 0.0f || reduced_min_rdir.z >= 0.0f);
+        
+        const float frustum_min_dist = reduce_min(tmp_min_dist);
+        const float frustum_max_dist = reduce_max(tmp_max_dist);
+
+        frustum.init(reduced_min_origin, reduced_max_origin,
+                     reduced_min_rdir, reduced_max_rdir,
+                     frustum_min_dist, frustum_max_dist,
+                     N);
+        
+        return m_active;
+      }
+      /*! Intersects child boxID of node with every packet from the first to the last one that holds an active ray; returns the hit bits with packet i shifted to bit i*K. The result is not masked with m_active here. */
+      template<int K>
+      __forceinline static size_t intersectAABBNodePacket(size_t m_active,
+                                                             const TravRayKStream<K,robust>* packets,
+                                                             const AABBNode* __restrict__ node,
+                                                             size_t boxID,
+                                                             const NearFarPrecalculations& nf)
+      {
+        assert(m_active); // at least one ray has to be active
+        const size_t startPacketID = bsf(m_active) / K; // packet containing the first set bit reported by bsf
+        const size_t endPacketID   = bsr(m_active) / K; // packet containing the set bit reported by bsr
+        size_t m_trav_active = 0;
+        for (size_t i = startPacketID; i <= endPacketID; i++)
+        {
+          const size_t m_hit = intersectNodeK<N>(node, boxID, packets[i], nf);
+          m_trav_active |= m_hit << (i*K); // place the packet's lane mask at the packet's bit offset
+        } 
+        return m_trav_active;
+      }
+      /*! Coherent traversal step: tests node against the frustum and against the first active ray, re-tests with all packets the children on which the two disagree, and returns the resulting child mask; maskK[boxID] is written only for re-tested children. */
+      template<int K>
+      __forceinline static size_t traverseCoherentStream(size_t m_active,
+                                                         TravRayKStream<K, robust>* packets,
+                                                         const AABBNode* __restrict__ node,
+                                                         const Frustum<robust>& frustum,
+                                                         size_t* maskK,
+                                                         vfloat<Nx>& dist)
+      {
+        size_t m_node_hit = intersectNodeFrustum<N,Nx>(node, frustum, dist);
+        const size_t first_index    = bsf(m_active);
+        const size_t first_packetID = first_index / K;
+        const size_t first_rayID    = first_index % K;
+        size_t m_first_hit = intersectNode1<N,Nx>(node, packets[first_packetID], first_rayID, frustum.nf);
+
+        /* children where the frustum test and the first active ray disagree are checked with all packets; this makes traversal independent of the ordering of rays */
+        size_t m_node = m_node_hit ^ m_first_hit;
+        while (unlikely(m_node))
+        {
+          const size_t boxID = bscf(m_node); // next disagreeing child; m_node shrinks with each call, which ends the loop
+          const size_t m_current = m_active & intersectAABBNodePacket(m_active, packets, node, boxID, frustum.nf);
+          m_node_hit ^= m_current ? (size_t)0 : ((size_t)1 << boxID); // flip the child's bit when no active ray hits it
+          maskK[boxID] = m_current;
+        }
+        return m_node_hit;
+      }
+      /*! Incoherent traversal step (TravRayKStreamFast packets): tests each active ray against all children of node and returns, per child lane, the OR of shiftTable[rayID] over the rays that hit that child. */
+      // TODO: explicit 16-wide path for KNL
+      template<int K>
+      __forceinline static vint<Nx> traverseIncoherentStream(size_t m_active,
+                                                             TravRayKStreamFast<K>* __restrict__ packets,
+                                                             const AABBNode* __restrict__ node,
+                                                             const NearFarPrecalculations& nf,
+                                                             const int shiftTable[32])
+      {
+        const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); // nf.* are byte offsets from lower_x selecting the near/far bound vector
+        const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+        const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+        const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+        const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+        const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+        assert(m_active);
+        vint<Nx> vmask(zero);
+        do
+        {   
+          STAT3(shadow.trav_nodes,1,1,1);
+          const size_t rayID = bscf(m_active); // next active ray; the loop ends once m_active is exhausted
+          assert(rayID < MAX_INTERNAL_STREAM_SIZE);
+          TravRayKStream<K,robust> &p = packets[rayID / K]; // NOTE(review): bound to a TravRayKStreamFast<K> element -- presumably the same type when robust is false; confirm
+          const size_t i = rayID % K;
+          const vint<Nx> bitmask(shiftTable[rayID]);
+          /* per-axis slab distances of ray i, broadcast against all child boxes; the aarch64 path adds neg_org_rdir via madd, the other path subtracts org_rdir via msub */
+#if defined (__aarch64__)
+          const vfloat<Nx> tNearX = madd(bminX, p.rdir.x[i], p.neg_org_rdir.x[i]);
+          const vfloat<Nx> tNearY = madd(bminY, p.rdir.y[i], p.neg_org_rdir.y[i]);
+          const vfloat<Nx> tNearZ = madd(bminZ, p.rdir.z[i], p.neg_org_rdir.z[i]);
+          const vfloat<Nx> tFarX  = madd(bmaxX, p.rdir.x[i], p.neg_org_rdir.x[i]);
+          const vfloat<Nx> tFarY  = madd(bmaxY, p.rdir.y[i], p.neg_org_rdir.y[i]);
+          const vfloat<Nx> tFarZ  = madd(bmaxZ, p.rdir.z[i], p.neg_org_rdir.z[i]); 
+#else
+          const vfloat<Nx> tNearX = msub(bminX, p.rdir.x[i], p.org_rdir.x[i]);
+          const vfloat<Nx> tNearY = msub(bminY, p.rdir.y[i], p.org_rdir.y[i]);
+          const vfloat<Nx> tNearZ = msub(bminZ, p.rdir.z[i], p.org_rdir.z[i]);
+          const vfloat<Nx> tFarX  = msub(bmaxX, p.rdir.x[i], p.org_rdir.x[i]);
+          const vfloat<Nx> tFarY  = msub(bmaxY, p.rdir.y[i], p.org_rdir.y[i]);
+          const vfloat<Nx> tFarZ  = msub(bmaxZ, p.rdir.z[i], p.org_rdir.z[i]); 
+#endif
+
+          const vfloat<Nx> tNear  = maxi(tNearX, tNearY, tNearZ, vfloat<Nx>(p.tnear[i]));
+          const vfloat<Nx> tFar   = mini(tFarX , tFarY , tFarZ,  vfloat<Nx>(p.tfar[i]));      
+          /* OR the ray's bit into every child lane with tNear <= tFar; the update is spelled differently per ISA */
+#if defined(__AVX512ER__)
+          const vboolx m_node((1 << N)-1);
+          const vbool<Nx> hit_mask = le(m_node, tNear, tFar);
+          vmask = mask_or(hit_mask, vmask, vmask, bitmask);
+#else
+          const vbool<Nx> hit_mask = tNear <= tFar;
+#if defined(__AVX2__)
+          vmask = vmask | (bitmask & vint<Nx>(hit_mask));
+#else
+          vmask = select(hit_mask, vmask | bitmask, vmask);
+#endif
+#endif
+        } while(m_active);
+        return vmask;        
+      }
+      /*! Incoherent traversal step (TravRayKStreamRobust packets): same per-ray child test as the fast overload, but computes (bound - org) * rdir directly and loosens the comparison by 2 ulp on each side. */
+      template<int K>
+      __forceinline static vint<Nx> traverseIncoherentStream(size_t m_active,
+                                                             TravRayKStreamRobust<K>* __restrict__ packets,
+                                                             const AABBNode* __restrict__ node,
+                                                             const NearFarPrecalculations& nf,
+                                                             const int shiftTable[32])
+      {
+        const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); // nf.* are byte offsets from lower_x selecting the near/far bound vector
+        const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+        const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+        const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+        const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+        const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+        assert(m_active);
+        vint<Nx> vmask(zero);
+        do
+        {   
+          STAT3(shadow.trav_nodes,1,1,1);
+          const size_t rayID = bscf(m_active); // next active ray; the loop ends once m_active is exhausted
+          assert(rayID < MAX_INTERNAL_STREAM_SIZE);
+          TravRayKStream<K,robust> &p = packets[rayID / K]; // NOTE(review): bound to a TravRayKStreamRobust<K> element -- presumably the same type when robust is true; confirm
+          const size_t i = rayID % K;
+          const vint<Nx> bitmask(shiftTable[rayID]);
+          const vfloat<Nx> tNearX = (bminX - p.org.x[i]) * p.rdir.x[i];
+          const vfloat<Nx> tNearY = (bminY - p.org.y[i]) * p.rdir.y[i];
+          const vfloat<Nx> tNearZ = (bminZ - p.org.z[i]) * p.rdir.z[i];
+          const vfloat<Nx> tFarX  = (bmaxX - p.org.x[i]) * p.rdir.x[i];
+          const vfloat<Nx> tFarY  = (bmaxY - p.org.y[i]) * p.rdir.y[i];
+          const vfloat<Nx> tFarZ  = (bmaxZ - p.org.z[i]) * p.rdir.z[i];
+          const vfloat<Nx> tNear  = maxi(tNearX, tNearY, tNearZ, vfloat<Nx>(p.tnear[i]));
+          const vfloat<Nx> tFar   = mini(tFarX , tFarY , tFarZ,  vfloat<Nx>(p.tfar[i]));
+          const float round_down  = 1.0f-2.0f*float(ulp); // scale factor applied to tNear in the comparison below
+          const float round_up    = 1.0f+2.0f*float(ulp); // scale factor applied to tFar in the comparison below
+#if defined(__AVX512ER__)
+          const vboolx m_node((1 << N)-1);
+          const vbool<Nx> hit_mask = le(m_node, round_down*tNear, round_up*tFar);
+          vmask = mask_or(hit_mask, vmask, vmask, bitmask);
+#else
+          const vbool<Nx> hit_mask = round_down*tNear <= round_up*tFar;
+#if defined(__AVX2__)
+          vmask = vmask | (bitmask & vint<Nx>(hit_mask));
+#else
+          vmask = select(hit_mask, vmask | bitmask, vmask);
+#endif
+#endif
+        } while(m_active);
+        return vmask;
+      }
+                                                         
+
+      static const size_t stackSizeSingle = 1+(N-1)*BVH::maxDepth;
+
+    public:
+      static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
+      static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);
+
+    private:
+      template<int K>
+      static void intersectCoherent(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);
+
+      template<int K>
+      static void occludedCoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
+
+      template<int K>
+      static void occludedIncoherent(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
+    };
+
+
+    /*! BVH ray stream intersector with direct fallback to packets. Only declarations live in this header. */
+    template<int N, int Nx>
+    class BVHNIntersectorStreamPacketFallback
+    {
+    public: // stream entry points: numRays rays are passed through inputRays
+      static void intersect(Accel::Intersectors* This, RayHitN** inputRays, size_t numRays, IntersectContext* context);
+      static void occluded (Accel::Intersectors* This, RayN** inputRays, size_t numRays, IntersectContext* context);
+      /* K-wide packet variants of the two entry points above */
+    private:
+      template<int K>
+      static void intersectK(Accel::Intersectors* This, RayHitK<K>** inputRays, size_t numRays, IntersectContext* context);
+
+      template<int K>
+      static void occludedK(Accel::Intersectors* This, RayK<K>** inputRays, size_t numRays, IntersectContext* context);
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h
new file mode 100644
index 0000000000..cdeb923637
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_intersector_stream_filters.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/ray.h"
+#include "../common/scene.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    class RayStreamFilter // static entry points for ray streams: one intersect/occluded pair per input layout
+    {
+    public: // layouts by parameter shape: AOS = strided array, AOP = array of pointers, SOA = packets in a byte buffer, SOP = RTCRayHitNp/RTCRayNp
+      static void intersectAOS(Scene* scene, RTCRayHit* rays, size_t N, size_t stride, IntersectContext* context);
+      static void intersectAOP(Scene* scene, RTCRayHit** rays, size_t N, IntersectContext* context);
+      static void intersectSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
+      static void intersectSOP(Scene* scene, const RTCRayHitNp* rays, size_t N, IntersectContext* context);
+      /* occlusion counterparts of the four entry points above (RTCRay-based instead of RTCRayHit-based) */
+      static void occludedAOS(Scene* scene, RTCRay* rays, size_t N, size_t stride, IntersectContext* context);
+      static void occludedAOP(Scene* scene, RTCRay** rays, size_t N, IntersectContext* context);
+      static void occludedSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
+      static void occludedSOP(Scene* scene, const RTCRayNp* rays, size_t N, IntersectContext* context);
+      /* per-layout helpers templated on packet width K; NOTE(review): the intersect flag presumably selects intersect vs. occluded -- confirm in the implementation file */
+    private:
+      template<int K, bool intersect>
+      static void filterAOS(Scene* scene, void* rays, size_t N, size_t stride, IntersectContext* context);
+
+      template<int K, bool intersect>
+      static void filterAOP(Scene* scene, void** rays, size_t N, IntersectContext* context);
+
+      template<int K, bool intersect>
+      static void filterSOA(Scene* scene, char* rays, size_t N, size_t numPackets, size_t stride, IntersectContext* context);
+
+      template<int K, bool intersect>
+      static void filterSOP(Scene* scene, const void* rays, size_t N, IntersectContext* context);
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h
new file mode 100644
index 0000000000..baa4a8d805
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb.h
@@ -0,0 +1,213 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  /*! BVHN AABBNode */
+  template<typename NodeRef, int N>
+    struct AABBNode_t : public BaseNode_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    /*! Builder callback: allocates an AABBNode_t through alloc.malloc0 (aligned to NodeRef::byteNodeAlignment), clears it and returns its encoded reference; numChildren is not used. */
+    struct Create
+    {
+      __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc, size_t numChildren = 0) const
+      {
+        AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t),NodeRef::byteNodeAlignment); node->clear();
+        return NodeRef::encodeNode(node);
+      }
+    };
+    /*! Builder callback: writes the reference and the bounds of one child into slot i of an AABB node. */
+    struct Set
+    {
+      __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const BBox3fa& bounds) const
+      {
+        AABBNode_t* const target = node.getAABBNode(); target->setRef(i,child); target->setBounds(i,bounds);
+      }
+    };
+    /*! Builder callback: allocates and clears a node, then copies the bounds of the first num build records into child slots 0..num-1; child references are not set here. */
+    struct Create2
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const
+      {
+        AABBNode_t* node = (AABBNode_t*) alloc.malloc0(sizeof(AABBNode_t), NodeRef::byteNodeAlignment); node->clear();
+        for (size_t i=0; i<num; i++) node->setBounds(i,children[i].bounds());
+        return NodeRef::encodeNode(node);
+      }
+    };
+    /*! Builder callback: stores the num child references into the node behind ref and returns ref unchanged; precord and crecords are not used. */
+    struct Set2
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
+      {
+        AABBNode_t* node = ref.getAABBNode();
+        for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
+        return ref;
+      }
+    };
+    /*! Builder callback like Set2 that additionally, for records flagged with alloc_barrier, passes the record's PrimRef range to the allocator via addBlock. */
+    struct Set3
+    {
+      Set3 (FastAllocator* allocator, PrimRef* prims)
+      : allocator(allocator), prims(prims) {}
+      
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
+      {
+        AABBNode_t* node = ref.getAABBNode();
+        for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
+        /* register the byte range [begin,end) of this record's primitive references with the allocator */
+        if (unlikely(precord.alloc_barrier))
+        {
+          PrimRef* begin = &prims[precord.prims.begin()];
+          PrimRef* end   = &prims[precord.prims.end()]; // FIXME: extended end for spatial split builder!!!!!
+          size_t bytes = (size_t)end - (size_t)begin;
+          allocator->addBlock(begin,bytes);
+        }
+        
+        return ref;
+      }
+      
+      FastAllocator* const allocator; // receives the addBlock calls
+      PrimRef* const prims;           // base of the array indexed by precord.prims.begin()/end()
+    };
+    
+    /*! Clears the node: all child bounds become empty (lower = +inf, upper = -inf) and the base node is cleared. */
+    __forceinline void clear() {
+      lower_x = lower_y = lower_z = pos_inf;
+      upper_x = upper_y = upper_z = neg_inf;
+      BaseNode_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets the reference (ID) of child i; the child's bounds are left untouched. */
+    __forceinline void setRef(size_t i, const NodeRef& ref) {
+      assert(i < N);
+      children[i] = ref;
+    }
+    
+    /*! Sets bounding box of child i by writing its six components into lane i of lower_x..upper_z. */
+    __forceinline void setBounds(size_t i, const BBox3fa& bounds)
+    {
+      assert(i < N);
+      lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z;
+      upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z;
+    }
+    
+    /*! Stores both the bounds and the reference of child i. */
+    __forceinline void set(size_t i, const NodeRef& ref, const BBox3fa& bounds)
+    {
+      setBounds(i,bounds); setRef(i,ref);
+    }
+    
+    /*! Returns the box enclosing all children: per-axis minimum of the lower and maximum of the upper bounds. */
+    __forceinline BBox3fa bounds() const
+    {
+      return BBox3fa(Vec3fa(reduce_min(lower_x),reduce_min(lower_y),reduce_min(lower_z)),
+                     Vec3fa(reduce_max(upper_x),reduce_max(upper_y),reduce_max(upper_z)));
+    }
+    
+    /*! Returns the bounding box of child i. */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      assert(i < N);
+      /* gather lane i of each of the six bound vectors */
+      return BBox3fa(Vec3fa(lower_x[i],lower_y[i],lower_z[i]),
+                     Vec3fa(upper_x[i],upper_y[i],upper_z[i]));
+    }
+    
+    /*! Returns the extent of the bounds of child i, i.e. bounds(i).size(). */
+    __forceinline Vec3fa extend(size_t i) const {
+      return bounds(i).size();
+    }
+    
+    /*! Returns bounds of all children (implemented later as specializations) */
+    __forceinline void bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const;
+    
+    /*! Swaps children i and j of this node: the references and all six bound components. */
+    __forceinline void swap(size_t i, size_t j)
+    {
+      assert(i<N && j<N);
+      std::swap(children[i],children[j]);
+      std::swap(lower_x[i],lower_x[j]);
+      std::swap(lower_y[i],lower_y[j]);
+      std::swap(lower_z[i],lower_z[j]);
+      std::swap(upper_x[i],upper_x[j]);
+      std::swap(upper_y[i],upper_y[j]);
+      std::swap(upper_z[i],upper_z[j]);
+    }
+
+    /*! Swaps child i of node a with child j of node b: the references and all six bound components. */
+    __forceinline static void swap(AABBNode_t* a, size_t i, AABBNode_t* b, size_t j)
+    {
+      assert(i<N && j<N);
+      std::swap(a->children[i],b->children[j]);
+      std::swap(a->lower_x[i],b->lower_x[j]);
+      std::swap(a->lower_y[i],b->lower_y[j]);
+      std::swap(a->lower_z[i],b->lower_z[j]);
+      std::swap(a->upper_x[i],b->upper_x[j]);
+      std::swap(a->upper_y[i],b->upper_y[j]);
+      std::swap(a->upper_z[i],b->upper_z[j]);
+    }
+
+    /*! Compacts a node: moves empty children to the end by swapping each empty slot with the right-most filled one. */
+    __forceinline static void compact(AABBNode_t* a)
+    {
+      /* find right most filled node (j ends at -1 when every child is empty) */
+      ssize_t j=N;
+      for (j=j-1; j>=0; j--)
+        if (a->child(j) != NodeRef::emptyNode)
+          break;
+
+      /* replace empty nodes with filled nodes, walking i up from the left and j down from the right */
+      for (ssize_t i=0; i<j; i++) {
+        if (a->child(i) == NodeRef::emptyNode) {
+          a->swap(i,j);
+          for (j=j-1; j>i; j--) // advance j to the next filled slot to the right of i, if any
+            if (a->child(j) != NodeRef::emptyNode)
+              break;
+        }
+      }
+    }
+    
+    /*! Returns reference to specified child (mutable and const overloads; i is range-checked by assert). */
+    __forceinline       NodeRef& child(size_t i)       { assert(i<N); return children[i]; }
+    __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+    
+    /*! Output operator: prints the six bound vectors followed by all N child references. */
+    friend embree_ostream operator<<(embree_ostream o, const AABBNode_t& n)
+    {
+      o << "AABBNode { " << embree_endl;
+      o << "  lower_x " << n.lower_x << embree_endl;
+      o << "  upper_x " << n.upper_x << embree_endl;
+      o << "  lower_y " << n.lower_y << embree_endl;
+      o << "  upper_y " << n.upper_y << embree_endl;
+      o << "  lower_z " << n.lower_z << embree_endl;
+      o << "  upper_z " << n.upper_z << embree_endl;
+      o << "  children = ";
+      for (size_t i=0; i<N; i++) o << n.children[i] << " ";
+      o << embree_endl;
+      o << "}" << embree_endl;
+      return o;
+    }
+    
+  public:
+    vfloat<N> lower_x;           //!< X dimension of lower bounds of all N children.
+    vfloat<N> upper_x;           //!< X dimension of upper bounds of all N children.
+    vfloat<N> lower_y;           //!< Y dimension of lower bounds of all N children.
+    vfloat<N> upper_y;           //!< Y dimension of upper bounds of all N children.
+    vfloat<N> lower_z;           //!< Z dimension of lower bounds of all N children.
+    vfloat<N> upper_z;           //!< Z dimension of upper bounds of all N children.
+  };
+  /*! 4-wide specialization of the all-children bounds query: transposes the per-axis vectors (with a zero vector as fourth input) into one lower and one upper vector per child. */
+  template<>
+    __forceinline void AABBNode_t<NodeRefPtr<4>,4>::bounds(BBox<vfloat4>& bounds0, BBox<vfloat4>& bounds1, BBox<vfloat4>& bounds2, BBox<vfloat4>& bounds3) const {
+    transpose(lower_x,lower_y,lower_z,vfloat4(zero),bounds0.lower,bounds1.lower,bounds2.lower,bounds3.lower);
+    transpose(upper_x,upper_y,upper_z,vfloat4(zero),bounds0.upper,bounds1.upper,bounds2.upper,bounds3.upper);
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h
new file mode 100644
index 0000000000..501f4bce5b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb.h
@@ -0,0 +1,247 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  /*! Motion Blur AABBNode */
+  template<typename NodeRef, int N>
+    struct AABBNodeMB_t : public BaseNode_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    typedef BVHNodeRecord<NodeRef>     NodeRecord;
+    typedef BVHNodeRecordMB<NodeRef>   NodeRecordMB;
+    typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+    /*! Builder callback: allocates an AABBNodeMB_t through alloc.malloc0, clears it and returns its encoded reference; children and num are not used. */
+    struct Create
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (BuildRecord* children, const size_t num, const FastAllocator::CachedAllocator& alloc) const
+      {
+        AABBNodeMB_t* node = (AABBNodeMB_t*) alloc.malloc0(sizeof(AABBNodeMB_t),NodeRef::byteNodeAlignment); node->clear();
+        return NodeRef::encodeNode(node);
+      }
+    };
+    /*! Builder callback: stores reference and linear bounds of each of the num children and returns a record holding ref and the union of the children's linear bounds. */
+    struct Set
+    { 
+      template<typename BuildRecord>
+      __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const
+      {
+        AABBNodeMB_t* node = ref.getAABBNodeMB();
+        
+        LBBox3fa bounds = empty; // accumulates the linear bounds of all children
+        for (size_t i=0; i<num; i++) {
+          node->setRef(i,children[i].ref);
+          node->setBounds(i,children[i].lbounds);
+          bounds.extend(children[i].lbounds);
+        }
+        return NodeRecordMB(ref,bounds);
+      }
+    };
+    /*! Builder callback like Set, except that each child's bounds are stored via setBounds(i, lbounds, tbounds), i.e. after lbounds.global(tbounds); the returned record still holds the union of the unmodified child bounds. */
+    struct SetTimeRange
+    {
+      __forceinline SetTimeRange(BBox1f tbounds) : tbounds(tbounds) {}
+      
+      template<typename BuildRecord>
+      __forceinline NodeRecordMB operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRecordMB* children, const size_t num) const
+      {
+        AABBNodeMB_t* node = ref.getAABBNodeMB();
+        
+        LBBox3fa bounds = empty; // accumulates the linear bounds of all children
+        for (size_t i=0; i<num; i++) {
+          node->setRef(i, children[i].ref);
+          node->setBounds(i, children[i].lbounds, tbounds);
+          bounds.extend(children[i].lbounds);
+        }
+        return NodeRecordMB(ref,bounds);
+      }
+      
+      BBox1f tbounds; // time range handed to setBounds for every child
+    };
+    
+    /*! Clears the node: child bounds become empty (lower = +inf, upper = -inf), all deltas become zero, and the base node is cleared. */
+    __forceinline void clear()  {
+      lower_x = lower_y = lower_z = vfloat<N>(pos_inf);
+      upper_x = upper_y = upper_z = vfloat<N>(neg_inf);
+      lower_dx = lower_dy = lower_dz = vfloat<N>(0.0f);
+      upper_dx = upper_dy = upper_dz = vfloat<N>(0.0f);
+      BaseNode_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets the reference (ID) of child i; i must be a valid child slot (checked by assert, as in child() and swap()). */
+    __forceinline void setRef(size_t i, NodeRef ref) {
+      assert(i < N); children[i] = ref;
+    }
+    
+    /*! Sets the bounds of child i from its box at time 0 (bounds0_i) and time 1 (bounds1_i): stores the enlarged time-0 box and the per-component difference to the enlarged time-1 box. */
+    __forceinline void setBounds(size_t i, const BBox3fa& bounds0_i, const BBox3fa& bounds1_i)
+    {
+      /*! for empty bounds we have to avoid inf-inf=nan, so infinities are clamped to +/-FLT_MAX first */
+      BBox3fa bounds0(min(bounds0_i.lower,Vec3fa(+FLT_MAX)),max(bounds0_i.upper,Vec3fa(-FLT_MAX)));
+      BBox3fa bounds1(min(bounds1_i.lower,Vec3fa(+FLT_MAX)),max(bounds1_i.upper,Vec3fa(-FLT_MAX)));
+      bounds0 = bounds0.enlarge_by(4.0f*float(ulp));
+      bounds1 = bounds1.enlarge_by(4.0f*float(ulp));
+      Vec3fa dlower = bounds1.lower-bounds0.lower; // delta of the lower corner from time 0 to time 1
+      Vec3fa dupper = bounds1.upper-bounds0.upper; // delta of the upper corner from time 0 to time 1
+      
+      lower_x[i] = bounds0.lower.x; lower_y[i] = bounds0.lower.y; lower_z[i] = bounds0.lower.z;
+      upper_x[i] = bounds0.upper.x; upper_y[i] = bounds0.upper.y; upper_z[i] = bounds0.upper.z;
+      
+      lower_dx[i] = dlower.x; lower_dy[i] = dlower.y; lower_dz[i] = dlower.z;
+      upper_dx[i] = dupper.x; upper_dy[i] = dupper.y; upper_dz[i] = dupper.z;
+    }
+    
+    /*! Sets the bounds of child i from a linear bounding box, using its bounds0 and bounds1 members. */
+    __forceinline void setBounds(size_t i, const LBBox3fa& bounds) {
+      setBounds(i, bounds.bounds0, bounds.bounds1);
+    }
+    
+    /*! Sets the bounds of child i to bounds.global(tbounds) (presumably the linear bounds re-expressed for the global time range; confirm in LBBox). */
+    __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds) {
+      setBounds(i, bounds.global(tbounds));
+    }
+    
+    /*! Sets the time-0 bounding box and the ID of child i. NOTE(review): the lower_d*/upper_d* deltas are not written here and keep their previous values -- confirm callers rely on a cleared node. */
+    __forceinline void set(size_t i, NodeRef ref, const BBox3fa& bounds) {
+      lower_x[i] = bounds.lower.x; lower_y[i] = bounds.lower.y; lower_z[i] = bounds.lower.z;
+      upper_x[i] = bounds.upper.x; upper_y[i] = bounds.upper.y; upper_z[i] = bounds.upper.z;
+      children[i] = ref;
+    }
+    
+    /*! Sets the bounds and ID of child i from a 4D node record: its ref, and its lbounds taken over its time range dt. */
+    __forceinline void set(size_t i, const NodeRecordMB4D& child)
+    {
+      setRef(i, child.ref);
+      setBounds(i, child.lbounds, child.dt);
+    }
+    
+    /*! Bounding box of child i at time 0, i.e. the stored base bounds without any delta applied. */
+    __forceinline BBox3fa bounds0(size_t i) const {
+      const Vec3fa lo(lower_x[i],lower_y[i],lower_z[i]), hi(upper_x[i],upper_y[i],upper_z[i]);
+      return BBox3fa(lo,hi);
+    }
+    
+    /*! Return bounding box of child i for time 1: the stored time-0 bounds plus the stored deltas. */
+    __forceinline BBox3fa bounds1(size_t i) const {
+      return BBox3fa(Vec3fa(lower_x[i]+lower_dx[i],lower_y[i]+lower_dy[i],lower_z[i]+lower_dz[i]),
+                     Vec3fa(upper_x[i]+upper_dx[i],upper_y[i]+upper_dy[i],upper_z[i]+upper_dz[i]));
+    }
+    
+    /*! Returns bounds of node: the box enclosing the time-0 and time-1 bounds of all children. */
+    __forceinline BBox3fa bounds() const {
+      return BBox3fa(Vec3fa(reduce_min(min(lower_x,lower_x+lower_dx)),
+                            reduce_min(min(lower_y,lower_y+lower_dy)),
+                            reduce_min(min(lower_z,lower_z+lower_dz))),
+                     Vec3fa(reduce_max(max(upper_x,upper_x+upper_dx)),
+                            reduce_max(max(upper_y,upper_y+upper_dy)),
+                            reduce_max(max(upper_z,upper_z+upper_dz))));
+    }
+    
+    /*! Return bounding box of child i: the merge of its time-0 and time-1 boxes. */
+    __forceinline BBox3fa bounds(size_t i) const {
+      return merge(bounds0(i),bounds1(i));
+    }
+    
+    /*! Return linear bounding box of child i, built from its time-0 and time-1 boxes. */
+    __forceinline LBBox3fa lbounds(size_t i) const {
+      return LBBox3fa(bounds0(i),bounds1(i));
+    }
+    
+    /*! Return bounding box of child i at specified time, obtained by lerp between the time-0 and time-1 boxes. */
+    __forceinline BBox3fa bounds(size_t i, float time) const {
+      return lerp(bounds0(i),bounds1(i),time);
+    }
+    
+    /*! Returns the expected half surface area of child i when randomly sampling the time (forwards to LBBox3fa::expectedHalfArea). */
+    __forceinline float expectedHalfArea(size_t i) const {
+      return lbounds(i).expectedHalfArea();
+    }
+    
+    /*! Returns the expected half surface area of child i for the time range t0t1 (presumably sampling time within that range; confirm in LBBox3fa::expectedHalfArea). */
+    __forceinline float expectedHalfArea(size_t i, const BBox1f& t0t1) const {
+      return lbounds(i).expectedHalfArea(t0t1); 
+    }
+    
+    /*! Swaps children i and j of this node: the references, the six time-0 bound components and the six delta components. */
+    __forceinline void swap(size_t i, size_t j)
+    {
+      assert(i<N && j<N);
+      std::swap(children[i],children[j]);
+      
+      std::swap(lower_x[i],lower_x[j]);
+      std::swap(upper_x[i],upper_x[j]);
+      std::swap(lower_y[i],lower_y[j]);
+      std::swap(upper_y[i],upper_y[j]);
+      std::swap(lower_z[i],lower_z[j]);
+      std::swap(upper_z[i],upper_z[j]);
+      
+      std::swap(lower_dx[i],lower_dx[j]);
+      std::swap(upper_dx[i],upper_dx[j]);
+      std::swap(lower_dy[i],lower_dy[j]);
+      std::swap(upper_dy[i],upper_dy[j]);
+      std::swap(lower_dz[i],lower_dz[j]);
+      std::swap(upper_dz[i],upper_dz[j]);
+    }
+
+    /*! Compacts a node: moves empty children to the end by swapping each empty slot with the right-most filled one. */
+    __forceinline static void compact(AABBNodeMB_t* a)
+    {
+      /* find right most filled node (j ends at -1 when every child is empty) */
+      ssize_t j=N;
+      for (j=j-1; j>=0; j--)
+        if (a->child(j) != NodeRef::emptyNode)
+          break;
+
+      /* replace empty nodes with filled nodes, walking i up from the left and j down from the right */
+      for (ssize_t i=0; i<j; i++) {
+        if (a->child(i) == NodeRef::emptyNode) {
+          a->swap(i,j);
+          for (j=j-1; j>i; j--) // advance j to the next filled slot to the right of i, if any
+            if (a->child(j) != NodeRef::emptyNode)
+              break;
+        }
+      }
+    }
+    
+    /*! Returns reference to specified child (mutable and const overloads; i is range-checked by assert). */
+    __forceinline       NodeRef& child(size_t i)       { assert(i<N); return children[i]; }
+    __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+    
+    /*! Stream output operator: prints the time-0 and time-1 bounding box of each of the N children. */
+    friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB_t& n) 
+    {
+      cout << "AABBNodeMB {" << embree_endl;
+      for (size_t i=0; i<N; i++) 
+      {
+        const BBox3fa b0 = n.bounds0(i);
+        const BBox3fa b1 = n.bounds1(i);
+        cout << "  child" << i << " { " << embree_endl;
+        cout << "    bounds0 = " << b0 << ", " << embree_endl;
+        cout << "    bounds1 = " << b1 << ", " << embree_endl;
+        cout << "  }"; // no line break is emitted after a child's closing brace
+      }
+      cout << "}";
+      return cout;
+    }
+    
+  public:
+    vfloat<N> lower_x;        //!< X dimension of lower bounds of all N children at time 0.
+    vfloat<N> upper_x;        //!< X dimension of upper bounds of all N children at time 0.
+    vfloat<N> lower_y;        //!< Y dimension of lower bounds of all N children at time 0.
+    vfloat<N> upper_y;        //!< Y dimension of upper bounds of all N children at time 0.
+    vfloat<N> lower_z;        //!< Z dimension of lower bounds of all N children at time 0.
+    vfloat<N> upper_z;        //!< Z dimension of upper bounds of all N children at time 0.
+    
+    vfloat<N> lower_dx;        //!< X delta (time 1 minus time 0) of lower bounds of all N children.
+    vfloat<N> upper_dx;        //!< X delta (time 1 minus time 0) of upper bounds of all N children.
+    vfloat<N> lower_dy;        //!< Y delta (time 1 minus time 0) of lower bounds of all N children.
+    vfloat<N> upper_dy;        //!< Y delta (time 1 minus time 0) of upper bounds of all N children.
+    vfloat<N> lower_dz;        //!< Z delta (time 1 minus time 0) of lower bounds of all N children.
+    vfloat<N> upper_dz;        //!< Z delta (time 1 minus time 0) of upper bounds of all N children.
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h
new file mode 100644
index 0000000000..e968bbbc39
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_aabb_mb4d.h
@@ -0,0 +1,107 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_aabb_mb.h"
+
+namespace embree
+{
+  /*! Aligned 4D Motion Blur Node: an AABBNodeMB_t that additionally stores a time range per child */
+  template<typename NodeRef, int N>
+    struct AABBNodeMB4D_t : public AABBNodeMB_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    using AABBNodeMB_t<NodeRef,N>::set; // keep the base class set() overloads visible next to the 4D overload below
+
+    typedef BVHNodeRecord<NodeRef>     NodeRecord;
+    typedef BVHNodeRecordMB<NodeRef>   NodeRecordMB;
+    typedef BVHNodeRecordMB4D<NodeRef> NodeRecordMB4D;
+    
+    struct Create // functor: allocates a node from alloc, clears it and returns its encoded reference
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (BuildRecord*, const size_t, const FastAllocator::CachedAllocator& alloc, bool hasTimeSplits = true) const
+      {
+        if (hasTimeSplits) // time ranges are needed: allocate the 4D node (with lower_t/upper_t)
+        {
+          AABBNodeMB4D_t* node = (AABBNodeMB4D_t*) alloc.malloc0(sizeof(AABBNodeMB4D_t),NodeRef::byteNodeAlignment); node->clear();
+          return NodeRef::encodeNode(node);
+        }
+        else // no time splits: allocate only the AABBNodeMB_t base node
+        {
+          AABBNodeMB_t<NodeRef,N>* node = (AABBNodeMB_t<NodeRef,N>*) alloc.malloc0(sizeof(AABBNodeMB_t<NodeRef,N>),NodeRef::byteNodeAlignment); node->clear();
+          return NodeRef::encodeNode(node);
+        }
+      }
+    };
+
+    struct Set // functor: writes num child records into the node referenced by ref
+    {
+      template<typename BuildRecord>
+      __forceinline void operator() (const BuildRecord&, const BuildRecord*, NodeRef ref, NodeRecordMB4D* children, const size_t num) const
+      {
+        if (likely(ref.isAABBNodeMB())) { // ref was encoded as a plain motion blur node
+          for (size_t i=0; i<num; i++)
+            ref.getAABBNodeMB()->set(i, children[i]);
+        } else { // otherwise ref is accessed as a 4D motion blur node
+          for (size_t i=0; i<num; i++)
+            ref.getAABBNodeMB4D()->set(i, children[i]);
+        }
+      }
+    };
+
+    /*! Clears the node: time ranges become [pos_inf,neg_inf], then the base node is cleared. */
+    __forceinline void clear()  {
+      lower_t = vfloat<N>(pos_inf);
+      upper_t = vfloat<N>(neg_inf);
+      AABBNodeMB_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets bounding box and time range of child i. */
+    __forceinline void setBounds(size_t i, const LBBox3fa& bounds, const BBox1f& tbounds)
+    {
+      AABBNodeMB_t<NodeRef,N>::setBounds(i, bounds.global(tbounds));
+      lower_t[i] = tbounds.lower;
+      upper_t[i] = tbounds.upper == 1.0f ? 1.0f+float(ulp) : tbounds.upper; // NOTE(review): presumably widened so that time 1.0 still lies inside the range - confirm
+    }
+    
+    /*! Sets bounding box, time range and ID of child i from a 4D node record. */
+    __forceinline void set(size_t i, const NodeRecordMB4D& child) {
+      AABBNodeMB_t<NodeRef,N>::setRef(i,child.ref);
+      setBounds(i, child.lbounds, child.dt);
+    }
+    
+    /*! Returns the expected surface area when randomly sampling the time. */
+    __forceinline float expectedHalfArea(size_t i) const {
+      return AABBNodeMB_t<NodeRef,N>::lbounds(i).expectedHalfArea(timeRange(i));
+    }
+    
+    /*! returns time range [lower_t[i],upper_t[i]] for specified child */
+    __forceinline BBox1f timeRange(size_t i) const {
+      return BBox1f(lower_t[i],upper_t[i]);
+    }
+    
+    /*! stream output operator: prints one block per child, each terminated by a line break */
+    friend embree_ostream operator<<(embree_ostream cout, const AABBNodeMB4D_t& n) 
+    {
+      cout << "AABBNodeMB4D {" << embree_endl;
+      for (size_t i=0; i<N; i++) 
+      {
+        const BBox3fa b0 = n.bounds0(i);
+        const BBox3fa b1 = n.bounds1(i);
+        cout << "  child" << i << " { " << embree_endl;
+        cout << "    bounds0 = " << lerp(b0,b1,n.lower_t[i]) << ", " << embree_endl;
+        cout << "    bounds1 = " << lerp(b0,b1,n.upper_t[i]) << ", " << embree_endl;
+        cout << "    time_bounds = " << n.lower_t[i] << ", " << n.upper_t[i] << embree_endl;
+        cout << "  }" << embree_endl; // without the line break the next "  child" header is glued to this brace
+      }
+      cout << "}";
+      return cout;
+    }
+    
+  public:
+    vfloat<N> lower_t;        //!< time dimension of lower bounds of all N children
+    vfloat<N> upper_t;        //!< time dimension of upper bounds of all N children
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h
new file mode 100644
index 0000000000..8268f3b932
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_base.h
@@ -0,0 +1,43 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_ref.h"
+
+namespace embree
+{
+  
+  /*! Common base of the BVHN inner nodes: owns the N child references */
+  template<typename NodeRef, int N>
+    struct BaseNode_t
+  {
+    /*! Resets every child slot to NodeRef::emptyNode. */
+    __forceinline void clear()
+    {
+      for (NodeRef* slot=children; slot!=children+N; ++slot)
+        *slot = NodeRef::emptyNode;
+    }
+    
+    /*! Accessors for the i-th child reference (i<N is asserted) */
+    __forceinline       NodeRef& child(size_t i)       { assert(i<N); return children[i]; }
+    __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+    
+    /*! Checks that no non-empty child follows an empty one. */
+    __forceinline bool verify() const
+    {
+      /* once the first empty slot was seen, every later slot has to be empty too */
+      bool emptySeen = false;
+      for (size_t k=0; k<N; k++) {
+        const bool isEmpty = child(k) == NodeRef::emptyNode;
+        if (emptySeen && !isEmpty)
+          return false;
+        emptySeen = emptySeen || isEmpty;
+      }
+      
+      return true;
+    }
+    
+    NodeRef children[N];    //!< The N child references (each can be a node or leaf)
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h
new file mode 100644
index 0000000000..fa7cc08211
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb.h
@@ -0,0 +1,98 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  /*! Node with unaligned bounds: each child box is stored as an affine space in which its bounds are [0,1] */
+  template<typename NodeRef, int N>
+    struct OBBNode_t : public BaseNode_t<NodeRef, N>
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    
+    struct Create // functor: allocates a node from alloc, clears it and returns its encoded reference
+    {
+      __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const
+      {
+        OBBNode_t* node = (OBBNode_t*) alloc.malloc0(sizeof(OBBNode_t),NodeRef::byteNodeAlignment); node->clear();
+        return NodeRef::encodeNode(node);
+      }
+    };
+    
+    struct Set // functor: stores reference and oriented bounds of child i in the node referenced by 'node'
+    {
+      __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const OBBox3fa& bounds) const {
+        node.ungetAABBNode()->setRef(i,child);
+        node.ungetAABBNode()->setBounds(i,bounds);
+      }
+    };
+    
+    /*! Clears the node: all spaces are filled with nan and all children are reset. */
+    __forceinline void clear()
+    {
+      naabb.l.vx = Vec3fa(nan);
+      naabb.l.vy = Vec3fa(nan);
+      naabb.l.vz = Vec3fa(nan);
+      naabb.p    = Vec3fa(nan);
+      BaseNode_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets bounding box of child i: b.space is shifted and scaled so that b.bounds becomes the unit box. */
+    __forceinline void setBounds(size_t i, const OBBox3fa& b)
+    {
+      assert(i < N);
+      
+      AffineSpace3fa space = b.space;
+      space.p -= b.bounds.lower; // move the lower corner of the bounds to the origin
+      space = AffineSpace3fa::scale(1.0f/max(Vec3fa(1E-19f),b.bounds.upper-b.bounds.lower))*space; // extent is clamped to 1E-19f before the division
+      
+      naabb.l.vx.x[i] = space.l.vx.x; // the stores below write the space into lane i of naabb
+      naabb.l.vx.y[i] = space.l.vx.y;
+      naabb.l.vx.z[i] = space.l.vx.z;
+      
+      naabb.l.vy.x[i] = space.l.vy.x;
+      naabb.l.vy.y[i] = space.l.vy.y;
+      naabb.l.vy.z[i] = space.l.vy.z;
+      
+      naabb.l.vz.x[i] = space.l.vz.x;
+      naabb.l.vz.y[i] = space.l.vz.y;
+      naabb.l.vz.z[i] = space.l.vz.z;
+      
+      naabb.p.x[i] = space.p.x;
+      naabb.p.y[i] = space.p.y;
+      naabb.p.z[i] = space.p.z;
+    }
+    
+    /*! Sets ID of child i. */
+    __forceinline void setRef(size_t i, const NodeRef& ref) {
+      assert(i < N);
+      children[i] = ref;
+    }
+    
+    /*! Returns the extent of the bounds of the ith child, computed as rsqrt(vx*vx + vy*vy + vz*vz) of its space */
+    __forceinline Vec3fa extent(size_t i) const {
+      assert(i<N);
+      const Vec3fa vx(naabb.l.vx.x[i],naabb.l.vx.y[i],naabb.l.vx.z[i]);
+      const Vec3fa vy(naabb.l.vy.x[i],naabb.l.vy.y[i],naabb.l.vy.z[i]);
+      const Vec3fa vz(naabb.l.vz.x[i],naabb.l.vz.y[i],naabb.l.vz.z[i]);
+      return rsqrt(vx*vx + vy*vy + vz*vz);
+    }
+    
+    /*! Returns reference to specified child (i<N is asserted) */
+    __forceinline       NodeRef& child(size_t i)       { assert(i<N); return children[i]; }
+    __forceinline const NodeRef& child(size_t i) const { assert(i<N); return children[i]; }
+    
+    /*! output operator: prints the naabb member */
+    friend embree_ostream operator<<(embree_ostream o, const OBBNode_t& n)
+    {
+      o << "UnAABBNode { " << n.naabb << " } " << embree_endl;
+      return o;
+    }
+    
+  public:
+    AffineSpace3vf<N> naabb;   //!< non-axis aligned bounding boxes (bounds are [0,1] in specified space)
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h
new file mode 100644
index 0000000000..834cf5ec28
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_obb_mb.h
@@ -0,0 +1,90 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  template<typename NodeRef, int N>
+    struct OBBNodeMB_t : public BaseNode_t<NodeRef, N> // motion blur node with unaligned bounds: one space (space0) plus second bounds (b1) per child
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    
+    struct Create // functor: allocates a node from alloc, clears it and returns its encoded reference
+    {
+      __forceinline NodeRef operator() (const FastAllocator::CachedAllocator& alloc) const
+      {
+        OBBNodeMB_t* node = (OBBNodeMB_t*) alloc.malloc0(sizeof(OBBNodeMB_t),NodeRef::byteNodeAlignment); node->clear();
+        return NodeRef::encodeNode(node);
+      }
+    };
+    
+    struct Set // functor: stores reference, space and lbounds.global(dt) of child i in the node referenced by 'node'
+    {
+      __forceinline void operator() (NodeRef node, size_t i, NodeRef child, const LinearSpace3fa& space, const LBBox3fa& lbounds, const BBox1f dt) const {
+        node.ungetAABBNodeMB()->setRef(i,child);
+        node.ungetAABBNodeMB()->setBounds(i,space,lbounds.global(dt));
+      }
+    };
+    
+    /*! Clears the node: space0 is set to 'one', b1 is filled with nan and all children are reset. */
+    __forceinline void clear()
+    {
+      space0 = one;
+      //b0.lower = b0.upper = Vec3fa(nan);
+      b1.lower = b1.upper = Vec3fa(nan);
+      BaseNode_t<NodeRef,N>::clear();
+    }
+    
+    /*! Sets space and bounding boxes from linear bounds (forwards bounds0 and bounds1). */
+    __forceinline void setBounds(size_t i, const AffineSpace3fa& space, const LBBox3fa& lbounds) {
+      setBounds(i,space,lbounds.bounds0,lbounds.bounds1);
+    }
+    
+    /*! Sets space and bounding boxes: s0 is normalized so that box a becomes the unit box, c is stored relative to it. */
+    __forceinline void setBounds(size_t i, const AffineSpace3fa& s0, const BBox3fa& a, const BBox3fa& c)
+    {
+      assert(i < N);
+      
+      AffineSpace3fa space = s0;
+      space.p -= a.lower; // move the lower corner of a to the origin
+      Vec3fa scale = 1.0f/max(Vec3fa(1E-19f),a.upper-a.lower); // extent is clamped to 1E-19f before the division
+      space = AffineSpace3fa::scale(scale)*space;
+      //BBox3fa a1((a.lower-a.lower)*scale,(a.upper-a.lower)*scale); // unit bounds, only read by the disabled b0 stores below
+      BBox3fa c1((c.lower-a.lower)*scale,(c.upper-a.lower)*scale);
+      
+      space0.l.vx.x[i] = space.l.vx.x; space0.l.vx.y[i] = space.l.vx.y; space0.l.vx.z[i] = space.l.vx.z;
+      space0.l.vy.x[i] = space.l.vy.x; space0.l.vy.y[i] = space.l.vy.y; space0.l.vy.z[i] = space.l.vy.z;
+      space0.l.vz.x[i] = space.l.vz.x; space0.l.vz.y[i] = space.l.vz.y; space0.l.vz.z[i] = space.l.vz.z;
+      space0.p   .x[i] = space.p   .x; space0.p   .y[i] = space.p   .y; space0.p   .z[i] = space.p   .z;
+      
+      /*b0.lower.x[i] = a1.lower.x; b0.lower.y[i] = a1.lower.y; b0.lower.z[i] = a1.lower.z;
+        b0.upper.x[i] = a1.upper.x; b0.upper.y[i] = a1.upper.y; b0.upper.z[i] = a1.upper.z;*/
+      
+      b1.lower.x[i] = c1.lower.x; b1.lower.y[i] = c1.lower.y; b1.lower.z[i] = c1.lower.z;
+      b1.upper.x[i] = c1.upper.x; b1.upper.y[i] = c1.upper.y; b1.upper.z[i] = c1.upper.z;
+    }
+    
+    /*! Sets ID of child i. */
+    __forceinline void setRef(size_t i, const NodeRef& ref) {
+      assert(i < N);
+      children[i] = ref;
+    }
+    
+    /*! Returns the extent of the bounds of the ith child, computed as rsqrt(vx*vx + vy*vy + vz*vz) of space0 */
+    __forceinline Vec3fa extent0(size_t i) const {
+      assert(i < N);
+      const Vec3fa vx(space0.l.vx.x[i],space0.l.vx.y[i],space0.l.vx.z[i]);
+      const Vec3fa vy(space0.l.vy.x[i],space0.l.vy.y[i],space0.l.vy.z[i]);
+      const Vec3fa vz(space0.l.vz.x[i],space0.l.vz.y[i],space0.l.vz.z[i]);
+      return rsqrt(vx*vx + vy*vy + vz*vz);
+    }
+    
+  public:
+    AffineSpace3vf<N> space0; //!< per child space in which the first bounds (a in setBounds) are the unit box
+    //BBox3vf<N> b0; // these are the unit bounds
+    BBox3vf<N> b1;            //!< per child second bounds (c in setBounds), expressed in space0
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h
new file mode 100644
index 0000000000..5212821f3f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_qaabb.h
@@ -0,0 +1,265 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh_node_base.h"
+
+namespace embree
+{
+  /*! BVHN Quantized Node data: child bounds stored as 8 bit values relative to a per-node start and scale */
+  template<int N>
+    struct __aligned(8) QuantizedBaseNode_t
+  {
+    typedef unsigned char T;
+    static const T MIN_QUAN = 0;
+    static const T MAX_QUAN = 255;
+    
+    /*! Clears the node: every child gets lower = MAX_QUAN and upper = MIN_QUAN, i.e. lower > upper. */
+    __forceinline void clear() {
+      for (size_t i=0; i<N; i++) lower_x[i] = lower_y[i] = lower_z[i] = MAX_QUAN;
+      for (size_t i=0; i<N; i++) upper_x[i] = upper_y[i] = upper_z[i] = MIN_QUAN;
+    }
+    
+    /*! Returns bounds of specified child, decoded per dimension with madd(scale,quantized,start). */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      assert(i < N);
+      const Vec3fa lower(madd(scale.x,(float)lower_x[i],start.x),
+                         madd(scale.y,(float)lower_y[i],start.y),
+                         madd(scale.z,(float)lower_z[i],start.z));
+      const Vec3fa upper(madd(scale.x,(float)upper_x[i],start.x),
+                         madd(scale.y,(float)upper_y[i],start.y),
+                         madd(scale.z,(float)upper_z[i],start.z));
+      return BBox3fa(lower,upper);
+    }
+    
+    /*! Returns extent of bounds of specified child. */
+    __forceinline Vec3fa extent(size_t i) const {
+      return bounds(i).size();
+    }
+    
+    static __forceinline void init_dim(const vfloat<N> &lower,   // in:  lower bounds of all children in one dimension
+                                       const vfloat<N> &upper,   // in:  upper bounds of all children in one dimension
+                                       T lower_quant[N],         // out: quantized lower bounds
+                                       T upper_quant[N],         // out: quantized upper bounds
+                                       float &start,             // out: decode offset (minimum of 'lower')
+                                       float &scale)             // out: decode scale per quantization step
+    {
+      /* quantize bounds */
+      const vbool<N> m_valid = lower != vfloat<N>(pos_inf); // children whose lower bound is pos_inf are treated as invalid
+      const float minF = reduce_min(lower);
+      const float maxF = reduce_max(upper);
+      float diff = (1.0f+2.0f*float(ulp))*(maxF - minF); // range is enlarged slightly; the assert below requires that maxF is still reachable
+      float decode_scale = diff / float(MAX_QUAN);
+      if (decode_scale == 0.0f) decode_scale = 2.0f*FLT_MIN; // result may have been flushed to zero
+      assert(madd(decode_scale,float(MAX_QUAN),minF) >= maxF);
+      const float encode_scale = diff > 0 ? (float(MAX_QUAN) / diff) : 0.0f;
+      vint<N> ilower = max(vint<N>(floor((lower - vfloat<N>(minF))*vfloat<N>(encode_scale))),MIN_QUAN);
+      vint<N> iupper = min(vint<N>(ceil ((upper - vfloat<N>(minF))*vfloat<N>(encode_scale))),MAX_QUAN);
+      
+      /* lower/upper correction: step one quantum outwards where the decoded value does not enclose the input */
+      vbool<N> m_lower_correction = (madd(vfloat<N>(ilower),decode_scale,minF)) > lower;
+      vbool<N> m_upper_correction = (madd(vfloat<N>(iupper),decode_scale,minF)) < upper;
+      ilower = max(select(m_lower_correction,ilower-1,ilower),MIN_QUAN);
+      iupper = min(select(m_upper_correction,iupper+1,iupper),MAX_QUAN);
+      
+      /* disable invalid lanes (same lower > upper encoding that clear() writes) */
+      ilower = select(m_valid,ilower,MAX_QUAN);
+      iupper = select(m_valid,iupper,MIN_QUAN);
+      
+      /* store as uchar to memory */
+      vint<N>::store(lower_quant,ilower);
+      vint<N>::store(upper_quant,iupper);
+      start = minF;
+      scale = decode_scale;
+      
+#if defined(DEBUG)
+      vfloat<N> extract_lower( vint<N>::loadu(lower_quant) ); // debug only: decode again and check that valid lanes enclose the input
+      vfloat<N> extract_upper( vint<N>::loadu(upper_quant) );
+      vfloat<N> final_extract_lower = madd(extract_lower,decode_scale,minF);
+      vfloat<N> final_extract_upper = madd(extract_upper,decode_scale,minF);
+      assert( (movemask(final_extract_lower <= lower ) & movemask(m_valid)) == movemask(m_valid));
+      assert( (movemask(final_extract_upper >= upper ) & movemask(m_valid)) == movemask(m_valid));
+#endif
+    }
+    
+    __forceinline void init_dim(AABBNode_t<NodeRefPtr<N>,N>& node) // quantizes the x, y and z bounds of 'node' into this node
+    {
+      init_dim(node.lower_x,node.upper_x,lower_x,upper_x,start.x,scale.x);
+      init_dim(node.lower_y,node.upper_y,lower_y,upper_y,start.y,scale.y);
+      init_dim(node.lower_z,node.upper_z,lower_z,upper_z,start.z,scale.z);
+    }
+    
+    __forceinline vbool<N> validMask() const { return vint<N>::loadu(lower_x) <= vint<N>::loadu(upper_x); } // lanes with lower_x <= upper_x; cleared or disabled children fail this test
+    
+#if defined(__AVX512F__) // KNL
+    __forceinline vbool16 validMask16() const { return le(0xff,vint<16>::loadu(lower_x),vint<16>::loadu(upper_x)); }
+#endif
+    __forceinline vfloat<N> dequantizeLowerX() const { return madd(vfloat<N>(vint<N>::loadu(lower_x)),scale.x,vfloat<N>(start.x)); }
+    
+    __forceinline vfloat<N> dequantizeUpperX() const { return madd(vfloat<N>(vint<N>::loadu(upper_x)),scale.x,vfloat<N>(start.x)); }
+    
+    __forceinline vfloat<N> dequantizeLowerY() const { return madd(vfloat<N>(vint<N>::loadu(lower_y)),scale.y,vfloat<N>(start.y)); }
+    
+    __forceinline vfloat<N> dequantizeUpperY() const { return madd(vfloat<N>(vint<N>::loadu(upper_y)),scale.y,vfloat<N>(start.y)); }
+    
+    __forceinline vfloat<N> dequantizeLowerZ() const { return madd(vfloat<N>(vint<N>::loadu(lower_z)),scale.z,vfloat<N>(start.z)); }
+    
+    __forceinline vfloat<N> dequantizeUpperZ() const { return madd(vfloat<N>(vint<N>::loadu(upper_z)),scale.z,vfloat<N>(start.z)); }
+    
+    template <int M>
+      __forceinline vfloat<M> dequantize(const size_t offset) const { return vfloat<M>(vint<M>::loadu(all_planes+offset)); } // raw quantized values as floats; start/scale are not applied here
+    
+#if defined(__AVX512F__)
+    __forceinline vfloat16 dequantizeLowerUpperX(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_x),p)),scale.x,vfloat16(start.x)); }
+    __forceinline vfloat16 dequantizeLowerUpperY(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_y),p)),scale.y,vfloat16(start.y)); }
+    __forceinline vfloat16 dequantizeLowerUpperZ(const vint16 &p) const { return madd(vfloat16(permute(vint<16>::loadu(lower_z),p)),scale.z,vfloat16(start.z)); }      
+#endif
+    
+    union { // the six per-dimension arrays and all_planes share the same storage
+      struct {
+        T lower_x[N]; //!< 8bit discretized X dimension of lower bounds of all N children
+        T upper_x[N]; //!< 8bit discretized X dimension of upper bounds of all N children
+        T lower_y[N]; //!< 8bit discretized Y dimension of lower bounds of all N children
+        T upper_y[N]; //!< 8bit discretized Y dimension of upper bounds of all N children
+        T lower_z[N]; //!< 8bit discretized Z dimension of lower bounds of all N children
+        T upper_z[N]; //!< 8bit discretized Z dimension of upper bounds of all N children
+      };
+      T all_planes[6*N];
+    };
+    
+    Vec3f start; //!< per dimension decode offset written by init_dim
+    Vec3f scale; //!< per dimension decode scale written by init_dim
+    
+    friend embree_ostream operator<<(embree_ostream o, const QuantizedBaseNode_t& n) // prints start, scale and the six quantized arrays
+    {
+      o << "QuantizedBaseNode { " << embree_endl;
+      o << "  start   " << n.start << embree_endl;
+      o << "  scale   " << n.scale << embree_endl;
+      o << "  lower_x " << vuint<N>::loadu(n.lower_x) << embree_endl;
+      o << "  upper_x " << vuint<N>::loadu(n.upper_x) << embree_endl;
+      o << "  lower_y " << vuint<N>::loadu(n.lower_y) << embree_endl;
+      o << "  upper_y " << vuint<N>::loadu(n.upper_y) << embree_endl;
+      o << "  lower_z " << vuint<N>::loadu(n.lower_z) << embree_endl;
+      o << "  upper_z " << vuint<N>::loadu(n.upper_z) << embree_endl;
+      o << "}" << embree_endl;
+      return o;
+    }
+    
+  };
+
+  template<typename NodeRef, int N>
+    struct __aligned(8) QuantizedNode_t : public BaseNode_t<NodeRef, N>, QuantizedBaseNode_t<N> // child references plus quantized child bounds
+  {
+    using BaseNode_t<NodeRef,N>::children;
+    using QuantizedBaseNode_t<N>::lower_x;
+    using QuantizedBaseNode_t<N>::upper_x;
+    using QuantizedBaseNode_t<N>::lower_y;
+    using QuantizedBaseNode_t<N>::upper_y;
+    using QuantizedBaseNode_t<N>::lower_z;
+    using QuantizedBaseNode_t<N>::upper_z;
+    using QuantizedBaseNode_t<N>::start;
+    using QuantizedBaseNode_t<N>::scale;
+    using QuantizedBaseNode_t<N>::init_dim;
+    
+    __forceinline void setRef(size_t i, const NodeRef& ref) { // stores the reference of child i
+      assert(i < N);
+      children[i] = ref;
+    }
+    
+    struct Create2 // functor: builds a quantized node from the bounds of n build records
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (BuildRecord* children, const size_t n, const FastAllocator::CachedAllocator& alloc) const
+      {
+        __aligned(64) AABBNode_t<NodeRef,N> node; // temporary full precision node, only used as input for the quantization
+        node.clear();
+        for (size_t i=0; i<n; i++) {
+          node.setBounds(i,children[i].bounds());
+        }
+        QuantizedNode_t *qnode = (QuantizedNode_t*) alloc.malloc0(sizeof(QuantizedNode_t), NodeRef::byteAlignment);
+        qnode->init(node);
+        
+        return (size_t)qnode | NodeRef::tyQuantizedNode; // tag the pointer with the quantized node type
+      }
+    };
+    
+    struct Set2 // functor: stores num child references in the quantized node referenced by ref; precord and crecords are not used
+    {
+      template<typename BuildRecord>
+      __forceinline NodeRef operator() (const BuildRecord& precord, const BuildRecord* crecords, NodeRef ref, NodeRef* children, const size_t num) const
+      {
+        QuantizedNode_t* node = ref.quantizedNode();
+        for (size_t i=0; i<num; i++) node->setRef(i,children[i]);
+        return ref;
+      }
+    };
+    
+    __forceinline void init(AABBNode_t<NodeRef,N>& node) // resets all children to emptyNode and quantizes the bounds of 'node'
+    {
+      for (size_t i=0;i<N;i++) children[i] = NodeRef::emptyNode;
+      init_dim(node);
+    }
+    
+  }; 
+  
+  /*! BVHN Quantized Motion Blur Node data: two quantized nodes (node0, node1) that are interpolated by a time value */
+  template<int N>
+    struct __aligned(8) QuantizedBaseNodeMB_t
+  {
+    QuantizedBaseNode_t<N> node0; // first operand of the lerp calls below
+    QuantizedBaseNode_t<N> node1; // second operand of the lerp calls below
+    
+    /*! Clears both quantized nodes. */
+    __forceinline void clear() {
+      node0.clear();
+      node1.clear();
+    }
+    
+    /*! Returns bounds of specified child: the bounds from node0 extended by the bounds from node1. */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      assert(i < N);
+      BBox3fa bounds0 = node0.bounds(i);
+      BBox3fa bounds1 = node1.bounds(i);
+      bounds0.extend(bounds1);
+      return bounds0;
+    }
+    
+    /*! Returns extent of bounds of specified child. */
+    __forceinline Vec3fa extent(size_t i) const {
+      return bounds(i).size();
+    }
+    
+    __forceinline vbool<N> validMask() const { return node0.validMask(); } // validity is taken from node0 only
+    
+    template<typename T>
+      __forceinline vfloat<N> dequantizeLowerX(const T t) const { return lerp(node0.dequantizeLowerX(),node1.dequantizeLowerX(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeUpperX(const T t) const { return lerp(node0.dequantizeUpperX(),node1.dequantizeUpperX(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeLowerY(const T t) const { return lerp(node0.dequantizeLowerY(),node1.dequantizeLowerY(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeUpperY(const T t) const { return lerp(node0.dequantizeUpperY(),node1.dequantizeUpperY(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeLowerZ(const T t) const { return lerp(node0.dequantizeLowerZ(),node1.dequantizeLowerZ(),t); }
+    template<typename T>
+      __forceinline vfloat<N> dequantizeUpperZ(const T t) const { return lerp(node0.dequantizeUpperZ(),node1.dequantizeUpperZ(),t); }
+    
+    
+    template<int M> // the overloads below broadcast the value of child i to M lanes and interpolate it with the M values in t
+      __forceinline vfloat<M> dequantizeLowerX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerX()[i]),vfloat<M>(node1.dequantizeLowerX()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeUpperX(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperX()[i]),vfloat<M>(node1.dequantizeUpperX()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeLowerY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerY()[i]),vfloat<M>(node1.dequantizeLowerY()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeUpperY(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperY()[i]),vfloat<M>(node1.dequantizeUpperY()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeLowerZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeLowerZ()[i]),vfloat<M>(node1.dequantizeLowerZ()[i]),t); }
+    template<int M>
+      __forceinline vfloat<M> dequantizeUpperZ(const size_t i, const vfloat<M> &t) const { return lerp(vfloat<M>(node0.dequantizeUpperZ()[i]),vfloat<M>(node1.dequantizeUpperZ()[i]),t); }
+    
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h
new file mode 100644
index 0000000000..0f6d4dac7e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_node_ref.h
@@ -0,0 +1,242 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/alloc.h"
+#include "../common/accel.h"
+#include "../common/device.h"
+#include "../common/scene.h"
+#include "../geometry/primitive.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+  /* BVH node reference with bounds: pairs a node reference with the bounding box stored for it */
+  template<typename NodeRef>
+  struct BVHNodeRecord
+  {
+    __forceinline BVHNodeRecord() {} // assigns nothing to ref and bounds
+    __forceinline BVHNodeRecord(NodeRef ref, const BBox3fa& bounds) : ref(ref), bounds((BBox3fx)bounds) {} // casts the BBox3fa to BBox3fx
+    __forceinline BVHNodeRecord(NodeRef ref, const BBox3fx& bounds) : ref(ref), bounds(bounds) {}
+
+    NodeRef ref;    // the node reference
+    BBox3fx bounds; // bounds stored together with ref
+  };
+
+  template<typename NodeRef>
+  struct BVHNodeRecordMB // BVH node reference with linear bounds (LBBox3fa)
+  {
+    __forceinline BVHNodeRecordMB() {} // assigns nothing to ref and lbounds
+    __forceinline BVHNodeRecordMB(NodeRef ref, const LBBox3fa& lbounds) : ref(ref), lbounds(lbounds) {}
+
+    NodeRef ref;      // the node reference
+    LBBox3fa lbounds; // linear bounds stored together with ref
+  };
+
+  template<typename NodeRef>
+  struct BVHNodeRecordMB4D // BVH node reference with linear bounds and a time range
+  {
+    __forceinline BVHNodeRecordMB4D() {} // assigns nothing to ref, lbounds and dt
+    __forceinline BVHNodeRecordMB4D(NodeRef ref, const LBBox3fa& lbounds, const BBox1f& dt) : ref(ref), lbounds(lbounds), dt(dt) {}
+
+    NodeRef ref;      // the node reference
+    LBBox3fa lbounds; // linear bounds stored together with ref
+    BBox1f dt;        // time range stored together with ref
+  };
+
+  template<typename NodeRef, int N> struct BaseNode_t;
+  template<typename NodeRef, int N> struct AABBNode_t;
+  template<typename NodeRef, int N> struct AABBNodeMB_t;
+  template<typename NodeRef, int N> struct AABBNodeMB4D_t;
+  template<typename NodeRef, int N> struct OBBNode_t;
+  template<typename NodeRef, int N> struct OBBNodeMB_t;
+  template<typename NodeRef, int N> struct QuantizedNode_t;
+  template<typename NodeRef, int N> struct QuantizedNodeMB_t;
+  
+  /*! Pointer that points to a node or a list of primitives; the low bits of the pointer store the type or item count */
+  template<int N>
+    struct NodeRefPtr
+  {
+    //template<int NN> friend class BVHN;
+
+    /*! Number of bytes the nodes and primitives are minimally aligned to.*/
+    static const size_t byteAlignment = 16;
+    static const size_t byteNodeAlignment = 4*N;
+
+    /*! highest address bit is used as barrier for some algorithms (built from size_t so that no signed value is shifted into its sign bit) */
+    static const size_t barrier_mask = (size_t(1) << (8*sizeof(size_t)-1));
+
+    /*! Masks the bits that store the number of items per leaf. */
+    static const size_t align_mask = byteAlignment-1;
+    static const size_t items_mask = byteAlignment-1;
+
+    /*! different supported node types (stored in the bits covered by align_mask) */
+    static const size_t tyAABBNode = 0;
+    static const size_t tyAABBNodeMB = 1;
+    static const size_t tyAABBNodeMB4D = 6;
+    static const size_t tyOBBNode = 2;
+    static const size_t tyOBBNodeMB = 3;
+    static const size_t tyQuantizedNode = 5;
+    static const size_t tyLeaf = 8;
+
+    /*! Empty node: a leaf reference with null pointer and zero items */
+    static const size_t emptyNode = tyLeaf;
+
+    /*! Invalid node, used as marker in traversal */
+    static const size_t invalidNode = (((size_t)-1) & (~items_mask)) | (tyLeaf+0);
+    static const size_t popRay      = (((size_t)-1) & (~items_mask)) | (tyLeaf+1);
+
+    /*! Maximum number of primitive blocks in a leaf. */
+    static const size_t maxLeafBlocks = items_mask-tyLeaf;
+        
+    /*! Default constructor (leaves ptr unassigned) */
+    __forceinline NodeRefPtr () {}
+    
+    /*! Construction from integer */
+    __forceinline NodeRefPtr (size_t ptr) : ptr(ptr) {}
+    
+    /*! Cast to size_t */
+    __forceinline operator size_t() const { return ptr; }
+    
+    /*! Sets the barrier bit (only implemented for __X86_64__ and __aarch64__, asserts otherwise). */
+    __forceinline void setBarrier() {
+#if defined(__X86_64__) || defined(__aarch64__)
+      assert(!isBarrier());
+      ptr |= barrier_mask;
+#else
+      assert(false);
+#endif
+    }
+    
+    /*! Clears the barrier bit (only implemented for __X86_64__ and __aarch64__, asserts otherwise). */
+    __forceinline void clearBarrier() {
+#if defined(__X86_64__) || defined(__aarch64__)
+      ptr &= ~barrier_mask;
+#else
+      assert(false);
+#endif
+    }
+    
+    /*! Checks if this is an barrier. A barrier tells the top level tree rotations how deep to enter the tree. */
+    __forceinline bool isBarrier() const { return (ptr & barrier_mask) != 0; }
+    
+    /*! checks if this is a leaf (returns the tyLeaf bit, i.e. non-zero for leaves) */
+    __forceinline size_t isLeaf() const { return ptr & tyLeaf; }
+    
+    /*! returns node type (all bits covered by align_mask) */
+    __forceinline int type() const { return ptr & (size_t)align_mask; }
+    
+    /*! checks if this is a node */
+    __forceinline int isAABBNode() const { return (ptr & (size_t)align_mask) == tyAABBNode; }
+    
+    /*! checks if this is a motion blur node */
+    __forceinline int isAABBNodeMB() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB; }
+    
+    /*! checks if this is a 4D motion blur node */
+    __forceinline int isAABBNodeMB4D() const { return (ptr & (size_t)align_mask) == tyAABBNodeMB4D; }
+    
+    /*! checks if this is a node with unaligned bounding boxes */
+    __forceinline int isOBBNode() const { return (ptr & (size_t)align_mask) == tyOBBNode; }
+    
+    /*! checks if this is a motion blur node with unaligned bounding boxes */
+    __forceinline int isOBBNodeMB() const { return (ptr & (size_t)align_mask) == tyOBBNodeMB; }
+    
+    /*! checks if this is a quantized node */
+    __forceinline int isQuantizedNode() const { return (ptr & (size_t)align_mask) == tyQuantizedNode; }
+
+    /*! Encodes a node (the pointer is stored unchanged, tyAABBNode is 0) */
+    static __forceinline NodeRefPtr encodeNode(AABBNode_t<NodeRefPtr,N>* node) {
+      assert(!((size_t)node & align_mask));
+      return NodeRefPtr((size_t) node);
+    }
+
+    static __forceinline NodeRefPtr encodeNode(AABBNodeMB_t<NodeRefPtr,N>* node) { // motion blur node: tagged with tyAABBNodeMB
+      assert(!((size_t)node & align_mask));
+      return NodeRefPtr((size_t) node | tyAABBNodeMB);
+    }
+
+    static __forceinline NodeRefPtr encodeNode(AABBNodeMB4D_t<NodeRefPtr,N>* node) { // 4D motion blur node: tagged with tyAABBNodeMB4D
+      assert(!((size_t)node & align_mask));
+      return NodeRefPtr((size_t) node | tyAABBNodeMB4D);
+    }
+
+    /*! Encodes an unaligned node */
+    static __forceinline NodeRefPtr encodeNode(OBBNode_t<NodeRefPtr,N>* node) {
+      return NodeRefPtr((size_t) node | tyOBBNode);
+    }
+
+    /*! Encodes an unaligned motion blur node */
+    static __forceinline NodeRefPtr encodeNode(OBBNodeMB_t<NodeRefPtr,N>* node) {
+      return NodeRefPtr((size_t) node | tyOBBNodeMB);
+    }
+
+    /*! Encodes a leaf: the item count (clamped to maxLeafBlocks) is stored on top of tyLeaf in the low bits */
+    static __forceinline NodeRefPtr encodeLeaf(void* tri, size_t num) {
+      assert(!((size_t)tri & align_mask));
+      assert(num <= maxLeafBlocks);
+      return NodeRefPtr((size_t)tri | (tyLeaf+min(num,(size_t)maxLeafBlocks)));
+    }
+
+    /*! Encodes a typed leaf: ty is stored on top of tyLeaf in the low bits (ty is not range checked here) */
+    static __forceinline NodeRefPtr encodeTypedLeaf(void* ptr, size_t ty) {
+      assert(!((size_t)ptr & align_mask));
+      return NodeRefPtr((size_t)ptr | (tyLeaf+ty));
+    }
+    
+    /*! returns base node pointer (type bits masked away; must not be called on a leaf) */
+    __forceinline BaseNode_t<NodeRefPtr,N>* baseNode()
+    {
+      assert(!isLeaf());
+      return (BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask);
+    }
+    __forceinline const BaseNode_t<NodeRefPtr,N>* baseNode() const
+    {
+      assert(!isLeaf());
+      return (const BaseNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask);
+    }
+    
+    /*! returns node pointer (ptr is used unmasked: the asserted type tyAABBNode is 0) */
+    __forceinline       AABBNode_t<NodeRefPtr,N>* getAABBNode()       { assert(isAABBNode()); return (      AABBNode_t<NodeRefPtr,N>*)ptr; }
+    __forceinline const AABBNode_t<NodeRefPtr,N>* getAABBNode() const { assert(isAABBNode()); return (const AABBNode_t<NodeRefPtr,N>*)ptr; }
+    
+    /*! returns motion blur node pointer (also accepted for 4D motion blur nodes) */
+    __forceinline       AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB()       { assert(isAABBNodeMB() || isAABBNodeMB4D()); return (      AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    __forceinline const AABBNodeMB_t<NodeRefPtr,N>* getAABBNodeMB() const { assert(isAABBNodeMB() || isAABBNodeMB4D()); return (const AABBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    
+    /*! returns 4D motion blur node pointer */
+    __forceinline       AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D()       { assert(isAABBNodeMB4D()); return (      AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    __forceinline const AABBNodeMB4D_t<NodeRefPtr,N>* getAABBNodeMB4D() const { assert(isAABBNodeMB4D()); return (const AABBNodeMB4D_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    
+    /*! returns unaligned node pointer */
+    __forceinline       OBBNode_t<NodeRefPtr,N>* ungetAABBNode()       { assert(isOBBNode()); return (      OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    __forceinline const OBBNode_t<NodeRefPtr,N>* ungetAABBNode() const { assert(isOBBNode()); return (const OBBNode_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    
+    /*! returns unaligned motion blur node pointer */
+    __forceinline       OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB()       { assert(isOBBNodeMB()); return (      OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    __forceinline const OBBNodeMB_t<NodeRefPtr,N>* ungetAABBNodeMB() const { assert(isOBBNodeMB()); return (const OBBNodeMB_t<NodeRefPtr,N>*)(ptr & ~(size_t)align_mask); }
+    
+    /*! returns quantized node pointer */
+    __forceinline       QuantizedNode_t<NodeRefPtr,N>* quantizedNode()       { assert(isQuantizedNode()); return (      QuantizedNode_t<NodeRefPtr,N>*)(ptr  & ~(size_t)align_mask ); }
+    __forceinline const QuantizedNode_t<NodeRefPtr,N>* quantizedNode() const { assert(isQuantizedNode()); return (const QuantizedNode_t<NodeRefPtr,N>*)(ptr  & ~(size_t)align_mask ); }
+    
+    /*! returns leaf pointer and writes the number stored in the low bits (minus tyLeaf) to num */
+    __forceinline char* leaf(size_t& num) const {
+      assert(isLeaf());
+      num = (ptr & (size_t)items_mask)-tyLeaf;
+      return (char*)(ptr & ~(size_t)align_mask);
+    }
+    
+    /*! clear all bit flags covered by align_mask */
+    __forceinline void clearFlags() {
+      ptr &= ~(size_t)align_mask;
+    }
+    
+     /*! returns the wideness */
+    __forceinline size_t getN() const { return N; }
+    
+  public:
+    size_t ptr; //!< encoded pointer: address bits plus type/item bits (and optionally the barrier bit)
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp
new file mode 100644
index 0000000000..a273c21e8b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.cpp
@@ -0,0 +1,247 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_refit.h"
+#include "bvh_statistics.h"
+
+#include "../geometry/linei.h"
+#include "../geometry/triangle.h"
+#include "../geometry/trianglev.h"
+#include "../geometry/trianglei.h"
+#include "../geometry/quadv.h"
+#include "../geometry/object.h"
+#include "../geometry/instance.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    static const size_t SINGLE_THREAD_THRESHOLD = 4*1024;
+    
+    template<int N>
+    __forceinline bool compare(const typename BVHN<N>::NodeRef* a, const typename BVHN<N>::NodeRef* b)
+    {
+      /* orders two node references by the size_t-sized bit pattern read at each node's lower_x */
+      const size_t keyA = *(size_t*)&a->node()->lower_x;
+      return keyA < *(size_t*)&b->node()->lower_x;
+    }
+
+    /*! stores the BVH and the leaf-bounds callback; the sub-tree count starts at zero */
+    template<int N>
+    BVHNRefitter<N>::BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds)
+      : bvh(bvh), leafBounds(leafBounds), numSubTrees(0)
+    {}
+
+    template<int N>
+    void BVHNRefitter<N>::refit()
+    {
+      /* small trees are refitted by one recursive pass over the whole tree */
+      if (bvh->numPrimitives <= SINGLE_THREAD_THRESHOLD) {
+        bvh->bounds = LBBox3fa(recurse_bottom(bvh->root));
+        return;
+      }
+
+      /* step 1: record the sub-tree roots found at the extraction depth */
+      BBox3fa subTreeBounds[MAX_NUM_SUB_TREES];
+      numSubTrees = 0;
+      gather_subtree_refs(bvh->root,numSubTrees,0);
+
+      /* step 2: refit every recorded sub-tree, distributed through parallel_for */
+      if (numSubTrees)
+        parallel_for(size_t(0), numSubTrees, size_t(1), [&](const range<size_t>& chunk) {
+            for (size_t idx=chunk.begin(); idx<chunk.end(); idx++)
+              subTreeBounds[idx] = recurse_bottom(subTrees[idx]);
+          });
+      numSubTrees = 0; /* step 3: refit_toplevel hands the stored bounds out again in gather order */
+      bvh->bounds = LBBox3fa(refit_toplevel(bvh->root,numSubTrees,subTreeBounds,0));
+    }
+
+    template<int N>
+    void BVHNRefitter<N>::gather_subtree_refs(NodeRef& ref,
+                                              size_t &subtrees,
+                                              const size_t depth)
+    {
+      /* at the extraction depth the reference itself is recorded as a sub-tree root */
+      if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH)
+      {
+        assert(subtrees < MAX_NUM_SUB_TREES);
+        subTrees[subtrees++] = ref;
+        return;
+      }
+
+      /* only AABB nodes are descended into; any other reference is left untouched */
+      if (!ref.isAABBNode()) return;
+      AABBNode* const inner = ref.getAABBNode();
+      for (size_t slot=0; slot<N; slot++) {
+        NodeRef& kid = inner->child(slot);
+        if (unlikely(kid == BVH::emptyNode)) continue;
+        gather_subtree_refs(kid,subtrees,depth+1);
+      }
+    }
+
+    template<int N>
+    BBox3fa BVHNRefitter<N>::refit_toplevel(NodeRef& ref,
+                                            size_t &subtrees,
+                                            const BBox3fa *const subTreeBounds,
+                                            const size_t depth)
+    {
+      /* references at the extraction depth take the next precomputed sub-tree bounds */
+      if (depth >= MAX_SUB_TREE_EXTRACTION_DEPTH)
+      {
+        assert(subtrees < MAX_NUM_SUB_TREES);
+        assert(subTrees[subtrees] == ref);
+        return subTreeBounds[subtrees++];
+      }
+
+      /* above that depth, anything that is not an AABB node is bounded as a leaf */
+      if (!ref.isAABBNode())
+        return leafBounds.leafBounds(ref);
+
+      AABBNode* const inner = ref.getAABBNode();
+      BBox3fa childBounds[N];
+
+      for (size_t slot=0; slot<N; slot++)
+      {
+        NodeRef& kid = inner->child(slot);
+        if (unlikely(kid == BVH::emptyNode))
+          childBounds[slot] = BBox3fa(empty);
+        else
+          childBounds[slot] = refit_toplevel(kid,subtrees,subTreeBounds,depth+1);
+      }
+
+      /* AOS to SOA transform */
+      BBox3vf<N> soa = transpose<N>(childBounds);
+
+      /* set new bounds */
+      inner->lower_x = soa.lower.x;
+      inner->lower_y = soa.lower.y;
+      inner->lower_z = soa.lower.z;
+      inner->upper_x = soa.upper.x;
+      inner->upper_y = soa.upper.y;
+      inner->upper_z = soa.upper.z;
+
+      return merge<N>(childBounds);
+    }
+
+    // =========================================================
+    // =========================================================
+    // =========================================================
+
+    
+    template<int N>
+    BBox3fa BVHNRefitter<N>::recurse_bottom(NodeRef& ref)
+    {
+      /* leaves are bounded directly through the leaf-bounds callback */
+      if (unlikely(ref.isLeaf()))
+        return leafBounds.leafBounds(ref);
+
+      /* every other reference is handled as an AABB node and refitted recursively */
+      AABBNode* const inner = ref.getAABBNode();
+      /* enable exclusive prefetch for >= AVX platforms */
+#if defined(__AVX__)
+      BVH::prefetchW(ref);
+#endif
+      BBox3fa childBounds[N];
+
+      for (size_t slot=0; slot<N; slot++)
+      {
+        NodeRef& kid = inner->child(slot);
+        if (unlikely(kid == BVH::emptyNode))
+          childBounds[slot] = BBox3fa(empty);
+        else
+          childBounds[slot] = recurse_bottom(kid);
+      }
+
+      /* AOS to SOA transform */
+      BBox3vf<N> soa = transpose<N>(childBounds);
+
+      /* set new bounds */
+      inner->lower_x = soa.lower.x;
+      inner->lower_y = soa.lower.y;
+      inner->lower_z = soa.lower.z;
+      inner->upper_x = soa.upper.x;
+      inner->upper_y = soa.upper.y;
+      inner->upper_z = soa.upper.z;
+
+      return merge<N>(childBounds);
+    }
+
+    template<int N, typename Mesh, typename Primitive>
+    BVHNRefitT<N,Mesh,Primitive>::BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode) // 'mode' is not used by this constructor
+      : bvh(bvh), builder(builder), refitter(new BVHNRefitter<N>(bvh,*(typename BVHNRefitter<N>::LeafBoundsInterface*)this)), mesh(mesh), topologyVersion(0) {} // this object is handed to the refitter as its leaf-bounds callback
+
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNRefitT<N,Mesh,Primitive>::clear()
+    {
+      /* forward to the wrapped builder when one is present */
+      if (builder) builder->clear();
+    }
+    
+    template<int N, typename Mesh, typename Primitive>
+    void BVHNRefitT<N,Mesh,Primitive>::build()
+    {
+      /* rebuild when the mesh reports a topology change against topologyVersion, otherwise only refit */
+      const bool rebuild = mesh->topologyChanged(topologyVersion);
+      if (!rebuild) { refitter->refit(); return; }
+
+      topologyVersion = mesh->getTopologyVersion();
+      builder->build();
+    }
+
+    template class BVHNRefitter<4>;
+#if defined(__AVX__)
+    template class BVHNRefitter<8>;
+#endif
+    
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    Builder* BVH4Triangle4MeshBuilderSAH  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH4Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH4Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    /* each *RefitSAH factory wraps the result of the matching *BuilderSAH function (only declared here) in a new BVHNRefitT */
+    Builder* BVH4Triangle4MeshRefitSAH  (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4> ((BVH4*)accel,BVH4Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); }
+    Builder* BVH4Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4v>((BVH4*)accel,BVH4Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+    Builder* BVH4Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,TriangleMesh,Triangle4i>((BVH4*)accel,BVH4Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+#if  defined(__AVX__)
+    Builder* BVH8Triangle4MeshBuilderSAH  (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH8Triangle4vMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH8Triangle4iMeshBuilderSAH (void* bvh, TriangleMesh* mesh, unsigned int geomID, size_t mode);
+    /* 8-wide (BVH8) counterparts, compiled only when __AVX__ is defined */
+    Builder* BVH8Triangle4MeshRefitSAH  (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4> ((BVH8*)accel,BVH8Triangle4MeshBuilderSAH (accel,mesh,geomID,mode),mesh,mode); }
+    Builder* BVH8Triangle4vMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4v>((BVH8*)accel,BVH8Triangle4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+    Builder* BVH8Triangle4iMeshRefitSAH (void* accel, TriangleMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,TriangleMesh,Triangle4i>((BVH8*)accel,BVH8Triangle4iMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_QUAD) // refit factories for quad meshes: wrap the SAH builder in a new BVHNRefitT
+    Builder* BVH4Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH4Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,QuadMesh,Quad4v>((BVH4*)accel,BVH4Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+    /* 8-wide (BVH8) counterpart, compiled only when __AVX__ is defined */
+#if  defined(__AVX__)
+    Builder* BVH8Quad4vMeshBuilderSAH (void* bvh, QuadMesh* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH8Quad4vMeshRefitSAH (void* accel, QuadMesh* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,QuadMesh,Quad4v>((BVH8*)accel,BVH8Quad4vMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+#endif
+
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER) // refit factories for user geometry: wrap the SAH builder in a new BVHNRefitT
+    Builder* BVH4VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH4VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,UserGeometry,Object>((BVH4*)accel,BVH4VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+    /* 8-wide (BVH8) counterpart, compiled only when __AVX__ is defined */
+#if  defined(__AVX__)
+    Builder* BVH8VirtualMeshBuilderSAH (void* bvh, UserGeometry* mesh, unsigned int geomID, size_t mode);
+    Builder* BVH8VirtualMeshRefitSAH (void* accel, UserGeometry* mesh, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,UserGeometry,Object>((BVH8*)accel,BVH8VirtualMeshBuilderSAH(accel,mesh,geomID,mode),mesh,mode); }
+#endif
+#endif
+
+#if defined(EMBREE_GEOMETRY_INSTANCE) // refit factories for instances: wrap the SAH builder in a new BVHNRefitT; gtype is only forwarded to the builder
+    Builder* BVH4InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode);
+    Builder* BVH4InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<4,Instance,InstancePrimitive>((BVH4*)accel,BVH4InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); }
+    /* 8-wide (BVH8) counterpart, compiled only when __AVX__ is defined */
+#if  defined(__AVX__)
+    Builder* BVH8InstanceMeshBuilderSAH (void* bvh, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode);
+    Builder* BVH8InstanceMeshRefitSAH (void* accel, Instance* mesh, Geometry::GTypeMask gtype, unsigned int geomID, size_t mode) { return new BVHNRefitT<8,Instance,InstancePrimitive>((BVH8*)accel,BVH8InstanceMeshBuilderSAH(accel,mesh,gtype,geomID,mode),mesh,mode); }
+#endif
+#endif
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h
new file mode 100644
index 0000000000..4aa9bdd7cc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_refit.h
@@ -0,0 +1,95 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../bvh/bvh.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N>
+    class BVHNRefitter
+    {
+    public:
+      /*! Refits the bounds of an existing BVHN<N>; leaf bounds are obtained through LeafBoundsInterface. */
+      /*! Type shortcuts */
+      typedef BVHN<N> BVH;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::NodeRef NodeRef;
+
+      struct LeafBoundsInterface { //!< callback that computes the bounds of a leaf reference
+        virtual const BBox3fa leafBounds(NodeRef& ref) const = 0;
+      };
+
+    public:
+    
+      /*! Constructor: stores the BVH pointer and a reference to the leaf-bounds callback. */
+      BVHNRefitter (BVH* bvh, const LeafBoundsInterface& leafBounds);
+
+      /*! refits the BVH and stores the resulting bounds in bvh->bounds */
+      void refit();
+
+    private:
+      /* single-threaded subtree extraction: stores the references found at MAX_SUB_TREE_EXTRACTION_DEPTH in subTrees and counts them in 'subtrees' */
+      void gather_subtree_refs(NodeRef& ref, 
+                               size_t &subtrees,
+                               const size_t depth = 0);
+
+      /* single-threaded top-level refit: refits the nodes above the extracted subtrees, reading the subtree bounds from subTreeBounds */
+      BBox3fa refit_toplevel(NodeRef& ref,
+                             size_t &subtrees,
+							 const BBox3fa *const subTreeBounds,
+                             const size_t depth = 0);
+
+      /* single-threaded subtree refit: recursively refits everything below ref and returns the merged bounds */
+      BBox3fa recurse_bottom(NodeRef& ref);
+      
+    public:
+      BVH* bvh;                              //!< BVH to refit
+      const LeafBoundsInterface& leafBounds; //!< calculates bounds of leaves
+
+      static const size_t MAX_SUB_TREE_EXTRACTION_DEPTH = (N==4) ? 4   : (N==8) ? 3    : 3;
+      static const size_t MAX_NUM_SUB_TREES             = (N==4) ? 256 : (N==8) ? 512 : N*N*N; // N ^ MAX_SUB_TREE_EXTRACTION_DEPTH
+      size_t numSubTrees;                    //!< counter passed by reference to gather_subtree_refs and refit_toplevel
+      NodeRef subTrees[MAX_NUM_SUB_TREES];   //!< subtree roots stored by gather_subtree_refs
+    };
+
+    template<int N, typename Mesh, typename Primitive>
+    class BVHNRefitT : public Builder, public BVHNRefitter<N>::LeafBoundsInterface
+    {
+    public:
+      /*! Builder wrapper that owns a builder and a refitter and serves as the refitter's leaf-bounds callback. */
+      /*! Type shortcuts */
+      typedef BVHN<N> BVH;
+      typedef typename BVH::AABBNode AABBNode;
+      typedef typename BVH::NodeRef NodeRef;
+    public:
+      BVHNRefitT (BVH* bvh, Builder* builder, Mesh* mesh, size_t mode);
+      /*! Builder interface: build entry point */
+      virtual void build();
+      /*! Builder interface: clear entry point */
+      virtual void clear();
+
+      /*! merges the updated bounds of all primitive blocks referenced by the leaf 'ref' */
+      virtual const BBox3fa leafBounds (NodeRef& ref) const
+      {
+        size_t numBlocks; char* const raw = ref.leaf(numBlocks);
+        if (unlikely(ref == BVH::emptyNode)) return empty;
+        Primitive* const prims = (Primitive*)raw;
+        BBox3fa merged = empty;
+        for (Primitive* p=prims; p!=prims+numBlocks; ++p)
+          merged.extend(p->update(mesh));
+        return merged;
+      }
+
+    private:
+      BVH* bvh;                                  //!< BVH passed to the constructor
+      std::unique_ptr<Builder> builder;          //!< wrapped builder, owned by this object
+      std::unique_ptr<BVHNRefitter<N>> refitter; //!< refitter, owned by this object
+      Mesh* mesh;                                //!< mesh handed to Primitive::update in leafBounds
+      unsigned int topologyVersion;              //!< topology version value kept by this object
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp
new file mode 100644
index 0000000000..2bb431bf0e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.cpp
@@ -0,0 +1,127 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_rotate.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    /*! Half surface area of a box: sums lanes 0..2 of the extent multiplied by its shuffled copy; lane 3 is ignored. */
+    __forceinline float halfArea3f(const BBox<vfloat4>& box) {
+      const vfloat4 extent = box.size();
+      const vfloat4 products = extent*shuffle<1,2,0,3>(extent);
+      return products[0]+products[1]+products[2];
+    }
+    
+    size_t BVHNRotate<4>::rotate(NodeRef parentRef, size_t depth)
+    {
+      /*! Rotates the subtree below parentRef bottom-up and returns its (conservative) depth. Nothing to rotate if we reached a barrier or a leaf node. */
+      if (parentRef.isBarrier()) return 0;
+      if (parentRef.isLeaf()) return 0;
+      AABBNode* parent = parentRef.getAABBNode();
+      
+      /*! rotate all children first; cdepth[c] holds the depth returned for child c */
+      vint4 cdepth;
+      for (size_t c=0; c<4; c++)
+	cdepth[c] = (int)rotate(parent->child(c),depth+1);
+      
+      /* compute current areas of all children */
+      vfloat4 sizeX = parent->upper_x-parent->lower_x;
+      vfloat4 sizeY = parent->upper_y-parent->lower_y;
+      vfloat4 sizeZ = parent->upper_z-parent->lower_z;
+      vfloat4 childArea = madd(sizeX,(sizeY + sizeZ),sizeY*sizeZ); // presumably x*(y+z) + y*z per child, assuming madd(a,b,c) = a*b+c
+      
+      /*! get node bounds */
+      BBox<vfloat4> child1_0,child1_1,child1_2,child1_3;
+      parent->bounds(child1_0,child1_1,child1_2,child1_3);
+      
+      /*! Find best rotation. We pick a first child (child1) and a sub-child 
+	(child2child) of a different second child (child2), and swap child1 
+	and child2child. We perform the best such swap. */
+      float bestArea = 0; // a swap is only accepted when its area change is below this, i.e. negative
+      size_t bestChild1 = -1, bestChild2 = -1, bestChild2Child = -1; // size_t(-1) marks "no swap found"
+      for (size_t c2=0; c2<4; c2++)
+      {
+	/*! ignore barrier and leaf nodes as we cannot descend into them */
+	if (parent->child(c2).isBarrier()) continue;
+	if (parent->child(c2).isLeaf()) continue;
+	AABBNode* child2 = parent->child(c2).getAABBNode();
+	
+	/*! transpose child bounds */
+	BBox<vfloat4> child2c0,child2c1,child2c2,child2c3;
+	child2->bounds(child2c0,child2c1,child2c2,child2c3);
+	
+	/*! put child1_0 at each child2 position */
+	float cost00 = halfArea3f(merge(child1_0,child2c1,child2c2,child2c3));
+	float cost01 = halfArea3f(merge(child2c0,child1_0,child2c2,child2c3));
+	float cost02 = halfArea3f(merge(child2c0,child2c1,child1_0,child2c3));
+	float cost03 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_0));
+	vfloat4 cost0 = vfloat4(cost00,cost01,cost02,cost03);
+	vfloat4 min0 = vreduce_min(cost0);
+	int pos0 = (int)bsf(movemask(min0 == cost0));
+	
+	/*! put child1_1 at each child2 position */
+	float cost10 = halfArea3f(merge(child1_1,child2c1,child2c2,child2c3));
+	float cost11 = halfArea3f(merge(child2c0,child1_1,child2c2,child2c3));
+	float cost12 = halfArea3f(merge(child2c0,child2c1,child1_1,child2c3));
+	float cost13 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_1));
+	vfloat4 cost1 = vfloat4(cost10,cost11,cost12,cost13);
+	vfloat4 min1 = vreduce_min(cost1);
+	int pos1 = (int)bsf(movemask(min1 == cost1));
+	
+	/*! put child1_2 at each child2 position */
+	float cost20 = halfArea3f(merge(child1_2,child2c1,child2c2,child2c3));
+	float cost21 = halfArea3f(merge(child2c0,child1_2,child2c2,child2c3));
+	float cost22 = halfArea3f(merge(child2c0,child2c1,child1_2,child2c3));
+	float cost23 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_2));
+	vfloat4 cost2 = vfloat4(cost20,cost21,cost22,cost23);
+	vfloat4 min2 = vreduce_min(cost2);
+	int pos2 = (int)bsf(movemask(min2 == cost2));
+	
+	/*! put child1_3 at each child2 position */
+	float cost30 = halfArea3f(merge(child1_3,child2c1,child2c2,child2c3));
+	float cost31 = halfArea3f(merge(child2c0,child1_3,child2c2,child2c3));
+	float cost32 = halfArea3f(merge(child2c0,child2c1,child1_3,child2c3));
+	float cost33 = halfArea3f(merge(child2c0,child2c1,child2c2,child1_3));
+	vfloat4 cost3 = vfloat4(cost30,cost31,cost32,cost33);
+	vfloat4 min3 = vreduce_min(cost3);
+	int pos3 = (int)bsf(movemask(min3 == cost3));
+	
+	/*! find best other child: area change per candidate child1 = cheapest merged area minus the current area of child2 */
+	vfloat4 area0123 = vfloat4(extract<0>(min0),extract<0>(min1),extract<0>(min2),extract<0>(min3)) - vfloat4(childArea[c2]);
+	int pos[4] = { pos0,pos1,pos2,pos3 };
+	const size_t mbd = BVH4::maxBuildDepth;
+	vbool4 valid = vint4(int(depth+1))+cdepth <= vint4(mbd); // only select swaps that fulfill depth constraints
+	valid &= vint4(int(c2)) != vint4(step); // excludes c1 == c2, assuming 'step' yields the lane indices 0..3
+	if (none(valid)) continue;
+	size_t c1 = select_min(valid,area0123);
+	float area = area0123[c1]; 
+        if (c1 == c2) continue; // can happen if bounds are NANs
+	
+	/*! accept a swap when it reduces cost and is not swapping a node with itself */
+	if (area < bestArea) {
+	  bestArea = area;
+	  bestChild1 = c1;
+	  bestChild2 = c2;
+	  bestChild2Child = pos[c1];
+	}
+      }
+      
+      /*! if we did not find a swap that improves the SAH then do nothing */
+      if (bestChild1 == size_t(-1)) return 1+reduce_max(cdepth);
+      
+      /*! perform the best found tree rotation */
+      AABBNode* child2 = parent->child(bestChild2).getAABBNode();
+      AABBNode::swap(parent,bestChild1,child2,bestChild2Child);
+      parent->setBounds(bestChild2,child2->bounds()); // refresh the parent's bounds for child2 after the swap
+      AABBNode::compact(parent);
+      AABBNode::compact(child2);
+      
+      /*! This returned depth is conservative as the child that was
+       *  pulled up in the tree could have been on the critical path. */
+      cdepth[bestChild1]++; // bestChild1 was pushed down one level
+      return 1+reduce_max(cdepth); 
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h
new file mode 100644
index 0000000000..009bef339e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_rotate.h
@@ -0,0 +1,37 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+
+namespace embree
+{
+  namespace isa 
+  { 
+    template<int N>
+    class BVHNRotate
+    {
+      typedef typename BVHN<N>::NodeRef NodeRef;
+      /*! generic case: tree rotations are disabled, rotate() returns 0 and restructure() does nothing */
+    public:
+      static const bool enabled = false;
+
+      static __forceinline size_t rotate(NodeRef parentRef, size_t depth = 1) { return 0; }
+      static __forceinline void restructure(NodeRef ref, size_t depth = 1) {}
+    };
+
+    /* BVH4 tree rotations: specialization with rotations enabled */
+    template<>
+    class BVHNRotate<4>
+    {
+      typedef BVH4::AABBNode AABBNode;
+      typedef BVH4::NodeRef NodeRef;
+      
+    public:
+      static const bool enabled = true;
+      /*! rotates the subtree below parentRef; returns 0 for barriers and leaves, otherwise a conservative subtree depth */
+      static size_t rotate(NodeRef parentRef, size_t depth = 1);
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp
new file mode 100644
index 0000000000..aa56035026
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp
@@ -0,0 +1,168 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "bvh_statistics.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+  template<int N>
+  BVHNStatistics<N>::BVHNStatistics (BVH* bvh) : bvh(bvh)
+  {
+    /* gather statistics from the root over the time range [0,1]; the root's half area is clamped to be non-negative */
+    stat = statistics(bvh->root,max(0.0f,bvh->getLinearBounds().expectedHalfArea()),BBox1f(0.0f,1.0f));
+  }
+  
+  template<int N>
+  std::string BVHNStatistics<N>::str() // formats the gathered statistics as a multi-line report
+  {
+    std::ostringstream stream;
+    stream.setf(std::ios::fixed, std::ios::floatfield); // fixed-point notation for all floating-point output
+    stream << "  primitives = " << bvh->numPrimitives << ", vertices = " << bvh->numVertices << ", depth = " << stat.depth << std::endl;
+    size_t totalBytes = stat.bytes(bvh);
+    double totalSAH = stat.sah(bvh);
+    stream << "  total            : sah = "  << std::setw(7) << std::setprecision(3) << totalSAH << " (100.00%), ";
+    stream << "#bytes = " << std::setw(7) << std::setprecision(2) << totalBytes/1E6 << " MB (100.00%), ";
+    stream << "#nodes = " << std::setw(7) << stat.size() << " (" << std::setw(6) << std::setprecision(2) << 100.0*stat.fillRate(bvh) << "% filled), ";
+    stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(totalBytes)/double(bvh->numPrimitives) << std::endl;
+    if (stat.statAABBNodes.numNodes    ) stream << "  getAABBNodes     : "  << stat.statAABBNodes.toString(bvh,totalSAH,totalBytes) << std::endl; // per-node-type rows are printed only when that type occurs
+    if (stat.statOBBNodes.numNodes  ) stream << "  ungetAABBNodes   : "  << stat.statOBBNodes.toString(bvh,totalSAH,totalBytes) << std::endl; // NOTE(review): this label reports the OBB node statistics
+    if (stat.statAABBNodesMB.numNodes  ) stream << "  getAABBNodesMB   : "  << stat.statAABBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (stat.statAABBNodesMB4D.numNodes) stream << "  getAABBNodesMB4D : "  << stat.statAABBNodesMB4D.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (stat.statOBBNodesMB.numNodes) stream << "  ungetAABBNodesMB : "  << stat.statOBBNodesMB.toString(bvh,totalSAH,totalBytes) << std::endl; // NOTE(review): this label reports the motion blur OBB node statistics
+    if (stat.statQuantizedNodes.numNodes  ) stream << "  quantizedNodes   : "  << stat.statQuantizedNodes.toString(bvh,totalSAH,totalBytes) << std::endl;
+    if (true)                               stream << "  leaves           : "  << stat.statLeaf.toString(bvh,totalSAH,totalBytes) << std::endl; // leaf rows are always printed
+    if (true)                               stream << "    histogram      : "  << stat.statLeaf.histToString() << std::endl;
+    return stream.str();
+  }
+  
+  /*! Recursively gathers node, leaf and SAH statistics for the subtree at 'node'; A is the half area charged to 'node', t0t1 its time range. */
+  template<int N>
+  typename BVHNStatistics<N>::Statistics BVHNStatistics<N>::statistics(NodeRef node, const double A, const BBox1f t0t1)
+  {
+    Statistics s;
+    assert(t0t1.size() > 0.0f);
+    double dt = max(0.0f,t0t1.size()); // time extent, clamped so it never contributes a negative SAH term
+    if (node.isAABBNode())
+    {
+      AABBNode* n = node.getAABBNode();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) { // one recursive call per non-empty child, combined with Statistics::add
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,halfArea(n->extend(i)));
+          Statistics s = statistics(n->child(i),Ai,t0t1); 
+          s.statAABBNodes.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statAABBNodes.numNodes++;
+      s.statAABBNodes.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isOBBNode())
+    {
+      OBBNode* n = node.ungetAABBNode();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,halfArea(n->extent(i)));
+          Statistics s = statistics(n->child(i),Ai,t0t1); 
+          s.statOBBNodes.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statOBBNodes.numNodes++;
+      s.statOBBNodes.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isAABBNodeMB())
+    {
+      AABBNodeMB* n = node.getAABBNodeMB();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,n->expectedHalfArea(i,t0t1));
+          Statistics s = statistics(n->child(i),Ai,t0t1);
+          s.statAABBNodesMB.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statAABBNodesMB.numNodes++;
+      s.statAABBNodesMB.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isAABBNodeMB4D())
+    {
+      AABBNodeMB4D* n = node.getAABBNodeMB4D();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const BBox1f t0t1i = intersect(t0t1,n->timeRange(i)); // each child is evaluated over its own share of the time range
+          assert(!t0t1i.empty());
+          const double Ai = max(0.0f,n->AABBNodeMB::expectedHalfArea(i,t0t1i)); // clamped like the child areas of every other node type
+          Statistics s =  statistics(n->child(i),Ai,t0t1i);
+          s.statAABBNodesMB4D.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statAABBNodesMB4D.numNodes++;
+      s.statAABBNodesMB4D.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isOBBNodeMB())
+    {
+      OBBNodeMB* n = node.ungetAABBNodeMB();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,halfArea(n->extent0(i)));
+          Statistics s = statistics(n->child(i),Ai,t0t1); 
+          s.statOBBNodesMB.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statOBBNodesMB.numNodes++;
+      s.statOBBNodesMB.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isQuantizedNode())
+    {
+      QuantizedNode* n = node.quantizedNode();
+      s = s + parallel_reduce(0,N,Statistics(),[&] ( const int i ) {
+          if (n->child(i) == BVH::emptyNode) return Statistics();
+          const double Ai = max(0.0f,halfArea(n->extent(i)));
+          Statistics s = statistics(n->child(i),Ai,t0t1); 
+          s.statQuantizedNodes.numChildren++;
+          return s;
+        }, Statistics::add);
+      s.statQuantizedNodes.numNodes++;
+      s.statQuantizedNodes.nodeSAH += dt*A;
+      s.depth++;
+    }
+    else if (node.isLeaf())
+    {
+      size_t num; const char* tri = node.leaf(num);
+      if (num) // leaves without primitive blocks contribute nothing
+      {
+        for (size_t i=0; i<num; i++)
+        {
+          const size_t bytes = bvh->primTy->getBytes(tri);
+          s.statLeaf.numPrimsActive += bvh->primTy->sizeActive(tri);
+          s.statLeaf.numPrimsTotal += bvh->primTy->sizeTotal(tri);
+          s.statLeaf.numBytes += bytes;
+          tri+=bytes; // advance to the next primitive block of this leaf
+        }
+        s.statLeaf.numLeaves++;
+        s.statLeaf.numPrimBlocks += num;
+        s.statLeaf.leafSAH += dt*A*num;
+        if (num-1 < Statistics::LeafStat::NHIST) // block counts beyond the histogram range are not recorded
+          s.statLeaf.numPrimBlocksHistogram[num-1]++;
+      }
+    }
+    else {
+      // -- GODOT start --
+      // throw std::runtime_error("not supported node type in bvh_statistics");
+      abort();
+      // -- GODOT end --
+    }
+    return s;
+  }
+
+#if defined(__AVX__)
+  template class BVHNStatistics<8>;
+#endif
+
+#if !defined(__AVX__) || (!defined(EMBREE_TARGET_SSE2) && !defined(EMBREE_TARGET_SSE42)) || defined(__aarch64__)
+  template class BVHNStatistics<4>;
+#endif
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h
new file mode 100644
index 0000000000..73dfc6fbcc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.h
@@ -0,0 +1,285 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include <sstream>
+
+namespace embree
+{
+  template<int N>
+  class BVHNStatistics
+  {
+    typedef BVHN<N> BVH;
+    typedef typename BVH::AABBNode AABBNode;
+    typedef typename BVH::OBBNode OBBNode;
+    typedef typename BVH::AABBNodeMB AABBNodeMB;
+    typedef typename BVH::AABBNodeMB4D AABBNodeMB4D;
+    typedef typename BVH::OBBNodeMB OBBNodeMB;
+    typedef typename BVH::QuantizedNode QuantizedNode;
+
+    typedef typename BVH::NodeRef NodeRef;
+
+    struct Statistics 
+    {
+      template<typename Node>
+        struct NodeStat
+      {
+        /*! All counters default to zero, which makes a default NodeStat neutral under operator+. */
+        NodeStat ( double nodeSAH = 0, size_t numNodes = 0, size_t numChildren = 0)
+          : nodeSAH(nodeSAH), numNodes(numNodes), numChildren(numChildren) {}
+
+        /*! Accumulated node SAH divided by the expected half area of the BVH's linear bounds. */
+        double sah(BVH* bvh) const {
+          const auto boundsArea = bvh->getLinearBounds().expectedHalfArea();
+          return nodeSAH/boundsArea;
+        }
+        /*! Bytes occupied by the counted nodes. */
+        size_t bytes() const { return sizeof(Node)*numNodes; }
+        /*! Number of counted nodes. */
+        size_t size() const { return numNodes; }
+        /*! Fill rate is numChildren/(numNodes*N); both terms are exposed so callers can aggregate them. */
+        double fillRateNom () const { return static_cast<double>(numChildren); }
+        double fillRateDen () const { return static_cast<double>(numNodes*N); }
+        double fillRate    () const { return fillRateNom()/fillRateDen(); }
+        /*! Member-wise sum of two statistics. */
+        __forceinline friend NodeStat operator+ ( const NodeStat& a, const NodeStat& b)
+        {
+          NodeStat sum(a);
+          sum.nodeSAH     += b.nodeSAH;
+          sum.numNodes    += b.numNodes;
+          sum.numChildren += b.numChildren;
+          return sum;
+        }
+
+        /*! Formats SAH, memory use, node count, fill rate and bytes per primitive as one line of text. */
+        std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const
+        {
+          const double sahValue = sah(bvh);
+          const size_t numBytes = bytes();
+          std::ostringstream out;
+          out.setf(std::ios::fixed, std::ios::floatfield);
+          out << "sah = " << std::setw(7) << std::setprecision(3) << sahValue
+              << " (" << std::setw(6) << std::setprecision(2) << 100.0*sahValue/sahTotal << "%), "
+              << "#bytes = " << std::setw(7) << std::setprecision(2) << numBytes/1E6  << " MB "
+              << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(numBytes)/double(bytesTotal) << "%), "
+              << "#nodes = " << std::setw(7) << numNodes << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate() << "% filled), "
+              << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(numBytes)/double(bvh->numPrimitives);
+          return out.str();
+        }
+
+      public:
+        double nodeSAH;
+        size_t numNodes;
+        size_t numChildren;
+      };
+
+      struct LeafStat
+      {
+        static const int NHIST = 8;        //!< number of histogram bins; bin i counts leaves holding i+1 primitive blocks
+        /*! Builds leaf statistics from the given counters; the histogram always starts out zeroed. */
+        LeafStat ( double leafSAH = 0.0,
+                   size_t numLeaves = 0,
+                   size_t numPrimsActive = 0,
+                   size_t numPrimsTotal = 0,
+                   size_t numPrimBlocks = 0,
+                   size_t numBytes = 0)
+        : leafSAH(leafSAH),
+          numLeaves(numLeaves),
+          numPrimsActive(numPrimsActive),
+          numPrimsTotal(numPrimsTotal),
+          numPrimBlocks(numPrimBlocks),
+          numBytes(numBytes)
+        {
+          for (size_t i=0; i<NHIST; i++)
+            numPrimBlocksHistogram[i] = 0;
+        }
+        /*! Accumulated leaf SAH divided by the expected half area of the BVH's linear bounds. */
+        double sah(BVH* bvh) const {
+          return leafSAH/bvh->getLinearBounds().expectedHalfArea();
+        }
+        /*! Bytes of all counted leaves; the bvh argument is not used. */
+        size_t bytes(BVH* bvh) const {
+          return numBytes;
+        }
+        /*! Number of counted leaves. */
+        size_t size() const {
+          return numLeaves;
+        }
+        /*! Fill rate is numPrimsActive/numPrimsTotal; the bvh argument is not used by any of the three. */
+        double fillRateNom (BVH* bvh) const { return double(numPrimsActive);  }
+        double fillRateDen (BVH* bvh) const { return double(numPrimsTotal);  }
+        double fillRate    (BVH* bvh) const { return fillRateNom(bvh)/fillRateDen(bvh); }
+        /*! Member-wise sum of two leaf statistics, including every histogram bin. */
+        __forceinline friend LeafStat operator+ ( const LeafStat& a, const LeafStat& b)
+        {
+          LeafStat stat(a.leafSAH + b.leafSAH,
+                        a.numLeaves+b.numLeaves,
+                        a.numPrimsActive+b.numPrimsActive,
+                        a.numPrimsTotal+b.numPrimsTotal,
+                        a.numPrimBlocks+b.numPrimBlocks,
+                        a.numBytes+b.numBytes);
+          for (size_t i=0; i<NHIST; i++) {
+            /* the constructor zeroed stat's bins, so a plain sum gives the same result as two increments */
+            stat.numPrimBlocksHistogram[i] = a.numPrimBlocksHistogram[i] + b.numPrimBlocksHistogram[i];
+          }
+          return stat;
+        }
+        /*! Formats SAH, memory use, leaf count, fill rate and bytes per primitive as one line of text. */
+        std::string toString(BVH* bvh, double sahTotal, size_t bytesTotal) const
+        {
+          std::ostringstream stream;
+          stream.setf(std::ios::fixed, std::ios::floatfield);
+          stream << "sah = " << std::setw(7) << std::setprecision(3) << sah(bvh);
+          stream << " (" << std::setw(6) << std::setprecision(2) << 100.0*sah(bvh)/sahTotal << "%), ";
+          stream << "#bytes = " << std::setw(7) << std::setprecision(2) << double(bytes(bvh))/1E6  << " MB ";
+          stream << "(" << std::setw(6) << std::setprecision(2) << 100.0*double(bytes(bvh))/double(bytesTotal) << "%), ";
+          stream << "#nodes = " << std::setw(7) << numLeaves << " (" << std::setw(6) << std::setprecision(2) << 100.0*fillRate(bvh) << "% filled), ";
+          stream << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytes(bvh))/double(bvh->numPrimitives);
+          return stream.str();
+        }
+        /*! Prints each histogram bin as a percentage of numLeaves, one "xx.xx% " entry per bin. */
+        std::string histToString() const
+        {
+          std::ostringstream stream;
+          stream.setf(std::ios::fixed, std::ios::floatfield);
+          for (size_t i=0; i<NHIST; i++)
+            stream << std::setw(6) << std::setprecision(2) << 100.0f*float(numPrimBlocksHistogram[i])/float(numLeaves) << "% ";
+          return stream.str();
+        }
+
+      public:
+        double leafSAH;                    //!< SAH of the leaves only
+        size_t numLeaves;                  //!< Number of leaf nodes.
+        size_t numPrimsActive;             //!< Number of active primitives.
+        size_t numPrimsTotal;              //!< Number of active and inactive primitives
+        size_t numPrimBlocks;              //!< Number of primitive blocks.
+        size_t numBytes;                   //!< Number of bytes of leaves.
+        size_t numPrimBlocksHistogram[NHIST]; //!< Bin i counts the leaves that hold i+1 primitive blocks.
+      };
+
+    public:
+      Statistics (size_t depth = 0,
+                  LeafStat statLeaf = LeafStat(),
+                  NodeStat<AABBNode> statAABBNodes = NodeStat<AABBNode>(),
+                  NodeStat<OBBNode> statOBBNodes = NodeStat<OBBNode>(),
+                  NodeStat<AABBNodeMB> statAABBNodesMB = NodeStat<AABBNodeMB>(),
+                  NodeStat<AABBNodeMB4D> statAABBNodesMB4D = NodeStat<AABBNodeMB4D>(),
+                  NodeStat<OBBNodeMB> statOBBNodesMB = NodeStat<OBBNodeMB>(),
+                  NodeStat<QuantizedNode> statQuantizedNodes = NodeStat<QuantizedNode>())
+      /* every member is copied from the parameter of the same name; all parameters default to empty statistics */
+      : depth(depth)
+      , statLeaf(statLeaf)
+      , statAABBNodes(statAABBNodes)
+      , statOBBNodes(statOBBNodes)
+      , statAABBNodesMB(statAABBNodesMB)
+      , statAABBNodesMB4D(statAABBNodesMB4D)
+      , statOBBNodesMB(statOBBNodesMB)
+      , statQuantizedNodes(statQuantizedNodes) {}
+
+      double sah(BVH* bvh) const { // sum of the normalized SAH of the leaves and of every node type
+        double total = statLeaf.sah(bvh);
+        total += statAABBNodes.sah(bvh);
+        total += statOBBNodes.sah(bvh);
+        total += statAABBNodesMB.sah(bvh);
+        total += statAABBNodesMB4D.sah(bvh);
+        total += statOBBNodesMB.sah(bvh);
+        total += statQuantizedNodes.sah(bvh);
+        return total;
+      }
+      
+      size_t bytes(BVH* bvh) const {
+        /* total memory: the leaves' byte count plus the byte count of every node type */
+        size_t total = statLeaf.bytes(bvh);
+        total += statAABBNodes.bytes() + statOBBNodes.bytes();
+        total += statAABBNodesMB.bytes() + statAABBNodesMB4D.bytes();
+        total += statOBBNodesMB.bytes() + statQuantizedNodes.bytes();
+
+        return total;
+      }
+
+      size_t size() const { // number of leaves plus the number of nodes of every type
+        size_t total = statLeaf.size();
+        total += statAABBNodes.size();
+        total += statOBBNodes.size();
+        total += statAABBNodesMB.size();
+        total += statAABBNodesMB4D.size();
+        total += statOBBNodesMB.size();
+        total += statQuantizedNodes.size();
+        return total;
+      }
+
+      double fillRate (BVH* bvh) const {
+        /* ratio of the summed numerators to the summed denominators, not an average of the per-type rates */
+        double nom = statLeaf.fillRateNom(bvh);
+        nom += statAABBNodes.fillRateNom();
+        nom += statOBBNodes.fillRateNom();
+        nom += statAABBNodesMB.fillRateNom();
+        nom += statAABBNodesMB4D.fillRateNom();
+        nom += statOBBNodesMB.fillRateNom();
+        nom += statQuantizedNodes.fillRateNom();
+        double den = statLeaf.fillRateDen(bvh);
+        den += statAABBNodes.fillRateDen();
+        den += statOBBNodes.fillRateDen();
+        den += statAABBNodesMB.fillRateDen();
+        den += statAABBNodesMB4D.fillRateDen();
+        den += statOBBNodesMB.fillRateDen();
+        den += statQuantizedNodes.fillRateDen();
+        return nom/den;
+      }
+
+      friend Statistics operator+ ( const Statistics& a, const Statistics& b )
+      {
+        /* the merged depth is the larger of the two depths; every other member is added up */
+        Statistics sum(max(a.depth,b.depth));
+        sum.statLeaf           = a.statLeaf + b.statLeaf;
+        sum.statAABBNodes      = a.statAABBNodes + b.statAABBNodes;
+        sum.statOBBNodes       = a.statOBBNodes + b.statOBBNodes;
+        sum.statAABBNodesMB    = a.statAABBNodesMB + b.statAABBNodesMB;
+        sum.statAABBNodesMB4D  = a.statAABBNodesMB4D + b.statAABBNodesMB4D;
+        sum.statOBBNodesMB     = a.statOBBNodesMB + b.statOBBNodesMB;
+        sum.statQuantizedNodes = a.statQuantizedNodes + b.statQuantizedNodes;
+        return sum;
+      }
+      /*! Named form of operator+, usable where a plain function is needed (e.g. as a reduction callback). */
+      static Statistics add ( const Statistics& a, const Statistics& b ) { return a+b; }
+
+    public:
+      size_t depth;
+      LeafStat statLeaf;
+      NodeStat<AABBNode> statAABBNodes;
+      NodeStat<OBBNode> statOBBNodes;
+      NodeStat<AABBNodeMB> statAABBNodesMB;
+      NodeStat<AABBNodeMB4D> statAABBNodesMB4D;
+      NodeStat<OBBNodeMB> statOBBNodesMB;
+      NodeStat<QuantizedNode> statQuantizedNodes;
+    };
+
+  public:
+
+    /* Constructor gathers statistics. */
+    BVHNStatistics (BVH* bvh);
+
+    /*! Convert statistics into a string */
+    std::string str();
+
+    double sah() const { // SAH of the gathered statistics, evaluated against the stored bvh pointer
+      return stat.sah(bvh); 
+    }
+    /*! Bytes of all leaves and nodes counted in the gathered statistics. */
+    size_t bytesUsed() const {
+      return stat.bytes(bvh);
+    }
+
+  private:
+    Statistics statistics(NodeRef node, const double A, const BBox1f dt);
+
+  private:
+    BVH* bvh;
+    Statistics stat;
+  };
+
+  typedef BVHNStatistics<4> BVH4Statistics;
+  typedef BVHNStatistics<8> BVH8Statistics;
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h
new file mode 100644
index 0000000000..7f17084b81
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser1.h
@@ -0,0 +1,676 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "node_intersector1.h"
+#include "../common/stack_item.h"
+
+#define NEW_SORTING_CODE 1
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! BVH regular node traversal for single rays. */
+    template<int N, int Nx, int types>
+    class BVHNNodeTraverser1Hit;
+
+    /*! Helper functions for fast sorting using AVX512 instructions. */
+#if defined(__AVX512ER__)
+
+    /* KNL code path */
+    __forceinline void isort_update(vfloat16 &dist, vllong8 &ptr, const vfloat16 &d, const vllong8 &p) // inserts distance d / pointer p into the dist/ptr vectors
+    {
+      const vfloat16 dist_shift = align_shift_right<15>(dist,dist); // NOTE(review): presumably dist rotated by one lane -- confirm against align_shift_right
+      const vllong8  ptr_shift  = align_shift_right<7>(ptr,ptr);    // presumably the same one-lane rotation for the 8 pointer lanes
+      const vbool16 m_geq = d >= dist;                               // lanes whose stored distance is <= the new distance
+      const vbool16 m_geq_shift = m_geq << 1;                        // the same mask moved up by one bit
+      dist = select(m_geq,d,dist);                                   // overwrite the m_geq lanes with the new distance ...
+      ptr  = select(vboold8(m_geq),p,ptr);                           // ... and the new pointer
+      dist = select(m_geq_shift,dist_shift,dist);                    // then take the rotated old values in the lanes of the shifted mask
+      ptr  = select(vboold8(m_geq_shift),ptr_shift,ptr);
+    }
+
+    __forceinline void isort_quick_update(vfloat16 &dist, vllong8 &ptr, const vfloat16 &d, const vllong8 &p) // shifts one entry into dist/ptr without any comparison
+    {
+      // earlier variant kept for reference: it shifted d and p in directly, without the permute
+      //dist = align_shift_right<15>(dist,d);
+      //ptr  = align_shift_right<7>(ptr,p);
+      dist = align_shift_right<15>(dist,permute(d,vint16(zero))); // NOTE(review): permute with an all-zero index presumably broadcasts lane 0 of d -- confirm
+      ptr  = align_shift_right<7>(ptr,permute(p,vllong8(zero)));
+    }
+
+    template<int N, int Nx, int types, class NodeRef, class BaseNode>
+    __forceinline void traverseClosestHitAVX512(NodeRef& cur,                      // in: node to traverse, out: child to continue with
+                                                size_t mask,                       // one bit per hit child, must not be 0
+                                                const vfloat<Nx>& tNear,           // per-child distances used as sort keys
+                                                StackItemT<NodeRef>*& stackPtr,    // advanced by (number of hit children - 1)
+                                                StackItemT<NodeRef>* stackEnd)     // not referenced in this function
+    {
+      assert(mask != 0);
+      const BaseNode* node = cur.baseNode();
+      /* NOTE(review): compact presumably packs the lanes selected by mask to the front -- confirm against the vector classes */
+      vllong8 children( vllong<N>::loadu((void*)node->children) );
+      children = vllong8::compact((int)mask,children);
+      vfloat16 distance = tNear;
+      distance = vfloat16::compact((int)mask,distance,tNear);
+
+      cur = toScalar(children);
+      BVHN<N>::prefetch(cur,types);
+
+      mask &= mask-1; // clears the lowest set bit: one hit child has been consumed
+      if (likely(mask == 0)) return;
+
+      /* 2 hits: order A0 B0 */
+      const vllong8 c0(children);
+      const vfloat16 d0(distance);
+      children = align_shift_right<1>(children,children);
+      distance = align_shift_right<1>(distance,distance);
+      const vllong8 c1(children);
+      const vfloat16 d1(distance);
+
+      cur = toScalar(children);
+      BVHN<N>::prefetch(cur,types);
+
+      /* a '<' keeps the order for equal distances, scenes like powerplant largely benefit from it */
+      const vboolf16 m_dist  = d0 < d1;
+      const vfloat16 dist_A0 = select(m_dist, d0, d1);
+      const vfloat16 dist_B0 = select(m_dist, d1, d0);
+      const vllong8 ptr_A0   = select(vboold8(m_dist), c0, c1);
+      const vllong8 ptr_B0   = select(vboold8(m_dist), c1, c0);
+
+      mask &= mask-1;
+      if (likely(mask == 0)) {
+        cur = toScalar(ptr_A0);                          // continue with the nearer child A0
+        stackPtr[0].ptr            = toScalar(ptr_B0);   // push the farther child B0
+        *(float*)&stackPtr[0].dist = toScalar(dist_B0);
+        stackPtr++;
+        return;
+      }
+
+      /* 3 hits: order A1 B1 C1 */
+
+      children = align_shift_right<1>(children,children);
+      distance = align_shift_right<1>(distance,distance);
+
+      const vllong8 c2(children);
+      const vfloat16 d2(distance);
+
+      cur = toScalar(children);
+      BVHN<N>::prefetch(cur,types);
+
+      const vboolf16 m_dist1     = dist_A0 <= d2;
+      const vfloat16 dist_tmp_B1 = select(m_dist1, d2, dist_A0);
+      const vllong8  ptr_A1      = select(vboold8(m_dist1), ptr_A0, c2);
+      const vllong8  ptr_tmp_B1  = select(vboold8(m_dist1), c2, ptr_A0);
+
+      const vboolf16 m_dist2     = dist_B0 <= dist_tmp_B1;
+      const vfloat16 dist_B1     = select(m_dist2, dist_B0 , dist_tmp_B1);
+      const vfloat16 dist_C1     = select(m_dist2, dist_tmp_B1, dist_B0);
+      const vllong8  ptr_B1      = select(vboold8(m_dist2), ptr_B0, ptr_tmp_B1);
+      const vllong8  ptr_C1      = select(vboold8(m_dist2), ptr_tmp_B1, ptr_B0);
+
+      mask &= mask-1;
+      if (likely(mask == 0)) {
+        cur = toScalar(ptr_A1);
+        stackPtr[0].ptr  = toScalar(ptr_C1);             // farthest child is written to the lowest stack slot
+        *(float*)&stackPtr[0].dist = toScalar(dist_C1);
+        stackPtr[1].ptr  = toScalar(ptr_B1);
+        *(float*)&stackPtr[1].dist = toScalar(dist_B1);
+        stackPtr+=2;
+        return;
+      }
+
+      /* 4 hits: order A2 B2 C2 D2 */
+
+      const vfloat16 dist_A1  = select(m_dist1, dist_A0, d2);
+
+      children = align_shift_right<1>(children,children);
+      distance = align_shift_right<1>(distance,distance);
+
+      const vllong8 c3(children);
+      const vfloat16 d3(distance);
+
+      cur = toScalar(children);
+      BVHN<N>::prefetch(cur,types);
+
+      const vboolf16 m_dist3     = dist_A1 <= d3;
+      const vfloat16 dist_tmp_B2 = select(m_dist3, d3, dist_A1);
+      const vllong8  ptr_A2      = select(vboold8(m_dist3), ptr_A1, c3);
+      const vllong8  ptr_tmp_B2  = select(vboold8(m_dist3), c3, ptr_A1);
+
+      const vboolf16 m_dist4     = dist_B1 <= dist_tmp_B2;
+      const vfloat16 dist_B2     = select(m_dist4, dist_B1 , dist_tmp_B2);
+      const vfloat16 dist_tmp_C2 = select(m_dist4, dist_tmp_B2, dist_B1);
+      const vllong8  ptr_B2      = select(vboold8(m_dist4), ptr_B1, ptr_tmp_B2);
+      const vllong8  ptr_tmp_C2  = select(vboold8(m_dist4), ptr_tmp_B2, ptr_B1);
+
+      const vboolf16 m_dist5     = dist_C1 <= dist_tmp_C2;
+      const vfloat16 dist_C2     = select(m_dist5, dist_C1 , dist_tmp_C2);
+      const vfloat16 dist_D2     = select(m_dist5, dist_tmp_C2, dist_C1);
+      const vllong8  ptr_C2      = select(vboold8(m_dist5), ptr_C1, ptr_tmp_C2);
+      const vllong8  ptr_D2      = select(vboold8(m_dist5), ptr_tmp_C2, ptr_C1);
+
+      mask &= mask-1;
+      if (likely(mask == 0)) {
+        cur = toScalar(ptr_A2);
+        stackPtr[0].ptr  = toScalar(ptr_D2);
+        *(float*)&stackPtr[0].dist = toScalar(dist_D2);
+        stackPtr[1].ptr  = toScalar(ptr_C2);
+        *(float*)&stackPtr[1].dist = toScalar(dist_C2);
+        stackPtr[2].ptr  = toScalar(ptr_B2);
+        *(float*)&stackPtr[2].dist = toScalar(dist_B2);
+        stackPtr+=3;
+        return;
+      }
+
+      /* >=5 hits: reverse to descending order for writing to stack */
+
+      const size_t hits = 4 + popcnt(mask);
+      const vfloat16 dist_A2  = select(m_dist3, dist_A1, d3);
+      vfloat16 dist(neg_inf);
+      vllong8 ptr(zero);
+
+      /* seed the sorted vectors with the four children ordered above, nearest (A2) first */
+      isort_quick_update(dist,ptr,dist_A2,ptr_A2);
+      isort_quick_update(dist,ptr,dist_B2,ptr_B2);
+      isort_quick_update(dist,ptr,dist_C2,ptr_C2);
+      isort_quick_update(dist,ptr,dist_D2,ptr_D2);
+
+      do {
+
+        children = align_shift_right<1>(children,children);
+        distance = align_shift_right<1>(distance,distance);
+
+        cur = toScalar(children);
+        BVHN<N>::prefetch(cur,types);
+
+        const vfloat16 new_dist(permute(distance,vint16(zero)));
+        const vllong8 new_ptr(permute(children,vllong8(zero)));
+
+        mask &= mask-1;
+        isort_update(dist,ptr,new_dist,new_ptr); // one remaining hit child is merged per loop iteration
+
+      } while(mask);
+
+      const vboold8 m_stack_ptr(0x55);  // 10101010 (lsb -> msb)
+      const vboolf16 m_stack_dist(0x4444); // 0010001000100010 (lsb -> msb)
+
+      /* extract current noderef */
+      cur = toScalar(permute(ptr,vllong8(hits-1)));
+      /* rearrange pointers to beginning of 16 bytes block */
+      vllong8 stackElementA0;
+      stackElementA0 = vllong8::expand(m_stack_ptr,ptr,stackElementA0);
+      /* put distances in between */
+      vuint16 stackElementA1((__m512i)stackElementA0);
+      stackElementA1 = vuint16::expand(m_stack_dist,asUInt(dist),stackElementA1);
+      /* write out first 4 x 16 bytes block to stack */
+      vuint16::storeu(stackPtr,stackElementA1);
+      /* get upper half of dist and ptr */
+      dist = align_shift_right<4>(dist,dist);
+      ptr  = align_shift_right<4>(ptr,ptr);
+      /* assemble and write out second block */
+      vllong8 stackElementB0;
+      stackElementB0 = vllong8::expand(m_stack_ptr,ptr,stackElementB0);
+      vuint16 stackElementB1((__m512i)stackElementB0);
+      stackElementB1 = vuint16::expand(m_stack_dist,asUInt(dist),stackElementB1);
+      vuint16::storeu(stackPtr + 4,stackElementB1); // stored unconditionally, whatever the value of hits
+      /* increase stack pointer */
+      stackPtr += hits-1;
+    }
+#endif
+
+#if defined(__AVX512VL__) // SKX
+
+    template<int N>
+    __forceinline void isort_update(vint<N> &dist, const vint<N> &d) // inserts key d into the key vector dist
+    {
+      const vint<N> dist_shift = align_shift_right<N-1>(dist,dist); // NOTE(review): presumably dist rotated by one lane -- confirm against align_shift_right
+      const vboolf<N> m_geq = d >= dist;                             // lanes whose stored key is <= the new key
+      const vboolf<N> m_geq_shift = m_geq << 1;                      // the same mask moved up by one bit
+      dist = select(m_geq,d,dist);                                   // overwrite the m_geq lanes with the new key ...
+      dist = select(m_geq_shift,dist_shift,dist);                    // ... then take the rotated old keys in the lanes of the shifted mask
+    }
+
+    template<int N>
+    __forceinline void isort_quick_update(vint<N> &dist, const vint<N> &d) { // shifts one key into dist without any comparison
+      dist = align_shift_right<N-1>(dist,permute(d,vint<N>(zero))); // NOTE(review): permute with an all-zero index presumably broadcasts lane 0 of d -- confirm
+    }
+
+    __forceinline size_t permuteExtract(const vint8& index, const vllong4& n0, const vllong4& n1) { // picks a 64-bit entry out of the n0/n1 pair via index
+      return toScalar(permutex2var((__m256i)index,n0,n1)); // NOTE(review): toScalar presumably returns lane 0 of the permuted vector -- confirm
+    }
+
+    __forceinline float permuteExtract(const vint8& index, const vfloat8& n) { // float counterpart: picks an entry of n via index
+      return toScalar(permute(n,index)); // NOTE(review): toScalar presumably returns lane 0 of the permuted vector -- confirm
+    }
+
+#endif
+
+    /* Specialization for BVH4. */
+    template<int Nx, int types>
+    class BVHNNodeTraverser1Hit<4, Nx, types>
+    {
+      typedef BVH4 BVH;
+      typedef BVH4::NodeRef NodeRef;
+      typedef BVH4::BaseNode BaseNode;
+
+
+    public:
+      /* Traverses a node with at least one hit child. Optimized for finding the closest hit (intersection). */
+      static __forceinline void traverseClosestHit(NodeRef& cur,                      // in: node to traverse, out: child to continue with
+                                                   size_t mask,                       // one bit per hit child, must not be 0
+                                                   const vfloat<Nx>& tNear,           // per-child distances, read below as raw 32-bit patterns
+                                                   StackItemT<NodeRef>*& stackPtr,    // advanced by one entry per pushed child
+                                                   StackItemT<NodeRef>* stackEnd)     // only used in asserts (and forwarded to the AVX512 path)
+      {
+        assert(mask != 0);
+#if defined(__AVX512ER__)
+        traverseClosestHitAVX512<4,Nx,types,NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd);
+#else
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        size_t r = bscf(mask); // index of a set bit of mask; bscf also removes that bit from mask
+        cur = node->child(r);
+        BVH::prefetch(cur,types);
+        if (likely(mask == 0)) {
+          assert(cur != BVH::emptyNode);
+          return;
+        }
+
+        /*! two children are hit, push far child, and continue with closer child */
+        NodeRef c0 = cur;
+        const unsigned int d0 = ((unsigned int*)&tNear)[r]; // NOTE(review): unsigned compare of float bits orders like floats only for non-negative values -- presumably guaranteed by the caller
+        r = bscf(mask);
+        NodeRef c1 = node->child(r);
+        BVH::prefetch(c1,types);
+        const unsigned int d1 = ((unsigned int*)&tNear)[r];
+        assert(c0 != BVH::emptyNode);
+        assert(c1 != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          assert(stackPtr < stackEnd);
+          if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; }
+          else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; }
+        }
+
+#if NEW_SORTING_CODE == 1
+        vint4 s0((size_t)c0,(size_t)d0);
+        vint4 s1((size_t)c1,(size_t)d1);
+        r = bscf(mask);
+        NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; 
+        vint4 s2((size_t)c2,(size_t)d2);
+        /* 3 hits: after sort3, s0 and s1 are written to the stack and s2 becomes the next node (presumably the nearest) */
+        if (likely(mask == 0)) {
+          StackItemT<NodeRef>::sort3(s0,s1,s2);
+          *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1;
+          cur = toSizeT(s2);
+          stackPtr+=2;
+          return;
+        }
+        r = bscf(mask);
+        NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; 
+        vint4 s3((size_t)c3,(size_t)d3);
+        /* 4 hits: no further mask test is made here; s0..s2 are written to the stack and s3 becomes the next node */
+        StackItemT<NodeRef>::sort4(s0,s1,s2,s3);
+        *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2;
+        cur = toSizeT(s3);
+        stackPtr+=3;
+#else
+        /*! Here starts the slow path for 3 or 4 hit children. We push
+         *  all nodes onto the stack to sort them there. */
+        assert(stackPtr < stackEnd);
+        stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
+        assert(stackPtr < stackEnd);
+        stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
+
+        /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
+        assert(stackPtr < stackEnd);
+        r = bscf(mask);
+        NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+        assert(c != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
+          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+          return;
+        }
+
+        /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
+        assert(stackPtr < stackEnd);
+        r = bscf(mask);
+        c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+        assert(c != BVH::emptyNode);
+        sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
+        cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+#endif
+#endif
+      }
+
+      /* Traverses a node with at least one hit child. Optimized for finding any hit (occlusion). */
+      static __forceinline void traverseAnyHit(NodeRef& cur,
+                                               size_t mask,
+                                               const vfloat<Nx>& tNear,
+                                               NodeRef*& stackPtr,
+                                               NodeRef* stackEnd)
+      {
+        /* the node is fetched once up front because cur is overwritten with child references below */
+        const BaseNode* const parent = cur.baseNode();
+
+        /*! Children are visited in the order in which bscf hands out the set
+         *  bits of mask; tNear is not consulted, so no distance sorting takes
+         *  place. Every child except the last one extracted is pushed on the
+         *  stack, and that last child is handed back to the caller through cur. */
+        for (;;)
+        {
+          /* bscf yields the index of a set bit and removes that bit from mask */
+          const size_t childIndex = bscf(mask);
+          cur = parent->child(childIndex);
+          BVH::prefetch(cur,types);
+          assert(cur != BVH::emptyNode);
+
+          /* nothing left to visit: traversal continues with this child */
+          if (likely(mask == 0)) return;
+
+          /* more children follow, so park this one on the stack */
+          assert(stackPtr < stackEnd);
+          *stackPtr++ = cur;
+        }
+      }
+    };
+
+    /* Specialization for BVH8. */
+    template<int Nx, int types>
+    class BVHNNodeTraverser1Hit<8, Nx, types>
+    {
+      typedef BVH8 BVH;
+      typedef BVH8::NodeRef NodeRef;
+      typedef BVH8::BaseNode BaseNode;
+      
+#if defined(__AVX512VL__)
+      template<class NodeRef, class BaseNode>
+        static __forceinline void traverseClosestHitAVX512VL8(NodeRef& cur,                      // in: node to traverse, out: child to continue with
+                                                              size_t mask,                       // one bit per hit child, must not be 0
+                                                              const vfloat8& tNear,              // per-child distances
+                                                              StackItemT<NodeRef>*& stackPtr,    // advanced by (number of hit children - 1)
+                                                              StackItemT<NodeRef>* stackEnd)     // not referenced in this function
+      {
+        assert(mask != 0);
+        const BaseNode* node = cur.baseNode();
+        const vllong4 n0 = vllong4::loadu((vllong4*)&node->children[0]);
+        const vllong4 n1 = vllong4::loadu((vllong4*)&node->children[4]);
+        vint8 distance_i = (asInt(tNear) & 0xfffffff8) | vint8(step); // sort key: distance bits with the low 3 bits replaced by vint8(step) (presumably the lane index 0..7 -- confirm)
+        distance_i = vint8::compact((int)mask,distance_i,distance_i);
+        cur = permuteExtract(distance_i,n0,n1); // the key doubles as the permute index into n0/n1
+        BVH::prefetch(cur,types);
+
+        mask &= mask-1; // clears the lowest set bit: one hit child has been consumed
+        if (likely(mask == 0)) return;
+
+        /* 2 hits: order A0 B0 */
+        const vint8 d0(distance_i);
+        const vint8 d1(shuffle<1>(distance_i));
+        cur = permuteExtract(d1,n0,n1);
+        BVH::prefetch(cur,types);
+
+        const vint8 dist_A0 = min(d0, d1);
+        const vint8 dist_B0 = max(d0, d1);
+        assert(dist_A0[0] < dist_B0[0]);
+
+        mask &= mask-1;
+        if (likely(mask == 0)) {
+          cur                        = permuteExtract(dist_A0,n0,n1);
+          stackPtr[0].ptr            = permuteExtract(dist_B0,n0,n1);
+          *(float*)&stackPtr[0].dist = permuteExtract(dist_B0,tNear); // the stored distance is read back from the original tNear, not from the key
+          stackPtr++;
+          return;
+        }
+
+        /* 3 hits: order A1 B1 C1 */
+
+        const vint8 d2(shuffle<2>(distance_i));
+        cur = permuteExtract(d2,n0,n1);
+        BVH::prefetch(cur,types);
+
+        const vint8 dist_A1     = min(dist_A0,d2);
+        const vint8 dist_tmp_B1 = max(dist_A0,d2);
+        const vint8 dist_B1     = min(dist_B0,dist_tmp_B1);
+        const vint8 dist_C1     = max(dist_B0,dist_tmp_B1);
+        assert(dist_A1[0] < dist_B1[0]);
+        assert(dist_B1[0] < dist_C1[0]);
+
+        mask &= mask-1;
+        if (likely(mask == 0)) {
+          cur                        = permuteExtract(dist_A1,n0,n1);
+          stackPtr[0].ptr            = permuteExtract(dist_C1,n0,n1);
+          *(float*)&stackPtr[0].dist = permuteExtract(dist_C1,tNear);
+          stackPtr[1].ptr            = permuteExtract(dist_B1,n0,n1);
+          *(float*)&stackPtr[1].dist = permuteExtract(dist_B1,tNear);
+          stackPtr+=2;
+          return;
+        }
+
+        /* 4 hits: order A2 B2 C2 D2 */
+
+        const vint8 d3(shuffle<3>(distance_i));
+        cur = permuteExtract(d3,n0,n1);
+        BVH::prefetch(cur,types);
+
+        const vint8 dist_A2     = min(dist_A1,d3);
+        const vint8 dist_tmp_B2 = max(dist_A1,d3);
+        const vint8 dist_B2     = min(dist_B1,dist_tmp_B2);
+        const vint8 dist_tmp_C2 = max(dist_B1,dist_tmp_B2);
+        const vint8 dist_C2     = min(dist_C1,dist_tmp_C2);
+        const vint8 dist_D2     = max(dist_C1,dist_tmp_C2);
+        assert(dist_A2[0] < dist_B2[0]);
+        assert(dist_B2[0] < dist_C2[0]);
+        assert(dist_C2[0] < dist_D2[0]);
+        
+        mask &= mask-1;
+        if (likely(mask == 0)) {
+          cur                        = permuteExtract(dist_A2,n0,n1);
+          stackPtr[0].ptr            = permuteExtract(dist_D2,n0,n1);
+          *(float*)&stackPtr[0].dist = permuteExtract(dist_D2,tNear);
+          stackPtr[1].ptr            = permuteExtract(dist_C2,n0,n1);
+          *(float*)&stackPtr[1].dist = permuteExtract(dist_C2,tNear);
+          stackPtr[2].ptr            = permuteExtract(dist_B2,n0,n1);
+          *(float*)&stackPtr[2].dist = permuteExtract(dist_B2,tNear);
+          stackPtr+=3;
+          return;
+        }
+
+        /* >=5 hits: reverse to descending order for writing to stack */
+
+        distance_i = align_shift_right<3>(distance_i,distance_i);
+        const size_t hits = 4 + popcnt(mask);
+        vint8 dist(INT_MIN); // this will work with -0.0f (0x80000000) as distance, isort_update uses >= to insert
+        /* seed the sorted key vector with the four keys ordered above, smallest (A2) first */
+        isort_quick_update(dist,dist_A2);
+        isort_quick_update(dist,dist_B2);
+        isort_quick_update(dist,dist_C2);
+        isort_quick_update(dist,dist_D2);
+
+        do {
+
+          distance_i = align_shift_right<1>(distance_i,distance_i);
+          cur = permuteExtract(distance_i,n0,n1);
+          BVH::prefetch(cur,types);
+          const vint8 new_dist(permute(distance_i,vint8(zero)));
+          mask &= mask-1;
+          isort_update(dist,new_dist); // one remaining hit child is merged per loop iteration
+
+        } while(mask);
+
+        for (size_t i=0; i<7; i++)
+          assert(dist[i+0]>=dist[i+1]); // keys are expected in descending order at this point
+
+        for (size_t i=0;i<hits-1;i++)
+        {
+          stackPtr->ptr            = permuteExtract(dist,n0,n1);
+          *(float*)&stackPtr->dist = permuteExtract(dist,tNear);
+          dist = align_shift_right<1>(dist,dist);
+          stackPtr++;
+        }
+        cur = permuteExtract(dist,n0,n1); // the key left after hits-1 pushes selects the node to continue with
+      }
+#endif
+
+    public:
+      static __forceinline void traverseClosestHit(NodeRef& cur,
+                                                   size_t mask,
+                                                   const vfloat<Nx>& tNear,
+                                                   StackItemT<NodeRef>*& stackPtr,
+                                                   StackItemT<NodeRef>* stackEnd)
+      { /* on return cur holds the child to continue with; every other hit child is pushed onto the stack with its distance key */
+        assert(mask != 0);
+#if defined(__AVX512ER__)
+        traverseClosestHitAVX512<8,Nx,types,NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd);
+#elif defined(__AVX512VL__)
+        traverseClosestHitAVX512VL8<NodeRef,BaseNode>(cur,mask,tNear,stackPtr,stackEnd);
+#else
+        /* generic path: every set bit of mask is a hit child; bscf returns the index of one set bit and clears it from mask */
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        size_t r = bscf(mask);
+        cur = node->child(r);
+        BVH::prefetch(cur,types);
+        if (likely(mask == 0)) {
+          assert(cur != BVH::emptyNode);
+          return;
+        }
+
+        /*! two children are hit, push far child, and continue with closer child */
+        NodeRef c0 = cur;
+        const unsigned int d0 = ((unsigned int*)&tNear)[r]; // raw 32-bit pattern of lane r; distances are compared as unsigned integers below
+        r = bscf(mask);
+        NodeRef c1 = node->child(r);
+        BVH::prefetch(c1,types);
+        const unsigned int d1 = ((unsigned int*)&tNear)[r];
+
+        assert(c0 != BVH::emptyNode);
+        assert(c1 != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          assert(stackPtr < stackEnd);
+          if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; return; }
+          else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; return; }
+        }
+#if NEW_SORTING_CODE == 1
+        vint4 s0((size_t)c0,(size_t)d0); // (node, distance) packed into a vint4 that is later stored directly over a stack item
+        vint4 s1((size_t)c1,(size_t)d1);
+
+        r = bscf(mask);
+        NodeRef c2 = node->child(r); BVH::prefetch(c2,types); unsigned int d2 = ((unsigned int*)&tNear)[r]; 
+        vint4 s2((size_t)c2,(size_t)d2);
+        /* 3 hits: s0 and s1 are pushed, traversal continues with s2 (NOTE(review): presumably sort3 leaves the closest item in s2 - confirm) */
+        if (likely(mask == 0)) {
+          StackItemT<NodeRef>::sort3(s0,s1,s2);
+          *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1;
+          cur = toSizeT(s2);
+          stackPtr+=2;
+          return;
+        }
+        r = bscf(mask);
+        NodeRef c3 = node->child(r); BVH::prefetch(c3,types); unsigned int d3 = ((unsigned int*)&tNear)[r]; 
+        vint4 s3((size_t)c3,(size_t)d3);
+        /* 4 hits: s0..s2 are pushed, traversal continues with s3 */
+        if (likely(mask == 0)) {
+          StackItemT<NodeRef>::sort4(s0,s1,s2,s3);
+          *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2;
+          cur = toSizeT(s3);
+          stackPtr+=3;
+          return;
+        }
+        *(vint4*)&stackPtr[0] = s0; *(vint4*)&stackPtr[1] = s1; *(vint4*)&stackPtr[2] = s2; *(vint4*)&stackPtr[3] = s3;
+        /*! fallback case if more than 4 children are hit: push the remaining children, sort the whole range, pop the last item into cur */
+        StackItemT<NodeRef>* stackFirst = stackPtr;
+        stackPtr+=4;      
+        while (1)
+        {
+          assert(stackPtr < stackEnd);
+          r = bscf(mask);
+          NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = *(unsigned int*)&tNear[r]; 
+          const vint4 s((size_t)c,(size_t)d);
+          *(vint4*)stackPtr++ = s;
+          assert(c != BVH::emptyNode);
+          if (unlikely(mask == 0)) break;
+        }
+        sort(stackFirst,stackPtr);
+        cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+#else
+        /*! Here starts the slow path for 3 or 4 hit children. We push
+         *  all nodes onto the stack to sort them there. */
+        assert(stackPtr < stackEnd);
+        stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
+        assert(stackPtr < stackEnd);
+        stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
+
+        /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
+        assert(stackPtr < stackEnd);
+        r = bscf(mask);
+        NodeRef c = node->child(r); BVH::prefetch(c,types); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+        assert(c != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
+          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--; // pop the top item again: it becomes the node to continue with
+          return;
+        }
+
+        /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
+        assert(stackPtr < stackEnd);
+        r = bscf(mask);
+        c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+        assert(c != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
+          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+          return;
+        }
+        /*! fallback case if more than 4 children are hit: push the remaining children, then sort everything pushed by this call */
+        StackItemT<NodeRef>* stackFirst = stackPtr-4;
+        while (1)
+        {
+          assert(stackPtr < stackEnd);
+          r = bscf(mask);
+          c = node->child(r); BVH::prefetch(c,types); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
+          assert(c != BVH::emptyNode);
+          if (unlikely(mask == 0)) break;
+        }
+        sort(stackFirst,stackPtr);
+        cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
+#endif
+#endif
+      }
+
+      static __forceinline void traverseAnyHit(NodeRef& cur,
+                                               size_t mask,
+                                               const vfloat<Nx>& tNear,
+                                               NodeRef*& stackPtr,
+                                               NodeRef* stackEnd)
+      {
+        /* Any-hit traversal does no distance ordering (tNear is not
+         * read): hit children are visited in the order bscf yields
+         * them. The child taken from the last remaining bit of mask is
+         * left in cur for the caller to continue with; every child
+         * extracted before it is pushed onto the stack. */
+        const BaseNode* const parent = cur.baseNode();
+
+        while (true)
+        {
+          /* bscf returns the index of a set bit and clears it from mask */
+          const size_t slot = bscf(mask);
+          cur = parent->child(slot);
+          BVH::prefetch(cur,types);
+          assert(cur != BVH::emptyNode);
+
+          /* no hits left: continue traversal with this child */
+          if (likely(mask == 0)) break;
+
+          /* more hits pending: defer this child */
+          assert(stackPtr < stackEnd);
+          *stackPtr++ = cur;
+        }
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h
new file mode 100644
index 0000000000..9c603babf0
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_traverser_stream.h
@@ -0,0 +1,154 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+#include "../common/ray.h"
+#include "../common/stack_item.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, int Nx, int types>
+    class BVHNNodeTraverserStreamHitCoherent
+    { /* picks the next child to traverse and pushes the remaining hit children as StackItemMaskCoherent entries (mask, parent, child) */
+      typedef BVHN<N> BVH;
+      typedef typename BVH::NodeRef NodeRef;
+      typedef typename BVH::BaseNode BaseNode;
+
+    public:
+      template<class T>
+      static __forceinline void traverseClosestHit(NodeRef& cur,
+                                                   size_t& m_trav_active,
+                                                   const vbool<Nx>& vmask,
+                                                   const vfloat<Nx>& tNear,
+                                                   const T* const tMask,
+                                                   StackItemMaskCoherent*& stackPtr)
+      { /* on return cur is the child to continue with and m_trav_active is its tMask entry; no stack bound is checked here */
+        const NodeRef parent = cur;
+        size_t mask = movemask(vmask);
+        assert(mask != 0);
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        const size_t r0 = bscf(mask);
+        assert(r0 < 8);
+        cur = node->child(r0);
+        BVHN<N>::prefetch(cur,types);
+        m_trav_active = tMask[r0];
+        assert(cur != BVH::emptyNode);
+        if (unlikely(mask == 0)) return;
+
+        const unsigned int* const tNear_i = (unsigned int*)&tNear; // distances viewed as raw 32-bit patterns, compared as unsigned integers
+
+        /*! two children are hit, push far child, and continue with closer child */
+        NodeRef c0 = cur;
+        unsigned int d0 = tNear_i[r0];
+        const size_t r1 = bscf(mask);
+        assert(r1 < 8);
+        NodeRef c1 = node->child(r1);
+        BVHN<N>::prefetch(c1,types);
+        unsigned int d1 = tNear_i[r1];
+
+        assert(c0 != BVH::emptyNode);
+        assert(c1 != BVH::emptyNode);
+        if (likely(mask == 0)) {
+          if (d0 < d1) {
+            assert(tNear[r1] >= 0.0f);
+            stackPtr->mask    = tMask[r1];
+            stackPtr->parent  = parent;
+            stackPtr->child   = c1;
+            stackPtr++;
+            cur = c0;
+            m_trav_active = tMask[r0]; // same value that was already assigned after the first bscf
+            return;
+          }
+          else {
+            assert(tNear[r0] >= 0.0f);
+            stackPtr->mask    = tMask[r0];
+            stackPtr->parent  = parent;
+            stackPtr->child   = c0;
+            stackPtr++;
+            cur = c1;
+            m_trav_active = tMask[r1];
+            return;
+          }
+        }
+
+        /*! slow path for more than two hits: build one sortable integer key per child and sort the keys */
+        size_t hits = movemask(vmask); // full hit mask again; below it only counts how many children remain
+        const vint<Nx> dist_i = select(vmask, (asInt(tNear) & 0xfffffff8) | vint<Nx>(step), 0); // key: distance bits with the low 3 bits replaced (presumably by the lane index); 0 for lanes not hit
+  #if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+        const vint<N> tmp = extractN<N,0>(dist_i);
+        const vint<Nx> dist_i_sorted = usort_descending(tmp);
+  #else
+        const vint<Nx> dist_i_sorted = usort_descending(dist_i);
+  #endif
+        const vint<Nx> sorted_index = dist_i_sorted & 7; // low 3 bits of each sorted key give the child slot
+        /* walk the sorted keys front to back: every child but the last hit one is pushed, the last one stays in cur */
+        size_t i = 0;
+        for (;;)
+        {
+          const unsigned int index = sorted_index[i];
+          assert(index < 8);
+          cur = node->child(index);
+          m_trav_active = tMask[index];
+          assert(m_trav_active);
+          BVHN<N>::prefetch(cur,types);
+          bscf(hits); // drop one bit per visited child; the returned index is not used
+          if (unlikely(hits==0)) break;
+          i++;
+          assert(cur != BVH::emptyNode);
+          assert(tNear[index] >= 0.0f);
+          stackPtr->mask    = m_trav_active;
+          stackPtr->parent  = parent;
+          stackPtr->child   = cur;
+          stackPtr++;
+        }
+      }
+
+      template<class T>
+      static __forceinline void traverseAnyHit(NodeRef& cur,
+                                               size_t& m_trav_active,
+                                               const vbool<Nx>& vmask,
+                                               const T* const tMask,
+                                               StackItemMaskCoherent*& stackPtr)
+      { /* no distance ordering: children are taken in the order bscf yields them; the last one stays in cur, the others are pushed */
+        const NodeRef parent = cur;
+        size_t mask = movemask(vmask);
+        assert(mask != 0);
+        const BaseNode* node = cur.baseNode();
+
+        /*! one child is hit, continue with that child */
+        size_t r = bscf(mask);
+        cur = node->child(r);
+        BVHN<N>::prefetch(cur,types);
+        m_trav_active = tMask[r];
+
+        /* simple in order sequence */
+        assert(cur != BVH::emptyNode);
+        if (likely(mask == 0)) return;
+        stackPtr->mask    = m_trav_active;
+        stackPtr->parent  = parent;
+        stackPtr->child   = cur;
+        stackPtr++;
+        /* same steps for every further hit child; returns as soon as the mask is exhausted */
+        for (; ;)
+        {
+          r = bscf(mask);
+          cur = node->child(r);
+          BVHN<N>::prefetch(cur,types);
+          m_trav_active = tMask[r];
+          assert(cur != BVH::emptyNode);
+          if (likely(mask == 0)) return;
+          stackPtr->mask    = m_trav_active;
+          stackPtr->parent  = parent;
+          stackPtr->child   = cur;
+          stackPtr++;
+        }
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h
new file mode 100644
index 0000000000..a978c0c459
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector.h
@@ -0,0 +1,31 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bvh.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct NearFarPrecalculations
+    {
+      size_t nearX, nearY, nearZ;
+      size_t farX, farY, farZ;
+      /* default constructor leaves all six offsets unset */
+      __forceinline NearFarPrecalculations() {}
+      /* offsets are multiples of sizeof(float)*N: slots 0/1 for x, 2/3 for y, 4/5 for z */
+      __forceinline NearFarPrecalculations(const Vec3fa& dir, size_t N)
+      {
+        const size_t stride = N*sizeof(float);
+        nearX = stride*((dir.x < 0.0f) ? 1 : 0); // a negative direction component selects the odd slot
+        nearY = stride*((dir.y < 0.0f) ? 3 : 2);
+        nearZ = stride*((dir.z < 0.0f) ? 5 : 4);
+        farX  = stride ^ nearX; // XOR with the stride switches to the other slot of the pair (NOTE(review): relies on stride being a power of two - confirm)
+        farY  = stride ^ nearY;
+        farZ  = stride ^ nearZ;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h
new file mode 100644
index 0000000000..aa0d4ba4d7
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector1.h
@@ -0,0 +1,1788 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+#if defined(__AVX2__)
+#define __FMA_X4__
+#endif
+
+#if defined(__aarch64__)
+#define __FMA_X4__
+#endif
+
+
+namespace embree
+{
+  namespace isa
+  {
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Ray structure used in single-ray traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx, bool robust>
+      struct TravRayBase;
+      
+    /* Base (without tnear and tfar) */
+    template<int N, int Nx>
+      struct TravRayBase<N,Nx,false>
+    { /* non-robust variant: stores a single reciprocal direction, plus org*rdir (negated on aarch64) when __FMA_X4__ is defined */
+      __forceinline TravRayBase() {}
+
+      __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir)
+        : org_xyz(ray_org), dir_xyz(ray_dir) 
+      {
+        const Vec3fa ray_rdir = rcp_safe(ray_dir);
+        org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z);
+        dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
+        rdir = Vec3vf<N>(ray_rdir.x,ray_rdir.y,ray_rdir.z);
+#if defined(__FMA_X4__)
+        const Vec3fa ray_org_rdir = ray_org*ray_rdir;
+#if !defined(__aarch64__)
+        org_rdir = Vec3vf<N>(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
+#else
+          // aarch64 has no instruction equivalent to msub, so we store the negated org*rdir and use madd;
+          // x86 will use msub
+        neg_org_rdir = Vec3vf<N>(-ray_org_rdir.x,-ray_org_rdir.y,-ray_org_rdir.z);
+#endif
+#endif
+        nearX = ray_rdir.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); // offsets in units of sizeof(vfloat<N>): slots 0/1 = x, 2/3 = y, 4/5 = z
+        nearY = ray_rdir.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>); // a negative reciprocal direction selects the odd slot
+        nearZ = ray_rdir.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>);
+        farX  = nearX ^ sizeof(vfloat<N>); // the far offset is the other slot of the same pair
+        farY  = nearY ^ sizeof(vfloat<N>);
+        farZ  = nearZ ^ sizeof(vfloat<N>);
+
+#if defined(__AVX512ER__) // KNL+
+        /* optimization works only for 8-wide BVHs with 16-wide SIMD */
+        const vint<16> id(step);
+        const vint<16> id2 = align_shift_right<16/2>(id, id);
+        permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2);
+        permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2);
+        permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2);
+#endif
+
+      }
+
+      template<int K>
+      __forceinline TravRayBase(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir,
+                                const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ,
+                                size_t flip = sizeof(vfloat<N>))
+      { // builds the state from lane k of K-wide ray data; org_xyz and dir_xyz are not assigned by this constructor
+        org  = Vec3vf<Nx>(ray_org.x[k], ray_org.y[k], ray_org.z[k]);
+        dir  = Vec3vf<Nx>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]);
+        rdir = Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
+#if defined(__FMA_X4__)
+#if !defined(__aarch64__)
+        org_rdir = org*rdir;
+#else
+        neg_org_rdir = -(org*rdir);
+#endif
+#endif
+	nearX = nearXYZ.x[k];
+	nearY = nearXYZ.y[k];
+	nearZ = nearXYZ.z[k];
+        farX  = nearX ^ flip; // near offsets come precomputed from the caller; flip toggles them to the far offsets
+        farY  = nearY ^ flip;
+        farZ  = nearZ ^ flip;
+
+#if defined(__AVX512ER__) // KNL+
+        /* optimization works only for 8-wide BVHs with 16-wide SIMD */
+        const vint<16> id(step);
+        const vint<16> id2 = align_shift_right<16/2>(id, id);
+        permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2);
+        permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2);
+        permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2);
+#endif
+      }
+
+      Vec3fa org_xyz, dir_xyz;
+      Vec3vf<Nx> org, dir, rdir;
+#if defined(__FMA_X4__)
+#if !defined(__aarch64__)
+      Vec3vf<Nx> org_rdir;
+#else
+        // the aarch64 version keeps the negated org_rdir and uses madd;
+        // x86 uses msub
+      Vec3vf<Nx> neg_org_rdir;
+#endif
+#endif
+#if defined(__AVX512ER__) // KNL+
+      vint16 permX, permY, permZ;
+#endif
+
+      size_t nearX, nearY, nearZ;
+      size_t farX, farY, farZ;
+    };
+
+    /* Base (without tnear and tfar) */
+    template<int N, int Nx>
+      struct TravRayBase<N,Nx,true>
+    { /* robust variant: keeps two reciprocal directions, one scaled slightly down (rdir_near) and one slightly up (rdir_far) */
+      __forceinline TravRayBase() {}
+
+      __forceinline TravRayBase(const Vec3fa& ray_org, const Vec3fa& ray_dir)
+        : org_xyz(ray_org), dir_xyz(ray_dir) 
+      {
+        const float round_down = 1.0f-3.0f*float(ulp); // scale factors 3 ulp below and above 1
+        const float round_up   = 1.0f+3.0f*float(ulp);
+        const Vec3fa ray_rdir = 1.0f/zero_fix(ray_dir);
+        const Vec3fa ray_rdir_near = round_down*ray_rdir;
+        const Vec3fa ray_rdir_far  = round_up  *ray_rdir;
+        org = Vec3vf<N>(ray_org.x,ray_org.y,ray_org.z);
+        dir = Vec3vf<N>(ray_dir.x,ray_dir.y,ray_dir.z);
+        rdir_near = Vec3vf<N>(ray_rdir_near.x,ray_rdir_near.y,ray_rdir_near.z);
+        rdir_far  = Vec3vf<N>(ray_rdir_far .x,ray_rdir_far .y,ray_rdir_far .z);
+        nearX = ray_rdir_near.x >= 0.0f ? 0*sizeof(vfloat<N>) : 1*sizeof(vfloat<N>); // offsets in units of sizeof(vfloat<N>): slots 0/1 = x, 2/3 = y, 4/5 = z
+        nearY = ray_rdir_near.y >= 0.0f ? 2*sizeof(vfloat<N>) : 3*sizeof(vfloat<N>); // a negative reciprocal direction selects the odd slot
+        nearZ = ray_rdir_near.z >= 0.0f ? 4*sizeof(vfloat<N>) : 5*sizeof(vfloat<N>);
+        farX  = nearX ^ sizeof(vfloat<N>); // the far offset is the other slot of the same pair
+        farY  = nearY ^ sizeof(vfloat<N>);
+        farZ  = nearZ ^ sizeof(vfloat<N>);
+
+#if defined(__AVX512ER__) // KNL+
+        /* optimization works only for 8-wide BVHs with 16-wide SIMD */
+        const vint<16> id(step);
+        const vint<16> id2 = align_shift_right<16/2>(id, id);
+        permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2);
+        permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2);
+        permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2);
+#endif
+      }
+
+      template<int K>
+      __forceinline TravRayBase(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir,
+                                const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ,
+                                size_t flip = sizeof(vfloat<N>))
+      { // builds the state from lane k of K-wide ray data; org_xyz and dir_xyz are not assigned by this constructor
+        const vfloat<Nx> round_down = 1.0f-3.0f*float(ulp);
+        const vfloat<Nx> round_up   = 1.0f+3.0f*float(ulp);
+        org  = Vec3vf<Nx>(ray_org.x[k], ray_org.y[k], ray_org.z[k]);
+        dir  = Vec3vf<Nx>(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]);
+        rdir_near = round_down*Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
+        rdir_far  = round_up  *Vec3vf<Nx>(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]);
+
+	nearX = nearXYZ.x[k];
+	nearY = nearXYZ.y[k];
+	nearZ = nearXYZ.z[k];
+        farX  = nearX ^ flip; // near offsets come precomputed from the caller; flip toggles them to the far offsets
+        farY  = nearY ^ flip;
+        farZ  = nearZ ^ flip;
+
+#if defined(__AVX512ER__) // KNL+
+        /* optimization works only for 8-wide BVHs with 16-wide SIMD */
+        const vint<16> id(step);
+        const vint<16> id2 = align_shift_right<16/2>(id, id);
+        permX = select(vfloat<16>(dir.x) >= 0.0f, id, id2);
+        permY = select(vfloat<16>(dir.y) >= 0.0f, id, id2);
+        permZ = select(vfloat<16>(dir.z) >= 0.0f, id, id2);
+#endif
+      }
+
+      Vec3fa org_xyz, dir_xyz;
+      Vec3vf<Nx> org, dir, rdir_near, rdir_far;
+#if defined(__AVX512ER__) // KNL+
+      vint16 permX, permY, permZ;
+#endif
+
+      size_t nearX, nearY, nearZ;
+      size_t farX, farY, farZ;
+    };
+
+    /* Full (with tnear and tfar) */
+    template<int N, int Nx, bool robust>
+      struct TravRay : TravRayBase<N,Nx,robust>
+    { // extends TravRayBase with the ray interval bounds tnear and tfar, each stored as a vfloat<Nx>
+      __forceinline TravRay() {}
+
+      __forceinline TravRay(const Vec3fa& ray_org, const Vec3fa& ray_dir, float ray_tnear, float ray_tfar)
+        : TravRayBase<N,Nx,robust>(ray_org, ray_dir),
+          tnear(ray_tnear), tfar(ray_tfar) {}
+
+      template<int K>
+      __forceinline TravRay(size_t k, const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir,
+                            const Vec3vf<K>& ray_rdir, const Vec3vi<K>& nearXYZ,
+                            float ray_tnear, float ray_tfar,
+                            size_t flip = sizeof(vfloat<N>))
+        : TravRayBase<N,Nx,robust>(k, ray_org, ray_dir, ray_rdir, nearXYZ, flip), // lane k of the K-wide ray data is forwarded to the base
+          tnear(ray_tnear), tfar(ray_tfar) {}
+
+      vfloat<Nx> tnear;
+      vfloat<Nx> tfar;
+    };
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Point Query structure used in single-ray traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+    struct TravPointQuery
+    {
+      __forceinline TravPointQuery() {}
+
+      /* stores the query origin and the per-axis radii, each scalar component converted to a vfloat<N> */
+      __forceinline TravPointQuery(const Vec3fa& query_org, const Vec3fa& query_rad)
+        : org(query_org.x, query_org.y, query_org.z),
+          rad(query_rad.x, query_rad.y, query_rad.z)
+      {}
+
+      /* the x radius doubles as the query's tfar value */
+      __forceinline vfloat<N> const& tfar() const { return rad.x; }
+
+
+      Vec3vf<N> org, rad;
+    };
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // point query
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+    __forceinline size_t pointQuerySphereDistAndMask(
+      const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX, 
+      vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ)
+    { /* writes the squared distance from the query origin to each box into dist; returns one bit per box that passes both tests below */
+      const vfloat<N> offX = min(max(query.org.x, minX), maxX) - query.org.x; // origin clamped into [min,max], minus the origin
+      const vfloat<N> offY = min(max(query.org.y, minY), maxY) - query.org.y;
+      const vfloat<N> offZ = min(max(query.org.z, minZ), maxZ) - query.org.z;
+      dist = offX * offX + offY * offY + offZ * offZ;
+      const vbool<N> withinRadius = dist <= query.tfar()*query.tfar(); // compared against the squared radius
+      const vbool<N> nonEmpty = minX <= maxX;                          // lanes with minX > maxX are rejected
+      return movemask(withinRadius) & movemask(nonEmpty);
+    }
+
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+    { /* loads the six bound arrays of an AABBNode and runs the sphere test on them */
+      const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x));
+      const vfloat<N> minY = vfloat<N>::load((float*)((const char*)&node->lower_y));
+      const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z));
+      const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x));
+      const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y));
+      const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z));
+      return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    { /* sphere test on the bounds of an AABBNodeMB combined with time via madd */
+      const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x);
+      const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y);
+      const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z);
+      const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x);
+      const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y);
+      const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z);
+      const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0])); // [6] reads six vfloat<N> past the base array - presumably the matching per-time delta; verify against the AABBNodeMB layout
+      const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0]));
+      const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0]));
+      const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0]));
+      const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0]));
+      const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0]));
+      return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+    }
+    
+    template<int N>
+      __forceinline size_t pointQueryNodeSphereMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* const mbNode = ref.getAABBNodeMB();
+      const size_t spatialHits = pointQueryNodeSphere(mbNode, query, time, dist);
+
+      /* nodes that are not AABBNodeMB4D get no additional time-range test */
+      if (likely(!ref.isAABBNodeMB4D())) return spatialHits;
+
+      /* 4D nodes: additionally require lower_t <= time < upper_t per child */
+      const typename BVHN<N>::AABBNodeMB4D* const timedNode = (const typename BVHN<N>::AABBNodeMB4D*) mbNode;
+      const vbool<N> inTimeRange = (timedNode->lower_t <= time) & (time < timedNode->upper_t);
+      return spatialHits & movemask(inTimeRange);
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+    { /* rebuilds each bound as dequantized*scale + start per axis, runs the sphere test, then masks with the node's valid children */
+      const vfloat<N> start_x(node->start.x);
+      const vfloat<N> scale_x(node->scale.x);
+      const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x); // argument is k*sizeof(vfloat<N>)/4 for slot k (0/1 = x, 2/3 = y, 4/5 = z)
+      const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x);
+      const vfloat<N> start_y(node->start.y);
+      const vfloat<N> scale_y(node->scale.y);
+      const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+      const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+      const vfloat<N> start_z(node->start.z);
+      const vfloat<N> scale_z(node->scale.z);
+      const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+      const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+      return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask());
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    { /* obtains the six bounds for the given time from the node's dequantize helpers, runs the sphere test, then masks with the valid children */
+      const vfloat<N> minX = node->dequantizeLowerX(time);
+      const vfloat<N> maxX = node->dequantizeUpperX(time);
+      const vfloat<N> minY = node->dequantizeLowerY(time);
+      const vfloat<N> maxY = node->dequantizeUpperY(time);
+      const vfloat<N> minZ = node->dequantizeLowerZ(time);
+      const vfloat<N> maxZ = node->dequantizeUpperZ(time);     
+      return pointQuerySphereDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & movemask(node->validMask());
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+    {
+      // TODO: point query - implement
+      // Placeholder: node and query are ignored, every distance is set to 0
+      // and an all-true mask, limited to the lowest N bits, is returned.
+      dist = vfloat<N>(0.0f);
+      return movemask(vbool<N>(true)) & ((1<<N)-1);
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeSphere(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      // TODO: point query - implement
+      // Placeholder: node, query and time are ignored, every distance is set
+      // to 0 and an all-true mask, limited to the lowest N bits, is returned.
+      dist = vfloat<N>(0.0f);
+      return movemask(vbool<N>(true)) & ((1<<N)-1);
+    }
+
+    template<int N>
+    __forceinline size_t pointQueryAABBDistAndMask(
+      const TravPointQuery<N>& query, vfloat<N>& dist, vfloat<N> const& minX, vfloat<N> const& maxX, 
+      vfloat<N> const& minY, vfloat<N> const& maxY, vfloat<N> const& minZ, vfloat<N> const& maxZ)
+    { /* writes the squared distance from the query origin to each box into dist; returns one bit per non-empty box overlapping [org-rad, org+rad] */
+      const vfloat<N> vX = min(max(query.org.x, minX), maxX) - query.org.x; // origin clamped into [min,max], minus the origin
+      const vfloat<N> vY = min(max(query.org.y, minY), maxY) - query.org.y;
+      const vfloat<N> vZ = min(max(query.org.z, minZ), maxZ) - query.org.z;
+      dist = vX * vX + vY * vY + vZ * vZ;
+      const vbool<N> valid = minX <= maxX; // lanes with minX > maxX are rejected
+      const vbool<N> vmask = !((maxX < query.org.x - query.rad.x) | (minX > query.org.x + query.rad.x) | // a box is culled when it lies entirely outside the query extent on any axis
+                               (maxY < query.org.y - query.rad.y) | (minY > query.org.y + query.rad.y) |
+                               (maxZ < query.org.z - query.rad.z) | (minZ > query.org.z + query.rad.z));
+      return movemask(vmask) & movemask(valid);
+    }
+
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+    { /* loads the six bound arrays of an AABBNode and runs the box-overlap test on them */
+      const vfloat<N> minX = vfloat<N>::load((float*)((const char*)&node->lower_x));
+      const vfloat<N> minY = vfloat<N>::load((float*)((const char*)&node->lower_y));
+      const vfloat<N> minZ = vfloat<N>::load((float*)((const char*)&node->lower_z));
+      const vfloat<N> maxX = vfloat<N>::load((float*)((const char*)&node->upper_x));
+      const vfloat<N> maxY = vfloat<N>::load((float*)((const char*)&node->upper_y));
+      const vfloat<N> maxZ = vfloat<N>::load((float*)((const char*)&node->upper_z));
+      return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::AABBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    { /* box-overlap test on the bounds of an AABBNodeMB combined with time via madd */
+      const vfloat<N>* pMinX = (const vfloat<N>*)((const char*)&node->lower_x);
+      const vfloat<N>* pMinY = (const vfloat<N>*)((const char*)&node->lower_y);
+      const vfloat<N>* pMinZ = (const vfloat<N>*)((const char*)&node->lower_z);
+      const vfloat<N>* pMaxX = (const vfloat<N>*)((const char*)&node->upper_x);
+      const vfloat<N>* pMaxY = (const vfloat<N>*)((const char*)&node->upper_y);
+      const vfloat<N>* pMaxZ = (const vfloat<N>*)((const char*)&node->upper_z);
+      const vfloat<N> minX = madd(time,pMinX[6],vfloat<N>(pMinX[0])); // [6] reads six vfloat<N> past the base array - presumably the matching per-time delta; verify against the AABBNodeMB layout
+      const vfloat<N> minY = madd(time,pMinY[6],vfloat<N>(pMinY[0]));
+      const vfloat<N> minZ = madd(time,pMinZ[6],vfloat<N>(pMinZ[0]));
+      const vfloat<N> maxX = madd(time,pMaxX[6],vfloat<N>(pMaxX[0]));
+      const vfloat<N> maxY = madd(time,pMaxY[6],vfloat<N>(pMaxY[0]));
+      const vfloat<N> maxZ = madd(time,pMaxZ[6],vfloat<N>(pMaxZ[0]));
+      return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ);
+    }
+    
+    template<int N>
+      __forceinline size_t pointQueryNodeAABBMB4D(const typename BVHN<N>::NodeRef ref, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* const mbNode = ref.getAABBNodeMB();
+      const size_t spatialHits = pointQueryNodeAABB(mbNode, query, time, dist);
+
+      /* nodes that are not AABBNodeMB4D get no additional time-range test */
+      if (likely(!ref.isAABBNodeMB4D())) return spatialHits;
+
+      /* 4D nodes: additionally require lower_t <= time < upper_t per child */
+      const typename BVHN<N>::AABBNodeMB4D* const timedNode = (const typename BVHN<N>::AABBNodeMB4D*) mbNode;
+      const vbool<N> inTimeRange = (timedNode->lower_t <= time) & (time < timedNode->upper_t);
+      return spatialHits & movemask(inTimeRange);
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+    { /* rebuilds each bound as dequantized*scale + start per axis, runs the box-overlap test, then masks with the node's valid children */
+      const size_t mvalid  = movemask(node->validMask());
+      const vfloat<N> start_x(node->start.x);
+      const vfloat<N> scale_x(node->scale.x);
+      const vfloat<N> minX = madd(node->template dequantize<N>((0*sizeof(vfloat<N>)) >> 2),scale_x,start_x); // argument is k*sizeof(vfloat<N>)/4 for slot k (0/1 = x, 2/3 = y, 4/5 = z)
+      const vfloat<N> maxX = madd(node->template dequantize<N>((1*sizeof(vfloat<N>)) >> 2),scale_x,start_x);
+      const vfloat<N> start_y(node->start.y);
+      const vfloat<N> scale_y(node->scale.y);
+      const vfloat<N> minY = madd(node->template dequantize<N>((2*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+      const vfloat<N> maxY = madd(node->template dequantize<N>((3*sizeof(vfloat<N>)) >> 2),scale_y,start_y);
+      const vfloat<N> start_z(node->start.z);
+      const vfloat<N> scale_z(node->scale.z);
+      const vfloat<N> minZ = madd(node->template dequantize<N>((4*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+      const vfloat<N> maxZ = madd(node->template dequantize<N>((5*sizeof(vfloat<N>)) >> 2),scale_z,start_z);
+      return pointQueryAABBDistAndMask(query, dist, minX, maxX, minY, maxY, minZ, maxZ) & mvalid;
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      /* Point query against a quantized motion-blur node: the node itself dequantizes
+         each bound for the requested time; the box test is shared with the other node types. */
+      const size_t validLanes = movemask(node->validMask()); // lanes outside the valid mask are cleared below
+      const vfloat<N> loX = node->dequantizeLowerX(time), hiX = node->dequantizeUpperX(time);
+      const vfloat<N> loY = node->dequantizeLowerY(time), hiY = node->dequantizeUpperY(time);
+      const vfloat<N> loZ = node->dequantizeLowerZ(time), hiZ = node->dequantizeUpperZ(time);
+      /* dist is written by the shared helper; only the returned bitmask is filtered */
+      return pointQueryAABBDistAndMask(query, dist, loX, hiX, loY, hiY, loZ, hiZ) & validLanes;
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNode* node, const TravPointQuery<N>& query, vfloat<N>& dist)
+    {
+      // TODO: point query - implement
+      // Placeholder until then: node and query are not read. dist is zeroed and the
+      // low N bits of an all-true mask are returned, so this stub rejects no child.
+      dist = vfloat<N>(0.0f);
+      return movemask(vbool<N>(true)) & ((1<<N)-1);
+    }
+    
+    template<int N>
+    __forceinline size_t pointQueryNodeAABB(const typename BVHN<N>::OBBNodeMB* node, const TravPointQuery<N>& query, const float time, vfloat<N>& dist)
+    {
+      // TODO: point query - implement
+      // Placeholder until then: node, query and time are not read. dist is zeroed and
+      // the low N bits of an all-true mask are returned, so this stub rejects no child.
+      dist = vfloat<N>(0.0f);
+      return movemask(vbool<N>(true)) & ((1<<N)-1);
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx, bool robust>  // declaration only: no generic body is given here, the AABBNode variants are explicit specializations
+      __forceinline size_t intersectNode(const typename BVHN<N>::AABBNode* node, const TravRay<N,Nx,robust>& ray, vfloat<Nx>& dist);  // returns a hit bitmask, writes entry distances to dist
+
+    template<>  // BVH4 / 4-wide: fast (robust == false) slab test of one ray against the child boxes of an AABBNode
+      __forceinline size_t intersectNode<4,4>(const typename BVH4::AABBNode* node, const TravRay<4,4,false>& ray, vfloat4& dist)  // returns the hit bitmask; dist receives tNear
+    {
+#if defined(__FMA_X4__)  // madd/msub formulation; ray.near*/far* are byte offsets added to &node->lower_x to pick the bound to load
+#if defined(__aarch64__)  // aarch64: madd with ray.neg_org_rdir instead of msub with ray.org_rdir
+      const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat4 tFarX  = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tFarY  = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tFarZ  = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
+#else  // other __FMA_X4__ targets: msub with the precomputed ray.org_rdir
+      const vfloat4 tNearX = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
+      const vfloat4 tNearY = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
+      const vfloat4 tNearZ = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
+      const vfloat4 tFarX  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
+      const vfloat4 tFarY  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
+      const vfloat4 tFarZ  = msub(vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+#endif
+#else  // no __FMA_X4__: explicit (bound - org) * rdir
+      const vfloat4 tNearX = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
+      const vfloat4 tNearY = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
+      const vfloat4 tNearZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z;
+      const vfloat4 tFarX  = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x;
+      const vfloat4 tFarY  = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y;
+      const vfloat4 tFarZ  = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z;
+#endif
+
+#if defined(__aarch64__)  // aarch64: maxi/mini reduction, hit mask taken directly from asInt(tNear) <= asInt(tFar)
+      const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear);
+      const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar);
+      const vbool4 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#elif defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
+      const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = asInt(tNear) > asInt(tFar);  // this branch builds the miss mask ...
+      const size_t mask = movemask(vmask) ^ ((1<<4)-1);  // ... and flips its low 4 bits to obtain the hit mask
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else  // fallback: float max/min and a float <= compare
+      const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear;  // written for every lane, whether or not its bit is set in mask
+      return mask;
+    }
+
+#if defined(__AVX__)
+
+    template<>  // BVH8 / 8-wide: fast (robust == false) slab test of one ray against the child boxes of an AABBNode
+      __forceinline size_t intersectNode<8,8>(const typename BVH8::AABBNode* node, const TravRay<8,8,false>& ray, vfloat8& dist)  // returns the hit bitmask; dist receives tNear
+    {
+#if defined(__AVX2__)  // madd/msub formulation; ray.near*/far* are byte offsets added to &node->lower_x to pick the bound to load
+#if defined(__aarch64__)  // aarch64: madd with ray.neg_org_rdir instead of msub with ray.org_rdir
+      const vfloat8 tNearX = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tNearY = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tNearZ = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat8 tFarX  = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tFarY  = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tFarZ  = madd(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.neg_org_rdir.z);
+#else  // other __AVX2__ targets: msub with the precomputed ray.org_rdir
+      const vfloat8 tNearX = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
+      const vfloat8 tNearY = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
+      const vfloat8 tNearZ = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
+      const vfloat8 tFarX  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )), ray.rdir.x, ray.org_rdir.x);
+      const vfloat8 tFarY  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )), ray.rdir.y, ray.org_rdir.y);
+      const vfloat8 tFarZ  = msub(vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+#endif
+        
+#else  // no __AVX2__: explicit (bound - org) * rdir
+      const vfloat8 tNearX = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearX)) - ray.org.x) * ray.rdir.x;
+      const vfloat8 tNearY = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearY)) - ray.org.y) * ray.rdir.y;
+      const vfloat8 tNearZ = (vfloat8::load((float*)((const char*)&node->lower_x+ray.nearZ)) - ray.org.z) * ray.rdir.z;
+      const vfloat8 tFarX  = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farX )) - ray.org.x) * ray.rdir.x;
+      const vfloat8 tFarY  = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farY )) - ray.org.y) * ray.rdir.y;
+      const vfloat8 tFarZ  = (vfloat8::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z;
+#endif
+      
+#if defined(__AVX2__) && !defined(__AVX512F__) // HSW
+      const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = asInt(tNear) > asInt(tFar);  // this branch builds the miss mask ...
+      const size_t mask = movemask(vmask) ^ ((1<<8)-1);  // ... and flips its low 8 bits to obtain the hit mask
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else  // fallback: float max/min and a float <= compare
+      const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear;  // written for every lane, whether or not its bit is set in mask
+      return mask;
+    }
+
+#endif
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+
+    template<>
+      __forceinline size_t intersectNode<4,16>(const typename BVH4::AABBNode* node, const TravRay<4,16,false>& ray, vfloat16& dist)
+    {
+      const char* const bounds = (const char*)&node->lower_x; // ray.near*/far* are byte offsets from here; each vfloat4 bound is converted to vfloat16
+      const vfloat16 nearX = msub(vfloat16(*(vfloat4*)(bounds+ray.nearX)), ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 nearY = msub(vfloat16(*(vfloat4*)(bounds+ray.nearY)), ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 nearZ = msub(vfloat16(*(vfloat4*)(bounds+ray.nearZ)), ray.rdir.z, ray.org_rdir.z);
+      const vfloat16 farX  = msub(vfloat16(*(vfloat4*)(bounds+ray.farX )), ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 farY  = msub(vfloat16(*(vfloat4*)(bounds+ray.farY )), ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 farZ  = msub(vfloat16(*(vfloat4*)(bounds+ray.farZ )), ray.rdir.z, ray.org_rdir.z);
+      const vfloat16 tEnter = max(nearX,nearY,nearZ,ray.tnear);
+      const vfloat16 tExit  = min(farX ,farY ,farZ ,ray.tfar);
+      const vbool16 overlap = le(vbool16(0xf),tEnter,tExit); // compare is issued under the 0xf mask, i.e. for the low 4 lanes only
+      dist = tEnter;
+      return movemask(overlap);
+    }
+
+    template<>
+      __forceinline size_t intersectNode<8,16>(const typename BVH8::AABBNode* node, const TravRay<8,16,false>& ray, vfloat16& dist)
+    {
+      /* a child slot takes part only when its reference differs from BVH8::emptyNode */
+      const vllong8 emptyRef((size_t)BVH8::emptyNode);
+      const vboold8 liveChildren(emptyRef != vllong8::loadu(node->children));
+      const vfloat16 boundsX = permute(vfloat16::load((const float*)&node->lower_x), ray.permX); // 16 floats starting at lower_x, reordered by the ray's permutation
+      const vfloat16 boundsY = permute(vfloat16::load((const float*)&node->lower_y), ray.permY);
+      const vfloat16 boundsZ = permute(vfloat16::load((const float*)&node->lower_z), ray.permZ);
+      const vfloat16 slabX = msub(boundsX, ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 slabY = msub(boundsY, ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 slabZ = msub(boundsZ, ray.rdir.z, ray.org_rdir.z);
+      const vfloat16 tEnter = max(slabX, slabY, slabZ, ray.tnear); // max and min are both taken over the same combined near/far registers
+      const vfloat16 tExit  = min(slabX, slabY, slabZ, ray.tfar);
+      const vbool16 overlap = le(vboolf16(liveChildren),tEnter,align_shift_right<8>(tExit, tExit)); // NOTE(review): the shift presumably lines the far half up with the near half - confirm
+      dist = tEnter;
+      return movemask(overlap);
+    }
+    
+#endif
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx>
+      __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNode* node, const TravRay<N,Nx,true>& ray, vfloat<Nx>& dist)
+    {
+      const char* const bounds = (const char*)&node->lower_x; // ray.near*/far* are byte offsets from here that select the bound to load
+      const vfloat<N> nearX = (vfloat<N>::load((float*)(bounds+ray.nearX)) - ray.org.x) * ray.rdir_near.x; // near planes scale by rdir_near ...
+      const vfloat<N> nearY = (vfloat<N>::load((float*)(bounds+ray.nearY)) - ray.org.y) * ray.rdir_near.y;
+      const vfloat<N> nearZ = (vfloat<N>::load((float*)(bounds+ray.nearZ)) - ray.org.z) * ray.rdir_near.z;
+      const vfloat<N> farX  = (vfloat<N>::load((float*)(bounds+ray.farX )) - ray.org.x) * ray.rdir_far.x; // ... far planes by rdir_far
+      const vfloat<N> farY  = (vfloat<N>::load((float*)(bounds+ray.farY )) - ray.org.y) * ray.rdir_far.y;
+      const vfloat<N> farZ  = (vfloat<N>::load((float*)(bounds+ray.farZ )) - ray.org.z) * ray.rdir_far.z;
+      const vfloat<N> tEnter = max(nearX,nearY,nearZ,ray.tnear);
+      const vfloat<N> tExit  = min(farX ,farY ,farZ ,ray.tfar);
+      const vbool<N> overlap = tEnter <= tExit;
+      dist = tEnter;
+      return movemask(overlap);
+    }
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+
+    template<>
+      __forceinline size_t intersectNodeRobust<4,16>(const typename BVHN<4>::AABBNode* node, const TravRay<4,16,true>& ray, vfloat<16>& dist)
+    {
+      const char* const bounds = (const char*)&node->lower_x; // ray.near*/far* are byte offsets from here; each vfloat<4> bound is converted to vfloat16
+      const vfloat16 nearX = (vfloat16(*(vfloat<4>*)(bounds+ray.nearX)) - ray.org.x) * ray.rdir_near.x;
+      const vfloat16 nearY = (vfloat16(*(vfloat<4>*)(bounds+ray.nearY)) - ray.org.y) * ray.rdir_near.y;
+      const vfloat16 nearZ = (vfloat16(*(vfloat<4>*)(bounds+ray.nearZ)) - ray.org.z) * ray.rdir_near.z;
+      const vfloat16 farX  = (vfloat16(*(vfloat<4>*)(bounds+ray.farX )) - ray.org.x) * ray.rdir_far.x;
+      const vfloat16 farY  = (vfloat16(*(vfloat<4>*)(bounds+ray.farY )) - ray.org.y) * ray.rdir_far.y;
+      const vfloat16 farZ  = (vfloat16(*(vfloat<4>*)(bounds+ray.farZ )) - ray.org.z) * ray.rdir_far.z;
+      const vfloat16 tEnter = max(nearX,nearY,nearZ,ray.tnear);
+      const vfloat16 tExit  = min(farX ,farY ,farZ ,ray.tfar);
+      const vbool16 overlap = le((1 << 4)-1,tEnter,tExit); // compare is issued under a mask with only the low 4 bits set
+      dist = tEnter;
+      return movemask(overlap);
+    }
+
+    template<>
+      __forceinline size_t intersectNodeRobust<8,16>(const typename BVHN<8>::AABBNode* node, const TravRay<8,16,true>& ray, vfloat<16>& dist)
+    {
+      const char* const bounds = (const char*)&node->lower_x; // ray.near*/far* are byte offsets from here; each vfloat<8> bound is converted to vfloat16
+      const vfloat16 nearX = (vfloat16(*(vfloat<8>*)(bounds+ray.nearX)) - ray.org.x) * ray.rdir_near.x;
+      const vfloat16 nearY = (vfloat16(*(vfloat<8>*)(bounds+ray.nearY)) - ray.org.y) * ray.rdir_near.y;
+      const vfloat16 nearZ = (vfloat16(*(vfloat<8>*)(bounds+ray.nearZ)) - ray.org.z) * ray.rdir_near.z;
+      const vfloat16 farX  = (vfloat16(*(vfloat<8>*)(bounds+ray.farX )) - ray.org.x) * ray.rdir_far.x;
+      const vfloat16 farY  = (vfloat16(*(vfloat<8>*)(bounds+ray.farY )) - ray.org.y) * ray.rdir_far.y;
+      const vfloat16 farZ  = (vfloat16(*(vfloat<8>*)(bounds+ray.farZ )) - ray.org.z) * ray.rdir_far.z;
+      const vfloat16 tEnter = max(nearX,nearY,nearZ,ray.tnear);
+      const vfloat16 tExit  = min(farX ,farY ,farZ ,ray.tfar);
+      const vbool16 overlap = le((1 << 8)-1,tEnter,tExit); // compare is issued under a mask with only the low 8 bits set
+      dist = tEnter;
+      return movemask(overlap);
+    }
+
+#endif
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>  // fast (robust == false) slab test of one ray against a motion-blur AABBNodeMB evaluated at 'time'
+      __forceinline size_t intersectNode(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,N,false>& ray, const float time, vfloat<N>& dist)  // returns the hit bitmask; dist receives tNear
+    {
+      const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX);  // ray.near*/far* are byte offsets from &node->lower_x selecting the bound
+      const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+      const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+      const vfloat<N>* pFarX  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+      const vfloat<N>* pFarY  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+      const vfloat<N>* pFarZ  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+#if defined(__FMA_X4__)  // inner madd combines p[0] and p[6] using time (p[6] is presumably the per-time delta of the same bound - confirm against the node layout)
+#if defined(__aarch64__)  // aarch64: outer madd with ray.neg_org_rdir instead of msub with ray.org_rdir
+      const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<N> tFarX  = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tFarY  = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tFarZ  = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
+#else  // other __FMA_X4__ targets: msub with the precomputed ray.org_rdir
+      const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
+      const vfloat<N> tFarX  = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tFarY  = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tFarZ  = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
+#endif
+#else  // no __FMA_X4__: explicit (bound(time) - org) * rdir
+      const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z;
+      const vfloat<N> tFarX  = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tFarY  = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tFarZ  = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
+#endif
+#if defined(__FMA_X4__) && !defined(__AVX512F__) // HSW
+      const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat<N> tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool<N> vmask = asInt(tNear) > asInt(tFar);  // this branch builds the miss mask ...
+      const size_t mask = movemask(vmask) ^ ((1<<N)-1);  // ... and flips its low N bits to obtain the hit mask
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vfloat<N> tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat<N> tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool<N> vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else  // fallback: float max/min (ray.tnear / ray.tfar passed first here) and a float <= compare
+      const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+      const vfloat<N> tFar  = min(ray.tfar, tFarX ,tFarY ,tFarZ );
+      const vbool<N> vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear;  // written for every lane, whether or not its bit is set in mask
+      return mask;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+      __forceinline size_t intersectNodeRobust(const typename BVHN<N>::AABBNodeMB* node, const TravRay<N,N,true>& ray, const float time, vfloat<N>& dist)
+    {
+      const char* const bounds = (const char*)&node->lower_x; // ray.near*/far* are byte offsets from here that select the bound
+      const vfloat<N>* nearPtrX = (const vfloat<N>*)(bounds+ray.nearX);
+      const vfloat<N>* nearPtrY = (const vfloat<N>*)(bounds+ray.nearY);
+      const vfloat<N>* nearPtrZ = (const vfloat<N>*)(bounds+ray.nearZ);
+      const vfloat<N>* farPtrX  = (const vfloat<N>*)(bounds+ray.farX);
+      const vfloat<N>* farPtrY  = (const vfloat<N>*)(bounds+ray.farY);
+      const vfloat<N>* farPtrZ  = (const vfloat<N>*)(bounds+ray.farZ);
+      const vfloat<N> nearX = (madd(time,nearPtrX[6],vfloat<N>(nearPtrX[0])) - ray.org.x) * ray.rdir_near.x; // entry [6] and entry [0] are combined with time via madd
+      const vfloat<N> nearY = (madd(time,nearPtrY[6],vfloat<N>(nearPtrY[0])) - ray.org.y) * ray.rdir_near.y;
+      const vfloat<N> nearZ = (madd(time,nearPtrZ[6],vfloat<N>(nearPtrZ[0])) - ray.org.z) * ray.rdir_near.z;
+      const vfloat<N> farX  = (madd(time,farPtrX[6],vfloat<N>(farPtrX[0])) - ray.org.x) * ray.rdir_far.x; // far planes scale by rdir_far, near planes by rdir_near
+      const vfloat<N> farY  = (madd(time,farPtrY[6],vfloat<N>(farPtrY[0])) - ray.org.y) * ray.rdir_far.y;
+      const vfloat<N> farZ  = (madd(time,farPtrZ[6],vfloat<N>(farPtrZ[0])) - ray.org.z) * ray.rdir_far.z;
+      const vfloat<N> tEnter = max(ray.tnear,nearX,nearY,nearZ);
+      const vfloat<N> tExit  = min(ray.tfar,farX,farY,farZ);
+      dist = tEnter;
+      return movemask(tEnter <= tExit);
+    }
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNodeMB4D intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>  // fast (robust == false) slab test at 'time' for a motion-blur node reference; 4D nodes are additionally filtered by time interval
+      __forceinline size_t intersectNodeMB4D(const typename BVHN<N>::NodeRef ref, const TravRay<N,N,false>& ray, const float time, vfloat<N>& dist)  // returns the hit bitmask; dist receives tNear
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+        
+      const vfloat<N>* pNearX = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearX);  // ray.near*/far* are byte offsets from &node->lower_x selecting the bound
+      const vfloat<N>* pNearY = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearY);
+      const vfloat<N>* pNearZ = (const vfloat<N>*)((const char*)&node->lower_x+ray.nearZ);
+      const vfloat<N>* pFarX  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farX);
+      const vfloat<N>* pFarY  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farY);
+      const vfloat<N>* pFarZ  = (const vfloat<N>*)((const char*)&node->lower_x+ray.farZ);
+#if defined (__FMA_X4__)  // inner madd combines p[0] and p[6] using time (p[6] is presumably the per-time delta of the same bound - confirm against the node layout)
+#if defined(__aarch64__)  // aarch64: outer madd with ray.neg_org_rdir instead of msub with ray.org_rdir
+      const vfloat<N> tNearX = madd(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tNearY = madd(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tNearZ = madd(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<N> tFarX  = madd(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tFarY  = madd(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tFarZ  = madd(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.neg_org_rdir.z);
+#else  // other __FMA_X4__ targets: msub with the precomputed ray.org_rdir
+      const vfloat<N> tNearX = msub(madd(time,pNearX[6],vfloat<N>(pNearX[0])), ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tNearY = msub(madd(time,pNearY[6],vfloat<N>(pNearY[0])), ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tNearZ = msub(madd(time,pNearZ[6],vfloat<N>(pNearZ[0])), ray.rdir.z, ray.org_rdir.z);
+      const vfloat<N> tFarX  = msub(madd(time,pFarX [6],vfloat<N>(pFarX [0])), ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tFarY  = msub(madd(time,pFarY [6],vfloat<N>(pFarY [0])), ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tFarZ  = msub(madd(time,pFarZ [6],vfloat<N>(pFarZ [0])), ray.rdir.z, ray.org_rdir.z);
+#endif
+#else  // no __FMA_X4__: explicit (bound(time) - org) * rdir
+      const vfloat<N> tNearX = (madd(time,pNearX[6],vfloat<N>(pNearX[0])) - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tNearY = (madd(time,pNearY[6],vfloat<N>(pNearY[0])) - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tNearZ = (madd(time,pNearZ[6],vfloat<N>(pNearZ[0])) - ray.org.z) * ray.rdir.z;
+      const vfloat<N> tFarX  = (madd(time,pFarX [6],vfloat<N>(pFarX [0])) - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tFarY  = (madd(time,pFarY [6],vfloat<N>(pFarY [0])) - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tFarZ  = (madd(time,pFarZ [6],vfloat<N>(pFarZ [0])) - ray.org.z) * ray.rdir.z;
+#endif
+#if defined(__FMA_X4__) && !defined(__AVX512F__)  // pairwise maxi/mini reduction on this path
+      const vfloat<N> tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray.tnear));
+      const vfloat<N> tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray.tfar ));
+#else  // otherwise the four-argument float max/min
+      const vfloat<N> tNear = max(ray.tnear,tNearX,tNearY,tNearZ);
+      const vfloat<N> tFar  = min(ray.tfar, tFarX ,tFarY ,tFarZ );
+#endif
+      vbool<N> vmask = tNear <= tFar;  // unlike the plain AABBNodeMB test, every path uses the float compare here
+      if (unlikely(ref.isAABBNodeMB4D())) {  // 4D node: also require lower_t <= time < upper_t per lane
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node;
+        vmask &= (node1->lower_t <= time) & (time < node1->upper_t);
+      }
+      const size_t mask = movemask(vmask);
+      dist = tNear;  // written for every lane, whether or not its bit is set in mask
+      return mask;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNodeMB4D intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N>
+      __forceinline size_t intersectNodeMB4DRobust(const typename BVHN<N>::NodeRef ref, const TravRay<N,N,true>& ray, const float time, vfloat<N>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* mbNode = ref.getAABBNodeMB();
+      const char* const bounds = (const char*)&mbNode->lower_x; // ray.near*/far* are byte offsets from here that select the bound
+      const vfloat<N>* nearPtrX = (const vfloat<N>*)(bounds+ray.nearX);
+      const vfloat<N>* nearPtrY = (const vfloat<N>*)(bounds+ray.nearY);
+      const vfloat<N>* nearPtrZ = (const vfloat<N>*)(bounds+ray.nearZ);
+      const vfloat<N> nearX = (madd(time,nearPtrX[6],vfloat<N>(nearPtrX[0])) - ray.org.x) * ray.rdir_near.x; // entry [6] and entry [0] are combined with time via madd
+      const vfloat<N> nearY = (madd(time,nearPtrY[6],vfloat<N>(nearPtrY[0])) - ray.org.y) * ray.rdir_near.y;
+      const vfloat<N> nearZ = (madd(time,nearPtrZ[6],vfloat<N>(nearPtrZ[0])) - ray.org.z) * ray.rdir_near.z;
+      const vfloat<N> tEnter = max(ray.tnear,nearX,nearY,nearZ);
+      const vfloat<N>* farPtrX = (const vfloat<N>*)(bounds+ray.farX);
+      const vfloat<N>* farPtrY = (const vfloat<N>*)(bounds+ray.farY);
+      const vfloat<N>* farPtrZ = (const vfloat<N>*)(bounds+ray.farZ);
+      const vfloat<N> farX = (madd(time,farPtrX[6],vfloat<N>(farPtrX[0])) - ray.org.x) * ray.rdir_far.x; // far planes scale by rdir_far, near planes by rdir_near
+      const vfloat<N> farY = (madd(time,farPtrY[6],vfloat<N>(farPtrY[0])) - ray.org.y) * ray.rdir_far.y;
+      const vfloat<N> farZ = (madd(time,farPtrZ[6],vfloat<N>(farPtrZ[0])) - ray.org.z) * ray.rdir_far.z;
+      const vfloat<N> tExit = min(ray.tfar,farX,farY,farZ);
+      vbool<N> overlap = tEnter <= tExit;
+      /* a 4D node additionally restricts each child to the time interval [lower_t, upper_t) */
+      if (unlikely(ref.isAABBNodeMB4D())) {
+        const typename BVHN<N>::AABBNodeMB4D* timeNode = (const typename BVHN<N>::AABBNodeMB4D*) mbNode;
+        overlap &= (timeNode->lower_t <= time) & (time < timeNode->upper_t);
+      }
+      dist = tEnter;
+      return movemask(overlap);
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast QuantizedBaseNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx, bool robust>  // declaration only: no generic body is given here, the QuantizedBaseNode variants are explicit specializations
+      __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,Nx,robust>& ray, vfloat<Nx>& dist);  // returns a hit bitmask, writes entry distances to dist
+
+    template<>  // BVH4 / 4-wide: fast (robust == false) slab test of one ray against a QuantizedBaseNode
+      __forceinline size_t intersectNode<4,4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,4,false>& ray, vfloat4& dist)  // returns hit bitmask restricted to valid lanes; dist receives tNear
+    {
+      const size_t mvalid  = movemask(node->validMask());  // lanes outside this mask are cleared from the result at the end
+      const vfloat4 start_x(node->start.x);  // per-axis start and scale, broadcast to all lanes
+      const vfloat4 scale_x(node->scale.x);
+      const vfloat4 lower_x = madd(node->dequantize<4>(ray.nearX >> 2),scale_x,start_x);  // ray's byte offset divided by 4 selects the quantized bound
+      const vfloat4 upper_x = madd(node->dequantize<4>(ray.farX  >> 2),scale_x,start_x);
+      const vfloat4 start_y(node->start.y);
+      const vfloat4 scale_y(node->scale.y);
+      const vfloat4 lower_y = madd(node->dequantize<4>(ray.nearY >> 2),scale_y,start_y);
+      const vfloat4 upper_y = madd(node->dequantize<4>(ray.farY  >> 2),scale_y,start_y);
+      const vfloat4 start_z(node->start.z);
+      const vfloat4 scale_z(node->scale.z);
+      const vfloat4 lower_z = madd(node->dequantize<4>(ray.nearZ >> 2),scale_z,start_z);
+      const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ  >> 2),scale_z,start_z);
+
+#if defined(__FMA_X4__)  // madd/msub formulation of the slab distances
+#if defined(__aarch64__)  // aarch64: madd with ray.neg_org_rdir instead of msub with ray.org_rdir
+      const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat4 tFarX  = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat4 tFarY  = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat4 tFarZ  = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else  // other __FMA_X4__ targets: msub with the precomputed ray.org_rdir
+      const vfloat4 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat4 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat4 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat4 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat4 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat4 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
+#else  // no __FMA_X4__: explicit (bound - org) * rdir
+      const vfloat4 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
+      const vfloat4 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
+      const vfloat4 tNearZ = (lower_z - ray.org.z) * ray.rdir.z;
+      const vfloat4 tFarX  = (upper_x - ray.org.x) * ray.rdir.x;
+      const vfloat4 tFarY  = (upper_y - ray.org.y) * ray.rdir.y;
+      const vfloat4 tFarZ  = (upper_z - ray.org.z) * ray.rdir.z;
+#endif
+      
+#if (defined(__aarch64__) && defined(BUILD_IOS)) || defined(__SSE4_1__) && !defined(__AVX512F__) // up to HSW
+      const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = asInt(tNear) > asInt(tFar);  // this branch builds the miss mask ...
+      const size_t mask = movemask(vmask) ^ ((1<<4)-1);  // ... and flips its low 4 bits to obtain the hit mask
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vfloat4 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else  // fallback: float max/min and a float <= compare
+      const vfloat4 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat4 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool4 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear;  // written for every lane; only the returned bitmask is filtered by mvalid
+      return mask & mvalid;
+    }
+
+    template<>
+      __forceinline size_t intersectNode<4,4>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,4,true>& ray, vfloat4& dist)
+    {
+      /* robust (robust == true) variant of the quantized BVH4 test: plain subtract/multiply with separate near/far reciprocals */
+      const size_t validLanes = movemask(node->validMask());
+      /* per-axis start and scale of the quantization grid, broadcast to all 4 lanes */
+      const vfloat4 originX(node->start.x), stepX(node->scale.x);
+      const vfloat4 originY(node->start.y), stepY(node->scale.y);
+      const vfloat4 originZ(node->start.z), stepZ(node->scale.z);
+      /* the ray's byte offsets, divided by 4, select which quantized bound acts as near or far plane */
+      const vfloat4 nearPlaneX = madd(node->dequantize<4>(ray.nearX >> 2),stepX,originX);
+      const vfloat4 farPlaneX  = madd(node->dequantize<4>(ray.farX  >> 2),stepX,originX);
+      const vfloat4 nearPlaneY = madd(node->dequantize<4>(ray.nearY >> 2),stepY,originY);
+      const vfloat4 farPlaneY  = madd(node->dequantize<4>(ray.farY  >> 2),stepY,originY);
+      const vfloat4 nearPlaneZ = madd(node->dequantize<4>(ray.nearZ >> 2),stepZ,originZ);
+      const vfloat4 farPlaneZ  = madd(node->dequantize<4>(ray.farZ  >> 2),stepZ,originZ);
+
+      /* near planes scale by rdir_near, far planes by rdir_far */
+      const vfloat4 nearX = (nearPlaneX - ray.org.x) * ray.rdir_near.x;
+      const vfloat4 nearY = (nearPlaneY - ray.org.y) * ray.rdir_near.y;
+      const vfloat4 nearZ = (nearPlaneZ - ray.org.z) * ray.rdir_near.z;
+      const vfloat4 farX  = (farPlaneX - ray.org.x) * ray.rdir_far.x;
+      const vfloat4 farY  = (farPlaneY - ray.org.y) * ray.rdir_far.y;
+      const vfloat4 farZ  = (farPlaneZ - ray.org.z) * ray.rdir_far.z;
+
+      const vfloat4 tEnter = max(nearX,nearY,nearZ,ray.tnear);
+      const vfloat4 tExit  = min(farX ,farY ,farZ ,ray.tfar);
+      const vbool4 overlap = tEnter <= tExit;
+      dist = tEnter;
+      return movemask(overlap) & validLanes;
+    }
+
+
+#if defined(__AVX__)
+
+    /*! Fast-mode (TravRay<8,8,false>) intersection of one ray with an 8-wide quantized node.
+     *  Returns a bitmask of the children that are valid in the node and hit by the ray
+     *  interval; dist receives the per-child entry distance tNear. */
+    template<>
+      __forceinline size_t intersectNode<8,8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,8,false>& ray, vfloat8& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask());
+      // Per axis: rebuild float bounds as madd(dequantized, scale, start).
+      const vfloat8 start_x(node->start.x);
+      const vfloat8 scale_x(node->scale.x);
+      const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x);
+      const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX  >> 2),scale_x,start_x);
+      const vfloat8 start_y(node->start.y);
+      const vfloat8 scale_y(node->scale.y);
+      const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y);
+      const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY  >> 2),scale_y,start_y);
+      const vfloat8 start_z(node->start.z);
+      const vfloat8 scale_z(node->scale.z);
+      const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z);
+      const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ  >> 2),scale_z,start_z);
+
+      // Slab distances. With AVX2 a single fused op is used per plane
+      // (madd with neg_org_rdir on aarch64, msub with org_rdir otherwise);
+      // without AVX2 the origin is subtracted first and the result multiplied by rdir.
+#if defined(__AVX2__)
+#if defined(__aarch64__)
+      const vfloat8 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat8 tFarX  = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat8 tFarY  = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat8 tFarZ  = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else
+      const vfloat8 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat8 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat8 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat8 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat8 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat8 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
+#else
+      const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir.x;
+      const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir.y;
+      const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir.z;
+      const vfloat8 tFarX  = (upper_x - ray.org.x) * ray.rdir.x;
+      const vfloat8 tFarY  = (upper_y - ray.org.y) * ray.rdir.y;
+      const vfloat8 tFarZ  = (upper_z - ray.org.z) * ray.rdir.z;
+#endif
+      
+      // Three ways to form the hit mask, selected by instruction set:
+      //  - AVX2 without AVX-512: compare the asInt() views for the miss case (tNear > tFar)
+      //    and flip the low 8 bits of the movemask to obtain the hits;
+      //  - AVX-512F without AVX512ER: compare the asInt() views for tNear <= tFar directly;
+      //  - otherwise: plain floating-point max/min and <= comparison.
+#if defined(__AVX2__) && !defined(__AVX512F__) // HSW
+      const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = asInt(tNear) > asInt(tFar);
+      const size_t mask = movemask(vmask) ^ ((1<<8)-1);
+#elif defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vfloat8 tNear = maxi(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = mini(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = asInt(tNear) <= asInt(tFar);
+      const size_t mask = movemask(vmask);
+#else
+      const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+#endif
+      dist = tNear;
+      // Drop children that are not marked valid in the node.
+      return mask & mvalid;
+    }
+
+    /*! Robust-mode (TravRay<8,8,true>) intersection of one ray with an 8-wide quantized node.
+     *  Returns a bitmask of the children that are valid in the node and for which
+     *  tNear <= tFar; dist receives the per-child entry distance tNear. */
+    template<>
+      __forceinline size_t intersectNode<8,8>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,8,true>& ray, vfloat8& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask());
+      // Per axis: rebuild float bounds as madd(dequantized, scale, start).
+      const vfloat8 start_x(node->start.x);
+      const vfloat8 scale_x(node->scale.x);
+      const vfloat8 lower_x = madd(node->dequantize<8>(ray.nearX >> 2),scale_x,start_x);
+      const vfloat8 upper_x = madd(node->dequantize<8>(ray.farX  >> 2),scale_x,start_x);
+      const vfloat8 start_y(node->start.y);
+      const vfloat8 scale_y(node->scale.y);
+      const vfloat8 lower_y = madd(node->dequantize<8>(ray.nearY >> 2),scale_y,start_y);
+      const vfloat8 upper_y = madd(node->dequantize<8>(ray.farY  >> 2),scale_y,start_y);
+      const vfloat8 start_z(node->start.z);
+      const vfloat8 scale_z(node->scale.z);
+      const vfloat8 lower_z = madd(node->dequantize<8>(ray.nearZ >> 2),scale_z,start_z);
+      const vfloat8 upper_z = madd(node->dequantize<8>(ray.farZ  >> 2),scale_z,start_z);
+
+      // Slab distances: the near planes use rdir_near, the far planes use rdir_far.
+      const vfloat8 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+      const vfloat8 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+      const vfloat8 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+      const vfloat8 tFarX  = (upper_x - ray.org.x) * ray.rdir_far.x;
+      const vfloat8 tFarY  = (upper_y - ray.org.y) * ray.rdir_far.y;
+      const vfloat8 tFarZ  = (upper_z - ray.org.z) * ray.rdir_far.z;
+      
+      // Clamp the interval to [ray.tnear, ray.tfar] and keep children with a non-empty interval.
+      const vfloat8 tNear = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat8 tFar  = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      const vbool8 vmask = tNear <= tFar;
+      const size_t mask = movemask(vmask);
+
+      dist = tNear;
+      return mask & mvalid;
+    }
+
+
+#endif
+
+#if defined(__AVX512F__) && !defined(__AVX512VL__) // KNL
+
+    /*! Fast-mode intersection of one ray with a 4-wide quantized node using 16-wide registers
+     *  (TravRay<4,16,false>). Only the low four lanes take part in the comparison (mask 0xf).
+     *  Returns the bitmask of valid children with tNear <= tFar; dist receives tNear. */
+    template<>
+      __forceinline size_t intersectNode<4,16>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,16,false>& ray, vfloat16& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask());
+      // Per axis: dequantize 4 values, widen to vfloat16, then apply scale and start via madd.
+      const vfloat16 start_x(node->start.x);
+      const vfloat16 scale_x(node->scale.x);
+      const vfloat16 lower_x = madd(vfloat16(node->dequantize<4>(ray.nearX >> 2)),scale_x,start_x);
+      const vfloat16 upper_x = madd(vfloat16(node->dequantize<4>(ray.farX  >> 2)),scale_x,start_x);
+      const vfloat16 start_y(node->start.y);
+      const vfloat16 scale_y(node->scale.y);
+      const vfloat16 lower_y = madd(vfloat16(node->dequantize<4>(ray.nearY >> 2)),scale_y,start_y);
+      const vfloat16 upper_y = madd(vfloat16(node->dequantize<4>(ray.farY  >> 2)),scale_y,start_y);
+      const vfloat16 start_z(node->start.z);
+      const vfloat16 scale_z(node->scale.z);
+      const vfloat16 lower_z = madd(vfloat16(node->dequantize<4>(ray.nearZ >> 2)),scale_z,start_z);
+      const vfloat16 upper_z = madd(vfloat16(node->dequantize<4>(ray.farZ  >> 2)),scale_z,start_z);
+
+      // Slab distances computed with msub(bound, rdir, org_rdir).
+      const vfloat16 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat16 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);      
+      const vfloat16 tNear  = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat16 tFar   = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      // le() is given the lane mask 0xf, i.e. the four lanes that correspond to children.
+      const vbool16 vmask   = le(vbool16(0xf),tNear,tFar);
+      const size_t mask     = movemask(vmask) & mvalid;
+      dist = tNear;
+      return mask;
+    }
+
+    /*! Robust-mode intersection of one ray with a 4-wide quantized node using 16-wide registers
+     *  (TravRay<4,16,true>). Only the low four lanes take part in the comparison (mask 0xf).
+     *  Returns the bitmask of valid children with tNear <= tFar; dist receives tNear. */
+    template<>
+      __forceinline size_t intersectNode<4,16>(const typename BVH4::QuantizedBaseNode* node, const TravRay<4,16,true>& ray, vfloat16& dist)
+    {
+      const size_t mvalid  = movemask(node->validMask());
+      // Per axis: dequantize 4 values, widen to vfloat16, then apply scale and start via madd.
+      const vfloat16 start_x(node->start.x);
+      const vfloat16 scale_x(node->scale.x);
+      const vfloat16 lower_x = madd(vfloat16(node->dequantize<4>(ray.nearX >> 2)),scale_x,start_x);
+      const vfloat16 upper_x = madd(vfloat16(node->dequantize<4>(ray.farX  >> 2)),scale_x,start_x);
+      const vfloat16 start_y(node->start.y);
+      const vfloat16 scale_y(node->scale.y);
+      const vfloat16 lower_y = madd(vfloat16(node->dequantize<4>(ray.nearY >> 2)),scale_y,start_y);
+      const vfloat16 upper_y = madd(vfloat16(node->dequantize<4>(ray.farY  >> 2)),scale_y,start_y);
+      const vfloat16 start_z(node->start.z);
+      const vfloat16 scale_z(node->scale.z);
+      const vfloat16 lower_z = madd(vfloat16(node->dequantize<4>(ray.nearZ >> 2)),scale_z,start_z);
+      const vfloat16 upper_z = madd(vfloat16(node->dequantize<4>(ray.farZ  >> 2)),scale_z,start_z);
+
+      // Slab distances: the near planes use rdir_near, the far planes use rdir_far.
+      const vfloat16 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+      const vfloat16 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+      const vfloat16 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+      const vfloat16 tFarX  = (upper_x - ray.org.x) * ray.rdir_far.x;
+      const vfloat16 tFarY  = (upper_y - ray.org.y) * ray.rdir_far.y;
+      const vfloat16 tFarZ  = (upper_z - ray.org.z) * ray.rdir_far.z;
+
+      const vfloat16 tNear  = max(tNearX,tNearY,tNearZ,ray.tnear);
+      const vfloat16 tFar   = min(tFarX ,tFarY ,tFarZ ,ray.tfar);
+      // le() is given the lane mask 0xf, i.e. the four lanes that correspond to children.
+      const vbool16 vmask   = le(vbool16(0xf),tNear,tFar);
+      const size_t mask     = movemask(vmask) & mvalid;
+      dist = tNear;
+      return mask;
+    }
+
+    /*! Fast-mode intersection of one ray with an 8-wide quantized node using 16-wide registers
+     *  (TravRay<8,16,false>). Lower and upper bounds of each axis are fetched together in one
+     *  16-wide vector, permuted by ray.permX/Y/Z. Returns the movemask of le(m_valid, ...);
+     *  dist receives tNear. */
+    template<>
+      __forceinline size_t intersectNode<8,16>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,16,false>& ray, vfloat16& dist)
+    {
+      const vbool16 m_valid(node->validMask16());
+      const vfloat16 bminmaxX  = node->dequantizeLowerUpperX(ray.permX);
+      const vfloat16 bminmaxY  = node->dequantizeLowerUpperY(ray.permY);
+      const vfloat16 bminmaxZ  = node->dequantizeLowerUpperZ(ray.permZ);
+      const vfloat16 tNearFarX = msub(bminmaxX, ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 tNearFarY = msub(bminmaxY, ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 tNearFarZ = msub(bminmaxZ, ray.rdir.z, ray.org_rdir.z);
+      // max and min are both taken over the full 16 lanes of the combined near/far vectors.
+      const vfloat16 tNear     = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear);
+      const vfloat16 tFar      = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar);
+      // NOTE(review): align_shift_right<8>(tFar, tFar) presumably rotates tFar by 8 lanes so the
+      // far values line up with the near values of the same child -- confirm lane layout.
+      const vbool16 vmask      = le(m_valid,tNear,align_shift_right<8>(tFar, tFar));
+      const size_t mask        = movemask(vmask);
+      dist = tNear;
+      return mask;
+    }
+
+    /*! Robust-mode intersection of one ray with an 8-wide quantized node using 16-wide registers
+     *  (TravRay<8,16,true>). Same structure as the fast variant, but the distances are computed
+     *  as (bound - org) * rdir_far for both the near and the far half; the FIXME below flags
+     *  that this is not conservative. Returns the movemask of le(m_valid, ...); dist receives tNear. */
+    template<>
+      __forceinline size_t intersectNode<8,16>(const typename BVH8::QuantizedBaseNode* node, const TravRay<8,16,true>& ray, vfloat16& dist)
+    {
+      const vbool16 m_valid(node->validMask16());
+      const vfloat16 bminmaxX  = node->dequantizeLowerUpperX(ray.permX);
+      const vfloat16 bminmaxY  = node->dequantizeLowerUpperY(ray.permY);
+      const vfloat16 bminmaxZ  = node->dequantizeLowerUpperZ(ray.permZ);
+      const vfloat16 tNearFarX = (bminmaxX - ray.org.x) * ray.rdir_far.x; // FIXME: this is not conservative !!!!!!!!!
+      const vfloat16 tNearFarY = (bminmaxY - ray.org.y) * ray.rdir_far.y;
+      const vfloat16 tNearFarZ = (bminmaxZ - ray.org.z) * ray.rdir_far.z;
+      const vfloat16 tNear     = max(tNearFarX, tNearFarY, tNearFarZ, ray.tnear);
+      const vfloat16 tFar      = min(tNearFarX, tNearFarY, tNearFarZ, ray.tfar);
+      // NOTE(review): align_shift_right<8>(tFar, tFar) presumably rotates tFar by 8 lanes so the
+      // far values line up with the near values of the same child -- confirm lane layout.
+      const vbool16 vmask      = le(m_valid,tNear,align_shift_right<8>(tFar, tFar));
+      const size_t mask        = movemask(vmask);
+      dist = tNear;
+      return mask;
+    }
+
+    
+#endif
+
+
+    /*! Fast-mode intersection of one ray with a quantized motion-blur node.
+     *  The child bounds are dequantized for the given time. Because the bounds are not
+     *  pre-ordered by ray direction here, near/far are sorted per axis with mini/maxi.
+     *  Returns the bitmask of valid children with tNear <= tFar (compared through asInt);
+     *  dist receives tNear. */
+    template<int N, int Nx>
+      __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,false>& ray, const float time, vfloat<N>& dist)
+    {
+      const vboolf<N> mvalid    = node->validMask();
+      const vfloat<N> lower_x   = node->dequantizeLowerX(time);
+      const vfloat<N> upper_x   = node->dequantizeUpperX(time);
+      const vfloat<N> lower_y   = node->dequantizeLowerY(time);
+      const vfloat<N> upper_y   = node->dequantizeUpperY(time);
+      const vfloat<N> lower_z   = node->dequantizeLowerZ(time);
+      const vfloat<N> upper_z   = node->dequantizeUpperZ(time);     
+      // Plane distances: fused madd/msub when __FMA_X4__ is defined, otherwise subtract then multiply.
+#if defined(__FMA_X4__)
+#if defined(__aarch64__)
+      const vfloat<N> tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<N> tFarX  = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<N> tFarY  = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<N> tFarZ  = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#else
+      const vfloat<N> tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat<N> tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<N> tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<N> tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#endif
+#else
+      const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir.z;
+      const vfloat<N> tFarX  = (upper_x - ray.org.x) * ray.rdir.x;
+      const vfloat<N> tFarY  = (upper_y - ray.org.y) * ray.rdir.y;
+      const vfloat<N> tFarZ  = (upper_z - ray.org.z) * ray.rdir.z;
+#endif      
+
+      // Order the two plane distances of every axis, then intersect the three slabs
+      // with the ray interval [tnear, tfar].
+      const vfloat<N> tminX = mini(tNearX,tFarX);
+      const vfloat<N> tmaxX = maxi(tNearX,tFarX);
+      const vfloat<N> tminY = mini(tNearY,tFarY);
+      const vfloat<N> tmaxY = maxi(tNearY,tFarY);
+      const vfloat<N> tminZ = mini(tNearZ,tFarZ);
+      const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ);
+      const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear);
+      const vfloat<N> tFar  = mini(tmaxX,tmaxY,tmaxZ,ray.tfar);
+      // The validity mask is applied inside le() on the AVX-512 path and with & otherwise.
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vbool<N> vmask =  le(mvalid,asInt(tNear),asInt(tFar));
+#else
+      const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid;
+#endif
+      const size_t mask = movemask(vmask);
+      dist = tNear;
+      return mask;      
+    }
+
+    /*! Robust-mode intersection of one ray with a quantized motion-blur node.
+     *  The child bounds are dequantized for the given time; lower planes are scaled by
+     *  rdir_near and upper planes by rdir_far, then near/far are sorted per axis with mini/maxi.
+     *  Returns the bitmask of valid children with tNear <= tFar (compared through asInt);
+     *  dist receives tNear. */
+    template<int N, int Nx>
+      __forceinline size_t intersectNode(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,true>& ray, const float time, vfloat<N>& dist)
+    {
+      const vboolf<N> mvalid    = node->validMask();
+      const vfloat<N> lower_x   = node->dequantizeLowerX(time);
+      const vfloat<N> upper_x   = node->dequantizeUpperX(time);
+      const vfloat<N> lower_y   = node->dequantizeLowerY(time);
+      const vfloat<N> upper_y   = node->dequantizeUpperY(time);
+      const vfloat<N> lower_z   = node->dequantizeLowerZ(time);
+      const vfloat<N> upper_z   = node->dequantizeUpperZ(time);     
+      const vfloat<N> tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+      const vfloat<N> tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+      const vfloat<N> tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+      const vfloat<N> tFarX  = (upper_x - ray.org.x) * ray.rdir_far.x;
+      const vfloat<N> tFarY  = (upper_y - ray.org.y) * ray.rdir_far.y;
+      const vfloat<N> tFarZ  = (upper_z - ray.org.z) * ray.rdir_far.z;
+
+      // Order the two plane distances of every axis, then intersect the three slabs
+      // with the ray interval [tnear, tfar].
+      const vfloat<N> tminX = mini(tNearX,tFarX);
+      const vfloat<N> tmaxX = maxi(tNearX,tFarX);
+      const vfloat<N> tminY = mini(tNearY,tFarY);
+      const vfloat<N> tmaxY = maxi(tNearY,tFarY);
+      const vfloat<N> tminZ = mini(tNearZ,tFarZ);
+      const vfloat<N> tmaxZ = maxi(tNearZ,tFarZ);
+      const vfloat<N> tNear = maxi(tminX,tminY,tminZ,ray.tnear);
+      const vfloat<N> tFar  = mini(tmaxX,tmaxY,tmaxZ,ray.tfar);
+      // The validity mask is applied inside le() on the AVX-512 path and with & otherwise.
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      const vbool<N> vmask =  le(mvalid,asInt(tNear),asInt(tFar));
+#else
+      const vbool<N> vmask = (asInt(tNear) <= asInt(tFar)) & mvalid;
+#endif
+      const size_t mask = movemask(vmask);
+      dist = tNear;
+      return mask;      
+    }
+
+
+#if defined(__AVX512ER__)
+    // for KNL
+    /*! Fast-mode specialization for 4-wide quantized motion-blur nodes processed in 16-wide
+     *  registers (only compiled when __AVX512ER__ is defined). Bounds are dequantized for the
+     *  given time, plane distances are computed with msub and sorted per axis with min/max.
+     *  Returns the bitmask of valid children with tNear <= tFar; dist receives the
+     *  4-wide part of tNear selected by extractN<4,0>. */
+    template<>
+      __forceinline size_t intersectNode<4,16>(const typename BVHN<4>::QuantizedBaseNodeMB* node, const TravRay<4,16,false>& ray, const float time, vfloat<4>& dist)
+    {
+      const size_t  mvalid    = movemask(node->validMask());
+      const vfloat16 lower_x  = node->dequantizeLowerX(time);
+      const vfloat16 upper_x  = node->dequantizeUpperX(time);
+      const vfloat16 lower_y  = node->dequantizeLowerY(time);
+      const vfloat16 upper_y  = node->dequantizeUpperY(time);
+      const vfloat16 lower_z  = node->dequantizeLowerZ(time);
+      const vfloat16 upper_z  = node->dequantizeUpperZ(time);     
+
+      const vfloat16 tNearX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 tNearY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 tNearZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat16 tFarX  = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat16 tFarY  = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat16 tFarZ  = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+
+      // Order the two plane distances per axis before intersecting the slabs.
+      const vfloat16 tminX = min(tNearX,tFarX);
+      const vfloat16 tmaxX = max(tNearX,tFarX);
+      const vfloat16 tminY = min(tNearY,tFarY);
+      const vfloat16 tmaxY = max(tNearY,tFarY);
+      const vfloat16 tminZ = min(tNearZ,tFarZ);
+      const vfloat16 tmaxZ = max(tNearZ,tFarZ);
+      const vfloat16 tNear = max(tminX,tminY,tminZ,ray.tnear);
+      const vfloat16 tFar  = min(tmaxX,tmaxY,tmaxZ,ray.tfar );
+      const vbool16 vmask =  tNear <= tFar;
+      // mvalid restricts the 16-lane comparison result to the node's valid children.
+      const size_t mask = movemask(vmask) & mvalid;
+      dist = extractN<4,0>(tNear);
+      return mask;      
+    }
+
+
+    // for KNL
+    /*! Robust-mode specialization for 4-wide quantized motion-blur nodes processed in 16-wide
+     *  registers (only compiled when __AVX512ER__ is defined). Lower planes are scaled by
+     *  rdir_near and upper planes by rdir_far; distances are then sorted per axis with min/max.
+     *  Returns the bitmask of valid children with tNear <= tFar; dist receives the
+     *  4-wide part of tNear selected by extractN<4,0>. */
+    template<>
+      __forceinline size_t intersectNode<4,16>(const typename BVHN<4>::QuantizedBaseNodeMB* node, const TravRay<4,16,true>& ray, const float time, vfloat<4>& dist)
+    {
+      const size_t  mvalid    = movemask(node->validMask());
+      const vfloat16 lower_x  = node->dequantizeLowerX(time);
+      const vfloat16 upper_x  = node->dequantizeUpperX(time);
+      const vfloat16 lower_y  = node->dequantizeLowerY(time);
+      const vfloat16 upper_y  = node->dequantizeUpperY(time);
+      const vfloat16 lower_z  = node->dequantizeLowerZ(time);
+      const vfloat16 upper_z  = node->dequantizeUpperZ(time);     
+
+      const vfloat16 tNearX = (lower_x - ray.org.x) * ray.rdir_near.x;
+      const vfloat16 tNearY = (lower_y - ray.org.y) * ray.rdir_near.y;
+      const vfloat16 tNearZ = (lower_z - ray.org.z) * ray.rdir_near.z;
+      const vfloat16 tFarX  = (upper_x - ray.org.x) * ray.rdir_far.x;
+      const vfloat16 tFarY  = (upper_y - ray.org.y) * ray.rdir_far.y;
+      const vfloat16 tFarZ  = (upper_z - ray.org.z) * ray.rdir_far.z;
+
+      // Order the two plane distances per axis before intersecting the slabs.
+      const vfloat16 tminX = min(tNearX,tFarX);
+      const vfloat16 tmaxX = max(tNearX,tFarX);
+      const vfloat16 tminY = min(tNearY,tFarY);
+      const vfloat16 tmaxY = max(tNearY,tFarY);
+      const vfloat16 tminZ = min(tNearZ,tFarZ);
+      const vfloat16 tmaxZ = max(tNearZ,tFarZ);
+      const vfloat16 tNear = max(tminX,tminY,tminZ,ray.tnear);
+      const vfloat16 tFar  = min(tmaxX,tmaxY,tmaxZ,ray.tfar );
+      const vbool16 vmask =  tNear <= tFar;
+      // mvalid restricts the 16-lane comparison result to the node's valid children.
+      const size_t mask = movemask(vmask) & mvalid;
+      dist = extractN<4,0>(tNear);
+      return mask;      
+    }
+
+#endif
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast OBBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    /*! Intersects one ray with the N oriented boxes of an OBBNode.
+     *  The ray origin and direction are transformed by node->naabb; as the inline comments
+     *  below indicate, the box is then tested as the [zero, one] box in that space.
+     *  In robust mode the interval is widened by 3 ulp on each side before the test.
+     *  Returns the movemask of tNear <= tFar; dist receives tNear.
+     *  Note that no validity mask is applied in this function. */
+    template<int N, bool robust>
+      __forceinline size_t intersectNode(const typename BVHN<N>::OBBNode* node, const TravRay<N,N,robust>& ray, vfloat<N>& dist)
+    {
+      const Vec3vf<N> dir = xfmVector(node->naabb,ray.dir);
+      //const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))/dir;
+      // Negated reciprocal direction; rcp_safe is used instead of the plain division above.
+      const Vec3vf<N> nrdir = Vec3vf<N>(vfloat<N>(-1.0f))*rcp_safe(dir);
+      const Vec3vf<N> org = xfmPoint(node->naabb,ray.org);
+      const Vec3vf<N> tLowerXYZ = org * nrdir;       // (Vec3fa(zero) - org) * rdir;
+      const Vec3vf<N> tUpperXYZ = tLowerXYZ - nrdir; // (Vec3fa(one ) - org) * rdir;
+
+      // The sign of dir is unknown here, so order the two plane distances per axis.
+      const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x);
+      const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y);
+      const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z);
+      const vfloat<N> tFarX  = maxi(tLowerXYZ.x,tUpperXYZ.x);
+      const vfloat<N> tFarY  = maxi(tLowerXYZ.y,tUpperXYZ.y);
+      const vfloat<N> tFarZ  = maxi(tLowerXYZ.z,tUpperXYZ.z);
+      vfloat<N> tNear  = max(ray.tnear, tNearX,tNearY,tNearZ);
+      vfloat<N> tFar   = min(ray.tfar,  tFarX ,tFarY ,tFarZ );
+      // robust is a template parameter, so this branch is resolved per instantiation.
+      if (robust) {
+        tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp));
+        tFar  = tFar *vfloat<N>(1.0f+3.0f*float(ulp));
+      }
+      const vbool<N> vmask = tNear <= tFar;
+      dist = tNear;
+      return movemask(vmask);
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast OBBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    /*! Intersects one ray with the N oriented, time-dependent boxes of an OBBNodeMB.
+     *  The ray is transformed by node->space0; the box bounds are interpolated with lerp
+     *  between the [zero, one] box and node->b1 using the given time.
+     *  In robust mode the interval is widened by 3 ulp on each side before the test.
+     *  Returns the movemask of tNear <= tFar; dist receives tNear.
+     *  Note that no validity mask is applied in this function. */
+    template<int N, bool robust>
+      __forceinline size_t intersectNode(const typename BVHN<N>::OBBNodeMB* node, const TravRay<N,N,robust>& ray, const float time, vfloat<N>& dist)
+    {
+      const AffineSpace3vf<N> xfm = node->space0;
+      const Vec3vf<N> b0_lower = zero;
+      const Vec3vf<N> b0_upper = one;
+      const Vec3vf<N> lower = lerp(b0_lower,node->b1.lower,vfloat<N>(time));
+      const Vec3vf<N> upper = lerp(b0_upper,node->b1.upper,vfloat<N>(time));
+
+      const BBox3vf<N> bounds(lower,upper);
+      const Vec3vf<N> dir = xfmVector(xfm,ray.dir);
+      const Vec3vf<N> rdir = rcp_safe(dir);
+      const Vec3vf<N> org = xfmPoint(xfm,ray.org);
+
+      const Vec3vf<N> tLowerXYZ = (bounds.lower - org) * rdir;
+      const Vec3vf<N> tUpperXYZ = (bounds.upper - org) * rdir;
+
+      // The sign of dir is unknown here, so order the two plane distances per axis.
+      const vfloat<N> tNearX = mini(tLowerXYZ.x,tUpperXYZ.x);
+      const vfloat<N> tNearY = mini(tLowerXYZ.y,tUpperXYZ.y);
+      const vfloat<N> tNearZ = mini(tLowerXYZ.z,tUpperXYZ.z);
+      const vfloat<N> tFarX  = maxi(tLowerXYZ.x,tUpperXYZ.x);
+      const vfloat<N> tFarY  = maxi(tLowerXYZ.y,tUpperXYZ.y);
+      const vfloat<N> tFarZ  = maxi(tLowerXYZ.z,tUpperXYZ.z);
+      vfloat<N> tNear  = max(ray.tnear, tNearX,tNearY,tNearZ);
+      vfloat<N> tFar   = min(ray.tfar,  tFarX ,tFarY ,tFarZ );
+      // robust is a template parameter, so this branch is resolved per instantiation.
+      if (robust) {
+        tNear = tNear*vfloat<N>(1.0f-3.0f*float(ulp));
+        tFar  = tFar *vfloat<N>(1.0f+3.0f*float(ulp));
+      }
+      const vbool<N> vmask = tNear <= tFar;
+      dist = tNear;
+      return movemask(vmask);
+    }
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Node intersectors used in point query traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+    
+    /*! Primary template (declaration only): sphere point-query dispatcher for one query
+     *  against N-wide nodes; specialized below per supported node-type set `types`. */
+    template<int N, int types> struct BVHNNodePointQuerySphere1;
+
+    /*! Sphere point query for BVH_AN1: inner nodes are read through getAABBNode(). */
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN1>
+    {
+      /*! Returns false for a leaf (mask is not written); otherwise stores the result of
+       *  pointQueryNodeSphere in mask and returns true. The time argument is not used. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) {
+          return false;
+        }
+
+        mask = pointQueryNodeSphere(node.getAABBNode(), query, dist);
+        return true;
+      }
+    };
+
+    /*! Sphere point query for BVH_AN2: inner nodes are read through getAABBNodeMB(). */
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN2>
+    {
+      /*! Returns false for a leaf (mask is not written); otherwise stores the result of
+       *  the time-dependent pointQueryNodeSphere in mask and returns true. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) {
+          return false;
+        }
+
+        mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist);
+        return true;
+      }
+    };
+
+    /*! Sphere point query for BVH_AN2_AN4D: the node reference itself is handed to
+     *  pointQueryNodeSphereMB4D<N>. */
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D>
+    {
+      /*! Returns false for a leaf (mask is not written); otherwise stores the result of
+       *  pointQueryNodeSphereMB4D<N> in mask and returns true. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) {
+          return false;
+        }
+
+        mask = pointQueryNodeSphereMB4D<N>(node, query, time, dist);
+        return true;
+      }
+    };
+
+    /*! Sphere point query for BVH_AN1_UN1: handles AABB nodes and OBB nodes. */
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN1_UN1>
+    {
+      /*! AABB nodes are queried via getAABBNode(), OBB nodes via ungetAABBNode();
+       *  any other node kind yields false and leaves mask unwritten. The time argument is not used. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (likely(node.isAABBNode())) {
+          mask = pointQueryNodeSphere(node.getAABBNode(), query, dist);
+          return true;
+        }
+        if (unlikely(node.isOBBNode())) {
+          mask = pointQueryNodeSphere(node.ungetAABBNode(), query, dist);
+          return true;
+        }
+        return false;
+      }
+    };
+    
+    /*! Sphere point query for BVH_AN2_UN2: handles motion-blur AABB nodes and motion-blur OBB nodes. */
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN2_UN2>
+    {
+      /*! AABB-MB nodes are queried via getAABBNodeMB(), OBB-MB nodes via ungetAABBNodeMB();
+       *  any other node kind yields false and leaves mask unwritten. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (likely(node.isAABBNodeMB())) {
+          mask = pointQueryNodeSphere(node.getAABBNodeMB(), query, time, dist);
+          return true;
+        }
+        if (unlikely(node.isOBBNodeMB())) {
+          mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist);
+          return true;
+        }
+        return false;
+      }
+    };
+
+    /*! Sphere point query for BVH_AN2_AN4D_UN2: OBB-MB nodes take the ungetAABBNodeMB() path,
+     *  every other inner node goes through pointQueryNodeSphereMB4D. */
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_AN2_AN4D_UN2>
+    {
+      /*! Returns false for a leaf (mask is not written); otherwise writes mask and returns true. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) {
+          return false;
+        }
+
+        if (unlikely(node.isOBBNodeMB())) {
+          mask = pointQueryNodeSphere(node.ungetAABBNodeMB(), query, time, dist);
+        } else {
+          mask = pointQueryNodeSphereMB4D(node, query, time, dist);
+        }
+        return true;
+      }
+    };
+
+    /*! Sphere point query for BVH_QN1: inner nodes are read through quantizedNode(). */
+    template<int N>
+    struct BVHNNodePointQuerySphere1<N, BVH_QN1>
+    {
+      /*! Returns false for a leaf (mask is not written); otherwise casts node.quantizedNode()
+       *  to a QuantizedNode pointer, queries it and returns true. The time argument is not used. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) {
+          return false;
+        }
+
+        mask = pointQueryNodeSphere((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist);
+        return true;
+      }
+    };
+    
+    /*! Sphere point-query entry points that take a quantized base node pointer directly
+     *  (static and motion-blur variants); both simply forward to pointQueryNodeSphere. */
+    template<int N>
+    struct BVHNQuantizedBaseNodePointQuerySphere1
+    {
+      /*! Static quantized node: returns the child mask, dist is passed through. */
+      static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node,
+                                             const TravPointQuery<N>& query,
+                                             vfloat<N>& dist)
+      {
+        const size_t childMask = pointQueryNodeSphere(node,query,dist);
+        return childMask;
+      }
+
+      /*! Motion-blur quantized node: same as above, additionally forwarding time. */
+      static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node,
+                                             const TravPointQuery<N>& query,
+                                             const float time,
+                                             vfloat<N>& dist)
+      {
+        const size_t childMask = pointQueryNodeSphere(node,query,time,dist);
+        return childMask;
+      }
+    };
+
+    /*! Primary template (declaration only): AABB point-query dispatcher for one query
+     *  against N-wide nodes; specialized below per supported node-type set `types`. */
+    template<int N, int types> struct BVHNNodePointQueryAABB1;
+
+    /*! AABB point query for BVH_AN1: inner nodes are read through getAABBNode(). */
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN1>
+    {
+      /*! Returns false for a leaf (mask is not written); otherwise stores the result of
+       *  pointQueryNodeAABB in mask and returns true. The time argument is not used. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) {
+          return false;
+        }
+
+        mask = pointQueryNodeAABB(node.getAABBNode(), query, dist);
+        return true;
+      }
+    };
+
+    /*! AABB point query for BVH_AN2: inner nodes are read through getAABBNodeMB(). */
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN2>
+    {
+      /*! Returns false for a leaf (mask is not written); otherwise stores the result of
+       *  the time-dependent pointQueryNodeAABB in mask and returns true. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) {
+          return false;
+        }
+
+        mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist);
+        return true;
+      }
+    };
+
+    /*! AABB point query for BVH_AN2_AN4D: the node reference itself is handed to
+     *  pointQueryNodeAABBMB4D<N>. */
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D>
+    {
+      /*! Returns false for a leaf (mask is not written); otherwise stores the result of
+       *  pointQueryNodeAABBMB4D<N> in mask and returns true. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) {
+          return false;
+        }
+
+        mask = pointQueryNodeAABBMB4D<N>(node, query, time, dist);
+        return true;
+      }
+    };
+
+    /*! AABB point query for BVH_AN1_UN1: handles AABB nodes and OBB nodes. */
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN1_UN1>
+    {
+      /*! AABB nodes are queried via getAABBNode(), OBB nodes via ungetAABBNode();
+       *  any other node kind yields false and leaves mask unwritten. The time argument is not used. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (likely(node.isAABBNode())) {
+          mask = pointQueryNodeAABB(node.getAABBNode(), query, dist);
+          return true;
+        }
+        if (unlikely(node.isOBBNode())) {
+          mask = pointQueryNodeAABB(node.ungetAABBNode(), query, dist);
+          return true;
+        }
+        return false;
+      }
+    };
+    
+    /*! AABB point query for BVH_AN2_UN2: handles motion-blur AABB nodes and motion-blur OBB nodes. */
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN2_UN2>
+    {
+      /*! AABB-MB nodes are queried via getAABBNodeMB(), OBB-MB nodes via ungetAABBNodeMB();
+       *  any other node kind yields false and leaves mask unwritten. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (likely(node.isAABBNodeMB())) {
+          mask = pointQueryNodeAABB(node.getAABBNodeMB(), query, time, dist);
+          return true;
+        }
+        if (unlikely(node.isOBBNodeMB())) {
+          mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist);
+          return true;
+        }
+        return false;
+      }
+    };
+
+    /*! AABB point query for BVH_AN2_AN4D_UN2: OBB-MB nodes take the ungetAABBNodeMB() path,
+     *  every other inner node goes through pointQueryNodeAABBMB4D. */
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_AN2_AN4D_UN2>
+    {
+      /*! Returns false for a leaf (mask is not written); otherwise writes mask and returns true. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) {
+          return false;
+        }
+
+        if (unlikely(node.isOBBNodeMB())) {
+          mask = pointQueryNodeAABB(node.ungetAABBNodeMB(), query, time, dist);
+        } else {
+          mask = pointQueryNodeAABBMB4D(node, query, time, dist);
+        }
+        return true;
+      }
+    };
+
+    /*! AABB point query for BVH_QN1: inner nodes are read through quantizedNode(). */
+    template<int N>
+    struct BVHNNodePointQueryAABB1<N, BVH_QN1>
+    {
+      /*! Returns false for a leaf (mask is not written); otherwise casts node.quantizedNode()
+       *  to a QuantizedNode pointer, queries it and returns true. The time argument is not used. */
+      static __forceinline bool pointQuery(const typename BVHN<N>::NodeRef& node,
+                                           const TravPointQuery<N>& query,
+                                           float time,
+                                           vfloat<N>& dist,
+                                           size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) {
+          return false;
+        }
+
+        mask = pointQueryNodeAABB((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), query, dist);
+        return true;
+      }
+    };
+    
+    /*! AABB point-query entry points that take a quantized base node pointer directly
+     *  (static and motion-blur variants); both simply forward to pointQueryNodeAABB. */
+    template<int N>
+    struct BVHNQuantizedBaseNodePointQueryAABB1
+    {
+      /*! Static quantized node: returns the child mask, dist is passed through. */
+      static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNode* node,
+                                             const TravPointQuery<N>& query,
+                                             vfloat<N>& dist)
+      {
+        const size_t childMask = pointQueryNodeAABB(node,query,dist);
+        return childMask;
+      }
+
+      /*! Motion-blur quantized node: same as above, additionally forwarding time. */
+      static __forceinline size_t pointQuery(const typename BVHN<N>::QuantizedBaseNodeMB* node,
+                                             const TravPointQuery<N>& query,
+                                             const float time,
+                                             vfloat<N>& dist)
+      {
+        const size_t childMask = pointQueryNodeAABB(node,query,time,dist);
+        return childMask;
+      }
+    };
+
+    
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Node intersectors used in ray traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    /*! Primary template (declaration only): intersects N nodes with 1 ray.
+     *  Specialized below per node-type set `types` and per `robust` flag. */
+    template<int N, int Nx, int types, bool robust> struct BVHNNodeIntersector1;
+
+    /*! Fast-mode ray/node intersector for BVH_AN1: inner nodes are read through getAABBNode(). */
+    template<int N, int Nx>
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN1, false>
+    {
+      /*! Returns false for a leaf (mask is not written); otherwise stores the result of
+       *  intersectNode in mask and returns true. The time argument is not used. */
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node,
+                                          const TravRay<N,Nx,false>& ray,
+                                          float time,
+                                          vfloat<Nx>& dist,
+                                          size_t& mask)
+      {
+        if (unlikely(node.isLeaf())) {
+          return false;
+        }
+
+        mask = intersectNode(node.getAABBNode(), ray, dist);
+        return true;
+      }
+    };
+
+    template<int N, int Nx> // BVH_AN1, robust=true: same shape as the fast variant but calls intersectNodeRobust
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN1, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) // `time` is accepted but not used in this specialization
+      {
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask is not written
+        mask = intersectNodeRobust(node.getAABBNode(), ray, dist);
+        return true; // non-leaf: mask holds the helper's return value
+      }
+    };
+
+    template<int N, int Nx> // BVH_AN2, robust=false: non-leaf references are intersected via getAABBNodeMB() at the given time
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask) // returns false for a leaf, true once mask has been assigned
+      {
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask is not written
+        mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); // `time` is forwarded to the motion-blur overload
+        return true;
+      }
+    };
+
+    template<int N, int Nx> // BVH_AN2, robust=true: same shape as the fast variant but calls intersectNodeRobust
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) // returns false for a leaf, true once mask has been assigned
+      {
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask is not written
+        mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); // `time` is forwarded to the motion-blur overload
+        return true;
+      }
+    };
+
+    template<int N, int Nx> // BVH_AN2_AN4D, robust=false: node-kind dispatch is delegated to intersectNodeMB4D
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask) // returns false for a leaf, true once mask has been assigned
+      {
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask is not written
+        mask = intersectNodeMB4D<N>(node, ray, time, dist); // the helper receives the NodeRef itself; N is passed explicitly
+        return true;
+      }
+    };
+
+    template<int N, int Nx> // BVH_AN2_AN4D, robust=true: node-kind dispatch is delegated to intersectNodeMB4DRobust
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) // returns false for a leaf, true once mask has been assigned
+      {
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask is not written
+        mask = intersectNodeMB4DRobust<N>(node, ray, time, dist); // the helper receives the NodeRef itself; N is passed explicitly
+        return true;
+      }
+    };
+
+    template<int N, int Nx> // BVH_AN1_UN1, robust=false: handles AABB nodes and OBB nodes, rejects everything else
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN1_UN1, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask) // `time` is accepted but not used in this specialization
+      {
+        if (likely(node.isAABBNode()))          mask = intersectNode(node.getAABBNode(), ray, dist); // branch hinted as the common case
+        else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); // OBB node, accessed through ungetAABBNode()
+        else return false; // neither test matched: mask is not written
+        return true;
+      }
+    };
+
+    template<int N, int Nx> // BVH_AN1_UN1, robust=true: handles AABB nodes and OBB nodes, rejects everything else
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN1_UN1, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) // `time` is accepted but not used in this specialization
+      {
+        if (likely(node.isAABBNode()))          mask = intersectNodeRobust(node.getAABBNode(), ray, dist); // branch hinted as the common case
+        else if (unlikely(node.isOBBNode())) mask = intersectNode(node.ungetAABBNode(), ray, dist); // NOTE(review): plain intersectNode here, not intersectNodeRobust - presumably an overload taking the robust TravRay; confirm
+        else return false; // neither test matched: mask is not written
+        return true;
+      }
+    };
+
+    template<int N, int Nx> // BVH_AN2_UN2, robust=false: handles motion-blur AABB and OBB nodes, rejects everything else
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2_UN2, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask) // returns true only when one of the two node tests matched
+      {
+        if (likely(node.isAABBNodeMB()))           mask = intersectNode(node.getAABBNodeMB(), ray, time, dist); // branch hinted as the common case
+        else if (unlikely(node.isOBBNodeMB()))  mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); // OBB motion-blur node, accessed through ungetAABBNodeMB()
+        else return false; // neither test matched: mask is not written
+        return true;
+      }
+    };
+
+    template<int N, int Nx> // BVH_AN2_UN2, robust=true: handles motion-blur AABB and OBB nodes, rejects everything else
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2_UN2, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) // returns true only when one of the two node tests matched
+      {
+        if (likely(node.isAABBNodeMB()))           mask = intersectNodeRobust(node.getAABBNodeMB(), ray, time, dist); // branch hinted as the common case
+        else if (unlikely(node.isOBBNodeMB()))  mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); // NOTE(review): plain intersectNode here, not intersectNodeRobust - presumably an overload taking the robust TravRay; confirm
+        else return false; // neither test matched: mask is not written
+        return true;
+      }
+    };
+
+    template<int N, int Nx> // BVH_AN2_AN4D_UN2, robust=false: OBB motion-blur nodes are special-cased, the rest goes to intersectNodeMB4D
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D_UN2, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask) // returns false for a leaf, true once mask has been assigned
+      {
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask is not written
+        if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); // OBB motion-blur node, accessed through ungetAABBNodeMB()
+        else                                    mask = intersectNodeMB4D(node, ray, time, dist); // every other non-leaf reference; the helper receives the NodeRef itself
+        return true;
+      }
+    };
+
+    template<int N, int Nx> // BVH_AN2_AN4D_UN2, robust=true: OBB motion-blur nodes are special-cased, the rest goes to intersectNodeMB4DRobust
+    struct BVHNNodeIntersector1<N, Nx, BVH_AN2_AN4D_UN2, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) // returns false for a leaf, true once mask has been assigned
+      {
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask is not written
+        if (unlikely(node.isOBBNodeMB())) mask = intersectNode(node.ungetAABBNodeMB(), ray, time, dist); // NOTE(review): plain intersectNode here, not a *Robust helper - presumably an overload taking the robust TravRay; confirm
+        else                                    mask = intersectNodeMB4DRobust(node, ray, time, dist); // every other non-leaf reference; the helper receives the NodeRef itself
+        return true;
+      }
+    };
+
+    template<int N, int Nx> // BVH_QN1, robust=false: every non-leaf reference is handled as a QuantizedNode
+    struct BVHNNodeIntersector1<N, Nx, BVH_QN1, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,false>& ray, float time, vfloat<Nx>& dist, size_t& mask) // `time` is accepted but not used in this specialization
+      {
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask is not written
+        mask = intersectNode((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist); // result of quantizedNode() is cast to QuantizedNode* before the test
+        return true;
+      }
+    };
+
+    template<int N, int Nx> // BVH_QN1, robust=true: same shape as the fast variant but calls intersectNodeRobust
+    struct BVHNNodeIntersector1<N, Nx, BVH_QN1, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, const TravRay<N,Nx,true>& ray, float time, vfloat<Nx>& dist, size_t& mask) // `time` is accepted but not used in this specialization
+      {
+        if (unlikely(node.isLeaf())) return false; // leaf reference: mask is not written
+        mask = intersectNodeRobust((const typename BVHN<N>::QuantizedNode*)node.quantizedNode(), ray, dist); // result of quantizedNode() is cast to QuantizedNode* before the test
+        return true;
+      }
+    };
+
+    /*! Intersects the children of a quantized base node with 1 ray (both specializations take a single TravRay) */
+    template<int N, int Nx, bool robust> // `robust` selects the TravRay<N,Nx,robust> flavour accepted by intersect()
+      struct BVHNQuantizedBaseNodeIntersector1;
+
+    template<int N, int Nx> // robust=false: thin forwarders to the intersectNode overloads for quantized base nodes
+      struct BVHNQuantizedBaseNodeIntersector1<N, Nx, false>
+    {
+      static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,Nx,false>& ray, vfloat<Nx>& dist) // QuantizedBaseNode overload: no time argument
+      {
+        return intersectNode(node,ray,dist); // returns the helper's size_t result unchanged
+      }
+
+      static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,false>& ray, const float time, vfloat<N>& dist) // NOTE(review): dist is vfloat<N> here but vfloat<Nx> in the overload above - confirm this is intended when N != Nx
+      {
+        return intersectNode(node,ray,time,dist); // returns the helper's size_t result unchanged
+      }
+
+    };
+
+    template<int N, int Nx> // robust=true: forwards to intersectNode with a robust TravRay (no *Robust helper name is used here)
+      struct BVHNQuantizedBaseNodeIntersector1<N, Nx, true>
+    {
+      static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNode* node, const TravRay<N,Nx,true>& ray, vfloat<Nx>& dist) // QuantizedBaseNode overload: no time argument
+      {
+        return intersectNode(node,ray,dist); // presumably resolves to an overload taking TravRay<N,Nx,true> - confirm
+      }
+
+      static __forceinline size_t intersect(const typename BVHN<N>::QuantizedBaseNodeMB* node, const TravRay<N,Nx,true>& ray, const float time, vfloat<N>& dist) // NOTE(review): dist is vfloat<N> here but vfloat<Nx> in the overload above - confirm this is intended when N != Nx
+      {
+        return intersectNode(node,ray,time,dist); // returns the helper's size_t result unchanged
+      }
+
+    };
+
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h
new file mode 100644
index 0000000000..800ac8b478
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_frustum.h
@@ -0,0 +1,269 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Frustum structure used in hybrid and stream traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    /*
+       Optimized frustum test. We calculate t=(p-org)/dir in ray/box
+       intersection. We assume the rays are split by octant, thus
+       dir intervals are either positive or negative in each
+       dimension.
+
+       Case 1: dir.min >= 0 && dir.max >= 0:
+         t_min = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min
+         t_max = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max
+
+       Case 2: dir.min < 0 && dir.max < 0:
+         t_min = (p_max - org_min) / dir_min = (p_max - org_min)*rdir_max = p_max*rdir_max - org_min*rdir_max
+         t_max = (p_min - org_max) / dir_max = (p_min - org_max)*rdir_min = p_min*rdir_min - org_max*rdir_min
+    */
+
+    template<bool robust> // primary template is only declared; Frustum<false> and Frustum<true> are specialized below
+    struct Frustum;
+    
+    /* Fast variant: caches rdir bounds plus org*rdir products (stored negated on aarch64) for the madd/msub test in intersectNodeFrustum */
+    template<>
+    struct Frustum<false>
+    {
+      __forceinline Frustum() {} // empty body: performs no initialization itself
+
+      template<int K>
+      __forceinline Frustum(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) // builds the frustum from a packet of K rays
+      {
+        init(valid, org, rdir, ray_tnear, ray_tfar, N);
+      }
+
+      template<int K>
+      __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) // reduces the packet to scalar bounds, then calls the scalar init()
+      {
+        const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), // min reductions substitute pos_inf through select(valid, ...)
+                                     reduce_min(select(valid, org.y, pos_inf)),
+                                     reduce_min(select(valid, org.z, pos_inf)));
+
+        const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), // max reductions substitute neg_inf through select(valid, ...)
+                                     reduce_max(select(valid, org.y, neg_inf)),
+                                     reduce_max(select(valid, org.z, neg_inf)));
+
+        const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)),
+                                      reduce_min(select(valid, rdir.y, pos_inf)),
+                                      reduce_min(select(valid, rdir.z, pos_inf)));
+
+        const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)),
+                                      reduce_max(select(valid, rdir.y, neg_inf)),
+                                      reduce_max(select(valid, rdir.z, neg_inf)));
+
+        const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf))); // smallest tnear over the selected lanes
+        const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf))); // largest tfar over the selected lanes
+
+        init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N);
+      }
+
+      __forceinline void init(const Vec3fa& reduced_min_org, // scalar init: stores the per-axis values used by the frustum/box test
+                              const Vec3fa& reduced_max_org,
+                              const Vec3fa& reduced_min_rdir,
+                              const Vec3fa& reduced_max_rdir,
+                              float reduced_min_dist,
+                              float reduced_max_dist,
+                              int N)
+      {
+        const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); // per-axis mask from comparing reduced_min_rdir against zero
+
+        min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); // per axis: min/max rdir are swapped where pos_rdir is not set
+        max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir);
+
+#if defined (__aarch64__)
+        neg_min_org_rdir = -(min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org)); // negated product, so the aarch64 test path can add it with madd
+        neg_max_org_rdir = -(max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org));
+#else
+        min_org_rdir = min_rdir * select(pos_rdir, reduced_max_org, reduced_min_org); // un-negated product, subtracted with msub in the non-aarch64 test path
+        max_org_rdir = max_rdir * select(pos_rdir, reduced_min_org, reduced_max_org);
+#endif
+        min_dist = reduced_min_dist;
+        max_dist = reduced_max_dist;
+
+        nf = NearFarPrecalculations(min_rdir, N); // built from min_rdir and N; supplies the nearX..farZ byte offsets read by intersectNodeFrustum
+      }
+
+      template<int K>
+      __forceinline void updateMaxDist(const vfloat<K>& ray_tfar) // overwrites max_dist; note there is no valid mask here, all K lanes take part
+      {
+        max_dist = reduce_max(ray_tfar);
+      }
+
+      NearFarPrecalculations nf;
+
+      Vec3fa min_rdir; // see init(): selected from the reduced rdir bounds by pos_rdir
+      Vec3fa max_rdir;
+
+#if defined (__aarch64__)
+      Vec3fa neg_min_org_rdir; // -(min_rdir * selected org), see init()
+      Vec3fa neg_max_org_rdir; // -(max_rdir * selected org), see init()
+#else
+      Vec3fa min_org_rdir; // min_rdir * selected org, see init()
+      Vec3fa max_org_rdir; // max_rdir * selected org, see init()
+#endif
+      float min_dist; // lower clamp applied to the entry distance in intersectNodeFrustum
+      float max_dist; // upper clamp applied to the exit distance in intersectNodeFrustum
+    };
+
+    typedef Frustum<false> FrustumFast;
+
+    /* Robust variant: keeps the selected origins (min_org/max_org) instead of precomputed org*rdir products */
+    template<>
+    struct Frustum<true>
+    {
+      __forceinline Frustum() {} // empty body: performs no initialization itself
+
+      template<int K>
+      __forceinline Frustum(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) // builds the frustum from a packet of K rays
+      {
+        init(valid, org, rdir, ray_tnear, ray_tfar, N);
+      }
+
+      template<int K>
+      __forceinline void init(const vbool<K>& valid, const Vec3vf<K>& org, const Vec3vf<K>& rdir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) // reduces the packet to scalar bounds, then calls the scalar init()
+      {
+        const Vec3fa reduced_min_org(reduce_min(select(valid, org.x, pos_inf)), // min reductions substitute pos_inf through select(valid, ...)
+                                     reduce_min(select(valid, org.y, pos_inf)),
+                                     reduce_min(select(valid, org.z, pos_inf)));
+
+        const Vec3fa reduced_max_org(reduce_max(select(valid, org.x, neg_inf)), // max reductions substitute neg_inf through select(valid, ...)
+                                     reduce_max(select(valid, org.y, neg_inf)),
+                                     reduce_max(select(valid, org.z, neg_inf)));
+
+        const Vec3fa reduced_min_rdir(reduce_min(select(valid, rdir.x, pos_inf)),
+                                      reduce_min(select(valid, rdir.y, pos_inf)),
+                                      reduce_min(select(valid, rdir.z, pos_inf)));
+
+        const Vec3fa reduced_max_rdir(reduce_max(select(valid, rdir.x, neg_inf)),
+                                      reduce_max(select(valid, rdir.y, neg_inf)),
+                                      reduce_max(select(valid, rdir.z, neg_inf)));
+
+        const float reduced_min_dist = reduce_min(select(valid, ray_tnear, vfloat<K>(pos_inf))); // smallest tnear over the selected lanes
+        const float reduced_max_dist = reduce_max(select(valid, ray_tfar , vfloat<K>(neg_inf))); // largest tfar over the selected lanes
+
+        init(reduced_min_org, reduced_max_org, reduced_min_rdir, reduced_max_rdir, reduced_min_dist, reduced_max_dist, N);
+      }
+
+      __forceinline void init(const Vec3fa& reduced_min_org, // scalar init: stores the per-axis values used by the frustum/box test
+                              const Vec3fa& reduced_max_org,
+                              const Vec3fa& reduced_min_rdir,
+                              const Vec3fa& reduced_max_rdir,
+                              float reduced_min_dist,
+                              float reduced_max_dist,
+                              int N)
+      {
+        const Vec3ba pos_rdir = ge_mask(reduced_min_rdir, Vec3fa(zero)); // per-axis mask from comparing reduced_min_rdir against zero
+        min_rdir = select(pos_rdir, reduced_min_rdir, reduced_max_rdir); // per axis: min/max rdir are swapped where pos_rdir is not set
+        max_rdir = select(pos_rdir, reduced_max_rdir, reduced_min_rdir);
+
+        min_org = select(pos_rdir, reduced_max_org, reduced_min_org); // note the crossover: min_org takes reduced_max_org where pos_rdir is set
+        max_org = select(pos_rdir, reduced_min_org, reduced_max_org);
+
+        min_dist = reduced_min_dist;
+        max_dist = reduced_max_dist;
+
+        nf = NearFarPrecalculations(min_rdir, N); // built from min_rdir and N; supplies the nearX..farZ byte offsets read by intersectNodeFrustum
+      }
+
+      template<int K>
+      __forceinline void updateMaxDist(const vfloat<K>& ray_tfar) // overwrites max_dist; note there is no valid mask here, all K lanes take part
+      {
+        max_dist = reduce_max(ray_tfar);
+      }
+
+      NearFarPrecalculations nf;
+
+      Vec3fa min_rdir; // see init(): selected from the reduced rdir bounds by pos_rdir
+      Vec3fa max_rdir;
+
+      Vec3fa min_org; // origin subtracted from the near-plane bounds in intersectNodeFrustum
+      Vec3fa max_org; // origin subtracted from the far-plane bounds in intersectNodeFrustum
+
+      float min_dist; // lower clamp applied to the entry distance in intersectNodeFrustum
+      float max_dist; // upper clamp applied to the exit distance in intersectNodeFrustum
+    };
+
+    typedef Frustum<true> FrustumRobust;
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx> // fast frustum-vs-node test; returns a bit mask limited to the low N bits
+    __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node,
+                                       const FrustumFast& frustum, vfloat<Nx>& dist)
+    {
+      const vfloat<Nx> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX); // bounds are fetched at byte offsets (from nf) relative to &node->lower_x
+      const vfloat<Nx> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY);
+      const vfloat<Nx> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ);
+      const vfloat<Nx> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX);
+      const vfloat<Nx> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY);
+      const vfloat<Nx> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ);
+
+#if defined (__aarch64__) // aarch64: madd with the pre-negated org*rdir product
+      const vfloat<Nx> fminX = madd(bminX, vfloat<Nx>(frustum.min_rdir.x), vfloat<Nx>(frustum.neg_min_org_rdir.x));
+      const vfloat<Nx> fminY = madd(bminY, vfloat<Nx>(frustum.min_rdir.y), vfloat<Nx>(frustum.neg_min_org_rdir.y));
+      const vfloat<Nx> fminZ = madd(bminZ, vfloat<Nx>(frustum.min_rdir.z), vfloat<Nx>(frustum.neg_min_org_rdir.z));
+      const vfloat<Nx> fmaxX = madd(bmaxX, vfloat<Nx>(frustum.max_rdir.x), vfloat<Nx>(frustum.neg_max_org_rdir.x));
+      const vfloat<Nx> fmaxY = madd(bmaxY, vfloat<Nx>(frustum.max_rdir.y), vfloat<Nx>(frustum.neg_max_org_rdir.y));
+      const vfloat<Nx> fmaxZ = madd(bmaxZ, vfloat<Nx>(frustum.max_rdir.z), vfloat<Nx>(frustum.neg_max_org_rdir.z));
+#else // other targets: msub with the un-negated org*rdir product
+      const vfloat<Nx> fminX = msub(bminX, vfloat<Nx>(frustum.min_rdir.x), vfloat<Nx>(frustum.min_org_rdir.x));
+      const vfloat<Nx> fminY = msub(bminY, vfloat<Nx>(frustum.min_rdir.y), vfloat<Nx>(frustum.min_org_rdir.y));
+      const vfloat<Nx> fminZ = msub(bminZ, vfloat<Nx>(frustum.min_rdir.z), vfloat<Nx>(frustum.min_org_rdir.z));
+      const vfloat<Nx> fmaxX = msub(bmaxX, vfloat<Nx>(frustum.max_rdir.x), vfloat<Nx>(frustum.max_org_rdir.x));
+      const vfloat<Nx> fmaxY = msub(bmaxY, vfloat<Nx>(frustum.max_rdir.y), vfloat<Nx>(frustum.max_org_rdir.y));
+      const vfloat<Nx> fmaxZ = msub(bmaxZ, vfloat<Nx>(frustum.max_rdir.z), vfloat<Nx>(frustum.max_org_rdir.z));
+#endif
+      const vfloat<Nx> fmin  = maxi(fminX, fminY, fminZ, vfloat<Nx>(frustum.min_dist)); // entry distance, clamped from below by frustum.min_dist
+      dist = fmin; // dist is written for every lane, hit or not
+      const vfloat<Nx> fmax  = mini(fmaxX, fmaxY, fmaxZ, vfloat<Nx>(frustum.max_dist)); // exit distance, clamped from above by frustum.max_dist
+      const vbool<Nx> vmask_node_hit = fmin <= fmax;
+      size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1); // keep only the low N bits of the Nx-wide mask
+      return m_node;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx> // robust frustum-vs-node test; returns a bit mask limited to the low N bits
+    __forceinline size_t intersectNodeFrustum(const typename BVHN<N>::AABBNode* __restrict__ node,
+                                       const FrustumRobust& frustum, vfloat<Nx>& dist)
+    {
+      const vfloat<Nx> bminX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearX); // bounds are fetched at byte offsets (from nf) relative to &node->lower_x
+      const vfloat<Nx> bminY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearY);
+      const vfloat<Nx> bminZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.nearZ);
+      const vfloat<Nx> bmaxX = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farX);
+      const vfloat<Nx> bmaxY = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farY);
+      const vfloat<Nx> bmaxZ = *(const vfloat<N>*)((const char*)&node->lower_x + frustum.nf.farZ);
+
+      const vfloat<Nx> fminX = (bminX - vfloat<Nx>(frustum.min_org.x)) * vfloat<Nx>(frustum.min_rdir.x); // subtract-then-multiply form, unlike the madd/msub form of the fast variant
+      const vfloat<Nx> fminY = (bminY - vfloat<Nx>(frustum.min_org.y)) * vfloat<Nx>(frustum.min_rdir.y);
+      const vfloat<Nx> fminZ = (bminZ - vfloat<Nx>(frustum.min_org.z)) * vfloat<Nx>(frustum.min_rdir.z);
+      const vfloat<Nx> fmaxX = (bmaxX - vfloat<Nx>(frustum.max_org.x)) * vfloat<Nx>(frustum.max_rdir.x);
+      const vfloat<Nx> fmaxY = (bmaxY - vfloat<Nx>(frustum.max_org.y)) * vfloat<Nx>(frustum.max_rdir.y);
+      const vfloat<Nx> fmaxZ = (bmaxZ - vfloat<Nx>(frustum.max_org.z)) * vfloat<Nx>(frustum.max_rdir.z);
+
+      const float round_down = 1.0f-2.0f*float(ulp); // FIXME: use per instruction rounding for AVX512
+      const float round_up   = 1.0f+2.0f*float(ulp); // scale factors of 1 -/+ 2*ulp applied in the hit comparison below
+      const vfloat<Nx> fmin  = max(fminX, fminY, fminZ, vfloat<Nx>(frustum.min_dist)); // entry distance, clamped from below by frustum.min_dist
+      dist = fmin; // dist receives the unscaled entry distance
+      const vfloat<Nx> fmax  = min(fmaxX, fmaxY, fmaxZ, vfloat<Nx>(frustum.max_dist)); // exit distance, clamped from above by frustum.max_dist
+      const vbool<Nx> vmask_node_hit = (round_down*fmin <= round_up*fmax); // both sides are scaled before comparing
+      size_t m_node = movemask(vmask_node_hit) & (((size_t)1 << N)-1); // keep only the low N bits of the Nx-wide mask
+      return m_node;
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h
new file mode 100644
index 0000000000..0543e56f8e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet.h
@@ -0,0 +1,843 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Ray packet structure used in hybrid traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int K, bool robust> // primary template is only declared; TravRayK<K,false> and TravRayK<K,true> are specialized below
+    struct TravRayK;
+
+    /* Fast variant: rdir comes from rcp_safe(); an org*rdir product is cached on aarch64 (negated) and AVX2 */
+    template<int K>
+    struct TravRayK<K, false>
+    {
+      __forceinline TravRayK() {} // empty body: performs no initialization itself
+
+      __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) // tnear/tfar are not assigned by this constructor
+      {
+        init(ray_org, ray_dir, N);
+      }
+
+      __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) // also stores the ray interval
+      {
+        init(ray_org, ray_dir, N);
+        tnear = ray_tnear;
+        tfar = ray_tfar;
+      }
+
+      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) // fills org, dir, rdir, the cached product (if any) and, when N != 0, nearXYZ
+      {
+        org = ray_org;
+        dir = ray_dir;
+        rdir = rcp_safe(ray_dir);
+#if defined(__aarch64__)
+        neg_org_rdir = -(org * rdir); // negated product, added with madd in intersectNodeK
+#elif defined(__AVX2__)
+        org_rdir = org * rdir; // un-negated product, subtracted with msub in intersectNodeK
+#endif
+        if (N) // nearXYZ is left unassigned when N == 0
+        {
+          const int size = sizeof(float)*N; // byte size of N floats
+          nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size)); // per-ray byte offset: 0*size or 1*size depending on the sign test on rdir.x
+          nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size)); // 2*size or 3*size for y
+          nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), vint<K>(5*size)); // 4*size or 5*size for z
+        }
+      }
+
+      Vec3vf<K> org;
+      Vec3vf<K> dir;
+      Vec3vf<K> rdir; // rcp_safe(dir), see init()
+#if defined(__aarch64__)
+      Vec3vf<K> neg_org_rdir; // -(org * rdir)
+#elif defined(__AVX2__)
+      Vec3vf<K> org_rdir; // org * rdir
+#endif
+      Vec3vi<K> nearXYZ; // per-ray byte offsets computed in init() when N != 0
+      vfloat<K> tnear; // assigned only by the five-argument constructor
+      vfloat<K> tfar; // assigned only by the five-argument constructor
+    };
+
+    template<int K>
+    using TravRayKFast = TravRayK<K, false>;
+
+    /* Robust variant: rdir is 1/zero_fix(dir) and no org*rdir product is cached */
+    template<int K>
+    struct TravRayK<K, true>
+    {
+      __forceinline TravRayK() {} // empty body: performs no initialization itself
+
+      __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) // tnear/tfar are not assigned by this constructor
+      {
+        init(ray_org, ray_dir, N);
+      }
+
+      __forceinline TravRayK(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar, int N) // also stores the ray interval
+      {
+        init(ray_org, ray_dir, N);
+        tnear = ray_tnear;
+        tfar = ray_tfar;
+      }
+
+      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, int N) // fills org, dir, rdir and, when N != 0, nearXYZ
+      {
+        org = ray_org;
+        dir = ray_dir;
+        rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir)); // explicit division of 1.0f by zero_fix(dir), where the fast variant uses rcp_safe
+
+        if (N) // nearXYZ is left unassigned when N == 0
+        {
+          const int size = sizeof(float)*N; // byte size of N floats
+          nearXYZ.x = select(rdir.x >= 0.0f, vint<K>(0*size), vint<K>(1*size)); // per-ray byte offset: 0*size or 1*size depending on the sign test on rdir.x
+          nearXYZ.y = select(rdir.y >= 0.0f, vint<K>(2*size), vint<K>(3*size)); // 2*size or 3*size for y
+          nearXYZ.z = select(rdir.z >= 0.0f, vint<K>(4*size), vint<K>(5*size)); // 4*size or 5*size for z
+        }
+      }
+
+      Vec3vf<K> org;
+      Vec3vf<K> dir;
+      Vec3vf<K> rdir; // 1/zero_fix(dir), see init()
+      Vec3vi<K> nearXYZ; // per-ray byte offsets computed in init() when N != 0
+      vfloat<K> tnear; // assigned only by the five-argument constructor
+      vfloat<K> tfar; // assigned only by the five-argument constructor
+    };
+
+    template<int K>
+    using TravRayKRobust = TravRayK<K, true>;
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K> // fast test of K rays against child i of an AABBNode; returns the per-ray hit mask
+    __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNode* node, size_t i,
+                                         const TravRayKFast<K>& ray, vfloat<K>& dist)
+
+    {
+#if defined(__aarch64__) // aarch64: madd with the pre-negated org*rdir product
+      const vfloat<K> lclipMinX = madd(node->lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(node->lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(node->lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(node->upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(node->upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(node->upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__) // AVX2: msub with the cached org*rdir product
+      const vfloat<K> lclipMinX = msub(node->lower_x[i], ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMinY = msub(node->lower_y[i], ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMinZ = msub(node->lower_z[i], ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> lclipMaxX = msub(node->upper_x[i], ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMaxY = msub(node->upper_y[i], ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMaxZ = msub(node->upper_z[i], ray.rdir.z, ray.org_rdir.z);
+  #else // fallback: subtract the origin, then multiply by rdir
+      const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z;
+  #endif
+
+  #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      if (K == 16)
+      {
+        /* use mixed float/int min/max */
+        const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); // both sides go through asInt() before the comparison
+        dist = lnearP;
+        return lhit;
+      }
+      else
+  #endif
+      {
+        const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); // largest of the three per-axis entry distances
+        const vfloat<K> lfarP  = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); // smallest of the three per-axis exit distances
+  #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); // both sides go through asInt() before the comparison
+  #else
+        const vbool<K> lhit    = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); // hit when the interval clamped to [tnear, tfar] is non-empty
+  #endif
+        dist = lnearP; // dist receives the entry distance before clamping to tnear
+        return lhit;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K> // robust test of K rays against child i of an AABBNode; returns the per-ray hit mask
+    __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNode* node, size_t i,
+                                               const TravRayKRobust<K>& ray, vfloat<K>& dist)
+    {
+      // FIXME: use per instruction rounding for AVX512
+      const vfloat<K> lclipMinX = (node->lower_x[i] - ray.org.x) * ray.rdir.x; // subtract the origin, then multiply by rdir (no cached product in the robust ray)
+      const vfloat<K> lclipMinY = (node->lower_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (node->lower_z[i] - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (node->upper_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (node->upper_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (node->upper_z[i] - ray.org.z) * ray.rdir.z;
+      const float round_up   = 1.0f+3.0f*float(ulp); // scale factors of 1 +/- 3*ulp applied to the exit/entry distances
+      const float round_down = 1.0f-3.0f*float(ulp);
+      const vfloat<K> lnearP = round_down*max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ)); // entry distance, scaled by round_down
+      const vfloat<K> lfarP  = round_up  *min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ)); // exit distance, scaled by round_up
+      const vbool<K> lhit   = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar); // hit when the interval clamped to [tnear, tfar] is non-empty
+      dist = lnearP; // dist receives the scaled entry distance before clamping to tnear
+      return lhit;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int K> // fast test of K rays against child i of an AABBNodeMB at per-ray `time`; returns the per-ray hit mask
+    __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::AABBNodeMB* node, const size_t i,
+                                         const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    {
+      const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); // bound at `time`: time * delta + base, per ray
+      const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+      const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+      const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+      const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+      const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+
+#if defined(__aarch64__) // aarch64: madd with the pre-negated org*rdir product
+      const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__) // AVX2: msub with the cached org*rdir product
+      const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z);
+#else // fallback: subtract the origin, then multiply by rdir
+      const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+#endif
+
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      if (K == 16)
+      {
+        /* use mixed float/int min/max */
+        const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); // both sides go through asInt() before the comparison
+        dist = lnearP;
+        return lhit;
+      }
+      else
+#endif
+      {
+        const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ)); // largest of the three per-axis entry distances
+        const vfloat<K> lfarP  = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ)); // smallest of the three per-axis exit distances
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); // both sides go through asInt() before the comparison
+#else
+        const vbool<K> lhit    = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); // hit when the interval clamped to [tnear, tfar] is non-empty
+#endif
+        dist = lnearP; // dist receives the entry distance before clamping to tnear
+        return lhit;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNodeMB intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+    /* Slab test of child i of a motion-blur AABB node against K rays; returns the per-ray hit mask and stores the scaled near distance in dist. */
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeKRobust(const typename BVHN<N>::AABBNodeMB* node, const size_t i,
+                                               const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    {
+      const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i])); // bound at the ray's time; presumably lower_x + time*lower_dx (madd(a,b,c) = a*b+c, confirm in the SIMD headers)
+      const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+      const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+      const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+      const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+      const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+      /* per-axis slab distances, always formed as (bound - org) * rdir in this robust variant */
+      const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+      /* multiplicative factors applied to the near (round_down) and far (round_up) distances; ulp is defined outside this function */
+      const float round_up   = 1.0f+3.0f*float(ulp);
+      const float round_down = 1.0f-3.0f*float(ulp);
+
+#if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      if (K == 16) // 16-wide packets: min/max for the per-axis step, mini/maxi for the reduction across axes
+      {
+        const vfloat<K> lnearP = round_down*maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = round_up  *mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+        const vbool<K>  lhit   = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+        dist = lnearP;
+        return lhit;
+      }
+      else
+#endif
+      { // all other widths/ISAs: mini/maxi throughout
+        const vfloat<K> lnearP = round_down*maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = round_up  *mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+        const vbool<K>  lhit   = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar); // hit when the clamped interval [max(near,tnear), min(far,tfar)] is non-empty
+        dist = lnearP; // reported distance is the scaled near value, not clamped to ray.tnear
+        return lhit;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNodeMB4D intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+    /* Slab test of child i of a motion-blur AABB node (optionally a 4D node with a time range) against K rays; returns the hit mask, writes the near distance to dist. */
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeKMB4D(const typename BVHN<N>::NodeRef ref, const size_t i,
+                                             const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+      /* bounds at the ray's time; presumably base + time*delta (madd(a,b,c) = a*b+c, confirm in the SIMD headers) */
+      const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i]));
+      const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+      const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+      const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+      const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+      const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+      /* three per-architecture spellings of the slab distance; the generic branch shows the intended value (bound - org) * rdir */
+#if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(vlower_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(vlower_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(vlower_z, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(vupper_x, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(vupper_y, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(vupper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
+      const vfloat<K> lclipMinX = msub(vlower_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMinY = msub(vlower_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMinZ = msub(vlower_z, ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> lclipMaxX = msub(vupper_x, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMaxY = msub(vupper_y, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMaxZ = msub(vupper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+      const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+#endif
+      /* near = largest per-axis minimum, far = smallest per-axis maximum (nested two-argument mini/maxi) */
+      const vfloat<K> lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ));
+      const vfloat<K> lfarP  = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ));
+      vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+      if (unlikely(ref.isAABBNodeMB4D())) { // 4D nodes additionally carry a per-child time range
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; // same pointer, viewed as the 4D node type
+        lhit = lhit & (vfloat<K>(node1->lower_t[i]) <= time) & (time < vfloat<K>(node1->upper_t[i])); // keep only rays with lower_t[i] <= time < upper_t[i]
+      }
+      dist = lnearP; // near distance, not clamped to ray.tnear
+      return lhit;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNodeMB4D intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+    /* Robust counterpart of intersectNodeKMB4D: same slab test, with near/far scaled by (1 -/+ 3*ulp) before the comparison. */
+    template<int N, int K>
+    __forceinline vbool<K> intersectNodeKMB4DRobust(const typename BVHN<N>::NodeRef ref, const size_t i,
+                                                    const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    {
+      const typename BVHN<N>::AABBNodeMB* node = ref.getAABBNodeMB();
+      /* bounds at the ray's time; presumably base + time*delta (madd(a,b,c) = a*b+c, confirm in the SIMD headers) */
+      const vfloat<K> vlower_x = madd(time, vfloat<K>(node->lower_dx[i]), vfloat<K>(node->lower_x[i]));
+      const vfloat<K> vlower_y = madd(time, vfloat<K>(node->lower_dy[i]), vfloat<K>(node->lower_y[i]));
+      const vfloat<K> vlower_z = madd(time, vfloat<K>(node->lower_dz[i]), vfloat<K>(node->lower_z[i]));
+      const vfloat<K> vupper_x = madd(time, vfloat<K>(node->upper_dx[i]), vfloat<K>(node->upper_x[i]));
+      const vfloat<K> vupper_y = madd(time, vfloat<K>(node->upper_dy[i]), vfloat<K>(node->upper_y[i]));
+      const vfloat<K> vupper_z = madd(time, vfloat<K>(node->upper_dz[i]), vfloat<K>(node->upper_z[i]));
+      /* per-axis slab distances, always formed as (bound - org) * rdir in this robust variant */
+      const vfloat<K> lclipMinX = (vlower_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (vlower_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (vlower_z - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (vupper_x - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (vupper_y - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (vupper_z - ray.org.z) * ray.rdir.z;
+      /* multiplicative factors for the far (round_up) and near (round_down) distances; ulp is defined outside this function */
+      const float round_up   = 1.0f+3.0f*float(ulp);
+      const float round_down = 1.0f-3.0f*float(ulp);
+      const vfloat<K> lnearP = round_down*maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ));
+      const vfloat<K> lfarP  = round_up  *mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ));
+      vbool<K> lhit = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+
+      if (unlikely(ref.isAABBNodeMB4D())) { // 4D nodes additionally carry a per-child time range
+        const typename BVHN<N>::AABBNodeMB4D* node1 = (const typename BVHN<N>::AABBNodeMB4D*) node; // same pointer, viewed as the 4D node type
+        lhit = lhit & (vfloat<K>(node1->lower_t[i]) <= time) & (time < vfloat<K>(node1->upper_t[i])); // keep only rays with lower_t[i] <= time < upper_t[i]
+      }
+      dist = lnearP; // scaled near distance, not clamped to ray.tnear
+      return lhit;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // OBBNode intersection (fast or robust, selected by the `robust` template parameter)
+    //////////////////////////////////////////////////////////////////////////////////////
+    /* Transforms K rays into the space of child i of an oriented-box node and slab-tests them against the box [0,1]^3 (see the zero/one notes below). */
+    template<int N, int K, bool robust>
+    __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNode* node, const size_t i,
+                                          const TravRayK<K,robust>& ray, vfloat<K>& dist)
+    {
+      const AffineSpace3vf<K> naabb(Vec3f(node->naabb.l.vx.x[i], node->naabb.l.vx.y[i], node->naabb.l.vx.z[i]), // child i's affine transform, gathered from the node's per-component arrays
+                                    Vec3f(node->naabb.l.vy.x[i], node->naabb.l.vy.y[i], node->naabb.l.vy.z[i]),
+                                    Vec3f(node->naabb.l.vz.x[i], node->naabb.l.vz.y[i], node->naabb.l.vz.z[i]),
+                                    Vec3f(node->naabb.p   .x[i], node->naabb.p   .y[i], node->naabb.p   .z[i]));
+      /* ray direction and origin expressed in the node's space; nrdir is the negated reciprocal direction */
+      const Vec3vf<K> dir = xfmVector(naabb, ray.dir);
+      const Vec3vf<K> nrdir = Vec3vf<K>(vfloat<K>(-1.0f)) * rcp_safe(dir); // FIXME: negate instead of mul with -1?
+      const Vec3vf<K> org = xfmPoint(naabb, ray.org);
+      /* with nrdir = -rdir: org*nrdir = (0 - org)*rdir, and subtracting nrdir adds rdir, giving (1 - org)*rdir */
+      const vfloat<K> lclipMinX = org.x * nrdir.x; // (Vec3fa(zero) - org) * rdir;
+      const vfloat<K> lclipMinY = org.y * nrdir.y;
+      const vfloat<K> lclipMinZ = org.z * nrdir.z;
+      const vfloat<K> lclipMaxX  = lclipMinX - nrdir.x; // (Vec3fa(one) - org) * rdir;
+      const vfloat<K> lclipMaxY  = lclipMinY - nrdir.y;
+      const vfloat<K> lclipMaxZ  = lclipMinZ - nrdir.z;
+
+      vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+      vfloat<K> lfarP  = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+      if (robust) { // compile-time constant: robust mode scales near by (1 - 3*ulp) and far by (1 + 3*ulp)
+        lnearP = lnearP*vfloat<K>(1.0f-3.0f*float(ulp));
+        lfarP  = lfarP *vfloat<K>(1.0f+3.0f*float(ulp));
+      }
+      const vbool<K> lhit    = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+      dist = lnearP; // near distance, not clamped to ray.tnear
+      return lhit;
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // OBBNodeMB intersection (fast or robust, selected by the `robust` template parameter)
+    //////////////////////////////////////////////////////////////////////////////////////
+    /* Transforms K rays into the space of child i of a motion-blur oriented-box node and slab-tests them against bounds interpolated at each ray's time. */
+    template<int N, int K, bool robust>
+    __forceinline vbool<K> intersectNodeK(const typename BVHN<N>::OBBNodeMB* node, const size_t i,
+                                          const TravRayK<K,robust>& ray, const vfloat<K>& time, vfloat<K>& dist)
+    {
+      const AffineSpace3vf<K> xfm(Vec3f(node->space0.l.vx.x[i], node->space0.l.vx.y[i], node->space0.l.vx.z[i]), // child i's affine transform (space0), gathered from the node's per-component arrays
+                                  Vec3f(node->space0.l.vy.x[i], node->space0.l.vy.y[i], node->space0.l.vy.z[i]),
+                                  Vec3f(node->space0.l.vz.x[i], node->space0.l.vz.y[i], node->space0.l.vz.z[i]),
+                                  Vec3f(node->space0.p   .x[i], node->space0.p   .y[i], node->space0.p   .z[i]));
+      /* first box is fixed at [zero, one]; second box b1 comes from the node; both corners are blended with lerp(..., time) */
+      const Vec3vf<K> b0_lower = zero;
+      const Vec3vf<K> b0_upper = one;
+      const Vec3vf<K> b1_lower(node->b1.lower.x[i], node->b1.lower.y[i], node->b1.lower.z[i]);
+      const Vec3vf<K> b1_upper(node->b1.upper.x[i], node->b1.upper.y[i], node->b1.upper.z[i]);
+      const Vec3vf<K> lower = lerp(b0_lower, b1_lower, time);
+      const Vec3vf<K> upper = lerp(b0_upper, b1_upper, time);
+      /* ray direction, reciprocal direction and origin expressed in the node's space */
+      const Vec3vf<K> dir = xfmVector(xfm, ray.dir);
+      const Vec3vf<K> rdir = rcp_safe(dir);
+      const Vec3vf<K> org = xfmPoint(xfm, ray.org);
+      /* per-axis slab distances (bound - org) * rdir */
+      const vfloat<K> lclipMinX = (lower.x - org.x) * rdir.x;
+      const vfloat<K> lclipMinY = (lower.y - org.y) * rdir.y;
+      const vfloat<K> lclipMinZ = (lower.z - org.z) * rdir.z;
+      const vfloat<K> lclipMaxX  = (upper.x - org.x) * rdir.x;
+      const vfloat<K> lclipMaxY  = (upper.y - org.y) * rdir.y;
+      const vfloat<K> lclipMaxZ  = (upper.z - org.z) * rdir.z;
+
+      vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+      vfloat<K> lfarP  = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+      if (robust) { // compile-time constant: robust mode scales near by (1 - 3*ulp) and far by (1 + 3*ulp)
+        lnearP = lnearP*vfloat<K>(1.0f-3.0f*float(ulp));
+        lfarP  = lfarP *vfloat<K>(1.0f+3.0f*float(ulp));
+      }
+      /* hit when the interval clamped to [ray.tnear, ray.tfar] is non-empty */
+      const vbool<K> lhit    = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+      dist = lnearP; // near distance, not clamped to ray.tnear
+      return lhit;
+    }
+
+
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // QuantizedBaseNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+    /* Fast variant: slab-tests child i of a quantized node against K rays; returns the hit mask and writes the near distance to dist. */
+    template<int N, int K>
+    __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i,
+                                                   const TravRayK<K,false>& ray, vfloat<K>& dist)
+
+    {
+      assert(movemask(node->validMask()) & ((size_t)1 << i)); // child i must be set in the node's valid mask
+      const vfloat<N> lower_x = node->dequantizeLowerX(); // bounds are dequantized for all N children; only lane i is read below
+      const vfloat<N> upper_x = node->dequantizeUpperX();
+      const vfloat<N> lower_y = node->dequantizeLowerY();
+      const vfloat<N> upper_y = node->dequantizeUpperY();
+      const vfloat<N> lower_z = node->dequantizeLowerZ();
+      const vfloat<N> upper_z = node->dequantizeUpperZ();
+      /* three per-architecture spellings of the slab distance; the generic branch shows the intended value (bound - org) * rdir */
+  #if defined(__aarch64__)
+      const vfloat<K> lclipMinX = madd(lower_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMinY = madd(lower_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMinZ = madd(lower_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> lclipMaxX = madd(upper_x[i], ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> lclipMaxY = madd(upper_y[i], ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> lclipMaxZ = madd(upper_z[i], ray.rdir.z, ray.neg_org_rdir.z);
+  #elif defined(__AVX2__)
+      const vfloat<K> lclipMinX = msub(lower_x[i], ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMinY = msub(lower_y[i], ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMinZ = msub(lower_z[i], ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> lclipMaxX = msub(upper_x[i], ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> lclipMaxY = msub(upper_y[i], ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> lclipMaxZ = msub(upper_z[i], ray.rdir.z, ray.org_rdir.z);
+  #else
+      const vfloat<K> lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z;
+  #endif
+
+  #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+      if (K == 16)
+      {
+        /* use mixed float/int min/max */
+        const vfloat<K> lnearP = maxi(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = mini(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar)); // final comparison done on the values passed through asInt
+        dist = lnearP;
+        return lhit;
+      }
+      else
+  #endif
+      { // all other widths/ISAs: mini/maxi throughout
+        const vfloat<K> lnearP = maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY), mini(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY), maxi(lclipMinZ, lclipMaxZ));
+  #if defined(__AVX512F__) && !defined(__AVX512ER__) // SKX
+        const vbool<K> lhit    = asInt(maxi(lnearP, ray.tnear)) <= asInt(mini(lfarP, ray.tfar));
+  #else
+        const vbool<K> lhit    = maxi(lnearP, ray.tnear) <= mini(lfarP, ray.tfar);
+  #endif
+        dist = lnearP; // near distance, not clamped to ray.tnear
+        return lhit;
+      }
+    }
+    /* Robust variant: same slab test on child i of a quantized node, using min/max and scaling near/far by (1 -/+ 3*ulp). */
+    template<int N, int K>
+    __forceinline vbool<K> intersectQuantizedNodeK(const typename BVHN<N>::QuantizedBaseNode* node, size_t i,
+          const TravRayK<K,true>& ray, vfloat<K>& dist)
+
+    {
+      assert(movemask(node->validMask()) & ((size_t)1 << i)); // child i must be set in the node's valid mask
+      const vfloat<N> lower_x = node->dequantizeLowerX(); // bounds are dequantized for all N children; only lane i is read below
+      const vfloat<N> upper_x = node->dequantizeUpperX();
+      const vfloat<N> lower_y = node->dequantizeLowerY();
+      const vfloat<N> upper_y = node->dequantizeUpperY();
+      const vfloat<N> lower_z = node->dequantizeLowerZ();
+      const vfloat<N> upper_z = node->dequantizeUpperZ();
+      /* per-axis slab distances, always formed as (bound - org) * rdir in this robust variant */
+      const vfloat<K> lclipMinX = (lower_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMinY = (lower_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMinZ = (lower_z[i] - ray.org.z) * ray.rdir.z;
+      const vfloat<K> lclipMaxX = (upper_x[i] - ray.org.x) * ray.rdir.x;
+      const vfloat<K> lclipMaxY = (upper_y[i] - ray.org.y) * ray.rdir.y;
+      const vfloat<K> lclipMaxZ = (upper_z[i] - ray.org.z) * ray.rdir.z;
+      /* multiplicative factors for the far (round_up) and near (round_down) distances; ulp is defined outside this function */
+      const float round_up   = 1.0f+3.0f*float(ulp);
+      const float round_down = 1.0f-3.0f*float(ulp);
+
+      const vfloat<K> lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+      const vfloat<K> lfarP  = round_up  *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+      const vbool<K> lhit    = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar);
+      dist = lnearP; // scaled near distance, not clamped to ray.tnear
+      return lhit;
+      }
+    /* Fast variant for motion-blur quantized nodes: bounds of child i are dequantized at each ray's time, then slab-tested against K rays. */
+    template<int N, int K>
+      __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+          const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist)
+
+    {
+        assert(movemask(node->validMask()) & ((size_t)1 << i)); // child i must be set in the node's valid mask
+
+        const vfloat<K> lower_x = node->dequantizeLowerX(i,time); // one value per ray: the node is asked for child i at that ray's time
+        const vfloat<K> upper_x = node->dequantizeUpperX(i,time);
+        const vfloat<K> lower_y = node->dequantizeLowerY(i,time);
+        const vfloat<K> upper_y = node->dequantizeUpperY(i,time);
+        const vfloat<K> lower_z = node->dequantizeLowerZ(i,time);
+        const vfloat<K> upper_z = node->dequantizeUpperZ(i,time);
+        /* three per-architecture spellings of the slab distance; the generic branch shows the intended value (bound - org) * rdir */
+#if defined(__aarch64__)
+        const vfloat<K> lclipMinX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x);
+        const vfloat<K> lclipMinY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y);
+        const vfloat<K> lclipMinZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z);
+        const vfloat<K> lclipMaxX = madd(upper_x, ray.rdir.x, ray.neg_org_rdir.x);
+        const vfloat<K> lclipMaxY = madd(upper_y, ray.rdir.y, ray.neg_org_rdir.y);
+        const vfloat<K> lclipMaxZ = madd(upper_z, ray.rdir.z, ray.neg_org_rdir.z);
+#elif defined(__AVX2__)
+        const vfloat<K> lclipMinX = msub(lower_x, ray.rdir.x, ray.org_rdir.x);
+        const vfloat<K> lclipMinY = msub(lower_y, ray.rdir.y, ray.org_rdir.y);
+        const vfloat<K> lclipMinZ = msub(lower_z, ray.rdir.z, ray.org_rdir.z);
+        const vfloat<K> lclipMaxX = msub(upper_x, ray.rdir.x, ray.org_rdir.x);
+        const vfloat<K> lclipMaxY = msub(upper_y, ray.rdir.y, ray.org_rdir.y);
+        const vfloat<K> lclipMaxZ = msub(upper_z, ray.rdir.z, ray.org_rdir.z);
+#else
+        const vfloat<K> lclipMinX = (lower_x - ray.org.x) * ray.rdir.x;
+        const vfloat<K> lclipMinY = (lower_y - ray.org.y) * ray.rdir.y;
+        const vfloat<K> lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z;
+        const vfloat<K> lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x;
+        const vfloat<K> lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y;
+        const vfloat<K> lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z;
+  #endif
+        const vfloat<K> lnearP = max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ)); // this path uses min/max, not mini/maxi
+        const vfloat<K> lfarP  = min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+        const vbool<K> lhit    = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar);
+        dist = lnearP; // near distance, not clamped to ray.tnear
+        return lhit;
+      }
+
+    /* Robust variant for motion-blur quantized nodes: as the fast one, but always (bound - org) * rdir and near/far scaled by (1 -/+ 3*ulp). */
+    template<int N, int K>
+      __forceinline vbool<K> intersectQuantizedNodeMBK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+          const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist)
+
+    {
+        assert(movemask(node->validMask()) & ((size_t)1 << i)); // child i must be set in the node's valid mask
+
+        const vfloat<K> lower_x = node->dequantizeLowerX(i,time); // one value per ray: the node is asked for child i at that ray's time
+        const vfloat<K> upper_x = node->dequantizeUpperX(i,time);
+        const vfloat<K> lower_y = node->dequantizeLowerY(i,time);
+        const vfloat<K> upper_y = node->dequantizeUpperY(i,time);
+        const vfloat<K> lower_z = node->dequantizeLowerZ(i,time);
+        const vfloat<K> upper_z = node->dequantizeUpperZ(i,time);
+
+        const vfloat<K> lclipMinX = (lower_x - ray.org.x) * ray.rdir.x;
+        const vfloat<K> lclipMinY = (lower_y - ray.org.y) * ray.rdir.y;
+        const vfloat<K> lclipMinZ = (lower_z - ray.org.z) * ray.rdir.z;
+        const vfloat<K> lclipMaxX = (upper_x - ray.org.x) * ray.rdir.x;
+        const vfloat<K> lclipMaxY = (upper_y - ray.org.y) * ray.rdir.y;
+        const vfloat<K> lclipMaxZ = (upper_z - ray.org.z) * ray.rdir.z;
+        /* multiplicative factors for the far (round_up) and near (round_down) distances; ulp is defined outside this function */
+        const float round_up   = 1.0f+3.0f*float(ulp);
+        const float round_down = 1.0f-3.0f*float(ulp);
+
+        const vfloat<K> lnearP = round_down*max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY), min(lclipMinZ, lclipMaxZ));
+        const vfloat<K> lfarP  = round_up  *min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY), max(lclipMinZ, lclipMaxZ));
+        const vbool<K> lhit    = max(lnearP, ray.tnear) <= min(lfarP, ray.tfar);
+        dist = lnearP; // scaled near distance, not clamped to ray.tnear
+        return lhit;
+      }
+
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Node intersectors used in hybrid traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    /*! Intersects one child of an N-wide node with K rays. Primary template, declared only: the specializations below are selected by the node-type mask `types` and by `robust`. */
+    template<int N, int K, int types, bool robust>
+    struct BVHNNodeIntersectorK;
+
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN1, false> // static AABB nodes, fast ray packet
+    {
+      /* vmask is both an input and an output parameter! Its initial value should be the parent node
+         hit mask, which is used for correctly computing the current hit mask. The parent hit mask
+         is actually required only for motion blur node intersections (because different rays may
+         have different times), so for regular nodes vmask is simply overwritten. */
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist); // time is not forwarded; vmask is overwritten
+        return true; // this specialization always returns true
+      }
+    };
+    /*! Static AABB nodes, robust ray packet: forwards child i to intersectNodeKRobust. */
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN1, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist); // time is not forwarded; vmask is overwritten
+        return true; // this specialization always returns true
+      }
+    };
+    /*! Motion-blur AABB nodes, fast ray packet: forwards child i and the per-ray time to intersectNodeK. */
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist); // vmask is overwritten, not combined with its previous value
+        return true; // this specialization always returns true
+      }
+    };
+    /*! Motion-blur AABB nodes, robust ray packet: forwards child i and the per-ray time to intersectNodeKRobust. */
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist); // vmask is overwritten, not combined with its previous value
+        return true; // this specialization always returns true
+      }
+    };
+    /*! Static nodes, fast packet: AABB nodes are the expected case; any other node goes through ungetAABBNode() (assumed OBB, not checked here). vmask is overwritten. */
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = likely(node.isAABBNode()) ? intersectNodeK<N,K>(node.getAABBNode(), i, ray, dist)
+                                          : intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist);
+        return true;
+      }
+    };
+    /*! Static nodes, robust packet: AABB nodes use intersectNodeKRobust; any other node goes through ungetAABBNode() (assumed OBB, not checked here). vmask is overwritten. */
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN1_UN1, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = likely(node.isAABBNode()) ? intersectNodeKRobust<N,K>(node.getAABBNode(), i, ray, dist)
+                                          : intersectNodeK<N,K>(node.ungetAABBNode(), i, ray, dist);
+        return true;
+      }
+    };
+    /*! Motion-blur nodes, fast packet: AABB-MB nodes are the expected case; any other node goes through ungetAABBNodeMB() (assumed OBB-MB, not checked here). vmask is overwritten. */
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = likely(node.isAABBNodeMB()) ? intersectNodeK<N,K>(node.getAABBNodeMB(), i, ray, time, dist)
+                                            : intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+        return true;
+      }
+    };
+    /*! Motion-blur nodes, robust packet: AABB-MB nodes use intersectNodeKRobust; any other node goes through ungetAABBNodeMB() (assumed OBB-MB, not checked here). vmask is overwritten. */
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_UN2, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask = likely(node.isAABBNodeMB()) ? intersectNodeKRobust<N,K>(node.getAABBNodeMB(), i, ray, time, dist)
+                                            : intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+        return true;
+      }
+    };
+    /*! Motion-blur AABB nodes with optional 4D time range, fast packet: the result is AND-ed into the incoming vmask. */
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist); // combined with, not replacing, the caller-supplied mask
+        return true; // this specialization always returns true
+      }
+    };
+    /*! Motion-blur AABB nodes with optional 4D time range, robust packet: the result is AND-ed into the incoming vmask. */
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist); // combined with, not replacing, the caller-supplied mask
+        return true; // this specialization always returns true
+      }
+    };
+    /*! Motion-blur nodes (AABB-MB, AABB-MB4D or OBB-MB), fast packet: the hit mask of child i is AND-ed into the incoming vmask. */
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, false>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKFast<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        if (unlikely(!node.isAABBNodeMB() && !node.isAABBNodeMB4D())) { // uncommon case: neither aligned node kind
+          assert(node.isOBBNodeMB());
+          vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+          return true;
+        }
+        vmask &= intersectNodeKMB4D<N,K>(node, i, ray, time, dist);
+        return true;
+      }
+    };
+    /*! Motion-blur nodes (AABB-MB, AABB-MB4D or OBB-MB), robust packet: the hit mask of child i is AND-ed into the incoming vmask. */
+    template<int N, int K>
+    struct BVHNNodeIntersectorK<N, K, BVH_AN2_AN4D_UN2, true>
+    {
+      static __forceinline bool intersect(const typename BVHN<N>::NodeRef& node, size_t i,
+                                          const TravRayKRobust<K>& ray, const vfloat<K>& time, vfloat<K>& dist, vbool<K>& vmask)
+      {
+        if (unlikely(!node.isAABBNodeMB() && !node.isAABBNodeMB4D())) { // uncommon case: neither aligned node kind
+          assert(node.isOBBNodeMB());
+          vmask &= intersectNodeK<N,K>(node.ungetAABBNodeMB(), i, ray, time, dist);
+          return true;
+        }
+        vmask &= intersectNodeKMB4DRobust<N,K>(node, i, ray, time, dist);
+        return true;
+      }
+    };
+
+
+    /*! Intersects one child of an N-wide quantized node with K rays. Primary template, declared only: specialized below on `robust`. */
+    template<int N, int K, bool robust>
+    struct BVHNQuantizedBaseNodeIntersectorK;
+    /*! Fast specialization: thin forwarding layer over intersectQuantizedNodeK / intersectQuantizedNodeMBK taking TravRayK<K,false>. */
+    template<int N, int K>
+    struct BVHNQuantizedBaseNodeIntersectorK<N, K, false>
+    {
+      static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i,
+                                              const TravRayK<K,false>& ray, vfloat<K>& dist)
+      {
+        return intersectQuantizedNodeK<N,K>(node,i,ray,dist); // static quantized node: no time argument
+      }
+
+      static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+                                               const TravRayK<K,false>& ray, const vfloat<K>& time, vfloat<K>& dist)
+      {
+        return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist); // motion-blur quantized node: per-ray time is forwarded
+      }
+
+    };
+    /*! Robust specialization: thin forwarding layer over intersectQuantizedNodeK / intersectQuantizedNodeMBK taking TravRayK<K,true>. */
+    template<int N, int K>
+    struct BVHNQuantizedBaseNodeIntersectorK<N, K, true>
+    {
+      static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNode* node, const size_t i,
+                                               const TravRayK<K,true>& ray, vfloat<K>& dist)
+      {
+        return intersectQuantizedNodeK<N,K>(node,i,ray,dist); // static quantized node: no time argument
+      }
+
+      static __forceinline vbool<K> intersectK(const typename BVHN<N>::QuantizedBaseNodeMB* node, const size_t i,
+          const TravRayK<K,true>& ray, const vfloat<K>& time, vfloat<K>& dist)
+      {
+        return intersectQuantizedNodeMBK<N,K>(node,i,ray,time,dist); // motion-blur quantized node: per-ray time is forwarded
+      }
+    };
+
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h
new file mode 100644
index 0000000000..f379b57aea
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/bvh/node_intersector_packet_stream.h
@@ -0,0 +1,215 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "node_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Ray packet structure used in stream traversal
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int K, bool robust>
+    struct TravRayKStream;
+
+    /* Fast variant: caches the reciprocal direction and the origin pre-multiplied by it. */
+    template<int K>
+    struct TravRayKStream<K, false>
+    {
+      __forceinline TravRayKStream() {} // assigns no member
+
+      __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar)
+        : tnear(ray_tnear), tfar(ray_tfar)
+      {
+        init(ray_org, ray_dir);
+      }
+
+      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
+      {
+        rdir = rcp_safe(ray_dir);
+        const Vec3vf<K> scaled_org = ray_org * rdir;
+#if defined(__aarch64__)
+        neg_org_rdir = -scaled_org; // stored negated: the aarch64 node tests add it with madd
+#else
+        org_rdir = scaled_org;
+#endif
+      }
+
+      Vec3vf<K> rdir;         // rcp_safe(ray_dir)
+#if defined(__aarch64__)
+      Vec3vf<K> neg_org_rdir; // -(ray_org * rdir)
+#else
+      Vec3vf<K> org_rdir;     // ray_org * rdir
+#endif
+      vfloat<K> tnear;
+      vfloat<K> tfar;
+    };
+
+    template<int K>
+    using TravRayKStreamFast = TravRayKStream<K, false>;
+
+    /* Robust variant: keeps the ray origin itself instead of a pre-multiplied origin. */
+    template<int K>
+    struct TravRayKStream<K, true>
+    {
+      __forceinline TravRayKStream() {} // assigns no member
+
+      __forceinline TravRayKStream(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir, const vfloat<K>& ray_tnear, const vfloat<K>& ray_tfar)
+        : tnear(ray_tnear), tfar(ray_tfar)
+      {
+        init(ray_org, ray_dir);
+      }
+
+      /* stores the origin and the reciprocal of the zero_fix'ed direction; tnear/tfar are not touched */
+      __forceinline void init(const Vec3vf<K>& ray_org, const Vec3vf<K>& ray_dir)
+      {
+        org = ray_org;
+        rdir = vfloat<K>(1.0f)/(zero_fix(ray_dir));
+      }
+
+      Vec3vf<K> rdir;
+      Vec3vf<K> org;
+      vfloat<K> tnear;
+      vfloat<K> tfar;
+    };
+
+    template<int K>
+    using TravRayKStreamRobust = TravRayKStream<K, true>;
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Fast AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx, int K>
+    __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node,
+                                        const TravRayKStreamFast<K>& ray, size_t k, const NearFarPrecalculations& nf)
+    { // tests ray k of the packet against the bounds stored in node; the result has one bit per bound
+      const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); // nf.* are byte offsets from lower_x
+      const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+      const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+      const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+      const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+      const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+      // slab distances bound*rdir - org*rdir for ray k (assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c - TODO confirm)
+#if defined (__aarch64__)
+      const vfloat<Nx> rminX = madd(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k]));
+      const vfloat<Nx> rminY = madd(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k]));
+      const vfloat<Nx> rminZ = madd(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k]));
+      const vfloat<Nx> rmaxX = madd(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.neg_org_rdir.x[k]));
+      const vfloat<Nx> rmaxY = madd(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.neg_org_rdir.y[k]));
+      const vfloat<Nx> rmaxZ = madd(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.neg_org_rdir.z[k]));
+#else
+      const vfloat<Nx> rminX = msub(bminX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k]));
+      const vfloat<Nx> rminY = msub(bminY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k]));
+      const vfloat<Nx> rminZ = msub(bminZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k]));
+      const vfloat<Nx> rmaxX = msub(bmaxX, vfloat<Nx>(ray.rdir.x[k]), vfloat<Nx>(ray.org_rdir.x[k]));
+      const vfloat<Nx> rmaxY = msub(bmaxY, vfloat<Nx>(ray.rdir.y[k]), vfloat<Nx>(ray.org_rdir.y[k]));
+      const vfloat<Nx> rmaxZ = msub(bmaxZ, vfloat<Nx>(ray.rdir.z[k]), vfloat<Nx>(ray.org_rdir.z[k]));
+#endif
+      const vfloat<Nx> rmin  = maxi(rminX, rminY, rminZ, vfloat<Nx>(ray.tnear[k]));
+      const vfloat<Nx> rmax  = mini(rmaxX, rmaxY, rmaxZ, vfloat<Nx>(ray.tfar[k]));
+
+      const vbool<Nx> vmask_first_hit = rmin <= rmax; // hit when the entry distance does not exceed the exit distance
+
+      return movemask(vmask_first_hit) & (((size_t)1 << N)-1); // only the low N bits of the Nx-lane mask are reported
+    }
+
+    template<int N, int K>
+    __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i,
+                                        const TravRayKStreamFast<K>& ray, const NearFarPrecalculations& nf)
+    { // tests all K rays of the packet against bound i of node; the result has one bit per ray
+      const char* ptr = (const char*)&node->lower_x + i*sizeof(float); // node is only read, so the cast keeps it const
+      const vfloat<K> bminX = *(const float*)(ptr + nf.nearX);
+      const vfloat<K> bminY = *(const float*)(ptr + nf.nearY);
+      const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ);
+      const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX);
+      const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
+      const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
+      // slab distances bound*rdir - org*rdir per ray (assuming madd(a,b,c)=a*b+c and msub(a,b,c)=a*b-c - TODO confirm)
+#if defined (__aarch64__)
+      const vfloat<K> rminX = madd(bminX, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> rminY = madd(bminY, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> rminZ = madd(bminZ, ray.rdir.z, ray.neg_org_rdir.z);
+      const vfloat<K> rmaxX = madd(bmaxX, ray.rdir.x, ray.neg_org_rdir.x);
+      const vfloat<K> rmaxY = madd(bmaxY, ray.rdir.y, ray.neg_org_rdir.y);
+      const vfloat<K> rmaxZ = madd(bmaxZ, ray.rdir.z, ray.neg_org_rdir.z);
+#else
+      const vfloat<K> rminX = msub(bminX, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> rminY = msub(bminY, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> rminZ = msub(bminZ, ray.rdir.z, ray.org_rdir.z);
+      const vfloat<K> rmaxX = msub(bmaxX, ray.rdir.x, ray.org_rdir.x);
+      const vfloat<K> rmaxY = msub(bmaxY, ray.rdir.y, ray.org_rdir.y);
+      const vfloat<K> rmaxZ = msub(bmaxZ, ray.rdir.z, ray.org_rdir.z);
+#endif
+
+      const vfloat<K> rmin  = maxi(rminX, rminY, rminZ, ray.tnear);
+      const vfloat<K> rmax  = mini(rmaxX, rmaxY, rmaxZ, ray.tfar);
+
+      const vbool<K> vmask_first_hit = rmin <= rmax; // hit when the entry distance does not exceed the exit distance
+
+      return movemask(vmask_first_hit);
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////
+    // Robust AABBNode intersection
+    //////////////////////////////////////////////////////////////////////////////////////
+
+    template<int N, int Nx, int K>
+    __forceinline size_t intersectNode1(const typename BVHN<N>::AABBNode* __restrict__ node,
+                                        const TravRayKStreamRobust<K>& ray, size_t k, const NearFarPrecalculations& nf)
+    { // robust test of ray k against the bounds stored in node; the result has one bit per bound
+      const vfloat<Nx> bminX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearX)); // nf.* are byte offsets from lower_x
+      const vfloat<Nx> bminY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearY));
+      const vfloat<Nx> bminZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.nearZ));
+      const vfloat<Nx> bmaxX = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farX));
+      const vfloat<Nx> bmaxY = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farY));
+      const vfloat<Nx> bmaxZ = vfloat<Nx>(*(const vfloat<N>*)((const char*)&node->lower_x + nf.farZ));
+      // slab distances (bound - org) * rdir for ray k, computed without a fused multiply-add
+      const vfloat<Nx> rminX = (bminX - vfloat<Nx>(ray.org.x[k])) * vfloat<Nx>(ray.rdir.x[k]);
+      const vfloat<Nx> rminY = (bminY - vfloat<Nx>(ray.org.y[k])) * vfloat<Nx>(ray.rdir.y[k]);
+      const vfloat<Nx> rminZ = (bminZ - vfloat<Nx>(ray.org.z[k])) * vfloat<Nx>(ray.rdir.z[k]);
+      const vfloat<Nx> rmaxX = (bmaxX - vfloat<Nx>(ray.org.x[k])) * vfloat<Nx>(ray.rdir.x[k]);
+      const vfloat<Nx> rmaxY = (bmaxY - vfloat<Nx>(ray.org.y[k])) * vfloat<Nx>(ray.rdir.y[k]);
+      const vfloat<Nx> rmaxZ = (bmaxZ - vfloat<Nx>(ray.org.z[k])) * vfloat<Nx>(ray.rdir.z[k]);
+      const float round_up = 1.0f+3.0f*float(ulp); // FIXME: use per instruction rounding for AVX512
+      const vfloat<Nx> rmin  =            max(rminX, rminY, rminZ, vfloat<Nx>(ray.tnear[k]));
+      const vfloat<Nx> rmax  = round_up  *min(rmaxX, rmaxY, rmaxZ, vfloat<Nx>(ray.tfar[k])); // exit distance scaled by round_up
+
+      const vbool<Nx> vmask_first_hit = rmin <= rmax; // hit when the entry distance does not exceed the scaled exit distance
+
+      return movemask(vmask_first_hit) & (((size_t)1 << N)-1); // only the low N bits of the Nx-lane mask are reported
+    }
+
+    template<int N, int K>
+    __forceinline size_t intersectNodeK(const typename BVHN<N>::AABBNode* __restrict__ node, size_t i,
+                                        const TravRayKStreamRobust<K>& ray, const NearFarPrecalculations& nf)
+    { // robust test of all K rays against bound i of node; the result has one bit per ray
+      const char* ptr = (const char*)&node->lower_x + i*sizeof(float); // node is only read, so the cast keeps it const
+      const vfloat<K> bminX = *(const float*)(ptr + nf.nearX);
+      const vfloat<K> bminY = *(const float*)(ptr + nf.nearY);
+      const vfloat<K> bminZ = *(const float*)(ptr + nf.nearZ);
+      const vfloat<K> bmaxX = *(const float*)(ptr + nf.farX);
+      const vfloat<K> bmaxY = *(const float*)(ptr + nf.farY);
+      const vfloat<K> bmaxZ = *(const float*)(ptr + nf.farZ);
+      // slab distances (bound - org) * rdir per ray, computed without a fused multiply-add
+      const vfloat<K> rminX = (bminX - ray.org.x) * ray.rdir.x;
+      const vfloat<K> rminY = (bminY - ray.org.y) * ray.rdir.y;
+      const vfloat<K> rminZ = (bminZ - ray.org.z) * ray.rdir.z;
+      const vfloat<K> rmaxX = (bmaxX - ray.org.x) * ray.rdir.x;
+      const vfloat<K> rmaxY = (bmaxY - ray.org.y) * ray.rdir.y;
+      const vfloat<K> rmaxZ = (bmaxZ - ray.org.z) * ray.rdir.z;
+
+      const float round_up  = 1.0f+3.0f*float(ulp);
+      const vfloat<K> rmin  =            max(rminX, rminY, rminZ, vfloat<K>(ray.tnear));
+      const vfloat<K> rmax  = round_up * min(rmaxX, rmaxY, rmaxZ, vfloat<K>(ray.tfar)); // exit distance scaled by round_up
+
+      const vbool<K> vmask_first_hit = rmin <= rmax; // hit when the entry distance does not exceed the scaled exit distance
+
+      return movemask(vmask_first_hit);
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accel.h b/thirdparty/embree-aarch64/kernels/common/accel.h
new file mode 100644
index 0000000000..c038d3cf21
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accel.h
@@ -0,0 +1,556 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "ray.h"
+#include "point_query.h"
+#include "context.h"
+
+namespace embree
+{
+  class Scene;
+
+  /*! Base class for the data of an acceleration structure: linear bounds plus a type tag. */
+  class AccelData : public RefCount
+  {
+    ALIGNED_CLASS_(16);
+  public:
+    enum Type {
+      TY_UNKNOWN = 0,
+      TY_ACCELN = 1,
+      TY_ACCEL_INSTANCE = 2,
+      TY_BVH4 = 3,
+      TY_BVH8 = 4
+    };
+
+  public:
+    /*! starts with bounds set to empty and the given type tag */
+    AccelData (const Type type) : bounds(empty), type(type) {}
+
+    /*! notification that geometry geomID was deleted; this default does nothing */
+    virtual void deleteGeometry(size_t geomID) {}
+
+    /*! clears the acceleration structure data; has to be provided by the subclass */
+    virtual void clear() = 0;
+
+    /*! returns normal bounds, i.e. bounds.bounds() */
+    __forceinline BBox3fa getBounds() const { return bounds.bounds(); }
+
+    /*! returns the bounds interpolated for time t */
+    __forceinline BBox3fa getBounds(float t) const { return bounds.interpolate(t); }
+
+    /*! returns the stored linear bounds */
+    __forceinline LBBox3fa getLinearBounds() const { return bounds; }
+
+    /*! empty means: the lower x bound of bounds0 equals float(pos_inf) */
+    __forceinline bool isEmpty() const {
+      return bounds.bounds0.lower.x == float(pos_inf);
+    }
+
+  public:
+    LBBox3fa bounds; // linear bounds
+    Type type;       // tag passed to the constructor
+  };
+
+  /*! Base class for all intersectable and buildable acceleration structures. */
+  class Accel : public AccelData
+  {
+     ALIGNED_CLASS_(16);
+  public:
+
+    struct Intersectors;
+
+    /*! Type of collide function */
+    typedef void (*CollideFunc)(void* bvh0, void* bvh1, RTCCollideFunc callback, void* userPtr);
+
+    /*! Type of point query function */
+    typedef bool(*PointQueryFunc)(Intersectors* This,          /*!< this pointer to accel */
+                                  PointQuery* query,        /*!< point query for lookup */
+                                  PointQueryContext* context); /*!< point query context */
+
+    /*! Type of intersect function pointer for single rays. */
+    typedef void (*IntersectFunc)(Intersectors* This,  /*!< this pointer to accel */
+                                  RTCRayHit& ray,      /*!< ray to intersect */
+                                  IntersectContext* context);
+    
+    /*! Type of intersect function pointer for ray packets of size 4. */
+    typedef void (*IntersectFunc4)(const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRayHit4& ray,    /*!< ray packet to intersect */
+                                   IntersectContext* context);
+    
+    /*! Type of intersect function pointer for ray packets of size 8. */
+    typedef void (*IntersectFunc8)(const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRayHit8& ray,    /*!< ray packet to intersect */
+                                   IntersectContext* context);
+    
+    /*! Type of intersect function pointer for ray packets of size 16. */
+    typedef void (*IntersectFunc16)(const void* valid,  /*!< pointer to valid mask */
+                                    Intersectors* This, /*!< this pointer to accel */
+                                    RTCRayHit16& ray,   /*!< ray packet to intersect */
+                                    IntersectContext* context);
+
+    /*! Type of intersect function pointer for ray packets of size N. */
+    typedef void (*IntersectFuncN)(Intersectors* This, /*!< this pointer to accel */
+                                   RTCRayHitN** ray,   /*!< ray stream to intersect */
+                                   const size_t N,     /*!< number of rays in stream */
+                                   IntersectContext* context /*!< layout flags */);
+    
+    
+    /*! Type of occlusion function pointer for single rays. */
+    typedef void (*OccludedFunc) (Intersectors* This, /*!< this pointer to accel */
+                                  RTCRay& ray,        /*!< ray to test occlusion */
+                                  IntersectContext* context);
+    
+    /*! Type of occlusion function pointer for ray packets of size 4. */
+    typedef void (*OccludedFunc4) (const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRay4& ray,       /*!< ray packet to test occlusion. */
+                                   IntersectContext* context);
+    
+    /*! Type of occlusion function pointer for ray packets of size 8. */
+    typedef void (*OccludedFunc8) (const void* valid,  /*!< pointer to valid mask */
+                                   Intersectors* This, /*!< this pointer to accel */
+                                   RTCRay8& ray,       /*!< ray packet to test occlusion. */
+                                   IntersectContext* context);
+    
+    /*! Type of occlusion function pointer for ray packets of size 16. */
+    typedef void (*OccludedFunc16) (const void* valid,  /*!< pointer to valid mask */
+                                    Intersectors* This, /*!< this pointer to accel */
+                                    RTCRay16& ray,      /*!< ray packet to test occlusion. */
+                                    IntersectContext* context);
+
+    /*! Type of occlusion function pointer for ray streams of size N. */
+    typedef void (*OccludedFuncN)(Intersectors* This, /*!< this pointer to accel */
+                                  RTCRayN** ray,      /*!< ray stream to test occlusion */
+                                  const size_t N,     /*!< number of rays in stream */
+                                  IntersectContext* context /*!< layout flags */);
+    typedef void (*ErrorFunc) ();
+
+    /*! Collide function pointer paired with a descriptive name. */
+    struct Collider
+    {
+      Collider (ErrorFunc error = nullptr) // unnamed entry whose function slot holds error
+        : collide((CollideFunc)error), name(nullptr) {}
+
+      Collider (CollideFunc collide, const char* name)
+        : collide(collide), name(name) {}
+
+      operator bool() const { return name != nullptr; } // true only for entries that carry a name
+    public:
+      CollideFunc collide;
+      const char* name;
+    };
+    
+    /*! Single-ray intersect, occluded and point query function pointers paired with a descriptive name. */
+    struct Intersector1
+    {
+      Intersector1 (ErrorFunc error = nullptr) // unnamed entry: every slot, pointQuery included, holds error
+      : intersect((IntersectFunc)error), occluded((OccludedFunc)error), pointQuery((PointQueryFunc)error), name(nullptr) {}
+
+      Intersector1 (IntersectFunc intersect, OccludedFunc occluded, const char* name) // entry without point query
+      : intersect(intersect), occluded(occluded), pointQuery(nullptr), name(name) {}
+
+      Intersector1 (IntersectFunc intersect, OccludedFunc occluded, PointQueryFunc pointQuery, const char* name)
+      : intersect(intersect), occluded(occluded), pointQuery(pointQuery), name(name) {}
+
+      operator bool() const { return name; } // true only for entries that carry a name
+    public:
+      static const char* type;
+      IntersectFunc intersect;
+      OccludedFunc occluded;
+      PointQueryFunc pointQuery;
+      const char* name;
+    };
+    
+    /*! Intersect and occluded function pointers for ray packets of size 4, paired with a descriptive name. */
+    struct Intersector4
+    {
+      Intersector4 (ErrorFunc error = nullptr) // unnamed entry; both slots hold error
+        : intersect((IntersectFunc4)error), occluded((OccludedFunc4)error), name(nullptr) {}
+
+      Intersector4 (IntersectFunc4 intersect, OccludedFunc4 occluded, const char* name)
+        : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name != nullptr; } // true only for entries that carry a name
+    public:
+      static const char* type;
+      IntersectFunc4 intersect;
+      OccludedFunc4 occluded;
+      const char* name;
+    };
+    
+    /*! Intersect and occluded function pointers for ray packets of size 8, paired with a descriptive name. */
+    struct Intersector8
+    {
+      Intersector8 (ErrorFunc error = nullptr) // unnamed entry; both slots hold error
+        : intersect((IntersectFunc8)error), occluded((OccludedFunc8)error), name(nullptr) {}
+
+      Intersector8 (IntersectFunc8 intersect, OccludedFunc8 occluded, const char* name)
+        : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name != nullptr; } // true only for entries that carry a name
+    public:
+      static const char* type;
+      IntersectFunc8 intersect;
+      OccludedFunc8 occluded;
+      const char* name;
+    };
+    
+    /*! Intersect and occluded function pointers for ray packets of size 16, paired with a descriptive name. */
+    struct Intersector16
+    {
+      Intersector16 (ErrorFunc error = nullptr) // unnamed entry; both slots hold error
+        : intersect((IntersectFunc16)error), occluded((OccludedFunc16)error), name(nullptr) {}
+
+      Intersector16 (IntersectFunc16 intersect, OccludedFunc16 occluded, const char* name)
+        : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name != nullptr; } // true only for entries that carry a name
+    public:
+      static const char* type;
+      IntersectFunc16 intersect;
+      OccludedFunc16 occluded;
+      const char* name;
+    };
+
+    /*! Intersect and occluded function pointers for ray streams of N rays, paired with a descriptive name. */
+    struct IntersectorN
+    {
+      IntersectorN (ErrorFunc error = nullptr) // unnamed entry; both slots hold error
+        : intersect((IntersectFuncN)error), occluded((OccludedFuncN)error), name(nullptr) {}
+
+      IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name)
+        : intersect(intersect), occluded(occluded), name(name) {}
+
+      operator bool() const { return name != nullptr; } // true only for entries that carry a name
+    public:
+      static const char* type;
+      IntersectFuncN intersect;
+      OccludedFuncN occluded;
+      const char* name;
+    };
+   
+    /*! Collider plus the 1/4/8/16/N intersectors of an Accel, with thin call wrappers around them. */
+    struct Intersectors
+    {
+      /*! every entry listed here is constructed with nullptr as its error function */
+      Intersectors()
+        : ptr(nullptr), leafIntersector(nullptr), collider(nullptr), intersector1(nullptr),
+          intersector4(nullptr), intersector8(nullptr), intersector16(nullptr), intersectorN(nullptr) {}
+
+      /*! the collider and the current intersectors are constructed with error as their error function */
+      Intersectors (ErrorFunc error)
+        : ptr(nullptr), leafIntersector(nullptr), collider(error), intersector1(error),
+          intersector4(error), intersector8(error), intersector16(error), intersectorN(error) {}
+
+      /*! prints the name of every named current entry, one per line, each preceded by ident spaces */
+      void print(size_t ident)
+      {
+        /* one output line: ident spaces, label, value; entries without a name print nothing */
+        auto printEntry = [ident] (const char* label, const char* value) {
+          if (!value) return;
+          for (size_t i=0; i<ident; i++) std::cout << " ";
+          std::cout << label << value << std::endl;
+        };
+        printEntry("collider  = ",     collider.name);
+        printEntry("intersector1  = ", intersector1.name);
+        printEntry("intersector4  = ", intersector4.name);
+        printEntry("intersector8  = ", intersector8.name);
+        printEntry("intersector16 = ", intersector16.name);
+        printEntry("intersectorN = ",  intersectorN.name);
+      }
+
+      /*! for each of the 4/8/16/N slots with a named *_filter entry: makes *_filter (filter) or *_nofilter current */
+      void select(bool filter)
+      {
+        if (intersector4_filter)  intersector4  = filter ? intersector4_filter  : intersector4_nofilter;
+        if (intersector8_filter)  intersector8  = filter ? intersector8_filter  : intersector8_nofilter;
+        if (intersector16_filter) intersector16 = filter ? intersector16_filter : intersector16_nofilter;
+        if (intersectorN_filter)  intersectorN  = filter ? intersectorN_filter  : intersectorN_nofilter;
+      }
+
+      /*! Runs the point query function of intersector1 and returns its result. */
+      __forceinline bool pointQuery (PointQuery* query, PointQueryContext* context) {
+        assert(intersector1.pointQuery);
+        return intersector1.pointQuery(this, query, context);
+      }
+
+      /*! Collides two scenes; the collider receives the ptr of each scene's intersectors. */
+      __forceinline void collide (Accel* scene0, Accel* scene1, RTCCollideFunc callback, void* userPtr) {
+        assert(collider.collide);
+        collider.collide(scene0->intersectors.ptr, scene1->intersectors.ptr, callback, userPtr);
+      }
+
+      /*! Intersects one ray with the scene through intersector1. */
+      __forceinline void intersect (RTCRayHit& ray, IntersectContext* context) {
+        assert(intersector1.intersect);
+        intersector1.intersect(this, ray, context);
+      }
+
+      /*! Intersects a 4-wide ray packet with the scene through intersector4. */
+      __forceinline void intersect4 (const void* valid, RTCRayHit4& ray, IntersectContext* context) {
+        assert(intersector4.intersect);
+        intersector4.intersect(valid, this, ray, context);
+      }
+
+      /*! Intersects an 8-wide ray packet with the scene through intersector8. */
+      __forceinline void intersect8 (const void* valid, RTCRayHit8& ray, IntersectContext* context) {
+        assert(intersector8.intersect);
+        intersector8.intersect(valid, this, ray, context);
+      }
+
+      /*! Intersects a 16-wide ray packet with the scene through intersector16. */
+      __forceinline void intersect16 (const void* valid, RTCRayHit16& ray, IntersectContext* context) {
+        assert(intersector16.intersect);
+        intersector16.intersect(valid, this, ray, context);
+      }
+
+      /*! Intersects a stream of N rays in SOA layout with the scene through intersectorN. */
+      __forceinline void intersectN (RTCRayHitN** rayN, const size_t N, IntersectContext* context)
+      {
+        assert(intersectorN.intersect);
+        intersectorN.intersect(this, rayN, N, context);
+      }
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+      /*! vbool4/RayHitK<4> front end of intersect4: the valid mask is passed as valid.mask32() */
+      __forceinline void intersect(const vbool4& valid, RayHitK<4>& ray, IntersectContext* context) {
+        const vint<4> lanes = valid.mask32();
+        intersect4(&lanes, (RTCRayHit4&)ray, context);
+      }
+#endif
+#if defined(__AVX__)
+      /*! vbool8/RayHitK<8> front end of intersect8: the valid mask is passed as valid.mask32() */
+      __forceinline void intersect(const vbool8& valid, RayHitK<8>& ray, IntersectContext* context) {
+        const vint<8> lanes = valid.mask32();
+        intersect8(&lanes, (RTCRayHit8&)ray, context);
+      }
+#endif
+#if defined(__AVX512F__)
+      /*! vbool16/RayHitK<16> front end of intersect16: the valid mask is passed as valid.mask32() */
+      __forceinline void intersect(const vbool16& valid, RayHitK<16>& ray, IntersectContext* context) {
+        const vint<16> lanes = valid.mask32();
+        intersect16(&lanes, (RTCRayHit16&)ray, context);
+      }
+#endif
+
+      /*! RayHitK<K> front end of intersectN: casts the stream pointer to RTCRayHitN** */
+      template<int K>
+      __forceinline void intersectN (RayHitK<K>** rayN, const size_t N, IntersectContext* context)
+      {
+        intersectN((RTCRayHitN**)rayN, N, context);
+      }
+
+      /*! Tests through intersector1 whether one ray is occluded by the scene. */
+      __forceinline void occluded (RTCRay& ray, IntersectContext* context) {
+        assert(intersector1.occluded);
+        intersector1.occluded(this, ray, context);
+      }
+
+      /*! Tests through intersector4 whether a 4-wide ray packet is occluded by the scene. */
+      __forceinline void occluded4 (const void* valid, RTCRay4& ray, IntersectContext* context) {
+        assert(intersector4.occluded);
+        intersector4.occluded(valid, this, ray, context);
+      }
+
+      /*! Tests through intersector8 whether an 8-wide ray packet is occluded by the scene. */
+      __forceinline void occluded8 (const void* valid, RTCRay8& ray, IntersectContext* context) {
+        assert(intersector8.occluded);
+        intersector8.occluded(valid, this, ray, context);
+      }
+
+      /*! Tests through intersector16 whether a 16-wide ray packet is occluded by the scene. */
+      __forceinline void occluded16 (const void* valid, RTCRay16& ray, IntersectContext* context) {
+        assert(intersector16.occluded);
+        intersector16.occluded(valid, this, ray, context);
+      }
+
+      /*! Tests through intersectorN whether a stream of N rays in SOA layout is occluded by the scene. */
+      __forceinline void occludedN (RTCRayN** rayN, const size_t N, IntersectContext* context)
+      {
+        assert(intersectorN.occluded);
+        intersectorN.occluded(this, rayN, N, context);
+      }
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+      /*! vbool4/RayK<4> front end of occluded4: the valid mask is passed as valid.mask32() */
+      __forceinline void occluded(const vbool4& valid, RayK<4>& ray, IntersectContext* context) {
+        const vint<4> lanes = valid.mask32();
+        occluded4(&lanes, (RTCRay4&)ray, context);
+      }
+#endif
+#if defined(__AVX__)
+      /*! vbool8/RayK<8> front end of occluded8: the valid mask is passed as valid.mask32() */
+      __forceinline void occluded(const vbool8& valid, RayK<8>& ray, IntersectContext* context) {
+        const vint<8> lanes = valid.mask32();
+        occluded8(&lanes, (RTCRay8&)ray, context);
+      }
+#endif
+#if defined(__AVX512F__)
+      /*! vbool16/RayK<16> front end of occluded16: the valid mask is passed as valid.mask32() */
+      __forceinline void occluded(const vbool16& valid, RayK<16>& ray, IntersectContext* context) {
+        const vint<16> lanes = valid.mask32();
+        occluded16(&lanes, (RTCRay16&)ray, context);
+      }
+#endif
+
+      /*! RayK<K> front end of occludedN: casts the stream pointer to RTCRayN** */
+      template<int K>
+      __forceinline void occludedN (RayK<K>** rayN, const size_t N, IntersectContext* context)
+      {
+        occludedN((RTCRayN**)rayN, N, context);
+      }
+
+      /* The three overloads below make occlusion tests reachable under the intersect* names. */
+      /*! Occlusion test for a single ray; forwards to occluded(). */
+      __forceinline void intersect(RTCRay& ray, IntersectContext* context) {
+        occluded(ray, context);
+      }
+
+      /*! Occlusion test for a packet of K rays; forwards to occluded(). */
+      template<int K>
+      __forceinline void intersect(const vbool<K>& valid, RayK<K>& ray, IntersectContext* context) {
+        occluded(valid, ray, context);
+      }
+
+      /*! Occlusion test for a stream of N rays in SOA layout; forwards to occludedN(). */
+      template<int K>
+      __forceinline void intersectN(RayK<K>** rayN, const size_t N, IntersectContext* context) {
+        occludedN(rayN, N, context);
+      }
+
+    public:
+      /* acceleration structure data; collide() hands this pointer to the collider */
+      AccelData* ptr;
+      /* opaque pointer, not used by any method of this struct */
+      void* leafIntersector;
+      /* collider and single-ray intersector */
+      Collider collider;
+      Intersector1 intersector1;
+      /* 4-wide packets: current entry and the two candidates select() chooses from */
+      Intersector4 intersector4;
+      Intersector4 intersector4_filter;
+      Intersector4 intersector4_nofilter;
+      /* 8-wide packets: current entry and the two candidates select() chooses from */
+      Intersector8 intersector8;
+      Intersector8 intersector8_filter;
+      Intersector8 intersector8_nofilter;
+      /* 16-wide packets: current entry and the two candidates select() chooses from */
+      Intersector16 intersector16;
+      Intersector16 intersector16_filter;
+      Intersector16 intersector16_nofilter;
+      /* ray streams: current entry and the two candidates select() chooses from */
+      IntersectorN intersectorN;
+      IntersectorN intersectorN_filter;
+      IntersectorN intersectorN_nofilter;
+    };
+  
+  public:
+
+    /*! Construction from a type tag only; the intersectors member is default-constructed */
+    Accel (const AccelData::Type type) 
+      : AccelData(type) {}
+    
+    /*! Construction from a type tag and a set of intersectors, which is copied into this accel */
+    Accel (const AccelData::Type type, const Intersectors& intersectors) 
+      : AccelData(type), intersectors(intersectors) {}
+
+    /*! Virtual destructor */
+    virtual ~Accel() {}
+
+    /*! makes the acceleration structure immutable; this default implementation does nothing */
+    virtual void immutable () {}
+    
+    /*! build acceleration structure; pure virtual, every concrete accel has to implement it */
+    virtual void build () = 0;
+
+  public:
+    Intersectors intersectors;
+  };
+  /* DEFINE_COLLIDER: defines symbol(), returning a Collider made of collider::collide and the name TOSTRING(isa) "::" TOSTRING(symbol) */
+#define DEFINE_COLLIDER(symbol,collider)                                \
+  Accel::Collider symbol() {                                            \
+    return Accel::Collider((Accel::CollideFunc)collider::collide,       \
+                           TOSTRING(isa) "::" TOSTRING(symbol));        \
+  }
+  /* DEFINE_INTERSECTOR1: defines symbol(), returning an Intersector1 made of intersector::intersect, ::occluded and ::pointQuery */
+#define DEFINE_INTERSECTOR1(symbol,intersector)                               \
+  Accel::Intersector1 symbol() {                                              \
+    return Accel::Intersector1((Accel::IntersectFunc )intersector::intersect, \
+                               (Accel::OccludedFunc  )intersector::occluded,  \
+                               (Accel::PointQueryFunc)intersector::pointQuery,\
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+  /* DEFINE_INTERSECTOR4: defines symbol(), returning an Intersector4 made of intersector::intersect and ::occluded */
+#define DEFINE_INTERSECTOR4(symbol,intersector)                               \
+  Accel::Intersector4 symbol() {                                              \
+    return Accel::Intersector4((Accel::IntersectFunc4)intersector::intersect, \
+                               (Accel::OccludedFunc4)intersector::occluded,   \
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+  /* DEFINE_INTERSECTOR8: defines symbol(), returning an Intersector8 made of intersector::intersect and ::occluded */
+#define DEFINE_INTERSECTOR8(symbol,intersector)                               \
+  Accel::Intersector8 symbol() {                                              \
+    return Accel::Intersector8((Accel::IntersectFunc8)intersector::intersect, \
+                               (Accel::OccludedFunc8)intersector::occluded,   \
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+  /* DEFINE_INTERSECTOR16: defines symbol(), returning an Intersector16 made of intersector::intersect and ::occluded */
+#define DEFINE_INTERSECTOR16(symbol,intersector)                                \
+  Accel::Intersector16 symbol() {                                               \
+    return Accel::Intersector16((Accel::IntersectFunc16)intersector::intersect, \
+                                (Accel::OccludedFunc16)intersector::occluded,   \
+                                TOSTRING(isa) "::" TOSTRING(symbol));           \
+  }
+  /* DEFINE_INTERSECTORN: defines symbol(), returning an IntersectorN made of intersector::intersect and ::occluded */
+#define DEFINE_INTERSECTORN(symbol,intersector)                               \
+  Accel::IntersectorN symbol() {                                              \
+    return Accel::IntersectorN((Accel::IntersectFuncN)intersector::intersect, \
+                               (Accel::OccludedFuncN)intersector::occluded,   \
+                               TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+
+  /* ray stream filter interface */
+  typedef void (*intersectStreamAOS_func)(Scene* scene, RTCRayHit*  _rayN, const size_t N, const size_t stride, IntersectContext* context);
+  typedef void (*intersectStreamAOP_func)(Scene* scene, RTCRayHit** _rayN, const size_t N, IntersectContext* context);
+  typedef void (*intersectStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context);
+  typedef void (*intersectStreamSOP_func)(Scene* scene, const RTCRayHitNp* rayN, const size_t N, IntersectContext* context);
+
+  typedef void (*occludedStreamAOS_func)(Scene* scene, RTCRay*  _rayN, const size_t N, const size_t stride, IntersectContext* context);
+  typedef void (*occludedStreamAOP_func)(Scene* scene, RTCRay** _rayN, const size_t N, IntersectContext* context);
+  typedef void (*occludedStreamSOA_func)(Scene* scene, char* rayN, const size_t N, const size_t streams, const size_t stream_offset, IntersectContext* context);
+  typedef void (*occludedStreamSOP_func)(Scene* scene, const RTCRayNp* rayN, const size_t N, IntersectContext* context);
+
+  /*! Table of the eight ray stream filter entry points: intersect and occluded for AOS, AOP, SOA and SOP. */
+  struct RayStreamFilterFuncs
+  {
+    RayStreamFilterFuncs() // all eight entries null
+      : intersectAOS(nullptr), intersectAOP(nullptr), intersectSOA(nullptr), intersectSOP(nullptr),
+        occludedAOS(nullptr), occludedAOP(nullptr), occludedSOA(nullptr), occludedSOP(nullptr) {}
+
+    RayStreamFilterFuncs(void (*fn) ()) // every entry is fn, cast to that entry's function pointer type
+      : intersectAOS((intersectStreamAOS_func) fn), intersectAOP((intersectStreamAOP_func) fn), intersectSOA((intersectStreamSOA_func) fn), intersectSOP((intersectStreamSOP_func) fn),
+        occludedAOS((occludedStreamAOS_func) fn), occludedAOP((occludedStreamAOP_func) fn), occludedSOA((occludedStreamSOA_func) fn), occludedSOP((occludedStreamSOP_func) fn) {}
+
+    RayStreamFilterFuncs(intersectStreamAOS_func isectAOS, intersectStreamAOP_func isectAOP, intersectStreamSOA_func isectSOA, intersectStreamSOP_func isectSOP,
+                         occludedStreamAOS_func occlAOS, occludedStreamAOP_func occlAOP, occludedStreamSOA_func occlSOA, occludedStreamSOP_func occlSOP)
+      : intersectAOS(isectAOS), intersectAOP(isectAOP), intersectSOA(isectSOA), intersectSOP(isectSOP),
+        occludedAOS(occlAOS), occludedAOP(occlAOP), occludedSOA(occlSOA), occludedSOP(occlSOP) {}
+
+  public:
+    intersectStreamAOS_func intersectAOS;
+    intersectStreamAOP_func intersectAOP;
+    intersectStreamSOA_func intersectSOA;
+    intersectStreamSOP_func intersectSOP;
+    occludedStreamAOS_func occludedAOS;
+    occludedStreamAOP_func occludedAOP;
+    occludedStreamSOA_func occludedSOA;
+    occludedStreamSOP_func occludedSOP;
+  };
+
+  typedef RayStreamFilterFuncs (*RayStreamFilterFuncsType)();
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accelinstance.h b/thirdparty/embree-aarch64/kernels/common/accelinstance.h
new file mode 100644
index 0000000000..d74b96df3f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accelinstance.h
@@ -0,0 +1,41 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accel.h"
+#include "builder.h"
+
+namespace embree
+{
+  class AccelInstance : public Accel
+  {
+  public:
+    /*! stores accel and builder in unique_ptr members, so both are owned by this object */
+    AccelInstance (AccelData* accel, Builder* builder, Intersectors& intersectors)
+      : Accel(AccelData::TY_ACCEL_INSTANCE,intersectors), accel(accel), builder(builder) {}
+
+    /*! destroys the builder; build, deleteGeometry and clear skip it from then on */
+    void immutable () { builder.reset(); }
+
+    /*! runs the builder if one is still attached, then takes over the bounds stored in accel */
+    void build () {
+      if (builder != nullptr) builder->build();
+      bounds = accel->bounds; // accel is dereferenced unchecked here, unlike in the methods below
+    }
+
+    /*! deleteGeometry and clear go to accel first and to the builder second, skipping whichever is null */
+    void deleteGeometry(size_t geomID) {
+      if (accel   != nullptr) accel->deleteGeometry(geomID);
+      if (builder != nullptr) builder->deleteGeometry(geomID);
+    }
+    void clear() {
+      if (accel   != nullptr) accel->clear();
+      if (builder != nullptr) builder->clear();
+    }
+
+  private:
+    std::unique_ptr<AccelData> accel;
+    std::unique_ptr<Builder> builder;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.cpp b/thirdparty/embree-aarch64/kernels/common/acceln.cpp
new file mode 100644
index 0000000000..aadb4a64ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/acceln.cpp
@@ -0,0 +1,232 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "acceln.h"
+#include "ray.h"
+#include "../../include/embree3/rtcore_ray.h"
+#include "../../common/algorithms/parallel_for.h"
+
+namespace embree
+{
+  /*! starts as a TY_ACCELN accel with an empty child list */
+  AccelN::AccelN() : Accel(AccelData::TY_ACCELN), accels() {}
+
+  AccelN::~AccelN()
+  {
+    /* the registered children are deleted together with this object */
+    for (Accel* child : accels) delete child;
+  }
+
+  void AccelN::accels_add(Accel* accel)
+  {
+    assert(accel != nullptr); /* null children are rejected in debug builds */
+    accels.emplace_back(accel); /* deleted later by accels_init or the destructor */
+  }
+
+  void AccelN::accels_init()
+  {
+    /* destroy every registered child, then drop the dangling pointers */
+    for (Accel* child : accels)
+      delete child;
+    accels.clear();
+  }
+
+  bool AccelN::pointQuery (Accel::Intersectors* This_in, PointQuery* query, PointQueryContext* context)
+  {
+    /* runs the query on every non-empty child; true if any child returned true */
+    AccelN* self = (AccelN*)This_in->ptr;
+    bool anyChanged = false;
+    for (Accel* child : self->accels)
+      if (!child->isEmpty()) anyChanged |= child->intersectors.pointQuery(query,context);
+    return anyChanged;
+  }
+
+  void AccelN::intersect (Accel::Intersectors* This_in, RTCRayHit& ray, IntersectContext* context)
+  {
+    /* forwards the ray to every non-empty child, in list order */
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels)
+      if (!child->isEmpty()) child->intersectors.intersect(ray,context);
+  }
+
+  void AccelN::intersect4 (const void* valid, Accel::Intersectors* This_in, RTCRayHit4& ray, IntersectContext* context)
+  {
+    /* forwards the 4-wide packet and its valid mask to every non-empty child, in list order */
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels)
+      if (!child->isEmpty()) child->intersectors.intersect4(valid,ray,context);
+  }
+
+  void AccelN::intersect8 (const void* valid, Accel::Intersectors* This_in, RTCRayHit8& ray, IntersectContext* context)
+  {
+    /* forwards the 8-wide packet and its valid mask to every non-empty child, in list order */
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels)
+      if (!child->isEmpty()) child->intersectors.intersect8(valid,ray,context);
+  }
+
+  void AccelN::intersect16 (const void* valid, Accel::Intersectors* This_in, RTCRayHit16& ray, IntersectContext* context)
+  {
+    /* forwards the 16-wide packet and its valid mask to every non-empty child, in list order */
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels)
+      if (!child->isEmpty()) child->intersectors.intersect16(valid,ray,context);
+  }
+
+  void AccelN::intersectN (Accel::Intersectors* This_in, RTCRayHitN** ray, const size_t N, IntersectContext* context)
+  {
+    /* forwards the stream of N entries to every non-empty child, in list order */
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels)
+      if (!child->isEmpty()) child->intersectors.intersectN(ray,N,context);
+  }
+
+  void AccelN::occluded (Accel::Intersectors* This_in, RTCRay& ray, IntersectContext* context)
+  {
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels) {
+      if (child->isEmpty()) continue;
+      child->intersectors.occluded(ray,context);
+      if (ray.tfar < 0.0f) break; /* negative tfar: skip the remaining children (presumably the ray is already occluded) */
+    }
+  }
+
+  void AccelN::occluded4 (const void* valid, Accel::Intersectors* This_in, RTCRay4& ray, IntersectContext* context)
+  {
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels) {
+      if (child->isEmpty()) continue;
+      child->intersectors.occluded4(valid,ray,context);
+#if defined(__SSE2__) || defined(__ARM_NEON)
+      vbool4 active  = asBool(((vint4*)valid)[0]);
+      vbool4 pending = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
+      if (unlikely(none(active & pending))) break; /* no valid lane with tfar >= 0 is left */
+#endif
+    }
+  }
+
+  void AccelN::occluded8 (const void* valid, Accel::Intersectors* This_in, RTCRay8& ray, IntersectContext* context)
+  {
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels) {
+      if (child->isEmpty()) continue;
+      child->intersectors.occluded8(valid,ray,context);
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
+      vbool4 activeLo  = asBool(((vint4*)valid)[0]);
+      vbool4 pendingLo = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
+      vbool4 activeHi  = asBool(((vint4*)valid)[1]);
+      vbool4 pendingHi = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero);
+      if (unlikely((none((activeLo & pendingLo) | (activeHi & pendingHi))))) break; /* checked as two groups of 4 lanes */
+#endif
+    }
+  }
+
+  void AccelN::occluded16 (const void* valid, Accel::Intersectors* This_in, RTCRay16& ray, IntersectContext* context)
+  {
+    AccelN* self = (AccelN*)This_in->ptr;
+    for (Accel* child : self->accels) {
+      if (child->isEmpty()) continue;
+      child->intersectors.occluded16(valid,ray,context);
+#if defined(__SSE2__) || defined(__ARM_NEON) // FIXME: use higher ISA
+      vbool4 active0  = asBool(((vint4*)valid)[0]);
+      vbool4 pending0 = ((vfloat4*)ray.tfar)[0] >= vfloat4(zero);
+      vbool4 active1  = asBool(((vint4*)valid)[1]);
+      vbool4 pending1 = ((vfloat4*)ray.tfar)[1] >= vfloat4(zero);
+      vbool4 active2  = asBool(((vint4*)valid)[2]);
+      vbool4 pending2 = ((vfloat4*)ray.tfar)[2] >= vfloat4(zero);
+      vbool4 active3  = asBool(((vint4*)valid)[3]);
+      vbool4 pending3 = ((vfloat4*)ray.tfar)[3] >= vfloat4(zero);
+      if (unlikely((none((active0 & pending0) | (active1 & pending1) | (active2 & pending2) | (active3 & pending3))))) break; /* checked as four groups of 4 lanes */
+#endif
+    }
+  }
+
+  void AccelN::occludedN (Accel::Intersectors* This_in, RTCRayN** ray, const size_t N, IntersectContext* context)
+  {
+    /* every non-empty child receives the complete stream; this variant has no early exit */
+    AccelN* self = (AccelN*)This_in->ptr;
+    size_t numRays = N;
+    for (Accel* child : self->accels)
+      if (!child->isEmpty()) child->intersectors.occludedN(ray,numRays,context);
+  }
+
+  void AccelN::accels_print(size_t ident)
+  {
+    size_t index = 0; /* position of the child in the list, printed in its header line */
+    for (Accel* child : accels) {
+      for (size_t pad=ident; pad>0; pad--) std::cout << " ";
+      std::cout << "accels[" << index++ << "]" << std::endl;
+      child->intersectors.print(ident+2);
+    }
+  }
+
+  void AccelN::accels_immutable()
+  {
+    /* forwards the immutable request to every child */
+    for (Accel* child : accels) child->immutable();
+  }
+  
+  void AccelN::accels_build ()
+  {
+    /* release unused capacity of the child list */
+    accels.shrink_to_fit();
+
+    /* build all child structures in parallel */
+    parallel_for (accels.size(), [&] (size_t childID) {
+        accels[childID]->build();
+      });
+
+    /* per packet width: stays true only if the intersector of every child converts to true */
+    bool all1 = true;
+    bool all4 = true;
+    bool all8 = true;
+    bool all16 = true;
+    for (Accel* child : accels) {
+      all1  &= (bool) child->intersectors.intersector1;
+      all4  &= (bool) child->intersectors.intersector4;
+      all8  &= (bool) child->intersectors.intersector8;
+      all16 &= (bool) child->intersectors.intersector16;
+    }
+
+    if (accels.size() == 1) {
+      type = accels.front()->type; // FIXME: should just assign entire Accel
+      bounds = accels.front()->bounds;
+      intersectors = accels.front()->intersectors;
+    }
+    else
+    {
+      type = AccelData::TY_ACCELN;
+      intersectors.ptr = this;
+      intersectors.intersector1  = Intersector1(&intersect,&occluded,&pointQuery,all1 ? "AccelN::intersector1": nullptr);
+      intersectors.intersector4  = Intersector4(&intersect4,&occluded4,all4 ? "AccelN::intersector4" : nullptr);
+      intersectors.intersector8  = Intersector8(&intersect8,&occluded8,all8 ? "AccelN::intersector8" : nullptr);
+      intersectors.intersector16 = Intersector16(&intersect16,&occluded16,all16 ? "AccelN::intersector16": nullptr);
+      intersectors.intersectorN  = IntersectorN(&intersectN,&occludedN,"AccelN::intersectorN");
+
+      /*! merged bounds of all children */
+      bounds = empty;
+      for (Accel* child : accels)
+        bounds.extend(child->bounds);
+    }
+  }
+
+  void AccelN::accels_select(bool filter)
+  {
+    /* passes the filter flag on to the intersector set of every child */
+    for (Accel* child : accels) child->intersectors.select(filter);
+  }
+
+  void AccelN::accels_deleteGeometry(size_t geomID)
+  {
+    /* tells every child to drop the geometry with this ID */
+    for (Accel* child : accels) child->deleteGeometry(geomID);
+  }
+
+  void AccelN::accels_clear()
+  {
+    /* clears every child; the children themselves stay in the list */
+    for (Accel* child : accels)
+      child->clear();
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/common/acceln.h b/thirdparty/embree-aarch64/kernels/common/acceln.h
new file mode 100644
index 0000000000..2edd98f647
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/acceln.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accel.h"
+
+namespace embree
+{
+  /*! merges N acceleration structures together, by processing them in order */
+  class AccelN : public Accel
+  {
+  public:
+    AccelN ();
+    ~AccelN();
+
+    /*! child management: accels_add appends a child, accels_init deletes and removes all children */
+    void accels_add(Accel* accel);
+    void accels_init();
+
+    /*! point query over all non-empty children; This->ptr has to reference the AccelN */
+    static bool pointQuery (Accel::Intersectors* This, PointQuery* query, PointQueryContext* context);
+
+    /*! intersection callbacks; each forwards the call to every non-empty child */
+    static void intersect (Accel::Intersectors* This, RTCRayHit& ray, IntersectContext* context);
+    static void intersect4 (const void* valid, Accel::Intersectors* This, RTCRayHit4& ray, IntersectContext* context);
+    static void intersect8 (const void* valid, Accel::Intersectors* This, RTCRayHit8& ray, IntersectContext* context);
+    static void intersect16 (const void* valid, Accel::Intersectors* This, RTCRayHit16& ray, IntersectContext* context);
+    static void intersectN (Accel::Intersectors* This, RTCRayHitN** ray, const size_t N, IntersectContext* context);
+
+    /*! occlusion callbacks; the 1/4/8/16 wide variants may stop before visiting all children */
+    static void occluded (Accel::Intersectors* This, RTCRay& ray, IntersectContext* context);
+    static void occluded4 (const void* valid, Accel::Intersectors* This, RTCRay4& ray, IntersectContext* context);
+    static void occluded8 (const void* valid, Accel::Intersectors* This, RTCRay8& ray, IntersectContext* context);
+    static void occluded16 (const void* valid, Accel::Intersectors* This, RTCRay16& ray, IntersectContext* context);
+    static void occludedN (Accel::Intersectors* This, RTCRayN** ray, const size_t N, IntersectContext* context);
+
+    /*! maintenance operations over the child list (implemented in acceln.cpp) */
+    void accels_print(size_t ident);
+    void accels_immutable();
+    void accels_build ();
+    void accels_select(bool filter);
+    void accels_deleteGeometry(size_t geomID);
+    void accels_clear ();
+
+    /*! child structures; deleted by accels_init and by the destructor */
+    std::vector<Accel*> accels;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.cpp b/thirdparty/embree-aarch64/kernels/common/accelset.cpp
new file mode 100644
index 0000000000..79be1c4301
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accelset.cpp
@@ -0,0 +1,17 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "accelset.h"
+#include "scene.h"
+
+namespace embree
+{
+  AccelSet::AccelSet (Device* device, Geometry::GType gtype, size_t numItems, size_t numTimeSteps)
+    : Geometry(device,gtype,static_cast<unsigned int>(numItems),static_cast<unsigned int>(numTimeSteps)), boundsFunc(nullptr) {} // both counts are narrowed to unsigned int; no bounds callback is set yet
+
+  AccelSet::IntersectorN::IntersectorN (ErrorFunc error)
+    : intersect(reinterpret_cast<IntersectFuncN>(error)), occluded(reinterpret_cast<OccludedFuncN>(error)), name(nullptr) {} // the null name makes operator bool() return false
+  
+  AccelSet::IntersectorN::IntersectorN (IntersectFuncN intersectFn, OccludedFuncN occludedFn, const char* label)
+    : intersect(intersectFn), occluded(occludedFn), name(label) {} // label is stored as a pointer, the string is not copied
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/accelset.h b/thirdparty/embree-aarch64/kernels/common/accelset.h
new file mode 100644
index 0000000000..3774b2accb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/accelset.h
@@ -0,0 +1,248 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "builder.h"
+#include "geometry.h"
+#include "ray.h"
+#include "hit.h"
+
+namespace embree
+{
+  struct IntersectFunctionNArguments;
+  struct OccludedFunctionNArguments;
+  
+  typedef void (*ReportIntersectionFunc) (IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
+  typedef void (*ReportOcclusionFunc) (OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args);
+  
+  struct IntersectFunctionNArguments : public RTCIntersectFunctionNArguments
+  {
+    IntersectContext* internal_context; //!< AccelSet::intersect stores its IntersectContext argument here
+    Geometry* geometry;                 //!< AccelSet::intersect stores the calling AccelSet (this) here
+    ReportIntersectionFunc report;      //!< report callback handed to AccelSet::intersect
+  };
+
+  struct OccludedFunctionNArguments : public RTCOccludedFunctionNArguments
+  {
+    IntersectContext* internal_context; //!< AccelSet::occluded stores its IntersectContext argument here
+    Geometry* geometry;                 //!< AccelSet::occluded stores the calling AccelSet (this) here
+    ReportOcclusionFunc report;         //!< report callback handed to AccelSet::occluded
+  };
+
+  /*! Base class for set of acceleration structures. */
+  class AccelSet : public Geometry
+  {
+  public:
+    typedef RTCIntersectFunctionN IntersectFuncN;
+    typedef RTCOccludedFunctionN OccludedFuncN;
+    typedef void (*ErrorFunc) ();
+
+      /*! pair of user callbacks plus a name; an entry without a name converts to false */
+      struct IntersectorN
+      {
+        IntersectorN (ErrorFunc error = nullptr) ;
+        IntersectorN (IntersectFuncN intersect, OccludedFuncN occluded, const char* name);
+
+        operator bool() const { return name != nullptr; }
+
+        static const char* type;
+        IntersectFuncN intersect;
+        OccludedFuncN occluded;
+        const char* name;
+      };
+
+    public:
+
+      /*! construction */
+      AccelSet (Device* device, Geometry::GType gtype, size_t items, size_t numTimeSteps);
+
+      /*! makes the acceleration structure immutable (does nothing unless overridden) */
+      virtual void immutable () {}
+
+      /*! build accel */
+      virtual void build () = 0;
+
+      /*! checks that the i'th primitive has valid, non-empty bounds at every time step from begin() to end() inclusive */
+      __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+      {
+        size_t step = itime_range.begin();
+        while (step <= itime_range.end())
+          if (!isvalid_non_empty(bounds(i,step++))) return false;
+        return true;
+      }
+
+      /*! asks the user bounds callback for the bounds of item i at time step itime */
+      __forceinline BBox3fa bounds(size_t i, size_t itime = 0) const
+      {
+        assert(i < size());
+        BBox3fa result; // written by boundsFunc through bounds_o
+        RTCBoundsFunctionArguments fnArgs;
+        fnArgs.geometryUserPtr = userPtr;
+        fnArgs.primID = (unsigned int)i;
+        fnArgs.timeStep = (unsigned int)itime;
+        fnArgs.bounds_o = (RTCBounds*)&result;
+        boundsFunc(&fnArgs);
+        return result;
+      }
+
+      /*! calculates the linear bounds of the i'th item from the bounds at time steps itime and itime+1 */
+      __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const
+      {
+        assert(i < size());
+        BBox3fa stepBounds[2]; // [0]: time step itime, [1]: time step itime+1
+        RTCBoundsFunctionArguments fnArgs;
+        fnArgs.geometryUserPtr = userPtr;
+        fnArgs.primID = (unsigned int)i;
+        fnArgs.timeStep = (unsigned int)(itime+0);
+        fnArgs.bounds_o = (RTCBounds*)&stepBounds[0];
+        boundsFunc(&fnArgs);
+        fnArgs.timeStep = (unsigned int)(itime+1);
+        fnArgs.bounds_o = (RTCBounds*)&stepBounds[1];
+        boundsFunc(&fnArgs);
+        return LBBox3fa(stepBounds[0],stepBounds[1]);
+      }
+
+      /*! calculates the build bounds of the i'th item (time step 0) and reports whether they are valid */
+      __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+      {
+        const BBox3fa itemBounds = bounds(i);
+        if (bbox != nullptr) *bbox = itemBounds; // stored even when the bounds turn out to be invalid
+        return isvalid_non_empty(itemBounds);
+      }
+
+      /*! calculates the build bounds of the i'th item at the itime'th time segment and reports whether they are valid */
+      __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+      {
+        const LBBox3fa segment = linearBounds(i,itime);
+        bbox = segment.bounds0; // use bounding box of first timestep to build BVH
+        return isvalid_non_empty(segment);
+      }
+
+      /*! calculates the linear bounds of primitive primID for the time range dt */
+      __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+        return LBBox3fa([&] (size_t step) { return bounds(primID, step); }, dt, time_range, fnumTimeSegments);
+      }
+
+      /*! like linearBounds(i,time_range), but returns false and leaves bbox untouched if valid() fails for the range */
+      __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const  {
+        const bool allValid = valid(i, timeSegmentRange(time_range));
+        if (allValid) bbox = linearBounds(i, time_range);
+        return allValid;
+      }
+
+      /* gets version info of topology (the primitive count serves as version) */
+      unsigned int getTopologyVersion() const {
+        return numPrimitives;
+      }
+
+      /* returns true if the primitive count differs from otherVersion */
+      bool topologyChanged(unsigned int otherVersion) const {
+        return otherVersion != numPrimitives;
+      }
+
+  public:
+
+      /*! Intersects a single ray with primitive primID by invoking the user intersect callback. */
+      __forceinline void intersect (RayHit& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report)
+      {
+        assert(primID < size());
+        assert(intersectorN.intersect);
+
+        int allActive = -1; // valid mask entry of the single ray
+        IntersectFunctionNArguments fnArgs;
+        fnArgs.valid = &allActive;
+        fnArgs.N = 1;
+        fnArgs.rayhit = (RTCRayHitN*)&ray;
+        fnArgs.geomID = geomID;
+        fnArgs.primID = primID;
+        fnArgs.geometryUserPtr = userPtr;
+        fnArgs.context = context->user;
+        fnArgs.internal_context = context;
+        fnArgs.geometry = this;
+        fnArgs.report = report;
+
+        intersectorN.intersect(&fnArgs);
+      }
+
+      /*! Tests a single ray for occlusion by primitive primID by invoking the user occluded callback. */
+      __forceinline void occluded (Ray& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+      {
+        assert(primID < size());
+        assert(intersectorN.occluded);
+
+        int allActive = -1; // valid mask entry of the single ray
+        OccludedFunctionNArguments fnArgs;
+        fnArgs.valid = &allActive;
+        fnArgs.N = 1;
+        fnArgs.ray = (RTCRayN*)&ray;
+        fnArgs.geomID = geomID;
+        fnArgs.primID = primID;
+        fnArgs.geometryUserPtr = userPtr;
+        fnArgs.context = context->user;
+        fnArgs.internal_context = context;
+        fnArgs.geometry = this;
+        fnArgs.report = report;
+
+        intersectorN.occluded(&fnArgs);
+      }
+
+      /*! Intersects a packet of K rays with primitive primID by invoking the user intersect callback. */
+      template<int K>
+        __forceinline void intersect (const vbool<K>& valid, RayHitK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportIntersectionFunc report)
+      {
+        assert(primID < size());
+        assert(intersectorN.intersect);
+
+        vint<K> laneMask = valid.mask32(); // one int per lane, passed on as the valid array
+        IntersectFunctionNArguments fnArgs;
+        fnArgs.valid = (int*)&laneMask;
+        fnArgs.N = K;
+        fnArgs.rayhit = (RTCRayHitN*)&ray;
+        fnArgs.geomID = geomID;
+        fnArgs.primID = primID;
+        fnArgs.geometryUserPtr = userPtr;
+        fnArgs.context = context->user;
+        fnArgs.internal_context = context;
+        fnArgs.geometry = this;
+        fnArgs.report = report;
+
+        intersectorN.intersect(&fnArgs);
+      }
+
+      /*! Tests a packet of K rays for occlusion by primitive primID by invoking the user occluded callback. */
+      template<int K>
+        __forceinline void occluded (const vbool<K>& valid, RayK<K>& ray, unsigned int geomID, unsigned int primID, IntersectContext* context, ReportOcclusionFunc report)
+      {
+        assert(primID < size());
+        assert(intersectorN.occluded);
+
+        vint<K> laneMask = valid.mask32(); // one int per lane, passed on as the valid array
+        OccludedFunctionNArguments fnArgs;
+        fnArgs.valid = (int*)&laneMask;
+        fnArgs.N = K;
+        fnArgs.ray = (RTCRayN*)&ray;
+        fnArgs.geomID = geomID;
+        fnArgs.primID = primID;
+        fnArgs.geometryUserPtr = userPtr;
+        fnArgs.context = context->user;
+        fnArgs.internal_context = context;
+        fnArgs.geometry = this;
+        fnArgs.report = report;
+
+        intersectorN.occluded(&fnArgs);
+      }
+
+    public:
+      RTCBoundsFunction boundsFunc;
+      IntersectorN intersectorN;
+  };
+  
+  /*! defines symbol(), returning an IntersectorN built from intersector::intersect and intersector::occluded, named TOSTRING(isa) "::" TOSTRING(symbol) */
+#define DEFINE_SET_INTERSECTORN(symbol,intersector)                              \
+  AccelSet::IntersectorN symbol() {                                              \
+    return AccelSet::IntersectorN(intersector::intersect, intersector::occluded, \
+                                  TOSTRING(isa) "::" TOSTRING(symbol));          \
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/alloc.cpp b/thirdparty/embree-aarch64/kernels/common/alloc.cpp
new file mode 100644
index 0000000000..6fa406f03a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/alloc.cpp
@@ -0,0 +1,82 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "alloc.h"
+#include "../../common/sys/thread.h"
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include "../../common/sys/barrier.h"
+#endif
+
+namespace embree
+{
+  __thread FastAllocator::ThreadLocal2* FastAllocator::thread_local_allocator2 = nullptr; // per-thread ThreadLocal2, created on first use by threadLocal2()
+  SpinLock FastAllocator::s_thread_local_allocators_lock; // taken while appending to s_thread_local_allocators
+  std::vector<std::unique_ptr<FastAllocator::ThreadLocal2>> FastAllocator::s_thread_local_allocators; // owns every ThreadLocal2 created by threadLocal2()
+   
+  struct fast_allocator_regression_test : public RegressionTest
+  {
+    BarrierSys barrier;                   //!< synchronizes the worker threads with run()
+    std::atomic<size_t> numFailed;        //!< number of read-back mismatches counted by the workers
+    std::unique_ptr<FastAllocator> alloc; //!< allocator under test, only alive during run()
+
+    fast_allocator_regression_test()
+      : RegressionTest("fast_allocator_regression_test"), numFailed(0)
+    {
+      registerRegressionTest(this);
+    }
+
+    /*! worker: per round, allocates 1000 small blocks, tags each with a distinct value and verifies the tags afterwards */
+    static void thread_alloc(fast_allocator_regression_test* This)
+    {
+      FastAllocator::CachedAllocator cached = This->alloc->getCachedAllocator();
+      const size_t tagBase = size_t(cached.talloc0); // address of this thread's first local allocator
+      size_t* blocks[1000];
+      for (size_t round=0; round<1000; round++) // round count has to match the loop in run()
+      {
+        This->barrier.wait();
+        for (size_t k=0; k<1000; k++) {
+          blocks[k] = (size_t*) cached.malloc0(sizeof(size_t)+(k%32));
+          *blocks[k] = tagBase + k;
+        }
+        for (size_t k=0; k<1000; k++)
+          if (*blocks[k] != tagBase + k) This->numFailed++;
+        This->barrier.wait();
+      }
+    }
+
+    /*! runs 1000 reset/allocate rounds on all logical threads; passes when no mismatch was counted */
+    bool run ()
+    {
+      numFailed.store(0);
+      alloc = make_unique(new FastAllocator(nullptr,false));
+
+      const size_t numThreads = getNumberOfLogicalThreads();
+      barrier.init(numThreads+1); // all workers plus the thread executing run()
+
+      /* create threads */
+      std::vector<thread_t> threads;
+      for (size_t t=0; t<numThreads; t++)
+        threads.push_back(createThread((thread_func)thread_alloc,this));
+
+      /* run test: reset the allocator, release the workers, wait until they finished the round */
+      for (size_t round=0; round<1000; round++)
+      {
+        alloc->reset();
+        barrier.wait();
+        barrier.wait();
+      }
+
+      /* destroy threads */
+      for (size_t t=0; t<numThreads; t++)
+        join(threads[t]);
+
+      alloc.reset();
+
+      return numFailed == 0;
+    }
+  };
+
+  fast_allocator_regression_test fast_allocator_regression; // global instance; its constructor calls registerRegressionTest(this)
+}
+
+
diff --git a/thirdparty/embree-aarch64/kernels/common/alloc.h b/thirdparty/embree-aarch64/kernels/common/alloc.h
new file mode 100644
index 0000000000..488fa707ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/alloc.h
@@ -0,0 +1,1006 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "primref.h"
+
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include <mutex>
+#endif
+
+namespace embree
+{
+  class FastAllocator
+  {
+    /*! maximum supported alignment */
+    static const size_t maxAlignment = 64;
+
+    /*! maximum allocation size: 2MB minus maxAlignment, provided by the maxAllocationSize macro below */
+
+    /* default settings */
+    //static const size_t defaultBlockSize = 4096;
+#define maxAllocationSize size_t(2*1024*1024-maxAlignment)
+
+    static const size_t MAX_THREAD_USED_BLOCK_SLOTS = 8; //!< number of slots the constructor initializes in threadUsedBlocks, threadBlocks and slotMutex
+
+  public:
+
+    struct ThreadLocal2;
+    enum AllocationType { ALIGNED_MALLOC, EMBREE_OS_MALLOC, SHARED, ANY_TYPE };
+
+    /*! Per thread structure holding the current memory block. */
+    struct __aligned(64) ThreadLocal
+    {
+      ALIGNED_CLASS_(64);
+    public:
+
+      /*! Constructor; parent is the ThreadLocal2 that contains this allocator. Starts without a memory block. */
+      __forceinline ThreadLocal (ThreadLocal2* parent) 
+	: parent(parent), ptr(nullptr), cur(0), end(0), allocBlockSize(0), bytesUsed(0), bytesWasted(0) {}
+
+      /*! resets block pointer and statistics; allocBlockSize becomes alloc->defaultBlockSize, or 0 if alloc is null */
+      void init(FastAllocator* alloc) 
+      {
+        ptr = nullptr;
+	cur = end = 0;
+        bytesUsed = 0;
+        bytesWasted = 0;
+        allocBlockSize = 0;
+        if (alloc) allocBlockSize = alloc->defaultBlockSize;
+      }
+
+      /* Allocates bytes with the given alignment (at most maxAlignment) from this thread's memory block, fetching new blocks from alloc when needed. */
+      __forceinline void* malloc(FastAllocator* alloc, size_t bytes, size_t align = 16) 
+      {
+        /* bind the thread local allocator to the proper FastAllocator*/
+        parent->bind(alloc);
+
+        assert(align <= maxAlignment);
+	bytesUsed += bytes;
+
+        /* try to allocate in local block; ofs is the padding that aligns cur, and cur is rolled back on failure */
+	size_t ofs = (align - cur) & (align-1);
+        cur += bytes + ofs;
+        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
+	cur -= bytes + ofs;
+        
+        /* if allocation is too large (more than a quarter of allocBlockSize) allocate with parent allocator */
+        if (4*bytes > allocBlockSize) {
+          return alloc->malloc(bytes,maxAlignment,false);
+	}
+
+        /* get new partial block if allocation failed; the unused rest of the old block counts as wasted */
+        size_t blockSize = allocBlockSize;
+        ptr = (char*) alloc->malloc(blockSize,maxAlignment,true);
+ 	bytesWasted += end-cur;
+	cur = 0; end = blockSize;
+
+        /* retry allocation */
+	ofs = (align - cur) & (align-1);
+        cur += bytes + ofs;
+        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
+	cur -= bytes + ofs;
+
+        /* get new full block if allocation failed; again the rest of the previous block counts as wasted */
+        blockSize = allocBlockSize;
+        ptr = (char*) alloc->malloc(blockSize,maxAlignment,false);
+	bytesWasted += end-cur;
+	cur = 0; end = blockSize;
+
+        /* retry allocation */
+	ofs = (align - cur) & (align-1);
+        cur += bytes + ofs;
+        if (likely(cur <= end)) { bytesWasted += ofs; return &ptr[cur - bytes]; }
+	cur -= bytes + ofs;
+
+        /* should never happen as large allocations get handled specially above */
+        assert(false);
+        return nullptr;
+      }
+
+      
+      /*! returns amount of used bytes (sum of all requested sizes since the last init) */
+      __forceinline size_t getUsedBytes() const { return bytesUsed; }
+  
+      /*! returns amount of free bytes (what is left of the current block) */
+      __forceinline size_t getFreeBytes() const { return end-cur; }
+      
+      /*! returns amount of wasted bytes (alignment padding plus abandoned block remainders) */
+      __forceinline size_t getWastedBytes() const { return bytesWasted; }
+  
+    private:
+      ThreadLocal2* parent;
+      char*  ptr;            //!< pointer to memory block
+      size_t cur;            //!< current location of the allocator
+      size_t end;            //!< end of the memory block
+      size_t allocBlockSize; //!< block size for allocations
+      size_t bytesUsed;      //!< number of total bytes allocated
+      size_t bytesWasted;    //!< number of bytes wasted
+    };
+
+    /*! Two thread local structures. */
+    struct __aligned(64) ThreadLocal2
+    {
+      ALIGNED_CLASS_(64);
+    public:
+      /*! starts unbound; both ThreadLocal members get this object as their parent */
+      __forceinline ThreadLocal2()
+        : alloc(nullptr), alloc0(this), alloc1(this) {}
+
+      /*! bind to fast allocator: no-op if already bound to alloc_i; otherwise the statistics are added to the previous allocator (if any), both ThreadLocals are re-initialized and alloc_i->join(this) is called */
+      __forceinline void bind(FastAllocator* alloc_i) 
+      {
+        assert(alloc_i);
+        if (alloc.load() == alloc_i) return;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+        std::scoped_lock lock(mutex);
+#else
+        Lock<SpinLock> lock(mutex);
+#endif
+        //if (alloc.load() == alloc_i) return; // not required as only one thread calls bind
+        if (alloc.load()) {
+          alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
+          alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
+          alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
+        }
+        alloc0.init(alloc_i);
+        alloc1.init(alloc_i);
+        alloc.store(alloc_i);
+        alloc_i->join(this);
+      }
+
+      /*! unbind from fast allocator: only acts if currently bound to alloc_i; adds the statistics to it and resets both ThreadLocals to the unbound state */
+      void unbind(FastAllocator* alloc_i) 
+      {
+        assert(alloc_i);
+        if (alloc.load() != alloc_i) return;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+        std::scoped_lock lock(mutex);
+#else
+        Lock<SpinLock> lock(mutex);
+#endif
+        if (alloc.load() != alloc_i) return; // required as a different thread calls unbind
+        alloc.load()->bytesUsed   += alloc0.getUsedBytes()   + alloc1.getUsedBytes();
+        alloc.load()->bytesFree   += alloc0.getFreeBytes()   + alloc1.getFreeBytes();
+        alloc.load()->bytesWasted += alloc0.getWastedBytes() + alloc1.getWastedBytes();
+        alloc0.init(nullptr);
+        alloc1.init(nullptr);
+        alloc.store(nullptr);
+      }
+
+    public:
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      std::mutex mutex;
+#else
+      SpinLock mutex;        //!< required as unbind is called from other threads
+#endif
+      std::atomic<FastAllocator*> alloc;  //!< parent allocator
+      ThreadLocal alloc0;                 //!< first thread local allocator
+      ThreadLocal alloc1;                 //!< second thread local allocator
+    };
+
+    FastAllocator (Device* device, bool osAllocation) 
+      : device(device), slotMask(0), usedBlocks(nullptr), freeBlocks(nullptr), use_single_mode(false), defaultBlockSize(PAGE_SIZE), estimatedSize(0),
+        growSize(PAGE_SIZE), maxGrowSize(maxAllocationSize), log2_grow_size_scale(0), bytesUsed(0), bytesFree(0), bytesWasted(0), atype(osAllocation ? EMBREE_OS_MALLOC : ALIGNED_MALLOC),
+        primrefarray(device,0)
+    {
+      /* no per-slot blocks yet; the slot mutexes are expected to start unlocked */
+      for (size_t slot=0; slot<MAX_THREAD_USED_BLOCK_SLOTS; slot++) {
+        assert(!slotMutex[slot].isLocked());
+        threadBlocks[slot] = nullptr;
+        threadUsedBlocks[slot] = nullptr;
+      }
+    }
+
+    ~FastAllocator () { // teardown is delegated to clear()
+      clear();
+    }
+
+    /*! returns the device attached to this allocator */
+    Device* getDevice() {
+      return device; // the pointer passed to the constructor; the regression test in alloc.cpp passes nullptr
+    }
+
+    void share(mvector<PrimRef>& primrefarray_i) {
+      primrefarray = std::move(primrefarray_i); // takes over the caller's array; primrefarray_i is left moved-from
+    }
+
+    void unshare(mvector<PrimRef>& primrefarray_o)
+    {
+      reset(); // this removes blocks that are allocated inside the shared primref array
+      primrefarray_o = std::move(primrefarray); // hands the array stored by share() back to the caller
+    }
+
+    /*! returns first fast thread local allocator */
+    __forceinline ThreadLocal* _threadLocal() {
+      return &threadLocal2()->alloc0; // threadLocal2() creates the calling thread's ThreadLocal2 on first use
+    }
+
+    void setOSallocation(bool flag)
+    {
+      if (flag) atype = EMBREE_OS_MALLOC; else atype = ALIGNED_MALLOC; // same choice the constructor makes from osAllocation
+    }
+
+  private:
+
+    /*! returns both fast thread local allocators */
+    __forceinline ThreadLocal2* threadLocal2()
+    {
+      ThreadLocal2* cached = thread_local_allocator2;
+      if (cached != nullptr) return cached;
+      /* first call on this thread: create the pair and hand ownership to the static list */
+      thread_local_allocator2 = cached = new ThreadLocal2;
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      std::scoped_lock lock(s_thread_local_allocators_lock);
+#else
+      Lock<SpinLock> lock(s_thread_local_allocators_lock);
+#endif
+      s_thread_local_allocators.push_back(make_unique(cached));
+      return cached;
+    }
+
+  public:
+
+    /*! registers alloc in thread_local_allocators, which cleanup() and reset() later unbind and clear */
+    __forceinline void join(ThreadLocal2* alloc) {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      std::scoped_lock lock(thread_local_allocators_lock); // per-instance lock guarding the vector below, not the static s_ lock
+#else
+      Lock<SpinLock> lock(thread_local_allocators_lock);
+#endif
+      thread_local_allocators.push_back(alloc);
+    }
+
+  public:
+
+    /*! Bundles an allocator with pointers to two thread local allocators. */
+    struct CachedAllocator
+    {
+      /*! constructs an empty cached allocator; asserts that nullptr was passed */
+      __forceinline CachedAllocator(void* ptr)
+        : alloc(nullptr), talloc0(nullptr), talloc1(nullptr)
+      {
+        assert(ptr == nullptr);
+      }
+
+      /*! binds to alloc; in single mode malloc1 also goes through talloc->alloc0 */
+      __forceinline CachedAllocator(FastAllocator* alloc, ThreadLocal2* talloc)
+        : alloc(alloc), talloc0(&talloc->alloc0), talloc1(&talloc->alloc1)
+      { if (alloc->use_single_mode) talloc1 = talloc0; }
+
+      /*! true if an allocator is attached */
+      __forceinline operator bool () const { return alloc != nullptr; }
+
+      /*! allocates through talloc0, same as malloc0 */
+      __forceinline void* operator() (size_t bytes, size_t align = 16) const { return malloc0(bytes,align); }
+
+      /*! allocates through talloc0 */
+      __forceinline void* malloc0 (size_t bytes, size_t align = 16) const { return talloc0->malloc(alloc,bytes,align); }
+
+      /*! allocates through talloc1 */
+      __forceinline void* malloc1 (size_t bytes, size_t align = 16) const { return talloc1->malloc(alloc,bytes,align); }
+
+    public:
+      FastAllocator* alloc;
+      ThreadLocal* talloc0;
+      ThreadLocal* talloc1;
+    };
+
+    /*! builds a CachedAllocator from this allocator and the calling thread's ThreadLocal2 */
+    __forceinline CachedAllocator getCachedAllocator()
+    { ThreadLocal2* const tl = threadLocal2(); return CachedAllocator(this,tl); }
+
+    /*! Functor for builders: each call returns allocator->getCachedAllocator(). */
+    struct Create
+    {
+    public:
+      /*! remembers the allocator the calls are forwarded to */
+      __forceinline Create (FastAllocator* allocator) : allocator(allocator) {}
+      __forceinline CachedAllocator operator() () const { return (*allocator).getCachedAllocator(); }
+    private:
+      FastAllocator* allocator; //!< allocator the cached allocators are requested from
+    };
+
+    /*! prepends every block of the per-slot threadBlocks lists to usedBlocks and empties those lists */
+    void internal_fix_used_blocks()
+    {
+      for (size_t slot = 0; slot < MAX_THREAD_USED_BLOCK_SLOTS; slot++) {
+        Block* cur = threadBlocks[slot].load();
+        while (cur != nullptr) {
+          Block* const following = cur->next;
+          cur->next = usedBlocks.load();
+          usedBlocks = cur;
+          threadBlocks[slot] = cur = following;
+        }
+        threadBlocks[slot] = nullptr;
+      }
+    }
+
+    static const size_t threadLocalAllocOverhead = 20; //! 20 means 5% parallel allocation overhead through unfilled thread local blocks
+#if defined(__AVX512ER__) // KNL
+    static const size_t mainAllocOverheadStatic  = 15;  //! 15 means ~6.7% allocation overhead through unfilled main alloc blocks
+#else
+    static const size_t mainAllocOverheadStatic  = 20;  //! 20 means 5% allocation overhead through unfilled main alloc blocks
+#endif
+    static const size_t mainAllocOverheadDynamic = 8;  //! 8 means 12.5% allocation overhead through unfilled main alloc blocks
+
+    /*! Calculates a single threaded threshold for the builders such that
+     *  for small scenes the overhead of partly allocated blocks per thread
+     *  is low; returns defaultThreshold when no limiting is required. */
+    size_t fixSingleThreadThreshold(size_t branchingFactor, size_t defaultThreshold, size_t numPrimitives, size_t bytesEstimated)
+    {
+      /* without an estimate there is nothing to tune */
+      if (numPrimitives == 0 || bytesEstimated == 0)
+        return defaultThreshold;
+
+      /* bytes per thread that fulfill the threadLocalAllocOverhead constraint */
+      const size_t numThreadLocalAllocs = use_single_mode ? 1 : 2;
+      const size_t bytesPerThread = numThreadLocalAllocs*threadLocalAllocOverhead*defaultBlockSize;
+      /* estimated bytes divided by the per-thread bytes, rounded up */
+      const size_t feedableThreads = (bytesEstimated+(bytesPerThread-1))/bytesPerThread;
+
+      /* if we do not have to limit the number of threads use the optimal threshold */
+      if (feedableThreads >= TaskScheduler::threadCount())
+        return defaultThreshold;
+
+      /* otherwise limit the number of threads through a proper single thread threshold */
+      const double bytesPerPrimitive = double(bytesEstimated)/double(numPrimitives);
+      return size_t(ceil(branchingFactor*bytesPerThread/bytesPerPrimitive));
+    }
+
+    /*! rounds i up to the next multiple of 128 */
+    __forceinline size_t alignSize(size_t i)
+    { const size_t granularity = 128; return (i+(granularity-1))/granularity*granularity; }
+
+    /*! initializes growSize, maxGrowSize, slotMask, defaultBlockSize and use_single_mode from the size estimate; fast selects mainAllocOverheadDynamic instead of mainAllocOverheadStatic */
+    __forceinline void initGrowSizeAndNumSlots(size_t bytesEstimated, bool fast) 
+    {
+      /* we do not need single thread local allocator mode */
+      use_single_mode = false;
+     
+      /* calculate growSize such that at most mainAllocationOverhead gets wasted when a block stays unused */
+      size_t mainAllocOverhead = fast ? mainAllocOverheadDynamic : mainAllocOverheadStatic;
+      size_t blockSize = alignSize(bytesEstimated/mainAllocOverhead);
+      growSize = maxGrowSize = clamp(blockSize,size_t(1024),maxAllocationSize);
+
+      /* if we reached the maxAllocationSize for growSize, we can
+       * increase the number of allocation slots while still guaranteeing
+       * the mainAllocationOverhead */
+      slotMask = 0x0;
+
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 2 && bytesEstimated > 2*mainAllocOverhead*growSize) slotMask = 0x1;
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 4 && bytesEstimated > 4*mainAllocOverhead*growSize) slotMask = 0x3;
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 8*mainAllocOverhead*growSize) slotMask = 0x7;
+      if (MAX_THREAD_USED_BLOCK_SLOTS >= 8 && bytesEstimated > 16*mainAllocOverhead*growSize) { growSize *= 2; } /* if the overhead is tiny, double the growSize */
+
+      /* set the thread local alloc block size */
+      size_t defaultBlockSizeSwitch = PAGE_SIZE+maxAlignment;
+      
+      /* for sufficiently large scene we can increase the defaultBlockSize over the defaultBlockSizeSwitch size */
+#if 0 // we do not do this as a block size of 4160 is for some reason best for KNL
+      const size_t threadCount = TaskScheduler::threadCount();
+      const size_t single_mode_factor = use_single_mode ? 1 : 2;
+      const size_t singleThreadBytes = single_mode_factor*threadLocalAllocOverhead*defaultBlockSizeSwitch;
+      if (bytesEstimated+(singleThreadBytes-1))/singleThreadBytes >= threadCount)
+        defaultBlockSize = min(max(defaultBlockSizeSwitch,bytesEstimated/(single_mode_factor*threadLocalAllocOverhead*threadCount)),growSize);
+
+      /* otherwise we grow the defaultBlockSize up to defaultBlockSizeSwitch */
+        else
+#endif
+        defaultBlockSize = clamp(blockSize,size_t(1024),defaultBlockSizeSwitch);
+
+      if (bytesEstimated == 0) {
+        maxGrowSize = maxAllocationSize; // special mode if builder cannot estimate tree size
+        defaultBlockSize = defaultBlockSizeSwitch;
+      }
+      log2_grow_size_scale = 0;
+      /* settings stored in the device override the values computed above */
+      if (device->alloc_main_block_size != 0) growSize = device->alloc_main_block_size;
+      if (device->alloc_num_main_slots >= 1 ) slotMask = 0x0;
+      if (device->alloc_num_main_slots >= 2 ) slotMask = 0x1;
+      if (device->alloc_num_main_slots >= 4 ) slotMask = 0x3;
+      if (device->alloc_num_main_slots >= 8 ) slotMask = 0x7;
+      if (device->alloc_thread_block_size != 0) defaultBlockSize = device->alloc_thread_block_size;
+      if (device->alloc_single_thread_alloc != -1) use_single_mode = device->alloc_single_thread_alloc;
+    }
+
+    /*! initializes the allocator: recycles existing blocks through reset(), otherwise creates the first free block and derives the grow sizes */
+    void init(size_t bytesAllocate, size_t bytesReserve, size_t bytesEstimate)
+    {
+      internal_fix_used_blocks();
+      slotMask = MAX_THREAD_USED_BLOCK_SLOTS-1; // distribute the allocation to multiple thread block slots; FIXME: remove
+      /* blocks already exist: reset them for reuse and leave everything else untouched */
+      if (usedBlocks.load() != nullptr || freeBlocks.load() != nullptr) { reset(); return; }
+      const size_t reserve = (bytesReserve == 0) ? bytesAllocate : bytesReserve;
+      freeBlocks = Block::create(device,bytesAllocate,reserve,nullptr,atype);
+      estimatedSize = bytesEstimate;
+      initGrowSizeAndNumSlots(bytesEstimate,true);
+    }
+
+    /*! Initializes the allocator from a size estimate only; no block is
+     *  created here. Existing blocks are recycled through reset(). */
+    void init_estimate(size_t bytesEstimate)
+    {
+      internal_fix_used_blocks();
+      const bool hasBlocks = usedBlocks.load() != nullptr || freeBlocks.load() != nullptr;
+      if (hasBlocks) { reset(); return; }
+
+      estimatedSize = bytesEstimate;
+      initGrowSizeAndNumSlots(bytesEstimate,false);
+    }
+
+    /*! Frees state not required after build: folds the per-slot block lists
+     *  into usedBlocks and unbinds all joined thread local allocators. */
+    __forceinline void cleanup()
+    {
+      internal_fix_used_blocks();
+      for (ThreadLocal2* joined : thread_local_allocators)
+        joined->unbind(this);
+      thread_local_allocators.clear();
+    }
+
+    /*! resets the allocator, memory blocks get reused; shared blocks are unlinked and thread local allocators unbound */
+    void reset ()
+    {
+      internal_fix_used_blocks();
+
+      bytesUsed.store(0);
+      bytesFree.store(0);
+      bytesWasted.store(0);
+
+      /* pop each used block, reset it, and push it to the front of the free block list */
+      for (Block* block = usedBlocks.load(); block != nullptr; block = usedBlocks.load())
+      {
+        block->reset_block();
+        Block* const rest = block->next;
+        block->next = freeBlocks.load();
+        freeBlocks = block;
+        usedBlocks = rest;
+      }
+
+      /* shared blocks are re-added during build, so unlink them here */
+      freeBlocks.store(Block::remove_shared_blocks(freeBlocks.load()));
+
+      for (size_t slot=0; slot<MAX_THREAD_USED_BLOCK_SLOTS; slot++) {
+        threadUsedBlocks[slot] = nullptr;
+        threadBlocks[slot] = nullptr;
+      }
+
+      /* unbind every joined thread local allocator */
+      for (ThreadLocal2* joined : thread_local_allocators) joined->unbind(this);
+      thread_local_allocators.clear();
+    }
+
+    /*! frees all allocated memory: runs cleanup(), zeroes the counters, clears both block lists and the primref array */
+    __forceinline void clear()
+    {
+      cleanup();
+      bytesUsed.store(0); bytesFree.store(0); bytesWasted.store(0);
+
+      /* the list heads are set to nullptr unconditionally, whether or not a list existed */
+      if (Block* used = usedBlocks.load()) used->clear_list(device);
+      usedBlocks = nullptr;
+      if (Block* unused = freeBlocks.load()) unused->clear_list(device);
+      freeBlocks = nullptr;
+
+      for (size_t slot=0; slot<MAX_THREAD_USED_BLOCK_SLOTS; slot++) { threadUsedBlocks[slot] = nullptr; threadBlocks[slot] = nullptr; }
+      primrefarray.clear();
+    }
+
+    /*! atomically increments log2_grow_size_scale and returns 2^(new value), with the exponent capped at 16 */
+    __forceinline size_t incGrowSizeScale() {
+      const size_t shift = min(size_t(16),log2_grow_size_scale.fetch_add(1)+1);
+      return size_t(1) << shift;
+    }
+
+    /*! thread safe allocation of memory; loops until the current block of the calling thread's slot can serve the request, throws if bytes exceeds maxAllocationSize */
+    void* malloc(size_t& bytes, size_t align, bool partial)
+    {
+      assert(align <= maxAlignment);
+
+      while (true)
+      {
+        /* allocate using current block */
+        size_t threadID = TaskScheduler::threadID();
+        size_t slot = threadID & slotMask;
+	Block* myUsedBlocks = threadUsedBlocks[slot];
+        if (myUsedBlocks) {
+          void* ptr = myUsedBlocks->malloc(device,bytes,align,partial);
+          if (ptr) return ptr;
+        }
+
+        /* throw error if allocation is too large */
+        if (bytes > maxAllocationSize)
+          throw_RTCError(RTC_ERROR_UNKNOWN,"allocation is too large");
+
+        /* parallel block creation in case of no freeBlocks, avoids single global mutex */
+        if (likely(freeBlocks.load() == nullptr))
+        {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+          std::scoped_lock lock(slotMutex[slot]);
+#else
+          Lock<SpinLock> lock(slotMutex[slot]);
+#endif
+          if (myUsedBlocks == threadUsedBlocks[slot]) { // only if the slot's block was not replaced in the meantime
+            const size_t alignedBytes = (bytes+(align-1)) & ~(align-1);
+            const size_t allocSize = max(min(growSize,maxGrowSize),alignedBytes);
+            assert(allocSize >= bytes);
+            threadBlocks[slot] = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,threadBlocks[slot],atype); // FIXME: a large allocation might throw away a block here!
+            // FIXME: a direct allocation should allocate inside the block here, and not in the next loop! a different thread could do some allocation and make the large allocation fail.
+          }
+          continue; // retry the allocation from the top of the loop
+        }
+
+        /* otherwise take the first free block, or create a new block if the free list became empty, under the global mutex */
+        {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+            std::scoped_lock lock(mutex);
+#else
+            Lock<SpinLock> lock(mutex);
+#endif
+	  if (myUsedBlocks == threadUsedBlocks[slot])
+	  {
+            if (freeBlocks.load() != nullptr) {
+	      Block* nextFreeBlock = freeBlocks.load()->next;
+	      freeBlocks.load()->next = usedBlocks;
+	      __memory_barrier();
+	      usedBlocks = freeBlocks.load();
+              threadUsedBlocks[slot] = freeBlocks.load();
+	      freeBlocks = nextFreeBlock;
+	    } else {
+              const size_t allocSize = min(growSize*incGrowSizeScale(),maxGrowSize);
+	      usedBlocks = threadUsedBlocks[slot] = Block::create(device,allocSize,allocSize,usedBlocks,atype); // FIXME: a large allocation should get delivered directly, like above!
+	    }
+          }
+        }
+      }
+    }
+
+    /*! adds the memory region [ptr,ptr+bytes) as SHARED block to the front of the free list; regions below 4096 bytes after alignment are ignored */
+    void addBlock(void* ptr, ssize_t bytes)
+    {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+      std::scoped_lock lock(mutex);
+#else
+      Lock<SpinLock> lock(mutex);
+#endif
+      const size_t alignedBegin = (((size_t)ptr)+maxAlignment-1) & ~(maxAlignment-1);
+      const size_t ofs = alignedBegin - (size_t) ptr; // bytes skipped to reach maxAlignment
+      bytes -= ofs;
+      if (bytes < 4096) return; // ignore empty or very small blocks
+      const size_t usable = bytes - offsetof(Block,data[0]);
+      freeBlocks = new ((void*) alignedBegin) Block(SHARED,usable,usable,freeBlocks,ofs);
+    }
+
+    /*! special allocation only used from morton builder only a single time for each build: returns ptr() of the first free block */
+    void* specialAlloc(size_t bytes) {
+      Block* const head = freeBlocks.load();
+      assert(head != nullptr && head->getBlockAllocatedBytes() >= bytes);
+      return head->ptr();
+    }
+
+    /*! Byte counters (used/free/wasted) gathered by walking the block lists of an allocator. */
+    struct Statistics
+    {
+      Statistics ()
+      : bytesUsed(0), bytesFree(0), bytesWasted(0) {}
+
+      Statistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted)
+      : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted) {}
+
+      /*! sums up the blocks of alloc that match atype (and huge_pages) */
+      Statistics (FastAllocator* alloc, AllocationType atype, bool huge_pages = false)
+      : bytesUsed(0), bytesFree(0), bytesWasted(0)
+      {
+        if (Block* const used = alloc->usedBlocks.load()) {
+          bytesUsed   += used->getUsedBytes(atype,huge_pages);
+          bytesFree   += used->getFreeBytes(atype,huge_pages);
+          bytesWasted += used->getWastedBytes(atype,huge_pages);
+        }
+        if (Block* const unused = alloc->freeBlocks.load()) {
+          bytesFree   += unused->getAllocatedBytes(atype,huge_pages);
+          bytesWasted += unused->getWastedBytes(atype,huge_pages);
+        }
+      }
+
+      /*! formats the counters in MB followed by the total bytes per primitive */
+      std::string str(size_t numPrimitives) {
+        const size_t total = bytesAllocatedTotal();
+        std::stringstream out;
+        out.setf(std::ios::fixed, std::ios::floatfield);
+        out << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, ";
+        out << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, ";
+        out << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, ";
+        out << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*total << " MB, ";
+        out << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(total)/double(numPrimitives);
+        return out.str();
+      }
+
+      friend Statistics operator+ ( const Statistics& a, const Statistics& b) {
+        return Statistics(a.bytesUsed+b.bytesUsed, a.bytesFree+b.bytesFree, a.bytesWasted+b.bytesWasted);
+      }
+
+      size_t bytesAllocatedTotal() const { return bytesUsed + bytesFree + bytesWasted; }
+
+    public:
+      size_t bytesUsed;
+      size_t bytesFree;
+      size_t bytesWasted;
+    };
+
+    /*! collects block statistics of this allocator for the given allocation type */
+    Statistics getStatistics(AllocationType atype, bool huge_pages = false)
+    { Statistics stats(this,atype,huge_pages); return stats; }
+
+    /*! current value of the bytesUsed counter */
+    size_t getUsedBytes()
+    { return bytesUsed.load(); }
+
+    /*! current value of the bytesWasted counter */
+    size_t getWastedBytes()
+    { return bytesWasted.load(); }
+
+    /*! Combines the allocator's own byte counters with block list
+     *  statistics gathered separately per allocation type. */
+    struct AllStatistics
+    {
+      /*! reads the counters of alloc and walks its block lists once per allocation type */
+      AllStatistics (FastAllocator* alloc)
+      : bytesUsed(alloc->bytesUsed),
+        bytesFree(alloc->bytesFree),
+        bytesWasted(alloc->bytesWasted),
+        stat_all(alloc,ANY_TYPE),
+        stat_malloc(alloc,ALIGNED_MALLOC),
+        stat_4K(alloc,EMBREE_OS_MALLOC,false),
+        stat_2M(alloc,EMBREE_OS_MALLOC,true),
+        stat_shared(alloc,SHARED) {}
+
+      /*! member-wise construction, used by operator+ */
+      AllStatistics (size_t bytesUsed, size_t bytesFree, size_t bytesWasted,
+                     Statistics stat_all, Statistics stat_malloc,
+                     Statistics stat_4K, Statistics stat_2M, Statistics stat_shared)
+      : bytesUsed(bytesUsed), bytesFree(bytesFree), bytesWasted(bytesWasted),
+        stat_all(stat_all), stat_malloc(stat_malloc),
+        stat_4K(stat_4K), stat_2M(stat_2M), stat_shared(stat_shared) {}
+
+      /*! member-wise sum */
+      friend AllStatistics operator+ (const AllStatistics& a, const AllStatistics& b)
+      {
+        return AllStatistics(a.bytesUsed+b.bytesUsed,
+                             a.bytesFree+b.bytesFree,
+                             a.bytesWasted+b.bytesWasted,
+                             a.stat_all + b.stat_all,
+                             a.stat_malloc + b.stat_malloc,
+                             a.stat_4K + b.stat_4K,
+                             a.stat_2M + b.stat_2M,
+                             a.stat_shared + b.stat_shared);
+      }
+
+      /*! Prints two summary lines for the counters followed by one line
+       *  per statistics member to std::cout. */
+      void print(size_t numPrimitives)
+      {
+        const size_t bytesTotal = bytesUsed+bytesFree+bytesWasted;
+
+        /* first line: used bytes only */
+        std::stringstream usedLine;
+        usedLine.setf(std::ios::fixed, std::ios::floatfield);
+        usedLine << "  alloc : ";
+        usedLine << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, ";
+        usedLine << "                                                            ";
+        usedLine << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesUsed)/double(numPrimitives);
+        std::cout << usedLine.str() << std::endl;
+
+        /* second line: used, free, wasted and their total */
+        std::stringstream fullLine;
+        fullLine.setf(std::ios::fixed, std::ios::floatfield);
+        fullLine << "  alloc : ";
+        fullLine << "used = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesUsed << " MB, ";
+        fullLine << "free = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesFree << " MB, ";
+        fullLine << "wasted = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesWasted << " MB, ";
+        fullLine << "total = " << std::setw(7) << std::setprecision(3) << 1E-6f*bytesTotal << " MB, ";
+        fullLine << "#bytes/prim = " << std::setw(6) << std::setprecision(2) << double(bytesTotal)/double(numPrimitives);
+        std::cout << fullLine.str() << std::endl;
+
+        /* block list statistics */
+        std::cout << "  total : " << stat_all.str(numPrimitives) << std::endl;
+        std::cout << "  4K    : " << stat_4K.str(numPrimitives) << std::endl;
+        std::cout << "  2M    : " << stat_2M.str(numPrimitives) << std::endl;
+        std::cout << "  malloc: " << stat_malloc.str(numPrimitives) << std::endl;
+        std::cout << "  shared: " << stat_shared.str(numPrimitives) << std::endl;
+      }
+
+    private:
+      size_t bytesUsed;       //!< from the allocator's bytesUsed counter
+      size_t bytesFree;       //!< from the allocator's bytesFree counter
+      size_t bytesWasted;     //!< from the allocator's bytesWasted counter
+      Statistics stat_all;    //!< blocks of any type
+      Statistics stat_malloc; //!< ALIGNED_MALLOC blocks
+      Statistics stat_4K;     //!< EMBREE_OS_MALLOC blocks without huge pages
+      Statistics stat_2M;     //!< EMBREE_OS_MALLOC blocks with huge pages
+      Statistics stat_shared; //!< SHARED blocks
+    };
+
+    /*! prints the sizing parameters and both block lists to std::cout */
+    void print_blocks() {
+      std::cout << "  estimatedSize = " << estimatedSize << ", slotMask = " << slotMask
+                << ", use_single_mode = " << use_single_mode << ", maxGrowSize = " << maxGrowSize
+                << ", defaultBlockSize = " << defaultBlockSize << std::endl;
+      std::cout << "  used blocks = ";
+      if (Block* used = usedBlocks.load()) used->print_list();
+      std::cout << "[END]" << std::endl;
+      std::cout << "  free blocks = ";
+      if (Block* unused = freeBlocks.load()) unused->print_list();
+      std::cout << "[END]" << std::endl;
+    }
+
+  private:
+
+    struct Block // block header; the memory handed out by malloc() starts at data
+    {
+      static Block* create(MemoryMonitorInterface* device, size_t bytesAllocate, size_t bytesReserve, Block* next, AllocationType atype)
+      {
+        /* We avoid using os_malloc for small blocks as this could
+         * cause a risk of fragmenting the virtual address space and
+         * reach the limit of vm.max_map_count = 65k under Linux. */
+        if (atype == EMBREE_OS_MALLOC && bytesAllocate < maxAllocationSize)
+          atype = ALIGNED_MALLOC;
+
+        /* we need to additionally allocate some header */
+        const size_t sizeof_Header = offsetof(Block,data[0]);
+        bytesAllocate = sizeof_Header+bytesAllocate;
+        bytesReserve  = sizeof_Header+bytesReserve;
+
+        /* consume full 4k pages when using os_malloc */
+        if (atype == EMBREE_OS_MALLOC) {
+          bytesAllocate = ((bytesAllocate+PAGE_SIZE-1) & ~(PAGE_SIZE-1));
+          bytesReserve  = ((bytesReserve +PAGE_SIZE-1) & ~(PAGE_SIZE-1));
+        }
+
+        /* either use alignedMalloc or os_malloc */
+        void *ptr = nullptr;
+        if (atype == ALIGNED_MALLOC)
+        {
+          /* special handling for default block size */
+          if (bytesAllocate == (2*PAGE_SIZE_2M))
+          {
+            const size_t alignment = maxAlignment;
+            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
+            ptr = alignedMalloc(bytesAllocate,alignment);
+
+            /* give hint to transparently convert these pages to 2MB pages */
+            const size_t ptr_aligned_begin = ((size_t)ptr) & ~size_t(PAGE_SIZE_2M-1);
+            os_advise((void*)(ptr_aligned_begin +              0),PAGE_SIZE_2M); // may fail if no memory mapped before block
+            os_advise((void*)(ptr_aligned_begin + 1*PAGE_SIZE_2M),PAGE_SIZE_2M);
+            os_advise((void*)(ptr_aligned_begin + 2*PAGE_SIZE_2M),PAGE_SIZE_2M); // may fail if no memory mapped after block
+
+            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
+          }
+          else
+          {
+            const size_t alignment = maxAlignment;
+            if (device) device->memoryMonitor(bytesAllocate+alignment,false);
+            ptr = alignedMalloc(bytesAllocate,alignment);
+            return new (ptr) Block(ALIGNED_MALLOC,bytesAllocate-sizeof_Header,bytesAllocate-sizeof_Header,next,alignment);
+          }
+        }
+        else if (atype == EMBREE_OS_MALLOC)
+        {
+          if (device) device->memoryMonitor(bytesAllocate,false);
+          bool huge_pages; ptr = os_malloc(bytesReserve,huge_pages);
+          return new (ptr) Block(EMBREE_OS_MALLOC,bytesAllocate-sizeof_Header,bytesReserve-sizeof_Header,next,0,huge_pages);
+        }
+        else
+          assert(false);
+
+        return NULL; // only reached for an allocation type other than the two handled above
+      }
+
+      Block (AllocationType atype, size_t bytesAllocate, size_t bytesReserve, Block* next, size_t wasted, bool huge_pages = false)
+      : cur(0), allocEnd(bytesAllocate), reserveEnd(bytesReserve), next(next), wasted(wasted), atype(atype), huge_pages(huge_pages)
+      {
+        assert((((size_t)&data[0]) & (maxAlignment-1)) == 0); // data has to start maxAlignment aligned
+      }
+
+      static Block* remove_shared_blocks(Block* head) // unlinks all SHARED blocks from the list (without freeing them) and returns the new head
+      {
+        Block** prev_next = &head;
+        for (Block* block = head; block; block = block->next) {
+          if (block->atype == SHARED) *prev_next = block->next;
+          else                         prev_next = &block->next;
+        }
+        return head;
+      }
+
+      void clear_list(MemoryMonitorInterface* device) // calls clear_block on this block and on all blocks following it
+      {
+        Block* block = this;
+        while (block) {
+          Block* next = block->next;
+          block->clear_block(device);
+          block = next;
+        }
+      }
+
+      void clear_block (MemoryMonitorInterface* device) // frees the memory of this block depending on its allocation type
+      {
+        const size_t sizeof_Header = offsetof(Block,data[0]);
+        const ssize_t sizeof_Alloced = wasted+sizeof_Header+getBlockAllocatedBytes();
+
+        if (atype == ALIGNED_MALLOC) {
+          alignedFree(this);
+          if (device) device->memoryMonitor(-sizeof_Alloced,true);
+        }
+
+        else if (atype == EMBREE_OS_MALLOC) {
+         size_t sizeof_This = sizeof_Header+reserveEnd;
+         os_free(this,sizeof_This,huge_pages);
+         if (device) device->memoryMonitor(-sizeof_Alloced,true);
+        }
+
+        else /* if (atype == SHARED) */ {
+        } // nothing is freed for shared blocks here
+      }
+
+      void* malloc(MemoryMonitorInterface* device, size_t& bytes_in, size_t align, bool partial)
+      {
+        size_t bytes = bytes_in;
+        assert(align <= maxAlignment);
+        bytes = (bytes+(align-1)) & ~(align-1);
+	if (unlikely(cur+bytes > reserveEnd && !partial)) return nullptr;
+	const size_t i = cur.fetch_add(bytes);
+        if (unlikely(i+bytes > reserveEnd && !partial)) return nullptr;
+        if (unlikely(i > reserveEnd)) return nullptr;
+        bytes_in = bytes = min(bytes,reserveEnd-i); // reports the possibly truncated size back to the caller
+        
+	if (i+bytes > allocEnd) {
+          if (device) device->memoryMonitor(i+bytes-max(i,allocEnd),true);
+        }
+	return &data[i];
+      }
+
+      void* ptr() {
+        return &data[cur]; // address of the current allocation position
+      }
+
+      void reset_block ()
+      {
+        allocEnd = max(allocEnd,(size_t)cur); // allocEnd never shrinks below what was handed out
+        cur = 0;
+      }
+
+      size_t getBlockUsedBytes() const {
+        return min(size_t(cur),reserveEnd);
+      }
+
+      size_t getBlockFreeBytes() const {
+	return getBlockAllocatedBytes() - getBlockUsedBytes();
+      }
+
+      size_t getBlockAllocatedBytes() const {
+        return min(max(allocEnd,size_t(cur)),reserveEnd);
+      }
+
+      size_t getBlockWastedBytes() const {
+        const size_t sizeof_Header = offsetof(Block,data[0]);
+        return sizeof_Header + wasted;
+      }
+
+      size_t getBlockReservedBytes() const {
+        return reserveEnd;
+      }
+  
+      bool hasType(AllocationType atype_i, bool huge_pages_i) const
+      {
+        if      (atype_i == ANY_TYPE ) return true;
+        else if (atype   == EMBREE_OS_MALLOC) return atype_i == atype && huge_pages_i == huge_pages;
+        else                           return atype_i == atype;
+      }
+
+      size_t getUsedBytes(AllocationType atype, bool huge_pages = false) const { // sum over the list starting at this block
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockUsedBytes();
+        }
+        return bytes;
+      }
+
+      size_t getFreeBytes(AllocationType atype, bool huge_pages = false) const { // sum over the list starting at this block
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockFreeBytes();
+        }
+        return bytes;
+      }
+
+      size_t getWastedBytes(AllocationType atype, bool huge_pages = false) const { // sum over the list starting at this block
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockWastedBytes();
+        }
+        return bytes;
+      }
+
+      size_t getAllocatedBytes(AllocationType atype, bool huge_pages = false) const { // sum over the list starting at this block
+        size_t bytes = 0;
+        for (const Block* block = this; block; block = block->next) {
+          if (!block->hasType(atype,huge_pages)) continue;
+          bytes += block->getBlockAllocatedBytes();
+        }
+        return bytes;
+      }
+
+      void print_list ()
+      {
+        for (const Block* block = this; block; block = block->next)
+          block->print_block();
+      }
+
+      void print_block() const
+      {
+        if (atype == ALIGNED_MALLOC) std::cout << "A";
+        else if (atype == EMBREE_OS_MALLOC) std::cout << "O";
+        else if (atype == SHARED) std::cout << "S";
+        if (huge_pages) std::cout << "H";
+        size_t bytesUsed = getBlockUsedBytes();
+        size_t bytesFree = getBlockFreeBytes();
+        size_t bytesWasted = getBlockWastedBytes();
+        std::cout << "[" << bytesUsed << ", " << bytesFree << ", " << bytesWasted << "] ";
+      }
+
+    public:
+      std::atomic<size_t> cur;        //!< current location of the allocator
+      std::atomic<size_t> allocEnd;   //!< end of the allocated memory region
+      std::atomic<size_t> reserveEnd; //!< end of the reserved memory region
+      Block* next;               //!< pointer to next block in list
+      size_t wasted;             //!< amount of memory wasted through block alignment
+      AllocationType atype;      //!< allocation mode of the block
+      bool huge_pages;           //!< whether the block uses huge pages
+      char align[maxAlignment-5*sizeof(size_t)-sizeof(AllocationType)-sizeof(bool)]; //!< align data to maxAlignment
+      char data[1];              //!< here starts memory to use for allocations
+    };
+
+  private:
+    Device* device; //!< device handed to the constructor; passed on to Block::create and clear_list
+    SpinLock mutex; //!< taken by malloc() and addBlock() while changing usedBlocks/freeBlocks
+    size_t slotMask; //!< malloc() picks its slot as threadID & slotMask
+    std::atomic<Block*> threadUsedBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; //!< per slot: block malloc() currently allocates from
+    std::atomic<Block*> usedBlocks; //!< head of the list of used blocks
+    std::atomic<Block*> freeBlocks; //!< head of the list of free blocks
+
+    std::atomic<Block*> threadBlocks[MAX_THREAD_USED_BLOCK_SLOTS]; //!< per slot: blocks created in malloc() under slotMutex; moved to usedBlocks by internal_fix_used_blocks()
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::mutex slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#else
+    SpinLock slotMutex[MAX_THREAD_USED_BLOCK_SLOTS];
+#endif
+
+    bool use_single_mode; //!< if set, CachedAllocator routes malloc1 through alloc0 as well
+    size_t defaultBlockSize; //!< thread local alloc block size, set in initGrowSizeAndNumSlots()
+    size_t estimatedSize; //!< estimate passed to init()/init_estimate()
+    size_t growSize; //!< base size of blocks created in malloc()
+    size_t maxGrowSize; //!< upper bound applied to the grow size in malloc()
+    std::atomic<size_t> log2_grow_size_scale; //!< log2 of scaling factor for grow size // FIXME: remove
+    std::atomic<size_t> bytesUsed;
+    std::atomic<size_t> bytesFree;
+    std::atomic<size_t> bytesWasted;
+    static __thread ThreadLocal2* thread_local_allocator2; //!< ThreadLocal2 of the current thread, created lazily by threadLocal2()
+    static SpinLock s_thread_local_allocators_lock; //!< taken by threadLocal2() around s_thread_local_allocators
+    static std::vector<std::unique_ptr<ThreadLocal2>> s_thread_local_allocators; //!< holds every ThreadLocal2 created by threadLocal2()
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::mutex thread_local_allocators_lock;
+#else
+    SpinLock thread_local_allocators_lock;
+#endif
+    std::vector<ThreadLocal2*> thread_local_allocators; //!< filled by join(); unbound and cleared by cleanup()/reset()
+    AllocationType atype; //!< allocation type passed to Block::create
+    mvector<PrimRef> primrefarray;     //!< primrefarray used to allocate nodes
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/buffer.h b/thirdparty/embree-aarch64/kernels/common/buffer.h
new file mode 100644
index 0000000000..02d319c59d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/buffer.h
@@ -0,0 +1,263 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+
+namespace embree
+{
+  /*! Implements an API data buffer object. This class may or may not own the data. */
+  class Buffer : public RefCount
+  {
+  public:
+    /*! creates an empty buffer: no device, no storage, zero bytes */
+    Buffer() 
+      : device(nullptr), ptr(nullptr), numBytes(0), shared(false) {}
+
+    /*! creates a buffer of numBytes_in bytes; wraps ptr_in when non-null (shared), otherwise allocates */
+    Buffer(Device* device, size_t numBytes_in, void* ptr_in = nullptr)
+      : device(device), numBytes(numBytes_in)
+    {
+      device->refInc();
+      
+      if (ptr_in)
+      {
+        shared = true;
+        ptr = (char*)ptr_in;
+      }
+      else
+      {
+        shared = false;
+        alloc();
+      }
+    }
+    
+    /*! releases owned storage and the device reference; safe for buffers that have no device */
+    ~Buffer() {
+      free();
+      if (device) device->refDec(); // device stays null after default construction
+    }
+    
+    /*! this class is not copyable */
+  private:
+    Buffer(const Buffer& other) DELETED; // do not implement
+    Buffer& operator =(const Buffer& other) DELETED; // do not implement
+    
+  public:
+    /*! re-initializes the buffer via init() and then allocates owned storage for numBytes_in bytes */
+    void create(Device* device_in, size_t numBytes_in)
+    {
+      init(device_in, numBytes_in);
+      alloc();
+    }
+    
+    /*! frees current storage, then records the new device and size without allocating */
+    void init(Device* device_in, size_t numBytes_in)
+    {
+      free();
+      device = device_in; // NOTE(review): no refInc() here, unlike the constructor - confirm callers balance the count
+      ptr = nullptr;
+      numBytes = numBytes_in;
+      shared = false;
+    }
+
+    /*! frees current storage, then points the buffer at application memory (marked shared) */
+    void set(Device* device_in, void* ptr_in, size_t numBytes_in)
+    {
+      free();
+      device = device_in;
+      ptr = (char*)ptr_in;
+      if (numBytes_in != (size_t)-1) // (size_t)-1 keeps the previously stored size
+        numBytes = numBytes_in;
+      shared = true;
+    }
+    
+    /*! reports bytes() to the device memory monitor (if any), then allocates 16-byte aligned storage */
+    void alloc()
+    {
+      if (device)
+        device->memoryMonitor(this->bytes(), false);
+      size_t b = (this->bytes()+15) & ssize_t(-16); // round the size up to a multiple of 16
+      ptr = (char*)alignedMalloc(b,16);
+    }
+    
+    /*! frees owned storage and reports the release to the device; does nothing for shared buffers */
+    void free()
+    {
+      if (shared) return;
+      alignedFree(ptr); 
+      if (device)
+        device->memoryMonitor(-ssize_t(this->bytes()), true);
+      ptr = nullptr;
+    }
+    
+    /*! gets buffer pointer; throws RTC_ERROR_INVALID_ARGUMENT when no device is set */
+    void* data()
+    {
+      /* report error if buffer is not existing */
+      if (!device)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer specified");
+      
+      /* return buffer */
+      return ptr;
+    }
+
+    /*! returns pointer to first element (no device check, unlike data()) */
+    __forceinline char* getPtr() const {
+      return ptr;
+    }
+
+    /*! returns the number of bytes of the buffer */
+    __forceinline size_t bytes() const { 
+      return numBytes;
+    }
+    
+    /*! returns true if the buffer pointer is non-null */
+    __forceinline operator bool() const { 
+      return ptr; 
+    }
+
+  public:
+    Device* device;  //!< device to report memory usage to
+    char* ptr;       //!< pointer to buffer data
+    size_t numBytes; //!< number of bytes in the buffer
+    bool shared;     //!< set if memory is shared with application
+  };
+
+  /*! An untyped contiguous range of a buffer. This class does not own the buffer content. */
+  class RawBufferView
+  {
+  public:
+    /*! creates an empty view: null pointer, zero stride and count, undefined format, marked modified */
+    RawBufferView()
+      : ptr_ofs(nullptr), stride(0), num(0), format(RTC_FORMAT_UNDEFINED), modCounter(1), modified(true), userData(0) {}
+
+  public:
+    /*! binds the view to buffer_in starting at byte offset_in; bumps modCounter and sets the modified flag */
+    void set(const Ref<Buffer>& buffer_in, size_t offset_in, size_t stride_in, size_t num_in, RTCFormat format_in)
+    {
+      if ((offset_in + stride_in * num_in) > (stride_in * buffer_in->numBytes)) // NOTE(review): limit is stride_in*numBytes, not numBytes - confirm intended
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "buffer range out of bounds");
+
+      ptr_ofs = buffer_in->ptr + offset_in;
+      stride = stride_in;
+      num = num_in;
+      format = format_in;
+      modCounter++;
+      modified = true;
+      buffer = buffer_in; // keeps a reference to the parent buffer
+    }
+
+    /*! returns pointer to the first element */
+    __forceinline char* getPtr() const {
+      return ptr_ofs;
+    }
+
+    /*! returns pointer to the i'th element (i is only checked by assert) */
+    __forceinline char* getPtr(size_t i) const
+    {
+      assert(i<num);
+      return ptr_ofs + i*stride;
+    }
+
+    /*! returns the number of elements of the buffer */
+    __forceinline size_t size() const { 
+      return num; 
+    }
+
+    /*! returns num*stride, the number of bytes spanned by the view */
+    __forceinline size_t bytes() const { 
+      return num*stride; 
+    }
+    
+    /*! returns the buffer stride narrowed to unsigned (asserted to fit) */
+    __forceinline unsigned getStride() const
+    {
+      assert(stride <= unsigned(inf));
+      return unsigned(stride);
+    }
+
+    /*! return the buffer format */
+    __forceinline RTCFormat getFormat() const {
+      return format;
+    }
+
+    /*! marks the view as modified: bumps modCounter and sets the local modified flag */
+    __forceinline void setModified() {
+      modCounter++;
+      modified = true;
+    }
+
+    /*! returns true if this view's modCounter is greater than otherModCounter */
+    __forceinline bool isModified(unsigned int otherModCounter) const {
+      return modCounter > otherModCounter;
+    }
+
+    /*! returns the local modified flag */
+    __forceinline bool isLocalModified() const {
+      return modified;
+    }
+
+    /*! clear local modified flag */
+    __forceinline void clearLocalModified() {
+      modified = false;
+    }
+
+    /*! returns true if the view points at data (ptr_ofs is non-null) */
+    __forceinline operator bool() const { 
+      return ptr_ofs; 
+    }
+
+    /*! reads the int at byte offset 12 of the last element; presumably faults early if 16-byte padding is missing */
+    __forceinline void checkPadding16() const
+    {
+      if (ptr_ofs && num)
+        volatile int MAYBE_UNUSED w = *((int*)getPtr(size()-1)+3); // FIXME: is failing hard avoidable?
+    }
+
+  public:
+    char* ptr_ofs;      //!< base pointer plus offset
+    size_t stride;      //!< stride of the buffer in bytes
+    size_t num;         //!< number of elements in the buffer
+    RTCFormat format;   //!< format of the buffer
+    unsigned int modCounter; //!< version ID of this buffer
+    bool modified;      //!< local modified data
+    int userData;       //!< special data
+    Ref<Buffer> buffer; //!< reference to the parent buffer
+  };
+
+  /*! Typed view onto a strided buffer range; element i lives i*stride bytes past ptr_ofs. Does not own the content. */
+  template<typename T>
+  class BufferView : public RawBufferView
+  {
+  public:
+    typedef T value_type;
+
+    /*! reference to the i'th element; the index is assert-checked inside getPtr(i) */
+    __forceinline       T& operator [](size_t i)       { return *(T*)getPtr(i); }
+    __forceinline const T& operator [](size_t i) const { return *(T*)getPtr(i); }
+  };
+
+  template<>
+  class BufferView<Vec3fa> : public RawBufferView
+  {
+  public:
+    typedef Vec3fa value_type;
+
+    /*! returns the i'th element by value, read with an unaligned 4-float load */
+    __forceinline const Vec3fa operator [](size_t i) const
+    {
+      float* const src = (float*)getPtr(i); // getPtr(i) asserts i<num
+      return Vec3fa(vfloat4::loadu(src));
+    }
+    
+    /*! stores v into the i'th element with an unaligned 4-float write */
+    __forceinline void store(size_t i, const Vec3fa& v)
+    {
+      float* const dst = (float*)getPtr(i); // getPtr(i) asserts i<num
+      vfloat4::storeu(dst, (vfloat4)v);
+    }
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/builder.h b/thirdparty/embree-aarch64/kernels/common/builder.h
new file mode 100644
index 0000000000..d2a1cfe3ce
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/builder.h
@@ -0,0 +1,60 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "accel.h"
+
+namespace embree
+{
+#define MODE_HIGH_QUALITY (1<<8)
+
+  /*! abstract, reference-counted interface implemented by all hierarchy builders */
+  class Builder : public RefCount {
+  public:
+
+    static const size_t DEFAULT_SINGLE_THREAD_THRESHOLD = 1024;
+
+    /*! initiates the hierarchy builder; must be implemented by every builder */
+    virtual void build() = 0;
+
+    /*! notifies the builder about the deletion of some geometry; the default implementation does nothing */
+    virtual void deleteGeometry(size_t geomID) {};
+
+    /*! clears internal builder state; must be implemented by every builder */
+    virtual void clear() = 0;
+  };
+
+  /*! abstract callback interface for progress monitoring; implementations receive a size_t amount dn */
+  struct BuildProgressMonitor {
+    virtual void operator() (size_t dn) const = 0; // NOTE(review): no virtual destructor - confirm objects are never deleted through this base
+  };
+
+  /*! adapts any callable taking a size_t to the BuildProgressMonitor interface */
+  template<typename Closure>
+    struct ProgressMonitorClosure : BuildProgressMonitor
+  {
+  public:
+    ProgressMonitorClosure (const Closure& closure) : closure(closure) {} // stores a copy of the callable
+    void operator() (size_t dn) const { closure(dn); } // forwards dn to the stored callable
+  private:
+    const Closure closure;
+  };
+  template<typename Closure> __forceinline const ProgressMonitorClosure<Closure> BuildProgressMonitorFromClosure(const Closure& closure) { // factory that deduces Closure from its argument
+    return ProgressMonitorClosure<Closure>(closure);
+  }
+
+  struct LineSegments;
+  struct TriangleMesh;
+  struct QuadMesh;
+  struct UserGeometry;
+
+  class Scene;
+
+  typedef void (*createLineSegmentsAccelTy)(Scene* scene, LineSegments* mesh, AccelData*& accel, Builder*& builder);
+  typedef void (*createTriangleMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+  typedef void (*createQuadMeshAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+  typedef void (*createUserGeometryAccelTy)(Scene* scene, unsigned int geomID, AccelData*& accel, Builder*& builder);
+
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/context.h b/thirdparty/embree-aarch64/kernels/common/context.h
new file mode 100644
index 0000000000..d0185a74f2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/context.h
@@ -0,0 +1,131 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "rtcore.h"
+#include "point_query.h"
+
+namespace embree
+{
+  class Scene;
+
+  struct IntersectContext // pairs a Scene pointer with the user-supplied RTCIntersectContext
+  {
+  public:
+    __forceinline IntersectContext(Scene* scene, RTCIntersectContext* user_context) // stores both pointers; neither is checked for null
+      : scene(scene), user(user_context) {}
+
+    __forceinline bool hasContextFilter() const { // true when the user context carries a filter callback
+      return user->filter != nullptr;
+    }
+
+    __forceinline bool isCoherent() const { // forwards the user context flags to embree::isCoherent
+      return embree::isCoherent(user->flags);
+    }
+
+    __forceinline bool isIncoherent() const { // forwards the user context flags to embree::isIncoherent
+      return embree::isIncoherent(user->flags);
+    }
+    
+  public:
+    Scene* scene;              // scene passed to the constructor
+    RTCIntersectContext* user; // user context passed to the constructor; dereferenced by all queries above
+  };
+
+  template<int M, typename Geometry>
+      __forceinline Vec4vf<M> enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3vf<M>& ray_org, const Vec4vf<M>& v)
+    {
+#if !RTC_MIN_WIDTH
+      return v; // min-width feature compiled out: radius is passed through unchanged
+#else
+      const vfloat<M> dist = length(Vec3vf<M>(v) - ray_org); // distance from ray origin to the point
+      const vfloat<M> radius = clamp(context->user->minWidthDistanceFactor*dist, v.w, geom->maxRadiusScale*v.w);
+      return Vec4vf<M>(v.x,v.y,v.z,radius);
+#endif
+    }
+
+    template<typename Geometry>
+    __forceinline Vec3ff enlargeRadiusToMinWidth(const IntersectContext* context, const Geometry* geom, const Vec3fa& ray_org, const Vec3ff& v)
+  {
+#if !RTC_MIN_WIDTH
+    return v; // min-width feature compiled out: radius is passed through unchanged
+#else
+    const float dist = length(Vec3fa(v) - ray_org); // distance from ray origin to the point
+    const float radius = clamp(context->user->minWidthDistanceFactor*dist, v.w, geom->maxRadiusScale*v.w);
+    return Vec3ff(v.x,v.y,v.z,radius);
+#endif
+  }
+  
+  enum PointQueryType // selects how PointQueryContext treats the query region
+  {
+    POINT_QUERY_TYPE_UNDEFINED = 0,
+    POINT_QUERY_TYPE_SPHERE = 1,
+    POINT_QUERY_TYPE_AABB = 2,  // PointQueryContext calls updateAABB() on construction for this type
+  };
+
+  typedef bool (*PointQueryFunction)(struct RTCPointQueryFunctionArguments* args);
+  
+  struct PointQueryContext // per-query state: scene, world-space query, callback, user context and derived radius
+  {
+  public:
+    __forceinline PointQueryContext(Scene* scene, 
+                                    PointQuery* query_ws, 
+                                    PointQueryType query_type,
+                                    PointQueryFunction func, 
+                                    RTCPointQueryContext* userContext,
+                                    float similarityScale,
+                                    void* userPtr)
+      : scene(scene)
+      , query_ws(query_ws)
+      , query_type(query_type)
+      , func(func)
+      , userContext(userContext)
+      , similarityScale(similarityScale)
+      , userPtr(userPtr) 
+      , primID(RTC_INVALID_GEOMETRY_ID)
+      , geomID(RTC_INVALID_GEOMETRY_ID)
+      , query_radius(query_ws->radius) // starts as the world-space radius; query_ws must be non-null
+    { 
+      if (query_type == POINT_QUERY_TYPE_AABB) {
+        assert(similarityScale == 0.f); // AABB queries are expected to pass a zero similarity scale
+        updateAABB();
+      }
+      if (userContext->instStackSize == 0) {
+        assert(similarityScale == 1.f); // NOTE(review): an AABB query with an empty instance stack cannot satisfy both asserts - confirm
+      }
+    }
+
+  public:
+    __forceinline void updateAABB() // recomputes query_radius from query_ws->radius and the top world2inst transform
+    {
+      if (likely(query_ws->radius == (float)inf || userContext->instStackSize == 0)) {
+        query_radius = Vec3fa(query_ws->radius); // infinite radius or no instancing: use the radius on all axes
+        return;
+      }
+
+      const AffineSpace3fa m = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]);
+      BBox3fa bbox(Vec3fa(-query_ws->radius), Vec3fa(query_ws->radius));
+      bbox = xfmBounds(m, bbox);
+      query_radius = 0.5f * (bbox.upper - bbox.lower); // half extents of the transformed box
+    }
+
+public:
+    Scene* scene;
+
+    PointQuery* query_ws; // the original world space point query 
+    PointQueryType query_type;
+    PointQueryFunction func;
+    RTCPointQueryContext* userContext;
+    const float similarityScale;
+
+    void* userPtr;
+
+    unsigned int primID; // initialized to RTC_INVALID_GEOMETRY_ID
+    unsigned int geomID; // initialized to RTC_INVALID_GEOMETRY_ID
+
+    Vec3fa query_radius;  // used if the query is converted to an AABB internally
+  };
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/common/default.h b/thirdparty/embree-aarch64/kernels/common/default.h
new file mode 100644
index 0000000000..709119163b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/default.h
@@ -0,0 +1,273 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/sys/platform.h"
+#include "../../common/sys/sysinfo.h"
+#include "../../common/sys/thread.h"
+#include "../../common/sys/alloc.h"
+#include "../../common/sys/ref.h"
+#include "../../common/sys/intrinsics.h"
+#include "../../common/sys/atomic.h"
+#include "../../common/sys/mutex.h"
+#include "../../common/sys/vector.h"
+#include "../../common/sys/array.h"
+#include "../../common/sys/string.h"
+#include "../../common/sys/regression.h"
+#include "../../common/sys/vector.h"
+
+#include "../../common/math/math.h"
+#include "../../common/math/transcendental.h"
+#include "../../common/simd/simd.h"
+#include "../../common/math/vec2.h"
+#include "../../common/math/vec3.h"
+#include "../../common/math/vec4.h"
+#include "../../common/math/vec2fa.h"
+#include "../../common/math/vec3fa.h"
+#include "../../common/math/interval.h"
+#include "../../common/math/bbox.h"
+#include "../../common/math/obbox.h"
+#include "../../common/math/lbbox.h"
+#include "../../common/math/linearspace2.h"
+#include "../../common/math/linearspace3.h"
+#include "../../common/math/affinespace.h"
+#include "../../common/math/range.h"
+#include "../../common/lexers/tokenstream.h"
+
+#include "../../common/tasking/taskscheduler.h"
+
+#define COMMA ,
+
+#include "../config.h"
+#include "isa.h"
+#include "stat.h"
+#include "profile.h"
+#include "rtcore.h"
+#include "vector.h"
+#include "state.h"
+#include "instance_stack.h"
+
+#include <vector>
+#include <map>
+#include <algorithm>
+#include <functional>
+#include <utility>
+#include <sstream>
+
+#if !defined(_DEBUG) && defined(BUILD_IOS)
+#undef assert
+#define assert(_EXPR)
+#endif
+
+namespace embree
+{
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Vec2 shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using Vec2vf  = Vec2<vfloat<N>>;
+  template<int N> using Vec2vd  = Vec2<vdouble<N>>;
+  template<int N> using Vec2vr  = Vec2<vreal<N>>;
+  template<int N> using Vec2vi  = Vec2<vint<N>>;
+  template<int N> using Vec2vl  = Vec2<vllong<N>>;
+  template<int N> using Vec2vb  = Vec2<vbool<N>>;
+  template<int N> using Vec2vbf = Vec2<vboolf<N>>;
+  template<int N> using Vec2vbd = Vec2<vboold<N>>;
+
+  typedef Vec2<vfloat4>  Vec2vf4;
+  typedef Vec2<vdouble4> Vec2vd4;
+  typedef Vec2<vreal4>   Vec2vr4;
+  typedef Vec2<vint4>    Vec2vi4;
+  typedef Vec2<vllong4>  Vec2vl4;
+  typedef Vec2<vbool4>   Vec2vb4;
+  typedef Vec2<vboolf4>  Vec2vbf4;
+  typedef Vec2<vboold4>  Vec2vbd4;
+
+  typedef Vec2<vfloat8>  Vec2vf8;
+  typedef Vec2<vdouble8> Vec2vd8;
+  typedef Vec2<vreal8>   Vec2vr8;
+  typedef Vec2<vint8>    Vec2vi8;
+  typedef Vec2<vllong8>  Vec2vl8;
+  typedef Vec2<vbool8>   Vec2vb8;
+  typedef Vec2<vboolf8>  Vec2vbf8;
+  typedef Vec2<vboold8>  Vec2vbd8;
+
+  typedef Vec2<vfloat16>  Vec2vf16;
+  typedef Vec2<vdouble16> Vec2vd16;
+  typedef Vec2<vreal16>   Vec2vr16;
+  typedef Vec2<vint16>    Vec2vi16;
+  typedef Vec2<vllong16>  Vec2vl16;
+  typedef Vec2<vbool16>   Vec2vb16;
+  typedef Vec2<vboolf16>  Vec2vbf16;
+  typedef Vec2<vboold16>  Vec2vbd16;
+
+  typedef Vec2<vfloatx>  Vec2vfx;
+  typedef Vec2<vdoublex> Vec2vdx;
+  typedef Vec2<vrealx>   Vec2vrx;
+  typedef Vec2<vintx>    Vec2vix;
+  typedef Vec2<vllongx>  Vec2vlx;
+  typedef Vec2<vboolx>   Vec2vbx;
+  typedef Vec2<vboolfx>  Vec2vbfx;
+  typedef Vec2<vbooldx>  Vec2vbdx;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Vec3 shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using Vec3vf  = Vec3<vfloat<N>>;
+  template<int N> using Vec3vd  = Vec3<vdouble<N>>;
+  template<int N> using Vec3vr  = Vec3<vreal<N>>;
+  template<int N> using Vec3vi  = Vec3<vint<N>>;
+  template<int N> using Vec3vl  = Vec3<vllong<N>>;
+  template<int N> using Vec3vb  = Vec3<vbool<N>>;
+  template<int N> using Vec3vbf = Vec3<vboolf<N>>;
+  template<int N> using Vec3vbd = Vec3<vboold<N>>;
+
+  typedef Vec3<vfloat4>  Vec3vf4;
+  typedef Vec3<vdouble4> Vec3vd4;
+  typedef Vec3<vreal4>   Vec3vr4;
+  typedef Vec3<vint4>    Vec3vi4;
+  typedef Vec3<vllong4>  Vec3vl4;
+  typedef Vec3<vbool4>   Vec3vb4;
+  typedef Vec3<vboolf4>  Vec3vbf4;
+  typedef Vec3<vboold4>  Vec3vbd4;
+
+  typedef Vec3<vfloat8>  Vec3vf8;
+  typedef Vec3<vdouble8> Vec3vd8;
+  typedef Vec3<vreal8>   Vec3vr8;
+  typedef Vec3<vint8>    Vec3vi8;
+  typedef Vec3<vllong8>  Vec3vl8;
+  typedef Vec3<vbool8>   Vec3vb8;
+  typedef Vec3<vboolf8>  Vec3vbf8;
+  typedef Vec3<vboold8>  Vec3vbd8;
+
+  typedef Vec3<vfloat16>  Vec3vf16;
+  typedef Vec3<vdouble16> Vec3vd16;
+  typedef Vec3<vreal16>   Vec3vr16;
+  typedef Vec3<vint16>    Vec3vi16;
+  typedef Vec3<vllong16>  Vec3vl16;
+  typedef Vec3<vbool16>   Vec3vb16;
+  typedef Vec3<vboolf16>  Vec3vbf16;
+  typedef Vec3<vboold16>  Vec3vbd16;
+
+  typedef Vec3<vfloatx>  Vec3vfx;
+  typedef Vec3<vdoublex> Vec3vdx;
+  typedef Vec3<vrealx>   Vec3vrx;
+  typedef Vec3<vintx>    Vec3vix;
+  typedef Vec3<vllongx>  Vec3vlx;
+  typedef Vec3<vboolx>   Vec3vbx;
+  typedef Vec3<vboolfx>  Vec3vbfx;
+  typedef Vec3<vbooldx>  Vec3vbdx;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Vec4 shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using Vec4vf  = Vec4<vfloat<N>>;
+  template<int N> using Vec4vd  = Vec4<vdouble<N>>;
+  template<int N> using Vec4vr  = Vec4<vreal<N>>;
+  template<int N> using Vec4vi  = Vec4<vint<N>>;
+  template<int N> using Vec4vl  = Vec4<vllong<N>>;
+  template<int N> using Vec4vb  = Vec4<vbool<N>>;
+  template<int N> using Vec4vbf = Vec4<vboolf<N>>;
+  template<int N> using Vec4vbd = Vec4<vboold<N>>;
+
+  typedef Vec4<vfloat4>  Vec4vf4;
+  typedef Vec4<vdouble4> Vec4vd4;
+  typedef Vec4<vreal4>   Vec4vr4;
+  typedef Vec4<vint4>    Vec4vi4;
+  typedef Vec4<vllong4>  Vec4vl4;
+  typedef Vec4<vbool4>   Vec4vb4;
+  typedef Vec4<vboolf4>  Vec4vbf4;
+  typedef Vec4<vboold4>  Vec4vbd4;
+
+  typedef Vec4<vfloat8>  Vec4vf8;
+  typedef Vec4<vdouble8> Vec4vd8;
+  typedef Vec4<vreal8>   Vec4vr8;
+  typedef Vec4<vint8>    Vec4vi8;
+  typedef Vec4<vllong8>  Vec4vl8;
+  typedef Vec4<vbool8>   Vec4vb8;
+  typedef Vec4<vboolf8>  Vec4vbf8;
+  typedef Vec4<vboold8>  Vec4vbd8;
+
+  typedef Vec4<vfloat16>  Vec4vf16;
+  typedef Vec4<vdouble16> Vec4vd16;
+  typedef Vec4<vreal16>   Vec4vr16;
+  typedef Vec4<vint16>    Vec4vi16;
+  typedef Vec4<vllong16>  Vec4vl16;
+  typedef Vec4<vbool16>   Vec4vb16;
+  typedef Vec4<vboolf16>  Vec4vbf16;
+  typedef Vec4<vboold16>  Vec4vbd16;
+
+  typedef Vec4<vfloatx>  Vec4vfx;
+  typedef Vec4<vdoublex> Vec4vdx;
+  typedef Vec4<vrealx>   Vec4vrx;
+  typedef Vec4<vintx>    Vec4vix;
+  typedef Vec4<vllongx>  Vec4vlx;
+  typedef Vec4<vboolx>   Vec4vbx;
+  typedef Vec4<vboolfx>  Vec4vbfx;
+  typedef Vec4<vbooldx>  Vec4vbdx;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// Other shortcuts
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<int N> using BBox3vf = BBox<Vec3vf<N>>;
+  typedef BBox<Vec3vf4>  BBox3vf4;
+  typedef BBox<Vec3vf8>  BBox3vf8;
+  typedef BBox<Vec3vf16> BBox3vf16;
+
+  /* returns the segment index floor(time*numTimeSegments) clamped to [0,numTimeSegments-1]; ftime receives the remainder */
+  __forceinline int getTimeSegment(float time, float numTimeSegments, float& ftime)
+  {
+    const float scaled = time * numTimeSegments;
+    const float segment = clamp(floorf(scaled), 0.0f, numTimeSegments-1.0f);
+    ftime = scaled - segment;
+    return static_cast<int>(segment);
+  }
+
+  __forceinline int getTimeSegment(float time, float start_time, float end_time, float numTimeSegments, float& ftime)
+  {
+    const float scaled = (time-start_time)/(end_time-start_time) * numTimeSegments; // time is first normalized to the [start_time,end_time] span
+    const float segment = clamp(floorf(scaled), 0.0f, numTimeSegments-1.0f);
+    ftime = scaled - segment;
+    return static_cast<int>(segment);
+  }
+
+  template<int N>
+  __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime)
+  {
+    const vfloat<N> scaled = time * numTimeSegments; // per-lane variant of the scalar overload
+    const vfloat<N> segment = clamp(floor(scaled), vfloat<N>(zero), numTimeSegments-1.0f);
+    ftime = scaled - segment;
+    return vint<N>(segment);
+  }
+
+  template<int N>
+    __forceinline vint<N> getTimeSegment(const vfloat<N>& time, const vfloat<N>& start_time, const vfloat<N>& end_time, const vfloat<N>& numTimeSegments, vfloat<N>& ftime)
+  {
+    const vfloat<N> scaled = (time-start_time)/(end_time-start_time) * numTimeSegments; // per-lane, normalized to the time span
+    const vfloat<N> segment = clamp(floor(scaled), vfloat<N>(zero), numTimeSegments-1.0f);
+    ftime = scaled - segment;
+    return vint<N>(segment);
+  }
+
+  /* returns the range of segment indices overlapped by time_range, clamped to [0,numTimeSegments] */
+  __forceinline range<int> getTimeSegmentRange(const BBox1f& time_range, float numTimeSegments)
+  {
+    const float nudge_up   = 1.0f+2.0f*float(ulp); // corrects inaccuracies to precisely match time step
+    const float nudge_down = 1.0f-2.0f*float(ulp);
+    const int first = int(max(floor(nudge_up  *time_range.lower*numTimeSegments), 0.0f));
+    const int last  = int(min(ceil (nudge_down*time_range.upper*numTimeSegments), numTimeSegments));
+    return make_range(first, last);
+  }
+
+  /* same as above, after expressing range relative to time_range (offset by its lower bound, divided by its size) */
+  __forceinline range<int> getTimeSegmentRange(const BBox1f& range, BBox1f time_range, float numTimeSegments)
+  {
+    const float extent = time_range.size();
+    const BBox1f normalized((range.lower-time_range.lower)/extent, (range.upper-time_range.lower)/extent);
+    return getTimeSegmentRange(normalized,numTimeSegments);
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/device.cpp b/thirdparty/embree-aarch64/kernels/common/device.cpp
new file mode 100644
index 0000000000..16ec11b892
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/device.cpp
@@ -0,0 +1,567 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "device.h"
+#include "../hash.h"
+#include "scene_triangle_mesh.h"
+#include "scene_user_geometry.h"
+#include "scene_instance.h"
+#include "scene_curves.h"
+#include "scene_subdiv_mesh.h"
+
+#include "../subdiv/tessellation_cache.h"
+
+#include "acceln.h"
+#include "geometry.h"
+
+#include "../geometry/cylinder.h"
+
+#include "../bvh/bvh4_factory.h"
+#include "../bvh/bvh8_factory.h"
+
+#include "../../common/tasking/taskscheduler.h"
+#include "../../common/sys/alloc.h"
+
+namespace embree
+{
+  /*! some global variables that can be set via rtcSetParameter1i for debugging purposes */
+  ssize_t Device::debug_int0 = 0;
+  ssize_t Device::debug_int1 = 0;
+  ssize_t Device::debug_int2 = 0;
+  ssize_t Device::debug_int3 = 0;
+
+  DECLARE_SYMBOL2(RayStreamFilterFuncs,rayStreamFilterFuncs);
+
+  static MutexSys g_mutex;
+  static std::map<Device*,size_t> g_cache_size_map;
+  static std::map<Device*,size_t> g_num_threads_map;
+
+  Device::Device (const char* cfg) // cfg: configuration string handed to State::parseString
+  {
+    /* check that CPU supports lowest ISA */
+    if (!hasISA(ISA)) {
+      throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support " ISA_STR);
+    }
+
+    /* set default frequency level for detected CPU */
+    switch (getCPUModel()) { // NOTE(review): no default case - models not listed here leave frequency_level untouched
+    case CPU::UNKNOWN:         frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::XEON_ICE_LAKE:   frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_ICE_LAKE:   frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_TIGER_LAKE: frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_COMET_LAKE: frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_CANNON_LAKE:frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_KABY_LAKE:  frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::XEON_SKY_LAKE:   frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE_SKY_LAKE:   frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::XEON_BROADWELL:  frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_BROADWELL:  frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::XEON_HASWELL:    frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_HASWELL:    frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::XEON_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::CORE_IVY_BRIDGE: frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::SANDY_BRIDGE:    frequency_level = FREQUENCY_SIMD256; break;
+    case CPU::NEHALEM:         frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE2:           frequency_level = FREQUENCY_SIMD128; break;
+    case CPU::CORE1:           frequency_level = FREQUENCY_SIMD128; break;
+    }
+
+    /* initialize global state: compile-time config first, then cfg, then per-folder config files */
+#if defined(EMBREE_CONFIG)
+    State::parseString(EMBREE_CONFIG);
+#endif
+    State::parseString(cfg);
+    if (!ignore_config_files && FileName::executableFolder() != FileName(""))
+      State::parseFile(FileName::executableFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR)));
+    if (!ignore_config_files && FileName::homeFolder() != FileName(""))
+      State::parseFile(FileName::homeFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR)));
+    State::verify();
+
+    /* check whether selected ISA is supported by the HW, as the user could have forced an unsupported ISA */    
+    if (!checkISASupport()) {
+      throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support selected ISA");
+    }    
+    
+    /*! do some internal tests (only evaluated when assert is active) */
+    assert(isa::Cylinder::verify());
+
+    /*! enable huge page support if desired */
+#if defined(__WIN32__)
+    if (State::enable_selockmemoryprivilege)
+      State::hugepages_success &= win_enable_selockmemoryprivilege(State::verbosity(3));
+#endif
+    State::hugepages_success &= os_init(State::hugepages,State::verbosity(3));
+    
+    /*! set tessellation cache size */
+    setCacheSize( State::tessellation_cache_size );
+
+    /*! enable some floating point exceptions to catch bugs */
+    if (State::float_exceptions)
+    {
+      int exceptions = _MM_MASK_MASK; // start with every exception masked, then unmask selected ones
+      //exceptions &= ~_MM_MASK_INVALID;
+      exceptions &= ~_MM_MASK_DENORM;
+      exceptions &= ~_MM_MASK_DIV_ZERO;
+      //exceptions &= ~_MM_MASK_OVERFLOW;
+      //exceptions &= ~_MM_MASK_UNDERFLOW;
+      //exceptions &= ~_MM_MASK_INEXACT;
+      _MM_SET_EXCEPTION_MASK(exceptions);
+    }
+    
+    /* print info header */
+    if (State::verbosity(1))
+      print();
+    if (State::verbosity(2)) 
+      State::print();
+
+    /* register all algorithms */
+    bvh4_factory = make_unique(new BVH4Factory(enabled_builder_cpu_features, enabled_cpu_features));
+
+#if defined(EMBREE_TARGET_SIMD8)
+    bvh8_factory = make_unique(new BVH8Factory(enabled_builder_cpu_features, enabled_cpu_features));
+#endif
+
+    /* setup tasking system */
+    initTaskingSystem(numThreads);
+
+    /* ray stream SOA to AOS conversion */
+#if defined(EMBREE_RAY_PACKETS)
+    RayStreamFilterFuncsType rayStreamFilterFuncs;
+    SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(enabled_cpu_features,rayStreamFilterFuncs);
+    rayStreamFilters = rayStreamFilterFuncs();
+#endif
+  }
+
+  Device::~Device () // undoes the constructor's cache-size and tasking-system setup
+  {
+    setCacheSize(0); // the constructor set this from State::tessellation_cache_size
+    exitTaskingSystem(); // counterpart of initTaskingSystem() in the constructor
+  }
+
+  std::string getEnabledTargets() // space-terminated names of the ISA targets enabled at compile time
+  {
+    std::string targets;
+#ifdef EMBREE_TARGET_SSE2
+    targets.append("SSE2 ");
+#endif
+#ifdef EMBREE_TARGET_SSE42
+    targets.append("SSE4.2 ");
+#endif
+#ifdef EMBREE_TARGET_AVX
+    targets.append("AVX ");
+#endif
+#ifdef EMBREE_TARGET_AVX2
+    targets.append("AVX2 ");
+#endif
+#ifdef EMBREE_TARGET_AVX512KNL
+    targets.append("AVX512KNL ");
+#endif
+#ifdef EMBREE_TARGET_AVX512SKX
+    targets.append("AVX512SKX ");
+#endif
+    return targets;
+  }
+
+  std::string getEmbreeFeatures() // space-terminated names of the optional features enabled at compile time
+  {
+    std::string features;
+#ifdef EMBREE_RAY_MASK
+    features.append("raymasks ");
+#endif
+#ifdef EMBREE_BACKFACE_CULLING
+    features.append("backfaceculling ");
+#endif
+#ifdef EMBREE_BACKFACE_CULLING_CURVES
+    features.append("backfacecullingcurves ");
+#endif
+#ifdef EMBREE_FILTER_FUNCTION
+    features.append("intersection_filter ");
+#endif
+#ifdef EMBREE_COMPACT_POLYS
+    features.append("compact_polys ");
+#endif
+    return features;
+  }
+
+  void Device::print() // writes version, build, CPU and configuration details to std::cout
+  {
+    const int cpu_features = getCPUFeatures();
+    std::cout << std::endl;
+    std::cout << "Embree Ray Tracing Kernels " << RTC_VERSION_STRING << " (" << RTC_HASH << ")" << std::endl;
+    std::cout << "  Compiler  : " << getCompilerName() << std::endl;
+    std::cout << "  Build     : ";
+#if defined(DEBUG) // NOTE(review): tests DEBUG here but _DEBUG further down - confirm which macro the build defines
+    std::cout << "Debug " << std::endl;
+#else
+    std::cout << "Release " << std::endl;
+#endif
+    std::cout << "  Platform  : " << getPlatformName() << std::endl;
+    std::cout << "  CPU       : " << stringOfCPUModel(getCPUModel()) << " (" << getCPUVendor() << ")" << std::endl;
+    std::cout << "   Threads  : " << getNumberOfLogicalThreads() << std::endl;
+    std::cout << "   ISA      : " << stringOfCPUFeatures(cpu_features) << std::endl;
+    std::cout << "   Targets  : " << supportedTargetList(cpu_features) << std::endl;
+    const bool hasFTZ = _mm_getcsr() & _MM_FLUSH_ZERO_ON; // flush-to-zero bit of the calling thread's MXCSR
+    const bool hasDAZ = _mm_getcsr() & _MM_DENORMALS_ZERO_ON; // denormals-are-zero bit of the calling thread's MXCSR
+    std::cout << "   MXCSR    : " << "FTZ=" << hasFTZ << ", DAZ=" << hasDAZ << std::endl;
+    std::cout << "  Config" << std::endl;
+    std::cout << "    Threads : " << (numThreads ? toString(numThreads) : std::string("default")) << std::endl;
+    std::cout << "    ISA     : " << stringOfCPUFeatures(enabled_cpu_features) << std::endl;
+    std::cout << "    Targets : " << supportedTargetList(enabled_cpu_features) << " (supported)" << std::endl;
+    std::cout << "              " << getEnabledTargets() << " (compile time enabled)" << std::endl;
+    std::cout << "    Features: " << getEmbreeFeatures() << std::endl;
+    std::cout << "    Tasking : ";
+#if defined(TASKING_TBB)
+    std::cout << "TBB" << TBB_VERSION_MAJOR << "." << TBB_VERSION_MINOR << " ";
+  #if TBB_INTERFACE_VERSION >= 12002
+    std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << TBB_runtime_interface_version() << " ";
+  #else
+    std::cout << "TBB_header_interface_" << TBB_INTERFACE_VERSION << " TBB_lib_interface_" << tbb::TBB_runtime_interface_version() << " ";
+  #endif
+#endif
+#if defined(TASKING_INTERNAL)
+    std::cout << "internal_tasking_system ";
+#endif
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+    std::cout << "GCD tasking system ";
+#endif
+#if defined(TASKING_PPL)
+	std::cout << "PPL ";
+#endif
+    std::cout << std::endl;
+
+    /* check if the FTZ and DAZ flags are set in the MXCSR; warn when either is missing */
+    if (!hasFTZ || !hasDAZ) 
+    {
+#if !defined(_DEBUG) // when _DEBUG is defined the warning is printed regardless of verbosity
+      if (State::verbosity(1)) 
+#endif
+      {
+        std::cout << std::endl;
+        std::cout << "================================================================================" << std::endl;
+        std::cout << "  WARNING: \"Flush to Zero\" or \"Denormals are Zero\" mode not enabled "         << std::endl 
+                  << "           in the MXCSR control and status register. This can have a severe "     << std::endl
+                  << "           performance impact. Please enable these modes for each application "   << std::endl
+                  << "           thread the following way:" << std::endl
+                  << std::endl 
+                  << "           #include \"xmmintrin.h\"" << std::endl 
+                  << "           #include \"pmmintrin.h\"" << std::endl 
+                  << std::endl 
+                  << "           _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);" << std::endl 
+                  << "           _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);" << std::endl;
+        std::cout << "================================================================================" << std::endl;
+        std::cout << std::endl;
+      }
+    }
+    std::cout << std::endl;
+  }
+
+  void Device::setDeviceErrorCode(RTCError error)
+  { // keeps the first error: a code that is still pending is never overwritten
+    RTCError& pending = *errorHandler.error();
+    if (pending != RTC_ERROR_NONE) return;
+    pending = error;
+  }
+
+  RTCError Device::getDeviceErrorCode()
+  { // read-and-reset: the stored code is returned and replaced by RTC_ERROR_NONE
+    RTCError& pending = *errorHandler.error();
+    const RTCError result = pending;
+    pending = RTC_ERROR_NONE;
+    return result;
+  }
+
+  void Device::setThreadErrorCode(RTCError error)
+  { // keeps the first error: a code that is still pending is never overwritten
+    RTCError& pending = *g_errorHandler.error();
+    if (pending != RTC_ERROR_NONE) return;
+    pending = error;
+  }
+
+  RTCError Device::getThreadErrorCode()
+  { // read-and-reset: the stored code is returned and replaced by RTC_ERROR_NONE
+    RTCError& pending = *g_errorHandler.error();
+    const RTCError result = pending;
+    pending = RTC_ERROR_NONE;
+    return result;
+  }
+
+  void Device::process_error(Device* device, RTCError error, const char* str)
+  {
+    /* without a device (construction failed) only the thread error code can be stored */
+    if (device == nullptr) { setThreadErrorCode(error); return; }
+
+    /* describe the error on std::cerr when verbosity level 1 is enabled */
+    if (device->verbosity(1))
+    {
+      const char* msg;
+      switch (error) {
+      case RTC_ERROR_NONE             : msg = "Embree: No error"; break;
+      case RTC_ERROR_UNKNOWN          : msg = "Embree: Unknown error"; break;
+      case RTC_ERROR_INVALID_ARGUMENT : msg = "Embree: Invalid argument"; break;
+      case RTC_ERROR_INVALID_OPERATION: msg = "Embree: Invalid operation"; break;
+      case RTC_ERROR_OUT_OF_MEMORY    : msg = "Embree: Out of memory"; break;
+      case RTC_ERROR_UNSUPPORTED_CPU  : msg = "Embree: Unsupported CPU"; break;
+      default                         : msg = "Embree: Invalid error code"; break;
+      }
+      std::cerr << msg;
+      if (str) std::cerr << ", (" << str << ")";
+      std::cerr << std::endl;
+    }
+
+    /* forward the error to the user specified callback, if one is installed */
+    if (device->error_function) device->error_function(device->error_function_userptr,error,str);
+
+    /* finally remember the error code on the device */
+    device->setDeviceErrorCode(error);
+  }
+
+  void Device::memoryMonitor(ssize_t bytes, bool post)
+  {
+    /* nothing to report without a registered callback or for a zero-byte change */
+    if (!State::memory_monitor_function || bytes == 0) return;
+
+    /* a callback returning true accepts the change */
+    if (State::memory_monitor_function(State::memory_monitor_userptr,bytes,post)) return;
+
+    if (bytes > 0) // only throw exception when we allocate memory to never throw inside a destructor
+      throw_RTCError(RTC_ERROR_OUT_OF_MEMORY,"memory monitor forced termination");
+  }
+
+  size_t getMaxNumThreads()
+  { // largest thread count registered by any device in g_num_threads_map
+    size_t result = 0;
+    for (const auto& entry : g_num_threads_map)
+      result = max(result, entry.second);
+
+    /* a maximum of 0 is reported as "no limit" */
+    return (result != 0) ? result : std::numeric_limits<size_t>::max();
+  }
+
+  size_t getMaxCacheSize()
+  { // largest cache size registered by any device in g_cache_size_map, 0 when the map is empty
+    size_t result = 0;
+    for (const auto& entry : g_cache_size_map)
+      result = max(result, entry.second);
+    return result;
+  }
+ 
+  void Device::setCacheSize(size_t bytes) 
+  { // does nothing unless EMBREE_GEOMETRY_SUBDIVISION is defined
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    Lock<MutexSys> lock(g_mutex);
+    /* a size of 0 withdraws the request of this device */
+    if (bytes != 0) g_cache_size_map[this] = bytes;
+    else            g_cache_size_map.erase(this);
+    /* the tessellation cache is resized to the largest request of all devices */
+    resizeTessellationCache(getMaxCacheSize());
+#endif
+  }
+
+  void Device::initTaskingSystem(size_t numThreads) 
+  {
+    Lock<MutexSys> lock(g_mutex);
+
+    /* a request of 0 threads is registered as "no limit" */
+    const size_t requested = (numThreads != 0) ? numThreads : std::numeric_limits<size_t>::max();
+    g_num_threads_map[this] = requested;
+
+    /* create task scheduler for the largest request of all registered devices */
+    const size_t schedulerThreads = getMaxNumThreads();
+    TaskScheduler::create(schedulerThreads,State::set_affinity,State::start_threads);
+#if USE_TASK_ARENA
+    const size_t arenaThreads = min(schedulerThreads,TaskScheduler::threadCount());
+    const size_t userThreads = min(max(numUserThreads,(size_t)1),arenaThreads);
+    arena = make_unique(new tbb::task_arena((int)arenaThreads,(unsigned int)userThreads));
+#endif
+  }
+
+  void Device::exitTaskingSystem() 
+  {
+    Lock<MutexSys> lock(g_mutex);
+    g_num_threads_map.erase(this);
+
+    /* other devices remain: configure the scheduler for their thread count */
+    if (g_num_threads_map.size() != 0) {
+      const size_t remainingMax = getMaxNumThreads();
+      TaskScheduler::create(remainingMax,State::set_affinity,State::start_threads);
+    }
+    /* this was the last registered device: terminate tasking system */
+    else {
+      TaskScheduler::destroy();
+    }
+#if USE_TASK_ARENA
+    arena.reset();
+#endif
+  }
+
+  void Device::setProperty(const RTCDeviceProperty prop, ssize_t val)
+  {
+    /* hidden internal properties: the four debug slots are the only writable ones */
+    const size_t id = (size_t)prop;
+
+    if (id == 1000000) { debug_int0 = val; return; }
+    if (id == 1000001) { debug_int1 = val; return; }
+    if (id == 1000002) { debug_int2 = val; return; }
+    if (id == 1000003) { debug_int3 = val; return; }
+
+    /* every other property id is rejected */
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown writable property");
+  }
+
+  ssize_t Device::getProperty(const RTCDeviceProperty prop)
+  { // returns the value of a readable property; unknown ids end in throw_RTCError(RTC_ERROR_INVALID_ARGUMENT)
+    size_t iprop = (size_t)prop; // numeric id, used for the hidden regression-test ranges below
+
+    /* get name of internal regression test */
+    if (iprop >= 2000000 && iprop < 3000000)
+    {
+      RegressionTest* test = getRegressionTest(iprop-2000000);
+      if (test) return (ssize_t) test->name.c_str(); // the address of the name string is returned as an integer
+      else      return 0;
+    }
+
+    /* run internal regression test */
+    if (iprop >= 3000000 && iprop < 4000000)
+    {
+      RegressionTest* test = getRegressionTest(iprop-3000000);
+      if (test) return test->run(); // the result of the test run is the property value
+      else      return 0;
+    }
+
+    /* documented properties */
+    switch (prop) 
+    {
+    case RTC_DEVICE_PROPERTY_VERSION_MAJOR: return RTC_VERSION_MAJOR;
+    case RTC_DEVICE_PROPERTY_VERSION_MINOR: return RTC_VERSION_MINOR;
+    case RTC_DEVICE_PROPERTY_VERSION_PATCH: return RTC_VERSION_PATCH;
+    case RTC_DEVICE_PROPERTY_VERSION      : return RTC_VERSION;
+
+#if defined(EMBREE_TARGET_SIMD4) && defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED:  return hasISA(SSE2); // compiled in AND available at runtime
+#else
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY4_SUPPORTED:  return 0;
+#endif
+
+#if defined(EMBREE_TARGET_SIMD8) && defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED:  return hasISA(AVX);
+#else
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY8_SUPPORTED:  return 0;
+#endif
+
+#if defined(EMBREE_TARGET_SIMD16) && defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return hasISA(AVX512KNL) | hasISA(AVX512SKX); // either AVX512 flavour is sufficient
+#else
+    case RTC_DEVICE_PROPERTY_NATIVE_RAY16_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_RAY_PACKETS)
+    case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED:  return 1;
+#else
+    case RTC_DEVICE_PROPERTY_RAY_STREAM_SUPPORTED:  return 0;
+#endif
+    
+#if defined(EMBREE_RAY_MASK)
+    case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_RAY_MASK_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_BACKFACE_CULLING)
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_BACKFACE_CULLING_CURVES)
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_BACKFACE_CULLING_CURVES_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_COMPACT_POLYS)
+    case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_COMPACT_POLYS_ENABLED: return 0;
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+    case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_FILTER_FUNCTION_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_IGNORE_INVALID_RAYS)
+    case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_IGNORE_INVALID_RAYS_ENABLED: return 0;
+#endif
+
+#if defined(TASKING_INTERNAL)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 0; // 0 = internal tasking system
+#endif
+
+#if defined(TASKING_TBB)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 1; // 1 = TBB
+#endif
+
+#if defined(TASKING_PPL)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 2; // 2 = PPL
+#endif
+            
+#if defined(TASKING_GCD) && defined(BUILD_IOS)
+    case RTC_DEVICE_PROPERTY_TASKING_SYSTEM: return 3; // 3 = GCD (only with BUILD_IOS); with no tasking macro defined this id falls through to default
+#endif
+
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_TRIANGLE_GEOMETRY_SUPPORTED: return 0;
+#endif
+        
+#if defined(EMBREE_GEOMETRY_QUAD)
+    case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_QUAD_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_CURVE)
+    case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_CURVE_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_SUBDIVISION_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_USER)
+    case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_USER_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(EMBREE_GEOMETRY_POINT)
+    case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_POINT_GEOMETRY_SUPPORTED: return 0;
+#endif
+
+#if defined(TASKING_PPL)
+    case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0;
+#elif defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8)
+    case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 0; // not reported as supported for TBB interface major versions below 8
+#else
+    case RTC_DEVICE_PROPERTY_JOIN_COMMIT_SUPPORTED: return 1;
+#endif
+
+#if defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION
+    case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 1;
+#else
+    case RTC_DEVICE_PROPERTY_PARALLEL_COMMIT_SUPPORTED: return 0;
+#endif
+
+    default: throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown readable property"); break; // NOTE(review): no return follows the switch - presumably throw_RTCError never returns; confirm
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/device.h b/thirdparty/embree-aarch64/kernels/common/device.h
new file mode 100644
index 0000000000..e9a81bb109
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/device.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "state.h"
+#include "accel.h"
+
+namespace embree
+{
+  class BVH4Factory;
+  class BVH8Factory;
+
+  class Device : public State, public MemoryMonitorInterface
+  {
+    ALIGNED_CLASS_(16);
+
+  public:
+
+    /*! Device construction */
+    Device (const char* cfg);
+
+    /*! Device destruction */
+    virtual ~Device ();
+
+    /*! prints build, CPU and configuration info about the device to std::cout */
+    void print();
+
+    /*! stores the device error code, unless an earlier error is still pending */
+    void setDeviceErrorCode(RTCError error);
+
+    /*! returns the device error code and resets it to RTC_ERROR_NONE */
+    RTCError getDeviceErrorCode();
+
+    /*! stores the error code used when no device exists, unless an earlier error is still pending */
+    static void setThreadErrorCode(RTCError error);
+
+    /*! returns the error code used when no device exists and resets it to RTC_ERROR_NONE */
+    static RTCError getThreadErrorCode();
+
+    /*! processes error codes (optional print, error callback, recording of the code), do not call directly */
+    static void process_error(Device* device, RTCError error, const char* str);
+
+    /*! invokes the memory monitor callback; a rejected allocation (bytes > 0) raises RTC_ERROR_OUT_OF_MEMORY */
+    void memoryMonitor(ssize_t bytes, bool post);
+
+    /*! sets the size of the software cache requested by this device, 0 removes the request */
+    void setCacheSize(size_t bytes);
+
+    /*! sets a property; only the hidden debug properties 1000000..1000003 are writable */
+    void setProperty(const RTCDeviceProperty prop, ssize_t val);
+
+    /*! gets a property; unknown properties raise RTC_ERROR_INVALID_ARGUMENT */
+    ssize_t getProperty(const RTCDeviceProperty prop);
+
+  private:
+
+    /*! initializes the tasking system; numThreads == 0 requests no thread limit */
+    void initTaskingSystem(size_t numThreads);
+
+    /*! shuts down the tasking system once the last registered device exits */
+    void exitTaskingSystem();
+
+    /*! debug variables, written through setProperty with the hidden property ids 1000000..1000003 */
+  public:
+    static ssize_t debug_int0;
+    static ssize_t debug_int1;
+    static ssize_t debug_int2;
+    static ssize_t debug_int3;
+
+  public:
+    std::unique_ptr<BVH4Factory> bvh4_factory;
+#if defined(EMBREE_TARGET_SIMD8)
+    std::unique_ptr<BVH8Factory> bvh8_factory;
+#endif
+    
+#if USE_TASK_ARENA
+    std::unique_ptr<tbb::task_arena> arena; // created in initTaskingSystem, released in exitTaskingSystem
+#endif
+    
+    /* ray streams filter */
+    RayStreamFilterFuncs rayStreamFilters;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.cpp b/thirdparty/embree-aarch64/kernels/common/geometry.cpp
new file mode 100644
index 0000000000..b3aa8e3396
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/geometry.cpp
@@ -0,0 +1,259 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "geometry.h"
+#include "scene.h"
+
+namespace embree
+{
+  const char* Geometry::gtype_names[Geometry::GTY_END] = // printable names indexed by the numeric GType value
+  {
+    "flat_linear_curve",
+    "round_linear_curve",
+    "oriented_linear_curve",
+    "", // index 3 is GTY_CONE_LINEAR_CURVE in geometry.h, which has no name here
+    "flat_bezier_curve",
+    "round_bezier_curve",
+    "oriented_bezier_curve",
+    "", // index 7: no GType with this value
+    "flat_bspline_curve",
+    "round_bspline_curve",
+    "oriented_bspline_curve",
+    "", // index 11: no GType with this value
+    "flat_hermite_curve",
+    "round_hermite_curve",
+    "oriented_hermite_curve",
+    "", // index 15: no GType with this value
+    "flat_catmull_rom_curve",
+    "round_catmull_rom_curve",
+    "oriented_catmull_rom_curve",
+    "", // index 19: no GType with this value
+    "triangles",
+    "quads",
+    "grid",
+    "subdivs",
+    "", // index 24: no GType with this value
+    "sphere",
+    "disc",
+    "oriented_disc",
+    "", // index 28: no GType with this value
+    "usergeom",
+    "instance_cheap",
+    "instance_expensive",
+  };
+     
+  Geometry::Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps) 
+    : device(device), userPtr(nullptr),
+      numPrimitives(numPrimitives), numTimeSteps(unsigned(numTimeSteps)), fnumTimeSegments(float(numTimeSteps-1)), time_range(0.0f,1.0f), // NOTE(review): numTimeSteps-1 is unsigned and wraps for 0 - presumably callers pass at least 1
+      mask(-1),
+      gtype(gtype),
+      gsubtype(GTY_SUBTYPE_DEFAULT),
+      quality(RTC_BUILD_QUALITY_MEDIUM),
+      state((unsigned)State::MODIFIED), // a new geometry starts uncommitted
+      enabled(true),
+      intersectionFilterN(nullptr), occlusionFilterN(nullptr), pointQueryFunc(nullptr) // no callbacks installed initially
+  {
+    device->refInc(); // take a reference on the device, dropped again in the destructor
+  }
+
+  Geometry::~Geometry()
+  {
+    device->refDec(); // drops the device reference taken in the constructor
+  }
+
+  void Geometry::setNumPrimitives(unsigned int numPrimitives_in)
+  { // only a changed primitive count marks the geometry as modified
+    if (numPrimitives != numPrimitives_in)
+    {
+      numPrimitives = numPrimitives_in;
+      Geometry::update();
+    }
+  }
+
+  void Geometry::setNumTimeSteps (unsigned int numTimeSteps_in)
+  {
+    /* an unchanged time step count leaves the geometry untouched */
+    const bool unchanged = (numTimeSteps == numTimeSteps_in);
+    if (unchanged) return;
+
+    /* the number of time segments is one less than the number of time steps */
+    numTimeSteps = numTimeSteps_in;
+    fnumTimeSegments = float(numTimeSteps_in-1);
+    Geometry::update();
+  }
+
+  void Geometry::setTimeRange (const BBox1f range)
+  { // stores the new time range unconditionally (no equality check, unlike setNumTimeSteps)
+    time_range = range;
+    Geometry::update(); // qualified call: always runs the base-class update
+  }
+  
+  void Geometry::update()
+  { // marks the geometry as modified; preCommit() rejects it until commit() is called
+    ++modCounter_; // FIXME: required?
+    state = (unsigned)State::MODIFIED;
+  }
+  
+  void Geometry::commit() 
+  { // bumps the modification counter and moves the geometry to the COMMITTED state
+    ++modCounter_;
+    state = (unsigned)State::COMMITTED;
+  }
+
+  void Geometry::preCommit()
+  { // raises RTC_ERROR_INVALID_OPERATION while the geometry is still in the MODIFIED state
+    if (State::MODIFIED == (State)state) // State is the Geometry::State enum here; state is stored as unsigned
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"geometry not committed");
+  }
+
+  void Geometry::postCommit()
+  { // this implementation does nothing
+  }
+
+  void Geometry::enable () 
+  { // enabling an already enabled geometry changes nothing
+    if (!isEnabled())
+    {
+      enabled = true;
+      ++modCounter_;
+    }
+  }
+
+  void Geometry::disable () 
+  { // disabling an already disabled geometry changes nothing
+    if (!isDisabled())
+    {
+      enabled = false;
+      ++modCounter_;
+    }
+  }
+
+  void Geometry::setUserData (void* ptr)
+  { // stores the pointer as is; neither update() nor the modification counter is touched
+    userPtr = ptr;
+  }
+  
+  void Geometry::setIntersectionFilterFunctionN (RTCFilterFunctionN filter) 
+  { // filters are accepted for triangle, quad, curve/point, subdivision, user and grid geometries only
+    const auto filterable = MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH;
+    if ((getTypeMask() & filterable) == 0)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); 
+    intersectionFilterN = filter;
+  }
+
+  void Geometry::setOcclusionFilterFunctionN (RTCFilterFunctionN filter) 
+  { // filters are accepted for triangle, quad, curve/point, subdivision, user and grid geometries only
+    const auto filterable = MTY_TRIANGLE_MESH | MTY_QUAD_MESH | MTY_CURVES | MTY_SUBDIV_MESH | MTY_USER_GEOMETRY | MTY_GRID_MESH;
+    if ((getTypeMask() & filterable) == 0)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"filter functions not supported for this geometry"); 
+    occlusionFilterN = filter;
+  }
+  
+  void Geometry::setPointQueryFunction (RTCPointQueryFunction func) 
+  { // stores the callback without any geometry type check; pointQuery() invokes it when it is non-null
+    pointQueryFunc = func;
+  }
+
+  void Geometry::interpolateN(const RTCInterpolateNArguments* const args)
+  { // batched interpolation: runs interpolate() once per active element and scatters the results
+    const void* valid_i = args->valid;
+    const unsigned* primIDs = args->primIDs;
+    const float* u = args->u;
+    const float* v = args->v;
+    unsigned int N = args->N;
+    RTCBufferType bufferType = args->bufferType;
+    unsigned int bufferSlot = args->bufferSlot;
+    float* P = args->P;
+    float* dPdu = args->dPdu;
+    float* dPdv = args->dPdv;
+    float* ddPdudu = args->ddPdudu;
+    float* ddPdvdv = args->ddPdvdv;
+    float* ddPdudv = args->ddPdudv;
+    unsigned int valueCount = args->valueCount;
+
+    if (valueCount > 256) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximally 256 floating point values can be interpolated per vertex"); // the temporaries below hold 256 floats each
+    const int* valid = (const int*) valid_i; // optional per-element mask read as ints; nullptr means all N elements are processed
+ 
+    __aligned(64) float P_tmp[256];
+    __aligned(64) float dPdu_tmp[256];
+    __aligned(64) float dPdv_tmp[256];
+    __aligned(64) float ddPdudu_tmp[256];
+    __aligned(64) float ddPdvdv_tmp[256];
+    __aligned(64) float ddPdudv_tmp[256];
+
+    float* Pt = P ? P_tmp : nullptr; // a temporary is only passed on for outputs the caller requested
+    float* dPdut = nullptr, *dPdvt = nullptr;
+    if (dPdu) { dPdut = dPdu_tmp; dPdvt = dPdv_tmp; } // both first derivatives are selected by dPdu alone
+    float* ddPdudut = nullptr, *ddPdvdvt = nullptr, *ddPdudvt = nullptr;
+    if (ddPdudu) { ddPdudut = ddPdudu_tmp; ddPdvdvt = ddPdvdv_tmp; ddPdudvt = ddPdudv_tmp; } // all second derivatives are selected by ddPdudu alone
+    
+    for (unsigned int i=0; i<N; i++)
+    {
+      if (valid && !valid[i]) continue; // skip elements whose mask entry is 0
+
+      RTCInterpolateArguments iargs;
+      iargs.primID = primIDs[i];
+      iargs.u = u[i];
+      iargs.v = v[i];
+      iargs.bufferType = bufferType;
+      iargs.bufferSlot = bufferSlot;
+      iargs.P = Pt;
+      iargs.dPdu = dPdut;
+      iargs.dPdv = dPdvt;
+      iargs.ddPdudu = ddPdudut;
+      iargs.ddPdvdv = ddPdvdvt;
+      iargs.ddPdudv = ddPdudvt;
+      iargs.valueCount = valueCount;
+      interpolate(&iargs);
+      
+      if (likely(P)) {
+        for (unsigned int j=0; j<valueCount; j++) 
+          P[j*N+i] = Pt[j]; // value j of element i is written at index j*N+i
+      }
+      if (likely(dPdu)) 
+      {
+        for (unsigned int j=0; j<valueCount; j++) {
+          dPdu[j*N+i] = dPdut[j];
+          dPdv[j*N+i] = dPdvt[j]; // NOTE(review): dPdv is written without a null check - assumes it is set whenever dPdu is; confirm
+        }
+      }
+      if (likely(ddPdudu)) 
+      {
+        for (unsigned int j=0; j<valueCount; j++) {
+          ddPdudu[j*N+i] = ddPdudut[j];
+          ddPdvdv[j*N+i] = ddPdvdvt[j]; // NOTE(review): ddPdvdv and ddPdudv are assumed set whenever ddPdudu is; confirm
+          ddPdudv[j*N+i] = ddPdudvt[j];
+        }
+      }
+    }
+  }
+    
+  bool Geometry::pointQuery(PointQuery* query, PointQueryContext* context)
+  { // runs the point query callbacks for context->primID; returns true if any callback returned true
+    assert(context->primID < size());
+   
+    RTCPointQueryFunctionArguments args;
+    args.query           = (RTCPointQuery*)context->query_ws;
+    args.userPtr         = context->userPtr;
+    args.primID          = context->primID;
+    args.geomID          = context->geomID;
+    args.context         = context->userContext;
+    args.similarityScale = context->similarityScale;
+    
+    bool update = false;
+    if(context->func)  update |= context->func(&args); // callback carried by the query context, called first
+    if(pointQueryFunc) update |= pointQueryFunc(&args); // callback installed with setPointQueryFunction; both run when both are set
+
+    if (update && context->userContext->instStackSize > 0) // refresh is only done with a non-empty instance stack
+    {
+      // update point query
+      if (context->query_type == POINT_QUERY_TYPE_AABB) {
+        context->updateAABB();
+      } else {
+        assert(context->similarityScale > 0.f);
+        query->radius = context->query_ws->radius * context->similarityScale; // radius of query_ws scaled by the similarity scale
+      }
+    }
+    return update;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/geometry.h b/thirdparty/embree-aarch64/kernels/common/geometry.h
new file mode 100644
index 0000000000..953974bfd2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/geometry.h
@@ -0,0 +1,582 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "buffer.h"
+#include "../common/point_query.h"
+#include "../builders/priminfo.h"
+
+namespace embree
+{
+  class Scene;
+  class Geometry;
+
+  struct GeometryCounts // per-type counters of enabled primitives; "MB" members count the motion blurred variants
+  {
+    __forceinline GeometryCounts() // all counters start at zero
+      : numFilterFunctions(0),
+        numTriangles(0), numMBTriangles(0), 
+        numQuads(0), numMBQuads(0), 
+        numBezierCurves(0), numMBBezierCurves(0), 
+        numLineSegments(0), numMBLineSegments(0), 
+        numSubdivPatches(0), numMBSubdivPatches(0), 
+        numUserGeometries(0), numMBUserGeometries(0), 
+        numInstancesCheap(0), numMBInstancesCheap(0), 
+        numInstancesExpensive(0), numMBInstancesExpensive(0), 
+        numGrids(0), numMBGrids(0), 
+        numPoints(0), numMBPoints(0) {}
+
+    __forceinline size_t size() const { // sum of all primitive counters; numFilterFunctions is not a primitive count and is excluded
+      return    numTriangles + numQuads + numBezierCurves + numLineSegments + numSubdivPatches + numUserGeometries + numInstancesCheap + numInstancesExpensive + numGrids + numPoints
+              + numMBTriangles + numMBQuads + numMBBezierCurves + numMBLineSegments + numMBSubdivPatches + numMBUserGeometries + numMBInstancesCheap + numMBInstancesExpensive + numMBGrids + numMBPoints;
+    }
+
+    __forceinline unsigned int enabledGeometryTypesMask() const // bit set of the geometry types that have a non-zero count
+    {
+      unsigned int mask = 0; // bits 0..8: non motion blurred types
+      if (numTriangles) mask |= 1 << 0;
+      if (numQuads) mask |= 1 << 1;
+      if (numBezierCurves+numLineSegments) mask |= 1 << 2; // curves and line segments share one bit
+      if (numSubdivPatches) mask |= 1 << 3;
+      if (numUserGeometries) mask |= 1 << 4;
+      if (numInstancesCheap) mask |= 1 << 5;
+      if (numInstancesExpensive) mask |= 1 << 6;
+      if (numGrids) mask |= 1 << 7;
+      if (numPoints) mask |= 1 << 8;
+
+      unsigned int maskMB = 0; // bits 0..8: motion blurred types, same bit order as above
+      if (numMBTriangles) maskMB |= 1 << 0;
+      if (numMBQuads) maskMB |= 1 << 1;
+      if (numMBBezierCurves+numMBLineSegments) maskMB |= 1 << 2;
+      if (numMBSubdivPatches) maskMB |= 1 << 3;
+      if (numMBUserGeometries) maskMB |= 1 << 4;
+      if (numMBInstancesCheap) maskMB |= 1 << 5;
+      if (numMBInstancesExpensive) maskMB |= 1 << 6;
+      if (numMBGrids) maskMB |= 1 << 7;
+      if (numMBPoints) maskMB |= 1 << 8;
+      
+      return (mask<<16) | maskMB; // each mask uses 9 bits: a shift of 8 made the triangle bit collide with the MB point bit (and '+' carried into the quad bit)
+    }
+
+    __forceinline GeometryCounts operator+ (GeometryCounts const & rhs) const // member-wise sum of two counter sets
+    {
+      GeometryCounts ret;
+      ret.numFilterFunctions = numFilterFunctions + rhs.numFilterFunctions;
+      ret.numTriangles = numTriangles + rhs.numTriangles;
+      ret.numMBTriangles = numMBTriangles + rhs.numMBTriangles;
+      ret.numQuads = numQuads + rhs.numQuads;
+      ret.numMBQuads = numMBQuads + rhs.numMBQuads;
+      ret.numBezierCurves = numBezierCurves + rhs.numBezierCurves;
+      ret.numMBBezierCurves = numMBBezierCurves + rhs.numMBBezierCurves;
+      ret.numLineSegments = numLineSegments + rhs.numLineSegments;
+      ret.numMBLineSegments = numMBLineSegments + rhs.numMBLineSegments;
+      ret.numSubdivPatches = numSubdivPatches + rhs.numSubdivPatches;
+      ret.numMBSubdivPatches = numMBSubdivPatches + rhs.numMBSubdivPatches;
+      ret.numUserGeometries = numUserGeometries + rhs.numUserGeometries;
+      ret.numMBUserGeometries = numMBUserGeometries + rhs.numMBUserGeometries;
+      ret.numInstancesCheap = numInstancesCheap + rhs.numInstancesCheap;
+      ret.numMBInstancesCheap = numMBInstancesCheap + rhs.numMBInstancesCheap;
+      ret.numInstancesExpensive = numInstancesExpensive + rhs.numInstancesExpensive;
+      ret.numMBInstancesExpensive = numMBInstancesExpensive + rhs.numMBInstancesExpensive;
+      ret.numGrids = numGrids + rhs.numGrids;
+      ret.numMBGrids = numMBGrids + rhs.numMBGrids;
+      ret.numPoints = numPoints + rhs.numPoints;
+      ret.numMBPoints = numMBPoints + rhs.numMBPoints;
+
+      return ret;
+    }
+
+    size_t numFilterFunctions;       //!< number of geometries with filter functions enabled
+    size_t numTriangles;             //!< number of enabled triangles
+    size_t numMBTriangles;           //!< number of enabled motion blurred triangles
+    size_t numQuads;                 //!< number of enabled quads
+    size_t numMBQuads;               //!< number of enabled motion blurred quads
+    size_t numBezierCurves;          //!< number of enabled curves
+    size_t numMBBezierCurves;        //!< number of enabled motion blurred curves
+    size_t numLineSegments;          //!< number of enabled line segments
+    size_t numMBLineSegments;        //!< number of enabled motion blurred line segments
+    size_t numSubdivPatches;         //!< number of enabled subdivision patches
+    size_t numMBSubdivPatches;       //!< number of enabled motion blurred subdivision patches
+    size_t numUserGeometries;        //!< number of enabled user geometries
+    size_t numMBUserGeometries;      //!< number of enabled motion blurred user geometries
+    size_t numInstancesCheap;        //!< number of enabled cheap instances
+    size_t numMBInstancesCheap;      //!< number of enabled motion blurred cheap instances
+    size_t numInstancesExpensive;    //!< number of enabled expensive instances
+    size_t numMBInstancesExpensive;  //!< number of enabled motion blurred expensive instances
+    size_t numGrids;                 //!< number of enabled grid geometries
+    size_t numMBGrids;               //!< number of enabled motion blurred grid geometries
+    size_t numPoints;                //!< number of enabled points
+    size_t numMBPoints;              //!< number of enabled motion blurred points
+  };
+
+  /*! Base class all geometries are derived from */
+  class Geometry : public RefCount
+  {
+    friend class Scene;
+  public:
+
+    /*! type of geometry */
+    enum GType
+    {
+      GTY_FLAT_LINEAR_CURVE = 0,
+      GTY_ROUND_LINEAR_CURVE = 1,
+      GTY_ORIENTED_LINEAR_CURVE = 2,
+      GTY_CONE_LINEAR_CURVE = 3,
+      
+      GTY_FLAT_BEZIER_CURVE = 4,
+      GTY_ROUND_BEZIER_CURVE = 5,
+      GTY_ORIENTED_BEZIER_CURVE = 6,
+      
+      GTY_FLAT_BSPLINE_CURVE = 8,
+      GTY_ROUND_BSPLINE_CURVE = 9,
+      GTY_ORIENTED_BSPLINE_CURVE = 10,
+
+      GTY_FLAT_HERMITE_CURVE = 12,
+      GTY_ROUND_HERMITE_CURVE = 13,
+      GTY_ORIENTED_HERMITE_CURVE = 14,
+      
+      GTY_FLAT_CATMULL_ROM_CURVE = 16,
+      GTY_ROUND_CATMULL_ROM_CURVE = 17,
+      GTY_ORIENTED_CATMULL_ROM_CURVE = 18,      
+
+      GTY_TRIANGLE_MESH = 20,
+      GTY_QUAD_MESH = 21,
+      GTY_GRID_MESH = 22,
+      GTY_SUBDIV_MESH = 23,
+
+      GTY_SPHERE_POINT = 25,
+      GTY_DISC_POINT = 26,
+      GTY_ORIENTED_DISC_POINT = 27,
+      
+      GTY_USER_GEOMETRY = 29,
+      GTY_INSTANCE_CHEAP = 30,
+      GTY_INSTANCE_EXPENSIVE = 31,
+      GTY_END = 32,
+
+      GTY_BASIS_LINEAR = 0,
+      GTY_BASIS_BEZIER = 4,
+      GTY_BASIS_BSPLINE = 8,
+      GTY_BASIS_HERMITE = 12,
+      GTY_BASIS_CATMULL_ROM = 16,
+      GTY_BASIS_MASK = 28,
+
+      GTY_SUBTYPE_FLAT_CURVE = 0,
+      GTY_SUBTYPE_ROUND_CURVE = 1,
+      GTY_SUBTYPE_ORIENTED_CURVE = 2,
+      GTY_SUBTYPE_MASK = 3,
+    };
+
+    enum GSubType
+    {
+      GTY_SUBTYPE_DEFAULT= 0,
+      GTY_SUBTYPE_INSTANCE_LINEAR = 0,
+      GTY_SUBTYPE_INSTANCE_QUATERNION = 1
+    };
+
+    enum GTypeMask
+    {
+      MTY_FLAT_LINEAR_CURVE = 1ul << GTY_FLAT_LINEAR_CURVE,
+      MTY_ROUND_LINEAR_CURVE = 1ul << GTY_ROUND_LINEAR_CURVE,
+      MTY_CONE_LINEAR_CURVE = 1ul << GTY_CONE_LINEAR_CURVE,
+      MTY_ORIENTED_LINEAR_CURVE = 1ul << GTY_ORIENTED_LINEAR_CURVE,
+      
+      MTY_FLAT_BEZIER_CURVE = 1ul << GTY_FLAT_BEZIER_CURVE,
+      MTY_ROUND_BEZIER_CURVE = 1ul << GTY_ROUND_BEZIER_CURVE,
+      MTY_ORIENTED_BEZIER_CURVE = 1ul << GTY_ORIENTED_BEZIER_CURVE,
+      
+      MTY_FLAT_BSPLINE_CURVE = 1ul << GTY_FLAT_BSPLINE_CURVE,
+      MTY_ROUND_BSPLINE_CURVE = 1ul << GTY_ROUND_BSPLINE_CURVE,
+      MTY_ORIENTED_BSPLINE_CURVE = 1ul << GTY_ORIENTED_BSPLINE_CURVE,
+
+      MTY_FLAT_HERMITE_CURVE = 1ul << GTY_FLAT_HERMITE_CURVE,
+      MTY_ROUND_HERMITE_CURVE = 1ul << GTY_ROUND_HERMITE_CURVE,
+      MTY_ORIENTED_HERMITE_CURVE = 1ul << GTY_ORIENTED_HERMITE_CURVE,
+
+      MTY_FLAT_CATMULL_ROM_CURVE = 1ul << GTY_FLAT_CATMULL_ROM_CURVE,
+      MTY_ROUND_CATMULL_ROM_CURVE = 1ul << GTY_ROUND_CATMULL_ROM_CURVE,
+      MTY_ORIENTED_CATMULL_ROM_CURVE = 1ul << GTY_ORIENTED_CATMULL_ROM_CURVE,
+
+      MTY_CURVE2 = MTY_FLAT_LINEAR_CURVE | MTY_ROUND_LINEAR_CURVE | MTY_CONE_LINEAR_CURVE | MTY_ORIENTED_LINEAR_CURVE,
+      
+      MTY_CURVE4 = MTY_FLAT_BEZIER_CURVE | MTY_ROUND_BEZIER_CURVE | MTY_ORIENTED_BEZIER_CURVE |
+                   MTY_FLAT_BSPLINE_CURVE | MTY_ROUND_BSPLINE_CURVE | MTY_ORIENTED_BSPLINE_CURVE |
+                   MTY_FLAT_HERMITE_CURVE | MTY_ROUND_HERMITE_CURVE | MTY_ORIENTED_HERMITE_CURVE |
+                   MTY_FLAT_CATMULL_ROM_CURVE | MTY_ROUND_CATMULL_ROM_CURVE | MTY_ORIENTED_CATMULL_ROM_CURVE,
+
+      MTY_SPHERE_POINT = 1ul << GTY_SPHERE_POINT,
+      MTY_DISC_POINT = 1ul << GTY_DISC_POINT,
+      MTY_ORIENTED_DISC_POINT = 1ul << GTY_ORIENTED_DISC_POINT,
+
+      MTY_POINTS = MTY_SPHERE_POINT | MTY_DISC_POINT | MTY_ORIENTED_DISC_POINT,
+
+      MTY_CURVES = MTY_CURVE2 | MTY_CURVE4 | MTY_POINTS,
+
+      MTY_TRIANGLE_MESH = 1ul << GTY_TRIANGLE_MESH,
+      MTY_QUAD_MESH = 1ul << GTY_QUAD_MESH,
+      MTY_GRID_MESH = 1ul << GTY_GRID_MESH,
+      MTY_SUBDIV_MESH = 1ul << GTY_SUBDIV_MESH,
+      MTY_USER_GEOMETRY = 1ul << GTY_USER_GEOMETRY,
+
+      MTY_INSTANCE_CHEAP = 1ul << GTY_INSTANCE_CHEAP,
+      MTY_INSTANCE_EXPENSIVE = 1ul << GTY_INSTANCE_EXPENSIVE,
+      MTY_INSTANCE = MTY_INSTANCE_CHEAP | MTY_INSTANCE_EXPENSIVE
+    };
+
+    static const char* gtype_names[GTY_END];
+
+    enum class State : unsigned {
+      MODIFIED = 0,
+      COMMITTED = 1,
+    };
+
+  public:
+    
+    /*! Geometry constructor */
+    Geometry (Device* device, GType gtype, unsigned int numPrimitives, unsigned int numTimeSteps);
+
+    /*! Geometry destructor */
+    virtual ~Geometry();
+
+  public:
+
+    /*! tests if geometry is enabled */
+    __forceinline bool isEnabled() const { return enabled; }
+
+    /*! tests if geometry is disabled */
+    __forceinline bool isDisabled() const { return !isEnabled(); }
+
+    /*! true if an intersection filter or an occlusion filter is installed on this geometry */
+    __forceinline bool hasFilterFunctions () const {
+      return hasIntersectionFilter() || hasOcclusionFilter();
+    }
+
+    /*! returns geometry type */
+    __forceinline GType getType() const { return gtype; }
+
+    /*! returns curve type */
+    __forceinline GType getCurveType() const { return (GType)(gtype & GTY_SUBTYPE_MASK); }
+
+    /*! returns curve basis */
+    __forceinline GType getCurveBasis() const { return (GType)(gtype & GTY_BASIS_MASK); }
+
+    /*! returns geometry type mask: the single bit 1ul << gtype, built the same way as the MTY_* enumerators */
+    __forceinline GTypeMask getTypeMask() const { return (GTypeMask)(1ul << gtype); }
+
+    /*! returns number of primitives */
+    __forceinline size_t size() const { return numPrimitives; }
+
+    /*! sets the number of primitives */
+    virtual void setNumPrimitives(unsigned int numPrimitives_in);
+
+    /*! sets number of time steps */
+    virtual void setNumTimeSteps (unsigned int numTimeSteps_in);
+
+    /*! sets motion blur time range */
+    void setTimeRange (const BBox1f range);
+
+    /*! sets number of vertex attributes */
+    virtual void setVertexAttributeCount (unsigned int N) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! sets number of topologies */
+    virtual void setTopologyCount (unsigned int N) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! sets the build quality */
+    void setBuildQuality(RTCBuildQuality quality_in)
+    {
+      this->quality = quality_in;
+      Geometry::update();
+    }
+
+    /* calculate time segment itime and fractional time ftime */
+    __forceinline int timeSegment(float time, float& ftime) const {
+      return getTimeSegment(time,time_range.lower,time_range.upper,fnumTimeSegments,ftime);
+    }
+
+    template<int N>
+      __forceinline vint<N> timeSegment(const vfloat<N>& time, vfloat<N>& ftime) const {
+      return getTimeSegment(time,vfloat<N>(time_range.lower),vfloat<N>(time_range.upper),vfloat<N>(fnumTimeSegments),ftime);
+    }
+    
+    /* calculate overlapping time segment range */
+    __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+      return getTimeSegmentRange(range,time_range,fnumTimeSegments);
+    }
+
+    /* returns time that corresponds to time step */
+    __forceinline float timeStep(const int i) const {
+      assert(i>=0 && i<(int)numTimeSteps);
+      return time_range.lower + time_range.size()*float(i)/fnumTimeSegments;
+    }
+    
+    /*! for all geometries */
+  public:
+
+    /*! Enable geometry. */
+    virtual void enable();
+
+    /*! Update geometry. */
+    void update();
+    
+    /*! commit of geometry */
+    virtual void commit();
+
+    /*! Update geometry buffer. */
+    virtual void updateBuffer(RTCBufferType type, unsigned int slot) {
+      update(); // update everything for geometries not supporting this call
+    }
+    
+    /*! Disable geometry. */
+    virtual void disable();
+
+    /*! Verify the geometry */
+    virtual bool verify() { return true; }
+
+    /*! called before every build */
+    virtual void preCommit();
+  
+    /*! called after every build */
+    virtual void postCommit();
+
+    virtual void addElementsToCount (GeometryCounts & counts) const { // base-class default: always throws
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! sets constant tessellation rate for the geometry */
+    virtual void setTessellationRate(float N) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Sets the maximal curve radius scale allowed by min-width feature. */
+    virtual void setMaxRadiusScale(float s) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Set user data pointer. */
+    virtual void setUserData(void* ptr);
+      
+    /*! Get user data pointer. */
+    __forceinline void* getUserData() const {
+      return userPtr;
+    }
+
+    /*! interpolates user data to the specified u/v location */
+    virtual void interpolate(const RTCInterpolateArguments* const args) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! interpolates user data to the specified u/v locations */
+    virtual void interpolateN(const RTCInterpolateNArguments* const args);
+
+    /* point query api */
+    bool pointQuery(PointQuery* query, PointQueryContext* context);
+
+    /*! for subdivision surfaces only */
+  public:
+    virtual void setSubdivisionMode (unsigned topologyID, RTCSubdivisionMode mode) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual void setVertexAttributeTopology(unsigned int vertexBufferSlot, unsigned int indexBufferSlot) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Set displacement function. */
+    virtual void setDisplacementFunction (RTCDisplacementFunctionN filter) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getFirstHalfEdge(unsigned int faceID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getFace(unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    virtual unsigned int getNextHalfEdge(unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getPreviousHalfEdge(unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    virtual unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! get fast access to first vertex buffer if applicable */
+    virtual float * getCompactVertexArray () const {
+      return nullptr;
+    }
+
+    /*! Returns the modified counter - how many times the geo has been modified */
+    __forceinline unsigned int getModCounter () const {
+      return modCounter_;
+    }
+
+    /*! for triangle meshes and bezier curves only */
+  public:
+
+
+    /*! Sets ray mask. */
+    virtual void setMask(unsigned mask) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    /*! Sets specified buffer. */
+    virtual void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Gets specified buffer. */
+    virtual void* getBuffer(RTCBufferType type, unsigned int slot) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+    }
+
+    /*! Set intersection filter function for ray packets of size N. */
+    virtual void setIntersectionFilterFunctionN (RTCFilterFunctionN filterN);
+
+    /*! Set occlusion filter function for ray packets of size N. */
+    virtual void setOcclusionFilterFunctionN (RTCFilterFunctionN filterN);
+
+    /*! for instances only */
+  public:
+
+    /*! Sets the instanced scene */
+    virtual void setInstancedScene(const Ref<Scene>& scene) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry");
+    }
+
+    /*! Sets transformation of the instance */
+    virtual void setTransform(const AffineSpace3fa& transform, unsigned int timeStep) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Sets the quaternion decomposition qd of the instance for the given time step (base-class default: always throws) */
+    virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Returns the transformation of the instance */
+    virtual AffineSpace3fa getTransform(float time) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! for user geometries only */
+  public:
+
+    /*! Set bounds function. */
+    virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+
+    /*! Set intersect function for ray packets of size N. */
+    virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    /*! Set occlusion function for ray packets of size N. */
+    virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded) { 
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation not supported for this geometry"); 
+    }
+    
+    /*! Set point query function. */
+    void setPointQueryFunction(RTCPointQueryFunction func);
+
+    /*! returns number of time segments */
+    __forceinline unsigned numTimeSegments () const {
+      return numTimeSteps-1;
+    }
+
+  public:
+
+    virtual PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefArray not implemented for this geometry"); 
+    }
+
+    virtual PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const { // base-class default: always throws
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefArrayMB not implemented for this geometry"); // message reports this method's own name
+    }
+
+    virtual PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"createPrimRefMBArray not implemented for this geometry"); 
+    }
+
+    virtual LinearSpace3fa computeAlignedSpace(const size_t primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpace not implemented for this geometry"); 
+    }
+
+    virtual LinearSpace3fa computeAlignedSpaceMB(const size_t primID, const BBox1f time_range) const { // base-class default: always throws
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeAlignedSpaceMB not implemented for this geometry"); // message reports this method's own name
+    }
+    
+    virtual Vec3fa computeDirection(unsigned int primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); 
+    }
+
+    virtual Vec3fa computeDirection(unsigned int primID, size_t time) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"computeDirection not implemented for this geometry"); 
+    }
+
+    virtual BBox3fa vbounds(size_t primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); 
+    }
+    
+    virtual BBox3fa vbounds(const LinearSpace3fa& space, size_t primID) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); 
+    }
+
+    virtual BBox3fa vbounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t i, size_t itime = 0) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vbounds not implemented for this geometry"); 
+    }
+
+    virtual LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
+    }
+    
+    virtual LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
+    }
+
+    virtual LBBox3fa vlinearBounds(const Vec3fa& ofs, const float scale, const float r_scale0, const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"vlinearBounds not implemented for this geometry"); 
+    }
+    
+  public:
+    __forceinline bool hasIntersectionFilter() const { return intersectionFilterN != nullptr; }
+    __forceinline bool hasOcclusionFilter() const { return occlusionFilterN != nullptr; }
+
+  public:
+    Device* device;             //!< device this geometry belongs to
+
+    void* userPtr;              //!< user pointer
+    unsigned int numPrimitives; //!< number of primitives of this geometry
+    
+    unsigned int numTimeSteps;  //!< number of time steps
+    float fnumTimeSegments;     //!< number of time segments (precalculation)
+    BBox1f time_range;          //!< motion blur time range
+    
+    unsigned int mask;             //!< for masking out geometry
+    unsigned int modCounter_ = 1; //!< counter for every modification - used to rebuild scenes when geo is modified
+    
+    struct {
+      GType gtype : 8;                //!< geometry type
+      GSubType gsubtype : 8;          //!< geometry subtype
+      RTCBuildQuality quality : 3;    //!< build quality for geometry
+      unsigned state : 2;
+      bool enabled : 1;              //!< true if geometry is enabled
+    };
+       
+    RTCFilterFunctionN intersectionFilterN;
+    RTCFilterFunctionN occlusionFilterN;
+    RTCPointQueryFunction pointQueryFunc;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/hit.h b/thirdparty/embree-aarch64/kernels/common/hit.h
new file mode 100644
index 0000000000..32a198cdfe
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/hit.h
@@ -0,0 +1,114 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "ray.h"
+#include "instance_stack.h"
+
+namespace embree
+{
+  /* Hit structure for K hits; every member holds one value per hit */
+  template<int K>
+    struct HitK
+  {
+    /* Default construction does nothing (the constructor body is empty) */
+    __forceinline HitK() {}
+
+    /* Constructs a hit from the given IDs, u/v and normal; instID is taken from context->instID */
+    __forceinline HitK(const RTCIntersectContext* context, const vuint<K>& geomID, const vuint<K>& primID, const vfloat<K>& u, const vfloat<K>& v, const Vec3vf<K>& Ng)
+      : Ng(Ng), u(u), v(v), primID(primID), geomID(geomID) 
+    {
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) // pre-fill every level: copy() below may stop early for large level counts
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+      instance_id_stack::copy(context->instID, instID); // scalar context stack -> per-hit vector stack
+    }
+
+    /* Returns the number of hits held by this structure (K) */
+    static __forceinline size_t size() { return K; }
+
+  public:
+    Vec3vf<K> Ng;  // geometry normal
+    vfloat<K> u;         // barycentric u coordinate of hit
+    vfloat<K> v;         // barycentric v coordinate of hit
+    vuint<K> primID;      // primitive ID
+    vuint<K> geomID;      // geometry ID
+    vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT];      // instance ID stack, one entry per instancing level
+  };
+
+  /* Specialization for a single hit: same members as HitK<K>, stored as scalars */
+  template<>
+    struct __aligned(16) HitK<1>
+  {
+     /* Default construction does nothing (the constructor body is empty) */
+    __forceinline HitK() {}
+
+    /* Constructs a hit; instID is filled only by instance_id_stack::copy (no pre-fill with RTC_INVALID_GEOMETRY_ID as in HitK<K>) */
+    __forceinline HitK(const RTCIntersectContext* context, unsigned int geomID, unsigned int primID, float u, float v, const Vec3fa& Ng)
+      : Ng(Ng.x,Ng.y,Ng.z), u(u), v(v), primID(primID), geomID(geomID)
+    {
+      instance_id_stack::copy(context->instID, instID);
+    }
+
+    /* Returns the number of hits held by this structure (always 1) */
+    static __forceinline size_t size() { return 1; }
+
+  public:
+    Vec3<float> Ng;  // geometry normal
+    float u;         // barycentric u coordinate of hit
+    float v;         // barycentric v coordinate of hit
+    unsigned int primID;      // primitive ID
+    unsigned int geomID;      // geometry ID
+    unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT];      // instance ID stack, one entry per instancing level
+  };
+
+  /* Shortcuts */
+  typedef HitK<1>  Hit;
+  typedef HitK<4>  Hit4;
+  typedef HitK<8>  Hit8;
+  typedef HitK<16> Hit16;
+
+  /* Prints every field of a hit, one per line, enclosed in braces */
+  template<int K>
+  __forceinline embree_ostream operator<<(embree_ostream os, const HitK<K>& hit)
+  {
+    os << "{ " << embree_endl;
+    os << "  Ng = " << hit.Ng << embree_endl;
+    os << "  u = " << hit.u << embree_endl;
+    os << "  v = " << hit.v << embree_endl;
+    os << "  primID = " << hit.primID << embree_endl;
+    os << "  geomID = " << hit.geomID << embree_endl;
+    os << "  instID =";
+    for (unsigned level = 0; level != RTC_MAX_INSTANCE_LEVEL_COUNT; ++level)
+    {
+      os << " " << hit.instID[level]; // all levels are printed, space separated
+    }
+    os << embree_endl;
+    return os << "}";
+  }
+
+  template<typename HitT> // HitT: any hit type exposing Ng, u, v, primID, geomID and instID
+    __forceinline void copyHitToRay(RayHit& ray, const HitT& hit)
+  {
+    ray.Ng = hit.Ng;
+    ray.u = hit.u;
+    ray.v = hit.v;
+    ray.primID = hit.primID;
+    ray.geomID = hit.geomID;
+    instance_id_stack::copy(hit.instID, ray.instID); // instance ID stack goes through the shared copy helper
+  }
+
+  template<int K>
+    __forceinline void copyHitToRay(const vbool<K> &mask, RayHitK<K> &ray, const HitK<K> &hit)
+  {
+    vfloat<K>::storeu(mask,&ray.Ng.x, hit.Ng.x);
+    vfloat<K>::storeu(mask,&ray.Ng.y, hit.Ng.y);
+    vfloat<K>::storeu(mask,&ray.Ng.z, hit.Ng.z);
+    vfloat<K>::storeu(mask,&ray.u, hit.u);
+    vfloat<K>::storeu(mask,&ray.v, hit.v);
+    vuint<K>::storeu(mask,&ray.primID, hit.primID);
+    vuint<K>::storeu(mask,&ray.geomID, hit.geomID);
+    instance_id_stack::copy(hit.instID, ray.instID, mask);
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/instance_stack.h b/thirdparty/embree-aarch64/kernels/common/instance_stack.h
new file mode 100644
index 0000000000..d7e3637f7b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/instance_stack.h
@@ -0,0 +1,199 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "rtcore.h"
+
+namespace embree {
+namespace instance_id_stack {
+
+static_assert(RTC_MAX_INSTANCE_LEVEL_COUNT > 0, 
+              "RTC_MAX_INSTANCE_LEVEL_COUNT must be greater than 0.");
+
+/*******************************************************************************
+ * Instance ID stack manipulation.
+ * This is used from the instance intersector.
+ ******************************************************************************/
+
+/*
+ * Push an instance to the stack. Returns false (and stores nothing) if the stack is full.
+ */
+RTC_FORCEINLINE bool push(RTCIntersectContext* context, 
+                          unsigned instanceId)
+{
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  const bool spaceAvailable = context->instStackSize < RTC_MAX_INSTANCE_LEVEL_COUNT;
+  /* We assert here because instances are silently dropped when the stack is full.
+     This might be quite hard to find in production. */
+  assert(spaceAvailable); 
+  if (likely(spaceAvailable))
+    context->instID[context->instStackSize++] = instanceId; // store at the top, then grow the stack
+  return spaceAvailable;
+#else
+  const bool spaceAvailable = (context->instID[0] == RTC_INVALID_GEOMETRY_ID); // single level: free while slot 0 is invalid
+  assert(spaceAvailable); 
+  if (likely(spaceAvailable))
+    context->instID[0] = instanceId;
+  return spaceAvailable;
+#endif
+}
+
+
+/* 
+ * Pop the last instance pushed to the stack. 
+ * Do not call on an empty stack. 
+ */
+RTC_FORCEINLINE void pop(RTCIntersectContext* context)
+{
+  assert(context);
+#if RTC_MAX_INSTANCE_LEVEL_COUNT > 1
+  assert(context->instStackSize > 0);
+  context->instID[--context->instStackSize] = RTC_INVALID_GEOMETRY_ID;
+#else
+  assert(context->instID[0] != RTC_INVALID_GEOMETRY_ID);
+  context->instID[0] = RTC_INVALID_GEOMETRY_ID;
+#endif
+}
+
+/*******************************************************************************
+ * Optimized instance id stack copy.
+ * The copy() function at the bottom of this block will either copy full
+ * stacks or copy only until the last valid element has been copied, depending
+ * on RTC_MAX_INSTANCE_LEVEL_COUNT.
+ ******************************************************************************/
+
+/*
+ * Plain array assignment. This works for scalar->scalar,
+ * scalar->vector, and vector->vector.
+ */
+template <class Src, class Tgt>
+RTC_FORCEINLINE void level_copy(unsigned level, Src* src, Tgt* tgt)
+{
+  tgt[level] = src[level];
+}
+
+/*
+ * Masked SIMD vector->vector store.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, vuint<K>* tgt, const vbool<K>& mask)
+{
+  vuint<K>::storeu(mask, tgt + level, src[level]);
+}
+
+/*
+ * Masked scalar->SIMD vector store. NOTE(review): calls store(), while the vector->vector overload above calls storeu() - confirm this is intended.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint<K>* tgt, const vbool<K>& mask)
+{
+  vuint<K>::store(mask, tgt + level, src[level]);
+}
+
+/*
+ * Indexed assign from vector to scalar.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, unsigned* tgt, const size_t& idx)
+{
+  tgt[level] = src[level][idx];
+}
+
+/*
+ * Indexed assign from scalar to vector.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const unsigned* src, vuint<K>* tgt, const size_t& idx)
+{
+  tgt[level][idx] = src[level];
+}
+
+/*
+ * Indexed assign from vector to vector.
+ */
+template <int K>
+RTC_FORCEINLINE void level_copy(unsigned level, const vuint<K>* src, vuint<K>* tgt, const size_t& i, const size_t& j)
+{
+  tgt[level][j] = src[level][i];
+}
+
+/*
+ * Check if the given stack level is valid.
+ * These are only used for large max stack sizes.
+ */
+RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack)
+{
+  return stack[level] != RTC_INVALID_GEOMETRY_ID;
+}
+RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const size_t& /*i*/)
+{
+  return stack[level] != RTC_INVALID_GEOMETRY_ID;
+}
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const unsigned* stack, const vbool<K>& /*mask*/)
+{
+  return stack[level] != RTC_INVALID_GEOMETRY_ID;
+}
+
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack)
+{
+  return any(stack[level] != RTC_INVALID_GEOMETRY_ID);
+}
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const vbool<K>& mask)
+{
+  return any(mask & (stack[level] != RTC_INVALID_GEOMETRY_ID));
+}
+
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const size_t& i)
+{
+  return stack[level][i] != RTC_INVALID_GEOMETRY_ID;
+}
+template <int K>
+RTC_FORCEINLINE bool level_valid(unsigned level, const vuint<K>* stack, const size_t& i, const size_t& /*j*/)
+{
+  return stack[level][i] != RTC_INVALID_GEOMETRY_ID;
+}
+
+/*
+ * Copy an instance ID stack from src to tgt.
+ *
+ * The per-level work is done by whichever level_copy overload (and, for large
+ * stacks, level_valid overload) matches src, tgt and the extra args.
+ */
+template <class Src, class Tgt, class... Args>
+RTC_FORCEINLINE void copy(Src src, Tgt tgt, Args&&... args)
+{
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT == 1)
+  /*
+   * Avoid all loops for only one level: copy level 0 directly.
+   */
+  level_copy(0, src, tgt, std::forward<Args>(args)...);
+
+#elif (RTC_MAX_INSTANCE_LEVEL_COUNT <= 4)
+  /*
+   * It is faster to avoid the valid test for low level counts.
+   * Just copy the whole stack, level by level.
+   */
+  for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+    level_copy(l, src, tgt, std::forward<Args>(args)...);
+
+#else
+  /*
+   * Larger stacks: copy level by level and stop once an invalid level has been copied.
+   */
+  bool valid = true;
+  for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT && valid; ++l)
+  {
+    level_copy(l, src, tgt, std::forward<Args>(args)...);
+    valid = level_valid(l, src, std::forward<Args>(args)...);
+  }
+#endif
+}
+
+} // namespace instance_id_stack
+} // namespace embree
+
diff --git a/thirdparty/embree-aarch64/kernels/common/isa.h b/thirdparty/embree-aarch64/kernels/common/isa.h
new file mode 100644
index 0000000000..63fb8d3351
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/isa.h
@@ -0,0 +1,271 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/sys/platform.h"
+#include "../../common/sys/sysinfo.h"
+
+namespace embree
+{
+#define DEFINE_SYMBOL2(type,name)               \
+  typedef type (*name##Func)();                 \
+  name##Func name;
+  
+#define DECLARE_SYMBOL2(type,name)                                       \
+  namespace sse2      { extern type name(); }                           \
+  namespace sse42     { extern type name(); }                           \
+  namespace avx       { extern type name(); }                           \
+  namespace avx2      { extern type name(); }                           \
+  namespace avx512knl { extern type name(); }                           \
+  namespace avx512skx { extern type name(); }                           \
+  void name##_error2() { throw_RTCError(RTC_ERROR_UNKNOWN,"internal error in ISA selection for " TOSTRING(name)); } \
+  type name##_error() { return type(name##_error2); }                   \
+  type name##_zero() { return type(nullptr); }
+
+#define DECLARE_ISA_FUNCTION(type,symbol,args)                            \
+  namespace sse2      { extern type symbol(args); }                       \
+  namespace sse42     { extern type symbol(args); }                       \
+  namespace avx       { extern type symbol(args); }                       \
+  namespace avx2      { extern type symbol(args); }                       \
+  namespace avx512knl { extern type symbol(args); }                       \
+  namespace avx512skx { extern type symbol(args); }                     \
+  inline type symbol##_error(args) { throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"function " TOSTRING(symbol) " not supported by your CPU"); } \
+  typedef type (*symbol##Ty)(args);                                       \
+  
+#define DEFINE_ISA_FUNCTION(type,symbol,args)   \
+  typedef type (*symbol##Func)(args);           \
+  symbol##Func symbol;
+  
+#define ZERO_SYMBOL(features,intersector)                      \
+  intersector = intersector##_zero;
+
+#define INIT_SYMBOL(features,intersector)                      \
+  intersector = decltype(intersector)(intersector##_error);
+
+#define SELECT_SYMBOL_DEFAULT(features,intersector) \
+  intersector = isa::intersector;
+
+#if defined(__SSE__) || defined(__ARM_NEON)
+#if !defined(EMBREE_TARGET_SIMD4)
+#define EMBREE_TARGET_SIMD4
+#endif
+#endif
+
+#if defined(EMBREE_TARGET_SSE42)
+#define SELECT_SYMBOL_SSE42(features,intersector) \
+  if ((features & SSE42) == SSE42) intersector = sse42::intersector;
+#else
+#define SELECT_SYMBOL_SSE42(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX) || defined(__AVX__)
+#if !defined(EMBREE_TARGET_SIMD8)
+#define EMBREE_TARGET_SIMD8
+#endif
+#if defined(__AVX__) // if default ISA is >= AVX we treat AVX target as default target
+#define SELECT_SYMBOL_AVX(features,intersector)                 \
+  if ((features & ISA) == ISA) intersector = isa::intersector;
+#else
+#define SELECT_SYMBOL_AVX(features,intersector)                 \
+  if ((features & AVX) == AVX) intersector = avx::intersector;
+#endif
+#else
+#define SELECT_SYMBOL_AVX(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX2)
+#if !defined(EMBREE_TARGET_SIMD8)
+#define EMBREE_TARGET_SIMD8
+#endif
+#define SELECT_SYMBOL_AVX2(features,intersector) \
+  if ((features & AVX2) == AVX2) intersector = avx2::intersector;
+#else
+#define SELECT_SYMBOL_AVX2(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX512KNL)
+#if !defined(EMBREE_TARGET_SIMD16)
+#define EMBREE_TARGET_SIMD16
+#endif
+#define SELECT_SYMBOL_AVX512KNL(features,intersector) \
+  if ((features & AVX512KNL) == AVX512KNL) intersector = avx512knl::intersector;
+#else
+#define SELECT_SYMBOL_AVX512KNL(features,intersector)
+#endif
+
+#if defined(EMBREE_TARGET_AVX512SKX)
+#if !defined(EMBREE_TARGET_SIMD16)
+#define EMBREE_TARGET_SIMD16
+#endif
+#define SELECT_SYMBOL_AVX512SKX(features,intersector) \
+  if ((features & AVX512SKX) == AVX512SKX) intersector = avx512skx::intersector;
+#else
+#define SELECT_SYMBOL_AVX512SKX(features,intersector)
+#endif
+
+#define SELECT_SYMBOL_DEFAULT_SSE42(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);            \
+  SELECT_SYMBOL_SSE42(features,intersector);                                  
+  
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                \
+  SELECT_SYMBOL_SSE42(features,intersector);                  \
+  SELECT_SYMBOL_AVX(features,intersector);                        
+  
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                     \
+  SELECT_SYMBOL_SSE42(features,intersector);                       \
+  SELECT_SYMBOL_AVX(features,intersector);                         \
+  SELECT_SYMBOL_AVX2(features,intersector);                       
+
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                          \
+  SELECT_SYMBOL_SSE42(features,intersector);                            \
+  SELECT_SYMBOL_AVX(features,intersector);                              \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                                   \
+  SELECT_SYMBOL_AVX(features,intersector);                                       \
+  SELECT_SYMBOL_AVX2(features,intersector);                                      \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                                 \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                         \
+  SELECT_SYMBOL_AVX(features,intersector);                             \
+  SELECT_SYMBOL_AVX2(features,intersector);                            \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// Naming: SELECT_SYMBOL_DEFAULT_<ISAs> expands to SELECT_SYMBOL_DEFAULT, then one SELECT_SYMBOL_<ISA> per listed ISA, in name order, all with the same arguments.
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                                         \
+  SELECT_SYMBOL_SSE42(features,intersector);                                           \
+  SELECT_SYMBOL_AVX(features,intersector);                                             \
+  SELECT_SYMBOL_AVX2(features,intersector);                                            \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                                       \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// Each body expands to several ';'-terminated pieces, so an invocation must not be the unbraced body of an if/else/loop.
+#define SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                               \
+  SELECT_SYMBOL_SSE42(features,intersector);                                 \
+  SELECT_SYMBOL_AVX(features,intersector);                                   \
+  SELECT_SYMBOL_AVX2(features,intersector);                                  \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// NOTE(review): SELECT_SYMBOL_DEFAULT and the SELECT_SYMBOL_<ISA> helpers are defined elsewhere; presumably a later ISA overrides an earlier one - confirm.
+#define SELECT_SYMBOL_DEFAULT_AVX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);          \
+  SELECT_SYMBOL_AVX(features,intersector);                        
+  
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);               \
+  SELECT_SYMBOL_AVX(features,intersector);                   \
+  SELECT_SYMBOL_AVX2(features,intersector);                       
+  
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                    \
+  SELECT_SYMBOL_AVX(features,intersector);                        \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512KNL_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                              \
+  SELECT_SYMBOL_AVX(features,intersector);                                  \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                            \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                    \
+  SELECT_SYMBOL_AVX(features,intersector);                        \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+  
+#define SELECT_SYMBOL_INIT_AVX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                 \
+  SELECT_SYMBOL_AVX(features,intersector);                                
+// Naming: SELECT_SYMBOL_INIT_<ISAs> expands to INIT_SYMBOL, then one SELECT_SYMBOL_<ISA> per listed ISA, in name order, all with the same arguments.
+#define SELECT_SYMBOL_INIT_AVX_AVX2(features,intersector) \
+  INIT_SYMBOL(features,intersector);                      \
+  SELECT_SYMBOL_AVX(features,intersector);                \
+  SELECT_SYMBOL_AVX2(features,intersector);
+// Each body expands to several ';'-terminated pieces, so an invocation must not be the unbraced body of an if/else/loop.
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                \
+  SELECT_SYMBOL_AVX(features,intersector);                          \
+  SELECT_SYMBOL_AVX2(features,intersector);                         \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// NOTE(review): INIT_SYMBOL and the SELECT_SYMBOL_<ISA> helpers are defined elsewhere; confirm their effect before relying on the expansion order.
+#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2(features,intersector) \
+  INIT_SYMBOL(features,intersector);                            \
+  SELECT_SYMBOL_SSE42(features,intersector);                    \
+  SELECT_SYMBOL_AVX(features,intersector);                      \
+  SELECT_SYMBOL_AVX2(features,intersector);
+  
+#define SELECT_SYMBOL_INIT_AVX_AVX512KNL(features,intersector) \
+  INIT_SYMBOL(features,intersector);                           \
+  SELECT_SYMBOL_AVX(features,intersector);                     \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX512KNL_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                     \
+  SELECT_SYMBOL_AVX(features,intersector);                               \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                         \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                \
+  SELECT_SYMBOL_AVX(features,intersector);                          \
+  SELECT_SYMBOL_AVX2(features,intersector);                         \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);
+
+#define SELECT_SYMBOL_INIT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                          \
+  SELECT_SYMBOL_AVX(features,intersector);                                    \
+  SELECT_SYMBOL_AVX2(features,intersector);                                   \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                              \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_INIT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                                \
+  SELECT_SYMBOL_SSE42(features,intersector);                                        \
+  SELECT_SYMBOL_AVX(features,intersector);                                          \
+  SELECT_SYMBOL_AVX2(features,intersector);                                         \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                                    \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+
+#define SELECT_SYMBOL_ZERO_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  ZERO_SYMBOL(features,intersector);                                    \
+  SELECT_SYMBOL_SSE42(features,intersector);                            \
+  SELECT_SYMBOL_AVX(features,intersector);                              \
+  SELECT_SYMBOL_AVX2(features,intersector);                             \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                               \
+  SELECT_SYMBOL_AVX512SKX(features,intersector); // ZERO_SYMBOL first, then the five ISA selectors in name order
+// NOTE(review): the macro below appears to be defined earlier in this file as well; if so, both definitions must stay token-identical (or one should be removed) - confirm.
+#define SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(features,intersector) \
+  SELECT_SYMBOL_DEFAULT(features,intersector);                                   \
+  SELECT_SYMBOL_AVX(features,intersector);                                       \
+  SELECT_SYMBOL_AVX2(features,intersector);                                      \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                                 \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// INIT_SYMBOL followed by the two AVX-512 selectors only; no SSE4.2, AVX or AVX2 selector is expanded.
+#define SELECT_SYMBOL_INIT_AVX512KNL_AVX512SKX(features,intersector) \
+  INIT_SYMBOL(features,intersector);                                 \
+  SELECT_SYMBOL_AVX512KNL(features,intersector);                     \
+  SELECT_SYMBOL_AVX512SKX(features,intersector);
+// No DEFAULT/INIT/ZERO step in this one: it expands to the SSE4.2, AVX and AVX2 selectors only.
+#define SELECT_SYMBOL_SSE42_AVX_AVX2(features,intersector) \
+  SELECT_SYMBOL_SSE42(features,intersector);               \
+  SELECT_SYMBOL_AVX(features,intersector);                 \
+  SELECT_SYMBOL_AVX2(features,intersector);
+
+  struct VerifyMultiTargetLinking {
+    // Calls itself `depth` more times, then returns ISA; only terminates for depth >= 0.
+    static __noinline int getISA(int depth = 5) {
+      return (depth == 0) ? ISA : getISA(depth-1);
+    }
+  };
+  // Forward declarations of getISA() inside each ISA namespace (sse2, sse42, avx, avx2,
+  // avx512knl, avx512skx); only the declarations appear here.
+  // NOTE(review): presumably each one is defined by the build of the matching ISA - confirm.
+  namespace sse2      { int getISA(); }  namespace sse42     { int getISA(); }
+  namespace avx       { int getISA(); }  namespace avx2      { int getISA(); }
+  namespace avx512knl { int getISA(); }  namespace avx512skx { int getISA(); }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/motion_derivative.h b/thirdparty/embree-aarch64/kernels/common/motion_derivative.h
new file mode 100644
index 0000000000..82953f0e89
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/motion_derivative.h
@@ -0,0 +1,325 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../common/math/affinespace.h"
+#include "../../common/math/interval.h"
+
+#include <functional>
+
+namespace embree {
+
+#define MOTION_DERIVATIVE_ROOT_EPSILON 1e-4f // findRoots drops a candidate root closer than this to an already stored root
+// Forward declaration; the generated definition follows further below in this header.
+static void motion_derivative_coefficients(const float *p, float *coeff);
+
+struct MotionDerivativeCoefficients
+{
+  float theta;         // acos of the clamped dot product of the two input quaternions
+  float coeffs[3*8*7]; // filled by motion_derivative_coefficients(); MotionDerivative reads it as coeffs[8*7*dim + 7*i + j]
+
+  MotionDerivativeCoefficients() {} // leaves theta and coeffs uninitialized
+
+  // xfm0 and xfm1 are interpreted as quaternion decompositions: the quaternion is read from
+  // (p.w, l.vx.w, l.vy.w, l.vz.w) and the translation from (l.vx.y, l.vx.z, l.vy.z).
+  MotionDerivativeCoefficients(AffineSpace3ff const& xfm0, AffineSpace3ff const& xfm1)
+  {
+    // dot product of the two quaternions, clamped to [-1,1] before it is passed to acos
+    const float quatDot = xfm0.l.vx.w * xfm1.l.vx.w
+                        + xfm0.l.vy.w * xfm1.l.vy.w
+                        + xfm0.l.vz.w * xfm1.l.vz.w
+                        + xfm0.p.w * xfm1.p.w;
+    const float cosTheta = min(1.f, max(-1.f, quatDot));
+    theta = std::acos(cosTheta);
+    // quaternion of xfm1; below the 0.995 threshold it is replaced by its normalized remainder after removing cosTheta * (quaternion of xfm0)
+    Vec4f qperp(xfm1.p.w, xfm1.l.vx.w, xfm1.l.vy.w, xfm1.l.vz.w);
+    if (cosTheta < 0.995f) {
+      qperp = normalize(Vec4f(xfm1.p.w    - cosTheta * xfm0.p.w,
+                              xfm1.l.vx.w - cosTheta * xfm0.l.vx.w,
+                              xfm1.l.vy.w - cosTheta * xfm0.l.vy.w,
+                              xfm1.l.vz.w - cosTheta * xfm0.l.vz.w));
+    }
+    // the 33 inputs of the generated routine, in the order its p[] indices expect
+    const float params[33] = {
+      theta,
+      xfm0.l.vx.y, xfm0.l.vx.z, xfm0.l.vy.z, // translation component of xfm0
+      xfm1.l.vx.y, xfm1.l.vx.z, xfm1.l.vy.z, // translation component of xfm1
+      xfm0.p.w, xfm0.l.vx.w, xfm0.l.vy.w, xfm0.l.vz.w, // quaternion of xfm0
+      qperp.x, qperp.y, qperp.z, qperp.w,
+      xfm0.l.vx.x, xfm0.l.vy.x, xfm0.l.vz.x, xfm0.p.x, // scale/skew component of xfm0
+                   xfm0.l.vy.y, xfm0.l.vz.y, xfm0.p.y,
+                                xfm0.l.vz.z, xfm0.p.z,
+      xfm1.l.vx.x, xfm1.l.vy.x, xfm1.l.vz.x, xfm1.p.x, // scale/skew component of xfm1
+                   xfm1.l.vy.y, xfm1.l.vz.y, xfm1.p.y,
+                                xfm1.l.vz.z, xfm1.p.z
+    };
+    motion_derivative_coefficients(params, coeffs);
+  }
+};
+
+struct MotionDerivative
+{
+  float twoTheta; // 2 * mdc.theta; multiplies t inside the cos/sin terms of EvalMotionDerivative
+  float c[8];     // the eight coefficients read by EvalMotionDerivative
+
+  // Reduces the coefficient block selected by dim to the eight values c[0..7]: each row of
+  // seven table entries is weighted with (1, p0.x, p0.y, p0.z, p1.x, p1.y, p1.z) and summed.
+  MotionDerivative(MotionDerivativeCoefficients const& mdc,
+                    int dim, Vec3fa const& p0, Vec3fa const& p1)
+    : twoTheta(2.f*mdc.theta)
+  {
+    const float weights[7] = { 1, p0.x, p0.y, p0.z, p1.x, p1.y, p1.z };
+    const float* row = mdc.coeffs + 8*7*dim; // first of the eight rows that belong to dim
+    for (int i = 0; i < 8; ++i, row += 7) {
+      float sum = 0;
+      for (int j = 0; j < 7; ++j)
+        sum += row[j] * weights[j];
+      c[i] = sum;
+    }
+  }
+
+  // Function object that evaluates, for a value t of type T,
+  //   c0 + c1*t + (c2 + c3*t + c4*t*t)*cos(twoTheta*t) + (c5 + c6*t + c7*t*t)*sin(twoTheta*t) + offset
+  // The member findRoots below instantiates it with T = Interval1f.
+  template<typename T>
+  struct EvalMotionDerivative
+  {
+    MotionDerivative const& md; // referenced, not copied; must outlive this object
+    float offset;               // constant added to the evaluated expression
+
+    EvalMotionDerivative(MotionDerivative const& md, float offset) : md(md), offset(offset) {}
+
+    T operator()(T const& time) const {
+      return md.c[0] + md.c[1] * time
+          + (md.c[2] + md.c[3] * time + md.c[4] * time * time) * cos(md.twoTheta * time)
+          + (md.c[5] + md.c[6] * time + md.c[7] * time * time) * sin(md.twoTheta * time)
+          + offset;
+    }
+  };
+
+  // Searches interval for roots of the expression above shifted by offset. Stores at most
+  // maxNumRoots values in roots and returns how many were stored.
+  unsigned int findRoots(Interval1f const& interval, float offset,
+                         float* roots, unsigned int maxNumRoots)
+  {
+    unsigned int found = 0;
+    findRoots(EvalMotionDerivative<Interval1f>(*this, offset), interval, found, roots, maxNumRoots);
+    return found;
+  }
+
+  // Recursive bisection: an interval is dropped when the range returned by eval excludes zero
+  // or is degenerate; otherwise it is halved until its width, or the distance from its midpoint
+  // to either end, falls below 1e-7, at which point the midpoint is recorded as a root.
+  template<typename Eval>
+  static void findRoots(Eval const& eval, Interval1f const& interval,
+                        unsigned int& numRoots, float* roots, unsigned int maxNumRoots)
+  {
+    const Interval1f range = eval(interval);
+    if (range.lower > 0 || range.upper < 0 || range.lower >= range.upper) return;
+
+    const float split = 0.5f * (interval.upper + interval.lower);
+    const bool atResolution = interval.upper-interval.lower < 1e-7f
+                           || abs(split-interval.lower) < 1e-7f
+                           || abs(split-interval.upper) < 1e-7f;
+    if (!atResolution) {
+      findRoots(eval, Interval1f(interval.lower, split), numRoots, roots, maxNumRoots);
+      findRoots(eval, Interval1f(split, interval.upper), numRoots, roots, maxNumRoots);
+      return;
+    }
+
+    // skip the midpoint when a root within MOTION_DERIVATIVE_ROOT_EPSILON is already stored
+    for (unsigned int k = 0; k < numRoots && k < maxNumRoots; ++k)
+      if (abs(roots[k]-split) < MOTION_DERIVATIVE_ROOT_EPSILON) return;
+
+    if (numRoots < maxNumRoots)
+      roots[numRoots++] = split;
+    if (numRoots > maxNumRoots)
+      printf("error: more roots than expected\n"); // FIXME: workaround for ICC2019.4 compiler bug under macOS
+  }
+};
+
+/******************************************************************************
+ *                       Code generated with sympy 1.4                        *
+ *              See http://www.sympy.org/ for more information.               *
+ *                                                                            *
+ * see                                                                        *
+ *                                                                            *
+ *     scripts/generate_motion_derivative_coefficients.py                     *
+ *                                                                            *
+ * for how this code is generated                                             *
+ *                                                                            *
+ ******************************************************************************/
+static void motion_derivative_coefficients(const float *p, float *coeff)
+{
+   coeff[0] = -p[1] + p[4] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27] - p[18] + p[27];
+   coeff[1] = 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - p[14]*p[14]*p[24] - 2*p[15] + p[24];
+   coeff[2] = 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - p[14]*p[14]*p[25] - 2*p[16] + p[25];
+   coeff[3] = -2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - p[14]*p[14]*p[26] - 2*p[17] + p[26];
+   coeff[4] = (-p[9]*p[9] - p[10]*p[10] - p[13]*p[13] - p[14]*p[14] + 1)*p[15];
+   coeff[5] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] - p[11]*p[14]*p[19] + p[12]*p[13]*p[19] - p[13]*p[13]*p[16] - p[14]*p[14]*p[16] + p[16];
+   coeff[6] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] + p[11]*p[13]*p[22] - p[11]*p[14]*p[20] + p[12]*p[13]*p[20] + p[12]*p[14]*p[22] - p[13]*p[13]*p[17] - p[14]*p[14]*p[17] + p[17];
+   coeff[7] = 0;
+   coeff[8] = -2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24] + 2*p[15] - 2*p[24];
+   coeff[9] = -2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25] + 2*p[16] - 2*p[25];
+   coeff[10] = 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26] + 2*p[17] - 2*p[26];
+   coeff[11] = 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24] - 2*p[15] + 2*p[24];
+   coeff[12] = 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25] - 2*p[16] + 2*p[25];
+   coeff[13] = -2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26] - 2*p[17] + 2*p[26];
+   coeff[14] = 2*p[0]*p[7]*p[11]*p[18] + 2*p[0]*p[7]*p[13]*p[23] - 2*p[0]*p[7]*p[14]*p[21] + 2*p[0]*p[8]*p[12]*p[18] + 2*p[0]*p[8]*p[13]*p[21] + 2*p[0]*p[8]*p[14]*p[23] + 2*p[0]*p[9]*p[11]*p[23] + 2*p[0]*p[9]*p[12]*p[21] - 2*p[0]*p[9]*p[13]*p[18] - 2*p[0]*p[10]*p[11]*p[21] + 2*p[0]*p[10]*p[12]*p[23] - 2*p[0]*p[10]*p[14]*p[18] - p[7]*p[9]*p[23] + p[7]*p[9]*p[32] + p[7]*p[10]*p[21] - p[7]*p[10]*p[30] - p[8]*p[9]*p[21] + p[8]*p[9]*p[30] - p[8]*p[10]*p[23] + p[8]*p[10]*p[32] + p[9]*p[9]*p[18] - p[9]*p[9]*p[27] + p[10]*p[10]*p[18] - p[10]*p[10]*p[27] + p[11]*p[13]*p[23] - p[11]*p[13]*p[32] - p[11]*p[14]*p[21] + p[11]*p[14]*p[30] + p[12]*p[13]*p[21] - p[12]*p[13]*p[30] + p[12]*p[14]*p[23] - p[12]*p[14]*p[32] - p[13]*p[13]*p[18] + p[13]*p[13]*p[27] - p[14]*p[14]*p[18] + p[14]*p[14]*p[27];
+   coeff[15] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + p[14]*p[14]*p[24];
+   coeff[16] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + p[14]*p[14]*p[25];
+   coeff[17] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + p[14]*p[14]*p[26];
+   coeff[18] = (-p[9]*p[9] - p[10]*p[10] + p[13]*p[13] + p[14]*p[14])*p[15];
+   coeff[19] = -p[7]*p[10]*p[19] + p[8]*p[9]*p[19] - p[9]*p[9]*p[16] - p[10]*p[10]*p[16] + p[11]*p[14]*p[19] - p[12]*p[13]*p[19] + p[13]*p[13]*p[16] + p[14]*p[14]*p[16];
+   coeff[20] = p[7]*p[9]*p[22] - p[7]*p[10]*p[20] + p[8]*p[9]*p[20] + p[8]*p[10]*p[22] - p[9]*p[9]*p[17] - p[10]*p[10]*p[17] - p[11]*p[13]*p[22] + p[11]*p[14]*p[20] - p[12]*p[13]*p[20] - p[12]*p[14]*p[22] + p[13]*p[13]*p[17] + p[14]*p[14]*p[17];
+   coeff[21] = 2*(-p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27])*p[0];
+   coeff[22] = -4*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[7]*p[11]*p[24] - 4*p[0]*p[8]*p[12]*p[15] + 2*p[0]*p[8]*p[12]*p[24] + 4*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[9]*p[13]*p[24] + 4*p[0]*p[10]*p[14]*p[15] - 2*p[0]*p[10]*p[14]*p[24] - 2*p[9]*p[9]*p[15] + 2*p[9]*p[9]*p[24] - 2*p[10]*p[10]*p[15] + 2*p[10]*p[10]*p[24] + 2*p[13]*p[13]*p[15] - 2*p[13]*p[13]*p[24] + 2*p[14]*p[14]*p[15] - 2*p[14]*p[14]*p[24];
+   coeff[23] = -4*p[0]*p[7]*p[11]*p[16] + 2*p[0]*p[7]*p[11]*p[25] + 4*p[0]*p[7]*p[14]*p[19] - 2*p[0]*p[7]*p[14]*p[28] - 4*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[12]*p[25] - 4*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[8]*p[13]*p[28] - 4*p[0]*p[9]*p[12]*p[19] + 2*p[0]*p[9]*p[12]*p[28] + 4*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[9]*p[13]*p[25] + 4*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[11]*p[28] + 4*p[0]*p[10]*p[14]*p[16] - 2*p[0]*p[10]*p[14]*p[25] - 2*p[7]*p[10]*p[19] + 2*p[7]*p[10]*p[28] + 2*p[8]*p[9]*p[19] - 2*p[8]*p[9]*p[28] - 2*p[9]*p[9]*p[16] + 2*p[9]*p[9]*p[25] - 2*p[10]*p[10]*p[16] + 2*p[10]*p[10]*p[25] + 2*p[11]*p[14]*p[19] - 2*p[11]*p[14]*p[28] - 2*p[12]*p[13]*p[19] + 2*p[12]*p[13]*p[28] + 2*p[13]*p[13]*p[16] - 2*p[13]*p[13]*p[25] + 2*p[14]*p[14]*p[16] - 2*p[14]*p[14]*p[25];
+   coeff[24] = -4*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[11]*p[26] - 4*p[0]*p[7]*p[13]*p[22] + 2*p[0]*p[7]*p[13]*p[31] + 4*p[0]*p[7]*p[14]*p[20] - 2*p[0]*p[7]*p[14]*p[29] - 4*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[12]*p[26] - 4*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[13]*p[29] - 4*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[8]*p[14]*p[31] - 4*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[11]*p[31] - 4*p[0]*p[9]*p[12]*p[20] + 2*p[0]*p[9]*p[12]*p[29] + 4*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[9]*p[13]*p[26] + 4*p[0]*p[10]*p[11]*p[20] - 2*p[0]*p[10]*p[11]*p[29] - 4*p[0]*p[10]*p[12]*p[22] + 2*p[0]*p[10]*p[12]*p[31] + 4*p[0]*p[10]*p[14]*p[17] - 2*p[0]*p[10]*p[14]*p[26] + 2*p[7]*p[9]*p[22] - 2*p[7]*p[9]*p[31] - 2*p[7]*p[10]*p[20] + 2*p[7]*p[10]*p[29] + 2*p[8]*p[9]*p[20] - 2*p[8]*p[9]*p[29] + 2*p[8]*p[10]*p[22] - 2*p[8]*p[10]*p[31] - 2*p[9]*p[9]*p[17] + 2*p[9]*p[9]*p[26] - 2*p[10]*p[10]*p[17] + 2*p[10]*p[10]*p[26] - 2*p[11]*p[13]*p[22] + 2*p[11]*p[13]*p[31] + 2*p[11]*p[14]*p[20] - 2*p[11]*p[14]*p[29] - 2*p[12]*p[13]*p[20] + 2*p[12]*p[13]*p[29] - 2*p[12]*p[14]*p[22] + 2*p[12]*p[14]*p[31] + 2*p[13]*p[13]*p[17] - 2*p[13]*p[13]*p[26] + 2*p[14]*p[14]*p[17] - 2*p[14]*p[14]*p[26];
+   coeff[25] = 2*p[0]*p[7]*p[11]*p[15] + 2*p[0]*p[8]*p[12]*p[15] - 2*p[0]*p[9]*p[13]*p[15] - 2*p[0]*p[10]*p[14]*p[15] + 2*p[9]*p[9]*p[15] - 2*p[9]*p[9]*p[24] + 2*p[10]*p[10]*p[15] - 2*p[10]*p[10]*p[24] - 2*p[13]*p[13]*p[15] + 2*p[13]*p[13]*p[24] - 2*p[14]*p[14]*p[15] + 2*p[14]*p[14]*p[24];
+   coeff[26] = 2*p[0]*p[7]*p[11]*p[16] - 2*p[0]*p[7]*p[14]*p[19] + 2*p[0]*p[8]*p[12]*p[16] + 2*p[0]*p[8]*p[13]*p[19] + 2*p[0]*p[9]*p[12]*p[19] - 2*p[0]*p[9]*p[13]*p[16] - 2*p[0]*p[10]*p[11]*p[19] - 2*p[0]*p[10]*p[14]*p[16] + 2*p[7]*p[10]*p[19] - 2*p[7]*p[10]*p[28] - 2*p[8]*p[9]*p[19] + 2*p[8]*p[9]*p[28] + 2*p[9]*p[9]*p[16] - 2*p[9]*p[9]*p[25] + 2*p[10]*p[10]*p[16] - 2*p[10]*p[10]*p[25] - 2*p[11]*p[14]*p[19] + 2*p[11]*p[14]*p[28] + 2*p[12]*p[13]*p[19] - 2*p[12]*p[13]*p[28] - 2*p[13]*p[13]*p[16] + 2*p[13]*p[13]*p[25] - 2*p[14]*p[14]*p[16] + 2*p[14]*p[14]*p[25];
+   coeff[27] = 2*p[0]*p[7]*p[11]*p[17] + 2*p[0]*p[7]*p[13]*p[22] - 2*p[0]*p[7]*p[14]*p[20] + 2*p[0]*p[8]*p[12]*p[17] + 2*p[0]*p[8]*p[13]*p[20] + 2*p[0]*p[8]*p[14]*p[22] + 2*p[0]*p[9]*p[11]*p[22] + 2*p[0]*p[9]*p[12]*p[20] - 2*p[0]*p[9]*p[13]*p[17] - 2*p[0]*p[10]*p[11]*p[20] + 2*p[0]*p[10]*p[12]*p[22] - 2*p[0]*p[10]*p[14]*p[17] - 2*p[7]*p[9]*p[22] + 2*p[7]*p[9]*p[31] + 2*p[7]*p[10]*p[20] - 2*p[7]*p[10]*p[29] - 2*p[8]*p[9]*p[20] + 2*p[8]*p[9]*p[29] - 2*p[8]*p[10]*p[22] + 2*p[8]*p[10]*p[31] + 2*p[9]*p[9]*p[17] - 2*p[9]*p[9]*p[26] + 2*p[10]*p[10]*p[17] - 2*p[10]*p[10]*p[26] + 2*p[11]*p[13]*p[22] - 2*p[11]*p[13]*p[31] - 2*p[11]*p[14]*p[20] + 2*p[11]*p[14]*p[29] + 2*p[12]*p[13]*p[20] - 2*p[12]*p[13]*p[29] + 2*p[12]*p[14]*p[22] - 2*p[12]*p[14]*p[31] - 2*p[13]*p[13]*p[17] + 2*p[13]*p[13]*p[26] - 2*p[14]*p[14]*p[17] + 2*p[14]*p[14]*p[26];
+   coeff[28] = 0;
+   coeff[29] = 2*(p[7]*p[11]*p[15] - p[7]*p[11]*p[24] + p[8]*p[12]*p[15] - p[8]*p[12]*p[24] - p[9]*p[13]*p[15] + p[9]*p[13]*p[24] - p[10]*p[14]*p[15] + p[10]*p[14]*p[24])*p[0];
+   coeff[30] = 2*(p[7]*p[11]*p[16] - p[7]*p[11]*p[25] - p[7]*p[14]*p[19] + p[7]*p[14]*p[28] + p[8]*p[12]*p[16] - p[8]*p[12]*p[25] + p[8]*p[13]*p[19] - p[8]*p[13]*p[28] + p[9]*p[12]*p[19] - p[9]*p[12]*p[28] - p[9]*p[13]*p[16] + p[9]*p[13]*p[25] - p[10]*p[11]*p[19] + p[10]*p[11]*p[28] - p[10]*p[14]*p[16] + p[10]*p[14]*p[25])*p[0];
+   coeff[31] = 2*(p[7]*p[11]*p[17] - p[7]*p[11]*p[26] + p[7]*p[13]*p[22] - p[7]*p[13]*p[31] - p[7]*p[14]*p[20] + p[7]*p[14]*p[29] + p[8]*p[12]*p[17] - p[8]*p[12]*p[26] + p[8]*p[13]*p[20] - p[8]*p[13]*p[29] + p[8]*p[14]*p[22] - p[8]*p[14]*p[31] + p[9]*p[11]*p[22] - p[9]*p[11]*p[31] + p[9]*p[12]*p[20] - p[9]*p[12]*p[29] - p[9]*p[13]*p[17] + p[9]*p[13]*p[26] - p[10]*p[11]*p[20] + p[10]*p[11]*p[29] + p[10]*p[12]*p[22] - p[10]*p[12]*p[31] - p[10]*p[14]*p[17] + p[10]*p[14]*p[26])*p[0];
+   coeff[32] = 2*(-p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + p[10]*p[14]*p[15] - p[10]*p[14]*p[24])*p[0];
+   coeff[33] = 2*(-p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + p[10]*p[14]*p[16] - p[10]*p[14]*p[25])*p[0];
+   coeff[34] = 2*(-p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + p[10]*p[14]*p[17] - p[10]*p[14]*p[26])*p[0];
+   coeff[35] = -2*p[0]*p[7]*p[9]*p[23] + 2*p[0]*p[7]*p[10]*p[21] - 2*p[0]*p[8]*p[9]*p[21] - 2*p[0]*p[8]*p[10]*p[23] + 2*p[0]*p[9]*p[9]*p[18] + 2*p[0]*p[10]*p[10]*p[18] + 2*p[0]*p[11]*p[13]*p[23] - 2*p[0]*p[11]*p[14]*p[21] + 2*p[0]*p[12]*p[13]*p[21] + 2*p[0]*p[12]*p[14]*p[23] - 2*p[0]*p[13]*p[13]*p[18] - 2*p[0]*p[14]*p[14]*p[18] - p[7]*p[11]*p[18] + p[7]*p[11]*p[27] - p[7]*p[13]*p[23] + p[7]*p[13]*p[32] + p[7]*p[14]*p[21] - p[7]*p[14]*p[30] - p[8]*p[12]*p[18] + p[8]*p[12]*p[27] - p[8]*p[13]*p[21] + p[8]*p[13]*p[30] - p[8]*p[14]*p[23] + p[8]*p[14]*p[32] - p[9]*p[11]*p[23] + p[9]*p[11]*p[32] - p[9]*p[12]*p[21] + p[9]*p[12]*p[30] + p[9]*p[13]*p[18] - p[9]*p[13]*p[27] + p[10]*p[11]*p[21] - p[10]*p[11]*p[30] - p[10]*p[12]*p[23] + p[10]*p[12]*p[32] + p[10]*p[14]*p[18] - p[10]*p[14]*p[27];
+   coeff[36] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - p[10]*p[14]*p[24];
+   coeff[37] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - p[10]*p[14]*p[25];
+   coeff[38] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - p[10]*p[14]*p[26];
+   coeff[39] = (p[7]*p[11] + p[8]*p[12] - p[9]*p[13] - p[10]*p[14])*p[15];
+   coeff[40] = p[7]*p[11]*p[16] - p[7]*p[14]*p[19] + p[8]*p[12]*p[16] + p[8]*p[13]*p[19] + p[9]*p[12]*p[19] - p[9]*p[13]*p[16] - p[10]*p[11]*p[19] - p[10]*p[14]*p[16];
+   coeff[41] = p[7]*p[11]*p[17] + p[7]*p[13]*p[22] - p[7]*p[14]*p[20] + p[8]*p[12]*p[17] + p[8]*p[13]*p[20] + p[8]*p[14]*p[22] + p[9]*p[11]*p[22] + p[9]*p[12]*p[20] - p[9]*p[13]*p[17] - p[10]*p[11]*p[20] + p[10]*p[12]*p[22] - p[10]*p[14]*p[17];
+   coeff[42] = 2*(p[7]*p[9]*p[23] - p[7]*p[9]*p[32] - p[7]*p[10]*p[21] + p[7]*p[10]*p[30] + p[8]*p[9]*p[21] - p[8]*p[9]*p[30] + p[8]*p[10]*p[23] - p[8]*p[10]*p[32] - p[9]*p[9]*p[18] + p[9]*p[9]*p[27] - p[10]*p[10]*p[18] + p[10]*p[10]*p[27] - p[11]*p[13]*p[23] + p[11]*p[13]*p[32] + p[11]*p[14]*p[21] - p[11]*p[14]*p[30] - p[12]*p[13]*p[21] + p[12]*p[13]*p[30] - p[12]*p[14]*p[23] + p[12]*p[14]*p[32] + p[13]*p[13]*p[18] - p[13]*p[13]*p[27] + p[14]*p[14]*p[18] - p[14]*p[14]*p[27])*p[0];
+   coeff[43] = -4*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[9]*p[9]*p[24] - 4*p[0]*p[10]*p[10]*p[15] + 2*p[0]*p[10]*p[10]*p[24] + 4*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[13]*p[13]*p[24] + 4*p[0]*p[14]*p[14]*p[15] - 2*p[0]*p[14]*p[14]*p[24] + 2*p[7]*p[11]*p[15] - 2*p[7]*p[11]*p[24] + 2*p[8]*p[12]*p[15] - 2*p[8]*p[12]*p[24] - 2*p[9]*p[13]*p[15] + 2*p[9]*p[13]*p[24] - 2*p[10]*p[14]*p[15] + 2*p[10]*p[14]*p[24];
+   coeff[44] = -4*p[0]*p[7]*p[10]*p[19] + 2*p[0]*p[7]*p[10]*p[28] + 4*p[0]*p[8]*p[9]*p[19] - 2*p[0]*p[8]*p[9]*p[28] - 4*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[9]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[16] + 2*p[0]*p[10]*p[10]*p[25] + 4*p[0]*p[11]*p[14]*p[19] - 2*p[0]*p[11]*p[14]*p[28] - 4*p[0]*p[12]*p[13]*p[19] + 2*p[0]*p[12]*p[13]*p[28] + 4*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[13]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[16] - 2*p[0]*p[14]*p[14]*p[25] + 2*p[7]*p[11]*p[16] - 2*p[7]*p[11]*p[25] - 2*p[7]*p[14]*p[19] + 2*p[7]*p[14]*p[28] + 2*p[8]*p[12]*p[16] - 2*p[8]*p[12]*p[25] + 2*p[8]*p[13]*p[19] - 2*p[8]*p[13]*p[28] + 2*p[9]*p[12]*p[19] - 2*p[9]*p[12]*p[28] - 2*p[9]*p[13]*p[16] + 2*p[9]*p[13]*p[25] - 2*p[10]*p[11]*p[19] + 2*p[10]*p[11]*p[28] - 2*p[10]*p[14]*p[16] + 2*p[10]*p[14]*p[25];
+   coeff[45] = 4*p[0]*p[7]*p[9]*p[22] - 2*p[0]*p[7]*p[9]*p[31] - 4*p[0]*p[7]*p[10]*p[20] + 2*p[0]*p[7]*p[10]*p[29] + 4*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[9]*p[29] + 4*p[0]*p[8]*p[10]*p[22] - 2*p[0]*p[8]*p[10]*p[31] - 4*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[9]*p[9]*p[26] - 4*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[10]*p[10]*p[26] - 4*p[0]*p[11]*p[13]*p[22] + 2*p[0]*p[11]*p[13]*p[31] + 4*p[0]*p[11]*p[14]*p[20] - 2*p[0]*p[11]*p[14]*p[29] - 4*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[13]*p[29] - 4*p[0]*p[12]*p[14]*p[22] + 2*p[0]*p[12]*p[14]*p[31] + 4*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[13]*p[13]*p[26] + 4*p[0]*p[14]*p[14]*p[17] - 2*p[0]*p[14]*p[14]*p[26] + 2*p[7]*p[11]*p[17] - 2*p[7]*p[11]*p[26] + 2*p[7]*p[13]*p[22] - 2*p[7]*p[13]*p[31] - 2*p[7]*p[14]*p[20] + 2*p[7]*p[14]*p[29] + 2*p[8]*p[12]*p[17] - 2*p[8]*p[12]*p[26] + 2*p[8]*p[13]*p[20] - 2*p[8]*p[13]*p[29] + 2*p[8]*p[14]*p[22] - 2*p[8]*p[14]*p[31] + 2*p[9]*p[11]*p[22] - 2*p[9]*p[11]*p[31] + 2*p[9]*p[12]*p[20] - 2*p[9]*p[12]*p[29] - 2*p[9]*p[13]*p[17] + 2*p[9]*p[13]*p[26] - 2*p[10]*p[11]*p[20] + 2*p[10]*p[11]*p[29] + 2*p[10]*p[12]*p[22] - 2*p[10]*p[12]*p[31] - 2*p[10]*p[14]*p[17] + 2*p[10]*p[14]*p[26];
+   coeff[46] = 2*p[0]*p[9]*p[9]*p[15] + 2*p[0]*p[10]*p[10]*p[15] - 2*p[0]*p[13]*p[13]*p[15] - 2*p[0]*p[14]*p[14]*p[15] - 2*p[7]*p[11]*p[15] + 2*p[7]*p[11]*p[24] - 2*p[8]*p[12]*p[15] + 2*p[8]*p[12]*p[24] + 2*p[9]*p[13]*p[15] - 2*p[9]*p[13]*p[24] + 2*p[10]*p[14]*p[15] - 2*p[10]*p[14]*p[24];
+   coeff[47] = 2*p[0]*p[7]*p[10]*p[19] - 2*p[0]*p[8]*p[9]*p[19] + 2*p[0]*p[9]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[16] - 2*p[0]*p[11]*p[14]*p[19] + 2*p[0]*p[12]*p[13]*p[19] - 2*p[0]*p[13]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[16] - 2*p[7]*p[11]*p[16] + 2*p[7]*p[11]*p[25] + 2*p[7]*p[14]*p[19] - 2*p[7]*p[14]*p[28] - 2*p[8]*p[12]*p[16] + 2*p[8]*p[12]*p[25] - 2*p[8]*p[13]*p[19] + 2*p[8]*p[13]*p[28] - 2*p[9]*p[12]*p[19] + 2*p[9]*p[12]*p[28] + 2*p[9]*p[13]*p[16] - 2*p[9]*p[13]*p[25] + 2*p[10]*p[11]*p[19] - 2*p[10]*p[11]*p[28] + 2*p[10]*p[14]*p[16] - 2*p[10]*p[14]*p[25];
+   coeff[48] = -2*p[0]*p[7]*p[9]*p[22] + 2*p[0]*p[7]*p[10]*p[20] - 2*p[0]*p[8]*p[9]*p[20] - 2*p[0]*p[8]*p[10]*p[22] + 2*p[0]*p[9]*p[9]*p[17] + 2*p[0]*p[10]*p[10]*p[17] + 2*p[0]*p[11]*p[13]*p[22] - 2*p[0]*p[11]*p[14]*p[20] + 2*p[0]*p[12]*p[13]*p[20] + 2*p[0]*p[12]*p[14]*p[22] - 2*p[0]*p[13]*p[13]*p[17] - 2*p[0]*p[14]*p[14]*p[17] - 2*p[7]*p[11]*p[17] + 2*p[7]*p[11]*p[26] - 2*p[7]*p[13]*p[22] + 2*p[7]*p[13]*p[31] + 2*p[7]*p[14]*p[20] - 2*p[7]*p[14]*p[29] - 2*p[8]*p[12]*p[17] + 2*p[8]*p[12]*p[26] - 2*p[8]*p[13]*p[20] + 2*p[8]*p[13]*p[29] - 2*p[8]*p[14]*p[22] + 2*p[8]*p[14]*p[31] - 2*p[9]*p[11]*p[22] + 2*p[9]*p[11]*p[31] - 2*p[9]*p[12]*p[20] + 2*p[9]*p[12]*p[29] + 2*p[9]*p[13]*p[17] - 2*p[9]*p[13]*p[26] + 2*p[10]*p[11]*p[20] - 2*p[10]*p[11]*p[29] - 2*p[10]*p[12]*p[22] + 2*p[10]*p[12]*p[31] + 2*p[10]*p[14]*p[17] - 2*p[10]*p[14]*p[26];
+   coeff[49] = 0;
+   coeff[50] = 2*(p[9]*p[9]*p[15] - p[9]*p[9]*p[24] + p[10]*p[10]*p[15] - p[10]*p[10]*p[24] - p[13]*p[13]*p[15] + p[13]*p[13]*p[24] - p[14]*p[14]*p[15] + p[14]*p[14]*p[24])*p[0];
+   coeff[51] = 2*(p[7]*p[10]*p[19] - p[7]*p[10]*p[28] - p[8]*p[9]*p[19] + p[8]*p[9]*p[28] + p[9]*p[9]*p[16] - p[9]*p[9]*p[25] + p[10]*p[10]*p[16] - p[10]*p[10]*p[25] - p[11]*p[14]*p[19] + p[11]*p[14]*p[28] + p[12]*p[13]*p[19] - p[12]*p[13]*p[28] - p[13]*p[13]*p[16] + p[13]*p[13]*p[25] - p[14]*p[14]*p[16] + p[14]*p[14]*p[25])*p[0];
+   coeff[52] = 2*(-p[7]*p[9]*p[22] + p[7]*p[9]*p[31] + p[7]*p[10]*p[20] - p[7]*p[10]*p[29] - p[8]*p[9]*p[20] + p[8]*p[9]*p[29] - p[8]*p[10]*p[22] + p[8]*p[10]*p[31] + p[9]*p[9]*p[17] - p[9]*p[9]*p[26] + p[10]*p[10]*p[17] - p[10]*p[10]*p[26] + p[11]*p[13]*p[22] - p[11]*p[13]*p[31] - p[11]*p[14]*p[20] + p[11]*p[14]*p[29] + p[12]*p[13]*p[20] - p[12]*p[13]*p[29] + p[12]*p[14]*p[22] - p[12]*p[14]*p[31] - p[13]*p[13]*p[17] + p[13]*p[13]*p[26] - p[14]*p[14]*p[17] + p[14]*p[14]*p[26])*p[0];
+   coeff[53] = 2*(-p[9]*p[9]*p[15] + p[9]*p[9]*p[24] - p[10]*p[10]*p[15] + p[10]*p[10]*p[24] + p[13]*p[13]*p[15] - p[13]*p[13]*p[24] + p[14]*p[14]*p[15] - p[14]*p[14]*p[24])*p[0];
+   coeff[54] = 2*(-p[7]*p[10]*p[19] + p[7]*p[10]*p[28] + p[8]*p[9]*p[19] - p[8]*p[9]*p[28] - p[9]*p[9]*p[16] + p[9]*p[9]*p[25] - p[10]*p[10]*p[16] + p[10]*p[10]*p[25] + p[11]*p[14]*p[19] - p[11]*p[14]*p[28] - p[12]*p[13]*p[19] + p[12]*p[13]*p[28] + p[13]*p[13]*p[16] - p[13]*p[13]*p[25] + p[14]*p[14]*p[16] - p[14]*p[14]*p[25])*p[0];
+   coeff[55] = 2*(p[7]*p[9]*p[22] - p[7]*p[9]*p[31] - p[7]*p[10]*p[20] + p[7]*p[10]*p[29] + p[8]*p[9]*p[20] - p[8]*p[9]*p[29] + p[8]*p[10]*p[22] - p[8]*p[10]*p[31] - p[9]*p[9]*p[17] + p[9]*p[9]*p[26] - p[10]*p[10]*p[17] + p[10]*p[10]*p[26] - p[11]*p[13]*p[22] + p[11]*p[13]*p[31] + p[11]*p[14]*p[20] - p[11]*p[14]*p[29] - p[12]*p[13]*p[20] + p[12]*p[13]*p[29] - p[12]*p[14]*p[22] + p[12]*p[14]*p[31] + p[13]*p[13]*p[17] - p[13]*p[13]*p[26] + p[14]*p[14]*p[17] - p[14]*p[14]*p[26])*p[0];
+   coeff[56] = -p[2] + p[5] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30] - p[21] + p[30];
+   coeff[57] = -2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + p[12]*p[13]*p[24];
+   coeff[58] = -2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - p[14]*p[14]*p[28] - 2*p[19] + p[28];
+   coeff[59] = 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - p[14]*p[14]*p[29] - 2*p[20] + p[29];
+   coeff[60] = (p[7]*p[10] + p[8]*p[9] + p[11]*p[14] + p[12]*p[13])*p[15];
+   coeff[61] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] + p[11]*p[14]*p[16] - p[12]*p[12]*p[19] + p[12]*p[13]*p[16] - p[14]*p[14]*p[19] + p[19];
+   coeff[62] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] - p[11]*p[12]*p[22] + p[11]*p[14]*p[17] - p[12]*p[12]*p[20] + p[12]*p[13]*p[17] + p[13]*p[14]*p[22] - p[14]*p[14]*p[20] + p[20];
+   coeff[63] = 0;
+   coeff[64] = 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24];
+   coeff[65] = 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28] + 2*p[19] - 2*p[28];
+   coeff[66] = -2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29] + 2*p[20] - 2*p[29];
+   coeff[67] = -2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24];
+   coeff[68] = -2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28] - 2*p[19] + 2*p[28];
+   coeff[69] = 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29] - 2*p[20] + 2*p[29];
+   coeff[70] = 2*p[0]*p[7]*p[11]*p[21] - 2*p[0]*p[7]*p[12]*p[23] + 2*p[0]*p[7]*p[14]*p[18] - 2*p[0]*p[8]*p[11]*p[23] - 2*p[0]*p[8]*p[12]*p[21] + 2*p[0]*p[8]*p[13]*p[18] + 2*p[0]*p[9]*p[12]*p[18] + 2*p[0]*p[9]*p[13]*p[21] + 2*p[0]*p[9]*p[14]*p[23] + 2*p[0]*p[10]*p[11]*p[18] + 2*p[0]*p[10]*p[13]*p[23] - 2*p[0]*p[10]*p[14]*p[21] + p[7]*p[8]*p[23] - p[7]*p[8]*p[32] - p[7]*p[10]*p[18] + p[7]*p[10]*p[27] + p[8]*p[8]*p[21] - p[8]*p[8]*p[30] - p[8]*p[9]*p[18] + p[8]*p[9]*p[27] - p[9]*p[10]*p[23] + p[9]*p[10]*p[32] + p[10]*p[10]*p[21] - p[10]*p[10]*p[30] - p[11]*p[12]*p[23] + p[11]*p[12]*p[32] + p[11]*p[14]*p[18] - p[11]*p[14]*p[27] - p[12]*p[12]*p[21] + p[12]*p[12]*p[30] + p[12]*p[13]*p[18] - p[12]*p[13]*p[27] + p[13]*p[14]*p[23] - p[13]*p[14]*p[32] - p[14]*p[14]*p[21] + p[14]*p[14]*p[30];
+   coeff[71] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - p[12]*p[13]*p[24];
+   coeff[72] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + p[14]*p[14]*p[28];
+   coeff[73] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + p[14]*p[14]*p[29];
+   coeff[74] = (p[7]*p[10] + p[8]*p[9] - p[11]*p[14] - p[12]*p[13])*p[15];
+   coeff[75] = p[7]*p[10]*p[16] - p[8]*p[8]*p[19] + p[8]*p[9]*p[16] - p[10]*p[10]*p[19] - p[11]*p[14]*p[16] + p[12]*p[12]*p[19] - p[12]*p[13]*p[16] + p[14]*p[14]*p[19];
+   coeff[76] = -p[7]*p[8]*p[22] + p[7]*p[10]*p[17] - p[8]*p[8]*p[20] + p[8]*p[9]*p[17] + p[9]*p[10]*p[22] - p[10]*p[10]*p[20] + p[11]*p[12]*p[22] - p[11]*p[14]*p[17] + p[12]*p[12]*p[20] - p[12]*p[13]*p[17] - p[13]*p[14]*p[22] + p[14]*p[14]*p[20];
+   coeff[77] = 2*(-p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30])*p[0];
+   coeff[78] = -4*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[7]*p[14]*p[24] - 4*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[8]*p[13]*p[24] - 4*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[9]*p[12]*p[24] - 4*p[0]*p[10]*p[11]*p[15] + 2*p[0]*p[10]*p[11]*p[24] + 2*p[7]*p[10]*p[15] - 2*p[7]*p[10]*p[24] + 2*p[8]*p[9]*p[15] - 2*p[8]*p[9]*p[24] - 2*p[11]*p[14]*p[15] + 2*p[11]*p[14]*p[24] - 2*p[12]*p[13]*p[15] + 2*p[12]*p[13]*p[24];
+   coeff[79] = -4*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[11]*p[28] - 4*p[0]*p[7]*p[14]*p[16] + 2*p[0]*p[7]*p[14]*p[25] + 4*p[0]*p[8]*p[12]*p[19] - 2*p[0]*p[8]*p[12]*p[28] - 4*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[8]*p[13]*p[25] - 4*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[12]*p[25] - 4*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[9]*p[13]*p[28] - 4*p[0]*p[10]*p[11]*p[16] + 2*p[0]*p[10]*p[11]*p[25] + 4*p[0]*p[10]*p[14]*p[19] - 2*p[0]*p[10]*p[14]*p[28] + 2*p[7]*p[10]*p[16] - 2*p[7]*p[10]*p[25] - 2*p[8]*p[8]*p[19] + 2*p[8]*p[8]*p[28] + 2*p[8]*p[9]*p[16] - 2*p[8]*p[9]*p[25] - 2*p[10]*p[10]*p[19] + 2*p[10]*p[10]*p[28] - 2*p[11]*p[14]*p[16] + 2*p[11]*p[14]*p[25] + 2*p[12]*p[12]*p[19] - 2*p[12]*p[12]*p[28] - 2*p[12]*p[13]*p[16] + 2*p[12]*p[13]*p[25] + 2*p[14]*p[14]*p[19] - 2*p[14]*p[14]*p[28];
+   coeff[80] = -4*p[0]*p[7]*p[11]*p[20] + 2*p[0]*p[7]*p[11]*p[29] + 4*p[0]*p[7]*p[12]*p[22] - 2*p[0]*p[7]*p[12]*p[31] - 4*p[0]*p[7]*p[14]*p[17] + 2*p[0]*p[7]*p[14]*p[26] + 4*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[11]*p[31] + 4*p[0]*p[8]*p[12]*p[20] - 2*p[0]*p[8]*p[12]*p[29] - 4*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[8]*p[13]*p[26] - 4*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[12]*p[26] - 4*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[13]*p[29] - 4*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[9]*p[14]*p[31] - 4*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[11]*p[26] - 4*p[0]*p[10]*p[13]*p[22] + 2*p[0]*p[10]*p[13]*p[31] + 4*p[0]*p[10]*p[14]*p[20] - 2*p[0]*p[10]*p[14]*p[29] - 2*p[7]*p[8]*p[22] + 2*p[7]*p[8]*p[31] + 2*p[7]*p[10]*p[17] - 2*p[7]*p[10]*p[26] - 2*p[8]*p[8]*p[20] + 2*p[8]*p[8]*p[29] + 2*p[8]*p[9]*p[17] - 2*p[8]*p[9]*p[26] + 2*p[9]*p[10]*p[22] - 2*p[9]*p[10]*p[31] - 2*p[10]*p[10]*p[20] + 2*p[10]*p[10]*p[29] + 2*p[11]*p[12]*p[22] - 2*p[11]*p[12]*p[31] - 2*p[11]*p[14]*p[17] + 2*p[11]*p[14]*p[26] + 2*p[12]*p[12]*p[20] - 2*p[12]*p[12]*p[29] - 2*p[12]*p[13]*p[17] + 2*p[12]*p[13]*p[26] - 2*p[13]*p[14]*p[22] + 2*p[13]*p[14]*p[31] + 2*p[14]*p[14]*p[20] - 2*p[14]*p[14]*p[29];
+   coeff[81] = 2*p[0]*p[7]*p[14]*p[15] + 2*p[0]*p[8]*p[13]*p[15] + 2*p[0]*p[9]*p[12]*p[15] + 2*p[0]*p[10]*p[11]*p[15] - 2*p[7]*p[10]*p[15] + 2*p[7]*p[10]*p[24] - 2*p[8]*p[9]*p[15] + 2*p[8]*p[9]*p[24] + 2*p[11]*p[14]*p[15] - 2*p[11]*p[14]*p[24] + 2*p[12]*p[13]*p[15] - 2*p[12]*p[13]*p[24];
+   coeff[82] = 2*p[0]*p[7]*p[11]*p[19] + 2*p[0]*p[7]*p[14]*p[16] - 2*p[0]*p[8]*p[12]*p[19] + 2*p[0]*p[8]*p[13]*p[16] + 2*p[0]*p[9]*p[12]*p[16] + 2*p[0]*p[9]*p[13]*p[19] + 2*p[0]*p[10]*p[11]*p[16] - 2*p[0]*p[10]*p[14]*p[19] - 2*p[7]*p[10]*p[16] + 2*p[7]*p[10]*p[25] + 2*p[8]*p[8]*p[19] - 2*p[8]*p[8]*p[28] - 2*p[8]*p[9]*p[16] + 2*p[8]*p[9]*p[25] + 2*p[10]*p[10]*p[19] - 2*p[10]*p[10]*p[28] + 2*p[11]*p[14]*p[16] - 2*p[11]*p[14]*p[25] - 2*p[12]*p[12]*p[19] + 2*p[12]*p[12]*p[28] + 2*p[12]*p[13]*p[16] - 2*p[12]*p[13]*p[25] - 2*p[14]*p[14]*p[19] + 2*p[14]*p[14]*p[28];
+   coeff[83] = 2*p[0]*p[7]*p[11]*p[20] - 2*p[0]*p[7]*p[12]*p[22] + 2*p[0]*p[7]*p[14]*p[17] - 2*p[0]*p[8]*p[11]*p[22] - 2*p[0]*p[8]*p[12]*p[20] + 2*p[0]*p[8]*p[13]*p[17] + 2*p[0]*p[9]*p[12]*p[17] + 2*p[0]*p[9]*p[13]*p[20] + 2*p[0]*p[9]*p[14]*p[22] + 2*p[0]*p[10]*p[11]*p[17] + 2*p[0]*p[10]*p[13]*p[22] - 2*p[0]*p[10]*p[14]*p[20] + 2*p[7]*p[8]*p[22] - 2*p[7]*p[8]*p[31] - 2*p[7]*p[10]*p[17] + 2*p[7]*p[10]*p[26] + 2*p[8]*p[8]*p[20] - 2*p[8]*p[8]*p[29] - 2*p[8]*p[9]*p[17] + 2*p[8]*p[9]*p[26] - 2*p[9]*p[10]*p[22] + 2*p[9]*p[10]*p[31] + 2*p[10]*p[10]*p[20] - 2*p[10]*p[10]*p[29] - 2*p[11]*p[12]*p[22] + 2*p[11]*p[12]*p[31] + 2*p[11]*p[14]*p[17] - 2*p[11]*p[14]*p[26] - 2*p[12]*p[12]*p[20] + 2*p[12]*p[12]*p[29] + 2*p[12]*p[13]*p[17] - 2*p[12]*p[13]*p[26] + 2*p[13]*p[14]*p[22] - 2*p[13]*p[14]*p[31] - 2*p[14]*p[14]*p[20] + 2*p[14]*p[14]*p[29];
+   coeff[84] = 0;
+   coeff[85] = 2*(p[7]*p[14]*p[15] - p[7]*p[14]*p[24] + p[8]*p[13]*p[15] - p[8]*p[13]*p[24] + p[9]*p[12]*p[15] - p[9]*p[12]*p[24] + p[10]*p[11]*p[15] - p[10]*p[11]*p[24])*p[0];
+   coeff[86] = 2*(p[7]*p[11]*p[19] - p[7]*p[11]*p[28] + p[7]*p[14]*p[16] - p[7]*p[14]*p[25] - p[8]*p[12]*p[19] + p[8]*p[12]*p[28] + p[8]*p[13]*p[16] - p[8]*p[13]*p[25] + p[9]*p[12]*p[16] - p[9]*p[12]*p[25] + p[9]*p[13]*p[19] - p[9]*p[13]*p[28] + p[10]*p[11]*p[16] - p[10]*p[11]*p[25] - p[10]*p[14]*p[19] + p[10]*p[14]*p[28])*p[0];
+   coeff[87] = 2*(p[7]*p[11]*p[20] - p[7]*p[11]*p[29] - p[7]*p[12]*p[22] + p[7]*p[12]*p[31] + p[7]*p[14]*p[17] - p[7]*p[14]*p[26] - p[8]*p[11]*p[22] + p[8]*p[11]*p[31] - p[8]*p[12]*p[20] + p[8]*p[12]*p[29] + p[8]*p[13]*p[17] - p[8]*p[13]*p[26] + p[9]*p[12]*p[17] - p[9]*p[12]*p[26] + p[9]*p[13]*p[20] - p[9]*p[13]*p[29] + p[9]*p[14]*p[22] - p[9]*p[14]*p[31] + p[10]*p[11]*p[17] - p[10]*p[11]*p[26] + p[10]*p[13]*p[22] - p[10]*p[13]*p[31] - p[10]*p[14]*p[20] + p[10]*p[14]*p[29])*p[0];
+   coeff[88] = 2*(-p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - p[10]*p[11]*p[15] + p[10]*p[11]*p[24])*p[0];
+   coeff[89] = 2*(-p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + p[10]*p[14]*p[19] - p[10]*p[14]*p[28])*p[0];
+   coeff[90] = 2*(-p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + p[10]*p[14]*p[20] - p[10]*p[14]*p[29])*p[0];
+   coeff[91] = 2*p[0]*p[7]*p[8]*p[23] - 2*p[0]*p[7]*p[10]*p[18] + 2*p[0]*p[8]*p[8]*p[21] - 2*p[0]*p[8]*p[9]*p[18] - 2*p[0]*p[9]*p[10]*p[23] + 2*p[0]*p[10]*p[10]*p[21] - 2*p[0]*p[11]*p[12]*p[23] + 2*p[0]*p[11]*p[14]*p[18] - 2*p[0]*p[12]*p[12]*p[21] + 2*p[0]*p[12]*p[13]*p[18] + 2*p[0]*p[13]*p[14]*p[23] - 2*p[0]*p[14]*p[14]*p[21] - p[7]*p[11]*p[21] + p[7]*p[11]*p[30] + p[7]*p[12]*p[23] - p[7]*p[12]*p[32] - p[7]*p[14]*p[18] + p[7]*p[14]*p[27] + p[8]*p[11]*p[23] - p[8]*p[11]*p[32] + p[8]*p[12]*p[21] - p[8]*p[12]*p[30] - p[8]*p[13]*p[18] + p[8]*p[13]*p[27] - p[9]*p[12]*p[18] + p[9]*p[12]*p[27] - p[9]*p[13]*p[21] + p[9]*p[13]*p[30] - p[9]*p[14]*p[23] + p[9]*p[14]*p[32] - p[10]*p[11]*p[18] + p[10]*p[11]*p[27] - p[10]*p[13]*p[23] + p[10]*p[13]*p[32] + p[10]*p[14]*p[21] - p[10]*p[14]*p[30];
+   coeff[92] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + p[10]*p[11]*p[24];
+   coeff[93] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - p[10]*p[14]*p[28];
+   coeff[94] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - p[10]*p[14]*p[29];
+   coeff[95] = (p[7]*p[14] + p[8]*p[13] + p[9]*p[12] + p[10]*p[11])*p[15];
+   coeff[96] = p[7]*p[11]*p[19] + p[7]*p[14]*p[16] - p[8]*p[12]*p[19] + p[8]*p[13]*p[16] + p[9]*p[12]*p[16] + p[9]*p[13]*p[19] + p[10]*p[11]*p[16] - p[10]*p[14]*p[19];
+   coeff[97] = p[7]*p[11]*p[20] - p[7]*p[12]*p[22] + p[7]*p[14]*p[17] - p[8]*p[11]*p[22] - p[8]*p[12]*p[20] + p[8]*p[13]*p[17] + p[9]*p[12]*p[17] + p[9]*p[13]*p[20] + p[9]*p[14]*p[22] + p[10]*p[11]*p[17] + p[10]*p[13]*p[22] - p[10]*p[14]*p[20];
+   coeff[98] = 2*(-p[7]*p[8]*p[23] + p[7]*p[8]*p[32] + p[7]*p[10]*p[18] - p[7]*p[10]*p[27] - p[8]*p[8]*p[21] + p[8]*p[8]*p[30] + p[8]*p[9]*p[18] - p[8]*p[9]*p[27] + p[9]*p[10]*p[23] - p[9]*p[10]*p[32] - p[10]*p[10]*p[21] + p[10]*p[10]*p[30] + p[11]*p[12]*p[23] - p[11]*p[12]*p[32] - p[11]*p[14]*p[18] + p[11]*p[14]*p[27] + p[12]*p[12]*p[21] - p[12]*p[12]*p[30] - p[12]*p[13]*p[18] + p[12]*p[13]*p[27] - p[13]*p[14]*p[23] + p[13]*p[14]*p[32] + p[14]*p[14]*p[21] - p[14]*p[14]*p[30])*p[0];
+   coeff[99] = 4*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[7]*p[10]*p[24] + 4*p[0]*p[8]*p[9]*p[15] - 2*p[0]*p[8]*p[9]*p[24] - 4*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[11]*p[14]*p[24] - 4*p[0]*p[12]*p[13]*p[15] + 2*p[0]*p[12]*p[13]*p[24] + 2*p[7]*p[14]*p[15] - 2*p[7]*p[14]*p[24] + 2*p[8]*p[13]*p[15] - 2*p[8]*p[13]*p[24] + 2*p[9]*p[12]*p[15] - 2*p[9]*p[12]*p[24] + 2*p[10]*p[11]*p[15] - 2*p[10]*p[11]*p[24];
+   coeff[100] = 4*p[0]*p[7]*p[10]*p[16] - 2*p[0]*p[7]*p[10]*p[25] - 4*p[0]*p[8]*p[8]*p[19] + 2*p[0]*p[8]*p[8]*p[28] + 4*p[0]*p[8]*p[9]*p[16] - 2*p[0]*p[8]*p[9]*p[25] - 4*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[10]*p[10]*p[28] - 4*p[0]*p[11]*p[14]*p[16] + 2*p[0]*p[11]*p[14]*p[25] + 4*p[0]*p[12]*p[12]*p[19] - 2*p[0]*p[12]*p[12]*p[28] - 4*p[0]*p[12]*p[13]*p[16] + 2*p[0]*p[12]*p[13]*p[25] + 4*p[0]*p[14]*p[14]*p[19] - 2*p[0]*p[14]*p[14]*p[28] + 2*p[7]*p[11]*p[19] - 2*p[7]*p[11]*p[28] + 2*p[7]*p[14]*p[16] - 2*p[7]*p[14]*p[25] - 2*p[8]*p[12]*p[19] + 2*p[8]*p[12]*p[28] + 2*p[8]*p[13]*p[16] - 2*p[8]*p[13]*p[25] + 2*p[9]*p[12]*p[16] - 2*p[9]*p[12]*p[25] + 2*p[9]*p[13]*p[19] - 2*p[9]*p[13]*p[28] + 2*p[10]*p[11]*p[16] - 2*p[10]*p[11]*p[25] - 2*p[10]*p[14]*p[19] + 2*p[10]*p[14]*p[28];
+   coeff[101] = -4*p[0]*p[7]*p[8]*p[22] + 2*p[0]*p[7]*p[8]*p[31] + 4*p[0]*p[7]*p[10]*p[17] - 2*p[0]*p[7]*p[10]*p[26] - 4*p[0]*p[8]*p[8]*p[20] + 2*p[0]*p[8]*p[8]*p[29] + 4*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[8]*p[9]*p[26] + 4*p[0]*p[9]*p[10]*p[22] - 2*p[0]*p[9]*p[10]*p[31] - 4*p[0]*p[10]*p[10]*p[20] + 2*p[0]*p[10]*p[10]*p[29] + 4*p[0]*p[11]*p[12]*p[22] - 2*p[0]*p[11]*p[12]*p[31] - 4*p[0]*p[11]*p[14]*p[17] + 2*p[0]*p[11]*p[14]*p[26] + 4*p[0]*p[12]*p[12]*p[20] - 2*p[0]*p[12]*p[12]*p[29] - 4*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[12]*p[13]*p[26] - 4*p[0]*p[13]*p[14]*p[22] + 2*p[0]*p[13]*p[14]*p[31] + 4*p[0]*p[14]*p[14]*p[20] - 2*p[0]*p[14]*p[14]*p[29] + 2*p[7]*p[11]*p[20] - 2*p[7]*p[11]*p[29] - 2*p[7]*p[12]*p[22] + 2*p[7]*p[12]*p[31] + 2*p[7]*p[14]*p[17] - 2*p[7]*p[14]*p[26] - 2*p[8]*p[11]*p[22] + 2*p[8]*p[11]*p[31] - 2*p[8]*p[12]*p[20] + 2*p[8]*p[12]*p[29] + 2*p[8]*p[13]*p[17] - 2*p[8]*p[13]*p[26] + 2*p[9]*p[12]*p[17] - 2*p[9]*p[12]*p[26] + 2*p[9]*p[13]*p[20] - 2*p[9]*p[13]*p[29] + 2*p[9]*p[14]*p[22] - 2*p[9]*p[14]*p[31] + 2*p[10]*p[11]*p[17] - 2*p[10]*p[11]*p[26] + 2*p[10]*p[13]*p[22] - 2*p[10]*p[13]*p[31] - 2*p[10]*p[14]*p[20] + 2*p[10]*p[14]*p[29];
+   coeff[102] = -2*p[0]*p[7]*p[10]*p[15] - 2*p[0]*p[8]*p[9]*p[15] + 2*p[0]*p[11]*p[14]*p[15] + 2*p[0]*p[12]*p[13]*p[15] - 2*p[7]*p[14]*p[15] + 2*p[7]*p[14]*p[24] - 2*p[8]*p[13]*p[15] + 2*p[8]*p[13]*p[24] - 2*p[9]*p[12]*p[15] + 2*p[9]*p[12]*p[24] - 2*p[10]*p[11]*p[15] + 2*p[10]*p[11]*p[24];
+   coeff[103] = -2*p[0]*p[7]*p[10]*p[16] + 2*p[0]*p[8]*p[8]*p[19] - 2*p[0]*p[8]*p[9]*p[16] + 2*p[0]*p[10]*p[10]*p[19] + 2*p[0]*p[11]*p[14]*p[16] - 2*p[0]*p[12]*p[12]*p[19] + 2*p[0]*p[12]*p[13]*p[16] - 2*p[0]*p[14]*p[14]*p[19] - 2*p[7]*p[11]*p[19] + 2*p[7]*p[11]*p[28] - 2*p[7]*p[14]*p[16] + 2*p[7]*p[14]*p[25] + 2*p[8]*p[12]*p[19] - 2*p[8]*p[12]*p[28] - 2*p[8]*p[13]*p[16] + 2*p[8]*p[13]*p[25] - 2*p[9]*p[12]*p[16] + 2*p[9]*p[12]*p[25] - 2*p[9]*p[13]*p[19] + 2*p[9]*p[13]*p[28] - 2*p[10]*p[11]*p[16] + 2*p[10]*p[11]*p[25] + 2*p[10]*p[14]*p[19] - 2*p[10]*p[14]*p[28];
+   coeff[104] = 2*p[0]*p[7]*p[8]*p[22] - 2*p[0]*p[7]*p[10]*p[17] + 2*p[0]*p[8]*p[8]*p[20] - 2*p[0]*p[8]*p[9]*p[17] - 2*p[0]*p[9]*p[10]*p[22] + 2*p[0]*p[10]*p[10]*p[20] - 2*p[0]*p[11]*p[12]*p[22] + 2*p[0]*p[11]*p[14]*p[17] - 2*p[0]*p[12]*p[12]*p[20] + 2*p[0]*p[12]*p[13]*p[17] + 2*p[0]*p[13]*p[14]*p[22] - 2*p[0]*p[14]*p[14]*p[20] - 2*p[7]*p[11]*p[20] + 2*p[7]*p[11]*p[29] + 2*p[7]*p[12]*p[22] - 2*p[7]*p[12]*p[31] - 2*p[7]*p[14]*p[17] + 2*p[7]*p[14]*p[26] + 2*p[8]*p[11]*p[22] - 2*p[8]*p[11]*p[31] + 2*p[8]*p[12]*p[20] - 2*p[8]*p[12]*p[29] - 2*p[8]*p[13]*p[17] + 2*p[8]*p[13]*p[26] - 2*p[9]*p[12]*p[17] + 2*p[9]*p[12]*p[26] - 2*p[9]*p[13]*p[20] + 2*p[9]*p[13]*p[29] - 2*p[9]*p[14]*p[22] + 2*p[9]*p[14]*p[31] - 2*p[10]*p[11]*p[17] + 2*p[10]*p[11]*p[26] - 2*p[10]*p[13]*p[22] + 2*p[10]*p[13]*p[31] + 2*p[10]*p[14]*p[20] - 2*p[10]*p[14]*p[29];
+   coeff[105] = 0;
+   coeff[106] = 2*(-p[7]*p[10]*p[15] + p[7]*p[10]*p[24] - p[8]*p[9]*p[15] + p[8]*p[9]*p[24] + p[11]*p[14]*p[15] - p[11]*p[14]*p[24] + p[12]*p[13]*p[15] - p[12]*p[13]*p[24])*p[0];
+   coeff[107] = 2*(-p[7]*p[10]*p[16] + p[7]*p[10]*p[25] + p[8]*p[8]*p[19] - p[8]*p[8]*p[28] - p[8]*p[9]*p[16] + p[8]*p[9]*p[25] + p[10]*p[10]*p[19] - p[10]*p[10]*p[28] + p[11]*p[14]*p[16] - p[11]*p[14]*p[25] - p[12]*p[12]*p[19] + p[12]*p[12]*p[28] + p[12]*p[13]*p[16] - p[12]*p[13]*p[25] - p[14]*p[14]*p[19] + p[14]*p[14]*p[28])*p[0];
+   coeff[108] = 2*(p[7]*p[8]*p[22] - p[7]*p[8]*p[31] - p[7]*p[10]*p[17] + p[7]*p[10]*p[26] + p[8]*p[8]*p[20] - p[8]*p[8]*p[29] - p[8]*p[9]*p[17] + p[8]*p[9]*p[26] - p[9]*p[10]*p[22] + p[9]*p[10]*p[31] + p[10]*p[10]*p[20] - p[10]*p[10]*p[29] - p[11]*p[12]*p[22] + p[11]*p[12]*p[31] + p[11]*p[14]*p[17] - p[11]*p[14]*p[26] - p[12]*p[12]*p[20] + p[12]*p[12]*p[29] + p[12]*p[13]*p[17] - p[12]*p[13]*p[26] + p[13]*p[14]*p[22] - p[13]*p[14]*p[31] - p[14]*p[14]*p[20] + p[14]*p[14]*p[29])*p[0];
+   coeff[109] = 2*(p[7]*p[10]*p[15] - p[7]*p[10]*p[24] + p[8]*p[9]*p[15] - p[8]*p[9]*p[24] - p[11]*p[14]*p[15] + p[11]*p[14]*p[24] - p[12]*p[13]*p[15] + p[12]*p[13]*p[24])*p[0];
+   coeff[110] = 2*(p[7]*p[10]*p[16] - p[7]*p[10]*p[25] - p[8]*p[8]*p[19] + p[8]*p[8]*p[28] + p[8]*p[9]*p[16] - p[8]*p[9]*p[25] - p[10]*p[10]*p[19] + p[10]*p[10]*p[28] - p[11]*p[14]*p[16] + p[11]*p[14]*p[25] + p[12]*p[12]*p[19] - p[12]*p[12]*p[28] - p[12]*p[13]*p[16] + p[12]*p[13]*p[25] + p[14]*p[14]*p[19] - p[14]*p[14]*p[28])*p[0];
+   coeff[111] = 2*(-p[7]*p[8]*p[22] + p[7]*p[8]*p[31] + p[7]*p[10]*p[17] - p[7]*p[10]*p[26] - p[8]*p[8]*p[20] + p[8]*p[8]*p[29] + p[8]*p[9]*p[17] - p[8]*p[9]*p[26] + p[9]*p[10]*p[22] - p[9]*p[10]*p[31] - p[10]*p[10]*p[20] + p[10]*p[10]*p[29] + p[11]*p[12]*p[22] - p[11]*p[12]*p[31] - p[11]*p[14]*p[17] + p[11]*p[14]*p[26] + p[12]*p[12]*p[20] - p[12]*p[12]*p[29] - p[12]*p[13]*p[17] + p[12]*p[13]*p[26] - p[13]*p[14]*p[22] + p[13]*p[14]*p[31] + p[14]*p[14]*p[20] - p[14]*p[14]*p[29])*p[0];
+   coeff[112] = -p[3] + p[6] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30] - p[23] + p[32];
+   coeff[113] = 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + p[12]*p[14]*p[24];
+   coeff[114] = -2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + p[13]*p[14]*p[28];
+   coeff[115] = -2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + p[13]*p[14]*p[29] - 2*p[22] + p[31];
+   coeff[116] = (-p[7]*p[9] + p[8]*p[10] - p[11]*p[13] + p[12]*p[14])*p[15];
+   coeff[117] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] + p[11]*p[12]*p[19] - p[11]*p[13]*p[16] + p[12]*p[14]*p[16] + p[13]*p[14]*p[19];
+   coeff[118] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] + p[11]*p[12]*p[20] - p[11]*p[13]*p[17] - p[12]*p[12]*p[22] + p[12]*p[14]*p[17] - p[13]*p[13]*p[22] + p[13]*p[14]*p[20] + p[22];
+   coeff[119] = 0;
+   coeff[120] = -2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24];
+   coeff[121] = 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28];
+   coeff[122] = 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29] + 2*p[22] - 2*p[31];
+   coeff[123] = 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24];
+   coeff[124] = -2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28];
+   coeff[125] = -2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29] - 2*p[22] + 2*p[31];
+   coeff[126] = 2*p[0]*p[7]*p[11]*p[23] + 2*p[0]*p[7]*p[12]*p[21] - 2*p[0]*p[7]*p[13]*p[18] + 2*p[0]*p[8]*p[11]*p[21] - 2*p[0]*p[8]*p[12]*p[23] + 2*p[0]*p[8]*p[14]*p[18] - 2*p[0]*p[9]*p[11]*p[18] - 2*p[0]*p[9]*p[13]*p[23] + 2*p[0]*p[9]*p[14]*p[21] + 2*p[0]*p[10]*p[12]*p[18] + 2*p[0]*p[10]*p[13]*p[21] + 2*p[0]*p[10]*p[14]*p[23] - p[7]*p[8]*p[21] + p[7]*p[8]*p[30] + p[7]*p[9]*p[18] - p[7]*p[9]*p[27] + p[8]*p[8]*p[23] - p[8]*p[8]*p[32] - p[8]*p[10]*p[18] + p[8]*p[10]*p[27] + p[9]*p[9]*p[23] - p[9]*p[9]*p[32] - p[9]*p[10]*p[21] + p[9]*p[10]*p[30] + p[11]*p[12]*p[21] - p[11]*p[12]*p[30] - p[11]*p[13]*p[18] + p[11]*p[13]*p[27] - p[12]*p[12]*p[23] + p[12]*p[12]*p[32] + p[12]*p[14]*p[18] - p[12]*p[14]*p[27] - p[13]*p[13]*p[23] + p[13]*p[13]*p[32] + p[13]*p[14]*p[21] - p[13]*p[14]*p[30];
+   coeff[127] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - p[12]*p[14]*p[24];
+   coeff[128] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - p[13]*p[14]*p[28];
+   coeff[129] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - p[13]*p[14]*p[29];
+   coeff[130] = (-p[7]*p[9] + p[8]*p[10] + p[11]*p[13] - p[12]*p[14])*p[15];
+   coeff[131] = p[7]*p[8]*p[19] - p[7]*p[9]*p[16] + p[8]*p[10]*p[16] + p[9]*p[10]*p[19] - p[11]*p[12]*p[19] + p[11]*p[13]*p[16] - p[12]*p[14]*p[16] - p[13]*p[14]*p[19];
+   coeff[132] = p[7]*p[8]*p[20] - p[7]*p[9]*p[17] - p[8]*p[8]*p[22] + p[8]*p[10]*p[17] - p[9]*p[9]*p[22] + p[9]*p[10]*p[20] - p[11]*p[12]*p[20] + p[11]*p[13]*p[17] + p[12]*p[12]*p[22] - p[12]*p[14]*p[17] + p[13]*p[13]*p[22] - p[13]*p[14]*p[20];
+   coeff[133] = 2*(-p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32])*p[0];
+   coeff[134] = 4*p[0]*p[7]*p[13]*p[15] - 2*p[0]*p[7]*p[13]*p[24] - 4*p[0]*p[8]*p[14]*p[15] + 2*p[0]*p[8]*p[14]*p[24] + 4*p[0]*p[9]*p[11]*p[15] - 2*p[0]*p[9]*p[11]*p[24] - 4*p[0]*p[10]*p[12]*p[15] + 2*p[0]*p[10]*p[12]*p[24] - 2*p[7]*p[9]*p[15] + 2*p[7]*p[9]*p[24] + 2*p[8]*p[10]*p[15] - 2*p[8]*p[10]*p[24] + 2*p[11]*p[13]*p[15] - 2*p[11]*p[13]*p[24] - 2*p[12]*p[14]*p[15] + 2*p[12]*p[14]*p[24];
+   coeff[135] = -4*p[0]*p[7]*p[12]*p[19] + 2*p[0]*p[7]*p[12]*p[28] + 4*p[0]*p[7]*p[13]*p[16] - 2*p[0]*p[7]*p[13]*p[25] - 4*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[11]*p[28] - 4*p[0]*p[8]*p[14]*p[16] + 2*p[0]*p[8]*p[14]*p[25] + 4*p[0]*p[9]*p[11]*p[16] - 2*p[0]*p[9]*p[11]*p[25] - 4*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[9]*p[14]*p[28] - 4*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[12]*p[25] - 4*p[0]*p[10]*p[13]*p[19] + 2*p[0]*p[10]*p[13]*p[28] + 2*p[7]*p[8]*p[19] - 2*p[7]*p[8]*p[28] - 2*p[7]*p[9]*p[16] + 2*p[7]*p[9]*p[25] + 2*p[8]*p[10]*p[16] - 2*p[8]*p[10]*p[25] + 2*p[9]*p[10]*p[19] - 2*p[9]*p[10]*p[28] - 2*p[11]*p[12]*p[19] + 2*p[11]*p[12]*p[28] + 2*p[11]*p[13]*p[16] - 2*p[11]*p[13]*p[25] - 2*p[12]*p[14]*p[16] + 2*p[12]*p[14]*p[25] - 2*p[13]*p[14]*p[19] + 2*p[13]*p[14]*p[28];
+   coeff[136] = -4*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[11]*p[31] - 4*p[0]*p[7]*p[12]*p[20] + 2*p[0]*p[7]*p[12]*p[29] + 4*p[0]*p[7]*p[13]*p[17] - 2*p[0]*p[7]*p[13]*p[26] - 4*p[0]*p[8]*p[11]*p[20] + 2*p[0]*p[8]*p[11]*p[29] + 4*p[0]*p[8]*p[12]*p[22] - 2*p[0]*p[8]*p[12]*p[31] - 4*p[0]*p[8]*p[14]*p[17] + 2*p[0]*p[8]*p[14]*p[26] + 4*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[11]*p[26] + 4*p[0]*p[9]*p[13]*p[22] - 2*p[0]*p[9]*p[13]*p[31] - 4*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[9]*p[14]*p[29] - 4*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[12]*p[26] - 4*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[13]*p[29] - 4*p[0]*p[10]*p[14]*p[22] + 2*p[0]*p[10]*p[14]*p[31] + 2*p[7]*p[8]*p[20] - 2*p[7]*p[8]*p[29] - 2*p[7]*p[9]*p[17] + 2*p[7]*p[9]*p[26] - 2*p[8]*p[8]*p[22] + 2*p[8]*p[8]*p[31] + 2*p[8]*p[10]*p[17] - 2*p[8]*p[10]*p[26] - 2*p[9]*p[9]*p[22] + 2*p[9]*p[9]*p[31] + 2*p[9]*p[10]*p[20] - 2*p[9]*p[10]*p[29] - 2*p[11]*p[12]*p[20] + 2*p[11]*p[12]*p[29] + 2*p[11]*p[13]*p[17] - 2*p[11]*p[13]*p[26] + 2*p[12]*p[12]*p[22] - 2*p[12]*p[12]*p[31] - 2*p[12]*p[14]*p[17] + 2*p[12]*p[14]*p[26] + 2*p[13]*p[13]*p[22] - 2*p[13]*p[13]*p[31] - 2*p[13]*p[14]*p[20] + 2*p[13]*p[14]*p[29];
+   coeff[137] = -2*p[0]*p[7]*p[13]*p[15] + 2*p[0]*p[8]*p[14]*p[15] - 2*p[0]*p[9]*p[11]*p[15] + 2*p[0]*p[10]*p[12]*p[15] + 2*p[7]*p[9]*p[15] - 2*p[7]*p[9]*p[24] - 2*p[8]*p[10]*p[15] + 2*p[8]*p[10]*p[24] - 2*p[11]*p[13]*p[15] + 2*p[11]*p[13]*p[24] + 2*p[12]*p[14]*p[15] - 2*p[12]*p[14]*p[24];
+   coeff[138] = 2*p[0]*p[7]*p[12]*p[19] - 2*p[0]*p[7]*p[13]*p[16] + 2*p[0]*p[8]*p[11]*p[19] + 2*p[0]*p[8]*p[14]*p[16] - 2*p[0]*p[9]*p[11]*p[16] + 2*p[0]*p[9]*p[14]*p[19] + 2*p[0]*p[10]*p[12]*p[16] + 2*p[0]*p[10]*p[13]*p[19] - 2*p[7]*p[8]*p[19] + 2*p[7]*p[8]*p[28] + 2*p[7]*p[9]*p[16] - 2*p[7]*p[9]*p[25] - 2*p[8]*p[10]*p[16] + 2*p[8]*p[10]*p[25] - 2*p[9]*p[10]*p[19] + 2*p[9]*p[10]*p[28] + 2*p[11]*p[12]*p[19] - 2*p[11]*p[12]*p[28] - 2*p[11]*p[13]*p[16] + 2*p[11]*p[13]*p[25] + 2*p[12]*p[14]*p[16] - 2*p[12]*p[14]*p[25] + 2*p[13]*p[14]*p[19] - 2*p[13]*p[14]*p[28];
+   coeff[139] = 2*p[0]*p[7]*p[11]*p[22] + 2*p[0]*p[7]*p[12]*p[20] - 2*p[0]*p[7]*p[13]*p[17] + 2*p[0]*p[8]*p[11]*p[20] - 2*p[0]*p[8]*p[12]*p[22] + 2*p[0]*p[8]*p[14]*p[17] - 2*p[0]*p[9]*p[11]*p[17] - 2*p[0]*p[9]*p[13]*p[22] + 2*p[0]*p[9]*p[14]*p[20] + 2*p[0]*p[10]*p[12]*p[17] + 2*p[0]*p[10]*p[13]*p[20] + 2*p[0]*p[10]*p[14]*p[22] - 2*p[7]*p[8]*p[20] + 2*p[7]*p[8]*p[29] + 2*p[7]*p[9]*p[17] - 2*p[7]*p[9]*p[26] + 2*p[8]*p[8]*p[22] - 2*p[8]*p[8]*p[31] - 2*p[8]*p[10]*p[17] + 2*p[8]*p[10]*p[26] + 2*p[9]*p[9]*p[22] - 2*p[9]*p[9]*p[31] - 2*p[9]*p[10]*p[20] + 2*p[9]*p[10]*p[29] + 2*p[11]*p[12]*p[20] - 2*p[11]*p[12]*p[29] - 2*p[11]*p[13]*p[17] + 2*p[11]*p[13]*p[26] - 2*p[12]*p[12]*p[22] + 2*p[12]*p[12]*p[31] + 2*p[12]*p[14]*p[17] - 2*p[12]*p[14]*p[26] - 2*p[13]*p[13]*p[22] + 2*p[13]*p[13]*p[31] + 2*p[13]*p[14]*p[20] - 2*p[13]*p[14]*p[29];
+   coeff[140] = 0;
+   coeff[141] = 2*(-p[7]*p[13]*p[15] + p[7]*p[13]*p[24] + p[8]*p[14]*p[15] - p[8]*p[14]*p[24] - p[9]*p[11]*p[15] + p[9]*p[11]*p[24] + p[10]*p[12]*p[15] - p[10]*p[12]*p[24])*p[0];
+   coeff[142] = 2*(p[7]*p[12]*p[19] - p[7]*p[12]*p[28] - p[7]*p[13]*p[16] + p[7]*p[13]*p[25] + p[8]*p[11]*p[19] - p[8]*p[11]*p[28] + p[8]*p[14]*p[16] - p[8]*p[14]*p[25] - p[9]*p[11]*p[16] + p[9]*p[11]*p[25] + p[9]*p[14]*p[19] - p[9]*p[14]*p[28] + p[10]*p[12]*p[16] - p[10]*p[12]*p[25] + p[10]*p[13]*p[19] - p[10]*p[13]*p[28])*p[0];
+   coeff[143] = 2*(p[7]*p[11]*p[22] - p[7]*p[11]*p[31] + p[7]*p[12]*p[20] - p[7]*p[12]*p[29] - p[7]*p[13]*p[17] + p[7]*p[13]*p[26] + p[8]*p[11]*p[20] - p[8]*p[11]*p[29] - p[8]*p[12]*p[22] + p[8]*p[12]*p[31] + p[8]*p[14]*p[17] - p[8]*p[14]*p[26] - p[9]*p[11]*p[17] + p[9]*p[11]*p[26] - p[9]*p[13]*p[22] + p[9]*p[13]*p[31] + p[9]*p[14]*p[20] - p[9]*p[14]*p[29] + p[10]*p[12]*p[17] - p[10]*p[12]*p[26] + p[10]*p[13]*p[20] - p[10]*p[13]*p[29] + p[10]*p[14]*p[22] - p[10]*p[14]*p[31])*p[0];
+   coeff[144] = 2*(p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - p[10]*p[12]*p[15] + p[10]*p[12]*p[24])*p[0];
+   coeff[145] = 2*(-p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - p[10]*p[13]*p[19] + p[10]*p[13]*p[28])*p[0];
+   coeff[146] = 2*(-p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - p[10]*p[14]*p[22] + p[10]*p[14]*p[31])*p[0];
+   coeff[147] = -2*p[0]*p[7]*p[8]*p[21] + 2*p[0]*p[7]*p[9]*p[18] + 2*p[0]*p[8]*p[8]*p[23] - 2*p[0]*p[8]*p[10]*p[18] + 2*p[0]*p[9]*p[9]*p[23] - 2*p[0]*p[9]*p[10]*p[21] + 2*p[0]*p[11]*p[12]*p[21] - 2*p[0]*p[11]*p[13]*p[18] - 2*p[0]*p[12]*p[12]*p[23] + 2*p[0]*p[12]*p[14]*p[18] - 2*p[0]*p[13]*p[13]*p[23] + 2*p[0]*p[13]*p[14]*p[21] - p[7]*p[11]*p[23] + p[7]*p[11]*p[32] - p[7]*p[12]*p[21] + p[7]*p[12]*p[30] + p[7]*p[13]*p[18] - p[7]*p[13]*p[27] - p[8]*p[11]*p[21] + p[8]*p[11]*p[30] + p[8]*p[12]*p[23] - p[8]*p[12]*p[32] - p[8]*p[14]*p[18] + p[8]*p[14]*p[27] + p[9]*p[11]*p[18] - p[9]*p[11]*p[27] + p[9]*p[13]*p[23] - p[9]*p[13]*p[32] - p[9]*p[14]*p[21] + p[9]*p[14]*p[30] - p[10]*p[12]*p[18] + p[10]*p[12]*p[27] - p[10]*p[13]*p[21] + p[10]*p[13]*p[30] - p[10]*p[14]*p[23] + p[10]*p[14]*p[32];
+   coeff[148] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + p[10]*p[12]*p[24];
+   coeff[149] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + p[10]*p[13]*p[28];
+   coeff[150] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + p[10]*p[14]*p[31];
+   coeff[151] = (-p[7]*p[13] + p[8]*p[14] - p[9]*p[11] + p[10]*p[12])*p[15];
+   coeff[152] = p[7]*p[12]*p[19] - p[7]*p[13]*p[16] + p[8]*p[11]*p[19] + p[8]*p[14]*p[16] - p[9]*p[11]*p[16] + p[9]*p[14]*p[19] + p[10]*p[12]*p[16] + p[10]*p[13]*p[19];
+   coeff[153] = p[7]*p[11]*p[22] + p[7]*p[12]*p[20] - p[7]*p[13]*p[17] + p[8]*p[11]*p[20] - p[8]*p[12]*p[22] + p[8]*p[14]*p[17] - p[9]*p[11]*p[17] - p[9]*p[13]*p[22] + p[9]*p[14]*p[20] + p[10]*p[12]*p[17] + p[10]*p[13]*p[20] + p[10]*p[14]*p[22];
+   coeff[154] = 2*(p[7]*p[8]*p[21] - p[7]*p[8]*p[30] - p[7]*p[9]*p[18] + p[7]*p[9]*p[27] - p[8]*p[8]*p[23] + p[8]*p[8]*p[32] + p[8]*p[10]*p[18] - p[8]*p[10]*p[27] - p[9]*p[9]*p[23] + p[9]*p[9]*p[32] + p[9]*p[10]*p[21] - p[9]*p[10]*p[30] - p[11]*p[12]*p[21] + p[11]*p[12]*p[30] + p[11]*p[13]*p[18] - p[11]*p[13]*p[27] + p[12]*p[12]*p[23] - p[12]*p[12]*p[32] - p[12]*p[14]*p[18] + p[12]*p[14]*p[27] + p[13]*p[13]*p[23] - p[13]*p[13]*p[32] - p[13]*p[14]*p[21] + p[13]*p[14]*p[30])*p[0];
+   coeff[155] = -4*p[0]*p[7]*p[9]*p[15] + 2*p[0]*p[7]*p[9]*p[24] + 4*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[8]*p[10]*p[24] + 4*p[0]*p[11]*p[13]*p[15] - 2*p[0]*p[11]*p[13]*p[24] - 4*p[0]*p[12]*p[14]*p[15] + 2*p[0]*p[12]*p[14]*p[24] - 2*p[7]*p[13]*p[15] + 2*p[7]*p[13]*p[24] + 2*p[8]*p[14]*p[15] - 2*p[8]*p[14]*p[24] - 2*p[9]*p[11]*p[15] + 2*p[9]*p[11]*p[24] + 2*p[10]*p[12]*p[15] - 2*p[10]*p[12]*p[24];
+   coeff[156] = 4*p[0]*p[7]*p[8]*p[19] - 2*p[0]*p[7]*p[8]*p[28] - 4*p[0]*p[7]*p[9]*p[16] + 2*p[0]*p[7]*p[9]*p[25] + 4*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[8]*p[10]*p[25] + 4*p[0]*p[9]*p[10]*p[19] - 2*p[0]*p[9]*p[10]*p[28] - 4*p[0]*p[11]*p[12]*p[19] + 2*p[0]*p[11]*p[12]*p[28] + 4*p[0]*p[11]*p[13]*p[16] - 2*p[0]*p[11]*p[13]*p[25] - 4*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[12]*p[14]*p[25] - 4*p[0]*p[13]*p[14]*p[19] + 2*p[0]*p[13]*p[14]*p[28] + 2*p[7]*p[12]*p[19] - 2*p[7]*p[12]*p[28] - 2*p[7]*p[13]*p[16] + 2*p[7]*p[13]*p[25] + 2*p[8]*p[11]*p[19] - 2*p[8]*p[11]*p[28] + 2*p[8]*p[14]*p[16] - 2*p[8]*p[14]*p[25] - 2*p[9]*p[11]*p[16] + 2*p[9]*p[11]*p[25] + 2*p[9]*p[14]*p[19] - 2*p[9]*p[14]*p[28] + 2*p[10]*p[12]*p[16] - 2*p[10]*p[12]*p[25] + 2*p[10]*p[13]*p[19] - 2*p[10]*p[13]*p[28];
+   coeff[157] = 4*p[0]*p[7]*p[8]*p[20] - 2*p[0]*p[7]*p[8]*p[29] - 4*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[7]*p[9]*p[26] - 4*p[0]*p[8]*p[8]*p[22] + 2*p[0]*p[8]*p[8]*p[31] + 4*p[0]*p[8]*p[10]*p[17] - 2*p[0]*p[8]*p[10]*p[26] - 4*p[0]*p[9]*p[9]*p[22] + 2*p[0]*p[9]*p[9]*p[31] + 4*p[0]*p[9]*p[10]*p[20] - 2*p[0]*p[9]*p[10]*p[29] - 4*p[0]*p[11]*p[12]*p[20] + 2*p[0]*p[11]*p[12]*p[29] + 4*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[11]*p[13]*p[26] + 4*p[0]*p[12]*p[12]*p[22] - 2*p[0]*p[12]*p[12]*p[31] - 4*p[0]*p[12]*p[14]*p[17] + 2*p[0]*p[12]*p[14]*p[26] + 4*p[0]*p[13]*p[13]*p[22] - 2*p[0]*p[13]*p[13]*p[31] - 4*p[0]*p[13]*p[14]*p[20] + 2*p[0]*p[13]*p[14]*p[29] + 2*p[7]*p[11]*p[22] - 2*p[7]*p[11]*p[31] + 2*p[7]*p[12]*p[20] - 2*p[7]*p[12]*p[29] - 2*p[7]*p[13]*p[17] + 2*p[7]*p[13]*p[26] + 2*p[8]*p[11]*p[20] - 2*p[8]*p[11]*p[29] - 2*p[8]*p[12]*p[22] + 2*p[8]*p[12]*p[31] + 2*p[8]*p[14]*p[17] - 2*p[8]*p[14]*p[26] - 2*p[9]*p[11]*p[17] + 2*p[9]*p[11]*p[26] - 2*p[9]*p[13]*p[22] + 2*p[9]*p[13]*p[31] + 2*p[9]*p[14]*p[20] - 2*p[9]*p[14]*p[29] + 2*p[10]*p[12]*p[17] - 2*p[10]*p[12]*p[26] + 2*p[10]*p[13]*p[20] - 2*p[10]*p[13]*p[29] + 2*p[10]*p[14]*p[22] - 2*p[10]*p[14]*p[31];
+   coeff[158] = 2*p[0]*p[7]*p[9]*p[15] - 2*p[0]*p[8]*p[10]*p[15] - 2*p[0]*p[11]*p[13]*p[15] + 2*p[0]*p[12]*p[14]*p[15] + 2*p[7]*p[13]*p[15] - 2*p[7]*p[13]*p[24] - 2*p[8]*p[14]*p[15] + 2*p[8]*p[14]*p[24] + 2*p[9]*p[11]*p[15] - 2*p[9]*p[11]*p[24] - 2*p[10]*p[12]*p[15] + 2*p[10]*p[12]*p[24];
+   coeff[159] = -2*p[0]*p[7]*p[8]*p[19] + 2*p[0]*p[7]*p[9]*p[16] - 2*p[0]*p[8]*p[10]*p[16] - 2*p[0]*p[9]*p[10]*p[19] + 2*p[0]*p[11]*p[12]*p[19] - 2*p[0]*p[11]*p[13]*p[16] + 2*p[0]*p[12]*p[14]*p[16] + 2*p[0]*p[13]*p[14]*p[19] - 2*p[7]*p[12]*p[19] + 2*p[7]*p[12]*p[28] + 2*p[7]*p[13]*p[16] - 2*p[7]*p[13]*p[25] - 2*p[8]*p[11]*p[19] + 2*p[8]*p[11]*p[28] - 2*p[8]*p[14]*p[16] + 2*p[8]*p[14]*p[25] + 2*p[9]*p[11]*p[16] - 2*p[9]*p[11]*p[25] - 2*p[9]*p[14]*p[19] + 2*p[9]*p[14]*p[28] - 2*p[10]*p[12]*p[16] + 2*p[10]*p[12]*p[25] - 2*p[10]*p[13]*p[19] + 2*p[10]*p[13]*p[28];
+   coeff[160] = -2*p[0]*p[7]*p[8]*p[20] + 2*p[0]*p[7]*p[9]*p[17] + 2*p[0]*p[8]*p[8]*p[22] - 2*p[0]*p[8]*p[10]*p[17] + 2*p[0]*p[9]*p[9]*p[22] - 2*p[0]*p[9]*p[10]*p[20] + 2*p[0]*p[11]*p[12]*p[20] - 2*p[0]*p[11]*p[13]*p[17] - 2*p[0]*p[12]*p[12]*p[22] + 2*p[0]*p[12]*p[14]*p[17] - 2*p[0]*p[13]*p[13]*p[22] + 2*p[0]*p[13]*p[14]*p[20] - 2*p[7]*p[11]*p[22] + 2*p[7]*p[11]*p[31] - 2*p[7]*p[12]*p[20] + 2*p[7]*p[12]*p[29] + 2*p[7]*p[13]*p[17] - 2*p[7]*p[13]*p[26] - 2*p[8]*p[11]*p[20] + 2*p[8]*p[11]*p[29] + 2*p[8]*p[12]*p[22] - 2*p[8]*p[12]*p[31] - 2*p[8]*p[14]*p[17] + 2*p[8]*p[14]*p[26] + 2*p[9]*p[11]*p[17] - 2*p[9]*p[11]*p[26] + 2*p[9]*p[13]*p[22] - 2*p[9]*p[13]*p[31] - 2*p[9]*p[14]*p[20] + 2*p[9]*p[14]*p[29] - 2*p[10]*p[12]*p[17] + 2*p[10]*p[12]*p[26] - 2*p[10]*p[13]*p[20] + 2*p[10]*p[13]*p[29] - 2*p[10]*p[14]*p[22] + 2*p[10]*p[14]*p[31];
+   coeff[161] = 0;
+   coeff[162] = 2*(p[7]*p[9]*p[15] - p[7]*p[9]*p[24] - p[8]*p[10]*p[15] + p[8]*p[10]*p[24] - p[11]*p[13]*p[15] + p[11]*p[13]*p[24] + p[12]*p[14]*p[15] - p[12]*p[14]*p[24])*p[0];
+   coeff[163] = 2*(-p[7]*p[8]*p[19] + p[7]*p[8]*p[28] + p[7]*p[9]*p[16] - p[7]*p[9]*p[25] - p[8]*p[10]*p[16] + p[8]*p[10]*p[25] - p[9]*p[10]*p[19] + p[9]*p[10]*p[28] + p[11]*p[12]*p[19] - p[11]*p[12]*p[28] - p[11]*p[13]*p[16] + p[11]*p[13]*p[25] + p[12]*p[14]*p[16] - p[12]*p[14]*p[25] + p[13]*p[14]*p[19] - p[13]*p[14]*p[28])*p[0];
+   coeff[164] = 2*(-p[7]*p[8]*p[20] + p[7]*p[8]*p[29] + p[7]*p[9]*p[17] - p[7]*p[9]*p[26] + p[8]*p[8]*p[22] - p[8]*p[8]*p[31] - p[8]*p[10]*p[17] + p[8]*p[10]*p[26] + p[9]*p[9]*p[22] - p[9]*p[9]*p[31] - p[9]*p[10]*p[20] + p[9]*p[10]*p[29] + p[11]*p[12]*p[20] - p[11]*p[12]*p[29] - p[11]*p[13]*p[17] + p[11]*p[13]*p[26] - p[12]*p[12]*p[22] + p[12]*p[12]*p[31] + p[12]*p[14]*p[17] - p[12]*p[14]*p[26] - p[13]*p[13]*p[22] + p[13]*p[13]*p[31] + p[13]*p[14]*p[20] - p[13]*p[14]*p[29])*p[0];
+   coeff[165] = 2*(-p[7]*p[9]*p[15] + p[7]*p[9]*p[24] + p[8]*p[10]*p[15] - p[8]*p[10]*p[24] + p[11]*p[13]*p[15] - p[11]*p[13]*p[24] - p[12]*p[14]*p[15] + p[12]*p[14]*p[24])*p[0];
+   coeff[166] = 2*(p[7]*p[8]*p[19] - p[7]*p[8]*p[28] - p[7]*p[9]*p[16] + p[7]*p[9]*p[25] + p[8]*p[10]*p[16] - p[8]*p[10]*p[25] + p[9]*p[10]*p[19] - p[9]*p[10]*p[28] - p[11]*p[12]*p[19] + p[11]*p[12]*p[28] + p[11]*p[13]*p[16] - p[11]*p[13]*p[25] - p[12]*p[14]*p[16] + p[12]*p[14]*p[25] - p[13]*p[14]*p[19] + p[13]*p[14]*p[28])*p[0];
+   coeff[167] = 2*(p[7]*p[8]*p[20] - p[7]*p[8]*p[29] - p[7]*p[9]*p[17] + p[7]*p[9]*p[26] - p[8]*p[8]*p[22] + p[8]*p[8]*p[31] + p[8]*p[10]*p[17] - p[8]*p[10]*p[26] - p[9]*p[9]*p[22] + p[9]*p[9]*p[31] + p[9]*p[10]*p[20] - p[9]*p[10]*p[29] - p[11]*p[12]*p[20] + p[11]*p[12]*p[29] + p[11]*p[13]*p[17] - p[11]*p[13]*p[26] + p[12]*p[12]*p[22] - p[12]*p[12]*p[31] - p[12]*p[14]*p[17] + p[12]*p[14]*p[26] + p[13]*p[13]*p[22] - p[13]*p[13]*p[31] - p[13]*p[14]*p[20] + p[13]*p[14]*p[29])*p[0];
+}
+
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/common/point_query.h b/thirdparty/embree-aarch64/kernels/common/point_query.h
new file mode 100644
index 0000000000..27d158ca3a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/point_query.h
@@ -0,0 +1,136 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /* Packet of K point queries for closest point queries (one query per lane) */
+  template<int K>
+  struct RTC_ALIGN(16) PointQueryK 
+  {
+    /* Default construction does nothing */
+    __forceinline PointQueryK() {}
+
+    /* Constructs a point query packet from the query positions, the search
+     * radii (default: inf) and the motion blur times (default: zero) */
+    __forceinline PointQueryK(const Vec3vf<K>& p, const vfloat<K>& radius = inf, const vfloat<K>& time = zero)
+      : p(p), time(time), radius(radius) {}
+
+    /* Returns the number of point queries in the packet */
+    static __forceinline size_t size() { return K; }
+
+    /* Per-lane mask of queries with position in +/-FLT_LARGE, radius >= 0 and finite time */
+    __forceinline vbool<K> valid() const
+    {
+      const vbool<K> vx = (abs(p.x) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vy = (abs(p.y) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vz = (abs(p.z) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vn = radius >= vfloat<K>(0);
+      const vbool<K> vf = abs(time) < vfloat<K>(inf);
+      return vx & vy & vz & vn & vf;
+    }
+
+    __forceinline void get(PointQueryK<1>* query) const;           // packet -> array of K single queries
+    __forceinline void get(size_t i, PointQueryK<1>& query) const; // lane i -> single query
+    __forceinline void set(const PointQueryK<1>* query);           // array of K single queries -> packet
+    __forceinline void set(size_t i, const PointQueryK<1>& query); // single query -> lane i
+
+    Vec3vf<K> p;      // location of the query point
+    vfloat<K> time;   // time for motion blur
+    vfloat<K> radius; // radius for the point query
+  };
+  
+  /* Scalar point query: the K == 1 specialization of PointQueryK */
+  template<>
+  struct RTC_ALIGN(16) PointQueryK<1>
+  {
+    /* Default constructor with an empty body and no member initializers */
+    __forceinline PointQueryK() {}
+
+    /* Builds a point query from its position, search radius (default: inf) and time (default: zero) */
+    __forceinline PointQueryK(const Vec3fa& p, float radius = inf, float time = zero)
+      : p(p), time(time), radius(radius) {}
+
+    /* True if the position lies within +/-FLT_LARGE, the radius is >= 0 and the time is finite */
+    __forceinline bool valid() const {
+      const bool timeIsFinite = abs(time) < float(inf);
+      return all(le_mask(abs(Vec3fa(p)), Vec3fa(FLT_LARGE)) & le_mask(Vec3fa(0.f), Vec3fa(radius))) && timeIsFinite;
+    }
+
+    Vec3f p;      // location of the query point
+    float time;   // time for motion blur
+    float radius; // radius for the point query
+  };
+  
+  /* Unpacks the packet into an array of K single point queries (one per lane) */
+  template<int K>
+  __forceinline void PointQueryK<K>::get(PointQueryK<1>* query) const
+  {
+    for (size_t lane = 0; lane < K; ++lane) { // FIXME: use SIMD transpose
+      PointQueryK<1>& dst = query[lane];
+      dst.p.x    = p.x[lane];
+      dst.p.y    = p.y[lane];
+      dst.p.z    = p.z[lane];
+      dst.time   = time[lane];
+      dst.radius = radius[lane];
+    }
+  }
+
+  /* Copies lane i of the packet into the given single point query */
+  template<int K>
+  __forceinline void PointQueryK<K>::get(size_t i, PointQueryK<1>& query) const
+  {
+    query.time   = time[i];
+    query.radius = radius[i];
+    query.p.x    = p.x[i];
+    query.p.y    = p.y[i];
+    query.p.z    = p.z[i];
+  }
+
+  /* Fills the packet from an array of K single point queries (one per lane) */
+  template<int K>
+  __forceinline void PointQueryK<K>::set(const PointQueryK<1>* query)
+  {
+    for (size_t lane = 0; lane < K; ++lane) {
+      const PointQueryK<1>& src = query[lane];
+      p.x[lane]    = src.p.x;
+      p.y[lane]    = src.p.y;
+      p.z[lane]    = src.p.z;
+      time[lane]   = src.time;
+      radius[lane] = src.radius;
+    }
+  }
+
+  /* Writes the given single point query into lane i of the packet */
+  template<int K>
+  __forceinline void PointQueryK<K>::set(size_t i, const PointQueryK<1>& query)
+  {
+    time[i]   = query.time;
+    radius[i] = query.radius;
+    p.x[i]    = query.p.x;
+    p.y[i]    = query.p.y;
+    p.z[i]    = query.p.z;
+  }
+
+  /* Shortcuts for the single query and the 4-, 8- and 16-wide query packets */
+  typedef PointQueryK<1>  PointQuery;
+  typedef PointQueryK<4>  PointQuery4;
+  typedef PointQueryK<8>  PointQuery8;
+  typedef PointQueryK<16> PointQuery16;
+  struct PointQueryN; // forward declaration only; not defined in this header
+
+  /* Prints the position, radius and time of a point query to a stream */
+  template<int K>
+  __forceinline embree_ostream operator <<(embree_ostream cout, const PointQueryK<K>& query)
+  {
+    cout << "{ " << embree_endl;
+    cout << "  p = "    << query.p      << embree_endl;
+    cout << "  r = "    << query.radius << embree_endl;
+    cout << "  time = " << query.time   << embree_endl;
+    cout << "}";
+    return cout;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/primref.h b/thirdparty/embree-aarch64/kernels/common/primref.h
new file mode 100644
index 0000000000..ce75c982bb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/primref.h
@@ -0,0 +1,138 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /*! Reference to one primitive: its bounding box, with the geometry ID kept in `lower` and the primitive ID in `upper`. */
+  struct __aligned(32) PrimRef 
+  {
+    __forceinline PrimRef () {}
+
+#if defined(__AVX__)
+    __forceinline PrimRef(const PrimRef& v) { 
+      const vfloat8 packed = vfloat8::load((float*)&v); vfloat8::store((float*)this,packed);
+    }
+    __forceinline PrimRef& operator=(const PrimRef& v) { 
+      const vfloat8 packed = vfloat8::load((float*)&v); vfloat8::store((float*)this,packed); return *this;
+    }
+#endif
+
+    __forceinline PrimRef (const BBox3fa& bounds, unsigned int geomID, unsigned int primID) 
+    {
+      upper = Vec3fx(bounds.upper, primID);
+      lower = Vec3fx(bounds.lower, geomID);
+    }
+
+    __forceinline PrimRef (const BBox3fa& bounds, size_t id) 
+    {
+#if defined(__X86_64__) || defined(__aarch64__)
+      const unsigned lo = (unsigned)(id & 0xFFFFFFFF), hi = (unsigned)((id >> 32) & 0xFFFFFFFF);
+#else
+      const unsigned lo = (unsigned)id, hi = (unsigned)0;
+#endif
+      lower = Vec3fx(bounds.lower, lo);
+      upper = Vec3fx(bounds.upper, hi);
+    }
+
+    /*! sum of lower and upper bound, i.e. twice the center of the box */
+    __forceinline const Vec3fa center2() const {
+      return lower+upper;
+    }
+    
+    /*! bounding box spanned by lower and upper */
+    __forceinline const BBox3fa bounds() const {
+      return BBox3fa(lower,upper);
+    }
+
+    /*! a PrimRef always counts as a single item for the bin heuristic */
+    __forceinline unsigned size() const { 
+      return 1u;
+    }
+
+    /*! writes the bounds and the doubled center of these bounds used for binning */
+    __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const 
+    {
+      bounds_o = BBox3fa(lower,upper);
+      center_o = embree::center2(bounds_o);
+    }
+
+    __forceinline unsigned& geomIDref() {  // FIXME: remove !!!!!!!
+      return lower.u;
+    }
+    __forceinline unsigned& primIDref() {  // FIXME: remove !!!!!!!
+      return upper.u;
+    }
+    
+    /*! geometry ID, read from lower */
+    __forceinline unsigned geomID() const { 
+      return lower.a;
+    }
+
+    /*! primitive ID, read from upper */
+    __forceinline unsigned primID() const { 
+      return upper.a;
+    }
+
+    /*! size_t sized ID: lower.u is the low half, upper.u the high half on 64-bit targets */
+    __forceinline size_t ID() const { 
+#if defined(__X86_64__) || defined(__aarch64__)
+      return (size_t(upper.u) << 32) | size_t(lower.u);
+#else
+      return size_t(lower.u);
+#endif
+    }
+
+    /*! sort key for operator<: primID in the upper 32 bits, geomID in the lower 32 bits */
+    __forceinline uint64_t ID64() const {
+      return (uint64_t(primID()) << 32) | uint64_t(geomID());
+    }
+    
+    /*! orders primrefs by their 64-bit ID key */
+    friend __forceinline bool operator<(const PrimRef& p0, const PrimRef& p1) {
+      const uint64_t key0 = p0.ID64(), key1 = p1.ID64(); return key0 < key1;
+    }
+
+    /*! Prints bounds, geomID and primID of the primitive reference to a stream. */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRef& ref) {
+      return cout << "{ lower = " << ref.lower << ", upper = " << ref.upper << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << " }";
+    }
+
+  public:
+    Vec3fx lower;     //!< lower bounds and geomID
+    Vec3fx upper;     //!< upper bounds and primID
+  };
+
+  /*! exchanges two PrimRefs; with AVX each one is moved through a vfloat8 load/store */
+  __forceinline void xchg(PrimRef& a, PrimRef& b)
+  {
+#if !defined(__AVX__)
+    std::swap(a,b);
+#else
+    float* const pa = (float*)&a; float* const pb = (float*)&b;
+    const vfloat8 va = vfloat8::load(pa);
+    const vfloat8 vb = vfloat8::load(pb);
+    vfloat8::store(pa,vb); vfloat8::store(pb,va);
+#endif
+  }
+
+  /************************************************************************************/
+  /************************************************************************************/
+  /************************************************************************************/
+  /************************************************************************************/
+  
+  struct SubGridBuildData {
+    unsigned short sx,sy;  //!< stored x/y values; x() and y() return only their low 15 bits
+    unsigned int primID;   //!< primitive ID passed to the constructor
+    
+    __forceinline SubGridBuildData() {}
+    __forceinline SubGridBuildData(const unsigned int sx, const unsigned int sy, const unsigned int primID) : sx((unsigned short)sx), sy((unsigned short)sy), primID(primID) {}
+    
+    __forceinline size_t x() const { return size_t(sx & 0x7fff); }
+    __forceinline size_t y() const { return size_t(sy & 0x7fff); }
+    
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/primref_mb.h b/thirdparty/embree-aarch64/kernels/common/primref_mb.h
new file mode 100644
index 0000000000..b6c1ad5712
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/primref_mb.h
@@ -0,0 +1,262 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+#define MBLUR_BIN_LBBOX 1
+
+namespace embree
+{
+#if MBLUR_BIN_LBBOX
+
+  /*! A motion blur primitive reference: linear bounds and geometry time range; IDs and time segment counts live in the .a/.u fields of the bounds. */
+  struct PrimRefMB
+  {
+    typedef LBBox3fa BBox;
+
+    __forceinline PrimRefMB () {}
+
+    __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID)
+      : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+      lbounds.bounds0.lower.a = geomID;              // read back by geomID()
+      lbounds.bounds0.upper.a = primID;              // read back by primID()
+      lbounds.bounds1.lower.a = activeTimeSegments;  // read back by size()
+      lbounds.bounds1.upper.a = totalTimeSegments;   // read back by totalTimeSegments()
+    }
+
+    __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+      : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+#if defined(__X86_64__) || defined(__aarch64__)
+      lbounds.bounds0.lower.u = id & 0xFFFFFFFF;          // stored through .u, matching ID() and the untagged size_t constructor
+      lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF;  // high half of the 64-bit id
+#else
+      lbounds.bounds0.lower.u = id;
+      lbounds.bounds0.upper.u = 0;
+#endif
+      lbounds.bounds1.lower.a = activeTimeSegments;
+      lbounds.bounds1.upper.a = totalTimeSegments;
+    }
+
+    __forceinline PrimRefMB (const LBBox3fa& lbounds_i, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+      : lbounds((LBBox3fx)lbounds_i), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+#if defined(__X86_64__) || defined(__aarch64__)
+      lbounds.bounds0.lower.u = id & 0xFFFFFFFF;          // low half of the id, read back by ID()
+      lbounds.bounds0.upper.u = (id >> 32) & 0xFFFFFFFF;  // high half of the id, read back by ID()
+#else
+      lbounds.bounds0.lower.u = id;
+      lbounds.bounds0.upper.u = 0;
+#endif
+      lbounds.bounds1.lower.a = activeTimeSegments;
+      lbounds.bounds1.upper.a = totalTimeSegments;
+    }
+
+    /*! returns the linear bounds used for binning */
+    __forceinline LBBox3fa bounds() const {
+      return lbounds;
+    }
+
+    /*! returns the number of active time segments of this primref (as passed to the constructor) */
+    __forceinline unsigned size() const {
+      return lbounds.bounds1.lower.a;
+    }
+
+    __forceinline unsigned totalTimeSegments() const {  // total segment count passed to the constructor
+      return lbounds.bounds1.upper.a;
+    }
+
+     /* calculates the time segment range overlapping `range`, via getTimeSegmentRange */
+    __forceinline range<int> timeSegmentRange(const BBox1f& range) const {
+      return getTimeSegmentRange(range,time_range,float(totalTimeSegments()));
+    }
+
+     /* returns the time of time step i, for i in [0, totalTimeSegments()], spaced evenly over time_range */
+    __forceinline float timeStep(const int i) const {
+      assert(i>=0 && i<=(int)totalTimeSegments());
+      return time_range.lower + time_range.size()*float(i)/float(totalTimeSegments());
+    }
+    
+    /*! checks if time_range overlaps `range`; the compared time_range bounds are scaled by 0.9999f / 1.0001f */
+    __forceinline bool time_range_overlap(const BBox1f& range) const
+    {
+      if (0.9999f*time_range.upper <= range.lower) return false;
+      if (1.0001f*time_range.lower >= range.upper) return false;
+      return true;
+    }
+
+    /*! returns center for binning: center2 of the linear bounds interpolated at 0.5 */
+    __forceinline Vec3fa binCenter() const {
+      return center2(lbounds.interpolate(0.5f));
+    }
+
+    /*! returns bounds and centroid used for binning */
+    __forceinline void binBoundsAndCenter(LBBox3fa& bounds_o, Vec3fa& center_o) const
+    {
+      bounds_o = bounds();
+      center_o = binCenter();
+    }
+
+    /*! returns the geometry ID */
+    __forceinline unsigned geomID() const {
+      return lbounds.bounds0.lower.a;
+    }
+
+    /*! returns the primitive ID */
+    __forceinline unsigned primID() const {
+      return lbounds.bounds0.upper.a;
+    }
+
+    /*! returns a size_t sized ID: bounds0.lower.u is the low half, bounds0.upper.u the high half on 64-bit targets */
+    __forceinline size_t ID() const {
+#if defined(__X86_64__) || defined(__aarch64__)
+      return size_t(lbounds.bounds0.lower.u) + (size_t(lbounds.bounds0.upper.u) << 32);
+#else
+      return size_t(lbounds.bounds0.lower.u);
+#endif
+    }
+
+    /*! special function for operator<: primID in the upper 32 bits, geomID in the lower 32 bits */
+    __forceinline uint64_t ID64() const {
+      return (((uint64_t)primID()) << 32) + (uint64_t)geomID();
+    }
+
+    /*! allows sorting the primrefs by ID */
+    friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) {
+      return p0.ID64() < p1.ID64();
+    }
+
+    /*! Outputs primitive reference to a stream. */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) {
+      return cout << "{ time_range = " << ref.time_range << ", bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID() << ", active_segments = " << ref.size() << ",  total_segments = " << ref.totalTimeSegments() << " }";
+    }
+
+  public:
+    LBBox3fx lbounds;  // linear bounds; IDs and segment counts are stored in the .a/.u fields
+    BBox1f time_range; // entire geometry time range
+  };
+
+#else
+
+  /*! A primitive reference stores the bounds of the primitive and its ID. This variant keeps one box, bounds.interpolate(0.5f), with the IDs packed into it. */
+  struct __aligned(16) PrimRefMB
+  {
+    typedef BBox3fa BBox;
+
+    __forceinline PrimRefMB () {} // members are left uninitialized
+
+    __forceinline PrimRefMB (const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, unsigned int geomID, unsigned int primID)
+      : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+      bbox.upper.a = primID; // the IDs are stored in the 'a' members of the box corners
+      bbox.lower.a = geomID;
+    }
+
+    __forceinline PrimRefMB (EmptyTy empty, const LBBox3fa& bounds, unsigned int activeTimeSegments, BBox1f time_range, unsigned int totalTimeSegments, size_t id)
+      : bbox(bounds.interpolate(0.5f)), _activeTimeSegments(activeTimeSegments), _totalTimeSegments(totalTimeSegments), time_range(time_range)
+    {
+      assert(activeTimeSegments > 0);
+#if defined(__X86_64__) || defined(__aarch64__)
+      bbox.upper.u = (id >> 32) & 0xFFFFFFFF; // high word of the id
+      bbox.lower.u = id & 0xFFFFFFFF;         // low word of the id
+#else
+      bbox.upper.u = 0;
+      bbox.lower.u = id;
+#endif
+    }
+
+    /*! returns bounds for binning (the stored box, ID members included) */
+    __forceinline BBox3fa bounds() const {
+      return bbox;
+    }
+
+    /*! returns the number of active time segments of this primref */
+    __forceinline unsigned int size() const {
+      return _activeTimeSegments;
+    }
+
+    __forceinline unsigned int totalTimeSegments() const { // segment count of the entire time_range
+      return _totalTimeSegments;
+    }
+
+    __forceinline range<int> timeSegmentRange(const BBox1f& range) const { // calculate overlapping time segment range
+      const float numSegments = float(_totalTimeSegments);
+      return getTimeSegmentRange(range,time_range,numSegments);
+    }
+
+    __forceinline float timeStep(const int i) const { // returns time that corresponds to time step i
+      assert(i>=0 && i<=(int)_totalTimeSegments);
+      const auto duration = time_range.size();
+      return time_range.lower + duration*float(i)/float(_totalTimeSegments);
+    }
+
+    /*! checks if time range overlaps; the 0.9999/1.0001 factors shrink time_range slightly before comparing */
+    __forceinline bool time_range_overlap(const BBox1f& range) const
+    {
+      const bool endsBefore  = 0.9999f*time_range.upper <= range.lower;
+      const bool startsAfter = 1.0001f*time_range.lower >= range.upper;
+      return !(endsBefore || startsAfter);
+    }
+
+    /*! returns center for binning, computed with center2() from the stored box */
+    __forceinline Vec3fa binCenter() const {
+      return center2(bbox);
+    }
+
+    /*! returns bounds and centroid used for binning in a single call */
+    __forceinline void binBoundsAndCenter(BBox3fa& bounds_o, Vec3fa& center_o) const
+    {
+      center_o = binCenter();
+      bounds_o = bbox;
+    }
+
+    /*! returns the geometry ID (read back from bbox.lower.a) */
+    __forceinline unsigned int geomID() const {
+      return bbox.lower.a;
+    }
+
+    /*! returns the primitive ID (read back from bbox.upper.a) */
+    __forceinline unsigned int primID() const {
+      return bbox.upper.a;
+    }
+
+    __forceinline size_t ID() const { // size_t sized ID: lower.u is the low word, upper.u the high word (64-bit targets only)
+      const size_t lowWord = size_t(bbox.lower.u);
+#if defined(__X86_64__) || defined(__aarch64__)
+      return lowWord + (size_t(bbox.upper.u) << 32);
+#else
+      return lowWord;
+#endif
+    }
+
+    __forceinline uint64_t ID64() const { // 64-bit key for operator<: primID in the high word, geomID in the low word
+      const uint64_t highWord = uint64_t(primID()) << 32;
+      return highWord + uint64_t(geomID());
+    }
+
+    friend __forceinline bool operator<(const PrimRefMB& p0, const PrimRefMB& p1) { // allows sorting the primrefs by their ID64() key
+      const uint64_t key0 = p0.ID64(), key1 = p1.ID64();
+      return key0 < key1;
+    }
+
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PrimRefMB& ref) { // writes the primitive reference as one brace-delimited record
+      cout << "{ bounds = " << ref.bounds() << ", geomID = " << ref.geomID() << ", primID = " << ref.primID();
+      return cout << ", active_segments = " << ref.size() << ",  total_segments = " << ref.totalTimeSegments() << " }";
+    }
+
+  public:
+    BBox3fa bbox; // bounds, geomID, primID
+    unsigned int _activeTimeSegments; // value returned by size()
+    unsigned int _totalTimeSegments;  // divisor used by timeStep() and timeSegmentRange()
+    BBox1f time_range; // entire geometry time range
+  };
+
+#endif
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/profile.h b/thirdparty/embree-aarch64/kernels/common/profile.h
new file mode 100644
index 0000000000..a7de36414d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/profile.h
@@ -0,0 +1,159 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /*! helper structure for the implementation of the profile functions below */
+  struct ProfileTimer
+  {
+    static const size_t N = 20; // capacity of the per-section arrays
+
+    ProfileTimer () : ProfileTimer(0) {} // delegates so that no member is left uninitialized
+
+    ProfileTimer (const size_t numSkip) : i(0), j(0), maxJ(0), numSkip(numSkip), t0(0), tj(0)
+    {
+      for (size_t k=0; k<N; k++) {
+        names[k] = nullptr;
+        dt_fst[k] = 0.0;
+        dt_min[k] = pos_inf;
+        dt_avg[k] = 0.0;
+        dt_max[k] = neg_inf;
+      }
+    }
+
+    /*! starts an iteration: rewinds the section index and restarts both reference times */
+    __forceinline void begin() {
+      j=0;
+      t0 = tj = getSeconds();
+    }
+
+    /*! ends an iteration: records the time since begin() as "total" and counts the iteration */
+    __forceinline void end() {
+      absolute("total");
+      i++;
+    }
+
+    __forceinline void operator() (const char* name) { // shorthand for relative(name)
+      relative(name);
+    }
+
+    __forceinline void absolute (const char* name) { // records the time elapsed since begin() as the next section
+      record(name,getSeconds()-t0);
+    }
+
+    /*! records the time elapsed since the previous relative() call, or since begin(), as the next section */
+    __forceinline void relative (const char* name) {
+      const double t1 = getSeconds();
+      const double dt = t1-tj;
+      tj = t1;
+      record(name,dt);
+    }
+
+    /*! prints throughput (numElements per section time) and section times; the statistics are not modified,
+        so printing repeatedly or calling avg() afterwards yields consistent numbers */
+    void print(size_t numElements) const
+    {
+      printf("  profile [M/s]:\n");
+      for (size_t k=0; k<maxJ; k++) // the slowest run yields the "min" throughput, the fastest the "max"
+        printf("%20s:  fst = %7.2f M/s, min = %7.2f M/s, avg = %7.2f M/s, max = %7.2f M/s\n",
+               names[k],numElements/dt_fst[k]*1E-6,numElements/dt_max[k]*1E-6,numElements/average(k)*1E-6,numElements/dt_min[k]*1E-6);
+      printf("  profile [ms]:\n");
+      printTimes();
+    }
+
+    /*! prints the section times in milliseconds; the statistics are not modified */
+    void print() const
+    {
+      printf("  profile:\n");
+      printTimes();
+    }
+
+    /*! mean time in seconds of the highest section index seen, which is "total" when end() is used */
+    double avg() const {
+      return maxJ ? average(maxJ-1) : 0.0;
+    }
+
+  private:
+    double average(const size_t k) const { // mean of section k over the measured (non-skipped) iterations, 0 while there are none
+      return i > numSkip ? dt_avg[k]/double(i-numSkip) : 0.0;
+    }
+
+    /*! stores one sample in the current section slot and advances to the next slot */
+    __forceinline void record(const char* name, const double dt)
+    {
+      assert(j < N);
+      if (j >= N) return; // never write past the N-entry arrays, also when asserts are compiled out
+      assert(names[j] == nullptr || names[j] == name);
+      names[j] = name;
+      if (i == 0) dt_fst[j] = dt;
+      if (i>=numSkip) {
+        dt_min[j] = min(dt_min[j],dt);
+        dt_avg[j] = dt_avg[j] + dt;
+        dt_max[j] = max(dt_max[j],dt);
+      }
+      j++;
+      maxJ = max(maxJ,j);
+    }
+
+    void printTimes() const { // table of first/min/avg/max times in ms shared by both print() overloads
+      for (size_t k=0; k<maxJ; k++)
+        printf("%20s:  fst = %7.2f ms, min = %7.2f ms, avg = %7.2f ms, max = %7.2fms\n",
+               names[k],1000.0*dt_fst[k],1000.0*dt_min[k],1000.0*average(k),1000.0*dt_max[k]);
+    }
+
+    size_t i;             // number of completed iterations
+    size_t j;             // index of the next section within the current iteration
+    size_t maxJ;          // largest number of sections recorded in any iteration
+    size_t numSkip;       // iterations with i < numSkip do not feed min/avg/max
+    double t0;            // time stamp taken by begin()
+    double tj;            // time stamp of the latest relative() call
+    const char* names[N]; // section names (pointers are stored, not copied)
+    double dt_fst[N];     // section times of iteration 0
+    double dt_min[N];     // fastest measured time per section
+    double dt_avg[N];     // SUM of the measured times per section; average() divides on demand
+    double dt_max[N];     // slowest measured time per section
+  };
+
+  /*! This function executes a code block multiple times and measures sections of it.
+      Use it in the following way:
+
+      profile(1,10,1000,[&](ProfileTimer& timer) {
+        // code
+        timer("A");
+        // code 
+        timer("B");
+      });
+  */
+  template<typename Closure>
+    void profile(const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure)
+    {
+      /* numSkip + numIter runs of the closure, all driven through one local timer that prints at the end */
+      ProfileTimer timer(numSkip);
+      const size_t numRuns = numSkip+numIter;
+      for (size_t run=0; run<numRuns; run++) {
+        timer.begin();
+        closure(timer);
+        timer.end();
+      }
+      timer.print(numElements);
+    }
+
+  /*! same as the overload above, except that the caller supplies the timer; it is first
+      reset to ProfileTimer(numSkip), so earlier contents of the timer are discarded */
+  template<typename Closure>
+    void profile(ProfileTimer& timer, const size_t numSkip, const size_t numIter, const size_t numElements, const Closure& closure)
+    {
+      timer = ProfileTimer(numSkip);
+      const size_t numRuns = numSkip+numIter;
+      for (size_t run=0; run<numRuns; run++) {
+        timer.begin();
+        closure(timer);
+        timer.end();
+      }
+      timer.print(numElements);
+    }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/ray.h b/thirdparty/embree-aarch64/kernels/common/ray.h
new file mode 100644
index 0000000000..336d48942c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/ray.h
@@ -0,0 +1,1517 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "instance_stack.h"
+
+// FIXME: if ray gets separated into ray* and hit, uload4 needs to be adjusted
+
+namespace embree
+{
+  static const size_t MAX_INTERNAL_STREAM_SIZE = 32;
+
+  /* Ray structure for K rays: each member is a K-wide value that is indexed per ray by get/set/copy */
+  template<int K>
+  struct RayK
+  {
+    /* Default construction does nothing */
+    __forceinline RayK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     * has to be smaller than far (not checked by this constructor) */
+    __forceinline RayK(const Vec3vf<K>& org, const Vec3vf<K>& dir,
+                       const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf,
+                       const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0)
+      : org(org), dir(dir), _tnear(tnear), tfar(tfar), _time(time), mask(mask), id(id), flags(flags) {}
+
+    /* Returns the size of the ray, i.e. the template parameter K */
+    static __forceinline size_t size() { return K; }
+
+    /* Calculates if this is a valid ray that does not cause issues during traversal */
+    __forceinline vbool<K> valid() const
+    {
+      const vbool<K> vx = (abs(org.x) <= vfloat<K>(FLT_LARGE)) & (abs(dir.x) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vy = (abs(org.y) <= vfloat<K>(FLT_LARGE)) & (abs(dir.y) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vz = (abs(org.z) <= vfloat<K>(FLT_LARGE)) & (abs(dir.z) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vn = abs(tnear()) <= vfloat<K>(inf); // presumably a NaN filter: |x| <= inf holds for every non-NaN x
+      const vbool<K> vf = abs(tfar) <= vfloat<K>(inf);    // same check for tfar
+      return vx & vy & vz & vn & vf;
+    }
+
+    __forceinline void get(RayK<1>* ray) const;           // writes all K rays to ray[0..K-1]
+    __forceinline void get(size_t i, RayK<1>& ray) const; // extracts ray i
+    __forceinline void set(const RayK<1>* ray);           // reads all K rays from ray[0..K-1]
+    __forceinline void set(size_t i, const RayK<1>& ray); // overwrites ray i
+
+    __forceinline void copy(size_t dest, size_t source);  // copies ray 'source' onto ray 'dest' inside this packet
+
+    __forceinline vint<K> octant() const // per ray: bit 0/1/2 is set when dir.x/dir.y/dir.z is negative
+    {
+      return select(dir.x < 0.0f, vint<K>(1), vint<K>(zero)) |
+             select(dir.y < 0.0f, vint<K>(2), vint<K>(zero)) |
+             select(dir.z < 0.0f, vint<K>(4), vint<K>(zero));
+    }
+
+    /* Ray data; NOTE(review): the member order looks layout-relevant (see the uload4 FIXME at the top of this file) - confirm before reordering */
+    Vec3vf<K> org;    // ray origin
+    vfloat<K> _tnear; // start of ray segment
+    Vec3vf<K> dir;    // ray direction
+    vfloat<K> _time;  // time of this ray for motion blur
+    vfloat<K> tfar;   // end of ray segment
+    vint<K> mask;     // used to mask out objects during traversal
+    vint<K> id;       // ray ID
+    vint<K> flags;    // ray flags
+
+    __forceinline vfloat<K>& tnear() { return _tnear; } // accessors mirror the RayK<1> interface, where tnear/time are packed into org.w/dir.w
+    __forceinline vfloat<K>& time()  { return _time; }
+    __forceinline const vfloat<K>& tnear() const { return _tnear; }
+    __forceinline const vfloat<K>& time()  const { return _time; }
+  };
+
+  /* Ray+hit structure for K rays: the RayK<K> members followed by the hit members */
+  template<int K>
+  struct RayHitK : RayK<K>
+  {
+    using RayK<K>::org;
+    using RayK<K>::_tnear;
+    using RayK<K>::dir;
+    using RayK<K>::_time;
+    using RayK<K>::tfar;
+    using RayK<K>::mask;
+    using RayK<K>::id;
+    using RayK<K>::flags;
+
+    using RayK<K>::tnear;
+    using RayK<K>::time;
+
+    /* Default construction does nothing */
+    __forceinline RayHitK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     * has to be smaller than far. The hit starts out as "no hit": geomID and all instID levels are invalid */
+    __forceinline RayHitK(const Vec3vf<K>& org, const Vec3vf<K>& dir,
+                          const vfloat<K>& tnear = zero, const vfloat<K>& tfar = inf,
+                          const vfloat<K>& time = zero, const vint<K>& mask = -1, const vint<K>& id = 0, const vint<K>& flags = 0)
+      : RayK<K>(org, dir, tnear, tfar, time, mask, id, flags),
+        geomID(RTC_INVALID_GEOMETRY_ID) 
+    {
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+    }
+
+    __forceinline RayHitK(const RayK<K>& ray) // wraps an existing ray packet with an empty hit
+      : RayK<K>(ray),
+        geomID(RTC_INVALID_GEOMETRY_ID) 
+    {
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+    }
+
+    __forceinline RayHitK<K>& operator =(const RayK<K>& ray) // copies the ray part and resets the hit to "no hit"
+    {
+      org    = ray.org;
+      _tnear = ray._tnear;
+      dir    = ray.dir;
+      _time  = ray._time;
+      tfar   = ray.tfar;
+      mask   = ray.mask;
+      id     = ray.id;
+      flags  = ray.flags;
+
+      geomID = RTC_INVALID_GEOMETRY_ID;
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = RTC_INVALID_GEOMETRY_ID;
+
+      return *this;
+    }
+
+    /* Checks the hit of every ray that is active in valid0 and has a valid geomID; throws RTC_ERROR_UNKNOWN on the first implausible member */
+    __forceinline void verifyHit(const vbool<K>& valid0) const
+    {
+      vbool<K> valid = valid0 & (geomID != vuint<K>(RTC_INVALID_GEOMETRY_ID));
+      const vbool<K> vt = (abs(tfar) <= vfloat<K>(FLT_LARGE)) | (tfar == vfloat<K>(neg_inf));
+      const vbool<K> vu = (abs(u) <= vfloat<K>(FLT_LARGE));
+      const vbool<K> vv = (abs(v) <= vfloat<K>(FLT_LARGE)); // checks v; the previous code tested u twice, so "invalid v" could never fire
+      const vbool<K> vnx = abs(Ng.x) <= vfloat<K>(FLT_LARGE);
+      const vbool<K> vny = abs(Ng.y) <= vfloat<K>(FLT_LARGE);
+      const vbool<K> vnz = abs(Ng.z) <= vfloat<K>(FLT_LARGE);
+      if (any(valid & !vt)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid t");
+      if (any(valid & !vu)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid u");
+      if (any(valid & !vv)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid v");
+      if (any(valid & !vnx)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.x");
+      if (any(valid & !vny)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.y");
+      if (any(valid & !vnz)) throw_RTCError(RTC_ERROR_UNKNOWN,"invalid Ng.z");
+    }
+
+    __forceinline void get(RayHitK<1>* ray) const;           // writes all K ray/hit pairs to ray[0..K-1]
+    __forceinline void get(size_t i, RayHitK<1>& ray) const; // extracts ray/hit i
+    __forceinline void set(const RayHitK<1>* ray);           // reads all K ray/hit pairs from ray[0..K-1]
+    __forceinline void set(size_t i, const RayHitK<1>& ray); // overwrites ray/hit i
+
+    __forceinline void copy(size_t dest, size_t source);     // copies ray/hit 'source' onto 'dest' inside this packet
+
+    /* Hit data */
+    Vec3vf<K> Ng;   // geometry normal
+    vfloat<K> u;    // barycentric u coordinate of hit
+    vfloat<K> v;    // barycentric v coordinate of hit
+    vuint<K> primID; // primitive ID
+    vuint<K> geomID; // geometry ID
+    vuint<K> instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+  };
+
+  /* Specialization for a single ray; tnear and time have no members of their own, they live in org.w and dir.w */
+  template<>
+  struct RayK<1>
+  {
+    /* Default construction does nothing */
+    __forceinline RayK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     *  has to be smaller than far (not checked by this constructor) */
+    __forceinline RayK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0)
+      : org(org,tnear), dir(dir,time), tfar(tfar), mask(mask), id(id), flags(flags) {}
+
+    /* Calculates if this is a valid ray that does not cause issues during traversal: org/dir within FLT_LARGE, and the inf comparisons presumably reject NaN tnear/tfar */
+    __forceinline bool valid() const {
+      return all(le_mask(abs(Vec3fa(org)), Vec3fa(FLT_LARGE)) & le_mask(abs(Vec3fa(dir)), Vec3fa(FLT_LARGE))) && abs(tnear()) <= float(inf) && abs(tfar) <= float(inf);
+    }
+
+    /* Ray data */
+    Vec3ff org;  // 3 floats for ray origin, 1 float for tnear
+    //float tnear; // start of ray segment
+    Vec3ff dir;  // 3 floats for ray direction, 1 float for time
+    // float time; 
+    float tfar;  // end of ray segment
+    int mask;    // used to mask out objects during traversal
+    int id;      // ray ID
+    int flags;   // ray flags
+
+    __forceinline float& tnear() { return org.w; };             // start of ray segment, stored in the 4th component of org
+    __forceinline const float& tnear() const { return org.w; };
+
+    __forceinline float& time() { return dir.w; };              // motion blur time, stored in the 4th component of dir
+    __forceinline const float& time() const { return dir.w; };
+
+  };
+
+  template<>
+  struct RayHitK<1> : RayK<1>
+  {
+    /* Default construction does nothing */
+    __forceinline RayHitK() {}
+
+    /* Constructs a ray from origin, direction, and ray segment. Near
+     *  has to be smaller than far. geomID starts out invalid; instID is not initialized here */
+    __forceinline RayHitK(const Vec3fa& org, const Vec3fa& dir, float tnear = zero, float tfar = inf, float time = zero, int mask = -1, int id = 0, int flags = 0)
+      : RayK<1>(org, dir, tnear, tfar, time, mask, id, flags),
+        geomID(RTC_INVALID_GEOMETRY_ID) {}
+
+    __forceinline RayHitK(const RayK<1>& ray) // wraps an existing ray with an empty hit (invalid geomID)
+      : RayK<1>(ray),
+        geomID(RTC_INVALID_GEOMETRY_ID) {}
+
+    __forceinline RayHitK<1>& operator =(const RayK<1>& ray) // copies the ray part and invalidates geomID; instID is left untouched
+    {
+      org    = ray.org;
+      dir    = ray.dir;
+      tfar   = ray.tfar;
+      mask   = ray.mask;
+      id     = ray.id;
+      flags  = ray.flags;
+
+      geomID = RTC_INVALID_GEOMETRY_ID;
+
+      return *this;
+    }
+
+    /* Checks a recorded hit (nothing is checked without one); throws RTC_ERROR_UNKNOWN on the first implausible member */
+    __forceinline void verifyHit() const
+    {
+      if (geomID == RTC_INVALID_GEOMETRY_ID) return;
+      const bool vt = (abs(tfar) <= FLT_LARGE) || (tfar == float(neg_inf));
+      const bool vu = (abs(u) <= FLT_LARGE);
+      const bool vv = (abs(v) <= FLT_LARGE); // checks v; the previous code tested u twice, so "invalid v" could never fire
+      const bool vnx = abs(Ng.x) <= FLT_LARGE;
+      const bool vny = abs(Ng.y) <= FLT_LARGE;
+      const bool vnz = abs(Ng.z) <= FLT_LARGE;
+      if (!vt) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid t");
+      if (!vu) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid u");
+      if (!vv) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid v");
+      if (!vnx) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.x");
+      if (!vny) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.y");
+      if (!vnz) throw_RTCError(RTC_ERROR_UNKNOWN, "invalid Ng.z");
+    }
+
+    /* Hit data */
+    Vec3f Ng;            // not normalized geometry normal
+    float u;             // barycentric u coordinate of hit
+    float v;             // barycentric v coordinate of hit
+    unsigned int primID; // primitive ID
+    unsigned int geomID; // geometry ID
+    unsigned int instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID
+  };
+
+  /* Converts ray packet to single rays */
+  template<int K>
+  __forceinline void RayK<K>::get(RayK<1>* ray) const
+  {
+    /* element i of the packet is written to ray[i], for i = 0..K-1. Every element
+       goes through the per-element overload, which copies the same twelve scalars
+       (org, tnear, dir, time, tfar, mask, id, flags). */
+    // FIXME: use SIMD transpose
+    for (size_t lane = 0; lane < K; lane++)
+      get(lane, ray[lane]);
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::get(RayHitK<1>* ray) const
+  {
+    // writes element k of this packet to ray[k]; FIXME: use SIMD transpose
+    RayHitK<1>* out = ray;
+    for (size_t lane = 0; lane < K; lane++, out++) get(lane, *out);
+  }
+
+  template<int K>
+  __forceinline void RayK<K>::get(size_t i, RayK<1>& ray) const
+  { /* Extracts element i of the ray packet into a single ray */
+    ray.org.x = org.x[i]; ray.org.y = org.y[i]; ray.org.z = org.z[i];
+    ray.dir.x = dir.x[i]; ray.dir.y = dir.y[i]; ray.dir.z = dir.z[i];
+    ray.tnear() = tnear()[i]; ray.time() = time()[i]; ray.tfar = tfar[i];
+    ray.mask = mask[i]; ray.id = id[i]; ray.flags = flags[i];
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::get(size_t i, RayHitK<1>& ray) const
+  {
+    /* ray part: the base class overload extracts org, tnear, dir, time, tfar, mask, id and flags of element i */
+    RayK<K>::get(i, ray);
+
+    /* hit part of element i */
+    ray.Ng.x = Ng.x[i]; ray.Ng.y = Ng.y[i]; ray.Ng.z = Ng.z[i];
+    ray.u = u[i]; ray.v = v[i];
+    ray.primID = primID[i]; ray.geomID = geomID[i];
+    instance_id_stack::copy(instID, ray.instID, i);
+  }
+
+  /* Converts single rays to ray packet: ray[k] becomes element k, for k = 0..K-1 */
+  template<int K>
+  __forceinline void RayK<K>::set(const RayK<1>* ray)
+  {
+    // FIXME: use SIMD transpose
+    const RayK<1>* in = ray;
+    for (size_t lane = 0; lane < K; lane++, in++) set(lane, *in);
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::set(const RayHitK<1>* ray)
+  {
+    // stores ray[k] as element k of this packet; FIXME: use SIMD transpose
+    const RayHitK<1>* in = ray;
+    for (size_t lane = 0; lane < K; lane++, in++) set(lane, *in);
+  }
+
+  template<int K>
+  __forceinline void RayK<K>::set(size_t i, const RayK<1>& ray)
+  { /* inserts a single ray into element i of the ray packet */
+    org.x[i] = ray.org.x; org.y[i] = ray.org.y; org.z[i] = ray.org.z;
+    dir.x[i] = ray.dir.x; dir.y[i] = ray.dir.y; dir.z[i] = ray.dir.z;
+    tnear()[i] = ray.tnear(); time()[i] = ray.time(); tfar[i] = ray.tfar;
+    mask[i] = ray.mask; id[i] = ray.id; flags[i] = ray.flags;
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::set(size_t i, const RayHitK<1>& ray)
+  {
+    /* ray part: the base class overload stores org, tnear, dir, time, tfar, mask, id and flags into element i */
+    RayK<K>::set(i, ray);
+
+    /* hit part of element i */
+    Ng.x[i] = ray.Ng.x; Ng.y[i] = ray.Ng.y; Ng.z[i] = ray.Ng.z;
+    u[i] = ray.u; v[i] = ray.v;
+    primID[i] = ray.primID; geomID[i] = ray.geomID;
+    instance_id_stack::copy(ray.instID, instID, i);
+  }
+
+  template<int K>
+  __forceinline void RayK<K>::copy(size_t dest, size_t source)
+  { /* copies ray packet element 'source' onto element 'dest' of the same packet */
+    org.x[dest] = org.x[source]; org.y[dest] = org.y[source]; org.z[dest] = org.z[source];
+    dir.x[dest] = dir.x[source]; dir.y[dest] = dir.y[source]; dir.z[dest] = dir.z[source];
+    tnear()[dest] = tnear()[source]; time()[dest] = time()[source]; tfar[dest] = tfar[source];
+    mask[dest] = mask[source]; id[dest] = id[source]; flags[dest] = flags[source];
+  }
+
+  template<int K>
+  __forceinline void RayHitK<K>::copy(size_t dest, size_t source)
+  {
+    /* ray part: the base class overload copies org, tnear, dir, time, tfar, mask, id and flags */
+    RayK<K>::copy(dest, source);
+
+    /* hit part */
+    Ng.x[dest] = Ng.x[source]; Ng.y[dest] = Ng.y[source]; Ng.z[dest] = Ng.z[source];
+    u[dest] = u[source]; v[dest] = v[source];
+    primID[dest] = primID[source]; geomID[dest] = geomID[source];
+    instance_id_stack::copy(instID, instID, source, dest);
+  }
+
+  /* Shortcuts */
+  typedef RayK<1>  Ray;
+  typedef RayK<4>  Ray4;
+  typedef RayK<8>  Ray8;
+  typedef RayK<16> Ray16;
+  struct RayN;
+
+  typedef RayHitK<1>  RayHit;
+  typedef RayHitK<4>  RayHit4;
+  typedef RayHitK<8>  RayHit8;
+  typedef RayHitK<16> RayHit16;
+  struct RayHitN;
+
+  template<int K, bool intersect> // maps (packet size, intersect flag) to a ray type; only the two partial specializations below are defined
+  struct RayTypeHelper;
+
+  template<int K>
+  struct RayTypeHelper<K, true>
+  {
+    typedef RayHitK<K> Ty; // intersect == true: ray with hit members
+  };
+
+  template<int K>
+  struct RayTypeHelper<K, false>
+  {
+    typedef RayK<K> Ty; // intersect == false: plain ray
+  };
+
+  template<bool intersect>
+  using RayType = typename RayTypeHelper<1, intersect>::Ty; // single-ray shortcut: RayHit or Ray
+
+  template<int K, bool intersect>
+  using RayTypeK = typename RayTypeHelper<K, intersect>::Ty; // packet shortcut: RayHitK<K> or RayK<K>
+
+  /* Outputs ray to stream, one member per line between braces */
+  template<int K>
+  __forceinline embree_ostream operator <<(embree_ostream cout, const RayK<K>& ray)
+  {
+    cout << "{ " << embree_endl;
+    cout << "  org = " << ray.org << embree_endl;
+    cout << "  dir = " << ray.dir << embree_endl;
+    cout << "  near = " << ray.tnear() << embree_endl;
+    cout << "  far = " << ray.tfar << embree_endl;
+    cout << "  time = " << ray.time() << embree_endl;
+    cout << "  mask = " << ray.mask << embree_endl;
+    cout << "  id = " << ray.id << embree_endl;
+    cout << "  flags = " << ray.flags << embree_endl;
+    return cout << "}";
+  }
+
+  template<int K> // outputs ray and hit to stream, one member per line between braces
+  __forceinline embree_ostream operator <<(embree_ostream cout, const RayHitK<K>& ray)
+  {
+    cout << "{ " << embree_endl
+         << "  org = " << ray.org << embree_endl
+         << "  dir = " << ray.dir << embree_endl
+         << "  near = " << ray.tnear() << embree_endl
+         << "  far = " << ray.tfar << embree_endl
+         << "  time = " << ray.time() << embree_endl
+         << "  mask = " << ray.mask << embree_endl
+         << "  id = " << ray.id << embree_endl
+         << "  flags = " << ray.flags << embree_endl
+         << "  Ng = " << ray.Ng << embree_endl // line break was missing, which glued "u = ..." onto the Ng line
+         << "  u = " << ray.u <<  embree_endl
+         << "  v = " << ray.v << embree_endl
+         << "  primID = " << ray.primID <<  embree_endl
+         << "  geomID = " << ray.geomID << embree_endl
+         << "  instID =";
+    for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l) // all instance levels share one line
+    {
+      cout << " " << ray.instID[l];
+    }
+    cout << embree_endl;
+    return cout << "}";
+  }
+
+  struct RayStreamSOA
+  {
+    __forceinline RayStreamSOA(void* rays, size_t N) // stores the pointer and N only; no ray data is copied
+      : ptr((char*)rays), N(N) {} // cast to char* so that the accessors can address the buffer in bytes
+
+    /* ray data access functions: member number m starts at byte m*4*N of the buffer, offset is an additional byte offset */
+    __forceinline float* org_x(size_t offset = 0) { return (float*)(ptr + 0*4*N + offset); }  // x coordinate of ray origin
+    __forceinline float* org_y(size_t offset = 0) { return (float*)(ptr + 1*4*N + offset); }  // y coordinate of ray origin
+    __forceinline float* org_z(size_t offset = 0) { return (float*)(ptr + 2*4*N + offset); }  // z coordinate of ray origin
+    __forceinline float* tnear(size_t offset = 0) { return (float*)(ptr + 3*4*N + offset); }  // start of ray segment
+
+    __forceinline float* dir_x(size_t offset = 0) { return (float*)(ptr + 4*4*N + offset); }  // x coordinate of ray direction
+    __forceinline float* dir_y(size_t offset = 0) { return (float*)(ptr + 5*4*N + offset); }  // y coordinate of ray direction
+    __forceinline float* dir_z(size_t offset = 0) { return (float*)(ptr + 6*4*N + offset); }  // z coordinate of ray direction
+    __forceinline float* time (size_t offset = 0) { return (float*)(ptr + 7*4*N + offset); }  // time of this ray for motion blur
+
+    __forceinline float* tfar (size_t offset = 0) { return (float*)(ptr + 8*4*N + offset); }  // end of ray segment (set to hit distance)
+    __forceinline int*   mask (size_t offset = 0) { return (int*)(ptr + 9*4*N + offset);   }  // used to mask out objects during traversal (optional)
+    __forceinline int*   id   (size_t offset = 0) { return (int*)(ptr + 10*4*N + offset);  }  // id
+    __forceinline int*   flags(size_t offset = 0) { return (int*)(ptr + 11*4*N + offset);  }  // flags
+
+    /* hit data access functions: same addressing as the ray members, continuing at member number 12 */
+    __forceinline float* Ng_x(size_t offset = 0) { return (float*)(ptr + 12*4*N + offset); }  // x coordinate of geometry normal
+    __forceinline float* Ng_y(size_t offset = 0) { return (float*)(ptr + 13*4*N + offset); }  // y coordinate of geometry normal
+    __forceinline float* Ng_z(size_t offset = 0) { return (float*)(ptr + 14*4*N + offset); }  // z coordinate of geometry normal
+
+    __forceinline float* u(size_t offset = 0) { return (float*)(ptr + 15*4*N + offset); }     // barycentric u coordinate of hit
+    __forceinline float* v(size_t offset = 0) { return (float*)(ptr + 16*4*N + offset); }     // barycentric v coordinate of hit
+
+    __forceinline unsigned int* primID(size_t offset = 0) { return (unsigned int*)(ptr + 17*4*N + offset); }   // primitive ID
+    __forceinline unsigned int* geomID(size_t offset = 0) { return (unsigned int*)(ptr + 18*4*N + offset); }   // geometry ID
+    __forceinline unsigned int* instID(size_t level, size_t offset = 0) { return (unsigned int*)(ptr + 19*4*N + level*4*N + offset); }   // instance ID, one member per level
+
+    __forceinline Ray getRayByOffset(size_t offset) // assembles the single ray found at the given byte offset
+    {
+      Ray ray;
+      ray.org.x   = *org_x(offset);
+      ray.org.y   = *org_y(offset);
+      ray.org.z   = *org_z(offset);
+      ray.dir.x   = *dir_x(offset);
+      ray.dir.y   = *dir_y(offset);
+      ray.dir.z   = *dir_z(offset);
+      ray.tnear() = *tnear(offset);
+      ray.tfar    = *tfar(offset);
+      ray.time()  = *time(offset);
+      ray.mask    = *mask(offset);
+      ray.id      = *id(offset);
+      ray.flags   = *flags(offset);
+      return ray;
+    }
+
+    template<int K> // loads K consecutive rays starting at the given byte offset, without a validity mask
+    __forceinline RayK<K> getRayByOffset(size_t offset)
+    {
+      RayK<K> ray;
+      ray.org.x   = vfloat<K>::loadu(org_x(offset));
+      ray.org.y   = vfloat<K>::loadu(org_y(offset));
+      ray.org.z   = vfloat<K>::loadu(org_z(offset));
+      ray.tnear() = vfloat<K>::loadu(tnear(offset)); // tnear is an accessor on RayK<K>; assigning to "ray.tnear" cannot compile
+      ray.dir.x   = vfloat<K>::loadu(dir_x(offset));
+      ray.dir.y   = vfloat<K>::loadu(dir_y(offset));
+      ray.dir.z   = vfloat<K>::loadu(dir_z(offset));
+      ray.time()  = vfloat<K>::loadu(time(offset));  // same for time
+      ray.tfar    = vfloat<K>::loadu(tfar(offset));
+      ray.mask    = vint<K>::loadu(mask(offset));
+      ray.id      = vint<K>::loadu(id(offset));
+      ray.flags   = vint<K>::loadu(flags(offset));
+      return ray;
+    }
+
+    template<int K> // loads up to K rays starting at the given byte offset; 'valid' is passed to every masked load
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset)
+    {
+      RayK<K> ray;
+      ray.org.x   = vfloat<K>::loadu(valid, org_x(offset));
+      ray.org.y   = vfloat<K>::loadu(valid, org_y(offset));
+      ray.org.z   = vfloat<K>::loadu(valid, org_z(offset));
+      ray.tnear() = vfloat<K>::loadu(valid, tnear(offset));
+      ray.dir.x   = vfloat<K>::loadu(valid, dir_x(offset));
+      ray.dir.y   = vfloat<K>::loadu(valid, dir_y(offset));
+      ray.dir.z   = vfloat<K>::loadu(valid, dir_z(offset));
+      ray.time()  = vfloat<K>::loadu(valid, time(offset));
+      ray.tfar  = vfloat<K>::loadu(valid, tfar(offset));
+
+#if !defined(__AVX__)
+      /* SSE: some ray members must be loaded with scalar instructions to ensure that we don't cause memory faults,
+         because the SSE masked loads always access the entire vector */
+      if (unlikely(!all(valid)))
+      {
+        ray.mask  = zero; // entries that are not valid stay zero
+        ray.id    = zero;
+        ray.flags = zero;
+
+        for (size_t k = 0; k < K; k++)
+        {
+          if (likely(valid[k])) // only valid entries are read from memory
+          {
+            ray.mask[k]  = mask(offset)[k];
+            ray.id[k]    = id(offset)[k];
+            ray.flags[k] = flags(offset)[k];
+          }
+        }
+      }
+      else // the block after #endif is this else branch without AVX, and the only code path with AVX
+#endif
+      {
+        ray.mask  = vint<K>::loadu(valid, mask(offset));
+        ray.id    = vint<K>::loadu(valid, id(offset));
+        ray.flags = vint<K>::loadu(valid, flags(offset));
+      }
+
+      return ray;
+    }
+
+    template<int K> // writes tfar and the hit members back to the stream, only for rays that recorded a hit
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray)
+    {
+      /* 
+       * valid_i: stores which of the input rays exist (do not access nonexistent rays!)
+       * valid:   stores which of the rays actually hit something.
+       */
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid))) // nothing is written when no ray hit anything
+      {
+        vfloat<K>::storeu(valid, tfar(offset), ray.tfar);
+        vfloat<K>::storeu(valid, Ng_x(offset), ray.Ng.x);
+        vfloat<K>::storeu(valid, Ng_y(offset), ray.Ng.y);
+        vfloat<K>::storeu(valid, Ng_z(offset), ray.Ng.z);
+        vfloat<K>::storeu(valid, u(offset), ray.u);
+        vfloat<K>::storeu(valid, v(offset), ray.v);
+
+#if !defined(__AVX__)
+        /* SSE: some ray members must be stored with scalar instructions to ensure that we don't cause memory faults,
+           because the SSE masked stores always access the entire vector */
+        if (unlikely(!all(valid_i))) // the decision uses valid_i (existing rays), the stores below use valid (rays with a hit)
+        {
+          for (size_t k = 0; k < K; k++)
+          {
+            if (likely(valid[k]))
+            {
+              primID(offset)[k] = ray.primID[k];
+              geomID(offset)[k] = ray.geomID[k];
+
+              instID(0, offset)[k] = ray.instID[0][k];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+              for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l) // stops after the first level whose predecessor is invalid
+                instID(l, offset)[k] = ray.instID[l][k];
+#endif
+            }
+          }
+        }
+        else // the block after #endif is this else branch without AVX, and the only code path with AVX
+#endif
+        {
+          vuint<K>::storeu(valid, primID(offset), ray.primID);
+          vuint<K>::storeu(valid, geomID(offset), ray.geomID);
+
+          vuint<K>::storeu(valid, instID(0, offset), ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l) // continues while any hit ray has a valid previous level
+            vuint<K>::storeu(valid, instID(l, offset), ray.instID[l]);
+#endif
+        }
+      }
+    }
+
+    /* Ray-only write-back: stores tfar for every lane that is set in valid_i and
+     * whose tfar is negative. All other lanes are left untouched.
+     * NOTE(review): a negative tfar presumably marks an occluded ray - confirm. */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray)
+    {
+      vbool<K> write_mask = valid_i & (ray.tfar < 0.0f);
+      if (unlikely(!any(write_mask)))
+        return;
+
+      vfloat<K>::storeu(write_mask, tfar(offset), ray.tfar);
+    }
+
+    /* Returns a 3-bit code built from the sign of the first direction stored at
+     * `offset`: bit 0 is set when dir.x < 0, bit 1 when dir.y < 0, bit 2 when dir.z < 0. */
+    __forceinline size_t getOctantByOffset(size_t offset)
+    {
+      size_t octantID = 0;
+      if (dir_x(offset)[0] < 0.0f) octantID |= 1;
+      if (dir_y(offset)[0] < 0.0f) octantID |= 2;
+      if (dir_z(offset)[0] < 0.0f) octantID |= 4;
+      return octantID;
+    }
+
+    /* A ray counts as valid when its segment is non-empty, i.e. tnear <= tfar for
+     * the first ray stored at `offset`. */
+    __forceinline bool isValidByOffset(size_t offset)
+    {
+      return tnear(offset)[0] <= tfar(offset)[0];
+    }
+
+    /* Builds a ray packet from K individually addressed rays: lane k is read at
+     * offset[k]. Only lanes enabled in `valid` are read; on the non-AVX2 path the
+     * remaining lanes are zero in every field. */
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+    {
+      RayK<K> ray;
+
+#if defined(__AVX2__)
+      /* gather path: one masked gather per field */
+      ray.org.x   = vfloat<K>::template gather<1>(valid, org_x(), offset);
+      ray.org.y   = vfloat<K>::template gather<1>(valid, org_y(), offset);
+      ray.org.z   = vfloat<K>::template gather<1>(valid, org_z(), offset);
+      ray.tnear() = vfloat<K>::template gather<1>(valid, tnear(), offset);
+
+      ray.dir.x   = vfloat<K>::template gather<1>(valid, dir_x(), offset);
+      ray.dir.y   = vfloat<K>::template gather<1>(valid, dir_y(), offset);
+      ray.dir.z   = vfloat<K>::template gather<1>(valid, dir_z(), offset);
+      ray.time()  = vfloat<K>::template gather<1>(valid, time(), offset);
+
+      ray.tfar    = vfloat<K>::template gather<1>(valid, tfar(), offset);
+      ray.mask    = vint<K>::template gather<1>(valid, mask(), offset);
+      ray.id      = vint<K>::template gather<1>(valid, id(), offset);
+      ray.flags   = vint<K>::template gather<1>(valid, flags(), offset);
+#else
+      /* scalar path: clear every field, then fill the enabled lanes one by one */
+      ray.org     = zero;
+      ray.tnear() = zero;
+      ray.dir     = zero;
+      ray.time()  = zero;
+      ray.tfar    = zero;
+      ray.mask    = zero;
+      ray.id      = zero;
+      ray.flags   = zero;
+
+      for (size_t lane = 0; lane < K; lane++)
+      {
+        if (unlikely(!valid[lane]))
+          continue;
+
+        const size_t lane_ofs = offset[lane];
+
+        ray.org.x[lane]   = *org_x(lane_ofs);
+        ray.org.y[lane]   = *org_y(lane_ofs);
+        ray.org.z[lane]   = *org_z(lane_ofs);
+        ray.tnear()[lane] = *tnear(lane_ofs);
+        ray.dir.x[lane]   = *dir_x(lane_ofs);
+        ray.dir.y[lane]   = *dir_y(lane_ofs);
+        ray.dir.z[lane]   = *dir_z(lane_ofs);
+        ray.time()[lane]  = *time(lane_ofs);
+        ray.tfar[lane]    = *tfar(lane_ofs);
+        ray.mask[lane]    = *mask(lane_ofs);
+        ray.id[lane]      = *id(lane_ofs);
+        ray.flags[lane]   = *flags(lane_ofs);
+      }
+#endif
+
+      return ray;
+    }
+
+    /* Writes hit data back to K individually addressed rays: lane k goes to
+     * offset[k]. A lane is written only if it is set in valid_i and its geomID
+     * differs from RTC_INVALID_GEOMETRY_ID. */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+    {
+      vbool<K> hit_mask = valid_i & (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+      if (unlikely(!any(hit_mask)))
+        return;
+
+#if defined(__AVX512F__)
+      /* scatter path: one masked scatter per field */
+      vfloat<K>::template scatter<1>(hit_mask, tfar(), offset, ray.tfar);
+      vfloat<K>::template scatter<1>(hit_mask, Ng_x(), offset, ray.Ng.x);
+      vfloat<K>::template scatter<1>(hit_mask, Ng_y(), offset, ray.Ng.y);
+      vfloat<K>::template scatter<1>(hit_mask, Ng_z(), offset, ray.Ng.z);
+      vfloat<K>::template scatter<1>(hit_mask, u(), offset, ray.u);
+      vfloat<K>::template scatter<1>(hit_mask, v(), offset, ray.v);
+      vuint<K>::template scatter<1>(hit_mask, primID(), offset, ray.primID);
+      vuint<K>::template scatter<1>(hit_mask, geomID(), offset, ray.geomID);
+
+      vuint<K>::template scatter<1>(hit_mask, instID(0), offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+      /* keep going while at least one written lane still has a valid parent level */
+      for (unsigned level = 1; level < RTC_MAX_INSTANCE_LEVEL_COUNT && any(hit_mask & (ray.instID[level-1] != RTC_INVALID_GEOMETRY_ID)); ++level)
+        vuint<K>::template scatter<1>(hit_mask, instID(level), offset, ray.instID[level]);
+#endif
+#else
+      /* scalar path: visit each set bit of the mask; bscf returns its index and
+         updates `pending` so that the loop terminates */
+      for (size_t pending = movemask(hit_mask); pending != 0; )
+      {
+        const size_t lane     = bscf(pending);
+        const size_t lane_ofs = offset[lane];
+
+        *tfar(lane_ofs) = ray.tfar[lane];
+
+        *Ng_x(lane_ofs)   = ray.Ng.x[lane];
+        *Ng_y(lane_ofs)   = ray.Ng.y[lane];
+        *Ng_z(lane_ofs)   = ray.Ng.z[lane];
+        *u(lane_ofs)      = ray.u[lane];
+        *v(lane_ofs)      = ray.v[lane];
+        *primID(lane_ofs) = ray.primID[lane];
+        *geomID(lane_ofs) = ray.geomID[lane];
+
+        *instID(0, lane_ofs) = ray.instID[0][lane];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+        /* deeper levels are copied up to and including the first invalid entry */
+        for (unsigned level = 1; level < RTC_MAX_INSTANCE_LEVEL_COUNT && ray.instID[level-1][lane] != RTC_INVALID_GEOMETRY_ID; ++level)
+          *instID(level, lane_ofs) = ray.instID[level][lane];
+#endif
+      }
+#endif
+    }
+
+    /* Ray-only write-back for individually addressed rays: stores tfar at offset[k]
+     * for every lane k that is set in valid_i and whose tfar is negative.
+     * NOTE(review): a negative tfar presumably marks an occluded ray - confirm. */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+    {
+      vbool<K> write_mask = valid_i & (ray.tfar < 0.0f);
+      if (unlikely(!any(write_mask)))
+        return;
+
+#if defined(__AVX512F__)
+      vfloat<K>::template scatter<1>(write_mask, tfar(), offset, ray.tfar);
+#else
+      /* scalar path: visit each set bit of the mask */
+      for (size_t pending = movemask(write_mask); pending != 0; )
+      {
+        const size_t lane     = bscf(pending);
+        const size_t lane_ofs = offset[lane];
+
+        *tfar(lane_ofs) = ray.tfar[lane];
+      }
+#endif
+    }
+
+    char* __restrict__ ptr;
+    size_t N;
+  };
+
+  /* RayStreamSOA whose storage is a byte buffer embedded in the object itself.
+   * MAX_K is the compile-time capacity in rays; the constructor argument K is the
+   * number of rays actually used and must not exceed MAX_K (checked by assert). */
+  template<size_t MAX_K>
+  struct StackRayStreamSOA : public RayStreamSOA
+  {
+    __forceinline StackRayStreamSOA(size_t K)
+      : RayStreamSOA(data, K) { assert(K <= MAX_K); }
+
+    /* The buffer is sized in whole RayHit4 packets. Round the packet count up so
+       that a MAX_K which is not a multiple of 4 still gets room for its last,
+       partially filled packet; plain MAX_K / 4 truncates and would under-allocate.
+       For multiples of 4 the size is unchanged. */
+    char data[(MAX_K + 3) / 4 * sizeof(RayHit4)];
+  };
+
+
+  /* Ray stream in "structure of pointers" form: every ray/hit field has its own base
+   * pointer, and an individual ray is addressed by adding a byte offset to each of
+   * these pointers. Several pointers are optional (see the member list at the end);
+   * they are tested for null before use. */
+  struct RayStreamSOP
+  {
+    /* Points every field pointer at the corresponding member of t. */
+    template<class T>
+    __forceinline void init(T& t)
+    {
+      org_x  = (float*)&t.org.x;
+      org_y  = (float*)&t.org.y;
+      org_z  = (float*)&t.org.z;
+      tnear  = (float*)&t.tnear;
+      dir_x  = (float*)&t.dir.x;
+      dir_y  = (float*)&t.dir.y;
+      dir_z  = (float*)&t.dir.z;
+      time   = (float*)&t.time;
+      tfar   = (float*)&t.tfar;
+      mask   = (unsigned int*)&t.mask;
+      id     = (unsigned int*)&t.id;
+      flags  = (unsigned int*)&t.flags;
+
+      Ng_x   = (float*)&t.Ng.x;
+      Ng_y   = (float*)&t.Ng.y;
+      Ng_z   = (float*)&t.Ng.z;
+      u      = (float*)&t.u;
+      v      = (float*)&t.v;
+      primID = (unsigned int*)&t.primID;
+      geomID = (unsigned int*)&t.geomID;
+
+      for (unsigned l = 0; l < RTC_MAX_INSTANCE_LEVEL_COUNT; ++l)
+        instID[l] = (unsigned int*)&t.instID[l];
+    }
+
+    /* Loads the single ray stored at the given byte offset. Fields whose pointer is
+       null fall back to defaults: tnear/time -> 0.0f, mask/id/flags -> -1. */
+    __forceinline Ray getRayByOffset(size_t offset)
+    {
+      Ray ray;
+      ray.org.x   = *(float* __restrict__)((char*)org_x + offset);
+      ray.org.y   = *(float* __restrict__)((char*)org_y + offset);
+      ray.org.z   = *(float* __restrict__)((char*)org_z + offset);
+      ray.dir.x   = *(float* __restrict__)((char*)dir_x + offset);
+      ray.dir.y   = *(float* __restrict__)((char*)dir_y + offset);
+      ray.dir.z   = *(float* __restrict__)((char*)dir_z + offset);
+      ray.tfar    = *(float* __restrict__)((char*)tfar + offset);
+      ray.tnear() = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f;
+      ray.time()  = time ? *(float* __restrict__)((char*)time + offset) : 0.0f;
+      ray.mask    = mask ? *(unsigned int* __restrict__)((char*)mask + offset) : -1;
+      ray.id      = id ? *(unsigned int* __restrict__)((char*)id + offset) : -1;
+      ray.flags   = flags ? *(unsigned int* __restrict__)((char*)flags + offset) : -1;
+      return ray;
+    }
+
+    /* Loads a packet of K consecutive rays starting at the given byte offset, using
+       masked unaligned loads. Null field pointers yield the same defaults as the
+       single-ray overload. */
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, size_t offset)
+    {
+      RayK<K> ray;
+      ray.org.x   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_x + offset));
+      ray.org.y   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_y + offset));
+      ray.org.z   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)org_z + offset));
+      ray.dir.x   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset));
+      ray.dir.y   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset));
+      ray.dir.z   = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset));
+      ray.tfar    = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset));
+      ray.tnear() = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f;
+      ray.time()  = time ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)time + offset)) : 0.0f;
+      ray.mask    = mask ? vint<K>::loadu(valid, (const void* __restrict__)((char*)mask + offset)) : -1;
+      ray.id      = id ? vint<K>::loadu(valid, (const void* __restrict__)((char*)id + offset)) : -1;
+      ray.flags   = flags ? vint<K>::loadu(valid, (const void* __restrict__)((char*)flags + offset)) : -1;
+      return ray;
+    }
+
+    /* Loads only the direction of K consecutive rays starting at the given byte offset. */
+    template<int K>
+    __forceinline Vec3vf<K> getDirByOffset(const vbool<K>& valid, size_t offset)
+    {
+      Vec3vf<K> dir;
+      dir.x = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_x + offset));
+      dir.y = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_y + offset));
+      dir.z = vfloat<K>::loadu(valid, (float* __restrict__)((char*)dir_z + offset));
+      return dir;
+    }
+
+    /* Writes the hit of a single ray back at the given byte offset. Nothing is
+       written when the ray recorded no hit (geomID == RTC_INVALID_GEOMETRY_ID). */
+    __forceinline void setHitByOffset(size_t offset, const RayHit& ray)
+    {
+      if (ray.geomID != RTC_INVALID_GEOMETRY_ID)
+      {
+        *(float* __restrict__)((char*)tfar + offset) = ray.tfar;
+
+        if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + offset) = ray.Ng.x;
+        if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + offset) = ray.Ng.y;
+        if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + offset) = ray.Ng.z;
+        *(float* __restrict__)((char*)u + offset) = ray.u;
+        *(float* __restrict__)((char*)v + offset) = ray.v;
+        *(unsigned int* __restrict__)((char*)geomID + offset) = ray.geomID;
+        *(unsigned int* __restrict__)((char*)primID + offset) = ray.primID;
+
+        if (likely(instID[0])) {
+          *(unsigned int* __restrict__)((char*)instID[0] + offset) = ray.instID[0];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          /* Deeper levels are copied up to and including the first invalid entry.
+             The instID pointers are optional, so each level's pointer is checked
+             before it is written instead of relying on instID[0] alone. */
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && instID[l] && ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID; ++l)
+            *(unsigned int* __restrict__)((char*)instID[l] + offset) = ray.instID[l];
+#endif
+        }
+      }
+    }
+
+    /* Ray-only write-back for a single ray: unconditionally stores tfar. */
+    __forceinline void setHitByOffset(size_t offset, const Ray& ray)
+    {
+      *(float* __restrict__)((char*)tfar + offset) = ray.tfar;
+    }
+
+    /* Writes the hits of K consecutive rays starting at the given byte offset. A lane
+       is written only if it is set in valid_i and its geomID is valid. */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+        vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar);
+
+        if (likely(Ng_x)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_x + offset), ray.Ng.x);
+        if (likely(Ng_y)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_y + offset), ray.Ng.y);
+        if (likely(Ng_z)) vfloat<K>::storeu(valid, (float* __restrict__)((char*)Ng_z + offset), ray.Ng.z);
+        vfloat<K>::storeu(valid, (float* __restrict__)((char*)u + offset), ray.u);
+        vfloat<K>::storeu(valid, (float* __restrict__)((char*)v + offset), ray.v);
+        vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)primID + offset), ray.primID);
+        vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)geomID + offset), ray.geomID);
+
+        if (likely(instID[0])) {
+          vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[0] + offset), ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          /* Continue while some written lane still has a valid parent level; each
+             level's (optional) pointer is checked before it is written. */
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && instID[l] && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+            vuint<K>::storeu(valid, (unsigned int* __restrict__)((char*)instID[l] + offset), ray.instID[l]);
+#endif
+        }
+      }
+    }
+
+    /* Ray-only write-back for K consecutive rays: stores tfar for lanes that are set
+       in valid_i and whose tfar is negative. */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, size_t offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+        vfloat<K>::storeu(valid, (float* __restrict__)((char*)tfar + offset), ray.tfar);
+    }
+
+    /* Returns a 3-bit code from the direction signs of the ray at the given byte
+       offset: bit 0 for dir.x < 0, bit 1 for dir.y < 0, bit 2 for dir.z < 0. */
+    __forceinline size_t getOctantByOffset(size_t offset)
+    {
+      const float dx = *(float* __restrict__)((char*)dir_x + offset);
+      const float dy = *(float* __restrict__)((char*)dir_y + offset);
+      const float dz = *(float* __restrict__)((char*)dir_z + offset);
+      const size_t octantID = (dx < 0.0f ? 1 : 0) + (dy < 0.0f ? 2 : 0) + (dz < 0.0f ? 4 : 0);
+      return octantID;
+    }
+
+    /* True when the ray at the given byte offset has tnear <= tfar; a null tnear
+       pointer is treated as tnear == 0. */
+    __forceinline bool isValidByOffset(size_t offset)
+    {
+      const float nnear = tnear ? *(float* __restrict__)((char*)tnear + offset) : 0.0f;
+      const float ffar  = *(float* __restrict__)((char*)tfar + offset);
+      return nnear <= ffar;
+    }
+
+    /* Packet version of isValidByOffset for K consecutive rays: returns the per-lane
+       result of tnear <= tfar. The result is not combined with `valid`, which is
+       only used as the load mask. */
+    template<int K>
+    __forceinline vbool<K> isValidByOffset(const vbool<K>& valid, size_t offset)
+    {
+      const vfloat<K> nnear = tnear ? vfloat<K>::loadu(valid, (float* __restrict__)((char*)tnear + offset)) : 0.0f;
+      const vfloat<K> ffar  = vfloat<K>::loadu(valid, (float* __restrict__)((char*)tfar + offset));
+      return nnear <= ffar;
+    }
+
+    /* Builds a ray packet from K individually addressed rays: lane k is read at byte
+       offset offset[k]. Only lanes enabled in `valid` are read; on the non-AVX2 path
+       the remaining lanes are zero in every field. */
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+    {
+      RayK<K> ray;
+
+#if defined(__AVX2__)
+      ray.org.x   = vfloat<K>::template gather<1>(valid, org_x, offset);
+      ray.org.y   = vfloat<K>::template gather<1>(valid, org_y, offset);
+      ray.org.z   = vfloat<K>::template gather<1>(valid, org_z, offset);
+      ray.dir.x   = vfloat<K>::template gather<1>(valid, dir_x, offset);
+      ray.dir.y   = vfloat<K>::template gather<1>(valid, dir_y, offset);
+      ray.dir.z   = vfloat<K>::template gather<1>(valid, dir_z, offset);
+      ray.tfar    = vfloat<K>::template gather<1>(valid, tfar, offset);
+      ray.tnear() = tnear ? vfloat<K>::template gather<1>(valid, tnear, offset) : vfloat<K>(zero);
+      ray.time()  = time ? vfloat<K>::template gather<1>(valid, time, offset) : vfloat<K>(zero);
+      ray.mask    = mask ? vint<K>::template gather<1>(valid, (int*)mask, offset) : vint<K>(-1);
+      ray.id      = id ? vint<K>::template gather<1>(valid, (int*)id, offset) : vint<K>(-1);
+      ray.flags   = flags ? vint<K>::template gather<1>(valid, (int*)flags, offset) : vint<K>(-1);
+#else
+      /* scalar path: clear every field, then fill the enabled lanes one by one */
+      ray.org     = zero;
+      ray.tnear() = zero;
+      ray.dir     = zero;
+      ray.tfar    = zero;
+      ray.time()  = zero;
+      ray.mask    = zero;
+      ray.id      = zero;
+      ray.flags   = zero;
+
+      for (size_t k = 0; k < K; k++)
+      {
+        if (likely(valid[k]))
+        {
+          const size_t ofs = offset[k];
+
+          ray.org.x[k]   = *(float* __restrict__)((char*)org_x + ofs);
+          ray.org.y[k]   = *(float* __restrict__)((char*)org_y + ofs);
+          ray.org.z[k]   = *(float* __restrict__)((char*)org_z + ofs);
+          ray.dir.x[k]   = *(float* __restrict__)((char*)dir_x + ofs);
+          ray.dir.y[k]   = *(float* __restrict__)((char*)dir_y + ofs);
+          ray.dir.z[k]   = *(float* __restrict__)((char*)dir_z + ofs);
+          ray.tfar[k]    = *(float* __restrict__)((char*)tfar + ofs);
+          ray.tnear()[k] = tnear ? *(float* __restrict__)((char*)tnear + ofs) : 0.0f;
+          ray.time()[k]  = time ? *(float* __restrict__)((char*)time + ofs) : 0.0f;
+          ray.mask[k]    = mask ? *(int* __restrict__)((char*)mask + ofs) : -1;
+          ray.id[k]      = id ? *(int* __restrict__)((char*)id + ofs) : -1;
+          ray.flags[k]   = flags ? *(int* __restrict__)((char*)flags + ofs) : -1;
+        }
+      }
+#endif
+
+      return ray;
+    }
+
+    /* Writes hit data back to K individually addressed rays: lane k goes to byte
+       offset offset[k]. A lane is written only if it is set in valid_i and its
+       geomID is valid. */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar);
+
+        if (likely(Ng_x)) vfloat<K>::template scatter<1>(valid, Ng_x, offset, ray.Ng.x);
+        if (likely(Ng_y)) vfloat<K>::template scatter<1>(valid, Ng_y, offset, ray.Ng.y);
+        if (likely(Ng_z)) vfloat<K>::template scatter<1>(valid, Ng_z, offset, ray.Ng.z);
+        vfloat<K>::template scatter<1>(valid, u, offset, ray.u);
+        vfloat<K>::template scatter<1>(valid, v, offset, ray.v);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)geomID, offset, ray.geomID);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)primID, offset, ray.primID);
+
+        if (likely(instID[0])) {
+          vuint<K>::template scatter<1>(valid, (unsigned int*)instID[0], offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+          /* each level's (optional) pointer is checked before it is written */
+          for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && instID[l] && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+            vuint<K>::template scatter<1>(valid, (unsigned int*)instID[l], offset, ray.instID[l]);
+#endif
+        }
+#else
+        /* scalar path: visit each set bit of the mask; bscf returns its index and
+           updates valid_bits so that the loop terminates */
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          const size_t ofs = offset[k];
+
+          *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k];
+
+          if (likely(Ng_x)) *(float* __restrict__)((char*)Ng_x + ofs) = ray.Ng.x[k];
+          if (likely(Ng_y)) *(float* __restrict__)((char*)Ng_y + ofs) = ray.Ng.y[k];
+          if (likely(Ng_z)) *(float* __restrict__)((char*)Ng_z + ofs) = ray.Ng.z[k];
+          *(float* __restrict__)((char*)u + ofs) = ray.u[k];
+          *(float* __restrict__)((char*)v + ofs) = ray.v[k];
+          *(unsigned int* __restrict__)((char*)primID + ofs) = ray.primID[k];
+          *(unsigned int* __restrict__)((char*)geomID + ofs) = ray.geomID[k];
+
+          if (likely(instID[0])) {
+            *(unsigned int* __restrict__)((char*)instID[0] + ofs) = ray.instID[0][k];
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+            /* each level's (optional) pointer is checked before it is written */
+            for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && instID[l] && ray.instID[l-1][k] != RTC_INVALID_GEOMETRY_ID; ++l)
+              *(unsigned int* __restrict__)((char*)instID[l] + ofs) = ray.instID[l][k];
+#endif
+          }
+        }
+#endif
+      }
+    }
+
+    /* Ray-only write-back for individually addressed rays: stores tfar at byte offset
+       offset[k] for every lane k that is set in valid_i and whose tfar is negative. */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, tfar, offset, ray.tfar);
+#else
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          const size_t ofs = offset[k];
+
+          *(float* __restrict__)((char*)tfar + ofs) = ray.tfar[k];
+        }
+#endif
+      }
+    }
+
+    /* ray data */
+    float* __restrict__ org_x; // x coordinate of ray origin
+    float* __restrict__ org_y; // y coordinate of ray origin
+    float* __restrict__ org_z; // z coordinate of ray origin
+    float* __restrict__ tnear; // start of ray segment (optional)
+
+    float* __restrict__ dir_x; // x coordinate of ray direction
+    float* __restrict__ dir_y; // y coordinate of ray direction
+    float* __restrict__ dir_z; // z coordinate of ray direction
+    float* __restrict__ time;  // time of this ray for motion blur (optional)
+
+    float* __restrict__ tfar;  // end of ray segment (set to hit distance)
+    unsigned int* __restrict__ mask;  // used to mask out objects during traversal (optional)
+    unsigned int* __restrict__ id;    // ray ID
+    unsigned int* __restrict__ flags; // ray flags
+
+    /* hit data */
+    float* __restrict__ Ng_x; // x coordinate of geometry normal (optional)
+    float* __restrict__ Ng_y; // y coordinate of geometry normal (optional)
+    float* __restrict__ Ng_z; // z coordinate of geometry normal (optional)
+
+    float* __restrict__ u;    // barycentric u coordinate of hit
+    float* __restrict__ v;    // barycentric v coordinate of hit
+
+    unsigned int* __restrict__ primID; // primitive ID
+    unsigned int* __restrict__ geomID; // geometry ID
+    unsigned int* __restrict__ instID[RTC_MAX_INSTANCE_LEVEL_COUNT]; // instance ID (optional)
+  };
+
+
+  /* Ray stream in "array of structures" form: `ptr` points at the first ray and an
+   * individual ray is addressed by adding a byte offset to it. The hit write-back
+   * functions reinterpret the addressed memory as RayHit. */
+  struct RayStreamAOS
+  {
+    __forceinline RayStreamAOS(void* rays)
+      : ptr((Ray*)rays) {}
+
+    /* Returns a reference to the ray located `offset` bytes after ptr. */
+    __forceinline Ray& getRayByOffset(size_t offset)
+    {
+      return *(Ray*)((char*)ptr + offset);
+    }
+
+    /* Loads K rays at the per-lane byte offsets and transposes them into a packet.
+       Only declared here; specialized outside the struct for particular K. */
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vint<K>& offset);
+
+    /* Masked variant: lanes disabled in `valid` are redirected to offset 0, so they
+       load the first ray of the stream instead of an arbitrary address. The fallback
+       is built as vint<K> so it matches the width of `offset` for every K rather
+       than only for the native vector width. */
+    template<int K>
+    __forceinline RayK<K> getRayByOffset(const vbool<K>& valid, const vint<K>& offset)
+    {
+      const vint<K> valid_offset = select(valid, offset, vint<K>(zero));
+      return getRayByOffset(valid_offset);
+    }
+
+    /* Writes hit data back to K individually addressed rays: lane k goes to the
+       RayHit at byte offset offset[k]. A lane is written only if it is set in
+       valid_i and its geomID differs from RTC_INVALID_GEOMETRY_ID. */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        /* scatter path: the base address of each field is taken from the first ray */
+        vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.x, offset, ray.Ng.x);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.y, offset, ray.Ng.y);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->Ng.z, offset, ray.Ng.z);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->u, offset, ray.u);
+        vfloat<K>::template scatter<1>(valid, &((RayHit*)ptr)->v, offset, ray.v);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->primID, offset, ray.primID);
+        vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->geomID, offset, ray.geomID);
+
+        vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[0], offset, ray.instID[0]);
+#if (RTC_MAX_INSTANCE_LEVEL_COUNT > 1)
+        /* keep going while at least one written lane still has a valid parent level */
+        for (unsigned l = 1; l < RTC_MAX_INSTANCE_LEVEL_COUNT && any(valid & (ray.instID[l-1] != RTC_INVALID_GEOMETRY_ID)); ++l)
+          vuint<K>::template scatter<1>(valid, (unsigned int*)&((RayHit*)ptr)->instID[l], offset, ray.instID[l]);
+#endif
+#else
+        /* scalar path: visit each set bit of the mask; bscf returns its index and
+           updates valid_bits so that the loop terminates */
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          RayHit* __restrict__ ray_k = (RayHit*)((char*)ptr + offset[k]);
+          ray_k->tfar   = ray.tfar[k];
+          ray_k->Ng.x   = ray.Ng.x[k];
+          ray_k->Ng.y   = ray.Ng.y[k];
+          ray_k->Ng.z   = ray.Ng.z[k];
+          ray_k->u      = ray.u[k];
+          ray_k->v      = ray.v[k];
+          ray_k->primID = ray.primID[k];
+          ray_k->geomID = ray.geomID[k];
+
+          /* NOTE(review): presumably copies lane k of the packet's instance ID stack
+             into the scalar hit - confirm against instance_id_stack::copy */
+          instance_id_stack::copy(ray.instID, ray_k->instID, k);
+        }
+#endif
+      }
+    }
+
+    /* Ray-only write-back: stores tfar for every lane that is set in valid_i and
+       whose tfar is negative. */
+    template<int K>
+    __forceinline void setHitByOffset(const vbool<K>& valid_i, const vint<K>& offset, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+      {
+#if defined(__AVX512F__)
+        vfloat<K>::template scatter<1>(valid, &ptr->tfar, offset, ray.tfar);
+#else
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          Ray* __restrict__ ray_k = (Ray*)((char*)ptr + offset[k]);
+          ray_k->tfar = ray.tfar[k];
+        }
+#endif
+      }
+    }
+
+    Ray* __restrict__ ptr; // first ray of the stream; all offsets are bytes relative to it
+  };
+
+  /* 4-wide specialization: loads the four rays located at the given byte offsets and
+   * transposes their fields into a Ray4 packet, 16 bytes (four 32-bit fields) at a time. */
+  template<>
+  __forceinline Ray4 RayStreamAOS::getRayByOffset(const vint4& offset)
+  {
+    /* resolve the four source rays once */
+    char* const base = (char*)ptr;
+    Ray* const r0 = (Ray*)(base + offset[0]);
+    Ray* const r1 = (Ray*)(base + offset[1]);
+    Ray* const r2 = (Ray*)(base + offset[2]);
+    Ray* const r3 = (Ray*)(base + offset[3]);
+
+    Ray4 ray;
+
+    /* rows starting at org hold: org.x, org.y, org.z, tnear */
+    const vfloat4 org0 = vfloat4::loadu(&r0->org);
+    const vfloat4 org1 = vfloat4::loadu(&r1->org);
+    const vfloat4 org2 = vfloat4::loadu(&r2->org);
+    const vfloat4 org3 = vfloat4::loadu(&r3->org);
+    transpose(org0,org1,org2,org3, ray.org.x, ray.org.y, ray.org.z, ray.tnear());
+
+    /* rows starting at dir hold: dir.x, dir.y, dir.z, time */
+    const vfloat4 dir0 = vfloat4::loadu(&r0->dir);
+    const vfloat4 dir1 = vfloat4::loadu(&r1->dir);
+    const vfloat4 dir2 = vfloat4::loadu(&r2->dir);
+    const vfloat4 dir3 = vfloat4::loadu(&r3->dir);
+    transpose(dir0,dir1,dir2,dir3, ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* rows starting at tfar hold: tfar, mask, id, flags */
+    const vfloat4 far0 = vfloat4::loadu(&r0->tfar);
+    const vfloat4 far1 = vfloat4::loadu(&r1->tfar);
+    const vfloat4 far2 = vfloat4::loadu(&r2->tfar);
+    const vfloat4 far3 = vfloat4::loadu(&r3->tfar);
+
+    /* the integer fields travel through the float transpose and are
+       reinterpreted afterwards */
+    vfloat4 mask_bits, id_bits, flags_bits;
+    transpose(far0,far1,far2,far3, ray.tfar, mask_bits, id_bits, flags_bits);
+    ray.mask  = asInt(mask_bits);
+    ray.id    = asInt(id_bits);
+    ray.flags = asInt(flags_bits);
+
+    return ray;
+  }
+
+#if defined(__AVX__)
+  /* 8-wide specialization: loads the eight rays located at the given byte offsets and
+   * transposes their fields into a Ray8 packet. */
+  template<>
+  __forceinline Ray8 RayStreamAOS::getRayByOffset(const vint8& offset)
+  {
+    /* resolve the eight source rays once */
+    char* const base = (char*)ptr;
+    Ray* src[8];
+    for (size_t i = 0; i < 8; i++)
+      src[i] = (Ray*)(base + offset[i]);
+
+    Ray8 ray;
+
+    /* one 8-float row per ray: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    vfloat8 ab[8];
+    for (size_t i = 0; i < 8; i++)
+      ab[i] = vfloat8::loadu(&src[i]->org);
+
+    transpose(ab[0],ab[1],ab[2],ab[3],ab[4],ab[5],ab[6],ab[7],
+              ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* one 4-float row per ray: tfar, mask, id, flags */
+    vfloat4 c[8];
+    for (size_t i = 0; i < 8; i++)
+      c[i] = vfloat4::loadu(&src[i]->tfar);
+
+    /* the integer fields travel through the float transpose and are
+       reinterpreted afterwards */
+    vfloat8 mask_bits, id_bits, flags_bits;
+    transpose(c[0],c[1],c[2],c[3],c[4],c[5],c[6],c[7], ray.tfar, mask_bits, id_bits, flags_bits);
+    ray.mask  = asInt(mask_bits);
+    ray.id    = asInt(id_bits);
+    ray.flags = asInt(flags_bits);
+
+    return ray;
+  }
+#endif
+
+#if defined(__AVX512F__)
+  /* 16-wide specialization: loads the sixteen rays located at the given byte offsets
+   * and transposes their fields into a Ray16 packet. */
+  template<>
+  __forceinline Ray16 RayStreamAOS::getRayByOffset(const vint16& offset)
+  {
+    /* resolve the sixteen source rays once */
+    char* const base = (char*)ptr;
+    Ray* src[16];
+    for (size_t i = 0; i < 16; i++)
+      src[i] = (Ray*)(base + offset[i]);
+
+    Ray16 ray;
+
+    /* one 8-float row per ray: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    vfloat8 ab[16];
+    for (size_t i = 0; i < 16; i++)
+      ab[i] = vfloat8::loadu(&src[i]->org);
+
+    transpose(ab[0],ab[1],ab[2],ab[3],ab[4],ab[5],ab[6],ab[7],ab[8],ab[9],ab[10],ab[11],ab[12],ab[13],ab[14],ab[15],
+              ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* one 4-float row per ray: tfar, mask, id, flags */
+    vfloat4 c[16];
+    for (size_t i = 0; i < 16; i++)
+      c[i] = vfloat4::loadu(&src[i]->tfar);
+
+    /* the integer fields travel through the float transpose and are
+       reinterpreted afterwards */
+    vfloat16 mask_bits, id_bits, flags_bits;
+    transpose(c[0],c[1],c[2],c[3],c[4],c[5],c[6],c[7],c[8],c[9],c[10],c[11],c[12],c[13],c[14],c[15],
+              ray.tfar, mask_bits, id_bits, flags_bits);
+    ray.mask  = asInt(mask_bits);
+    ray.id    = asInt(id_bits);
+    ray.flags = asInt(flags_bits);
+
+    return ray;
+  }
+#endif
+
+
+  /* Accessor for a ray stream given as an array of pointers (AOP): every entry
+   * of the pointer array addresses one individual ray. */
+  struct RayStreamAOP
+  {
+    /* wraps the pointer array passed in 'rays'; nothing is copied */
+    __forceinline RayStreamAOP(void* rays)
+      : ptr((Ray**)rays) {}
+
+    /* returns a reference to the ray addressed by entry 'index' */
+    __forceinline Ray& getRayByIndex(size_t index)
+    {
+      return *ptr[index];
+    }
+
+    /* gathers K rays into one packet: lane k is filled from the ray addressed
+     * by entry index[k]; only declared here, the widths are provided as
+     * explicit specializations */
+    template<int K>
+    __forceinline RayK<K> getRayByIndex(const vint<K>& index);
+
+    /* masked gather: inactive lanes are redirected to entry 0 before gathering */
+    template<int K>
+    __forceinline RayK<K> getRayByIndex(const vbool<K>& valid, const vint<K>& index)
+    {
+      /* the zero vector must have the packet width K of this template, not
+       * the native width (vintx), to match 'index' for every K */
+      const vint<K> valid_index = select(valid, index, vint<K>(zero));
+      return getRayByIndex(valid_index);
+    }
+
+    /* writes hit data of packet 'ray' back to the individual rays; only lanes
+     * that are active in 'valid_i' and carry a valid geomID are written */
+    template<int K>
+    __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayHitK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.geomID != RTC_INVALID_GEOMETRY_ID);
+
+      if (likely(any(valid)))
+      {
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          /* bscf is expected to return the next set lane and clear it in valid_bits */
+          const size_t k = bscf(valid_bits);
+          RayHit* __restrict__ ray_k = (RayHit*)ptr[index[k]];
+
+          ray_k->tfar   = ray.tfar[k];
+          ray_k->Ng.x   = ray.Ng.x[k];
+          ray_k->Ng.y   = ray.Ng.y[k];
+          ray_k->Ng.z   = ray.Ng.z[k];
+          ray_k->u      = ray.u[k];
+          ray_k->v      = ray.v[k];
+          ray_k->primID = ray.primID[k];
+          ray_k->geomID = ray.geomID[k];
+          instance_id_stack::copy(ray.instID, ray_k->instID, k);
+        }
+      }
+    }
+
+    /* ray-only variant: copies tfar back for lanes that are active in
+     * 'valid_i' and whose tfar is negative (presumably the occlusion marker
+     * -- confirm against the occlusion kernels) */
+    template<int K>
+    __forceinline void setHitByIndex(const vbool<K>& valid_i, const vint<K>& index, const RayK<K>& ray)
+    {
+      vbool<K> valid = valid_i;
+      valid &= (ray.tfar < 0.0f);
+
+      if (likely(any(valid)))
+      {
+        size_t valid_bits = movemask(valid);
+        while (valid_bits != 0)
+        {
+          const size_t k = bscf(valid_bits);
+          Ray* __restrict__ ray_k = ptr[index[k]];
+
+          ray_k->tfar = ray.tfar[k];
+        }
+      }
+    }
+
+    /* array of ray pointers, one entry per ray of the stream */
+    Ray** __restrict__ ptr;
+  };
+
+  /* 4-wide gather: lane i of the returned packet is filled from the ray
+   * addressed by ptr[index[i]] */
+  template<>
+  __forceinline Ray4 RayStreamAOP::getRayByIndex(const vint4& index)
+  {
+    /* resolve the four source rays once */
+    Ray* const r0 = ptr[index[0]];
+    Ray* const r1 = ptr[index[1]];
+    Ray* const r2 = ptr[index[2]];
+    Ray* const r3 = ptr[index[3]];
+
+    Ray4 packet;
+
+    /* rows starting at org become: org.x, org.y, org.z, tnear */
+    transpose(vfloat4::loadu(&r0->org), vfloat4::loadu(&r1->org),
+              vfloat4::loadu(&r2->org), vfloat4::loadu(&r3->org),
+              packet.org.x, packet.org.y, packet.org.z, packet.tnear());
+
+    /* rows starting at dir become: dir.x, dir.y, dir.z, time */
+    transpose(vfloat4::loadu(&r0->dir), vfloat4::loadu(&r1->dir),
+              vfloat4::loadu(&r2->dir), vfloat4::loadu(&r3->dir),
+              packet.dir.x, packet.dir.y, packet.dir.z, packet.time());
+
+    /* rows starting at tfar become: tfar, mask, id, flags; the last three
+     * are integer data and are reinterpreted after the transpose */
+    vfloat4 maskBits, idBits, flagBits;
+    transpose(vfloat4::loadu(&r0->tfar), vfloat4::loadu(&r1->tfar),
+              vfloat4::loadu(&r2->tfar), vfloat4::loadu(&r3->tfar),
+              packet.tfar, maskBits, idBits, flagBits);
+    packet.mask  = asInt(maskBits);
+    packet.id    = asInt(idBits);
+    packet.flags = asInt(flagBits);
+
+    return packet;
+  }
+
+#if defined(__AVX__)
+  /* 8-wide gather: lane i of the returned packet is filled from the ray
+   * addressed by ptr[index[i]] */
+  template<>
+  __forceinline Ray8 RayStreamAOP::getRayByIndex(const vint8& index)
+  {
+    /* resolve the eight source rays once */
+    Ray* const r0 = ptr[index[0]];
+    Ray* const r1 = ptr[index[1]];
+    Ray* const r2 = ptr[index[2]];
+    Ray* const r3 = ptr[index[3]];
+    Ray* const r4 = ptr[index[4]];
+    Ray* const r5 = ptr[index[5]];
+    Ray* const r6 = ptr[index[6]];
+    Ray* const r7 = ptr[index[7]];
+
+    Ray8 packet;
+
+    /* one 8-float row per ray, starting at org, becomes:
+     * org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    transpose(vfloat8::loadu(&r0->org), vfloat8::loadu(&r1->org),
+              vfloat8::loadu(&r2->org), vfloat8::loadu(&r3->org),
+              vfloat8::loadu(&r4->org), vfloat8::loadu(&r5->org),
+              vfloat8::loadu(&r6->org), vfloat8::loadu(&r7->org),
+              packet.org.x, packet.org.y, packet.org.z, packet.tnear(),
+              packet.dir.x, packet.dir.y, packet.dir.z, packet.time());
+
+    /* one 4-float row per ray, starting at tfar, becomes: tfar, mask, id,
+     * flags; the last three are integer data reinterpreted afterwards */
+    vfloat8 maskBits, idBits, flagBits;
+    transpose(vfloat4::loadu(&r0->tfar), vfloat4::loadu(&r1->tfar),
+              vfloat4::loadu(&r2->tfar), vfloat4::loadu(&r3->tfar),
+              vfloat4::loadu(&r4->tfar), vfloat4::loadu(&r5->tfar),
+              vfloat4::loadu(&r6->tfar), vfloat4::loadu(&r7->tfar),
+              packet.tfar, maskBits, idBits, flagBits);
+    packet.mask  = asInt(maskBits);
+    packet.id    = asInt(idBits);
+    packet.flags = asInt(flagBits);
+
+    return packet;
+  }
+#endif
+
+#if defined(__AVX512F__)
+  /* 16-wide gather: lane i of the returned packet is filled from the ray
+   * addressed by ptr[index[i]] */
+  template<>
+  __forceinline Ray16 RayStreamAOP::getRayByIndex(const vint16& index)
+  {
+    Ray16 ray;
+
+    /* load and transpose: org.x, org.y, org.z, tnear, dir.x, dir.y, dir.z, time */
+    /* (one unaligned 8-float load per ray, starting at its org member) */
+    const vfloat8 ab0  = vfloat8::loadu(&ptr[index[0]]->org);
+    const vfloat8 ab1  = vfloat8::loadu(&ptr[index[1]]->org);
+    const vfloat8 ab2  = vfloat8::loadu(&ptr[index[2]]->org);
+    const vfloat8 ab3  = vfloat8::loadu(&ptr[index[3]]->org);
+    const vfloat8 ab4  = vfloat8::loadu(&ptr[index[4]]->org);
+    const vfloat8 ab5  = vfloat8::loadu(&ptr[index[5]]->org);
+    const vfloat8 ab6  = vfloat8::loadu(&ptr[index[6]]->org);
+    const vfloat8 ab7  = vfloat8::loadu(&ptr[index[7]]->org);
+    const vfloat8 ab8  = vfloat8::loadu(&ptr[index[8]]->org);
+    const vfloat8 ab9  = vfloat8::loadu(&ptr[index[9]]->org);
+    const vfloat8 ab10 = vfloat8::loadu(&ptr[index[10]]->org);
+    const vfloat8 ab11 = vfloat8::loadu(&ptr[index[11]]->org);
+    const vfloat8 ab12 = vfloat8::loadu(&ptr[index[12]]->org);
+    const vfloat8 ab13 = vfloat8::loadu(&ptr[index[13]]->org);
+    const vfloat8 ab14 = vfloat8::loadu(&ptr[index[14]]->org);
+    const vfloat8 ab15 = vfloat8::loadu(&ptr[index[15]]->org);
+
+    transpose(ab0,ab1,ab2,ab3,ab4,ab5,ab6,ab7,ab8,ab9,ab10,ab11,ab12,ab13,ab14,ab15,
+              ray.org.x, ray.org.y, ray.org.z, ray.tnear(), ray.dir.x, ray.dir.y, ray.dir.z, ray.time());
+
+    /* load and transpose: tfar, mask, id, flags */
+    /* (one unaligned 4-float load per ray, starting at its tfar member) */
+    const vfloat4 c0  = vfloat4::loadu(&ptr[index[0]]->tfar);
+    const vfloat4 c1  = vfloat4::loadu(&ptr[index[1]]->tfar);
+    const vfloat4 c2  = vfloat4::loadu(&ptr[index[2]]->tfar);
+    const vfloat4 c3  = vfloat4::loadu(&ptr[index[3]]->tfar);
+    const vfloat4 c4  = vfloat4::loadu(&ptr[index[4]]->tfar);
+    const vfloat4 c5  = vfloat4::loadu(&ptr[index[5]]->tfar);
+    const vfloat4 c6  = vfloat4::loadu(&ptr[index[6]]->tfar);
+    const vfloat4 c7  = vfloat4::loadu(&ptr[index[7]]->tfar);
+    const vfloat4 c8  = vfloat4::loadu(&ptr[index[8]]->tfar);
+    const vfloat4 c9  = vfloat4::loadu(&ptr[index[9]]->tfar);
+    const vfloat4 c10 = vfloat4::loadu(&ptr[index[10]]->tfar);
+    const vfloat4 c11 = vfloat4::loadu(&ptr[index[11]]->tfar);
+    const vfloat4 c12 = vfloat4::loadu(&ptr[index[12]]->tfar);
+    const vfloat4 c13 = vfloat4::loadu(&ptr[index[13]]->tfar);
+    const vfloat4 c14 = vfloat4::loadu(&ptr[index[14]]->tfar);
+    const vfloat4 c15 = vfloat4::loadu(&ptr[index[15]]->tfar);
+
+    /* mask, id and flags are integer data: transposed as floats, then reinterpreted */
+    vfloat16 maskf, idf, flagsf;
+    transpose(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,
+              ray.tfar, maskf, idf, flagsf);
+
+    ray.mask  = asInt(maskf);
+    ray.id    = asInt(idf);
+    ray.flags = asInt(flagsf);
+
+    return ray;
+  }
+#endif
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
new file mode 100644
index 0000000000..625fbf6d4f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
@@ -0,0 +1,1799 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_EXPORT_API
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "context.h"
+#include "../../include/embree3/rtcore_ray.h"
+
+#if defined(__aarch64__) && defined(BUILD_IOS)
+#include <mutex>
+#endif
+
+using namespace embree;
+
+RTC_NAMESPACE_BEGIN;
+
+  /* mutex to make API thread safe; it is locked by several of the API entry
+   * points in this file. aarch64 iOS builds use std::mutex, all other builds
+   * use MutexSys. */
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    static std::mutex g_mutex;
+#else
+    static MutexSys g_mutex;
+#endif
+
+  /* Creates a new Device from the configuration string 'config' while holding
+   * g_mutex and returns it as a handle after incrementing its reference count.
+   * The trailing return of nullptr is only reached when control continues past
+   * RTC_CATCH_END (presumably the error path -- confirm against the macro). */
+  RTC_API RTCDevice rtcNewDevice(const char* config)
+  {
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewDevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(g_mutex);
+#else
+    Lock<MutexSys> lock(g_mutex);
+#endif
+    Device* device = new Device(config);
+    return (RTCDevice) device->refInc();
+    RTC_CATCH_END(nullptr);
+    return (RTCDevice) nullptr;
+  }
+
+  /* Increments the reference count of the device behind 'hdevice' while
+   * holding g_mutex. The handle is passed through RTC_VERIFY_HANDLE first. */
+  RTC_API void rtcRetainDevice(RTCDevice hdevice) 
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainDevice);
+    RTC_VERIFY_HANDLE(hdevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(g_mutex);
+#else
+    Lock<MutexSys> lock(g_mutex);
+#endif
+    device->refInc();
+    RTC_CATCH_END(nullptr);
+  }
+  
+  /* Decrements the reference count of the device behind 'hdevice' while
+   * holding g_mutex. The handle is passed through RTC_VERIFY_HANDLE first.
+   * NOTE(review): refDec presumably destroys the device when the count drops
+   * to zero -- confirm in the reference counting base class. */
+  RTC_API void rtcReleaseDevice(RTCDevice hdevice) 
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseDevice);
+    RTC_VERIFY_HANDLE(hdevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(g_mutex);
+#else
+    Lock<MutexSys> lock(g_mutex);
+#endif
+    device->refDec();
+    RTC_CATCH_END(nullptr);
+  }
+  
+  /* Returns the value of device property 'prop', queried through
+   * Device::getProperty while holding g_mutex. The handle is passed through
+   * RTC_VERIFY_HANDLE first. The trailing return of 0 is only reached when
+   * control continues past RTC_CATCH_END. */
+  RTC_API ssize_t rtcGetDeviceProperty(RTCDevice hdevice, RTCDeviceProperty prop)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetDeviceProperty);
+    RTC_VERIFY_HANDLE(hdevice);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(g_mutex);
+#else
+    Lock<MutexSys> lock(g_mutex);
+#endif
+    return device->getProperty(prop);
+    RTC_CATCH_END(device);
+    return 0;
+  }
+
+  /* Sets device property 'prop' to 'val' through Device::setProperty while
+   * holding g_mutex. Property ids in [1000000, 1000004) are treated as
+   * internal settings for which the handle check is skipped.
+   * NOTE(review): on that path 'device' may be NULL and setProperty is still
+   * invoked through it -- confirm Device::setProperty does not touch instance
+   * state for these ids. */
+  RTC_API void rtcSetDeviceProperty(RTCDevice hdevice, const RTCDeviceProperty prop, ssize_t val)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetDeviceProperty);
+    const bool internal_prop = (size_t)prop >= 1000000 && (size_t)prop < 1000004;
+    if (!internal_prop) RTC_VERIFY_HANDLE(hdevice); // allow NULL device for special internal settings
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(g_mutex);
+#else
+    Lock<MutexSys> lock(g_mutex);
+#endif
+    device->setProperty(prop,val);
+    RTC_CATCH_END(device);
+  }
+
+  /* Returns an error code: for a NULL handle the value of
+   * Device::getThreadErrorCode(), otherwise the value of
+   * device->getDeviceErrorCode(). RTC_ERROR_UNKNOWN is only returned when
+   * control continues past RTC_CATCH_END. */
+  RTC_API RTCError rtcGetDeviceError(RTCDevice hdevice)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetDeviceError);
+    /* a NULL handle is accepted on purpose, hence no RTC_VERIFY_HANDLE here */
+    if (device == nullptr) return Device::getThreadErrorCode();
+    else                   return device->getDeviceErrorCode();
+    RTC_CATCH_END(device);
+    return RTC_ERROR_UNKNOWN;
+  }
+
+  /* Registers the error callback 'error' together with its 'userPtr' on the
+   * device via Device::setErrorFunction. The handle is passed through
+   * RTC_VERIFY_HANDLE first; g_mutex is not taken here. */
+  RTC_API void rtcSetDeviceErrorFunction(RTCDevice hdevice, RTCErrorFunction error, void* userPtr)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetDeviceErrorFunction);
+    RTC_VERIFY_HANDLE(hdevice);
+    device->setErrorFunction(error, userPtr);
+    RTC_CATCH_END(device);
+  }
+
+  /* Registers the memory monitor callback 'memoryMonitor' together with its
+   * 'userPtr' on the device via Device::setMemoryMonitorFunction. */
+  RTC_API void rtcSetDeviceMemoryMonitorFunction(RTCDevice hdevice, RTCMemoryMonitorFunction memoryMonitor, void* userPtr)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetDeviceMemoryMonitorFunction);
+    /* the device is dereferenced right below, so validate the handle the same
+     * way rtcSetDeviceErrorFunction does */
+    RTC_VERIFY_HANDLE(hdevice);
+    device->setMemoryMonitorFunction(memoryMonitor, userPtr);
+    RTC_CATCH_END(device);
+  }
+
+  /* Creates a Buffer of 'byteSize' bytes on the given device and returns it as
+   * a handle after incrementing its reference count. The device handle is
+   * passed through RTC_VERIFY_HANDLE first. nullptr is only returned when
+   * control continues past RTC_CATCH_END. */
+  RTC_API RTCBuffer rtcNewBuffer(RTCDevice hdevice, size_t byteSize)
+  {
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewBuffer);
+    RTC_VERIFY_HANDLE(hdevice);
+    Buffer* buffer = new Buffer((Device*)hdevice, byteSize);
+    return (RTCBuffer)buffer->refInc();
+    RTC_CATCH_END((Device*)hdevice);
+    return nullptr;
+  }
+
+  /* Creates a Buffer object for the caller supplied memory 'ptr' of 'byteSize'
+   * bytes and returns it as a handle after incrementing its reference count.
+   * The device handle is passed through RTC_VERIFY_HANDLE first.
+   * NOTE(review): 'ptr' itself is not checked here; presumably the Buffer
+   * references the memory without copying it -- confirm in Buffer. */
+  RTC_API RTCBuffer rtcNewSharedBuffer(RTCDevice hdevice, void* ptr, size_t byteSize)
+  {
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewSharedBuffer);
+    RTC_VERIFY_HANDLE(hdevice);
+    Buffer* buffer = new Buffer((Device*)hdevice, byteSize, ptr);
+    return (RTCBuffer)buffer->refInc();
+    RTC_CATCH_END((Device*)hdevice);
+    return nullptr;
+  }
+
+  /* Returns the pointer reported by Buffer::data() for the buffer behind
+   * 'hbuffer'. The handle is passed through RTC_VERIFY_HANDLE first. nullptr
+   * is only returned when control continues past RTC_CATCH_END2. */
+  RTC_API void* rtcGetBufferData(RTCBuffer hbuffer)
+  {
+    Buffer* buffer = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetBufferData);
+    RTC_VERIFY_HANDLE(hbuffer);
+    return buffer->data();
+    RTC_CATCH_END2(buffer);
+    return nullptr;
+  }
+
+  /* Increments the reference count of the buffer behind 'hbuffer'. The handle
+   * is passed through RTC_VERIFY_HANDLE first. */
+  RTC_API void rtcRetainBuffer(RTCBuffer hbuffer)
+  {
+    Buffer* buffer = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainBuffer);
+    RTC_VERIFY_HANDLE(hbuffer);
+    buffer->refInc();
+    RTC_CATCH_END2(buffer);
+  }
+  
+  /* Decrements the reference count of the buffer behind 'hbuffer'. The handle
+   * is passed through RTC_VERIFY_HANDLE first. */
+  RTC_API void rtcReleaseBuffer(RTCBuffer hbuffer)
+  {
+    Buffer* buffer = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseBuffer);
+    RTC_VERIFY_HANDLE(hbuffer);
+    buffer->refDec();
+    RTC_CATCH_END2(buffer);
+  }
+
+  /* Creates a new Scene on the given device and returns it as a handle after
+   * incrementing its reference count. The device handle is passed through
+   * RTC_VERIFY_HANDLE first. nullptr is only returned when control continues
+   * past RTC_CATCH_END. */
+  RTC_API RTCScene rtcNewScene (RTCDevice hdevice) 
+  {
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewScene);
+    RTC_VERIFY_HANDLE(hdevice);
+    Scene* scene = new Scene((Device*)hdevice);
+    return (RTCScene) scene->refInc();
+    RTC_CATCH_END((Device*)hdevice);
+    return nullptr;
+  }
+
+  /* Returns the device the scene belongs to. The device's reference count is
+   * incremented, so the caller receives a reference of its own. The scene
+   * handle is passed through RTC_VERIFY_HANDLE first. */
+  RTC_API RTCDevice rtcGetSceneDevice(RTCScene hscene)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneDevice);
+    RTC_VERIFY_HANDLE(hscene);
+    return (RTCDevice)scene->device->refInc(); // user will own one additional device reference
+    RTC_CATCH_END2(scene);
+    return (RTCDevice)nullptr;
+  }
+
+  /* Registers the progress monitor callback 'progress' together with its user
+   * pointer 'ptr' on the scene via Scene::setProgressMonitorFunction while
+   * holding g_mutex. The handle is passed through RTC_VERIFY_HANDLE first. */
+  RTC_API void rtcSetSceneProgressMonitorFunction(RTCScene hscene, RTCProgressMonitorFunction progress, void* ptr) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSceneProgressMonitorFunction);
+    RTC_VERIFY_HANDLE(hscene);
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(g_mutex);
+#else
+    Lock<MutexSys> lock(g_mutex);
+#endif
+    scene->setProgressMonitorFunction(progress,ptr);
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Sets the build quality of the scene via Scene::setBuildQuality. Only
+   * RTC_BUILD_QUALITY_LOW, _MEDIUM and _HIGH are accepted; any other value
+   * terminates the process through abort() (Godot replaced the original
+   * exception with abort, see the GODOT markers below). */
+  RTC_API void rtcSetSceneBuildQuality (RTCScene hscene, RTCBuildQuality quality) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSceneBuildQuality);
+    RTC_VERIFY_HANDLE(hscene);
+    /* the unbraced 'if' governs exactly one statement: the abort() call */
+    if (quality != RTC_BUILD_QUALITY_LOW &&
+        quality != RTC_BUILD_QUALITY_MEDIUM &&
+        quality != RTC_BUILD_QUALITY_HIGH)
+      // -- GODOT start --
+      // throw std::runtime_error("invalid build quality");
+      abort();
+      // -- GODOT end --
+    scene->setBuildQuality(quality);
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Sets the scene flags via Scene::setSceneFlags. The handle is passed
+   * through RTC_VERIFY_HANDLE first. */
+  RTC_API void rtcSetSceneFlags (RTCScene hscene, RTCSceneFlags flags) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSceneFlags);
+    RTC_VERIFY_HANDLE(hscene);
+    scene->setSceneFlags(flags);
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Returns the scene flags reported by Scene::getSceneFlags. The handle is
+   * passed through RTC_VERIFY_HANDLE first. RTC_SCENE_FLAG_NONE is only
+   * returned when control continues past RTC_CATCH_END2. */
+  RTC_API RTCSceneFlags rtcGetSceneFlags(RTCScene hscene)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneFlags);
+    RTC_VERIFY_HANDLE(hscene);
+    return scene->getSceneFlags();
+    RTC_CATCH_END2(scene);
+    return RTC_SCENE_FLAG_NONE;
+  }
+  
+  /* Commits the scene by calling Scene::commit(false). The handle is passed
+   * through RTC_VERIFY_HANDLE first.
+   * NOTE(review): the boolean presumably selects the "join" mode used by
+   * rtcJoinCommitScene -- confirm against Scene::commit. */
+  RTC_API void rtcCommitScene (RTCScene hscene) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcCommitScene);
+    RTC_VERIFY_HANDLE(hscene);
+    scene->commit(false);
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Commits the scene by calling Scene::commit(true); this is the only
+   * difference to rtcCommitScene, which passes false. The handle is passed
+   * through RTC_VERIFY_HANDLE first. */
+  RTC_API void rtcJoinCommitScene (RTCScene hscene) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcJoinCommitScene);
+    RTC_VERIFY_HANDLE(hscene);
+    scene->commit(true);
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Writes the bounding box of a committed scene to 'bounds_o'. The two
+   * alignment members of the output are set to 0. An error is raised for a
+   * NULL destination pointer and for a scene with uncommitted modifications. */
+  RTC_API void rtcGetSceneBounds(RTCScene hscene, RTCBounds* bounds_o)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneBounds);
+    RTC_VERIFY_HANDLE(hscene);
+    /* reject a NULL destination before writing to it, same check as in rtcGetSceneLinearBounds */
+    if (bounds_o == nullptr)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer");
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    BBox3fa bounds = scene->bounds.bounds();
+    bounds_o->lower_x = bounds.lower.x;
+    bounds_o->lower_y = bounds.lower.y;
+    bounds_o->lower_z = bounds.lower.z;
+    bounds_o->align0  = 0;
+    bounds_o->upper_x = bounds.upper.x;
+    bounds_o->upper_y = bounds.upper.y;
+    bounds_o->upper_z = bounds.upper.z;
+    bounds_o->align1  = 0;
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Writes the linear bounds of a committed scene to 'bounds_o': bounds0 and
+   * bounds1 are copied from scene->bounds.bounds0 / bounds1, the alignment
+   * members are set to 0. An error is raised for a NULL destination pointer
+   * and for a scene with uncommitted modifications. */
+  RTC_API void rtcGetSceneLinearBounds(RTCScene hscene, RTCLinearBounds* bounds_o)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetSceneLinearBounds); // trace under this function's own name
+    RTC_VERIFY_HANDLE(hscene);
+    if (bounds_o == nullptr)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid destination pointer");
+    if (scene->isModified())
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    
+    bounds_o->bounds0.lower_x = scene->bounds.bounds0.lower.x;
+    bounds_o->bounds0.lower_y = scene->bounds.bounds0.lower.y;
+    bounds_o->bounds0.lower_z = scene->bounds.bounds0.lower.z;
+    bounds_o->bounds0.align0  = 0;
+    bounds_o->bounds0.upper_x = scene->bounds.bounds0.upper.x;
+    bounds_o->bounds0.upper_y = scene->bounds.bounds0.upper.y;
+    bounds_o->bounds0.upper_z = scene->bounds.bounds0.upper.z;
+    bounds_o->bounds0.align1  = 0;
+    bounds_o->bounds1.lower_x = scene->bounds.bounds1.lower.x;
+    bounds_o->bounds1.lower_y = scene->bounds.bounds1.lower.y;
+    bounds_o->bounds1.lower_z = scene->bounds.bounds1.lower.z;
+    bounds_o->bounds1.align0  = 0;
+    bounds_o->bounds1.upper_x = scene->bounds.bounds1.upper.x;
+    bounds_o->bounds1.upper_y = scene->bounds.bounds1.upper.y;
+    bounds_o->bounds1.upper_z = scene->bounds.bounds1.upper.z;
+    bounds_o->bounds1.align1  = 0;
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Runs the collision query of scene0 against scene1 through
+   * scene0->intersectors.collide, forwarding 'callback' and 'userPtr'.
+   * Argument validation only happens in DEBUG builds: both handles are
+   * verified, both scenes must be committed, must belong to the same device
+   * and must each contain nothing but user geometry primitives. */
+  RTC_API void rtcCollide (RTCScene hscene0, RTCScene hscene1, RTCCollideFunc callback, void* userPtr)
+  {
+    Scene* scene0 = (Scene*) hscene0;
+    Scene* scene1 = (Scene*) hscene1;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcCollide);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene0);
+    RTC_VERIFY_HANDLE(hscene1);
+    if (scene0->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (scene1->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (scene0->device != scene1->device) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes are from different devices");
+    auto nUserPrims0 = scene0->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false);
+    auto nUserPrims1 = scene1->getNumPrimitives (Geometry::MTY_USER_GEOMETRY, false);
+    /* either scene containing other primitives violates the requirement, hence '||' */
+    if (scene0->numPrimitives() != nUserPrims0 || scene1->numPrimitives() != nUserPrims1) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scenes must only contain user geometries with a single timestep");
+#endif
+    scene0->intersectors.collide(scene0,scene1,callback,userPtr);
+    RTC_CATCH_END(scene0->device);
+  }
+  
+  /* Runs one point query against 'scene' and returns the result of
+   * scene->intersectors.pointQuery. If the instance stack of 'userContext' is
+   * not empty, the query point and radius are first transformed with the
+   * topmost world2inst matrix of that stack. */
+  inline bool pointQuery(Scene* scene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr)
+  {
+    if (userContext->instStackSize > 0)
+    {
+      /* transformation stored at the top of the instance stack */
+      const AffineSpace3fa xfm = AffineSpace3fa_load_unaligned((AffineSpace3fa*)userContext->world2inst[userContext->instStackSize-1]);
+
+      /* similarityTransform reports whether xfm is a similarity transform and writes the scale */
+      float scale = 0.f;
+      const bool isSimilarity = similarityTransform(xfm, &scale);
+      assert((isSimilarity && scale > 0) || (!isSimilarity && scale == 0.f));
+
+      /* transformed copy of the query */
+      PointQuery instQuery;
+      instQuery.time   = query->time;
+      instQuery.radius = query->radius * scale;
+      instQuery.p      = xfmPoint(xfm, Vec3fa(query->x, query->y, query->z));
+
+      /* the context keeps the caller's original query; sphere queries are
+       * only used for similarity transforms, otherwise an AABB query is run */
+      PointQueryContext instContext(scene, (PointQuery*)query,
+        isSimilarity ? POINT_QUERY_TYPE_SPHERE : POINT_QUERY_TYPE_AABB,
+        queryFunc, userContext, scale, userPtr);
+      return scene->intersectors.pointQuery((PointQuery*)&instQuery, &instContext);
+    }
+
+    /* empty instance stack: run the query as given, as a sphere query with scale 1 */
+    PointQueryContext worldContext(scene, (PointQuery*)query,
+      POINT_QUERY_TYPE_SPHERE, queryFunc, userContext, 1.f, userPtr);
+    return scene->intersectors.pointQuery((PointQuery*)query, &worldContext);
+  }
+
+  /* Runs a single point query on the scene by forwarding all arguments to
+   * pointQuery() and returns its result. Validation only happens in DEBUG
+   * builds: scene and context handles are verified, the scene must be
+   * committed, and 'query' and 'userContext' must be 16-byte aligned. */
+  RTC_API bool rtcPointQuery(RTCScene hscene, RTCPointQuery* query, RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void* userPtr)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_HANDLE(userContext);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+    if (((size_t)userContext) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "context not aligned to 16 bytes");   
+#endif
+
+    return pointQuery(scene, query, userContext, queryFunc, userPtr);
+    /* RTC_CATCH_END2_FALSE presumably supplies the 'false' result of the error path -- confirm against the macro */
+    RTC_CATCH_END2_FALSE(scene);
+  }
+  
+  /* Runs a packet of 4 point queries one lane at a time: each lane with a
+   * non-zero 'valid' entry is extracted into a single PointQuery, passed to
+   * pointQuery() and written back into the packet. 'userPtrN' may be NULL;
+   * otherwise lane i uses userPtrN[i]. Returns true if any lane reported a
+   * change. Validation only happens in DEBUG builds. */
+  RTC_API bool rtcPointQuery4 (const int* valid, RTCScene hscene, RTCPointQuery4* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery4);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+#endif
+    /* statistics: count the lanes whose mask entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(point_query.travs,cnt,cnt,cnt);
+
+    bool changed = false;
+    PointQuery4* query4 = (PointQuery4*)query;
+    PointQuery query1; 
+    for (size_t i=0; i<4; i++) {
+      if (!valid[i]) continue;
+      query4->get(i,query1);
+      changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+      query4->set(i,query1);
+    }
+    return changed;
+    RTC_CATCH_END2_FALSE(scene);
+  }
+  
+  /* Runs a packet of 8 point queries one lane at a time: each lane with a
+   * non-zero 'valid' entry is extracted into a single PointQuery, passed to
+   * pointQuery() and written back into the packet. 'userPtrN' may be NULL;
+   * otherwise lane i uses userPtrN[i]. Returns true if any lane reported a
+   * change. Validation only happens in DEBUG builds. */
+  RTC_API bool rtcPointQuery8 (const int* valid, RTCScene hscene, RTCPointQuery8* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery8);
+    
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+#endif
+    /* statistics: count the active lanes over all 8 mask entries */
+    STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(point_query.travs,cnt,cnt,cnt);
+
+    bool changed = false;
+    PointQuery8* query8 = (PointQuery8*)query;
+    PointQuery query1; 
+    for (size_t i=0; i<8; i++) {
+      if (!valid[i]) continue;
+      query8->get(i,query1);
+      changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+      query8->set(i,query1);
+    }
+    return changed;
+    RTC_CATCH_END2_FALSE(scene);
+  }
+
+  /* Runs a packet of 16 point queries one lane at a time: each lane with a
+   * non-zero 'valid' entry is extracted into a single PointQuery, passed to
+   * pointQuery() and written back into the packet. 'userPtrN' may be NULL;
+   * otherwise lane i uses userPtrN[i]. Returns true if any lane reported a
+   * change. Validation only happens in DEBUG builds. */
+  RTC_API bool rtcPointQuery16 (const int* valid, RTCScene hscene, RTCPointQuery16* query, struct RTCPointQueryContext* userContext, RTCPointQueryFunction queryFunc, void** userPtrN)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcPointQuery16);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene got not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)query) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "query not aligned to 16 bytes");   
+#endif
+    /* statistics: count the active lanes over all 16 mask entries */
+    STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(point_query.travs,cnt,cnt,cnt);
+
+    bool changed = false;
+    PointQuery16* query16 = (PointQuery16*)query;
+    for (size_t i=0; i<16; i++) {
+      if (!valid[i]) continue;
+      /* per-lane scratch query; the unused function-scope duplicate was removed */
+      PointQuery query1; query16->get(i,query1);
+      changed |= pointQuery(scene, (RTCPointQuery*)&query1, userContext, queryFunc, userPtrN?userPtrN[i]:NULL);
+      query16->set(i,query1);
+    }
+    return changed;
+    RTC_CATCH_END2_FALSE(scene);
+  }
+
+  /* Intersects a single ray with the scene through
+   * scene->intersectors.intersect; the result is written into 'rayhit'.
+   * DEBUG builds additionally verify the scene handle, require a committed
+   * scene and a 16-byte aligned 'rayhit', and call verifyHit() afterwards. */
+  RTC_API void rtcIntersect1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect1);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
+#endif
+    STAT3(normal.travs,1,1,1);
+    IntersectContext context(scene,user_context);
+    scene->intersectors.intersect(*rayhit,&context);
+#if defined(DEBUG)
+    ((RayHit*)rayhit)->verifyHit();
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Intersects a packet of 4 rays with the scene. With EMBREE_RAY_PACKETS the
+   * packet is handed to scene->intersectors.intersect4; without it every lane
+   * with a non-zero 'valid' entry is extracted, intersected as a single ray
+   * and written back. DEBUG builds verify the scene handle, require a
+   * committed scene and 16-byte aligned 'valid' and 'rayhit'. */
+  RTC_API void rtcIntersect4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit4* rayhit) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect4);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)rayhit)   & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 16 bytes");   
+#endif
+    /* statistics: count the lanes whose mask entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(normal.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    /* no packet kernels: process the active lanes as single rays */
+    RayHit4* rayhit4 = (RayHit4*)rayhit;
+    for (size_t i=0; i<4; i++) {
+      if (!valid[i]) continue;
+      RayHit ray1; rayhit4->get(i,ray1);
+      scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+      rayhit4->set(i,ray1);
+    }
+#else
+    scene->intersectors.intersect4(valid,*rayhit,&context);
+#endif
+    
+    RTC_CATCH_END2(scene);
+  }
+  
+  /* Intersects a packet of 8 rays with the scene. With EMBREE_RAY_PACKETS the
+   * packet goes to scene->intersectors.intersect8 if an intersector8 is set,
+   * otherwise to the SOA ray stream filter; without EMBREE_RAY_PACKETS every
+   * lane with a non-zero 'valid' entry is intersected as a single ray. DEBUG
+   * builds verify the scene handle, require a committed scene and 32-byte
+   * aligned 'valid' and 'rayhit'. */
+  RTC_API void rtcIntersect8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit8* rayhit) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect8);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes");   
+    if (((size_t)rayhit)   & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 32 bytes");   
+#endif
+    /* statistics: count the lanes whose mask entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(normal.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    /* no packet kernels: process the active lanes as single rays */
+    RayHit8* rayhit8 = (RayHit8*) rayhit;
+    for (size_t i=0; i<8; i++) {
+      if (!valid[i]) continue;
+      RayHit ray1; rayhit8->get(i,ray1);
+      scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+      rayhit8->set(i,ray1);
+    }
+#else
+    if (likely(scene->intersectors.intersector8))
+      scene->intersectors.intersect8(valid,*rayhit,&context);
+    else
+      /* NOTE(review): 'valid' is not forwarded on this fallback path -- confirm intersectSOA handles inactive lanes */
+      scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,8,1,sizeof(RTCRayHit8),&context);
+#endif
+    RTC_CATCH_END2(scene);
+  }
+  
+  /* Intersects a packet of 16 rays with the scene. With EMBREE_RAY_PACKETS the
+   * packet goes to scene->intersectors.intersect16 if an intersector16 is set,
+   * otherwise to the SOA ray stream filter; without EMBREE_RAY_PACKETS every
+   * lane with a non-zero 'valid' entry is intersected as a single ray. DEBUG
+   * builds verify the scene handle, require a committed scene and 64-byte
+   * aligned 'valid' and 'rayhit'. */
+  RTC_API void rtcIntersect16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit16* rayhit) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect16);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes");   
+    if (((size_t)rayhit)   & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit not aligned to 64 bytes");   
+#endif
+    /* statistics: count the lanes whose mask entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(normal.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    /* no packet kernels: process the active lanes as single rays */
+    RayHit16* rayhit16 = (RayHit16*) rayhit;
+    for (size_t i=0; i<16; i++) {
+      if (!valid[i]) continue;
+      RayHit ray1; rayhit16->get(i,ray1);
+      scene->intersectors.intersect((RTCRayHit&)ray1,&context);
+      rayhit16->set(i,ray1);
+    }
+#else
+    if (likely(scene->intersectors.intersector16))
+      scene->intersectors.intersect16(valid,*rayhit,&context);
+    else
+      /* NOTE(review): 'valid' is not forwarded on this fallback path -- confirm intersectSOA handles inactive lanes */
+      scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,16,1,sizeof(RTCRayHit16),&context);
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Intersects a stream of M single rays, laid out 'byteStride' bytes apart
+   * starting at 'rayhit', with the scene. Only available with
+   * EMBREE_RAY_PACKETS; otherwise an RTC_ERROR_INVALID_OPERATION error is
+   * raised. DEBUG builds verify the scene handle, require a committed scene
+   * and a 4-byte aligned 'rayhit'. */
+  RTC_API void rtcIntersect1M (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit* rayhit, unsigned int M, size_t byteStride) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect1M);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit ) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(normal.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+
+    /* fast codepath for single rays */
+    if (likely(M == 1)) {
+      /* a ray with tnear > tfar is skipped entirely */
+      if (likely(rayhit->ray.tnear <= rayhit->ray.tfar)) 
+        scene->intersectors.intersect(*rayhit,&context);
+    } 
+
+    /* codepath for streams */
+    else {
+      scene->device->rayStreamFilters.intersectAOS(scene,rayhit,M,byteStride,&context);   
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1M not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Intersects a stream of M single rays, given as an array 'rn' of M
+   * pointers to RTCRayHit, with the scene. Only available with
+   * EMBREE_RAY_PACKETS; otherwise an RTC_ERROR_INVALID_OPERATION error is
+   * raised. DEBUG builds verify the scene handle, require a committed scene
+   * and a 4-byte aligned pointer array (the individual ray pointers are not
+   * checked). */
+  RTC_API void rtcIntersect1Mp (RTCScene hscene, RTCIntersectContext* user_context, RTCRayHit** rn, unsigned int M) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersect1Mp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rn) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(normal.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+
+    /* fast codepath for single rays */
+    if (likely(M == 1)) {
+      /* a ray with tnear > tfar is skipped entirely */
+      if (likely(rn[0]->ray.tnear <= rn[0]->ray.tfar)) 
+        scene->intersectors.intersect(*rn[0],&context);
+    } 
+
+    /* codepath for streams */
+    else {
+      scene->device->rayStreamFilters.intersectAOP(scene,rn,M,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect1Mp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Intersects a stream of M ray packets of width N, laid out 'byteStride'
+   * bytes apart starting at 'rayhit', with the scene. N == 1 is treated as a
+   * stream of single rays (AOS path, with a direct call for M == 1); any other
+   * N goes to the SOA ray stream filter. Only available with
+   * EMBREE_RAY_PACKETS; otherwise an RTC_ERROR_INVALID_OPERATION error is
+   * raised. DEBUG builds verify the scene handle, require a committed scene
+   * and a 4-byte aligned 'rayhit'. */
+  RTC_API void rtcIntersectNM (RTCScene hscene, RTCIntersectContext* user_context, struct RTCRayHitN* rayhit, unsigned int N, unsigned int M, size_t byteStride) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersectNM);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)rayhit) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");   
+#endif
+    STAT3(normal.travs,N*M,N*M,N*M);
+    IntersectContext context(scene,user_context);
+
+    /* code path for single ray streams */
+    if (likely(N == 1))
+    {
+      /* fast code path for streams of size 1 */
+      if (likely(M == 1)) {
+        /* a ray with tnear > tfar is skipped entirely */
+        if (likely(((RTCRayHit*)rayhit)->ray.tnear <= ((RTCRayHit*)rayhit)->ray.tfar))
+          scene->intersectors.intersect(*(RTCRayHit*)rayhit,&context);
+      } 
+      /* normal codepath for single ray streams */
+      else {
+        scene->device->rayStreamFilters.intersectAOS(scene,(RTCRayHit*)rayhit,M,byteStride,&context);
+      }
+    }
+    /* code path for ray packet streams */
+    else {
+      scene->device->rayStreamFilters.intersectSOA(scene,(char*)rayhit,N,M,byteStride,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNM not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Intersection query for N rays passed through an RTCRayHitNp structure.
+     The members of rayhit->ray and rayhit->hit are treated as addresses: in DEBUG
+     builds each one is checked for 4-byte alignment and a misaligned member raises
+     RTC_ERROR_INVALID_ARGUMENT naming that member. The query itself is forwarded
+     to the SOP stream filter. Requires EMBREE_RAY_PACKETS; otherwise
+     RTC_ERROR_INVALID_OPERATION is raised. */
+  RTC_API void rtcIntersectNp (RTCScene hscene, RTCIntersectContext* user_context, const RTCRayHitNp* rayhit, unsigned int N) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcIntersectNp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    /* each message names the member that was actually tested */
+    if (((size_t)rayhit->ray.org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_x not aligned to 4 bytes");
+    if (((size_t)rayhit->ray.org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_y not aligned to 4 bytes");
+    if (((size_t)rayhit->ray.org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.org_z not aligned to 4 bytes");
+    if (((size_t)rayhit->ray.dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_x not aligned to 4 bytes");
+    if (((size_t)rayhit->ray.dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_y not aligned to 4 bytes");
+    if (((size_t)rayhit->ray.dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.dir_z not aligned to 4 bytes");
+    if (((size_t)rayhit->ray.tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tnear not aligned to 4 bytes");
+    if (((size_t)rayhit->ray.tfar  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.tfar not aligned to 4 bytes");
+    if (((size_t)rayhit->ray.time  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.time not aligned to 4 bytes");
+    if (((size_t)rayhit->ray.mask  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->ray.mask not aligned to 4 bytes");
+    if (((size_t)rayhit->hit.Ng_x  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_x not aligned to 4 bytes");
+    if (((size_t)rayhit->hit.Ng_y  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_y not aligned to 4 bytes");
+    if (((size_t)rayhit->hit.Ng_z  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.Ng_z not aligned to 4 bytes");
+    if (((size_t)rayhit->hit.u     ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.u not aligned to 4 bytes");
+    if (((size_t)rayhit->hit.v     ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.v not aligned to 4 bytes");
+    if (((size_t)rayhit->hit.geomID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.geomID not aligned to 4 bytes");
+    if (((size_t)rayhit->hit.primID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.primID not aligned to 4 bytes");
+    if (((size_t)rayhit->hit.instID) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "rayhit->hit.instID not aligned to 4 bytes");
+#endif
+    STAT3(normal.travs,N,N,N);
+    IntersectContext context(scene,user_context);
+    scene->device->rayStreamFilters.intersectSOP(scene,rayhit,N,&context);
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectNp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+  
+  /* Occlusion query for a single ray.
+     In DEBUG builds the scene handle, the committed state of the scene and the
+     16-byte alignment of the ray pointer are validated before the query is
+     forwarded to scene->intersectors.occluded. */
+  RTC_API void rtcOccluded1 (RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded1);
+    STAT3(shadow.travs,1,1,1);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    /* low four address bits must be zero, i.e. 16-byte alignment */
+    if (((size_t)ray) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
+#endif
+    IntersectContext context(scene,user_context);
+    scene->intersectors.occluded(*ray,&context);
+    RTC_CATCH_END2(scene);
+  }
+  
+  /* Occlusion query for a packet of 4 rays with a per-lane validity mask.
+     With EMBREE_RAY_PACKETS the packet is passed to occluded4 as a whole;
+     without it every lane with a non-zero mask entry is extracted, queried as a
+     single ray and written back into the packet. */
+  RTC_API void rtcOccluded4 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay4* ray) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded4);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    /* both the mask and the packet must be 16-byte aligned */
+    if (((size_t)valid) & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 16 bytes");   
+    if (((size_t)ray)   & 0x0F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 16 bytes");   
+#endif
+    /* statistics: count the lanes whose mask entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<4; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(shadow.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    /* fallback: process the packet lane by lane through the single-ray query */
+    Ray4* ray4 = (Ray4*) ray;
+    for (size_t i=0; i<4; i++) {
+      if (!valid[i]) continue;
+      Ray ray1; ray4->get(i,ray1);
+      scene->intersectors.occluded((RTCRay&)ray1,&context);
+      ray4->set(i,ray1);
+    }
+#else
+    scene->intersectors.occluded4(valid,*ray,&context);
+#endif
+    
+    RTC_CATCH_END2(scene);
+  }
+ 
+  /* Occlusion query for a packet of 8 rays with a per-lane validity mask.
+     With EMBREE_RAY_PACKETS the packet goes to occluded8 when
+     scene->intersectors.intersector8 is set, otherwise to the SOA stream filter
+     as one packet of size 8. Without EMBREE_RAY_PACKETS each lane with a
+     non-zero mask entry is queried as a single ray and written back. */
+  RTC_API void rtcOccluded8 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay8* ray) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded8);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    /* both the mask and the packet must be 32-byte aligned */
+    if (((size_t)valid) & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 32 bytes");   
+    if (((size_t)ray)   & 0x1F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 32 bytes");   
+#endif
+    /* statistics: count the lanes whose mask entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<8; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(shadow.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    /* fallback: process the packet lane by lane through the single-ray query */
+    Ray8* ray8 = (Ray8*) ray;
+    for (size_t i=0; i<8; i++) {
+      if (!valid[i]) continue;
+      Ray ray1; ray8->get(i,ray1);
+      scene->intersectors.occluded((RTCRay&)ray1,&context);
+      ray8->set(i,ray1);
+    }
+#else
+    /* NOTE(review): the stream-filter branch does not receive the valid mask - confirm this is intended */
+    if (likely(scene->intersectors.intersector8))
+      scene->intersectors.occluded8(valid,*ray,&context);
+    else
+      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,8,1,sizeof(RTCRay8),&context);
+#endif
+
+    RTC_CATCH_END2(scene);
+  }
+  
+  /* Occlusion query for a packet of 16 rays with a per-lane validity mask.
+     With EMBREE_RAY_PACKETS the packet goes to occluded16 when
+     scene->intersectors.intersector16 is set, otherwise to the SOA stream filter
+     as one packet of size 16. Without EMBREE_RAY_PACKETS each lane with a
+     non-zero mask entry is queried as a single ray and written back. */
+  RTC_API void rtcOccluded16 (const int* valid, RTCScene hscene, RTCIntersectContext* user_context, RTCRay16* ray) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded16);
+
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    /* both the mask and the packet must be 64-byte aligned */
+    if (((size_t)valid) & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 64 bytes");   
+    if (((size_t)ray)   & 0x3F) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 64 bytes");   
+#endif
+    /* statistics: count the lanes whose mask entry equals -1 */
+    STAT(size_t cnt=0; for (size_t i=0; i<16; i++) cnt += ((int*)valid)[i] == -1;);
+    STAT3(shadow.travs,cnt,cnt,cnt);
+
+    IntersectContext context(scene,user_context);
+#if !defined(EMBREE_RAY_PACKETS)
+    /* fallback: process the packet lane by lane through the single-ray query */
+    Ray16* ray16 = (Ray16*) ray;
+    for (size_t i=0; i<16; i++) {
+      if (!valid[i]) continue;
+      Ray ray1; ray16->get(i,ray1);
+      scene->intersectors.occluded((RTCRay&)ray1,&context);
+      ray16->set(i,ray1);
+    }
+#else
+    /* NOTE(review): the stream-filter branch does not receive the valid mask - confirm this is intended */
+    if (likely(scene->intersectors.intersector16))
+      scene->intersectors.occluded16(valid,*ray,&context);
+    else
+      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,16,1,sizeof(RTCRay16),&context);
+#endif
+
+    RTC_CATCH_END2(scene);
+  }
+  
+  /* Occlusion query for a stream of M single rays; byteStride is forwarded to the
+     AOS stream filter. Requires EMBREE_RAY_PACKETS, otherwise
+     RTC_ERROR_INVALID_OPERATION is raised. */
+  RTC_API void rtcOccluded1M(RTCScene hscene, RTCIntersectContext* user_context, RTCRay* ray, unsigned int M, size_t byteStride) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded1M);
+
+#if !defined (EMBREE_RAY_PACKETS)
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1M not supported");
+#else
+#if defined(DEBUG)
+    /* debug-only argument validation */
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");
+#endif
+    STAT3(shadow.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+
+    const bool singleRay = (M == 1);
+    if (likely(singleRay))
+    {
+      /* one ray only: query it directly, and only when tnear <= tfar */
+      RTCRay& only = *ray;
+      if (likely(only.tnear <= only.tfar))
+        scene->intersectors.occluded(only,&context);
+    }
+    else
+    {
+      /* any other stream length goes through the AOS stream filter */
+      scene->device->rayStreamFilters.occludedAOS(scene,ray,M,byteStride,&context);
+    }
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Occlusion query for M rays passed as an array of pointers.
+     Requires EMBREE_RAY_PACKETS, otherwise RTC_ERROR_INVALID_OPERATION is raised. */
+  RTC_API void rtcOccluded1Mp(RTCScene hscene, RTCIntersectContext* user_context, RTCRay** ray, unsigned int M) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccluded1Mp);
+
+#if !defined (EMBREE_RAY_PACKETS)
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccluded1Mp not supported");
+#else
+#if defined(DEBUG)
+    /* debug-only argument validation */
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");
+#endif
+    STAT3(shadow.travs,M,M,M);
+    IntersectContext context(scene,user_context);
+
+    if (likely(M == 1))
+    {
+      /* one ray only: query it directly, and only when tnear <= tfar */
+      RTCRay* const single = ray[0];
+      if (likely(single->tnear <= single->tfar))
+        scene->intersectors.occluded(*single,&context);
+    }
+    else
+    {
+      /* any other stream length goes through the array-of-pointers stream filter */
+      scene->device->rayStreamFilters.occludedAOP(scene,ray,M,&context);
+    }
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Occlusion query for a stream of M ray packets of size N.
+     N == 1 is treated as a stream of single RTCRay structures; any other N is
+     forwarded to the SOA stream filter. Requires EMBREE_RAY_PACKETS, otherwise
+     RTC_ERROR_INVALID_OPERATION is raised.
+
+     DEBUG builds additionally validate the scene handle, the committed state,
+     the 4-byte alignment of the ray pointer and that byteStride can hold at
+     least one RTCRay. */
+  RTC_API void rtcOccludedNM(RTCScene hscene, RTCIntersectContext* user_context, RTCRayN* ray, unsigned int N, unsigned int M, size_t byteStride)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccludedNM);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    /* occlusion streams carry rays without hit data, so the smallest valid
+       element is an RTCRay; comparing against RTCRayHit rejected tightly
+       packed ray streams */
+    if (byteStride < sizeof(RTCRay)) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"byteStride too small");
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    if (((size_t)ray) & 0x03) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "ray not aligned to 4 bytes");
+#endif
+    /* N*M rays are traversed in total, matching the accounting in rtcIntersectNM */
+    STAT3(shadow.travs,N*M,N*M,N*M);
+    IntersectContext context(scene,user_context);
+
+    /* codepath for single rays */
+    if (likely(N == 1))
+    {
+      /* fast path for streams of size 1 */
+      if (likely(M == 1)) {
+        if (likely(((RTCRay*)ray)->tnear <= ((RTCRay*)ray)->tfar))
+          scene->intersectors.occluded (*(RTCRay*)ray,&context);
+      }
+      /* codepath for normal ray streams */
+      else {
+        scene->device->rayStreamFilters.occludedAOS(scene,(RTCRay*)ray,M,byteStride,&context);
+      }
+    }
+    /* code path for ray packet streams */
+    else {
+      scene->device->rayStreamFilters.occludedSOA(scene,(char*)ray,N,M,byteStride,&context);
+    }
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNM not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Occlusion query for N rays passed through an RTCRayNp structure.
+     The members of ray are treated as addresses: in DEBUG builds each one is
+     checked for 4-byte alignment and a misaligned member raises
+     RTC_ERROR_INVALID_ARGUMENT naming that member. The query itself is forwarded
+     to the SOP stream filter. Requires EMBREE_RAY_PACKETS, otherwise
+     RTC_ERROR_INVALID_OPERATION is raised. */
+  RTC_API void rtcOccludedNp(RTCScene hscene, RTCIntersectContext* user_context, const RTCRayNp* ray, unsigned int N)
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcOccludedNp);
+
+#if defined (EMBREE_RAY_PACKETS)
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    if (scene->isModified()) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed");
+    /* each message names the member that was actually tested */
+    if (((size_t)ray->org_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_x not aligned to 4 bytes");
+    if (((size_t)ray->org_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_y not aligned to 4 bytes");
+    if (((size_t)ray->org_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "org_z not aligned to 4 bytes");
+    if (((size_t)ray->dir_x ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_x not aligned to 4 bytes");
+    if (((size_t)ray->dir_y ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_y not aligned to 4 bytes");
+    if (((size_t)ray->dir_z ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "dir_z not aligned to 4 bytes");
+    if (((size_t)ray->tnear ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tnear not aligned to 4 bytes");
+    if (((size_t)ray->tfar  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "tfar not aligned to 4 bytes");
+    if (((size_t)ray->time  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "time not aligned to 4 bytes");
+    if (((size_t)ray->mask  ) & 0x03 ) throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "mask not aligned to 4 bytes");
+#endif
+    STAT3(shadow.travs,N,N,N);
+    IntersectContext context(scene,user_context);
+    scene->device->rayStreamFilters.occludedSOP(scene,ray,N,&context);
+#else
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcOccludedNp not supported");
+#endif
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Validates the scene handle and calls refInc() on the scene. */
+  RTC_API void rtcRetainScene (RTCScene hscene) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainScene);
+    RTC_VERIFY_HANDLE(hscene);
+    scene->refInc();
+    RTC_CATCH_END2(scene);
+  }
+  
+  /* Validates the scene handle and calls refDec() on the scene. */
+  RTC_API void rtcReleaseScene (RTCScene hscene) 
+  {
+    Scene* scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseScene);
+    RTC_VERIFY_HANDLE(hscene);
+    scene->refDec();
+    /* NOTE(review): scene is passed to the catch macro after refDec(); confirm the macro tolerates a scene that was just released */
+    RTC_CATCH_END2(scene);
+  }
+
+  /* Validates both handles and passes the scene to the geometry's
+     setInstancedScene(). The scene is held in a Ref<Scene> for the duration of
+     the call. */
+  RTC_API void rtcSetGeometryInstancedScene(RTCGeometry hgeometry, RTCScene hscene)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    Ref<Scene> scene = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryInstancedScene);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(hscene);
+    geometry->setInstancedScene(scene);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /* Builds an AffineSpace3fa from a float array laid out according to format.
+     Supported layouts are 3x4 row-major, 3x4 column-major and 4x4 column-major
+     (of which only the first three entries of each column are read). Any other
+     format raises RTC_ERROR_INVALID_OPERATION. */
+  AffineSpace3fa loadTransform(RTCFormat format, const float* xfm)
+  {
+    /* starts out as identity; a recognised format overwrites it below */
+    AffineSpace3fa result = one;
+
+    if (format == RTC_FORMAT_FLOAT3X4_ROW_MAJOR)
+    {
+      /* each basis vector is a column of the row-major 3x4 matrix (index step 4) */
+      const Vec3fa vx(xfm[ 0], xfm[ 4], xfm[ 8]);
+      const Vec3fa vy(xfm[ 1], xfm[ 5], xfm[ 9]);
+      const Vec3fa vz(xfm[ 2], xfm[ 6], xfm[10]);
+      const Vec3fa p (xfm[ 3], xfm[ 7], xfm[11]);
+      result = AffineSpace3fa(vx,vy,vz,p);
+    }
+    else if (format == RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR)
+    {
+      /* columns are stored back to back, three floats each */
+      const Vec3fa vx(xfm[ 0], xfm[ 1], xfm[ 2]);
+      const Vec3fa vy(xfm[ 3], xfm[ 4], xfm[ 5]);
+      const Vec3fa vz(xfm[ 6], xfm[ 7], xfm[ 8]);
+      const Vec3fa p (xfm[ 9], xfm[10], xfm[11]);
+      result = AffineSpace3fa(vx,vy,vz,p);
+    }
+    else if (format == RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR)
+    {
+      /* columns are four floats apart; the fourth entry of each column is skipped */
+      const Vec3fa vx(xfm[ 0], xfm[ 1], xfm[ 2]);
+      const Vec3fa vy(xfm[ 4], xfm[ 5], xfm[ 6]);
+      const Vec3fa vz(xfm[ 8], xfm[ 9], xfm[10]);
+      const Vec3fa p (xfm[12], xfm[13], xfm[14]);
+      result = AffineSpace3fa(vx,vy,vz,p);
+    }
+    else
+    {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format");
+    }
+
+    return result;
+  }
+
+  /* Writes an AffineSpace3fa into a float array laid out according to format.
+     3x4 layouts write 12 floats, the 4x4 column-major layout writes 16 floats
+     (with 0,0,0,1 as the fourth entry of the four columns). Any other format
+     raises RTC_ERROR_INVALID_OPERATION. */
+  void storeTransform(const AffineSpace3fa& space, RTCFormat format, float* xfm)
+  {
+    /* short names for the three basis vectors and the translation */
+    const auto& vx = space.l.vx;
+    const auto& vy = space.l.vy;
+    const auto& vz = space.l.vz;
+    const auto& p  = space.p;
+
+    if (format == RTC_FORMAT_FLOAT3X4_ROW_MAJOR)
+    {
+      xfm[ 0] = vx.x;  xfm[ 1] = vy.x;  xfm[ 2] = vz.x;  xfm[ 3] = p.x;
+      xfm[ 4] = vx.y;  xfm[ 5] = vy.y;  xfm[ 6] = vz.y;  xfm[ 7] = p.y;
+      xfm[ 8] = vx.z;  xfm[ 9] = vy.z;  xfm[10] = vz.z;  xfm[11] = p.z;
+    }
+    else if (format == RTC_FORMAT_FLOAT3X4_COLUMN_MAJOR)
+    {
+      xfm[ 0] = vx.x;  xfm[ 1] = vx.y;  xfm[ 2] = vx.z;
+      xfm[ 3] = vy.x;  xfm[ 4] = vy.y;  xfm[ 5] = vy.z;
+      xfm[ 6] = vz.x;  xfm[ 7] = vz.y;  xfm[ 8] = vz.z;
+      xfm[ 9] = p.x;   xfm[10] = p.y;   xfm[11] = p.z;
+    }
+    else if (format == RTC_FORMAT_FLOAT4X4_COLUMN_MAJOR)
+    {
+      xfm[ 0] = vx.x;  xfm[ 1] = vx.y;  xfm[ 2] = vx.z;  xfm[ 3] = 0.f;
+      xfm[ 4] = vy.x;  xfm[ 5] = vy.y;  xfm[ 6] = vy.z;  xfm[ 7] = 0.f;
+      xfm[ 8] = vz.x;  xfm[ 9] = vz.y;  xfm[10] = vz.z;  xfm[11] = 0.f;
+      xfm[12] = p.x;   xfm[13] = p.y;   xfm[14] = p.z;   xfm[15] = 1.f;
+    }
+    else
+    {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid matrix format");
+    }
+  }
+
+  /* Decodes the matrix at xfm according to format and stores it on the geometry
+     for the given time step. Both the geometry handle and xfm must be non-null. */
+  RTC_API void rtcSetGeometryTransform(RTCGeometry hgeometry, unsigned int timeStep, RTCFormat format, const void* xfm)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTransform);
+
+    /* reject null arguments before touching them */
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(xfm);
+
+    const float* const matrix = (const float*) xfm;
+    geometry->setTransform(loadTransform(format, matrix), timeStep);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /* Packs an RTCQuaternionDecomposition into an AffineSpace3fx and stores it on
+     the geometry for the given time step via setQuaternionDecomposition().
+     The quaternion is normalized before it is stored in the w components. */
+  RTC_API void rtcSetGeometryTransformQuaternion(RTCGeometry hgeometry, unsigned int timeStep, const RTCQuaternionDecomposition* qd)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTransformQuaternion);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(qd);
+
+    AffineSpace3fx packed;
+
+    /* first basis vector: scale_x plus two translation components */
+    packed.l.vx.x = qd->scale_x;
+    packed.l.vx.y = qd->translation_x;
+    packed.l.vx.z = qd->translation_y;
+
+    /* second basis vector: skew_xy, scale_y and the last translation component */
+    packed.l.vy.x = qd->skew_xy;
+    packed.l.vy.y = qd->scale_y;
+    packed.l.vy.z = qd->translation_z;
+
+    /* third basis vector: remaining skew terms and scale_z */
+    packed.l.vz.x = qd->skew_xz;
+    packed.l.vz.y = qd->skew_yz;
+    packed.l.vz.z = qd->scale_z;
+
+    /* the affine part carries the shift */
+    packed.p.x = qd->shift_x;
+    packed.p.y = qd->shift_y;
+    packed.p.z = qd->shift_z;
+
+    /* the normalized quaternion is spread over the four w components */
+    const Quaternion3f unit = normalize(Quaternion3f(qd->quaternion_r, qd->quaternion_i, qd->quaternion_j, qd->quaternion_k));
+    packed.l.vx.w = unit.i;
+    packed.l.vy.w = unit.j;
+    packed.l.vz.w = unit.k;
+    packed.p.w    = unit.r;
+
+    geometry->setQuaternionDecomposition(packed, timeStep);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /* Queries the geometry's transform at the given time and writes it to xfm in
+     the requested format (see storeTransform for the number of floats written).
+     A null geometry handle or a null xfm pointer is rejected through
+     RTC_VERIFY_HANDLE, as in rtcSetGeometryTransform, instead of being
+     dereferenced. */
+  RTC_API void rtcGetGeometryTransform(RTCGeometry hgeometry, float time, RTCFormat format, void* xfm)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryTransform);
+    /* both pointers are used below, so validate them first */
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(xfm);
+    const AffineSpace3fa transform = geometry->getTransform(time);
+    storeTransform(transform, format, (float*)xfm);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /* Reinterprets the public argument block as the internal
+     IntersectFunctionNArguments and invokes its report callback with the
+     filter arguments. */
+  RTC_API void rtcFilterIntersection(const struct RTCIntersectFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
+  {
+    IntersectFunctionNArguments* const internalArgs = (IntersectFunctionNArguments*) args_i;
+    internalArgs->report(internalArgs, filter_args);
+  }
+
+  /* Reinterprets the public argument block as the internal
+     OccludedFunctionNArguments and invokes its report callback with the
+     filter arguments. */
+  RTC_API void rtcFilterOcclusion(const struct RTCOccludedFunctionNArguments* const args_i, const struct RTCFilterFunctionNArguments* filter_args)
+  {
+    OccludedFunctionNArguments* const internalArgs = (OccludedFunctionNArguments*) args_i;
+    internalArgs->report(internalArgs, filter_args);
+  }
+  
+  /* Creates a geometry of the requested type on the given device.
+     For every supported type a factory function pointer is resolved through a
+     SELECT_SYMBOL_* macro using the device's CPU feature flags, the geometry is
+     created, and the result of refInc() is returned as the handle.
+     Each geometry family is compiled in only when its EMBREE_GEOMETRY_* macro is
+     defined; otherwise, and for unknown types, RTC_ERROR_UNKNOWN is raised.
+     The trailing "return nullptr" follows RTC_CATCH_END(device), i.e. it is the
+     value produced when the body did not return a geometry. */
+  RTC_API RTCGeometry rtcNewGeometry (RTCDevice hdevice, RTCGeometryType type)
+  {
+    Device* device = (Device*) hdevice;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcNewGeometry);
+    RTC_VERIFY_HANDLE(hdevice);
+
+    switch (type)
+    {
+    case RTC_GEOMETRY_TYPE_TRIANGLE:
+    {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+      createTriangleMeshTy createTriangleMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createTriangleMesh);
+      Geometry* geom = createTriangleMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_TRIANGLE is not supported");
+#endif
+    }
+    
+    case RTC_GEOMETRY_TYPE_QUAD:
+    {
+#if defined(EMBREE_GEOMETRY_QUAD)
+      createQuadMeshTy createQuadMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createQuadMesh);
+      Geometry* geom = createQuadMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_QUAD is not supported");
+#endif
+    }
+    
+    /* all point types share one factory, distinguished by the Geometry::GTY_* tag */
+    case RTC_GEOMETRY_TYPE_SPHERE_POINT:
+    case RTC_GEOMETRY_TYPE_DISC_POINT:
+    case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT:
+    {
+#if defined(EMBREE_GEOMETRY_POINT)
+      createPointsTy createPoints = nullptr;
+      /* NOTE(review): this is the only case selecting on enabled_builder_cpu_features rather than enabled_cpu_features - confirm intended */
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_builder_cpu_features, createPoints);
+
+      Geometry *geom;
+      switch(type) {
+        case RTC_GEOMETRY_TYPE_SPHERE_POINT:
+          geom = createPoints(device, Geometry::GTY_SPHERE_POINT);
+          break;
+        case RTC_GEOMETRY_TYPE_DISC_POINT:
+          geom = createPoints(device, Geometry::GTY_DISC_POINT);
+          break;
+        case RTC_GEOMETRY_TYPE_ORIENTED_DISC_POINT:
+          geom = createPoints(device, Geometry::GTY_ORIENTED_DISC_POINT);
+          break;
+        default:
+          /* not reachable from the enclosing case labels; a null geom would be dereferenced below */
+          geom = nullptr;
+          break;
+      }
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_POINT is not supported");
+#endif
+    }
+
+    /* all curve types: linear curves use createLineSegments, the others createCurves */
+    case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE:
+    case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE:
+      
+    case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE:
+      
+    case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE:
+
+    case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE:
+
+    case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE:
+    case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE:
+    case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE:
+    {
+#if defined(EMBREE_GEOMETRY_CURVE)
+      createLineSegmentsTy createLineSegments = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createLineSegments);
+      createCurvesTy createCurves = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createCurves);
+      
+      Geometry* geom;
+      switch (type) {
+      case RTC_GEOMETRY_TYPE_CONE_LINEAR_CURVE             : geom = createLineSegments (device,Geometry::GTY_CONE_LINEAR_CURVE); break;
+      case RTC_GEOMETRY_TYPE_ROUND_LINEAR_CURVE            : geom = createLineSegments (device,Geometry::GTY_ROUND_LINEAR_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE             : geom = createLineSegments (device,Geometry::GTY_FLAT_LINEAR_CURVE); break;
+      //case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_LINEAR_CURVE  : geom = createLineSegments (device,Geometry::GTY_ORIENTED_LINEAR_CURVE); break;
+        
+      case RTC_GEOMETRY_TYPE_ROUND_BEZIER_CURVE            : geom = createCurves(device,Geometry::GTY_ROUND_BEZIER_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_BEZIER_CURVE             : geom = createCurves(device,Geometry::GTY_FLAT_BEZIER_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BEZIER_CURVE  : geom = createCurves(device,Geometry::GTY_ORIENTED_BEZIER_CURVE); break;
+        
+      case RTC_GEOMETRY_TYPE_ROUND_BSPLINE_CURVE           : geom = createCurves(device,Geometry::GTY_ROUND_BSPLINE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_BSPLINE_CURVE            : geom = createCurves(device,Geometry::GTY_FLAT_BSPLINE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_BSPLINE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_BSPLINE_CURVE); break;
+        
+      case RTC_GEOMETRY_TYPE_ROUND_HERMITE_CURVE           : geom = createCurves(device,Geometry::GTY_ROUND_HERMITE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE            : geom = createCurves(device,Geometry::GTY_FLAT_HERMITE_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_HERMITE_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_HERMITE_CURVE); break;
+
+      case RTC_GEOMETRY_TYPE_ROUND_CATMULL_ROM_CURVE           : geom = createCurves(device,Geometry::GTY_ROUND_CATMULL_ROM_CURVE); break;
+      case RTC_GEOMETRY_TYPE_FLAT_CATMULL_ROM_CURVE            : geom = createCurves(device,Geometry::GTY_FLAT_CATMULL_ROM_CURVE); break;
+      case RTC_GEOMETRY_TYPE_NORMAL_ORIENTED_CATMULL_ROM_CURVE : geom = createCurves(device,Geometry::GTY_ORIENTED_CATMULL_ROM_CURVE); break;
+      /* not reachable from the enclosing case labels; a null geom would be dereferenced below */
+      default:                                    geom = nullptr; break;
+      }
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_CURVE is not supported");
+#endif
+    }
+    
+    case RTC_GEOMETRY_TYPE_SUBDIVISION:
+    {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+      createSubdivMeshTy createSubdivMesh = nullptr;
+      /* subdivision meshes use the narrower DEFAULT/AVX selection; see the FIXME on the disabled line below */
+      SELECT_SYMBOL_DEFAULT_AVX(device->enabled_cpu_features,createSubdivMesh);
+      //SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createSubdivMesh); // FIXME: this does not work for some reason?
+      Geometry* geom = createSubdivMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_SUBDIVISION is not supported");
+#endif
+    }
+    
+    case RTC_GEOMETRY_TYPE_USER:
+    {
+#if defined(EMBREE_GEOMETRY_USER)
+      createUserGeometryTy createUserGeometry = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createUserGeometry);
+      Geometry* geom = createUserGeometry(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_USER is not supported");
+#endif
+    }
+
+    case RTC_GEOMETRY_TYPE_INSTANCE:
+    {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+      createInstanceTy createInstance = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createInstance);
+      Geometry* geom = createInstance(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_INSTANCE is not supported");
+#endif
+    }
+
+    case RTC_GEOMETRY_TYPE_GRID:
+    {
+#if defined(EMBREE_GEOMETRY_GRID)
+      createGridMeshTy createGridMesh = nullptr;
+      SELECT_SYMBOL_DEFAULT_AVX_AVX2_AVX512KNL_AVX512SKX(device->enabled_cpu_features,createGridMesh);
+      Geometry* geom = createGridMesh(device);
+      return (RTCGeometry) geom->refInc();
+#else
+      throw_RTCError(RTC_ERROR_UNKNOWN,"RTC_GEOMETRY_TYPE_GRID is not supported");
+#endif
+    }
+    
+    default:
+      throw_RTCError(RTC_ERROR_UNKNOWN,"invalid geometry type");
+    }
+    
+    RTC_CATCH_END(device);
+    return nullptr;
+  }
+
+  /* Sets the primitive count of a user geometry. Geometries whose type is not
+     Geometry::GTY_USER_GEOMETRY are rejected with RTC_ERROR_INVALID_OPERATION. */
+  RTC_API void rtcSetGeometryUserPrimitiveCount(RTCGeometry hgeometry, unsigned int userPrimitiveCount)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryUserPrimitiveCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    const bool wrongType = geometry->getType() != Geometry::GTY_USER_GEOMETRY;
+    if (unlikely(wrongType)) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"operation only allowed for user geometries");
+    }
+
+    geometry->setNumPrimitives(userPrimitiveCount);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /* Sets the number of time steps of a geometry. Counts above
+     RTC_MAX_TIME_STEP_COUNT are rejected with RTC_ERROR_INVALID_ARGUMENT. */
+  RTC_API void rtcSetGeometryTimeStepCount(RTCGeometry hgeometry, unsigned int timeStepCount)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTimeStepCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    const bool tooMany = timeStepCount > RTC_MAX_TIME_STEP_COUNT;
+    if (tooMany) {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"number of time steps is out of range");
+    }
+
+    geometry->setNumTimeSteps(timeStepCount);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /* Sets the time range [startTime,endTime] of a geometry. A start time greater
+     than the end time is rejected with RTC_ERROR_INVALID_ARGUMENT. */
+  RTC_API void rtcSetGeometryTimeRange(RTCGeometry hgeometry, float startTime, float endTime)
+  {
+    Ref<Geometry> geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTimeRange);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    /* deliberately written as '>' so the comparison result is unchanged for all float inputs */
+    const bool reversed = startTime > endTime;
+    if (reversed) {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"startTime has to be smaller or equal to the endTime");
+    }
+
+    const BBox1f range(startTime,endTime);
+    geometry->setTimeRange(range);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /* Validates the geometry handle and forwards N to the geometry's
+     setVertexAttributeCount(). */
+  RTC_API void rtcSetGeometryVertexAttributeCount(RTCGeometry hgeometry, unsigned int N)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryVertexAttributeCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setVertexAttributeCount(N);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /* Validates the geometry handle and forwards N to the geometry's
+     setTopologyCount(). */
+  RTC_API void rtcSetGeometryTopologyCount(RTCGeometry hgeometry, unsigned int N)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTopologyCount);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setTopologyCount(N);
+    RTC_CATCH_END2(geometry);
+  }
+ 
+  /* Sets the build quality of a geometry. Only LOW, MEDIUM, HIGH and REFIT are
+     accepted; any other value terminates the process through abort() (the
+     original exception was replaced by the GODOT patch below). */
+  RTC_API void rtcSetGeometryBuildQuality (RTCGeometry hgeometry, RTCBuildQuality quality) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryBuildQuality);
+    RTC_VERIFY_HANDLE(hgeometry);
+    /* the if below has no braces: its body is the abort() call that follows the comment lines */
+    if (quality != RTC_BUILD_QUALITY_LOW &&
+        quality != RTC_BUILD_QUALITY_MEDIUM &&
+        quality != RTC_BUILD_QUALITY_HIGH &&
+        quality != RTC_BUILD_QUALITY_REFIT)
+      // -- GODOT start --
+      // throw std::runtime_error("invalid build quality");
+      abort();
+      // -- GODOT end --
+    geometry->setBuildQuality(quality);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /* Sets the maximal radius scale of a geometry. Only available when
+     RTC_MIN_WIDTH is enabled; values below 1 are rejected. Both failure cases
+     raise RTC_ERROR_INVALID_OPERATION. */
+  RTC_API void rtcSetGeometryMaxRadiusScale(RTCGeometry hgeometry, float maxRadiusScale)
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryMaxRadiusScale);
+    RTC_VERIFY_HANDLE(hgeometry);
+#if !RTC_MIN_WIDTH
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"min-width feature is not enabled");
+#else
+    const bool belowOne = maxRadiusScale < 1.0f;
+    if (belowOne) {
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"maximal radius scale has to be larger or equal to 1");
+    }
+    geometry->setMaxRadiusScale(maxRadiusScale);
+#endif
+    RTC_CATCH_END2(geometry);
+  }
+  
+  /* Validates the geometry handle and forwards mask to the geometry's setMask(). */
+  RTC_API void rtcSetGeometryMask (RTCGeometry hgeometry, unsigned int mask) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryMask);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setMask(mask);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /* Validates the geometry handle and forwards the topology ID and subdivision
+     mode to the geometry's setSubdivisionMode(). */
+  RTC_API void rtcSetGeometrySubdivisionMode (RTCGeometry hgeometry, unsigned topologyID, RTCSubdivisionMode mode) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometrySubdivisionMode);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setSubdivisionMode(topologyID,mode);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /*! API entry point: calls setVertexAttributeTopology(vertexAttributeID,
+   *  topologyID) on the geometry behind the handle. */
+  RTC_API void rtcSetGeometryVertexAttributeTopology(RTCGeometry hgeometry, unsigned int vertexAttributeID, unsigned int topologyID)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryVertexAttributeTopology);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->setVertexAttributeTopology(vertexAttributeID, topologyID);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: binds an existing buffer object to a slot of the
+   *  geometry via setBuffer(). Both handles must be non-null, both objects
+   *  must share one device, and itemCount must fit into 32 bits. */
+  RTC_API void rtcSetGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, RTCBuffer hbuffer, size_t byteOffset, size_t byteStride, size_t itemCount)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    Ref<Buffer> buf = (Buffer*)hbuffer;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_HANDLE(hbuffer);
+
+    /* geometry and buffer have to come from one device */
+    if (geom->device != buf->device) {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+    }
+
+    /* the item count is narrowed to unsigned int below */
+    if (itemCount > 0xFFFFFFFFu) {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+    }
+
+    const unsigned int count = (unsigned int)itemCount;
+    geom->setBuffer(type, slot, format, buf, byteOffset, byteStride, count);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: wraps caller-provided memory (ptr + byteOffset,
+   *  itemCount*byteStride bytes) in a new Buffer and binds it to a slot of
+   *  the geometry. itemCount must fit into 32 bits. */
+  RTC_API void rtcSetSharedGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, const void* ptr, size_t byteOffset, size_t byteStride, size_t itemCount)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetSharedGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    if (itemCount > 0xFFFFFFFFu) {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+    }
+
+    /* the offset is folded into the start pointer, so setBuffer gets offset 0 */
+    const size_t numBytes = itemCount*byteStride;
+    char* const start = (char*)ptr + byteOffset;
+    Ref<Buffer> buf = new Buffer(geom->device, numBytes, start);
+    geom->setBuffer(type, slot, format, buf, 0, byteStride, (unsigned int)itemCount);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: allocates a new Buffer of itemCount*byteStride bytes
+   *  (plus padding for vertex data), binds it to a slot of the geometry and
+   *  returns the buffer's data pointer. itemCount must fit into 32 bits. */
+  RTC_API void* rtcSetNewGeometryBuffer(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot, RTCFormat format, size_t byteStride, size_t itemCount)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetNewGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    if (itemCount > 0xFFFFFFFFu) {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"buffer too large");
+    }
+
+    /* vertex buffers need to get overallocated slightly as elements are accessed using SSE loads */
+    const bool isVertexData =
+      (type == RTC_BUFFER_TYPE_VERTEX) || (type == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE);
+    const size_t padding = isVertexData ? (16 - (byteStride%16))%16 : 0;
+    const size_t bytes = itemCount*byteStride + padding;
+
+    Ref<Buffer> buf = new Buffer(geom->device, bytes);
+    geom->setBuffer(type, slot, format, buf, 0, byteStride, (unsigned int)itemCount);
+    return buf->data();
+    RTC_CATCH_END2(geom);
+    return nullptr;
+  }
+
+  /*! API entry point: returns whatever getBuffer(type, slot) yields for
+   *  the geometry behind the handle. */
+  RTC_API void* rtcGetGeometryBufferData(RTCGeometry hgeometry, RTCBufferType type, unsigned int slot)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryBufferData);
+    RTC_VERIFY_HANDLE(hgeometry);
+    return geom->getBuffer(type, slot);
+    RTC_CATCH_END2(geom);
+    return nullptr;
+  }
+  
+  /*! API entry point: calls enable() on the geometry behind the handle
+   *  after checking the handle for null. */
+  RTC_API void rtcEnableGeometry (RTCGeometry hgeometry)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcEnableGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->enable();
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: calls updateBuffer(type, slot) on the geometry
+   *  behind the handle after checking the handle for null. */
+  RTC_API void rtcUpdateGeometryBuffer (RTCGeometry hgeometry, RTCBufferType type, unsigned int slot)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcUpdateGeometryBuffer);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->updateBuffer(type, slot);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: calls disable() on the geometry behind the handle
+   *  after checking the handle for null. */
+  RTC_API void rtcDisableGeometry (RTCGeometry hgeometry)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcDisableGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->disable();
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: calls setTessellationRate(tessellationRate) on the
+   *  geometry behind the handle after checking the handle for null. */
+  RTC_API void rtcSetGeometryTessellationRate (RTCGeometry hgeometry, float tessellationRate)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryTessellationRate);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->setTessellationRate(tessellationRate);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: stores the caller's pointer on the geometry through
+   *  setUserData(ptr) after checking the handle for null. */
+  RTC_API void rtcSetGeometryUserData (RTCGeometry hgeometry, void* ptr)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryUserData);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->setUserData(ptr);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: returns the result of getUserData() for the
+   *  geometry behind the handle. */
+  RTC_API void* rtcGetGeometryUserData (RTCGeometry hgeometry)
+  {
+    /* plain pointer on purpose: no ref counting here! */
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryUserData);
+    RTC_VERIFY_HANDLE(hgeometry);
+    return geom->getUserData();
+    RTC_CATCH_END2(geom);
+    return nullptr;
+  }
+
+  /*! API entry point: registers the bounds callback and its user pointer
+   *  on the geometry through setBoundsFunction(bounds, userPtr). */
+  RTC_API void rtcSetGeometryBoundsFunction (RTCGeometry hgeometry, RTCBoundsFunction bounds, void* userPtr)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryBoundsFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->setBoundsFunction(bounds,userPtr);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: registers the displacement callback on the geometry
+   *  through setDisplacementFunction(displacement). */
+  RTC_API void rtcSetGeometryDisplacementFunction (RTCGeometry hgeometry, RTCDisplacementFunctionN displacement)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryDisplacementFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->setDisplacementFunction(displacement);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: registers the intersect callback on the geometry
+   *  through setIntersectFunctionN(intersect). */
+  RTC_API void rtcSetGeometryIntersectFunction (RTCGeometry hgeometry, RTCIntersectFunctionN intersect)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryIntersectFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->setIntersectFunctionN(intersect);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: registers the point-query callback on the geometry
+   *  through setPointQueryFunction(pointQuery). */
+  RTC_API void rtcSetGeometryPointQueryFunction(RTCGeometry hgeometry, RTCPointQueryFunction pointQuery)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryPointQueryFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->setPointQueryFunction(pointQuery);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: returns getFirstHalfEdge(faceID) of the geometry
+   *  behind the handle. The handle is not verified here. */
+  RTC_API unsigned int rtcGetGeometryFirstHalfEdge(RTCGeometry hgeometry, unsigned int faceID)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryFirstHalfEdge);
+    return geom->getFirstHalfEdge(faceID);
+    RTC_CATCH_END2(geom);
+    return -1;
+  }
+
+  /*! API entry point: returns getFace(edgeID) of the geometry behind the
+   *  handle. The handle is not verified here. */
+  RTC_API unsigned int rtcGetGeometryFace(RTCGeometry hgeometry, unsigned int edgeID)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryFace);
+    return geom->getFace(edgeID);
+    RTC_CATCH_END2(geom);
+    return -1;
+  }
+
+  /*! API entry point: returns getNextHalfEdge(edgeID) of the geometry
+   *  behind the handle. The handle is not verified here. */
+  RTC_API unsigned int rtcGetGeometryNextHalfEdge(RTCGeometry hgeometry, unsigned int edgeID)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryNextHalfEdge);
+    return geom->getNextHalfEdge(edgeID);
+    RTC_CATCH_END2(geom);
+    return -1;
+  }
+
+  /*! API entry point: returns getPreviousHalfEdge(edgeID) of the geometry
+   *  behind the handle. The handle is not verified here. */
+  RTC_API unsigned int rtcGetGeometryPreviousHalfEdge(RTCGeometry hgeometry, unsigned int edgeID)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryPreviousHalfEdge);
+    return geom->getPreviousHalfEdge(edgeID);
+    RTC_CATCH_END2(geom);
+    return -1;
+  }
+
+  /*! API entry point: returns getOppositeHalfEdge(topologyID, edgeID) of
+   *  the geometry behind the handle. The handle is not verified here. */
+  RTC_API unsigned int rtcGetGeometryOppositeHalfEdge(RTCGeometry hgeometry, unsigned int topologyID, unsigned int edgeID)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometryOppositeHalfEdge);
+    return geom->getOppositeHalfEdge(topologyID,edgeID);
+    RTC_CATCH_END2(geom);
+    return -1;
+  }
+
+  /*! API entry point: registers the occluded callback on the geometry
+   *  through setOccludedFunctionN(occluded). A null handle is reported
+   *  through RTC_VERIFY_HANDLE. */
+  RTC_API void rtcSetGeometryOccludedFunction (RTCGeometry hgeometry, RTCOccludedFunctionN occluded) 
+  {
+    Geometry* geometry = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    /* trace under this function's own API name, like every other entry
+     * point; the previous argument named a function that is not this one */
+    RTC_TRACE(rtcSetGeometryOccludedFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geometry->setOccludedFunctionN(occluded);
+    RTC_CATCH_END2(geometry);
+  }
+
+  /*! API entry point: registers the intersection filter callback on the
+   *  geometry through setIntersectionFilterFunctionN(filter). */
+  RTC_API void rtcSetGeometryIntersectFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryIntersectFilterFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->setIntersectionFilterFunctionN(filter);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: registers the occlusion filter callback on the
+   *  geometry through setOcclusionFilterFunctionN(filter). */
+  RTC_API void rtcSetGeometryOccludedFilterFunction (RTCGeometry hgeometry, RTCFilterFunctionN filter)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcSetGeometryOccludedFilterFunction);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->setOcclusionFilterFunctionN(filter);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: hands the argument block to interpolate() of the
+   *  geometry named in args->geometry. The geometry handle is only
+   *  verified when DEBUG is defined. */
+  RTC_API void rtcInterpolate(const RTCInterpolateArguments* const args)
+  {
+    Geometry* const geom = (Geometry*) args->geometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcInterpolate);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(args->geometry);
+#endif
+    geom->interpolate(args);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: hands the argument block to interpolateN() of the
+   *  geometry named in args->geometry. The geometry handle is only
+   *  verified when DEBUG is defined. */
+  RTC_API void rtcInterpolateN(const RTCInterpolateNArguments* const args)
+  {
+    Geometry* const geom = (Geometry*) args->geometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcInterpolateN);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(args->geometry);
+#endif
+    geom->interpolateN(args);
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: calls commit() on the geometry behind the handle
+   *  after checking the handle for null. */
+  RTC_API void rtcCommitGeometry (RTCGeometry hgeometry)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcCommitGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->commit();
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: binds the geometry to the scene with
+   *  RTC_INVALID_GEOMETRY_ID as the requested ID and returns the value
+   *  produced by Scene::bind. Both handles must be non-null and belong to
+   *  the same device. */
+  RTC_API unsigned int rtcAttachGeometry (RTCScene hscene, RTCGeometry hgeometry)
+  {
+    Scene* const scn = (Scene*) hscene;
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcAttachGeometry);
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_HANDLE(hgeometry);
+
+    /* scene and geometry have to come from one device */
+    if (scn->device != geom->device) {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+    }
+
+    return scn->bind(RTC_INVALID_GEOMETRY_ID,geom);
+    RTC_CATCH_END2(scn);
+    return -1;
+  }
+
+  /*! API entry point: binds the geometry to the scene under the caller's
+   *  geomID. Both handles must be non-null and share one device, and
+   *  geomID must not be RTC_INVALID_GEOMETRY_ID. */
+  RTC_API void rtcAttachGeometryByID (RTCScene hscene, RTCGeometry hgeometry, unsigned int geomID)
+  {
+    Scene* const scn = (Scene*) hscene;
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcAttachGeometryByID);
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_HANDLE(hgeometry);
+    RTC_VERIFY_GEOMID(geomID);
+
+    /* scene and geometry have to come from one device */
+    if (scn->device != geom->device) {
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"inputs are from different devices");
+    }
+
+    scn->bind(geomID,geom);
+    RTC_CATCH_END2(scn);
+  }
+  
+  /*! API entry point: calls detachGeometry(geomID) on the scene. The
+   *  scene handle must be non-null and geomID must not be
+   *  RTC_INVALID_GEOMETRY_ID. */
+  RTC_API void rtcDetachGeometry (RTCScene hscene, unsigned int geomID)
+  {
+    Scene* const scn = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcDetachGeometry);
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_GEOMID(geomID);
+    scn->detachGeometry(geomID);
+    RTC_CATCH_END2(scn);
+  }
+
+  /*! API entry point: calls refInc() on the geometry behind the handle
+   *  after checking the handle for null. */
+  RTC_API void rtcRetainGeometry (RTCGeometry hgeometry)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcRetainGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->refInc();
+    RTC_CATCH_END2(geom);
+  }
+  
+  /*! API entry point: calls refDec() on the geometry behind the handle
+   *  after checking the handle for null. */
+  RTC_API void rtcReleaseGeometry (RTCGeometry hgeometry)
+  {
+    Geometry* const geom = (Geometry*) hgeometry;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcReleaseGeometry);
+    RTC_VERIFY_HANDLE(hgeometry);
+    geom->refDec();
+    RTC_CATCH_END2(geom);
+  }
+
+  /*! API entry point: returns scene->get(geomID) cast to an RTCGeometry
+   *  handle. The scene handle and geomID are only verified when DEBUG is
+   *  defined. */
+  RTC_API RTCGeometry rtcGetGeometry (RTCScene hscene, unsigned int geomID)
+  {
+    Scene* const scn = (Scene*) hscene;
+    RTC_CATCH_BEGIN;
+    RTC_TRACE(rtcGetGeometry);
+#if defined(DEBUG)
+    RTC_VERIFY_HANDLE(hscene);
+    RTC_VERIFY_GEOMID(geomID);
+#endif
+    return (RTCGeometry) scn->get(geomID);
+    RTC_CATCH_END2(scn);
+    return nullptr;
+  }
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.h b/thirdparty/embree-aarch64/kernels/common/rtcore.h
new file mode 100644
index 0000000000..4b070e122b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/rtcore.h
@@ -0,0 +1,142 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../include/embree3/rtcore.h"
+RTC_NAMESPACE_USE
+
+namespace embree
+{  
+  /*! decoding of intersection flags */
+  /*! true when the COHERENT bit of the flags is set */
+  __forceinline bool isCoherent  (RTCIntersectContextFlags flags) {
+    const auto coherencyBit = flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT;
+    return coherencyBit == RTC_INTERSECT_CONTEXT_FLAG_COHERENT;
+  }
+  /*! true when the flags, masked with the COHERENT bit, equal the INCOHERENT value */
+  __forceinline bool isIncoherent(RTCIntersectContextFlags flags) {
+    const auto coherencyBit = flags & RTC_INTERSECT_CONTEXT_FLAG_COHERENT;
+    return coherencyBit == RTC_INTERSECT_CONTEXT_FLAG_INCOHERENT;
+  }
+
+/* USE_TASK_ARENA is 1 only when the TBB tasking backend is selected
+ * (TASKING_TBB) and TBB_INTERFACE_VERSION_MAJOR is at least 8; otherwise 0. */
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR >= 8)
+#  define USE_TASK_ARENA 1
+#else
+#  define USE_TASK_ARENA 0
+#endif
+
+/* TASKING_TBB_USE_TASK_ISOLATION is 1 only when the TBB tasking backend is
+ * selected and TBB_INTERFACE_VERSION is at least 11009; otherwise 0. */
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION >= 11009) // TBB 2019 Update 9
+#  define TASKING_TBB_USE_TASK_ISOLATION 1
+#else
+#  define TASKING_TBB_USE_TASK_ISOLATION 0
+#endif
+
+/*! Macros used in the rtcore API implementation */
+// -- GODOT start --
+// The upstream try/catch wrappers are kept below as comments for reference.
+// The active definitions expand to nothing (RTC_CATCH_END2_FALSE expands to
+// a plain `return false;`), so statements written after these macros in an
+// API function simply follow the function body.
+// #define RTC_CATCH_BEGIN try {
+#define RTC_CATCH_BEGIN
+  
+// #define RTC_CATCH_END(device)                                                \
+//   } catch (std::bad_alloc&) {                                                   \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+//   } catch (rtcore_error& e) {                                                   \
+//     Device::process_error(device,e.error,e.what());                             \
+//   } catch (std::exception& e) {                                                 \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+//   } catch (...) {                                                               \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//   }
+#define RTC_CATCH_END(device)
+  
+// #define RTC_CATCH_END2(scene)                                                \
+//   } catch (std::bad_alloc&) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+//   } catch (rtcore_error& e) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,e.error,e.what());                             \
+//   } catch (std::exception& e) {                                                 \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+//   } catch (...) {                                                               \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//   }
+#define RTC_CATCH_END2(scene)
+
+// #define RTC_CATCH_END2_FALSE(scene)                                             \
+//   } catch (std::bad_alloc&) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+//     return false;                                                               \
+//   } catch (rtcore_error& e) {                                                   \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,e.error,e.what());                             \
+//     return false;                                                               \
+//   } catch (std::exception& e) {                                                 \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+//     return false;                                                               \
+//   } catch (...) {                                                               \
+//     Device* device = scene ? scene->device : nullptr;                           \
+//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+//     return false;                                                               \
+//   }
+// NOTE(review): this replacement returns false unconditionally at the point
+// of use, whereas the upstream version above returned false only from its
+// catch handlers - confirm that every user is written with that in mind.
+#define RTC_CATCH_END2_FALSE(scene) return false;
+// -- GODOT end --
+
+/*! Reports RTC_ERROR_INVALID_ARGUMENT through throw_RTCError when the
+ *  handle is a null pointer. */
+#define RTC_VERIFY_HANDLE(handle)                               \
+  if (handle == nullptr) {                                         \
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+  }
+
+/*! Reports RTC_ERROR_INVALID_ARGUMENT through throw_RTCError when the ID
+ *  equals RTC_INVALID_GEOMETRY_ID. */
+#define RTC_VERIFY_GEOMID(id)                                   \
+  if (id == RTC_INVALID_GEOMETRY_ID) {                             \
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+  }
+
+/*! Reports RTC_ERROR_INVALID_ARGUMENT through throw_RTCError when id is
+ *  greater than upper (upper itself is accepted). */
+#define RTC_VERIFY_UPPER(id,upper)                              \
+  if (id > upper) {                                                \
+    throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"invalid argument"); \
+  }
+
+/*! Reports RTC_ERROR_INVALID_OPERATION through throw_RTCError when id lies
+ *  outside the inclusive range [lower, upper].
+ *  NOTE(review): unlike the macros above this one is an unbraced `if`, so
+ *  it must not be followed by an `else` at the point of use. */
+#define RTC_VERIFY_RANGE(id,lower,upper)	\
+  if (id < lower || id > upper)						  \
+    throw_RTCError(RTC_ERROR_INVALID_OPERATION,"argument out of bounds");
+  
+/* RTC_TRACE(x) prints the stringified token x to std::cout when the `#if 0`
+ * below is switched on; as shipped it expands to nothing. The argument is
+ * only stringified, so it is never checked against a real symbol. */
+#if 0 // enable to debug print all API calls
+#define RTC_TRACE(x) std::cout << #x << std::endl;
+#else
+#define RTC_TRACE(x) 
+#endif
+
+// -- GODOT begin --
+//   /*! used to throw embree API errors */
+//   struct rtcore_error : public std::exception
+//   {
+//     __forceinline rtcore_error(RTCError error, const std::string& str)
+//       : error(error), str(str) {}
+//     
+//     ~rtcore_error() throw() {}
+//     
+//     const char* what () const throw () {
+//       return str.c_str();
+//     }
+//     
+//     RTCError error;
+//     std::string str;
+//   };
+// -- GODOT end --
+
+#if defined(DEBUG) // only report file and line in debug mode
+  // -- GODOT begin --
+  // #define throw_RTCError(error,str) \
+  //   throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+  #define throw_RTCError(error,str) \
+    printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
+  // -- GODOT end --
+#else
+  // -- GODOT begin --
+  // #define throw_RTCError(error,str) \
+  //   throw rtcore_error(error,str);
+  #define throw_RTCError(error,str) \
+    abort();
+  // -- GODOT end --
+#endif
+
+/*! Evaluates to true when settings.byteSize is strictly greater than the
+ *  end offset of `member` inside RTCBuildArguments (offsetof + sizeof).
+ *  NOTE(review): because the comparison is `>` rather than `>=`, a struct
+ *  whose byteSize ends exactly at this member is reported as not having
+ *  it - confirm this is intended. */
+#define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
+  (settings.byteSize > (offsetof(RTCBuildArguments,member)+sizeof(settings.member))) 
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp
new file mode 100644
index 0000000000..6bb96bba07
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/rtcore_builder.cpp
@@ -0,0 +1,442 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_EXPORT_API
+
+#include "default.h"
+#include "device.h"
+#include "scene.h"
+#include "context.h"
+#include "alloc.h"
+
+#include "../builders/bvh_builder_sah.h"
+#include "../builders/bvh_builder_morton.h"
+
+namespace embree
+{ 
+  namespace isa // FIXME: support more ISAs for builders
+  {
+    /*! State object behind the BVH handle used by the build functions in
+     *  this file: a device pointer, a FastAllocator and two scratch arrays
+     *  for the Morton builder. Holds one device reference while alive. */
+    struct BVH : public RefCount
+    {
+      BVH (Device* dev)
+        : device(dev),
+          allocator(dev,true),
+          morton_src(dev,0),
+          morton_tmp(dev,0)
+      {
+        /* take a device reference for the lifetime of this object */
+        dev->refInc();
+      }
+
+      ~BVH() {
+        /* give back the reference taken in the constructor */
+        device->refDec();
+      }
+
+    public:
+      Device* device;                                    //!< device this object was created for
+      FastAllocator allocator;                           //!< allocator handed to the builders
+      mvector<BVHBuilderMorton::BuildPrim> morton_src;   //!< Morton builder input array
+      mvector<BVHBuilderMorton::BuildPrim> morton_tmp;   //!< Morton builder temporary array
+    };
+
+    /*! Builds a BVH over arguments->primitives with the Morton builder and
+     *  returns the root node pointer produced by the user callbacks.
+     *
+     *  Nodes and leaves are created only through the callbacks stored in
+     *  `arguments` (createNode, setNodeChildren, setNodeBounds, createLeaf);
+     *  arguments->bvh is used as a BVH object that supplies the allocator
+     *  and the two Morton scratch arrays. buildProgress may be null. */
+    void* rtcBuildBVHMorton(const RTCBuildArguments* arguments)
+    {
+      BVH* bvh = (BVH*) arguments->bvh;
+      RTCBuildPrimitive* prims_i =  arguments->primitives;
+      size_t primitiveCount = arguments->primitiveCount;
+      RTCCreateNodeFunction createNode = arguments->createNode;
+      RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+      RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+      RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+      RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+      void* userPtr = arguments->userPtr;
+        
+      /* number of primitives reported so far to the progress callback */
+      std::atomic<size_t> progress(0);
+      
+      /* initialize temporary arrays for morton builder */
+      /* the user's RTCBuildPrimitive array is reinterpreted as PrimRef */
+      PrimRef* prims = (PrimRef*) prims_i;
+      mvector<BVHBuilderMorton::BuildPrim>& morton_src = bvh->morton_src;
+      mvector<BVHBuilderMorton::BuildPrim>& morton_tmp = bvh->morton_tmp;
+      morton_src.resize(primitiveCount);
+      morton_tmp.resize(primitiveCount);
+
+      /* compute centroid bounds */
+      const BBox3fa centBounds = parallel_reduce ( size_t(0), primitiveCount, BBox3fa(empty), [&](const range<size_t>& r) -> BBox3fa {
+
+          BBox3fa bounds(empty);
+          for (size_t i=r.begin(); i<r.end(); i++) 
+            bounds.extend(prims[i].bounds().center2());
+          return bounds;
+        }, BBox3fa::merge);
+      
+      /* compute morton codes */
+      BVHBuilderMorton::MortonCodeMapping mapping(centBounds);
+      parallel_for ( size_t(0), primitiveCount, [&](const range<size_t>& r) {
+          BVHBuilderMorton::MortonCodeGenerator generator(mapping,&morton_src[r.begin()]);
+          for (size_t i=r.begin(); i<r.end(); i++) {
+            generator(prims[i].bounds(),(unsigned) i);
+          }
+        });
+
+      /* start morton build */
+      std::pair<void*,BBox3fa> root = BVHBuilderMorton::build<std::pair<void*,BBox3fa>>(
+        
+        /* thread local allocator for fast allocations */
+        [&] () -> FastAllocator::CachedAllocator { 
+          return bvh->allocator.getCachedAllocator();
+        },
+        
+        /* lambda function that allocates BVH nodes */
+        [&] ( const FastAllocator::CachedAllocator& alloc, size_t N ) -> void* {
+          return createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+        },
+        
+        /* lambda function that sets bounds */
+        [&] (void* node, const std::pair<void*,BBox3fa>* children, size_t N) -> std::pair<void*,BBox3fa>
+        {
+          BBox3fa bounds = empty;
+          void* childptrs[BVHBuilderMorton::MAX_BRANCHING_FACTOR];
+          const RTCBounds* cbounds[BVHBuilderMorton::MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<N; i++) {
+            bounds.extend(children[i].second);
+            childptrs[i] = children[i].first;
+            cbounds[i] = (const RTCBounds*)&children[i].second;
+          }
+          setNodeBounds(node,cbounds,(unsigned int)N,userPtr);
+          setNodeChildren(node,childptrs, (unsigned int)N,userPtr);
+          return std::make_pair(node,bounds);
+        },
+        
+        /* lambda function that creates BVH leaves */
+        [&]( const range<unsigned>& current, const FastAllocator::CachedAllocator& alloc) -> std::pair<void*,BBox3fa>
+        {
+	  /* NOTE(review): assumes current.size() never exceeds
+	   * RTC_BUILD_MAX_PRIMITIVES_PER_LEAF - there is no check here */
+	  RTCBuildPrimitive localBuildPrims[RTC_BUILD_MAX_PRIMITIVES_PER_LEAF];
+	  BBox3fa bounds = empty;
+	  for (size_t i=0;i<current.size();i++)
+	    {
+	      /* map the sorted Morton entry back to the user's primitive */
+	      const size_t id = morton_src[current.begin()+i].index;
+	      bounds.extend(prims[id].bounds());
+	      localBuildPrims[i] = prims_i[id];
+	    }
+          void* node = createLeaf((RTCThreadLocalAllocator)&alloc,localBuildPrims,current.size(),userPtr);
+          return std::make_pair(node,bounds);
+        },
+        
+        /* lambda that calculates the bounds for some primitive */
+        [&] (const BVHBuilderMorton::BuildPrim& morton) -> BBox3fa {
+          return prims[morton.index].bounds();
+        },
+        
+        /* progress monitor function */
+        /* reports min(1, processed/primitiveCount); returns true when no
+         * callback is installed */
+        [&] (size_t dn) {
+          if (!buildProgress) return true;
+          const size_t n = progress.fetch_add(dn)+dn;
+          const double f = std::min(1.0,double(n)/double(primitiveCount));
+          return buildProgress(userPtr,f);
+        },
+        
+        morton_src.data(),morton_tmp.data(),primitiveCount,
+        *arguments);
+
+      bvh->allocator.cleanup();
+      return root.first;
+    }
+
+    /*! Builds a BVH over arguments->primitives with the binned SAH builder
+     *  and returns the root node pointer produced by the user callbacks.
+     *
+     *  Nodes and leaves are created only through the callbacks stored in
+     *  `arguments`; arguments->bvh is used as a BVH object that supplies
+     *  the allocator. buildProgress may be null. */
+    void* rtcBuildBVHBinnedSAH(const RTCBuildArguments* arguments)
+    {
+      BVH* bvh = (BVH*) arguments->bvh;
+      RTCBuildPrimitive* prims =  arguments->primitives;
+      size_t primitiveCount = arguments->primitiveCount;
+      RTCCreateNodeFunction createNode = arguments->createNode;
+      RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+      RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+      RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+      RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+      void* userPtr = arguments->userPtr;
+      
+      /* number of primitives reported so far to the progress callback */
+      std::atomic<size_t> progress(0);
+  
+      /* calculate priminfo */
+      /* each RTCBuildPrimitive is reinterpreted as a BBox3fa here */
+      auto computeBounds = [&](const range<size_t>& r) -> CentGeomBBox3fa
+        {
+          CentGeomBBox3fa bounds(empty);
+          for (size_t j=r.begin(); j<r.end(); j++)
+            bounds.extend((BBox3fa&)prims[j]);
+          return bounds;
+        };
+      const CentGeomBBox3fa bounds = 
+        parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),CentGeomBBox3fa(empty), computeBounds, CentGeomBBox3fa::merge2);
+
+      const PrimInfo pinfo(0,primitiveCount,bounds);
+      
+      /* build BVH */
+      void* root = BVHBuilderBinnedSAH::build<void*>(
+        
+        /* thread local allocator for fast allocations */
+        [&] () -> FastAllocator::CachedAllocator { 
+          return bvh->allocator.getCachedAllocator();
+        },
+
+        /* lambda function that creates BVH nodes */
+        [&](BVHBuilderBinnedSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void*
+        {
+          void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+          const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds;
+          setNodeBounds(node,cbounds, (unsigned int)N,userPtr);
+          return node;
+        },
+
+        /* lambda function that updates BVH nodes */
+        [&](const BVHBuilderBinnedSAH::BuildRecord& precord, const BVHBuilderBinnedSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* {
+          setNodeChildren(node,children, (unsigned int)N,userPtr);
+          return node;
+        },
+        
+        /* lambda function that creates BVH leaves */
+        [&](const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* {
+          return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr);
+        },
+        
+        /* progress monitor function */
+        /* reports min(1, processed/primitiveCount); returns true when no
+         * callback is installed */
+        [&] (size_t dn) {
+          if (!buildProgress) return true;
+          const size_t n = progress.fetch_add(dn)+dn;
+          const double f = std::min(1.0,double(n)/double(primitiveCount));
+          return buildProgress(userPtr,f);
+        },
+        
+        (PrimRef*)prims,pinfo,*arguments);
+        
+      bvh->allocator.cleanup();
+      return root;
+    }
+
+    /*! Combines two (bounds, geomID) partial results: the bounds are merged
+     *  with CentGeomBBox3fa::merge2 and the larger geomID is kept. */
+    static __forceinline const std::pair<CentGeomBBox3fa,unsigned int> mergePair(const std::pair<CentGeomBBox3fa,unsigned int>& a, const std::pair<CentGeomBBox3fa,unsigned int>& b) {
+      return std::pair<CentGeomBBox3fa,unsigned int>(
+        CentGeomBBox3fa::merge2(a.first,b.first),
+        max(a.second,b.second));
+    }
+
+    void* rtcBuildBVHSpatialSAH(const RTCBuildArguments* arguments)
+    {
+      BVH* bvh = (BVH*) arguments->bvh;
+      RTCBuildPrimitive* prims =  arguments->primitives;
+      size_t primitiveCount = arguments->primitiveCount;
+      RTCCreateNodeFunction createNode = arguments->createNode;
+      RTCSetNodeChildrenFunction setNodeChildren = arguments->setNodeChildren;
+      RTCSetNodeBoundsFunction setNodeBounds = arguments->setNodeBounds;
+      RTCCreateLeafFunction createLeaf = arguments->createLeaf;
+      RTCSplitPrimitiveFunction splitPrimitive = arguments->splitPrimitive;
+      RTCProgressMonitorFunction buildProgress = arguments->buildProgress;
+      void* userPtr = arguments->userPtr;
+      
+      std::atomic<size_t> progress(0);
+  
+      /* calculate priminfo */
+
+      auto computeBounds = [&](const range<size_t>& r) -> std::pair<CentGeomBBox3fa,unsigned int>
+        {
+          CentGeomBBox3fa bounds(empty);
+          unsigned maxGeomID = 0;
+          for (size_t j=r.begin(); j<r.end(); j++)
+          {
+            bounds.extend((BBox3fa&)prims[j]);
+            maxGeomID = max(maxGeomID,prims[j].geomID);
+          }
+          return std::pair<CentGeomBBox3fa,unsigned int>(bounds,maxGeomID);
+        };
+
+
+      const std::pair<CentGeomBBox3fa,unsigned int> pair = 
+        parallel_reduce(size_t(0),primitiveCount,size_t(1024),size_t(1024),std::pair<CentGeomBBox3fa,unsigned int>(CentGeomBBox3fa(empty),0), computeBounds, mergePair);
+
+      CentGeomBBox3fa bounds = pair.first;
+      const unsigned int maxGeomID = pair.second;
+      
+      if (unlikely(maxGeomID >= ((unsigned int)1 << (32-RESERVED_NUM_SPATIAL_SPLITS_GEOMID_BITS))))
+        {
+          /* fallback code for max geomID larger than threshold */
+          return rtcBuildBVHBinnedSAH(arguments);
+        }
+
+      const PrimInfo pinfo(0,primitiveCount,bounds);
+
+      /* function that splits a build primitive */
+      struct Splitter
+      {
+        Splitter (RTCSplitPrimitiveFunction splitPrimitive, unsigned geomID, unsigned primID, void* userPtr)
+          : splitPrimitive(splitPrimitive), geomID(geomID), primID(primID), userPtr(userPtr) {}
+        
+        __forceinline void operator() (PrimRef& prim, const size_t dim, const float pos, PrimRef& left_o, PrimRef& right_o) const 
+        {
+          prim.geomIDref() &= BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK;
+          splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr);
+          left_o.geomIDref()  = geomID; left_o.primIDref()  = primID;
+          right_o.geomIDref() = geomID; right_o.primIDref() = primID;
+        }
+
+        __forceinline void operator() (const BBox3fa& box, const size_t dim, const float pos, BBox3fa& left_o, BBox3fa& right_o) const 
+        {
+          PrimRef prim(box,geomID & BVHBuilderBinnedFastSpatialSAH::GEOMID_MASK,primID);
+          splitPrimitive((RTCBuildPrimitive*)&prim,(unsigned)dim,pos,(RTCBounds*)&left_o,(RTCBounds*)&right_o,userPtr);
+        }
+   
+        RTCSplitPrimitiveFunction splitPrimitive;
+        unsigned geomID;
+        unsigned primID;
+        void* userPtr;
+      };
+
+      /* build BVH */
+      void* root = BVHBuilderBinnedFastSpatialSAH::build<void*>(
+        
+        /* thread local allocator for fast allocations */
+        [&] () -> FastAllocator::CachedAllocator { 
+          return bvh->allocator.getCachedAllocator();
+        },
+
+        /* lambda function that creates BVH nodes */
+        [&] (BVHBuilderBinnedFastSpatialSAH::BuildRecord* children, const size_t N, const FastAllocator::CachedAllocator& alloc) -> void*
+        {
+          void* node = createNode((RTCThreadLocalAllocator)&alloc, (unsigned int)N,userPtr);
+          const RTCBounds* cbounds[GeneralBVHBuilder::MAX_BRANCHING_FACTOR];
+          for (size_t i=0; i<N; i++) cbounds[i] = (const RTCBounds*) &children[i].prims.geomBounds;
+          setNodeBounds(node,cbounds, (unsigned int)N,userPtr);
+          return node;
+        },
+
+        /* lambda function that updates BVH nodes */
+        [&] (const BVHBuilderBinnedFastSpatialSAH::BuildRecord& precord, const BVHBuilderBinnedFastSpatialSAH::BuildRecord* crecords, void* node, void** children, const size_t N) -> void* {
+          setNodeChildren(node,children, (unsigned int)N,userPtr);
+          return node;
+        },
+        
+        /* lambda function that creates BVH leaves */
+        [&] (const PrimRef* prims, const range<size_t>& range, const FastAllocator::CachedAllocator& alloc) -> void* {
+          return createLeaf((RTCThreadLocalAllocator)&alloc,(RTCBuildPrimitive*)(prims+range.begin()),range.size(),userPtr);
+        },
+        
+        /* returns the splitter */
+        [&] ( const PrimRef& prim ) -> Splitter {
+          return Splitter(splitPrimitive,prim.geomID(),prim.primID(),userPtr);
+        },
+
+        /* progress monitor function */
+        [&] (size_t dn) {
+          if (!buildProgress) return true;
+          const size_t n = progress.fetch_add(dn)+dn;
+          const double f = std::min(1.0,double(n)/double(primitiveCount));
+          return buildProgress(userPtr,f);
+        },
+        
+        (PrimRef*)prims,
+        arguments->primitiveArrayCapacity,
+        pinfo,*arguments);
+        
+      bvh->allocator.cleanup();
+      return root;
+    }
+  }
+}
+
+using namespace embree;
+using namespace embree::isa;
+
+RTC_NAMESPACE_BEGIN
+
+    RTC_API RTCBVH rtcNewBVH(RTCDevice device)
+    {
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcNewAllocator);
+      RTC_VERIFY_HANDLE(device);
+      BVH* bvh = new BVH((Device*)device);
+      return (RTCBVH) bvh->refInc();
+      RTC_CATCH_END((Device*)device);
+      return nullptr;
+    }
+
+    RTC_API void* rtcBuildBVH(const RTCBuildArguments* arguments)
+    {
+      /* Builds a BVH with the builder matching arguments->buildQuality and returns the root it produced (nullptr on the error path). Handles are resolved defensively so that neither this prologue nor RTC_CATCH_END dereferences a null pointer. */
+      BVH* bvh = arguments ? (BVH*) arguments->bvh : nullptr;
+      Device* device = bvh ? bvh->device : nullptr;
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcBuildBVH);
+      RTC_VERIFY_HANDLE(arguments);
+      RTC_VERIFY_HANDLE(bvh);
+      RTC_VERIFY_HANDLE(arguments->createNode);
+      RTC_VERIFY_HANDLE(arguments->setNodeChildren);
+      RTC_VERIFY_HANDLE(arguments->setNodeBounds);
+      RTC_VERIFY_HANDLE(arguments->createLeaf);
+      if (arguments->primitiveArrayCapacity < arguments->primitiveCount)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"primitiveArrayCapacity must be greater or equal to primitiveCount");
+
+      /* initialize the allocator */
+      bvh->allocator.init_estimate(arguments->primitiveCount*sizeof(BBox3fa));
+      bvh->allocator.reset();
+
+      /* switch between different builders based on quality level; the root is kept so that the cleanup below is reached */
+      void* root = nullptr;
+      if (arguments->buildQuality == RTC_BUILD_QUALITY_LOW)
+        root = rtcBuildBVHMorton(arguments);
+      else if (arguments->buildQuality == RTC_BUILD_QUALITY_MEDIUM)
+        root = rtcBuildBVHBinnedSAH(arguments);
+      else if (arguments->buildQuality == RTC_BUILD_QUALITY_HIGH) {
+        /* the spatial split builder is only used with a split callback and spare room in the primitive array */
+        if (arguments->splitPrimitive == nullptr || arguments->primitiveArrayCapacity <= arguments->primitiveCount)
+          root = rtcBuildBVHBinnedSAH(arguments);
+        else
+          root = rtcBuildBVHSpatialSAH(arguments);
+      }
+      else
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid build quality");
+
+      /* if we are in dynamic mode, then do not clear temporary data */
+      if (!(arguments->buildFlags & RTC_BUILD_FLAG_DYNAMIC)) { bvh->morton_src.clear(); bvh->morton_tmp.clear(); }
+      return root;
+
+      RTC_CATCH_END(device);
+      return nullptr;
+    }
+
+    RTC_API void* rtcThreadLocalAlloc(RTCThreadLocalAllocator localAllocator, size_t bytes, size_t align) // forwards an allocation of `bytes` with alignment `align` to the given thread local allocator
+    {
+      FastAllocator::CachedAllocator* alloc = (FastAllocator::CachedAllocator*) localAllocator; // the opaque handle is a CachedAllocator pointer (the builders in this file cast &alloc to RTCThreadLocalAllocator)
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcThreadLocalAlloc);
+      return alloc->malloc0(bytes,align);
+      RTC_CATCH_END(alloc->alloc->getDevice()); // NOTE(review): localAllocator is never null-checked; both the call above and this expression assume a valid handle
+      return nullptr; // NOTE(review): presumably only reached once RTC_CATCH_END has handled an error
+    }
+
+    RTC_API void rtcMakeStaticBVH(RTCBVH hbvh) // clears the Morton builder's temporary arrays stored in the BVH object
+    {
+      BVH* bvh = (BVH*) hbvh;
+      Device* device = bvh ? bvh->device : nullptr; // resolved up front so RTC_CATCH_END never dereferences a null bvh
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcMakeStaticBVH); // trace label matches this API entry point
+      RTC_VERIFY_HANDLE(hbvh);
+      bvh->morton_src.clear(); bvh->morton_tmp.clear();
+      RTC_CATCH_END(device);
+    }
+
+    RTC_API void rtcRetainBVH(RTCBVH hbvh) // calls refInc() on the BVH behind the handle
+    {
+      BVH* bvh = (BVH*) hbvh;
+      Device* device = bvh ? bvh->device : nullptr; // looked up before the checks so RTC_CATCH_END never dereferences a null bvh
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcRetainBVH);
+      RTC_VERIFY_HANDLE(hbvh); // a null handle is rejected before bvh is used
+      bvh->refInc();
+      RTC_CATCH_END(device);
+    }
+    
+    RTC_API void rtcReleaseBVH(RTCBVH hbvh) // calls refDec() on the BVH behind the handle
+    {
+      BVH* bvh = (BVH*) hbvh;
+      Device* device = bvh ? bvh->device : nullptr; // captured first: bvh may be null, and NOTE(review) presumably may be destroyed by refDec() below
+      RTC_CATCH_BEGIN;
+      RTC_TRACE(rtcReleaseBVH);
+      RTC_VERIFY_HANDLE(hbvh); // a null handle is rejected before bvh is used
+      bvh->refDec();
+      RTC_CATCH_END(device);
+    }
+
+RTC_NAMESPACE_END
diff --git a/thirdparty/embree-aarch64/kernels/common/scene.cpp b/thirdparty/embree-aarch64/kernels/common/scene.cpp
new file mode 100644
index 0000000000..1e23aeb415
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene.cpp
@@ -0,0 +1,976 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "scene.h"
+
+#include "../bvh/bvh4_factory.h"
+#include "../bvh/bvh8_factory.h"
+#include "../../common/algorithms/parallel_reduce.h"
+
+namespace embree
+{
+  /* stubs that raise RTC_ERROR_INVALID_OPERATION when called; missing_rtcCommit is what Scene::Scene installs as the initial intersectors */
+  void missing_rtcCommit()      { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"scene not committed"); }
+  void invalid_rtcIntersect1()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect and rtcOccluded not enabled"); }
+  void invalid_rtcIntersect4()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect4 and rtcOccluded4 not enabled"); }
+  void invalid_rtcIntersect8()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect8 and rtcOccluded8 not enabled"); }
+  void invalid_rtcIntersect16() { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersect16 and rtcOccluded16 not enabled"); }
+  void invalid_rtcIntersectN()  { throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcIntersectN and rtcOccludedN not enabled"); }
+
+  Scene::Scene (Device* device) // creates an empty, uncommitted scene that holds a reference on `device`
+    : device(device),
+      flags_modified(true), enabled_geometry_types(0),
+      scene_flags(RTC_SCENE_FLAG_NONE),
+      quality_flags(RTC_BUILD_QUALITY_MEDIUM),
+      is_build(false), modified(true),
+      progressInterface(this), progress_monitor_function(nullptr), progress_monitor_ptr(nullptr), progress_monitor_counter(0)
+  {
+    device->refInc(); // balanced by the refDec() in ~Scene
+
+    intersectors = Accel::Intersectors(missing_rtcCommit); // missing_rtcCommit throws "scene not committed"
+
+    /* one can overwrite flags through device for debugging; a device value of -1 keeps the defaults set above */
+    if (device->quality_flags != -1)
+      quality_flags = (RTCBuildQuality) device->quality_flags;
+    if (device->scene_flags != -1)
+      scene_flags = (RTCSceneFlags) device->scene_flags;
+  }
+
+  Scene::~Scene() noexcept
+  {
+    device->refDec(); // balances the refInc() done in the constructor
+  }
+
+  void Scene::printStatistics()
+  {
+    /* find the largest time step count over all geometries */
+    unsigned maxTimeSteps = 0;
+    for (size_t geomID=0; geomID<size(); geomID++)
+      if (auto geom = get(geomID)) maxTimeSteps = max(maxTimeSteps,geom->numTimeSteps);
+
+    /* one counter vector per geometry type, indexed by time segment count */
+    std::vector<size_t> counts[Geometry::GTY_END];
+    for (auto& perType : counts)
+      perType.resize(maxTimeSteps);
+
+    /* accumulate the sizes of all geometries */
+    for (size_t geomID=0; geomID<size(); geomID++)
+    {
+      auto geom = get(geomID);
+      if (!geom) continue;
+      const int type = geom->getType();
+      assert(type<Geometry::GTY_END);
+      const int segments = geom->numTimeSegments();
+      assert((unsigned int)segments < maxTimeSteps);
+      counts[type][segments] += geom->size();
+    }
+
+    /* table header followed by a separator line */
+    std::cout << std::setw(23) << "segments" << ": ";
+    for (size_t seg=0; seg<maxTimeSteps; seg++)
+      std::cout << std::setw(10) << seg;
+    std::cout << std::endl;
+
+    std::cout << "-------------------------";
+    for (size_t seg=0; seg<maxTimeSteps; seg++)
+      std::cout << "----------";
+    std::cout << std::endl;
+
+    for (size_t type=0; type<Geometry::GTY_END; type++)
+    {
+      const std::string name(Geometry::gtype_names[type]);
+      if (name.empty()) continue; // geometry types without a name get no row
+      std::cout << std::setw(23) << name << ": ";
+      for (size_t seg=0; seg<maxTimeSteps; seg++)
+        std::cout << std::setw(10) << counts[type][seg];
+      std::cout << std::endl;
+    }
+  }
+
+  void Scene::createTriangleAccel() // passes the triangle acceleration structure selected by device->tri_accel to accels_add
+  {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    if (device->tri_accel == "default")
+    {
+      if (quality_flags != RTC_BUILD_QUALITY_LOW) /* medium or high quality: STATIC / HIGH_QUALITY build variants */
+      {
+        int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1 = compact, bit 0 = robust
+        switch (mode) {
+        case /*0b00*/ 0:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+	  {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+              accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh8_factory->BVH8Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 block below
+          {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+              accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh4_factory->BVH4Triangle4(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          break;
+
+        case /*0b01*/ 1:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+            accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+          else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 call below
+            accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+
+          break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else /* RTC_BUILD_QUALITY_LOW: dynamic */
+      {
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+	  {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1 = compact, bit 0 = robust
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+          else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 block below
+          {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1 = compact, bit 0 = robust
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4 (this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST  )); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4i(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+      }
+    }
+    else if (device->tri_accel == "bvh4.triangle4")       accels_add(device->bvh4_factory->BVH4Triangle4 (this));
+    else if (device->tri_accel == "bvh4.triangle4v")      accels_add(device->bvh4_factory->BVH4Triangle4v(this));
+    else if (device->tri_accel == "bvh4.triangle4i")      accels_add(device->bvh4_factory->BVH4Triangle4i(this));
+    else if (device->tri_accel == "qbvh4.triangle4i")     accels_add(device->bvh4_factory->BVH4QuantizedTriangle4i(this));
+
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->tri_accel == "bvh8.triangle4")       accels_add(device->bvh8_factory->BVH8Triangle4 (this));
+    else if (device->tri_accel == "bvh8.triangle4v")      accels_add(device->bvh8_factory->BVH8Triangle4v(this));
+    else if (device->tri_accel == "bvh8.triangle4i")      accels_add(device->bvh8_factory->BVH8Triangle4i(this));
+    else if (device->tri_accel == "qbvh8.triangle4i")     accels_add(device->bvh8_factory->BVH8QuantizedTriangle4i(this));
+    else if (device->tri_accel == "qbvh8.triangle4")      accels_add(device->bvh8_factory->BVH8QuantizedTriangle4(this));
+#endif // EMBREE_TARGET_SIMD8: the bvh8 names are only recognised in SIMD8 builds
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown triangle acceleration structure "+device->tri_accel);
+#endif // EMBREE_GEOMETRY_TRIANGLE: without it this function is a no-op
+  }
+
+  void Scene::createTriangleMBAccel() // passes the motion blur triangle acceleration structure selected by device->tri_accel_mb to accels_add
+  {
+#if defined(EMBREE_GEOMETRY_TRIANGLE)
+    if (device->tri_accel_mb == "default")
+    {
+      int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1 = compact, bit 0 = robust
+
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX2()) // BVH8 reduces performance on AVX only-machines
+      {
+        switch (mode) {
+        case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 block below
+      {
+        switch (mode) { // on this path the compact bit does not change the choice: only FAST vs ROBUST differs
+        case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Triangle4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+    }
+    else if (device->tri_accel_mb == "bvh4.triangle4imb") accels_add(device->bvh4_factory->BVH4Triangle4iMB(this));
+    else if (device->tri_accel_mb == "bvh4.triangle4vmb") accels_add(device->bvh4_factory->BVH4Triangle4vMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->tri_accel_mb == "bvh8.triangle4imb") accels_add(device->bvh8_factory->BVH8Triangle4iMB(this));
+    else if (device->tri_accel_mb == "bvh8.triangle4vmb") accels_add(device->bvh8_factory->BVH8Triangle4vMB(this));
+#endif // EMBREE_TARGET_SIMD8: the bvh8 names are only recognised in SIMD8 builds
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur triangle acceleration structure "+device->tri_accel_mb);
+#endif // EMBREE_GEOMETRY_TRIANGLE: without it this function is a no-op
+  }
+
+  void Scene::createQuadAccel() // passes the quad acceleration structure selected by device->quad_accel to accels_add
+  {
+#if defined(EMBREE_GEOMETRY_QUAD)
+    if (device->quad_accel == "default")
+    {
+      if (quality_flags != RTC_BUILD_QUALITY_LOW)
+      {
+        /* static */
+        int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1 = compact, bit 0 = robust
+        switch (mode) {
+        case /*0b00*/ 0:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+          {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+              accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 block below
+          {
+            if (quality_flags == RTC_BUILD_QUALITY_HIGH)
+              accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::HIGH_QUALITY,BVHFactory::IntersectVariant::FAST));
+            else
+              accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+          }
+          break;
+
+        case /*0b01*/ 1:
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+            accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+          else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 call below
+            accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+          break;
+
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4i(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else /* RTC_BUILD_QUALITY_LOW: dynamic */
+      {
+#if defined (EMBREE_TARGET_SIMD8)
+          if (device->canUseAVX())
+	  {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1 = compact, bit 0 = robust
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+          else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 block below
+          {
+            int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // on this path the compact bit does not change the choice
+            switch (mode) {
+            case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::FAST)); break;
+            case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4v(this,BVHFactory::BuildVariant::DYNAMIC,BVHFactory::IntersectVariant::ROBUST)); break;
+            }
+          }
+      }
+    }
+    else if (device->quad_accel == "bvh4.quad4v")       accels_add(device->bvh4_factory->BVH4Quad4v(this));
+    else if (device->quad_accel == "bvh4.quad4i")       accels_add(device->bvh4_factory->BVH4Quad4i(this));
+    else if (device->quad_accel == "qbvh4.quad4i")      accels_add(device->bvh4_factory->BVH4QuantizedQuad4i(this));
+
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->quad_accel == "bvh8.quad4v")       accels_add(device->bvh8_factory->BVH8Quad4v(this));
+    else if (device->quad_accel == "bvh8.quad4i")       accels_add(device->bvh8_factory->BVH8Quad4i(this));
+    else if (device->quad_accel == "qbvh8.quad4i")      accels_add(device->bvh8_factory->BVH8QuantizedQuad4i(this));
+#endif // EMBREE_TARGET_SIMD8: the bvh8 names are only recognised in SIMD8 builds
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad acceleration structure "+device->quad_accel);
+#endif // EMBREE_GEOMETRY_QUAD: without it this function is a no-op
+  }
+
+  void Scene::createQuadMBAccel() // passes the motion blur quad acceleration structure selected by device->quad_accel_mb to accels_add
+  {
+#if defined(EMBREE_GEOMETRY_QUAD)
+    if (device->quad_accel_mb == "default")
+    {
+      int mode =  2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1 = compact, bit 0 = robust
+      switch (mode) {
+      case /*0b00*/ 0:
+#if defined (EMBREE_TARGET_SIMD8)
+        if (device->canUseAVX())
+          accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+        else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 call below
+          accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST));
+        break;
+
+      case /*0b01*/ 1:
+#if defined (EMBREE_TARGET_SIMD8)
+        if (device->canUseAVX())
+          accels_add(device->bvh8_factory->BVH8Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+        else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 call below
+          accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST));
+        break;
+
+      case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::FAST  )); break;
+      case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4Quad4iMB(this,BVHFactory::BuildVariant::STATIC,BVHFactory::IntersectVariant::ROBUST)); break;
+      }
+    }
+    else if (device->quad_accel_mb == "bvh4.quad4imb") accels_add(device->bvh4_factory->BVH4Quad4iMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->quad_accel_mb == "bvh8.quad4imb") accels_add(device->bvh8_factory->BVH8Quad4iMB(this));
+#endif // EMBREE_TARGET_SIMD8: the bvh8 name is only recognised in SIMD8 builds
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown quad motion blur acceleration structure "+device->quad_accel_mb);
+#endif // EMBREE_GEOMETRY_QUAD: without it this function is a no-op
+  }
+
+  void Scene::createHairAccel() // passes the curve/point acceleration structure selected by device->hair_accel to accels_add
+  {
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+    if (device->hair_accel == "default")
+    {
+      int mode = 2*(int)isCompactAccel() + 1*(int)isRobustAccel(); // bit 1 = compact, bit 0 = robust
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX2()) // only enable on HSW machines, for SNB this codepath is slower
+      {
+        switch (mode) {
+        case /*0b00*/ 0: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b01*/ 1: accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+      else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 block below
+      {
+        switch (mode) {
+        case /*0b00*/ 0: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b01*/ 1: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::ROBUST)); break;
+        case /*0b10*/ 2: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST)); break;
+        case /*0b11*/ 3: accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::ROBUST)); break;
+        }
+      }
+    }
+    else if (device->hair_accel == "bvh4obb.virtualcurve4v" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4v(this,BVHFactory::IntersectVariant::FAST));
+    else if (device->hair_accel == "bvh4obb.virtualcurve4i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4i(this,BVHFactory::IntersectVariant::FAST));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->hair_accel == "bvh8obb.virtualcurve8v" ) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8v(this,BVHFactory::IntersectVariant::FAST));
+    else if (device->hair_accel == "bvh4obb.virtualcurve8i" ) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8i(this,BVHFactory::IntersectVariant::FAST));
+#endif // EMBREE_TARGET_SIMD8: the 8-wide names are only recognised in SIMD8 builds
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown hair acceleration structure "+device->hair_accel);
+#endif // EMBREE_GEOMETRY_CURVE || EMBREE_GEOMETRY_POINT: without either this function is a no-op
+  }
+
+  void Scene::createHairMBAccel() // passes the motion blur curve/point acceleration structure selected by device->hair_accel_mb to accels_add
+  {
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT)
+    if (device->hair_accel_mb == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX2()) // only enable on HSW machines, on SNB this codepath is slower
+      {
+        if (isRobustAccel()) accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::ROBUST));
+        else                 accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+      }
+      else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 block below
+      {
+        if (isRobustAccel()) accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::ROBUST));
+        else                 accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST));
+      }
+    }
+    else if (device->hair_accel_mb == "bvh4.virtualcurve4imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve4iMB(this,BVHFactory::IntersectVariant::FAST));
+
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->hair_accel_mb == "bvh4.virtualcurve8imb") accels_add(device->bvh4_factory->BVH4OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+    else if (device->hair_accel_mb == "bvh8.virtualcurve8imb") accels_add(device->bvh8_factory->BVH8OBBVirtualCurve8iMB(this,BVHFactory::IntersectVariant::FAST));
+#endif // EMBREE_TARGET_SIMD8: the 8-wide names are only recognised in SIMD8 builds
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown motion blur hair acceleration structure "+device->hair_accel_mb);
+#endif // EMBREE_GEOMETRY_CURVE || EMBREE_GEOMETRY_POINT: without either this function is a no-op
+  }
+
+  void Scene::createSubdivAccel()
+  {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    /* every recognised name selects the same BVH4SubdivPatch1 structure */
+    const bool known = device->subdiv_accel == "default"
+                    || device->subdiv_accel == "bvh4.grid.eager"
+                    || device->subdiv_accel == "bvh4.subdivpatch1eager";
+    if (known) accels_add(device->bvh4_factory->BVH4SubdivPatch1(this));
+    else       throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv accel "+device->subdiv_accel);
+#endif
+  }
+
+  void Scene::createSubdivMBAccel()
+  {
+#if defined(EMBREE_GEOMETRY_SUBDIVISION)
+    /* "default" is the only name accepted for motion blur subdivision geometry */
+    const bool isDefault = (device->subdiv_accel_mb == "default");
+    if (isDefault) accels_add(device->bvh4_factory->BVH4SubdivPatch1MB(this));
+    else           throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown subdiv mblur accel "+device->subdiv_accel_mb);
+#endif
+  }
+
+  void Scene::createUserGeometryAccel() // passes the user geometry acceleration structure selected by device->object_accel to accels_add
+  {
+#if defined(EMBREE_GEOMETRY_USER)
+    if (device->object_accel == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel()) // BVH8 only when AVX is usable and the scene is not compact
+      {
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) { // low quality selects the DYNAMIC build variant, anything else STATIC
+          accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh8_factory->BVH8UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC));
+        }
+      }
+      else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 block below
+      {
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+          accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh4_factory->BVH4UserGeometry(this,BVHFactory::BuildVariant::DYNAMIC));
+        }
+      }
+    }
+    else if (device->object_accel == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometry(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->object_accel == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometry(this));
+#endif // EMBREE_TARGET_SIMD8: the bvh8 name is only recognised in SIMD8 builds
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry accel "+device->object_accel);
+#endif // EMBREE_GEOMETRY_USER: without it this function is a no-op
+  }
+
+  void Scene::createUserGeometryMBAccel() // passes the motion blur user geometry acceleration structure selected by device->object_accel_mb to accels_add
+  {
+#if defined(EMBREE_GEOMETRY_USER)
+    if (device->object_accel_mb == "default"    ) {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel()) // BVH8 only when AVX is usable and the scene is not compact
+        accels_add(device->bvh8_factory->BVH8UserGeometryMB(this));
+      else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 call below
+        accels_add(device->bvh4_factory->BVH4UserGeometryMB(this));
+    }
+    else if (device->object_accel_mb == "bvh4.object") accels_add(device->bvh4_factory->BVH4UserGeometryMB(this));
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->object_accel_mb == "bvh8.object") accels_add(device->bvh8_factory->BVH8UserGeometryMB(this));
+#endif // EMBREE_TARGET_SIMD8: the bvh8 name is only recognised in SIMD8 builds
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown user geometry mblur accel "+device->object_accel_mb);
+#endif // EMBREE_GEOMETRY_USER: without it this function is a no-op
+  }
+
+  void Scene::createInstanceAccel() // passes a BVH8 or BVH4 instance acceleration structure to accels_add; no device option is consulted
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    // disabled option lookup, kept for reference: if (device->object_accel == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel()) { // BVH8 only when AVX is usable and the scene is not compact
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) { // NOTE(review): the `false` argument presumably selects the non-expensive instance variant — confirm against the factory
+          accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh8_factory->BVH8Instance(this, false, BVHFactory::BuildVariant::DYNAMIC));
+        }
+      } 
+      else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 block below
+      {
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+          accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh4_factory->BVH4Instance(this, false, BVHFactory::BuildVariant::DYNAMIC));
+        }
+      }
+    }
+    // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel);
+#endif // EMBREE_GEOMETRY_INSTANCE: without it this function is a no-op
+  }
+
+  void Scene::createInstanceMBAccel() // passes a BVH8 or BVH4 motion blur instance acceleration structure to accels_add; no device option is consulted
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    // disabled option lookup, kept for reference: if (device->instance_accel_mb == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel()) // BVH8 only when AVX is usable and the scene is not compact
+        accels_add(device->bvh8_factory->BVH8InstanceMB(this, false));
+      else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 call below
+        accels_add(device->bvh4_factory->BVH4InstanceMB(this, false));
+    }
+    //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb);
+#endif // EMBREE_GEOMETRY_INSTANCE: without it this function is a no-op
+  }
+
+  void Scene::createInstanceExpensiveAccel() // same selection as createInstanceAccel, but passes `true` as the factories' second argument
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    // disabled option lookup, kept for reference: if (device->object_accel == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel()) { // BVH8 only when AVX is usable and the scene is not compact
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) { // NOTE(review): the `true` argument presumably selects the expensive instance variant — confirm against the factory
+          accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh8_factory->BVH8Instance(this, true, BVHFactory::BuildVariant::DYNAMIC));
+        }
+      } 
+      else
+#endif // EMBREE_TARGET_SIMD8: the else above binds to the BVH4 block below
+      {
+        if (quality_flags != RTC_BUILD_QUALITY_LOW) {
+          accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::STATIC));
+        } else {
+          accels_add(device->bvh4_factory->BVH4Instance(this, true, BVHFactory::BuildVariant::DYNAMIC));
+        }
+      }
+    }
+    // else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance accel "+device->instance_accel);
+#endif // EMBREE_GEOMETRY_INSTANCE: without it this function is a no-op
+  }
+
+  void Scene::createInstanceExpensiveMBAccel() // adds the motion-blur instance BVH; commit_task calls this for MTY_INSTANCE_EXPENSIVE geometry
+  {
+#if defined(EMBREE_GEOMETRY_INSTANCE)
+    //if (device->instance_accel_mb == "default")   <- name-based selection is disabled, the block below always runs
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel()) // 8-wide BVH only with AVX and a non-compact scene
+        accels_add(device->bvh8_factory->BVH8InstanceMB(this, true)); // NOTE(review): `true` presumably means "expensive" - confirm in BVHFactory
+      else
+#endif
+        accels_add(device->bvh4_factory->BVH4InstanceMB(this, true)); // 4-wide BVH otherwise
+    }
+    //else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown instance mblur accel "+device->instance_accel_mb);
+#endif
+  }
+
+  void Scene::createGridAccel() // adds the grid BVH named by device->grid_accel; throws on an unknown name
+  {
+    BVHFactory::IntersectVariant ivariant = isRobustAccel() ? BVHFactory::IntersectVariant::ROBUST : BVHFactory::IntersectVariant::FAST; // robust scenes get the ROBUST intersector
+#if defined(EMBREE_GEOMETRY_GRID)
+    if (device->grid_accel == "default")
+    {
+#if defined (EMBREE_TARGET_SIMD8)
+      if (device->canUseAVX() && !isCompactAccel()) // default: 8-wide BVH only with AVX and a non-compact scene
+      {
+        accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+      }
+      else
+#endif
+      {
+        accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant));
+      }
+    }
+    else if (device->grid_accel == "bvh4.grid") accels_add(device->bvh4_factory->BVH4Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); // explicit 4-wide request
+#if defined (EMBREE_TARGET_SIMD8)
+    else if (device->grid_accel == "bvh8.grid") accels_add(device->bvh8_factory->BVH8Grid(this,BVHFactory::BuildVariant::STATIC,ivariant)); // explicit 8-wide request, only compiled in for SIMD8 targets
+#endif
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid accel "+device->grid_accel);
+#endif
+
+  }
+
+  void Scene::createGridMBAccel() // adds the motion-blur grid BVH named by device->grid_accel_mb; throws on an unknown name
+  {
+#if defined(EMBREE_GEOMETRY_GRID)
+    if (device->grid_accel_mb == "default")
+    {
+      accels_add(device->bvh4_factory->BVH4GridMB(this,BVHFactory::BuildVariant::STATIC));
+    }
+    else if (device->grid_accel_mb == "bvh4mb.grid") accels_add(device->bvh4_factory->BVH4GridMB(this));
+    else throw_RTCError(RTC_ERROR_INVALID_ARGUMENT,"unknown grid mb accel "+device->grid_accel_mb); // report the setting that was actually rejected
+#endif
+
+  }
+
+  void Scene::clear() { // currently a no-op: the body is empty
+  }
+
+  unsigned Scene::bind(unsigned geomID, Ref<Geometry> geometry) // stores geometry under geomID (or a newly allocated ID) and returns the ID used
+  {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(geometriesMutex);
+#else
+    Lock<SpinLock> lock(geometriesMutex);
+#endif
+    if (geomID == RTC_INVALID_GEOMETRY_ID) { // caller did not choose an ID: take one from the pool
+      geomID = id_pool.allocate();
+      if (geomID == RTC_INVALID_GEOMETRY_ID)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"too many geometries inside scene");
+    }
+    else
+    { // caller-chosen ID: the pool must accept it
+      if (!id_pool.add(geomID))
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID provided");
+    }
+    if (geomID >= geometries.size()) { // grow the three per-geometry arrays together
+      geometries.resize(geomID+1);
+      vertices.resize(geomID+1);
+      geometryModCounters_.resize(geomID+1);
+    }
+    geometries[geomID] = geometry;
+    geometryModCounters_[geomID] = 0; // reset the committed mod counter for this slot
+    if (geometry->isEnabled()) { // only an enabled geometry marks the scene as modified
+      setModified ();
+    }
+    return geomID;
+  }
+
+  void Scene::detachGeometry(size_t geomID) // removes the geometry stored under geomID; throws for an out-of-range or empty slot
+  {
+#if defined(__aarch64__) && defined(BUILD_IOS)
+    std::scoped_lock lock(geometriesMutex);
+#else
+    Lock<SpinLock> lock(geometriesMutex);
+#endif
+
+    if (geomID >= geometries.size())
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry ID");
+
+    Ref<Geometry>& geometry = geometries[geomID]; // reference into the array; it is nulled below
+    if (geometry == null)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invalid geometry");
+
+    if (geometry->isEnabled()) { // only an enabled geometry marks the scene as modified
+      setModified ();
+    }
+    accels_deleteGeometry(unsigned(geomID));
+    id_pool.deallocate((unsigned)geomID); // hand the ID back to the pool
+    geometries[geomID] = null;
+    vertices[geomID] = nullptr;
+    geometryModCounters_[geomID] = 0;
+  }
+
+  void Scene::updateInterface() // records that a build has taken place (read back through isBuild())
+  {
+    is_build = true;
+  }
+
+  void Scene::commit_task () // performs the actual scene build; does nothing when the scene is unmodified
+  {
+    checkIfModifiedAndSet (); // flags the scene if any geometry's mod counter advanced
+    if (!isModified()) {
+      return;
+    }
+
+    /* print scene statistics */
+    if (device->verbosity(2))
+      printStatistics();
+
+    progress_monitor_counter = 0; // restart progress accounting for this build
+
+    /* gather scene stats and call preCommit function of each enabled geometry */
+    this->world = parallel_reduce (size_t(0), geometries.size(), GeometryCounts (),
+      [this](const range<size_t>& r)->GeometryCounts
+      {
+        GeometryCounts c;
+        for (auto i=r.begin(); i<r.end(); ++i)
+        {
+          if (geometries[i] && geometries[i]->isEnabled())
+          {
+            geometries[i]->preCommit();
+            geometries[i]->addElementsToCount (c);
+            c.numFilterFunctions += (int) geometries[i]->hasFilterFunctions();
+          }
+        }
+        return c;
+      },
+      std::plus<GeometryCounts>()
+    );
+
+    /* select acceleration structures to build */
+    unsigned int new_enabled_geometry_types = world.enabledGeometryTypesMask();
+    if (flags_modified || new_enabled_geometry_types != enabled_geometry_types) // re-create accels when flags or the set of geometry types changed
+    {
+      accels_init();
+
+      /* mark every geometry as modified by zeroing its committed counter;
+         otherwise the two-level builder would skip unmodified geometries */
+      parallel_for(geometryModCounters_.size(), [&] ( const size_t i ) {
+          geometryModCounters_[i] = 0;
+        });
+
+      if (getNumPrimitives(TriangleMesh::geom_type,false)) createTriangleAccel();
+      if (getNumPrimitives(TriangleMesh::geom_type,true)) createTriangleMBAccel();
+      if (getNumPrimitives(QuadMesh::geom_type,false)) createQuadAccel();
+      if (getNumPrimitives(QuadMesh::geom_type,true)) createQuadMBAccel();
+      if (getNumPrimitives(GridMesh::geom_type,false)) createGridAccel();
+      if (getNumPrimitives(GridMesh::geom_type,true)) createGridMBAccel();
+      if (getNumPrimitives(SubdivMesh::geom_type,false)) createSubdivAccel();
+      if (getNumPrimitives(SubdivMesh::geom_type,true)) createSubdivMBAccel();
+      if (getNumPrimitives(Geometry::MTY_CURVES,false)) createHairAccel();
+      if (getNumPrimitives(Geometry::MTY_CURVES,true)) createHairMBAccel();
+      if (getNumPrimitives(UserGeometry::geom_type,false)) createUserGeometryAccel();
+      if (getNumPrimitives(UserGeometry::geom_type,true)) createUserGeometryMBAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,false)) createInstanceAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_CHEAP,true)) createInstanceMBAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,false)) createInstanceExpensiveAccel();
+      if (getNumPrimitives(Geometry::MTY_INSTANCE_EXPENSIVE,true)) createInstanceExpensiveMBAccel();
+
+      flags_modified = false;
+      enabled_geometry_types = new_enabled_geometry_types;
+    }
+
+    /* select fast code path if no filter function is present */
+    accels_select(hasFilterFunction());
+
+    /* build all hierarchies of this scene */
+    accels_build();
+
+    /* make static geometry immutable */
+    if (!isDynamicAccel()) {
+      accels_immutable();
+      flags_modified = true; // in non-dynamic mode we have to re-create accels
+    }
+
+    /* call postCommit function of each enabled geometry and record its committed state */
+    parallel_for(geometries.size(), [&] ( const size_t i ) {
+        if (geometries[i] && geometries[i]->isEnabled()) {
+          geometries[i]->postCommit();
+          vertices[i] = geometries[i]->getCompactVertexArray();
+          geometryModCounters_[i] = geometries[i]->getModCounter(); // baseline for isGeometryModified()
+        }
+      });
+
+    updateInterface();
+
+    if (device->verbosity(2)) {
+      std::cout << "created scene intersector" << std::endl;
+      accels_print(2);
+      std::cout << "selected scene intersector" << std::endl;
+      intersectors.print(2);
+    }
+
+    setModified(false); // the scene is now in sync with its accels
+  }
+
+  void Scene::setBuildQuality(RTCBuildQuality quality_flags_i)
+  {
+    /* flag the accels for re-creation only when the quality really changes */
+    const bool changed = (quality_flags != quality_flags_i);
+    if (changed) { quality_flags = quality_flags_i; flags_modified = true; }
+  }
+
+  RTCBuildQuality Scene::getBuildQuality() const { // returns the value last stored by setBuildQuality()
+    return quality_flags;
+  }
+
+  void Scene::setSceneFlags(RTCSceneFlags scene_flags_i)
+  {
+    /* flag the accels for re-creation only when the flags really change */
+    const bool changed = (scene_flags != scene_flags_i);
+    if (changed) { scene_flags = scene_flags_i; flags_modified = true; }
+  }
+
+  RTCSceneFlags Scene::getSceneFlags() const { // returns the value last stored by setSceneFlags()
+    return scene_flags;
+  }
+
+#if defined(TASKING_INTERNAL)
+
+  void Scene::commit (bool join) // internal tasking: the first caller starts the build, later callers may only join it
+  {
+    Lock<MutexSys> buildLock(buildMutex,false); // NOTE(review): `false` presumably constructs the guard unlocked - it is locked explicitly below
+
+    /* allocates own taskscheduler for each build */
+    Ref<TaskScheduler> scheduler = nullptr;
+    {
+      Lock<MutexSys> lock(schedulerMutex);
+      scheduler = this->scheduler;
+      if (scheduler == null) { // no build in flight: this thread becomes the builder
+        buildLock.lock();
+        this->scheduler = scheduler = new TaskScheduler;
+      }
+    }
+
+    /* worker threads join build */
+    if (!buildLock.isLocked()) // a scheduler already existed, so another thread is building
+    {
+      if (!join)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"use rtcJoinCommitScene to join a build operation");
+
+      scheduler->join();
+      return;
+    }
+
+    /* initiate build; the root task clears this->scheduler once commit_task() has returned */
+    // -- GODOT start --
+    // try {
+      scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
+    // }
+    // catch (...) {
+    //   accels_clear();
+    //   updateInterface();
+    //   Lock<MutexSys> lock(schedulerMutex);
+    //   this->scheduler = nullptr;
+    //   throw;
+    // }
+    // -- GODOT end --
+  }
+
+#endif
+
+#if defined(TASKING_TBB) || defined(TASKING_GCD)
+
+  void Scene::commit (bool join) // TBB/GCD tasking: builds the scene, or waits for a build already running on another thread
+  {
+#if defined(TASKING_TBB) && (TBB_INTERFACE_VERSION_MAJOR < 8)
+    if (join)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with this TBB version");
+#endif
+
+    /* try to obtain build lock */
+    Lock<MutexSys> lock(buildMutex,buildMutex.try_lock());
+
+    /* join hierarchy build */
+    if (!lock.isLocked())
+    {
+#if !TASKING_TBB_USE_TASK_ISOLATION
+      if (!join)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION,"invoking rtcCommitScene from multiple threads is not supported with this TBB version");
+#endif
+
+      do { // help with / wait for the running build until the build mutex becomes free
+
+#if defined(TASKING_GCD)
+      // Do Nothing
+#else
+#if USE_TASK_ARENA
+        if (join) {
+          device->arena->execute([&]{ group.wait(); });
+        }
+        else
+#endif
+        {
+          group.wait();
+        }
+#endif
+
+        pause_cpu();
+        yield();
+
+      } while (!buildMutex.try_lock());
+
+      buildMutex.unlock(); // the lock was only taken to detect the end of the other build
+      return;
+    }
+
+    /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
+    const unsigned int mxcsr = _mm_getcsr();
+    _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
+
+    try {
+#if defined(TASKING_TBB)
+#if TBB_INTERFACE_VERSION_MAJOR < 8
+      tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits);
+#else
+      tbb::task_group_context ctx( tbb::task_group_context::isolated, tbb::task_group_context::default_traits | tbb::task_group_context::fp_settings );
+#endif
+      //ctx.set_priority(tbb::priority_high);
+
+#if USE_TASK_ARENA
+      if (join)
+      {
+        device->arena->execute([&]{
+            group.run([&]{
+                tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx);
+              });
+            group.wait();
+          });
+      }
+      else
+#endif
+      {
+        group.run([&]{
+            tbb::parallel_for (size_t(0), size_t(1), size_t(1), [&] (size_t) { commit_task(); }, ctx);
+          });
+        group.wait();
+      }
+
+#elif defined(TASKING_GCD)
+
+      commit_task();
+
+#endif  // #if defined(TASKING_TBB)
+
+      /* reset MXCSR register again; done for every backend so the GCD path does not leave FTZ/DAZ set */
+      _mm_setcsr(mxcsr);
+
+    }
+    catch (...)
+    {
+      /* reset MXCSR register again */
+      _mm_setcsr(mxcsr);
+
+      accels_clear();
+      updateInterface();
+      throw;
+    }
+  }
+#endif
+
+#if defined(TASKING_PPL)
+
+  void Scene::commit (bool join) // PPL tasking: builds the scene on a concurrency::task_group; joining is not supported
+  {
+#if defined(TASKING_PPL)
+    if (join)
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION,"rtcJoinCommitScene not supported with PPL");
+#endif
+
+    /* take the build lock (NOTE(review): the one-argument Lock presumably acquires on construction - confirm) */
+    Lock<MutexSys> lock(buildMutex);
+
+    checkIfModifiedAndSet ();
+    if (!isModified()) { // nothing changed since the last commit
+      return;
+    }
+
+    /* for best performance set FTZ and DAZ flags in the MXCSR control and status register */
+    const unsigned int mxcsr = _mm_getcsr();
+    _mm_setcsr(mxcsr | /* FTZ */ (1<<15) | /* DAZ */ (1<<6));
+
+    try {
+
+      group.run([&]{
+          concurrency::parallel_for(size_t(0), size_t(1), size_t(1), [&](size_t) { commit_task(); });
+        });
+      group.wait();
+
+       /* reset MXCSR register again */
+      _mm_setcsr(mxcsr);
+    }
+    catch (...)
+    {
+      /* reset MXCSR register again */
+      _mm_setcsr(mxcsr);
+
+      accels_clear(); // drop the accels of the failed build, then mark the scene as built
+      updateInterface();
+      throw;
+    }
+  }
+#endif
+
+  void Scene::setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr) // stores the callback and the pointer handed back to it by progressMonitor()
+  {
+    progress_monitor_function = func;
+    progress_monitor_ptr      = ptr;
+  }
+
+  void Scene::progressMonitor(double dn) // reports progress to the user callback (if set); throws RTC_ERROR_CANCELLED when it returns false
+  {
+    if (progress_monitor_function) {
+      size_t n = size_t(dn) + progress_monitor_counter.fetch_add(size_t(dn)); // fetch_add yields the previous total, so n is the new running total
+      if (!progress_monitor_function(progress_monitor_ptr, n / (double(numPrimitives())))) { // second argument: running total divided by the scene's primitive count
+        throw_RTCError(RTC_ERROR_CANCELLED,"progress monitor forced termination");
+      }
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene.h b/thirdparty/embree-aarch64/kernels/common/scene.h
new file mode 100644
index 0000000000..b41c6cde91
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene.h
@@ -0,0 +1,390 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+ 
+#pragma once
+
+#include "default.h"
+#include "device.h"
+#include "builder.h"
+#include "../../common/algorithms/parallel_any_of.h"
+#include "scene_triangle_mesh.h"
+#include "scene_quad_mesh.h"
+#include "scene_user_geometry.h"
+#include "scene_instance.h"
+#include "scene_curves.h"
+#include "scene_line_segments.h"
+#include "scene_subdiv_mesh.h"
+#include "scene_grid_mesh.h"
+#include "scene_points.h"
+#include "../subdiv/tessellation_cache.h"
+
+#include "acceln.h"
+#include "geometry.h"
+
+namespace embree
+{
+  /*! Base class all scenes are derived from */
+  class Scene : public AccelN
+  {
+    ALIGNED_CLASS_(std::alignment_of<Scene>::value);
+
+  public:
+    template<typename Ty, bool mblur = false>
+      class Iterator // index-based view of the scene's geometries of type Ty with matching motion-blur state
+      {
+      public:
+      Iterator () : scene(nullptr), all(false) {} // members are initialised so a default-constructed iterator is never indeterminate
+      
+      Iterator (Scene* scene, bool all = false) 
+      : scene(scene), all(all) {}
+      
+      __forceinline Ty* at(const size_t i) // geometry in slot i, or nullptr when it is empty or filtered out
+      {
+        Geometry* geom = scene->geometries[i].ptr;
+        if (geom == nullptr) return nullptr;
+        if (!all && !geom->isEnabled()) return nullptr; // disabled geometries are visible only when `all` is set
+        const size_t mask = geom->getTypeMask() & Ty::geom_type; 
+        if (!(mask)) return nullptr;
+        if ((geom->numTimeSteps != 1) != mblur) return nullptr; // more than one time step must coincide with mblur
+        return (Ty*) geom;
+      }
+
+      __forceinline Ty* operator[] (const size_t i) {
+        return at(i);
+      }
+
+      __forceinline size_t size() const { // number of slots in the scene, not the number of matches
+        return scene->size();
+      }
+      
+      __forceinline size_t numPrimitives() const {
+        return scene->getNumPrimitives(Ty::geom_type,mblur);
+      }
+
+      __forceinline size_t maxPrimitivesPerGeometry() 
+      {
+        size_t ret = 0;
+        for (size_t i=0; i<scene->size(); i++) {
+          Ty* mesh = at(i);
+          if (mesh == nullptr) continue;
+          ret = max(ret,mesh->size());
+        }
+        return ret;
+      }
+
+      __forceinline unsigned int maxGeomID() // largest matching slot index; 0 when nothing matches
+      {
+        unsigned int ret = 0;
+        for (size_t i=0; i<scene->size(); i++) {
+          Ty* mesh = at(i);
+          if (mesh == nullptr) continue;
+          ret = max(ret,(unsigned int)i);
+        }
+        return ret;
+      }
+
+      __forceinline unsigned maxTimeStepsPerGeometry()
+      {
+        unsigned ret = 0;
+        for (size_t i=0; i<scene->size(); i++) {
+          Ty* mesh = at(i);
+          if (mesh == nullptr) continue;
+          ret = max(ret,mesh->numTimeSteps);
+        }
+        return ret;
+      }
+      
+    private:
+      Scene* scene;
+      bool all;
+      };
+
+      class Iterator2 // like Iterator, but the type mask and motion-blur state are chosen at run time
+      {
+      public:
+      Iterator2 () : scene(nullptr), typemask(), mblur(false) {} // members are initialised so a default-constructed iterator is never indeterminate
+      
+      Iterator2 (Scene* scene, Geometry::GTypeMask typemask, bool mblur) 
+      : scene(scene), typemask(typemask), mblur(mblur) {}
+      
+      __forceinline Geometry* at(const size_t i) // geometry in slot i, or nullptr when it is empty, disabled or filtered out
+      {
+        Geometry* geom = scene->geometries[i].ptr;
+        if (geom == nullptr) return nullptr;
+        if (!geom->isEnabled()) return nullptr;
+        if (!(geom->getTypeMask() & typemask)) return nullptr;
+        if ((geom->numTimeSteps != 1) != mblur) return nullptr; // more than one time step must coincide with mblur
+        return geom;
+      }
+
+      __forceinline Geometry* operator[] (const size_t i) {
+        return at(i);
+      }
+
+      __forceinline size_t size() const { // number of slots in the scene, not the number of matches
+        return scene->size();
+      }
+      
+    private:
+      Scene* scene;
+      Geometry::GTypeMask typemask;
+      bool mblur;
+    };
+
+  public:
+    
+    /*! Scene construction */
+    Scene (Device* device);
+
+    /*! Scene destruction */
+    ~Scene () noexcept;
+
+  private:
+    /*! class is non-copyable */
+    Scene (const Scene& other) DELETED; // do not implement
+    Scene& operator= (const Scene& other) DELETED; // do not implement
+
+  public:
+    void createTriangleAccel();
+    void createTriangleMBAccel();
+    void createQuadAccel();
+    void createQuadMBAccel();
+    void createHairAccel();
+    void createHairMBAccel();
+    void createSubdivAccel();
+    void createSubdivMBAccel();
+    void createUserGeometryAccel();
+    void createUserGeometryMBAccel();
+    void createInstanceAccel();
+    void createInstanceMBAccel();
+    void createInstanceExpensiveAccel();
+    void createInstanceExpensiveMBAccel();
+    void createGridAccel();
+    void createGridMBAccel();
+
+    /*! prints statistics about the scene */
+    void printStatistics();
+
+    /*! clears the scene */
+    void clear();
+
+    /*! detaches some geometry */
+    void detachGeometry(size_t geomID);
+
+    void setBuildQuality(RTCBuildQuality quality_flags);
+    RTCBuildQuality getBuildQuality() const;
+    
+    void setSceneFlags(RTCSceneFlags scene_flags);
+    RTCSceneFlags getSceneFlags() const;
+    
+    void commit (bool join);
+    void commit_task ();
+    void build () {}
+
+    void updateInterface();
+
+    /* return number of geometries */
+    __forceinline size_t size() const { return geometries.size(); }
+    
+    /* bind geometry to the scene */
+    unsigned int bind (unsigned geomID, Ref<Geometry> geometry);
+    
+    /* determines if scene is modified */
+    __forceinline bool isModified() const { return modified; }
+
+    /* sets modified flag */
+    __forceinline void setModified(bool f = true) { 
+      modified = f; 
+    }
+
+    __forceinline bool isGeometryModified(size_t geomID) // true when the geometry's mod counter is ahead of the value recorded for its slot
+    {
+      Ref<Geometry>& g = geometries[geomID];
+      if (!g) return false; // an empty slot is never modified
+      return g->getModCounter() > geometryModCounters_[geomID];
+    }
+
+  protected:
+    
+    __forceinline void checkIfModifiedAndSet () 
+    {
+      /* predicate: has geometry geomID changed since its counter was recorded? */
+      auto changedSinceCommit = [this](size_t geomID)->bool {
+        return isGeometryModified(geomID);
+      };
+
+      /* scan the geometries only when the scene is not flagged already */
+      const bool needsScan = !isModified ();
+      if (needsScan && parallel_any_of (size_t(0), geometries.size (), changedSinceCommit))
+        setModified ();
+    }
+    
+  public:
+
+    /* get mesh by ID */
+    __forceinline       Geometry* get(size_t i)       { assert(i < geometries.size()); return geometries[i].ptr; } // raw pointer; bounds checked by assert only
+    __forceinline const Geometry* get(size_t i) const { assert(i < geometries.size()); return geometries[i].ptr; }
+
+    template<typename Mesh>
+      __forceinline       Mesh* get(size_t i)       { 
+      assert(i < geometries.size()); 
+      assert(geometries[i]->getTypeMask() & Mesh::geom_type); // type is checked by assert only; the slot is dereferenced without a null check
+      return (Mesh*)geometries[i].ptr; 
+    }
+    template<typename Mesh>
+      __forceinline const Mesh* get(size_t i) const { 
+      assert(i < geometries.size()); 
+      assert(geometries[i]->getTypeMask() & Mesh::geom_type); // type is checked by assert only; the slot is dereferenced without a null check
+      return (Mesh*)geometries[i].ptr; 
+    }
+
+    template<typename Mesh>
+    __forceinline Mesh* getSafe(size_t i) {
+      assert(i < geometries.size());
+      /* nullptr for an empty slot or a geometry whose type mask lacks Mesh::geom_type */
+      const bool matches = !(geometries[i] == null) && (geometries[i]->getTypeMask() & Mesh::geom_type);
+      return matches ? (Mesh*) geometries[i].ptr : nullptr;
+    }
+
+    __forceinline Ref<Geometry> get_locked(size_t i)  { // returns a Ref copy of slot i, read while holding geometriesMutex
+      Lock<SpinLock> lock(geometriesMutex);
+      assert(i < geometries.size()); 
+      return geometries[i]; 
+    }
+
+    /* flag decoding: each predicate below tests bits of scene_flags */
+    __forceinline bool isFastAccel() const { return !isCompactAccel() && !isRobustAccel(); }
+    __forceinline bool isCompactAccel() const { return scene_flags & RTC_SCENE_FLAG_COMPACT; }
+    __forceinline bool isRobustAccel()  const { return scene_flags & RTC_SCENE_FLAG_ROBUST; }
+    __forceinline bool isStaticAccel()  const { return !(scene_flags & RTC_SCENE_FLAG_DYNAMIC); }
+    __forceinline bool isDynamicAccel() const { return scene_flags & RTC_SCENE_FLAG_DYNAMIC; }
+    
+    __forceinline bool hasContextFilterFunction() const {
+      return scene_flags & RTC_SCENE_FLAG_CONTEXT_FILTER_FUNCTION;
+    }
+    
+    __forceinline bool hasGeometryFilterFunction() { // based on the counts gathered in `world`
+      return world.numFilterFunctions != 0;
+    }
+      
+    __forceinline bool hasFilterFunction() {
+      return hasContextFilterFunction() || hasGeometryFilterFunction();
+    }
+    
+    /* tests whether the scene has already been built (is_build is set by updateInterface()) */
+    __forceinline bool isBuild() const { return is_build; }
+
+  public:
+    IDPool<unsigned,0xFFFFFFFE> id_pool;
+    vector<Ref<Geometry>> geometries; //!< list of all user geometries
+    vector<unsigned int> geometryModCounters_;
+    vector<float*> vertices;
+    
+  public:
+    Device* device;
+
+    /* these are to detect if we need to recreate the acceleration structures */
+    bool flags_modified;
+    unsigned int enabled_geometry_types;
+    
+    RTCSceneFlags scene_flags;
+    RTCBuildQuality quality_flags;
+    MutexSys buildMutex;
+    SpinLock geometriesMutex;
+    bool is_build;
+  private:
+    bool modified;                   //!< true if scene got modified
+
+  public:
+    
+    /*! global lock step task scheduler */
+#if defined(TASKING_INTERNAL) 
+    MutexSys schedulerMutex;
+    Ref<TaskScheduler> scheduler;
+#elif defined(TASKING_TBB) && TASKING_TBB_USE_TASK_ISOLATION
+    tbb::isolated_task_group group;
+#elif defined(TASKING_TBB)
+    tbb::task_group group;
+#elif defined(TASKING_PPL)
+    concurrency::task_group group;
+#endif
+    
+  public:
+    struct BuildProgressMonitorInterface : public BuildProgressMonitor { // adapter that forwards builder progress to Scene::progressMonitor
+      BuildProgressMonitorInterface(Scene* scene) 
+      : scene(scene) {}
+      void operator() (size_t dn) const { scene->progressMonitor(double(dn)); } // dn is passed on as a double
+    private:
+      Scene* scene;
+    };
+    BuildProgressMonitorInterface progressInterface;
+    RTCProgressMonitorFunction progress_monitor_function;
+    void* progress_monitor_ptr;
+    std::atomic<size_t> progress_monitor_counter;
+    void progressMonitor(double nprims);
+    void setProgressMonitorFunction(RTCProgressMonitorFunction func, void* ptr);
+
+  private:
+    GeometryCounts world;               //!< counts for geometry
+
+  public:
+
+    __forceinline size_t numPrimitives() const {
+      return world.size();
+    }
+
+    __forceinline size_t getNumPrimitives(Geometry::GTypeMask mask, bool mblur) const // sums the `world` counts of every type bit set in mask
+    {
+      size_t count = 0; // for each type, mblur picks the numMB* counter, otherwise the plain one
+      
+      if (mask & Geometry::MTY_TRIANGLE_MESH)
+        count += mblur ? world.numMBTriangles : world.numTriangles;
+      
+      if (mask & Geometry::MTY_QUAD_MESH)
+        count += mblur ? world.numMBQuads : world.numQuads;
+      
+      if (mask & Geometry::MTY_CURVE2)
+        count += mblur ? world.numMBLineSegments : world.numLineSegments;
+      
+      if (mask & Geometry::MTY_CURVE4)
+        count += mblur ? world.numMBBezierCurves : world.numBezierCurves;
+      
+      if (mask & Geometry::MTY_POINTS)
+        count += mblur ? world.numMBPoints : world.numPoints;
+      
+      if (mask & Geometry::MTY_SUBDIV_MESH)
+        count += mblur ? world.numMBSubdivPatches : world.numSubdivPatches;
+      
+      if (mask & Geometry::MTY_USER_GEOMETRY)
+        count += mblur ? world.numMBUserGeometries : world.numUserGeometries;
+      
+      if (mask & Geometry::MTY_INSTANCE_CHEAP)
+        count += mblur ? world.numMBInstancesCheap : world.numInstancesCheap;
+      
+      if (mask & Geometry::MTY_INSTANCE_EXPENSIVE)
+        count += mblur  ? world.numMBInstancesExpensive : world.numInstancesExpensive;
+      
+      if (mask & Geometry::MTY_GRID_MESH)
+        count += mblur  ? world.numMBGrids : world.numGrids;
+      
+      return count;
+    }
+    
+    template<typename Mesh, bool mblur>
+    __forceinline unsigned getNumTimeSteps()
+    {
+      if (mblur) { /* largest time step count among the matching geometries */
+        Scene::Iterator<Mesh,mblur> geomIter(this);
+        return geomIter.maxTimeStepsPerGeometry();
+      }
+      return 1; /* no motion blur: a single time step */
+    }
+
+    template<typename Mesh, bool mblur>
+    __forceinline unsigned int getMaxGeomID() // largest slot index holding an enabled Mesh with matching motion-blur state (0 if none)
+    {
+      Scene::Iterator<Mesh,mblur> iter(this);
+      return iter.maxGeomID();
+    }
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_curves.h b/thirdparty/embree-aarch64/kernels/common/scene_curves.h
new file mode 100644
index 0000000000..2649ab0e3e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_curves.h
@@ -0,0 +1,341 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! represents an array of bicubic bezier curves */
+  struct CurveGeometry : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE4;
+
+  public:
+    
+    /*! bezier curve construction */
+    CurveGeometry (Device* device, Geometry::GType gtype);
+    
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void setTessellationRate(float N);
+    void setMaxRadiusScale(float s);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+  public:
+    
+    /*! returns the number of vertices */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+
+    /*! returns the i'th curve */
+    __forceinline const unsigned int& curve(size_t i) const {
+      return curves[i];
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline Vec3ff vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns i'th normal of the first time step */
+    __forceinline Vec3fa normal(size_t i) const {
+      return normals0[i];
+    }
+
+    /*! returns i'th tangent of the first time step */
+    __forceinline Vec3ff tangent(size_t i) const {
+      return tangents0[i];
+    }
+
+    /*! returns i'th normal derivative of the first time step */
+    __forceinline Vec3fa dnormal(size_t i) const {
+      return dnormals0[i];
+    }
+
+    /*! returns i'th radius of the first time step */
+    __forceinline float radius(size_t i) const {
+      return vertices0[i].w;
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns i'th normal of itime'th timestep */
+    __forceinline Vec3fa normal(size_t i, size_t itime) const {
+      return normals[itime][i];
+    }
+
+    /*! returns i'th tangent of itime'th timestep */
+    __forceinline Vec3ff tangent(size_t i, size_t itime) const {
+      return tangents[itime][i];
+    }
+
+    /*! returns i'th normal derivative of itime'th timestep */
+    __forceinline Vec3fa dnormal(size_t i, size_t itime) const {
+      return dnormals[itime][i];
+    }
+
+    /*! returns i'th radius of itime'th timestep */
+    __forceinline float radius(size_t i, size_t itime) const {
+      return vertices[itime][i].w;
+    }
+
+    /*! gathers the curve starting with i'th vertex */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i) const
+    {
+      p0 = vertex(i+0);
+      p1 = vertex(i+1);
+      p2 = vertex(i+2);
+      p3 = vertex(i+3);
+    }
+
+    /*! gathers the curve starting with i'th vertex of itime'th timestep */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, size_t itime) const
+    {
+      p0 = vertex(i+0,itime);
+      p1 = vertex(i+1,itime);
+      p2 = vertex(i+2,itime);
+      p3 = vertex(i+3,itime);
+    }
+
+    /*! gathers the curve starting with i'th vertex */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i) const
+    {
+      p0 = vertex(i+0);
+      p1 = vertex(i+1);
+      p2 = vertex(i+2);
+      p3 = vertex(i+3);
+      n0 = normal(i+0);
+      n1 = normal(i+1);
+      n2 = normal(i+2);
+      n3 = normal(i+3);
+    }
+
+    /*! gathers the curve starting with i'th vertex of itime'th timestep */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, size_t itime) const
+    {
+      p0 = vertex(i+0,itime);
+      p1 = vertex(i+1,itime);
+      p2 = vertex(i+2,itime);
+      p3 = vertex(i+3,itime);
+      n0 = normal(i+0,itime);
+      n1 = normal(i+1,itime);
+      n2 = normal(i+2,itime);
+      n3 = normal(i+3,itime);
+    }
+
+    /*! prefetches into L1 the curve starting with i'th vertex of the first time step */
+    __forceinline void prefetchL1_vertices(size_t i) const
+    {
+      prefetchL1(vertices0.getPtr(i)+0);
+      prefetchL1(vertices0.getPtr(i)+64);
+    }
+
+    /*! prefetches into L2 the curve starting with i'th vertex of the first time step */
+    __forceinline void prefetchL2_vertices(size_t i) const
+    {
+      prefetchL2(vertices0.getPtr(i)+0);
+      prefetchL2(vertices0.getPtr(i)+64);
+    }  
+
+    /*! loads curve vertices for specified time */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, size_t i, float time) const
+    {
+      float ftime;
+      const size_t itime = timeSegment(time, ftime);
+
+      const float t0 = 1.0f - ftime;
+      const float t1 = ftime;
+      Vec3ff a0,a1,a2,a3;
+      gather(a0,a1,a2,a3,i,itime);
+      Vec3ff b0,b1,b2,b3;
+      gather(b0,b1,b2,b3,i,itime+1);
+      p0 = madd(Vec3ff(t0),a0,t1*b0);
+      p1 = madd(Vec3ff(t0),a1,t1*b1);
+      p2 = madd(Vec3ff(t0),a2,t1*b2);
+      p3 = madd(Vec3ff(t0),a3,t1*b3);
+    }
+
+    /*! loads control points and normals starting at vertex i, blended between the two time steps that enclose 'time' */
+    __forceinline void gather(Vec3ff& p0, Vec3ff& p1, Vec3ff& p2, Vec3ff& p3, Vec3fa& n0, Vec3fa& n1, Vec3fa& n2, Vec3fa& n3, size_t i, float time) const
+    {
+      float frac;
+      const size_t seg = timeSegment(time, frac);
+      const float w0 = 1.0f - frac, w1 = frac;
+
+      /* positions and normals at the start (lo) and at the end (hi) of the time segment */
+      Vec3ff lo0,lo1,lo2,lo3; Vec3fa lon0,lon1,lon2,lon3;
+      Vec3ff hi0,hi1,hi2,hi3; Vec3fa hin0,hin1,hin2,hin3;
+      gather(lo0,lo1,lo2,lo3,lon0,lon1,lon2,lon3,i,seg);
+      gather(hi0,hi1,hi2,hi3,hin0,hin1,hin2,hin3,i,seg+1);
+      p0 = madd(Vec3ff(w0),lo0,w1*hi0);
+      p1 = madd(Vec3ff(w0),lo1,w1*hi1);
+      p2 = madd(Vec3ff(w0),lo2,w1*hi2);
+      p3 = madd(Vec3ff(w0),lo3,w1*hi3);
+      n0 = madd(Vec3ff(w0),lon0,w1*hin0);
+      n1 = madd(Vec3ff(w0),lon1,w1*hin1);
+      n2 = madd(Vec3ff(w0),lon2,w1*hin2);
+      n3 = madd(Vec3ff(w0),lon3,w1*hin3);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
+    {
+      const unsigned int first = curve(primID); // index of the primitive's first control point
+      Vec3ff c0,c1,c2,c3; Vec3fa m0,m1,m2,m3;
+      gather(c0,c1,c2,c3,m0,m1,m2,m3,first,itime);
+      SourceCurve3ff center(c0,c1,c2,c3);
+      SourceCurve3fa orient(m0,m1,m2,m3);
+      center = enlargeRadiusToMinWidth(context,this,ray_org,center); // only the center curve goes through the min-width adjustment
+      return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(center,orient);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+    {
+      float frac; // fractional position of 'time' inside its time segment, written by timeSegment
+      const size_t seg = timeSegment(time, frac);
+      const TensorLinearCubicBezierSurface3fa atStart = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,seg+0);
+      const TensorLinearCubicBezierSurface3fa atEnd   = getNormalOrientedCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context,ray_org,primID,seg+1);
+      return clerp(atStart,atEnd,frac); // combine the surfaces built at both ends of the segment
+    }
+
+    /*! gathers the hermite segment between vertex i and vertex i+1 of the first time step */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i) const
+    {
+      /* the two end points of the segment */
+      p0 = vertex (i+0); p1 = vertex (i+1);
+      /* the tangents stored at the same two indices */
+      t0 = tangent(i+0); t1 = tangent(i+1);
+    }
+
+    /*! gathers the hermite segment between vertex i and vertex i+1 of the itime'th time step */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, size_t itime) const
+    {
+      /* the two end points of the segment */
+      p0 = vertex (i+0,itime); p1 = vertex (i+1,itime);
+      /* the tangents stored at the same two indices */
+      t0 = tangent(i+0,itime); t1 = tangent(i+1,itime);
+    }
+
+    /*! loads the hermite segment starting at vertex i, blended between the two time steps that enclose 'time' */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3ff& p1, Vec3ff& t1, size_t i, float time) const
+    {
+      float frac;
+      const size_t seg = timeSegment(time, frac);
+      const float w0 = 1.0f - frac, w1 = frac;
+      Vec3ff lop0,lot0,lop1,lot1, hip0,hit0,hip1,hit1;
+      gather_hermite(lop0,lot0,lop1,lot1,i,seg);
+      gather_hermite(hip0,hit0,hip1,hit1,i,seg+1);
+      /* every output is w0 * (value at segment start) + w1 * (value at segment end) */
+      p0 = madd(Vec3ff(w0),lop0,w1*hip0);
+      t0 = madd(Vec3ff(w0),lot0,w1*hit0);
+      p1 = madd(Vec3ff(w0),lop1,w1*hip1);
+      t1 = madd(Vec3ff(w0),lot1,w1*hit1);
+    }
+
+    /*! gathers end points, tangents, normals and normal derivatives for vertices i and i+1 of the first time step */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i) const
+    {
+      /* end points */
+      p0 = vertex (i+0); p1 = vertex (i+1);
+      /* tangents */
+      t0 = tangent(i+0); t1 = tangent(i+1);
+      /* normals */
+      n0 = normal(i+0); n1 = normal(i+1);
+      /* normal derivatives */
+      dn0 = dnormal(i+0); dn1 = dnormal(i+1);
+    }
+
+    /*! gathers end points, tangents, normals and normal derivatives for vertices i and i+1 of the itime'th time step */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3ff& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3ff& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, size_t itime) const
+    {
+      /* end points */
+      p0 = vertex (i+0,itime); p1 = vertex (i+1,itime);
+      /* tangents */
+      t0 = tangent(i+0,itime); t1 = tangent(i+1,itime);
+      /* normals */
+      n0 = normal(i+0,itime); n1 = normal(i+1,itime);
+      /* normal derivatives */
+      dn0 = dnormal(i+0,itime); dn1 = dnormal(i+1,itime);
+    }
+
+    /*! loads end points, tangents, normals and normal derivatives of the hermite segment at vertex i, blended between the two time steps enclosing 'time'. NOTE(review): t0/t1 are Vec3fa here but Vec3ff in the other gather_hermite overloads - confirm this is intended */
+    __forceinline void gather_hermite(Vec3ff& p0, Vec3fa& t0, Vec3fa& n0, Vec3fa& dn0, Vec3ff& p1, Vec3fa& t1, Vec3fa& n1, Vec3fa& dn1, size_t i, float time) const
+    {
+      float ftime; // receives the fractional position inside the time segment
+      const size_t itime = timeSegment(time, ftime);
+      const float f0 = 1.0f - ftime, f1 = ftime; // weights of segment start / segment end
+      Vec3ff ap0,at0,ap1,at1; Vec3fa an0,adn0,an1,adn1; // a*: data at time step itime
+      gather_hermite(ap0,at0,an0,adn0,ap1,at1,an1,adn1,i,itime);
+      Vec3ff bp0,bt0,bp1,bt1; Vec3fa bn0,bdn0,bn1,bdn1; // b*: data at time step itime+1
+      gather_hermite(bp0,bt0,bn0,bdn0,bp1,bt1,bn1,bdn1,i,itime+1);
+      p0 = madd(Vec3ff(f0),ap0,f1*bp0);
+      t0 = madd(Vec3ff(f0),at0,f1*bt0); // Vec3ff inputs assigned to a Vec3fa output
+      n0 = madd(Vec3ff(f0),an0,f1*bn0);
+      dn0= madd(Vec3ff(f0),adn0,f1*bdn0);
+      p1 = madd(Vec3ff(f0),ap1,f1*bp1);
+      t1 = madd(Vec3ff(f0),at1,f1*bt1); // Vec3ff inputs assigned to a Vec3fa output
+      n1 = madd(Vec3ff(f0),an1,f1*bn1);
+      dn1= madd(Vec3ff(f0),adn1,f1*bdn1);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const size_t itime) const
+    {
+      const unsigned int first = curve(primID); // index of the primitive's first vertex
+      Vec3ff p0,d0,p1,d1; Vec3fa m0,dm0,m1,dm1;
+      gather_hermite(p0,d0,m0,dm0,p1,d1,m1,dm1,first,itime);
+
+      SourceCurve3ff center(p0,d0,p1,d1);
+      SourceCurve3fa orient(m0,dm0,m1,dm1);
+      center = enlargeRadiusToMinWidth(context,this,ray_org,center); // only the center curve goes through the min-width adjustment
+      return TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(center,orient);
+    }
+
+    template<typename SourceCurve3ff, typename SourceCurve3fa, typename TensorLinearCubicBezierSurface3fa>
+    __forceinline TensorLinearCubicBezierSurface3fa getNormalOrientedHermiteCurve(IntersectContext* context, const Vec3fa& ray_org, const unsigned int primID, const float time) const
+    {
+      float frac; // fractional position of 'time' inside its time segment, written by timeSegment
+      const size_t seg = timeSegment(time, frac);
+      const TensorLinearCubicBezierSurface3fa atStart = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,seg+0);
+      const TensorLinearCubicBezierSurface3fa atEnd   = getNormalOrientedHermiteCurve<SourceCurve3ff, SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray_org, primID,seg+1);
+      return clerp(atStart,atEnd,frac); // combine the surfaces built at both ends of the segment
+    }
+
+  private:
+    void resizeBuffers(unsigned int numSteps);
+
+  public:
+    BufferView<unsigned int> curves;        //!< array of curve indices
+    BufferView<Vec3ff> vertices0;           //!< fast access to first vertex buffer
+    BufferView<Vec3fa> normals0;            //!< fast access to first normal buffer
+    BufferView<Vec3ff> tangents0;           //!< fast access to first tangent buffer
+    BufferView<Vec3fa> dnormals0;           //!< fast access to first normal derivative buffer
+    vector<BufferView<Vec3ff>> vertices;    //!< vertex array for each timestep
+    vector<BufferView<Vec3fa>> normals;     //!< normal array for each timestep
+    vector<BufferView<Vec3ff>> tangents;    //!< tangent array for each timestep
+    vector<BufferView<Vec3fa>> dnormals;    //!< normal derivative array for each timestep
+    BufferView<char> flags;                 //!< start, end flag per segment
+    vector<BufferView<char>> vertexAttribs; //!< user buffers
+    int tessellationRate;                   //!< tessellation rate for flat curve
+    float maxRadiusScale = 1.0;             //!< maximal min-width scaling of curve radii
+  };
+  
+  DECLARE_ISA_FUNCTION(CurveGeometry*, createCurves, Device* COMMA Geometry::GType);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h
new file mode 100644
index 0000000000..c08658466a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_grid_mesh.h
@@ -0,0 +1,215 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! Grid Mesh */
+  struct GridMesh : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_GRID_MESH;
+
+    /*! grid */
+    struct Grid
+    {
+      unsigned int startVtxID;     // vertex buffer index of grid vertex (0,0)
+      unsigned int lineVtxOffset;  // index distance between vertex (x,y) and vertex (x,y+1)
+      unsigned short resX,resY;    // number of vertices along x and along y
+
+      /* border flag due to 3x3 vertex pattern: bit 15 is set when x + 2 >= resX */
+      __forceinline unsigned int get3x3FlagsX(const unsigned int x) const {
+        const bool nearBorder = x + 2 >= (unsigned int)resX;
+        return nearBorder ? (1<<15) : 0;
+      }
+
+      /* border flag due to 3x3 vertex pattern: bit 15 is set when y + 2 >= resY */
+      __forceinline unsigned int get3x3FlagsY(const unsigned int y) const {
+        const bool nearBorder = y + 2 >= (unsigned int)resY;
+        return nearBorder ? (1<<15) : 0;
+      }
+
+      /*! writes all four fields of the grid to the stream */
+      __forceinline friend embree_ostream operator<<(embree_ostream os, const Grid& g) {
+        return os << "Grid { startVtxID " << g.startVtxID << ", lineVtxOffset " << g.lineVtxOffset << ", resX " << g.resX << ", resY " << g.resY << " }";
+      }
+    };
+
+  public:
+
+    /*! grid mesh construction */
+    GridMesh (Device* device); 
+
+    /* geometry interface */
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+    __forceinline unsigned int getNumSubGrids(const size_t gridID) const // const like the other accessors: it only reads the grid description
+    {
+      const Grid &g = grid(gridID);
+      return max((unsigned int)1,((unsigned int)g.resX >> 1) * ((unsigned int)g.resY >> 1)); // (resX/2)*(resY/2), but never less than 1
+    }
+
+    /*! returns the start of the first time step's vertex buffer (vertices0), cast to a float pointer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+  public:
+
+    /*! returns the number of vertices, taken from the buffer of the first time step (vertices[0]) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+    
+    /*! returns a reference to the i'th entry of the grids buffer */
+    __forceinline const Grid& grid(size_t i) const {
+      return grids[i];
+    }
+
+    /*! returns (by value) the i'th vertex of the first time step, read from vertices0 */
+    __forceinline const Vec3fa vertex(size_t i) const { // FIXME: check if this does a unaligned load
+      return vertices0[i];
+    }
+
+    /*! returns a raw pointer to the i'th vertex of the first time step (not the vertex itself) */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns (by value) the i'th vertex of the itime'th time step, read from vertices[itime] */
+    __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns a raw pointer to the i'th vertex of the itime'th time step (not the vertex itself) */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! returns the vertex buffer index of the vertex at position (x,y) of grid g; x and y must lie inside the grid resolution */
+    __forceinline size_t grid_vertex_index(const Grid& g, size_t x, size_t y) const {
+      assert(x < (size_t)g.resX);
+      assert(y < (size_t)g.resY);
+      return g.startVtxID + x + y * g.lineVtxOffset; // row y starts y*lineVtxOffset entries after the grid's first vertex
+    }
+    
+    /*! returns the vertex at position (x,y) of grid g in the first time step */
+    __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y) const {
+      const size_t index = grid_vertex_index(g,x,y);
+      return vertex(index);
+    }
+
+    /*! returns the vertex at position (x,y) of grid g in the itime'th time step */
+    __forceinline const Vec3fa grid_vertex(const Grid& g, size_t x, size_t y, size_t itime) const {
+      const size_t index = grid_vertex_index(g,x,y);
+      return vertex(index,itime);
+    }
+
+    /*! bounds the (up to) 3x3 vertices of grid g starting at (sx,sy) over all time steps; returns false and leaves bbox untouched if a vertex is invalid */
+    __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, BBox3fa& bbox) const
+    {
+      /* the 3x3 window is clipped at the grid resolution */
+      const size_t xEnd = min(sx+3,(size_t)g.resX);
+      const size_t yEnd = min(sy+3,(size_t)g.resY);
+      BBox3fa merged(empty);
+      for (size_t step=0; step<numTimeSteps; step++)
+        for (size_t y=sy; y<yEnd; y++)
+          for (size_t x=sx; x<xEnd; x++)
+          {
+            const Vec3fa v = grid_vertex(g,x,y,step);
+            if (unlikely(!isvalid(v))) return false;
+            merged.extend(v);
+          }
+      bbox = merged;
+      return true;
+    }
+
+    /*! bounds the (up to) 3x3 vertices of grid g starting at (sx,sy) in time step itime only; returns false and leaves bbox untouched if a vertex is invalid */
+    __forceinline bool buildBounds(const Grid& g, size_t sx, size_t sy, size_t itime, BBox3fa& bbox) const
+    {
+      assert(itime < numTimeSteps);
+      const size_t xEnd = min(sx+3,(size_t)g.resX);
+      const size_t yEnd = min(sy+3,(size_t)g.resY);
+      BBox3fa box(empty);
+      for (size_t y=sy; y<yEnd; y++)
+        for (size_t x=sx; x<xEnd; x++)
+        {
+          const Vec3fa v = grid_vertex(g,x,y,itime);
+          if (unlikely(!isvalid(v))) return false;
+          box.extend(v);
+        }
+      bbox = box; // only the vertices of time step itime contribute
+      return true;
+    }
+
+    __forceinline bool valid(size_t gridID, size_t itime=0) const { // checks a single time step (default: the first one)
+      return valid(gridID, make_range(itime, itime)); // the range overload treats end() as inclusive, so [itime,itime] is one step
+    }
+
+    /*! returns true if grid gridID exists, is non-empty, lies inside the vertex buffer and has only valid vertices in all time steps of itime_range (end inclusive) */
+    __forceinline bool valid(size_t gridID, const range<size_t>& itime_range) const
+    {
+      if (unlikely(gridID >= grids.size())) return false;
+      const Grid &g = grid(gridID);
+      if (unlikely(g.resX == 0 || g.resY == 0 || g.startVtxID >= vertices0.size())) return false; // empty grid: nothing to validate, and resY-1/resX-1 below must not go negative
+      const size_t lastVtxID = (size_t)g.startVtxID + (size_t)(g.resY-1)*(size_t)g.lineVtxOffset + (size_t)(g.resX-1); // size_t like grid_vertex_index(), so no 32-bit wrap-around
+      if (unlikely(lastVtxID >= vertices0.size())) return false;
+      for (size_t y=0;y<g.resY;y++)
+        for (size_t x=0;x<g.resX;x++)
+          for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+            if (!isvalid(grid_vertex(g,x,y,itime))) return false;
+      return true;
+    }
+
+
+    __forceinline BBox3fa bounds(const Grid& g, size_t sx, size_t sy, size_t itime) const // bounds of the 3x3 window at (sx,sy) in time step itime
+    {
+      BBox3fa box(empty);
+      buildBounds(g,sx,sy,itime,box); // result ignored: on an invalid vertex buildBounds does not write, so box stays empty
+      return box;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, size_t itime) const { // linear bounds of the 3x3 window at (sx,sy) from time step itime to itime+1
+      BBox3fa bounds0(empty), bounds1(empty); // start from empty like bounds(): buildBounds writes nothing when it hits an invalid vertex
+      buildBounds(g,sx,sy,itime+0,bounds0);
+      buildBounds(g,sx,sy,itime+1,bounds1);
+      return LBBox3fa(bounds0,bounds1);
+    }
+
+    /*! calculates the linear bounds of the 3x3 window at (sx,sy) of grid g for the time range dt, built from the per-time-step bounds() */
+    __forceinline LBBox3fa linearBounds(const Grid& g, size_t sx, size_t sy, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(g,sx,sy,itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+  public:
+    BufferView<Grid> grids;      //!< array of grids
+    BufferView<Vec3fa> vertices0;        //!< fast access to first vertex buffer
+    vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep
+    vector<RawBufferView> vertexAttribs; //!< vertex attributes
+  };
+
+  namespace isa
+  {
+    struct GridMeshISA : public GridMesh // variant of GridMesh declared inside the isa namespace
+    {
+      GridMeshISA (Device* device)
+        : GridMesh(device) {} // only forwards the device to the base class
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(GridMesh*, createGridMesh, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_instance.h b/thirdparty/embree-aarch64/kernels/common/scene_instance.h
new file mode 100644
index 0000000000..7ff82a4fb8
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_instance.h
@@ -0,0 +1,272 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "accel.h"
+
+namespace embree
+{
+  struct MotionDerivativeCoefficients;
+
+  /*! Instanced acceleration structure */
+  struct Instance : public Geometry
+  {
+    ALIGNED_STRUCT_(16);
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_INSTANCE;
+
+  public:
+    Instance (Device* device, Accel* object = nullptr, unsigned int numTimeSteps = 1);
+    ~Instance();
+
+  private:
+    Instance (const Instance& other) DELETED; // do not implement
+    Instance& operator= (const Instance& other) DELETED; // do not implement
+
+  private:
+    LBBox3fa nonlinearBounds(const BBox1f& time_range_in,
+                             const BBox1f& geom_time_range,
+                             float geom_time_segments) const;
+
+    BBox3fa boundSegment(size_t itime,
+      BBox3fa const& obbox0, BBox3fa const& obbox1,
+      BBox3fa const& bbox0, BBox3fa const& bbox1,
+      float t_min, float t_max) const;
+
+    /* bounds of the instance at fraction f between time steps itime0 and itime1: interpolated object bounds under the interpolated transform */
+    __forceinline BBox3fa bounds(size_t itime0, size_t itime1, float f) const
+    {
+      const auto objBounds = lerp(getObjectBounds(itime0), getObjectBounds(itime1), f); // identical for both transform kinds
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) // quaternion instances interpolate the transform with slerp
+        return xfmBounds(slerp(local2world[itime0], local2world[itime1], f), objBounds);
+
+      return xfmBounds(lerp(local2world[itime0], local2world[itime1], f), objBounds);
+    }
+
+  public:
+    virtual void setNumTimeSteps (unsigned int numTimeSteps) override;
+    virtual void setInstancedScene(const Ref<Scene>& scene) override;
+    virtual void setTransform(const AffineSpace3fa& local2world, unsigned int timeStep) override;
+    virtual void setQuaternionDecomposition(const AffineSpace3ff& qd, unsigned int timeStep) override;
+    virtual AffineSpace3fa getTransform(float time) override;
+    virtual void setMask (unsigned mask) override;
+    virtual void build() {}
+    virtual void addElementsToCount (GeometryCounts & counts) const override;
+    virtual void commit() override;
+
+  public:
+
+     /*! bounds of the instance at the first time step: the object's bounds transformed by local2world[0] */
+    __forceinline BBox3fa bounds(size_t i) const {
+      assert(i == 0);
+      const bool isQuaternion = gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION;
+      /* for quaternion instances local2world holds a quaternion decomposition that is converted first */
+      if (unlikely(isQuaternion)) return xfmBounds(quaternionDecompositionToAffineSpace(local2world[0]),object->bounds.bounds());
+      return xfmBounds(local2world[0],object->bounds.bounds());
+    }
+
+    /*! returns the bounds of the instanced object, queried through object->getBounds at timeStep(itime) */
+    __forceinline BBox3fa getObjectBounds(size_t itime) const {
+      return object->getBounds(timeStep(itime));
+    }
+
+     /*! bounds of the instance at time step itime: the object's bounds at that step transformed by local2world[itime] */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const {
+      assert(i == 0);
+      const BBox3fa objBounds = getObjectBounds(itime);
+      /* for quaternion instances local2world holds a quaternion decomposition that is converted first */
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) return xfmBounds(quaternionDecompositionToAffineSpace(local2world[itime]),objBounds);
+      return xfmBounds(local2world[itime],objBounds);
+    }
+
+    /*! calculates the linear bounds of the instance (i must be 0) for the time range dt */
+    __forceinline LBBox3fa linearBounds(size_t i, const BBox1f& dt) const {
+      assert(i == 0);
+      /* delegates to nonlinearBounds with this geometry's own time range and segment count */
+      return nonlinearBounds(dt, time_range, fnumTimeSegments);
+    }
+
+    /*! computes the bounds of the instance (i must be 0), stores them in *bbox when bbox is given, and returns whether they are valid */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+    {
+      assert(i==0);
+      const BBox3fa box = bounds(i);
+      if (bbox != nullptr) *bbox = box; // the output is optional
+      return isvalid(box);
+    }
+
+     /*! calculates the build bounds of the i'th item at the itime'th time segment, if it's valid; bbox receives the merged extent of the linear bounds */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      assert(i==0);
+      const LBBox3fa bounds = linearBounds(i,itime); // NOTE(review): the only linearBounds overload visible in this struct takes a BBox1f time range, so itime is presumably converted to a time interval rather than used as a segment index - confirm
+      bbox = bounds.bounds ();
+      return isvalid(bounds);
+    }
+
+    /* gets version info of topology; the value returned is simply numPrimitives */
+    unsigned int getTopologyVersion() const {
+      return numPrimitives;
+    }
+  
+    /* returns true if numPrimitives differs from otherVersion, a value previously obtained from getTopologyVersion() */
+    bool topologyChanged(unsigned int otherVersion) const {
+      return numPrimitives != otherVersion;
+    }
+
+    /*! returns true if the instance bounds are valid at every time step of itime_range */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      assert(i == 0);
+      /* end() is treated as inclusive here */
+      for (size_t step = itime_range.begin(); step <= itime_range.end(); step++)
+        if (!isvalid(bounds(i,step))) return false;
+      return true;
+    }
+
+    __forceinline AffineSpace3fa getLocal2World() const // local-to-world transform of the first time step
+    {
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION))
+        return quaternionDecompositionToAffineSpace(local2world[0]); // stored as a quaternion decomposition, converted on the fly
+      return local2world[0];
+    }
+
+    __forceinline AffineSpace3fa getLocal2World(float t) const
+    {
+      float frac; const unsigned int seg = timeSegment(t, frac);
+      AffineSpace3ff& lo = local2world[seg+0]; AffineSpace3ff& hi = local2world[seg+1]; // transforms at both ends of the time segment
+      if (unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION)) return slerp(lo,hi,frac);
+      return lerp(lo,hi,frac);
+    }
+
+    __forceinline AffineSpace3fa getWorld2Local() const { // returns the stored world-to-local transform for time step 0
+      return world2local0;
+    }
+
+    __forceinline AffineSpace3fa getWorld2Local(float t) const { // world-to-local transform at time t
+      return rcp(getLocal2World(t)); // rcp applied to the interpolated local-to-world transform (presumably its inverse)
+    }
+
+    template<int K>
+    __forceinline AffineSpace3vf<K> getWorld2Local(const vbool<K>& valid, const vfloat<K>& t) const
+    {
+      /* K-wide world-to-local transform at the per-lane times t; quaternion */
+      /* instances take the slerp path, all other instances the lerp path */
+      return unlikely(gsubtype == GTY_SUBTYPE_INSTANCE_QUATERNION) ? getWorld2LocalSlerp(valid, t) : getWorld2LocalLerp(valid, t);
+    }
+
+    private:
+
+    template<int K>
+    __forceinline AffineSpace3vf<K> getWorld2LocalSlerp(const vbool<K>& valid, const vfloat<K>& t) const // slerp variant used for quaternion instances
+    {
+      vfloat<K> ftime; // per-lane fractional position inside the time segment
+      const vint<K> itime_k = timeSegment(t, ftime); // per-lane time segment index
+      assert(any(valid));
+      const size_t index = bsf(movemask(valid)); // presumably the first active lane - confirm
+      const int itime = itime_k[index];
+      if (likely(all(valid, itime_k == vint<K>(itime)))) { // fast path: every active lane is in the same segment
+        return rcp(slerp(AffineSpace3vff<K>(local2world[itime+0]),
+                         AffineSpace3vff<K>(local2world[itime+1]),
+                         ftime));
+      }
+      else { // lanes fall into different segments: assemble per-lane transforms
+        AffineSpace3vff<K> space0,space1; // only lanes selected below are assigned
+        vbool<K> valid1 = valid; // lanes still to be processed
+        while (any(valid1)) {
+          vbool<K> valid2; // lanes handled in this iteration
+          const int itime = next_unique(valid1, itime_k, valid2); // shadows the outer itime; presumably yields one remaining segment, its lanes in valid2, and removes them from valid1 - confirm
+          space0 = select(valid2, AffineSpace3vff<K>(local2world[itime+0]), space0);
+          space1 = select(valid2, AffineSpace3vff<K>(local2world[itime+1]), space1);
+        }
+        return rcp(slerp(space0, space1, ftime));
+      }
+    }
+
+    template<int K>
+    __forceinline AffineSpace3vf<K> getWorld2LocalLerp(const vbool<K>& valid, const vfloat<K>& t) const // lerp variant used for non-quaternion instances
+    {
+      vfloat<K> ftime; // per-lane fractional position inside the time segment
+      const vint<K> itime_k = timeSegment(t, ftime); // per-lane time segment index
+      assert(any(valid));
+      const size_t index = bsf(movemask(valid)); // presumably the first active lane - confirm
+      const int itime = itime_k[index];
+      if (likely(all(valid, itime_k == vint<K>(itime)))) { // fast path: every active lane is in the same segment
+        return rcp(lerp(AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]),
+                        AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]),
+                        ftime));
+      } else { // lanes fall into different segments: assemble per-lane transforms
+        AffineSpace3vf<K> space0,space1; // only lanes selected below are assigned
+        vbool<K> valid1 = valid; // lanes still to be processed
+        while (any(valid1)) {
+          vbool<K> valid2; // lanes handled in this iteration
+          const int itime = next_unique(valid1, itime_k, valid2); // shadows the outer itime; presumably yields one remaining segment, its lanes in valid2, and removes them from valid1 - confirm
+          space0 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+0]), space0);
+          space1 = select(valid2, AffineSpace3vf<K>((AffineSpace3fa)local2world[itime+1]), space1);
+        }
+        return rcp(lerp(space0, space1, ftime));
+      }
+    }
+
+  public:
+    Accel* object;                 //!< pointer to instanced acceleration structure
+    AffineSpace3ff* local2world;   //!< transformation from local space to world space for each timestep (either normal matrix or quaternion decomposition)
+    AffineSpace3fa world2local0;   //!< transformation from world space to local space for timestep 0
+  };
+
+  namespace isa
+  {
+    struct InstanceISA : public Instance
+    {
+      InstanceISA (Device* device)
+        : Instance(device) {}
+
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        /* an instance consists of exactly one primitive, so the range must be [0,1) */
+        assert(r.begin() == 0);
+        assert(r.end()   == 1);
+        PrimInfo info(empty);
+        BBox3fa box = empty;
+        const bool usable = buildBounds(0,&box);
+        if (usable)
+        {
+          const PrimRef ref(box,geomID,unsigned(0));
+          info.add_center2(ref);
+          prims[k] = ref; // written to slot k; k is a by-value parameter, so no increment is needed
+        }
+        return info; // stays empty when the bounds are invalid
+      }
+
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        /* itime is not used: the bounds come from buildBounds(0,&box), as in createPrimRefArray */
+        assert(r.begin() == 0);
+        assert(r.end()   == 1);
+        PrimInfo info(empty);
+        BBox3fa box = empty;
+        if (buildBounds(0,&box))
+        {
+          const PrimRef ref(box,geomID,unsigned(0));
+          info.add_center2(ref);
+          prims[k] = ref; // written to slot k; k is a by-value parameter, so no increment is needed
+        }
+        return info; // stays empty when the bounds are invalid
+      }
+      
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        assert(r.begin() == 0);
+        assert(r.end()   == 1);
+        PrimInfoMB info(empty);
+        /* the instance must be valid at all time steps touched by t0t1, otherwise nothing is emitted */
+        if (!valid(0, timeSegmentRange(t0t1))) return info;
+        const PrimRefMB ref(linearBounds(0,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(0));
+        info.add_primref(ref);
+        prims[k] = ref; // written to slot k; k is a by-value parameter, so no increment is needed
+        return info;
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(Instance*, createInstance, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h b/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h
new file mode 100644
index 0000000000..c0f9ee8f77
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_line_segments.h
@@ -0,0 +1,307 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! represents an array of line segments */
+  struct LineSegments : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_CURVE2;
+
+  public:
+
+    /*! line segments construction */
+    LineSegments (Device* device, Geometry::GType gtype);
+
+  public:
+    void setMask (unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify ();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void setTessellationRate(float N);
+    void setMaxRadiusScale(float s);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+  public:
+
+    /*! returns the number of vertices, taken from the buffer of the first time step (vertices[0]) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+
+    /*! returns the i'th entry of the segments buffer; callers in this struct use it as the index of the segment's first vertex */
+    __forceinline const unsigned int& segment(size_t i) const {
+      return segments[i];
+    }
+
+    /*! returns true if the i'th segment has RTC_CURVE_FLAG_NEIGHBOR_LEFT set in the flags buffer (which must be present) */
+    __forceinline bool segmentLeftExists(size_t i) const {
+      assert (flags);
+      return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_LEFT) != 0;
+    }
+
+    /*! returns true if the i'th segment has RTC_CURVE_FLAG_NEIGHBOR_RIGHT set in the flags buffer (which must be present) */
+    __forceinline bool segmentRightExists(size_t i) const {
+      assert (flags);
+      return (flags[i] & RTC_CURVE_FLAG_NEIGHBOR_RIGHT) != 0;
+    }
+
+     /*! returns i'th vertex of the first time step (its w component is what radius() returns) */
+    __forceinline Vec3ff vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns a raw pointer to the i'th vertex of the first time step (not the vertex itself) */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th normal of the first time step, read from normals0 */
+    __forceinline Vec3fa normal(size_t i) const {
+      return normals0[i];
+    }
+
+    /*! returns i'th radius of the first time step, i.e. the w component of the i'th vertex */
+    __forceinline float radius(size_t i) const {
+      return vertices0[i].w;
+    }
+
+    /*! returns i'th vertex of itime'th timestep, read from vertices[itime] */
+    __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns a raw pointer to the i'th vertex of the itime'th timestep (not the vertex itself) */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! returns i'th normal of itime'th timestep, read from normals[itime] */
+    __forceinline Vec3fa normal(size_t i, size_t itime) const {
+      return normals[itime][i];
+    }
+
+    /*! returns i'th radius of itime'th timestep, i.e. the w component of that vertex */
+    __forceinline float radius(size_t i, size_t itime) const {
+      return vertices[itime][i].w;
+    }
+
+    /*! calculates the bounding box of a segment from its two end points v0 and v1 (radius in w) */
+    __forceinline BBox3fa bounds(const Vec3ff& v0, const Vec3ff& v1) const
+    {
+      const BBox3ff b = merge(BBox3ff(v0),BBox3ff(v1)); // box spanned by the two end points
+      return enlarge((BBox3fa)b,maxRadiusScale*Vec3fa(max(v0.w,v1.w))); // grown by the larger radius, scaled by maxRadiusScale
+    }
+
+    /*! calculates the bounding box of the i'th line segment in the first time step */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      /* segment(i) is the index of the first end point; the second one follows directly */
+      const unsigned int first = segment(i);
+      const Vec3ff head = vertex(first+0), tail = vertex(first+1);
+      return bounds(head,tail);
+    }
+
+    /*! calculates the bounding box of the i'th line segment in the itime'th time step */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      /* segment(i) is the index of the first end point; the second one follows directly */
+      const unsigned int first = segment(i);
+      const Vec3ff head = vertex(first+0,itime), tail = vertex(first+1,itime);
+      return bounds(head,tail);
+    }
+
+    /*! calculates the bounding box of the i'th line segment (first time step) after transforming its end points by 'space' */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const
+    {
+      const unsigned int first = segment(i);
+      const Vec3ff head = vertex(first+0), tail = vertex(first+1);
+      /* only the xyz part is transformed; the radius in w is carried over unchanged */
+      const Vec3ff xHead(xfmVector(space,(Vec3fa)head),head.w);
+      const Vec3ff xTail(xfmVector(space,(Vec3fa)tail),tail.w);
+      return bounds(xHead,xTail);
+    }
+
+    /*! calculates the bounding box of the i'th line segment (itime'th time step) after transforming its end points by 'space' */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const
+    {
+      const unsigned int first = segment(i);
+      const Vec3ff head = vertex(first+0,itime), tail = vertex(first+1,itime);
+      /* only the xyz part is transformed; the radius in w is carried over unchanged */
+      const Vec3ff xHead(xfmVector(space,(Vec3fa)head),head.w);
+      const Vec3ff xTail(xfmVector(space,(Vec3fa)tail),tail.w);
+      return bounds(xHead,xTail);
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep (forwards the one-element range [itime,itime]) */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! returns true if both end points of the i'th segment exist, are valid and have non-negative radii in every time step of itime_range (end inclusive) */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const unsigned int index = segment(i);
+      const size_t numVerts = numVertices(); // compared without index+1, which would wrap to 0 for index == 0xFFFFFFFF and pass the check
+      if (numVerts == 0 || index >= numVerts-1) return false;
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+      {
+        const Vec3ff v0 = vertex(index+0,itime); if (unlikely(!isvalid4(v0))) return false;
+        const Vec3ff v1 = vertex(index+1,itime); if (unlikely(!isvalid4(v1))) return false;
+        if (min(v0.w,v1.w) < 0.0f) return false; // a negative radius makes the segment invalid
+      }
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive from its bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+    }
+
+    /*! stores bounds(i) in *bbox and returns true if the i'th primitive is valid at time step 0; otherwise returns false and leaves *bbox untouched */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const
+    {
+      const bool ok = valid(i,0);
+      if (ok) *bbox = bounds(i);
+      return ok;
+    }
+
+    /*! stores the build bounds of the i'th primitive for time segment itime in bbox; requires validity at both time steps itime and itime+1 */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      const bool ok = valid(i,itime+0) && valid(i,itime+1);
+      if (ok) bbox = bounds(i,itime);  // the builder only gets the bounds of the segment's first time step
+      return ok;
+    }
+
+    /*! calculates the linear bounds of the primID'th primitive for the time range dt; LBBox3fa is given a callback returning bounds(primID,itime) */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the primID'th primitive for the time range dt, using the space-transformed bounds(space,primID,itime) */
+    __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! stores the linear bounds of the i'th primitive for time_range in bbox, if it is valid over timeSegmentRange(time_range); the parameter shadows the time_range member */
+    __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const
+    {
+      const bool ok = valid(i, timeSegmentRange(time_range));
+      if (ok) bbox = linearBounds(i, time_range);
+      return ok;
+    }
+
+    /*! get fast access to first vertex buffer: the pointer returned by vertices0.getPtr(), cast to float* */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+  public:
+    BufferView<unsigned int> segments;      //!< array of line segment indices
+    BufferView<Vec3ff> vertices0;           //!< fast access to first vertex buffer
+    BufferView<Vec3fa> normals0;            //!< fast access to first normal buffer
+    BufferView<char> flags;                 //!< start, end flag per segment
+    vector<BufferView<Vec3ff>> vertices;    //!< vertex array for each timestep
+    vector<BufferView<Vec3fa>> normals;     //!< normal array for each timestep
+    vector<BufferView<char>> vertexAttribs; //!< user buffers
+    int tessellationRate;                   //!< tessellation rate for bezier curve
+    float maxRadiusScale = 1.0;             //!< maximal min-width scaling of curve radii
+  };
+
+  namespace isa
+  {
+    struct LineSegmentsISA : public LineSegments  // LineSegments plus direction, primitive-reference and bounds helpers
+    {
+      LineSegmentsISA (Device* device, Geometry::GType gtype)
+        : LineSegments(device,gtype) {}
+      /*! vector from the first to the second vertex of segment primID */
+      Vec3fa computeDirection(unsigned int primID) const
+      {
+        const unsigned first = segment(primID);
+        const Vec3fa head = vertex(first+0);
+        const Vec3fa tail = vertex(first+1);
+        return tail-head;
+      }
+      /*! vector from the first to the second vertex of segment primID at time step time */
+      Vec3fa computeDirection(unsigned int primID, size_t time) const
+      {
+        const unsigned first = segment(primID);
+        const Vec3fa head = vertex(first+0,time);
+        const Vec3fa tail = vertex(first+1,time);
+        return tail-head;
+      }
+      /*! writes a PrimRef for each primitive of r accepted by buildBounds to prims, starting at slot k; returns the accumulated PrimInfo */
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t id=r.begin(), last=r.end(); id<last; id++)
+        {
+          BBox3fa box = empty;
+          if (buildBounds(id,&box)) {
+            const PrimRef ref(box,geomID,unsigned(id));
+            info.add_center2(ref); prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      /*! same as createPrimRefArray, but uses the build bounds of time segment itime */
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t id=r.begin(), last=r.end(); id<last; id++)
+        {
+          BBox3fa box = empty;
+          if (buildBounds(id,itime,box)) {
+            const PrimRef ref(box,geomID,unsigned(id));
+            info.add_center2(ref); prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      /*! writes a PrimRefMB (linear bounds over t0t1) for each primitive of r that is valid over timeSegmentRange(t0t1), starting at slot k */
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfoMB info(empty);
+        for (size_t id=r.begin(), last=r.end(); id<last; id++)
+        {
+          if (valid(id, timeSegmentRange(t0t1))) {
+            const PrimRefMB ref(linearBounds(id,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(id));
+            info.add_primref(ref); prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      /*! forwards to bounds(i) */
+      BBox3fa vbounds(size_t i) const {
+        return this->bounds(i);
+      }
+      /*! forwards to bounds(space,i) */
+      BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const {
+        return this->bounds(space,i);
+      }
+      /*! forwards to linearBounds(primID,time_range) */
+      LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const {
+        return this->linearBounds(primID,time_range);
+      }
+      /*! forwards to linearBounds(space,primID,time_range) */
+      LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const {
+        return this->linearBounds(space,primID,time_range);
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(LineSegments*, createLineSegments, Device* COMMA Geometry::GType);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_points.h b/thirdparty/embree-aarch64/kernels/common/scene_points.h
new file mode 100644
index 0000000000..1d39ed07ba
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_points.h
@@ -0,0 +1,282 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "buffer.h"
+#include "default.h"
+#include "geometry.h"
+
+namespace embree
+{
+  /*! represents an array of points; each vertex is a Vec3ff whose w component is returned as the radius */
+  struct Points : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_POINTS;
+
+   public:
+    /*! points construction */
+    Points(Device* device, Geometry::GType gtype);
+
+   public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps(unsigned int numTimeSteps);
+    void setVertexAttributeCount(unsigned int N);
+    void setBuffer(RTCBufferType type,
+                   unsigned int slot,
+                   RTCFormat format,
+                   const Ref<Buffer>& buffer,
+                   size_t offset,
+                   size_t stride,
+                   unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void setMaxRadiusScale(float s);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+   public:
+    /*! returns the number of vertices (size of the vertex buffer of the first time step) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+
+    /*! returns i'th vertex of the first time step */
+    __forceinline Vec3ff vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns a pointer to the i'th vertex of the first time step */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th normal of the first time step */
+    __forceinline Vec3fa normal(size_t i) const {
+      return normals0[i];
+    }
+
+    /*! returns i'th radius of the first time step (w component of the vertex) */
+    __forceinline float radius(size_t i) const {
+      return vertices0[i].w;
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline Vec3ff vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns a pointer to the i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! returns i'th normal of itime'th timestep */
+    __forceinline Vec3fa normal(size_t i, size_t itime) const {
+      return normals[itime][i];
+    }
+
+    /*! returns i'th radius of itime'th timestep (w component of the vertex) */
+    __forceinline float radius(size_t i, size_t itime) const {
+      return vertices[itime][i].w;
+    }
+
+    /*! calculates bounding box of the point v0: the box around v0 enlarged by maxRadiusScale times v0.w */
+    __forceinline BBox3fa bounds(const Vec3ff& v0) const {
+      return enlarge(BBox3fa(v0), maxRadiusScale*Vec3fa(v0.w));
+    }
+
+    /*! calculates bounding box of i'th point of the first time step */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      const Vec3ff v0 = vertex(i);
+      return bounds(v0);
+    }
+
+    /*! calculates bounding box of i'th point for the itime'th time step */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const Vec3ff v0 = vertex(i, itime);
+      return bounds(v0);
+    }
+
+    /*! calculates bounding box of i'th point of the first time step, position transformed by space (w unchanged) */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i) const
+    {
+      const Vec3ff v0 = vertex(i);
+      const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w);
+      return bounds(w0);
+    }
+
+    /*! calculates bounding box of i'th point for the itime'th time step, position transformed by space (w unchanged) */
+    __forceinline BBox3fa bounds(const LinearSpace3fa& space, size_t i, size_t itime) const
+    {
+      const Vec3ff v0 = vertex(i, itime);
+      const Vec3ff w0(xfmVector(space, (Vec3fa)v0), v0.w);
+      return bounds(w0);
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep (forwards the one-element range [itime,itime]) */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid for every time step in [itime_range.begin(), itime_range.end()] (end inclusive) */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const unsigned int index = (unsigned int)i;
+      if (index >= numVertices())
+        return false;
+
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++) {
+        const Vec3ff v0 = vertex(index + 0, itime);
+        if (unlikely(!isvalid4(v0)))
+          return false;
+        if (v0.w < 0.0f)  // negative radius
+          return false;
+      }
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th primitive from its bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i, itime + 0), bounds(i, itime + 1));
+    }
+
+    /*! calculates the build bounds of the i'th primitive, if it's valid at time step 0 */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox) const
+    {
+      if (!valid(i, 0))
+        return false;
+      *bbox = bounds(i);
+      return true;
+    }
+
+    /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid at time steps itime and itime+1 */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      if (!valid(i, itime + 0) || !valid(i, itime + 1))
+        return false;
+      bbox = bounds(i, itime);  // use bounds of first time step in builder
+      return true;
+    }
+
+    /*! calculates the linear bounds of the primID'th primitive for the time range dt */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&](size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the primID'th primitive for the time range dt, using space-transformed bounds */
+    __forceinline LBBox3fa linearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&](size_t itime) { return bounds(space, primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the specified time range, if it's valid over timeSegmentRange(time_range) */
+    __forceinline bool linearBounds(size_t i, const BBox1f& time_range, LBBox3fa& bbox) const
+    {
+      if (!valid(i, timeSegmentRange(time_range))) return false;
+      bbox = linearBounds(i, time_range);
+      return true;
+    }
+
+    /*! get fast access to first vertex buffer, as a float pointer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+   public:
+    BufferView<Vec3ff> vertices0;            //!< fast access to first vertex buffer
+    BufferView<Vec3fa> normals0;             //!< fast access to first normal buffer
+    vector<BufferView<Vec3ff>> vertices;     //!< vertex array for each timestep
+    vector<BufferView<Vec3fa>> normals;      //!< normal array for each timestep
+    vector<BufferView<char>> vertexAttribs;  //!< user buffers
+    float maxRadiusScale = 1.0;              //!< factor applied to the radius (w) when enlarging the bounds
+  };
+
+  namespace isa
+  {
+    struct PointsISA : public Points  // Points plus direction, primitive-reference and bounds helpers
+    {
+      PointsISA(Device* device, Geometry::GType gtype) : Points(device, gtype) {}
+      /*! a point has no direction of its own: always the constant (1,0,0) */
+      Vec3fa computeDirection(unsigned int primID) const
+      {
+        return Vec3fa(1, 0, 0);
+      }
+      /*! time-step variant, also the constant (1,0,0) */
+      Vec3fa computeDirection(unsigned int primID, size_t time) const
+      {
+        return Vec3fa(1, 0, 0);
+      }
+      /*! writes a PrimRef for each primitive of r accepted by buildBounds to prims, starting at slot k; returns the accumulated PrimInfo */
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t id = r.begin(), last = r.end(); id < last; id++) {
+          BBox3fa box = empty;
+          if (buildBounds(id, &box)) {
+            const PrimRef ref(box, geomID, unsigned(id));
+            info.add_center2(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      /*! same as createPrimRefArray, but uses the build bounds of time segment itime */
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t id = r.begin(), last = r.end(); id < last; id++) {
+          BBox3fa box = empty;
+          if (buildBounds(id, itime, box)) {
+            const PrimRef ref(box, geomID, unsigned(id));
+            info.add_center2(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      /*! writes a PrimRefMB (linear bounds over t0t1) for each primitive of r that is valid over timeSegmentRange(t0t1), starting at slot k */
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims,
+                                      const BBox1f& t0t1,
+                                      const range<size_t>& r,
+                                      size_t k,
+                                      unsigned int geomID) const
+      {
+        PrimInfoMB info(empty);
+        for (size_t id = r.begin(), last = r.end(); id < last; id++) {
+          if (valid(id, timeSegmentRange(t0t1))) {
+            const PrimRefMB ref(linearBounds(id, t0t1), this->numTimeSegments(), this->time_range, this->numTimeSegments(), geomID, unsigned(id));
+            info.add_primref(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      /*! forwards to bounds(i) */
+      BBox3fa vbounds(size_t i) const
+      {
+        return this->bounds(i);
+      }
+      /*! forwards to bounds(space, i) */
+      BBox3fa vbounds(const LinearSpace3fa& space, size_t i) const
+      {
+        return this->bounds(space, i);
+      }
+      /*! forwards to linearBounds(primID, time_range) */
+      LBBox3fa vlinearBounds(size_t primID, const BBox1f& time_range) const
+      {
+        return this->linearBounds(primID, time_range);
+      }
+      /*! forwards to linearBounds(space, primID, time_range) */
+      LBBox3fa vlinearBounds(const LinearSpace3fa& space, size_t primID, const BBox1f& time_range) const
+      {
+        return this->linearBounds(space, primID, time_range);
+      }
+    };
+  }  // namespace isa
+
+  DECLARE_ISA_FUNCTION(Points*, createPoints, Device* COMMA Geometry::GType);
+}  // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h
new file mode 100644
index 0000000000..d5bb054b14
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_quad_mesh.h
@@ -0,0 +1,277 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! Quad Mesh: quads index into per-timestep vertex buffers */
+  struct QuadMesh : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_QUAD_MESH;
+    
+    /*! quad indices: the four vertex indices of one quad */
+    struct Quad
+    {
+      uint32_t v[4];
+
+      /*! outputs quad indices */
+      __forceinline friend embree_ostream operator<<(embree_ostream cout, const Quad& q) {
+        return cout << "Quad {" << q.v[0] << ", " << q.v[1] << ", " << q.v[2] << ", " << q.v[3] << " }";
+      }
+    };
+
+  public:
+
+    /*! quad mesh construction */
+    QuadMesh (Device* device); 
+  
+    /* geometry interface */
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+  public:
+
+    /*! returns number of vertices (size of the vertex buffer of the first timestep) */
+    __forceinline size_t numVertices() const {
+      return vertices[0].size();
+    }
+    
+    /*! returns i'th quad */
+    __forceinline const Quad& quad(size_t i) const {
+      return quads[i];
+    }
+
+    /*! returns i'th vertex of the first timestep */
+    __forceinline const Vec3fa vertex(size_t i) const {
+      return vertices0[i];
+    }
+
+    /*! returns a pointer to the i'th vertex of the first timestep */
+    __forceinline const char* vertexPtr(size_t i) const {
+      return vertices0.getPtr(i);
+    }
+
+    /*! returns i'th vertex of itime'th timestep */
+    __forceinline const Vec3fa vertex(size_t i, size_t itime) const {
+      return vertices[itime][i];
+    }
+
+    /*! returns a pointer to the i'th vertex of itime'th timestep */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const {
+      return vertices[itime].getPtr(i);
+    }
+
+    /*! calculates the bounds of the i'th quad at the first timestep */
+    __forceinline BBox3fa bounds(size_t i) const 
+    {
+      const Quad& q = quad(i);
+      const Vec3fa v0 = vertex(q.v[0]);
+      const Vec3fa v1 = vertex(q.v[1]);
+      const Vec3fa v2 = vertex(q.v[2]);
+      const Vec3fa v3 = vertex(q.v[3]);
+      return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3));
+    }
+
+    /*! calculates the bounds of the i'th quad at the itime'th timestep */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const Quad& q = quad(i);
+      const Vec3fa v0 = vertex(q.v[0],itime);
+      const Vec3fa v1 = vertex(q.v[1],itime);
+      const Vec3fa v2 = vertex(q.v[2],itime);
+      const Vec3fa v3 = vertex(q.v[3],itime);
+      return BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3));
+    }
+
+    /*! check if the i'th primitive is valid at the itime'th timestep (forwards the one-element range [itime,itime]) */
+    __forceinline bool valid(size_t i, size_t itime) const {
+      return valid(i, make_range(itime, itime));
+    }
+
+    /*! check if the i'th primitive is valid for every time step in [itime_range.begin(), itime_range.end()] (end inclusive) */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const Quad& q = quad(i);
+      if (unlikely(q.v[0] >= numVertices())) return false;
+      if (unlikely(q.v[1] >= numVertices())) return false;
+      if (unlikely(q.v[2] >= numVertices())) return false;
+      if (unlikely(q.v[3] >= numVertices())) return false;
+
+      for (size_t itime = itime_range.begin(); itime <= itime_range.end(); itime++)
+      {
+        if (!isvalid(vertex(q.v[0],itime))) return false;
+        if (!isvalid(vertex(q.v[1],itime))) return false;
+        if (!isvalid(vertex(q.v[2],itime))) return false;
+        if (!isvalid(vertex(q.v[3],itime))) return false;
+      }
+
+      return true;
+    }
+
+    /*! calculates the linear bounds of the i'th quad from its bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const {
+      return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1));
+    }
+
+    /*! calculates the build bounds of the i'th primitive, if it's valid at all numTimeSteps time steps; bbox may be null to only test validity */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+    {
+      const Quad& q = quad(i);
+      if (q.v[0] >= numVertices()) return false;
+      if (q.v[1] >= numVertices()) return false;
+      if (q.v[2] >= numVertices()) return false;
+      if (q.v[3] >= numVertices()) return false;
+
+      for (unsigned int t=0; t<numTimeSteps; t++)
+      {
+        const Vec3fa v0 = vertex(q.v[0],t);
+        const Vec3fa v1 = vertex(q.v[1],t);
+        const Vec3fa v2 = vertex(q.v[2],t);
+        const Vec3fa v3 = vertex(q.v[3],t);
+
+        if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2) || !isvalid(v3)))
+          return false;
+      }
+
+      if (bbox) 
+        *bbox = bounds(i);
+
+      return true;
+    }
+
+    /*! calculates the build bounds of the i'th primitive at the itime'th time segment, if it's valid at time steps itime and itime+1 */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      const Quad& q = quad(i);
+      if (unlikely(q.v[0] >= numVertices())) return false;
+      if (unlikely(q.v[1] >= numVertices())) return false;
+      if (unlikely(q.v[2] >= numVertices())) return false;
+      if (unlikely(q.v[3] >= numVertices())) return false;
+
+      assert(itime+1 < numTimeSteps);
+      const Vec3fa a0 = vertex(q.v[0],itime+0); if (unlikely(!isvalid(a0))) return false;
+      const Vec3fa a1 = vertex(q.v[1],itime+0); if (unlikely(!isvalid(a1))) return false;
+      const Vec3fa a2 = vertex(q.v[2],itime+0); if (unlikely(!isvalid(a2))) return false;
+      const Vec3fa a3 = vertex(q.v[3],itime+0); if (unlikely(!isvalid(a3))) return false;
+      const Vec3fa b0 = vertex(q.v[0],itime+1); if (unlikely(!isvalid(b0))) return false;
+      const Vec3fa b1 = vertex(q.v[1],itime+1); if (unlikely(!isvalid(b1))) return false;
+      const Vec3fa b2 = vertex(q.v[2],itime+1); if (unlikely(!isvalid(b2))) return false;
+      const Vec3fa b3 = vertex(q.v[3],itime+1); if (unlikely(!isvalid(b3))) return false;
+      
+      /* use bounds of first time step in builder */
+      bbox = BBox3fa(min(a0,a1,a2,a3),max(a0,a1,a2,a3));
+      return true;
+    }
+
+    /*! calculates the linear bounds of the primID'th primitive for the time range dt */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const {
+      return LBBox3fa([&] (size_t itime) { return bounds(primID, itime); }, dt, time_range, fnumTimeSegments);
+    }
+
+    /*! calculates the linear bounds of the i'th primitive for the time range dt, if it's valid over timeSegmentRange(dt) */
+    __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const
+    {
+      if (!valid(i, timeSegmentRange(dt))) return false;
+      bbox = linearBounds(i, dt);
+      return true;
+    }
+
+    /*! get fast access to first vertex buffer, as a float pointer */
+    __forceinline float * getCompactVertexArray () const {
+      return (float*) vertices0.getPtr();
+    }
+
+    /* gets version info of topology (modification counter of the quads buffer) */
+    unsigned int getTopologyVersion() const {
+      return quads.modCounter;
+    }
+    
+    /* returns true if topology changed, i.e. quads.isModified(otherVersion) */
+    bool topologyChanged(unsigned int otherVersion) const {
+      return quads.isModified(otherVersion); // || numPrimitivesChanged;
+    }
+
+    /* returns the projected area of the i'th quad at the first timestep, summed over triangles (v0,v1,v3) and (v1,v2,v3) */
+    __forceinline float projectedPrimitiveArea(const size_t i) const {
+      const Quad& q = quad(i);
+      const Vec3fa v0 = vertex(q.v[0]);
+      const Vec3fa v1 = vertex(q.v[1]);
+      const Vec3fa v2 = vertex(q.v[2]);
+      const Vec3fa v3 = vertex(q.v[3]);
+      return areaProjectedTriangle(v0,v1,v3) +
+	areaProjectedTriangle(v1,v2,v3);
+    }
+
+  public:
+    BufferView<Quad> quads;                 //!< array of quads
+    BufferView<Vec3fa> vertices0;           //!< fast access to first vertex buffer
+    vector<BufferView<Vec3fa>> vertices;    //!< vertex array for each timestep
+    vector<BufferView<char>> vertexAttribs; //!< vertex attribute buffers
+  };
+
+  namespace isa
+  {
+    struct QuadMeshISA : public QuadMesh  // QuadMesh plus primitive-reference array helpers
+    {
+      QuadMeshISA (Device* device)
+        : QuadMesh(device) {}
+      /*! writes a PrimRef for each quad of r accepted by buildBounds to prims, starting at slot k; returns the accumulated PrimInfo */
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t id=r.begin(), last=r.end(); id<last; id++)
+        {
+          BBox3fa box = empty;
+          if (buildBounds(id,&box)) {
+            const PrimRef ref(box,geomID,unsigned(id));
+            info.add_center2(ref); prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      /*! same as createPrimRefArray, but uses the build bounds of time segment itime */
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t id=r.begin(), last=r.end(); id<last; id++)
+        {
+          BBox3fa box = empty;
+          if (buildBounds(id,itime,box)) {
+            const PrimRef ref(box,geomID,unsigned(id));
+            info.add_center2(ref); prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      /*! writes a PrimRefMB (linear bounds over t0t1) for each quad of r that is valid over timeSegmentRange(t0t1), starting at slot k */
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfoMB info(empty);
+        for (size_t id=r.begin(), last=r.end(); id<last; id++)
+        {
+          if (valid(id, timeSegmentRange(t0t1))) {
+            const PrimRefMB ref(linearBounds(id,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(id));
+            info.add_primref(ref); prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(QuadMesh*, createQuadMesh, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h
new file mode 100644
index 0000000000..d0246009db
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_subdiv_mesh.h
@@ -0,0 +1,326 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+#include "../subdiv/half_edge.h"
+#include "../subdiv/tessellation_cache.h"
+#include "../subdiv/catmullclark_coefficients.h"
+#include "../subdiv/patch.h"
+#include "../../common/algorithms/parallel_map.h"
+#include "../../common/algorithms/parallel_set.h"
+
+namespace embree
+{
+  class SubdivMesh : public Geometry
+  {
+    ALIGNED_CLASS_(16);
+  public:
+
+    typedef HalfEdge::Edge Edge;
+    
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_SUBDIV_MESH;
+
+    /*! structure used to sort half edges using radix sort by their key */
+    struct KeyHalfEdge 
+    {
+      KeyHalfEdge() {}
+      
+      KeyHalfEdge (uint64_t key, HalfEdge* edge) 
+      : key(key), edge(edge) {}
+      
+      __forceinline operator uint64_t() const { 
+	return key; 
+      }
+
+      friend __forceinline bool operator<(const KeyHalfEdge& e0, const KeyHalfEdge& e1) {
+        return e0.key < e1.key;
+      }
+      
+    public:
+      uint64_t key;
+      HalfEdge* edge;
+    };
+
+  public:
+
+    /*! subdiv mesh construction */
+    SubdivMesh(Device* device);
+
+  public:
+    void setMask (unsigned mask);
+    void setSubdivisionMode (unsigned int topologyID, RTCSubdivisionMode mode);
+    void setVertexAttributeTopology(unsigned int vertexAttribID, unsigned int topologyID);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setTopologyCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void setTessellationRate(float N);
+    bool verify();
+    void commit();
+    void addElementsToCount (GeometryCounts & counts) const;
+    void setDisplacementFunction (RTCDisplacementFunctionN func);
+    unsigned int getFirstHalfEdge(unsigned int faceID);
+    unsigned int getFace(unsigned int edgeID);
+    unsigned int getNextHalfEdge(unsigned int edgeID);
+    unsigned int getPreviousHalfEdge(unsigned int edgeID);
+    unsigned int getOppositeHalfEdge(unsigned int topologyID, unsigned int edgeID);
+
+  public:
+
+    /*! return the number of faces */
+    size_t numFaces() const { 
+      return faceVertices.size(); 
+    }
+
+    /*! return the number of edges */
+    size_t numEdges() const { 
+      return topology[0].vertexIndices.size(); 
+    }
+
+    /*! return the number of vertices */
+    size_t numVertices() const { 
+      return vertices[0].size(); 
+    }
+
+    /*! calculates the bounds of the i'th subdivision patch at the j'th timestep */
+    __forceinline BBox3fa bounds(size_t i, size_t j = 0) const {
+      return topology[0].getHalfEdge(i)->bounds(vertices[j]);
+    }
+
+    /*! check if the i'th primitive is valid */
+    __forceinline bool valid(size_t i) const {
+      return topology[0].valid(i) && !invalidFace(i);
+    }
+
+    /*! check if the i'th primitive is valid for the j'th time range */
+    __forceinline bool valid(size_t i, size_t j) const {
+      return topology[0].valid(i) && !invalidFace(i,j);
+    }
+
+    /*! prints some statistics */
+    void printStatistics();
+
+    /*! initializes the half edge data structure */
+    void initializeHalfEdgeStructures ();
+ 
+  public:
+
+    /*! returns the vertex buffer for some time step */
+    __forceinline const BufferView<Vec3fa>& getVertexBuffer( const size_t t = 0 ) const {
+      return vertices[t];
+    }
+
+    /* returns tessellation level of edge */
+    __forceinline float getEdgeLevel(const size_t i) const
+    {
+      if (levels) return clamp(levels[i],1.0f,4096.0f); // FIXME: do we want to limit edge level?
+      else return clamp(tessellationRate,1.0f,4096.0f); // FIXME: do we want to limit edge level?
+    }
+
+  public:
+    RTCDisplacementFunctionN displFunc;    //!< displacement function
+
+    /*! all buffers in this section are provided by the application */
+  public:
+    
+    /*! the topology contains all data that may differ when
+     *  interpolating different user data buffers */
+    struct Topology
+    {
+    public:
+
+      /*! Default topology construction */
+      Topology () : halfEdges(nullptr,0) {}
+
+      /*! Topology initialization */
+      Topology (SubdivMesh* mesh);
+
+      /*! make the class movable */
+    public: 
+      Topology (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows
+        : mesh(std::move(other.mesh)), 
+          vertexIndices(std::move(other.vertexIndices)),
+          subdiv_mode(std::move(other.subdiv_mode)),
+          halfEdges(std::move(other.halfEdges)),
+          halfEdges0(std::move(other.halfEdges0)),
+          halfEdges1(std::move(other.halfEdges1)) {}
+      
+      Topology& operator= (Topology&& other) // FIXME: this is only required to workaround compilation issues under Windows
+      {
+        mesh = std::move(other.mesh); 
+        vertexIndices = std::move(other.vertexIndices);
+        subdiv_mode = std::move(other.subdiv_mode);
+        halfEdges = std::move(other.halfEdges);
+        halfEdges0 = std::move(other.halfEdges0);
+        halfEdges1 = std::move(other.halfEdges1);
+        return *this;
+      }
+
+    public:
+      /*! primitive i is rejected only in NO_BOUNDARY mode, and only if its face has a border */
+      __forceinline bool valid(size_t i) const
+      {
+        /* every other subdivision mode accepts all primitives */
+        if (likely(subdiv_mode != RTC_SUBDIVISION_MODE_NO_BOUNDARY))
+          return true;
+        return !getHalfEdge(i)->faceHasBorder();
+      }
+      
+      /*! updates the interpolation mode for the topology */
+      void setSubdivisionMode (RTCSubdivisionMode mode);
+
+      /*! marks all buffers as modified */
+      void update ();
+
+      /*! verifies index array */
+      bool verify (size_t numVertices);
+
+      /*! initializes the half edge data structure */
+      void initializeHalfEdgeStructures ();
+
+    private:
+      
+      /*! recalculates the half edges */
+      void calculateHalfEdges();
+      
+      /*! updates half edges when recalculation is not necessary */
+      void updateHalfEdges();
+      
+      /*! user input data */
+    public:
+
+      SubdivMesh* mesh;
+
+      /*! indices of the vertices composing each face */
+      BufferView<unsigned int> vertexIndices;
+      
+      /*! subdiv interpolation mode */
+      RTCSubdivisionMode subdiv_mode;
+
+      /*! generated data */
+    public:
+
+      /*! start half edge of face f; its index is
+       *  looked up in mesh->faceStartEdge */
+      __forceinline const HalfEdge* getHalfEdge ( const size_t f ) const
+      { const size_t start = mesh->faceStartEdge[f]; return &halfEdges[start]; }
+
+      /*! Half edge structure, generated by initializeHalfEdgeStructures */
+      mvector<HalfEdge> halfEdges;
+
+      /*! the following data is only required during construction of the
+       *  half edge structure and can be cleared for static scenes */
+    private:
+      
+      /*! two arrays used to sort the half edges */
+      std::vector<KeyHalfEdge> halfEdges0;
+      std::vector<KeyHalfEdge> halfEdges1;
+    };
+
+    /*! start half edge of face f, taken from
+     *  the t'th entry of the topology array */
+    __forceinline const HalfEdge* getHalfEdge ( const size_t t , const size_t f ) const
+    { const Topology& topo = topology[t]; return topo.getHalfEdge(f); }
+
+    /*! buffer containing the number of vertices for each face */
+    BufferView<unsigned int> faceVertices;
+
+    /*! array of topologies */
+    vector<Topology> topology;
+
+    /*! vertex buffer (one buffer for each time step) */
+    vector<BufferView<Vec3fa>> vertices;
+
+    /*! user data buffers */
+    vector<RawBufferView> vertexAttribs;
+
+    /*! edge crease buffer containing edges (pairs of vertices) that carry edge crease weights */
+    BufferView<Edge> edge_creases;
+    
+    /*! edge crease weights for each edge of the edge_creases buffer */
+    BufferView<float> edge_crease_weights;
+    
+    /*! vertex crease buffer containing all vertices that carry vertex crease weights */
+    BufferView<unsigned int> vertex_creases;
+    
+    /*! vertex crease weights for each vertex of the vertex_creases buffer */
+    BufferView<float> vertex_crease_weights;
+
+    /*! subdivision level for each half edge of the vertexIndices buffer */
+    BufferView<float> levels;
+    float tessellationRate;  // constant rate that is used when levels is not set
+
+    /*! buffer that marks specific faces as holes */
+    BufferView<unsigned> holes;
+
+    /*! all data in this section is generated by initializeHalfEdgeStructures function */
+  private:
+
+    /*! number of half edges used by faces */
+    size_t numHalfEdges; 
+
+    /*! fast lookup table to find the first half edge for some face */
+    mvector<uint32_t> faceStartEdge;
+
+    /*! fast lookup table to find the face for some half edge */
+    mvector<uint32_t> halfEdgeFace;
+
+    /*! set with all holes */
+    parallel_set<uint32_t> holeSet;
+
+    /*! fast lookup table to detect invalid faces */
+    mvector<int8_t> invalid_face;
+
+    /*! invalid-face flag of face i at time step j; the table stores numTimeSteps consecutive entries per face */
+    __forceinline       int8_t& invalidFace(size_t i, size_t j = 0)       { const size_t slot = i*numTimeSteps+j; return invalid_face[slot]; }
+    __forceinline const int8_t& invalidFace(size_t i, size_t j = 0) const { const size_t slot = i*numTimeSteps+j; return invalid_face[slot]; }
+
+    /*! interpolation cache */
+  public:
+    static __forceinline size_t numInterpolationSlots4(size_t stride) { return (stride+15) >> 4; } // stride divided by 16, rounded up
+    static __forceinline size_t numInterpolationSlots8(size_t stride) { return (stride+31) >> 5; } // stride divided by 32, rounded up
+    static __forceinline size_t interpolationSlot(size_t prim, size_t slot, size_t stride) {
+      const size_t slotsPerPrim = numInterpolationSlots4(stride); // each primitive owns this many consecutive slots
+      assert(slot < slotsPerPrim);
+      return prim*slotsPerPrim + slot;
+    }
+    std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_buffer_tags;
+    std::vector<std::vector<SharedLazyTessellationCache::CacheEntry>> vertex_attrib_buffer_tags;
+    std::vector<Patch3fa::Ref> patch_eval_trees;
+    
+    /*! the following data is only required during construction of the
+     *  half edge structure and can be cleared for static scenes */
+  private:
+
+    /*! map with all vertex creases */
+    parallel_map<uint32_t,float> vertexCreaseMap;
+    
+    /*! map with all edge creases */
+    parallel_map<uint64_t,float> edgeCreaseMap;
+
+  protected:
+    
+    /*! counts number of geometry commits */
+    size_t commitCounter;
+  };
+
+  namespace isa
+  {
+    /*! SubdivMesh variant in namespace isa; declares interpolate/interpolateN, which are defined out of line */
+    struct SubdivMeshISA : public SubdivMesh
+    {
+      SubdivMeshISA (Device* device) : SubdivMesh(device) {}
+
+      void interpolate(const RTCInterpolateArguments* const args);
+      void interpolateN(const RTCInterpolateNArguments* const args);
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(SubdivMesh*, createSubdivMesh, Device*);
+};
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp
new file mode 100644
index 0000000000..d1c2750f14
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.cpp
@@ -0,0 +1,243 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "scene_triangle_mesh.h"
+#include "scene.h"
+
+namespace embree
+{
+#if defined(EMBREE_LOWEST_ISA)
+
+  /*! constructs the geometry and sizes the vertex buffer array to one entry per time step */
+  TriangleMesh::TriangleMesh (Device* device) : Geometry(device,GTY_TRIANGLE_MESH,0,1)
+  {
+    vertices.resize(numTimeSteps);
+  }
+
+  /*! stores the new mask, then calls Geometry::update() */
+  void TriangleMesh::setMask (unsigned mask) {
+    this->mask = mask;
+    Geometry::update();
+  }
+
+  /*! resizes the per-time-step vertex buffer array, then forwards to Geometry::setNumTimeSteps */
+  void TriangleMesh::setNumTimeSteps (unsigned int numTimeSteps) {
+    vertices.resize(numTimeSteps);
+    Geometry::setNumTimeSteps(numTimeSteps);
+  }
+
+  /*! resizes the vertex attribute buffer array to N entries, then calls Geometry::update() */
+  void TriangleMesh::setVertexAttributeCount (unsigned int N) {
+    vertexAttribs.resize(N);
+    Geometry::update();
+  }
+  
+  void TriangleMesh::setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num)
+  {
+    /* verify that all accesses are 4 bytes aligned: address of the first element and the stride */
+    const size_t firstElement = size_t(buffer->getPtr()) + offset;
+    if ((firstElement & 0x3) || (stride & 0x3))
+      throw_RTCError(RTC_ERROR_INVALID_OPERATION, "data must be 4 bytes aligned");
+
+    switch (type)
+    {
+    case RTC_BUFFER_TYPE_VERTEX:
+      if (format != RTC_FORMAT_FLOAT3)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex buffer format");
+
+      /* if buffer is larger than 16GB the premultiplied index optimization does not work */
+      if (stride*num > 16ll*1024ll*1024ll*1024ll)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "vertex buffer can be at most 16GB large");
+
+      if (slot >= vertices.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid vertex buffer slot");
+
+      vertices[slot].set(buffer, offset, stride, num, format);
+      vertices[slot].checkPadding16();
+      vertices0 = vertices[0]; // refresh the fast-access view of the first time step
+      break;
+    case RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE:
+      if (format < RTC_FORMAT_FLOAT || format > RTC_FORMAT_FLOAT16)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer format");
+      if (slot >= vertexAttribs.size())
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid vertex attribute buffer slot");
+
+      vertexAttribs[slot].set(buffer, offset, stride, num, format);
+      vertexAttribs[slot].checkPadding16();
+      break;
+    case RTC_BUFFER_TYPE_INDEX:
+      if (slot != 0)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      if (format != RTC_FORMAT_UINT3)
+        throw_RTCError(RTC_ERROR_INVALID_OPERATION, "invalid index buffer format");
+
+      triangles.set(buffer, offset, stride, num, format);
+      setNumPrimitives(num); // the index buffer defines the primitive count
+      break;
+    default:
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+    }
+  }
+
+  void* TriangleMesh::getBuffer(RTCBufferType type, unsigned int slot)
+  {
+    /* every case validates the slot for its buffer type before
+     * the data pointer of the addressed buffer is returned */
+    switch (type)
+    {
+    case RTC_BUFFER_TYPE_INDEX:
+      if (slot != 0)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      return triangles.getPtr();
+
+    case RTC_BUFFER_TYPE_VERTEX:
+      if (slot >= vertices.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      return vertices[slot].getPtr();
+
+    case RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE:
+      if (slot >= vertexAttribs.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      return vertexAttribs[slot].getPtr();
+
+    default:
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+      return nullptr;
+    }
+  }
+
+  void TriangleMesh::updateBuffer(RTCBufferType type, unsigned int slot)
+  {
+    /* validate the slot, then mark the addressed buffer as modified */
+    switch (type)
+    {
+    case RTC_BUFFER_TYPE_INDEX:
+      if (slot != 0)
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      triangles.setModified();
+      break;
+    case RTC_BUFFER_TYPE_VERTEX:
+      if (slot >= vertices.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      vertices[slot].setModified();
+      break;
+    case RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE:
+      if (slot >= vertexAttribs.size())
+        throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "invalid buffer slot");
+      vertexAttribs[slot].setModified();
+      break;
+    default:
+      throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "unknown buffer type");
+    }
+
+    /* executed after the switch whenever no error propagated out of it */
+    Geometry::update();
+  }
+
+  void TriangleMesh::commit()
+  {
+    /* every time step has to use the stride of the first vertex buffer */
+    for (unsigned int step=0; step!=numTimeSteps; ++step) {
+      const bool strideMatches = vertices[step].getStride() == vertices[0].getStride();
+      if (!strideMatches) throw_RTCError(RTC_ERROR_INVALID_OPERATION,"stride of vertex buffers have to be identical for each time step");
+    }
+    Geometry::commit();
+  }
+
+  void TriangleMesh::addElementsToCount (GeometryCounts & counts) const
+  {
+    if (numTimeSteps != 1) counts.numMBTriangles += numPrimitives; // anything but exactly one time step goes to the MB counter
+    else                   counts.numTriangles   += numPrimitives;
+  }
+
+  bool TriangleMesh::verify()
+  {
+    /*! at least one vertex buffer is required; the first one defines the vertex count */
+    if (vertices.size() == 0) return false;
+    const size_t vertexCount = numVertices();
+
+    /*! every vertex buffer has to hold exactly that many elements ... */
+    for (const auto& positions : vertices)
+      if (positions.size() != vertexCount) return false;
+
+    /*! ... and so does every user vertex attribute buffer */
+    for (const auto& attrib : vertexAttribs)
+      if (attrib.size() != vertexCount) return false;
+
+    /*! all three indices of every triangle have to reference an existing vertex */
+    for (size_t primID=0; primID<size(); primID++)
+      for (size_t corner=0; corner<3; corner++)
+        if (triangles[primID].v[corner] >= vertexCount) return false;
+
+    /*! every vertex of every time step has to pass isvalid */
+    for (const auto& positions : vertices) {
+      const size_t count = positions.size();
+      for (size_t i=0; i<count; i++)
+        if (!isvalid(positions[i])) return false;
+    }
+
+    return true;
+  }
+  
+  /*! interpolates valueCount floats of a vertex or vertex attribute buffer at (u,v) of triangle primID: P = w*p0+u*p1+v*p2 with w = 1-u-v, dPdu = p1-p0, dPdv = p2-p0, second derivatives are zero; dPdv must be set whenever dPdu is, likewise ddPdvdv/ddPdudv with ddPdudu */
+  void TriangleMesh::interpolate(const RTCInterpolateArguments* const args)
+  {
+    unsigned int primID = args->primID;
+    float u = args->u, v = args->v;
+    RTCBufferType bufferType = args->bufferType;
+    unsigned int bufferSlot = args->bufferSlot;
+    float* P = args->P;
+    float* dPdu = args->dPdu;
+    float* dPdv = args->dPdv;
+    float* ddPdudu = args->ddPdudu;
+    float* ddPdvdv = args->ddPdvdv;
+    float* ddPdudv = args->ddPdudv;
+    unsigned int valueCount = args->valueCount;
+
+    /* calculate base pointer and stride; the slot has to index an existing buffer, i.e. be strictly smaller than the buffer count */
+    assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) ||
+           (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot < vertexAttribs.size()));
+    const char* src = nullptr;
+    size_t stride = 0;
+    if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) {
+      src    = vertexAttribs[bufferSlot].getPtr();
+      stride = vertexAttribs[bufferSlot].getStride();
+    } else {
+      src    = vertices[bufferSlot].getPtr();
+      stride = vertices[bufferSlot].getStride();
+    }
+    const float w = 1.0f-u-v; // weight of the first vertex; does not change inside the loop
+    for (unsigned int i=0; i<valueCount; i+=4)
+    {
+      size_t ofs = i*sizeof(float);
+      const Triangle& tri = triangle(primID);
+      /* values are processed 4 at a time; lanes at or beyond valueCount are masked out of all loads and stores */
+      const vbool4 valid = vint4((int)i)+vint4(step) < vint4(int(valueCount));
+      const vfloat4 p0 = vfloat4::loadu(valid,(float*)&src[tri.v[0]*stride+ofs]);
+      const vfloat4 p1 = vfloat4::loadu(valid,(float*)&src[tri.v[1]*stride+ofs]);
+      const vfloat4 p2 = vfloat4::loadu(valid,(float*)&src[tri.v[2]*stride+ofs]);
+
+      if (P) {
+        vfloat4::storeu(valid,P+i,madd(w,p0,madd(u,p1,v*p2)));
+      }
+      if (dPdu) {
+        assert(dPdu); vfloat4::storeu(valid,dPdu+i,p1-p0);
+        assert(dPdv); vfloat4::storeu(valid,dPdv+i,p2-p0);
+      }
+      if (ddPdudu) {
+        assert(ddPdudu); vfloat4::storeu(valid,ddPdudu+i,vfloat4(zero));
+        assert(ddPdvdv); vfloat4::storeu(valid,ddPdvdv+i,vfloat4(zero));
+        assert(ddPdudv); vfloat4::storeu(valid,ddPdudv+i,vfloat4(zero));
+      }
+    }
+  }
+  
+#endif
+  
+  namespace isa
+  {
+    /*! factory: allocates a TriangleMeshISA with new and returns it as TriangleMesh* */
+    TriangleMesh* createTriangleMesh(Device* device)
+    { return new TriangleMeshISA(device); }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h
new file mode 100644
index 0000000000..eaf2e1799a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_triangle_mesh.h
@@ -0,0 +1,264 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "geometry.h"
+#include "buffer.h"
+
+namespace embree
+{
+  /*! Triangle Mesh */
+  struct TriangleMesh : public Geometry
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_TRIANGLE_MESH;
+
+    /*! triangle indices */
+    struct Triangle
+    {
+      uint32_t v[3]; //!< the three vertex indices of the triangle
+      /*! streams the triangle as: Triangle { v0, v1, v2 } */
+      __forceinline friend embree_ostream operator<<(embree_ostream cout, const Triangle& t)
+      {
+        return cout << "Triangle { " << t.v[0] << ", " << t.v[1] << ", " << t.v[2] << " }";
+      }
+    };
+
+  public:
+
+    /*! triangle mesh construction */
+    TriangleMesh (Device* device); 
+
+    /* geometry interface */
+  public:
+    void setMask(unsigned mask);
+    void setNumTimeSteps (unsigned int numTimeSteps);
+    void setVertexAttributeCount (unsigned int N);
+    void setBuffer(RTCBufferType type, unsigned int slot, RTCFormat format, const Ref<Buffer>& buffer, size_t offset, size_t stride, unsigned int num);
+    void* getBuffer(RTCBufferType type, unsigned int slot);
+    void updateBuffer(RTCBufferType type, unsigned int slot);
+    void commit();
+    bool verify();
+    void interpolate(const RTCInterpolateArguments* const args);
+    void addElementsToCount (GeometryCounts & counts) const;
+
+  public:
+
+    /*! number of vertices, i.e. the element count of
+     *  the vertex buffer of the first time step */
+    __forceinline size_t numVertices() const
+    { return vertices[0].size(); }
+    
+    /*! read-only access to the i'th
+     *  entry of the triangle buffer */
+    __forceinline const Triangle& triangle(size_t i) const
+    { return triangles[i]; }
+
+    /*! i'th vertex of the first time step,
+     *  read through the vertices0 view */
+    __forceinline const Vec3fa vertex(size_t i) const
+    { return vertices0[i]; }
+
+    /*! pointer for the i'th vertex of the first time
+     *  step, obtained from vertices0.getPtr(i) */
+    __forceinline const char* vertexPtr(size_t i) const
+    { return vertices0.getPtr(i); }
+
+    /*! i'th vertex taken from the vertex
+     *  buffer of the itime'th time step */
+    __forceinline const Vec3fa vertex(size_t i, size_t itime) const
+    { return vertices[itime][i]; }
+
+    /*! pointer obtained via getPtr(i) from the
+     *  vertex buffer of the itime'th time step */
+    __forceinline const char* vertexPtr(size_t i, size_t itime) const
+    { return vertices[itime].getPtr(i); }
+
+    /*! bounding box of the i'th triangle, built from its vertices of the first time step */
+    __forceinline BBox3fa bounds(size_t i) const
+    {
+      const Triangle& corners = triangle(i);
+      const Vec3fa p0 = vertex(corners.v[0]);
+      const Vec3fa p1 = vertex(corners.v[1]);
+      const Vec3fa p2 = vertex(corners.v[2]);
+      return BBox3fa(min(p0,p1,p2),max(p0,p1,p2));
+    }
+
+    /*! bounding box of the i'th triangle, built from its vertices of the itime'th time step */
+    __forceinline BBox3fa bounds(size_t i, size_t itime) const
+    {
+      const Triangle& corners = triangle(i);
+      const Vec3fa p0 = vertex(corners.v[0],itime);
+      const Vec3fa p1 = vertex(corners.v[1],itime);
+      const Vec3fa p2 = vertex(corners.v[2],itime);
+      return BBox3fa(min(p0,p1,p2),max(p0,p1,p2));
+    }
+
+    /*! validity of the i'th primitive at a single time step:
+     *  forwards to the range overload with the range (itime,itime) */
+    __forceinline bool valid(size_t i, size_t itime) const
+    { return valid(i, make_range(itime, itime)); }
+
+    /*! the i'th primitive is valid if its indices are in range and all its vertices pass isvalid for every time step of the range */
+    __forceinline bool valid(size_t i, const range<size_t>& itime_range) const
+    {
+      const Triangle& tri = triangle(i);
+      for (size_t corner=0; corner<3; corner++)
+        if (unlikely(tri.v[corner] >= numVertices())) return false;
+
+      /* note: the end of the time step range is treated as inclusive */
+      const size_t lastStep = itime_range.end();
+      for (size_t itime = itime_range.begin(); itime <= lastStep; itime++)
+      {
+        for (size_t corner=0; corner<3; corner++)
+          if (!isvalid(vertex(tri.v[corner],itime))) return false;
+      }
+
+      return true;
+    }
+
+    /*! linear bounds of the i'th primitive for the itime'th time segment,
+     *  built from the bounds at time step itime and at time step itime+1 */
+    __forceinline LBBox3fa linearBounds(size_t i, size_t itime) const
+    { return LBBox3fa(bounds(i,itime+0),bounds(i,itime+1)); }
+
+    /*! checks the i'th primitive (indices in range, vertices of all time steps valid); on success the bounds of the first time step are stored to *bbox if bbox is set */
+    __forceinline bool buildBounds(size_t i, BBox3fa* bbox = nullptr) const
+    {
+      const Triangle& tri = triangle(i);
+      const size_t vertexCount = numVertices();
+      if (unlikely(tri.v[0] >= vertexCount)) return false;
+      if (unlikely(tri.v[1] >= vertexCount)) return false;
+      if (unlikely(tri.v[2] >= vertexCount)) return false;
+
+      /* a single invalid vertex in any time step rejects the primitive */
+      for (size_t t=0; t<numTimeSteps; t++)
+      {
+        const bool stepValid = isvalid(vertex(tri.v[0],t)) &&
+                               isvalid(vertex(tri.v[1],t)) &&
+                               isvalid(vertex(tri.v[2],t));
+        if (unlikely(!stepValid)) return false;
+      }
+
+      /* bbox is optional: with nullptr only the validity test is performed */
+      if (likely(bbox)) *bbox = bounds(i);
+      return true;
+    }
+
+    /*! checks the i'th primitive for the itime'th time segment (time steps itime and itime+1); on success the bounds of time step itime are stored to bbox */
+    __forceinline bool buildBounds(size_t i, size_t itime, BBox3fa& bbox) const
+    {
+      const Triangle& tri = triangle(i);
+      const size_t vertexCount = numVertices();
+      if (unlikely(tri.v[0] >= vertexCount || tri.v[1] >= vertexCount || tri.v[2] >= vertexCount)) return false;
+      assert(itime+1 < numTimeSteps);
+      const Vec3fa a0 = vertex(tri.v[0],itime+0);
+      const Vec3fa a1 = vertex(tri.v[1],itime+0);
+      const Vec3fa a2 = vertex(tri.v[2],itime+0);
+      if (unlikely(!isvalid(a0) || !isvalid(a1) || !isvalid(a2))) return false;
+      const Vec3fa b0 = vertex(tri.v[0],itime+1);
+      const Vec3fa b1 = vertex(tri.v[1],itime+1);
+      const Vec3fa b2 = vertex(tri.v[2],itime+1);
+      if (unlikely(!isvalid(b0) || !isvalid(b1) || !isvalid(b2))) return false;
+
+      /* use bounds of first time step in builder */
+      bbox = BBox3fa(min(a0,a1,a2),max(a0,a1,a2));
+      return true;
+    }
+
+    /*! linear bounds of primitive primID for the time range dt; the per-time-step bounds are supplied through a callback */
+    __forceinline LBBox3fa linearBounds(size_t primID, const BBox1f& dt) const
+    { const auto boundsAt = [&] (size_t itime) { return bounds(primID, itime); };
+      return LBBox3fa(boundsAt, dt, time_range, fnumTimeSegments); }
+
+    /*! stores the linear bounds of the i'th primitive for time range dt to bbox; returns false and leaves bbox untouched if the primitive is not valid in that range */
+    __forceinline bool linearBounds(size_t i, const BBox1f& dt, LBBox3fa& bbox) const  {
+      const bool usable = valid(i, timeSegmentRange(dt));
+      if (usable) bbox = linearBounds(i, dt);
+      return usable;
+    }
+
+    /*! data pointer of the first vertex buffer
+     *  (vertices0), reinterpreted as float* */
+    __forceinline float * getCompactVertexArray () const
+    { return (float*) vertices0.getPtr(); }
+
+    /* version of the topology, i.e. the
+     * modCounter of the triangle buffer */
+    unsigned int getTopologyVersion() const
+    { return triangles.modCounter; }
+    
+    /* true if the triangle buffer reports a
+     * modification relative to otherVersion */
+    bool topologyChanged(unsigned int otherVersion) const
+    { return triangles.isModified(otherVersion); } // || numPrimitivesChanged;
+
+    /* projected area of the i'th triangle, computed from its vertices of the first time step */
+    __forceinline float projectedPrimitiveArea(const size_t i) const
+    {
+      const Triangle& corners = triangle(i);
+      return areaProjectedTriangle(vertex(corners.v[0]),
+                                   vertex(corners.v[1]),
+                                   vertex(corners.v[2]));
+    }
+
+  public:
+    BufferView<Triangle> triangles;      //!< array of triangles
+    BufferView<Vec3fa> vertices0;        //!< fast access to first vertex buffer
+    vector<BufferView<Vec3fa>> vertices; //!< vertex array for each timestep
+    vector<RawBufferView> vertexAttribs; //!< vertex attributes
+  };
+
+  namespace isa
+  {
+    /*! triangle mesh variant that adds the generation of primitive reference arrays */
+    struct TriangleMeshISA : public TriangleMesh
+    {
+      TriangleMeshISA (Device* device) : TriangleMesh(device) {}
+      /*! writes a PrimRef for every primitive of range r that passes buildBounds to prims, starting at index k */
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++) {
+          BBox3fa box = empty;
+          if (buildBounds(primID,&box)) {
+            const PrimRef ref(box,geomID,unsigned(primID));
+            info.add_center2(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      /*! same as createPrimRefArray, but validity and bounds are taken from buildBounds for time segment itime */
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++) {
+          BBox3fa box = empty;
+          if (buildBounds(primID,itime,box)) {
+            const PrimRef ref(box,geomID,unsigned(primID));
+            info.add_center2(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      /*! writes a PrimRefMB with linear bounds over t0t1 for every primitive of range r that is valid in that time range */
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfoMB info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++) {
+          if (valid(primID, timeSegmentRange(t0t1))) {
+            const PrimRefMB ref(linearBounds(primID,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(primID));
+            info.add_primref(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+    };
+  }
+
+  DECLARE_ISA_FUNCTION(TriangleMesh*, createTriangleMesh, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h b/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h
new file mode 100644
index 0000000000..8d11ed6986
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/scene_user_geometry.h
@@ -0,0 +1,77 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "accelset.h"
+
+namespace embree
+{
+  /*! User geometry with user defined intersection functions */
+  struct UserGeometry : public AccelSet
+  {
+    /*! type of this geometry */
+    static const Geometry::GTypeMask geom_type = Geometry::MTY_USER_GEOMETRY;
+
+  public:
+    UserGeometry (Device* device, unsigned int items = 0, unsigned int numTimeSteps = 1); //!< items defaults to 0, numTimeSteps to 1
+    virtual void setMask (unsigned mask);
+    virtual void setBoundsFunction (RTCBoundsFunction bounds, void* userPtr); //!< NOTE(review): presumably stores the user bounds callback and its user pointer - defined out of line, confirm
+    virtual void setIntersectFunctionN (RTCIntersectFunctionN intersect); //!< NOTE(review): presumably installs the user intersection callback - confirm
+    virtual void setOccludedFunctionN (RTCOccludedFunctionN occluded); //!< NOTE(review): presumably installs the user occlusion callback - confirm
+    virtual void build() {} //!< empty body: nothing is done here
+    virtual void addElementsToCount (GeometryCounts & counts) const;
+  };
+
+  namespace isa
+  {
+    /*! user geometry variant that adds the generation of primitive reference arrays */
+    struct UserGeometryISA : public UserGeometry
+    {
+      UserGeometryISA (Device* device) : UserGeometry(device) {}
+      /*! writes a PrimRef for every primitive of range r that passes buildBounds to prims, starting at index k */
+      PrimInfo createPrimRefArray(mvector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++) {
+          BBox3fa box = empty;
+          if (buildBounds(primID,&box)) {
+            const PrimRef ref(box,geomID,unsigned(primID));
+            info.add_center2(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      /*! same as createPrimRefArray, but validity and bounds are taken from buildBounds for time segment itime */
+      PrimInfo createPrimRefArrayMB(mvector<PrimRef>& prims, size_t itime, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfo info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++) {
+          BBox3fa box = empty;
+          if (buildBounds(primID,itime,box)) {
+            const PrimRef ref(box,geomID,unsigned(primID));
+            info.add_center2(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+      /*! writes a PrimRefMB with linear bounds over t0t1 for every primitive of range r that is valid in that time range */
+      PrimInfoMB createPrimRefMBArray(mvector<PrimRefMB>& prims, const BBox1f& t0t1, const range<size_t>& r, size_t k, unsigned int geomID) const
+      {
+        PrimInfoMB info(empty);
+        for (size_t primID=r.begin(); primID<r.end(); primID++) {
+          if (valid(primID, timeSegmentRange(t0t1))) {
+            const PrimRefMB ref(linearBounds(primID,t0t1),this->numTimeSegments(),this->time_range,this->numTimeSegments(),geomID,unsigned(primID));
+            info.add_primref(ref);
+            prims[k++] = ref;
+          }
+        }
+        return info;
+      }
+    };
+  }
+  
+  DECLARE_ISA_FUNCTION(UserGeometry*, createUserGeometry, Device*);
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/stack_item.h b/thirdparty/embree-aarch64/kernels/common/stack_item.h
new file mode 100644
index 0000000000..533c385365
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/stack_item.h
@@ -0,0 +1,125 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /*! An item on the stack holds the node ID and distance of that node. */
+  template<typename T>
+  struct __aligned(16) StackItemT
+  {
+    /*! assert that the xchg function works */
+    static_assert(sizeof(T) <= 12, "sizeof(T) <= 12 failed");
+
+    __forceinline StackItemT() {} // empty body: ptr and dist get no explicit initial value
+
+    __forceinline StackItemT(T &ptr, unsigned &dist) : ptr(ptr), dist(dist) {} // copies both referenced values into the item
+
+    /*! swaps two stack items; each item is read and written as one vfloat4 */
+    __forceinline static void xchg(StackItemT& a, StackItemT& b)
+    {
+      const vfloat4 old_b = vfloat4::load((float*)&b);
+      const vfloat4 old_a = vfloat4::load((float*)&a);
+      vfloat4::store(&b,old_a);
+      vfloat4::store(&a,old_b);
+    }
+
+    /*! Sort 2 stack items so that s1.dist <= s2.dist afterwards. */
+    __forceinline friend void sort(StackItemT& s1, StackItemT& s2) {
+      if (s1.dist > s2.dist) xchg(s1,s2);
+    }
+    
+    /*! Sort 3 stack items into ascending dist order (s1 smallest) with three compare-exchanges. */
+    __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3)
+    {
+      if (s1.dist > s2.dist) xchg(s1,s2); // order the first pair
+      if (s2.dist > s3.dist) xchg(s2,s3); // the largest item ends up in s3
+      if (s1.dist > s2.dist) xchg(s1,s2); // order the remaining two
+    }
+    
+    /*! Sort 4 stack items into ascending dist order (s1 smallest) with five compare-exchanges. */
+    __forceinline friend void sort(StackItemT& s1, StackItemT& s2, StackItemT& s3, StackItemT& s4)
+    {
+      if (s1.dist > s2.dist) xchg(s1,s2); // order the pair (s1,s2)
+      if (s3.dist > s4.dist) xchg(s3,s4); // order the pair (s3,s4)
+      if (s1.dist > s3.dist) xchg(s1,s3); // the overall minimum moves to s1
+      if (s2.dist > s4.dist) xchg(s2,s4); // the overall maximum moves to s4
+      if (s2.dist > s3.dist) xchg(s2,s3); // order the two middle items
+    }
+
+    /*! conditionally swaps two stack items held in vint4 registers; one comparison result is applied to the whole register (NOTE(review): presumably element 2 is where dist sits for an 8 byte T - confirm layout) */
+    __forceinline static void cmp_xchg(vint4& a, vint4& b) 
+    { 
+#if defined(__AVX512VL__)
+      const vboolf4 mask(shuffle<2,2,2,2>(b) < shuffle<2,2,2,2>(a)); // NOTE(review): shuffle<2,2,2,2> presumably broadcasts element 2 before the compare - confirm
+#else
+      const vboolf4 mask0(b < a); // compare of the full registers ...
+      const vboolf4 mask(shuffle<2,2,2,2>(mask0)); // ... of which (presumably) only the result of element 2 is kept and broadcast
+#endif
+      const vint4 c = select(mask,b,a); // NOTE(review): assuming select(m,t,f) yields t where m is set, c is b if b compared smaller, else a - confirm
+      const vint4 d = select(mask,a,b); // the respective other operand
+      a = c;
+      b = d;
+    }
+    
+    /*! Sort 3 stack items held in vint4 registers. NOTE(review): if cmp_xchg leaves the smaller key in its first argument, the result is ordered s1 >= s2 >= s3 - confirm */
+    __forceinline static void sort3(vint4& s1, vint4& s2, vint4& s3)
+    {
+      cmp_xchg(s2,s1); // argument order matters: cmp_xchg is not symmetric
+      cmp_xchg(s3,s2);
+      cmp_xchg(s2,s1);
+    }
+    
+    /*! Sort 4 stack items held in vint4 registers. NOTE(review): if cmp_xchg leaves the smaller key in its first argument, the result is ordered s1 >= s2 >= s3 >= s4 - confirm */
+    __forceinline static void sort4(vint4& s1, vint4& s2, vint4& s3, vint4& s4)
+    {
+      cmp_xchg(s2,s1); // pair (s1,s2)
+      cmp_xchg(s4,s3); // pair (s3,s4)
+      cmp_xchg(s3,s1); // compares the first items of both pairs
+      cmp_xchg(s4,s2); // compares the second items of both pairs
+      cmp_xchg(s3,s2); // the two middle items
+    }
+
+
+    /*! Sort the N stack items in [begin,end) by descending dist (insertion sort); an empty range is a no-op. */
+    __forceinline friend void sort(StackItemT* begin, StackItemT* end)
+    {
+      if (begin == end) return; // guard: begin+1 would otherwise start beyond an empty range
+      for (StackItemT* i = begin+1; i != end; ++i)
+      {
+        const vfloat4 item = vfloat4::load((float*)i);
+        const unsigned dist = i->dist;
+        StackItemT* j = i;
+        /* move every preceding item with a smaller dist one slot up to open the insertion position */
+        while ((j != begin) && ((j-1)->dist < dist))
+        {
+          vfloat4::store(j, vfloat4::load((float*)(j-1)));
+          --j;
+        }
+        vfloat4::store(j, item);
+      }
+    }
+    
+  public:
+    T ptr;
+    unsigned dist;
+  };
+
+  /*! An item on the stack holds the node ID and active ray mask. */
+  template<typename T>
+  struct __aligned(8) StackItemMaskT
+  {
+    T ptr;       //!< node ID
+    size_t mask; //!< active ray mask
+  };
+
+  struct __aligned(8) StackItemMaskCoherent // NOTE(review): undocumented here; presumably the stack item of a coherent traversal variant - confirm
+  {
+    size_t mask;   // NOTE(review): presumably the active ray mask, as in StackItemMaskT - confirm
+    size_t parent; // NOTE(review): presumably a reference to the parent node - confirm
+    size_t child;  // NOTE(review): presumably a reference to the child node - confirm
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/stat.cpp b/thirdparty/embree-aarch64/kernels/common/stat.cpp
new file mode 100644
index 0000000000..b73c3a8c76
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/stat.cpp
@@ -0,0 +1,128 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "stat.h"
+
+namespace embree
+{
+  Stat Stat::instance; // the single global statistics object whose counters Stat::get() returns
+  
+  /* nothing to set up here: the Counters member clears itself in its own constructor */
+  Stat::Stat () = default;
+
+  Stat::~Stat () // dumps the gathered counters to std::cout when statistics are compiled in
+  {
+#if defined(EMBREE_STAT_COUNTERS)
+    print(std::cout);
+#endif
+  }
+
+  void Stat::print(std::ostream& cout) // writes a human-readable report of the global instance's counters to the given stream
+  {
+    Counters& cntrs = instance.cntrs;
+    Counters::Data& data = instance.cntrs.code; // the absolute numbers below come from the "code" counter set
+    //Counters::Data& data = instance.cntrs.active;
+    /* NOTE(review): the ratios in the normal-ray and user sections divide without checking for a zero denominator */
+    /* print absolute numbers, scaled by 1E-6 and labelled "M" */
+    cout << "--------- ABSOLUTE ---------" << std::endl;
+    cout << "  #normal_travs   = " << float(data.normal.travs            )*1E-6 << "M" << std::endl;
+    cout << "    #nodes        = " << float(data.normal.trav_nodes       )*1E-6 << "M" << std::endl;
+    cout << "    #nodes_xfm    = " << float(data.normal.trav_xfm_nodes   )*1E-6 << "M" << std::endl;
+    cout << "    #leaves       = " << float(data.normal.trav_leaves      )*1E-6 << "M" << std::endl;
+    cout << "    #prims        = " << float(data.normal.trav_prims       )*1E-6 << "M" << std::endl;
+    cout << "    #prim_hits    = " << float(data.normal.trav_prim_hits   )*1E-6 << "M" << std::endl;
+
+    cout << "    #stack nodes  = " << float(data.normal.trav_stack_nodes )*1E-6 << "M" << std::endl;
+    cout << "    #stack pop    = " << float(data.normal.trav_stack_pop )*1E-6 << "M" << std::endl;
+    /* sum the hit-box histogram: plain total and total weighted by the bucket index */
+    size_t normal_box_hits = 0;
+    size_t weighted_box_hits = 0;
+    for (size_t i=0;i<SIZE_HISTOGRAM;i++) { 
+      normal_box_hits += data.normal.trav_hit_boxes[i];
+      weighted_box_hits += data.normal.trav_hit_boxes[i]*i;
+    }
+    cout << "    #hit_boxes    = " << normal_box_hits << " (total) distribution: ";
+    float average = 0.0f; // accumulates the mean bucket index
+    for (size_t i=0;i<SIZE_HISTOGRAM;i++) 
+    {
+      float value = 100.0f * data.normal.trav_hit_boxes[i] / normal_box_hits; // share of bucket i in percent
+      cout << "[" << i << "] " << value << " ";
+      average += (float)i*data.normal.trav_hit_boxes[i] / normal_box_hits;
+    }
+    cout << "    average = " << average << std::endl;
+    for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.normal.trav_hit_boxes[i]*i / weighted_box_hits << " ";
+    cout << std::endl;
+    /* shadow-ray numbers are printed only when at least one shadow traversal was counted */
+    if (data.shadow.travs) {
+      cout << "  #shadow_travs = " << float(data.shadow.travs         )*1E-6 << "M" << std::endl;
+      cout << "    #nodes      = " << float(data.shadow.trav_nodes    )*1E-6 << "M" << std::endl;
+      cout << "    #nodes_xfm  = " << float(data.shadow.trav_xfm_nodes)*1E-6 << "M" << std::endl;
+      cout << "    #leaves     = " << float(data.shadow.trav_leaves   )*1E-6 << "M" << std::endl;
+      cout << "    #prims      = " << float(data.shadow.trav_prims    )*1E-6 << "M" << std::endl;
+      cout << "    #prim_hits  = " << float(data.shadow.trav_prim_hits)*1E-6 << "M" << std::endl;
+
+      cout << "    #stack nodes = " << float(data.shadow.trav_stack_nodes )*1E-6 << "M" << std::endl;
+      cout << "    #stack pop   = " << float(data.shadow.trav_stack_pop )*1E-6 << "M" << std::endl;
+
+      size_t shadow_box_hits = 0;
+      size_t weighted_shadow_box_hits = 0;
+
+      for (size_t i=0;i<SIZE_HISTOGRAM;i++) {        
+        shadow_box_hits += data.shadow.trav_hit_boxes[i];
+        weighted_shadow_box_hits += data.shadow.trav_hit_boxes[i]*i;
+      }
+      cout << "    #hit_boxes    = ";
+      for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i] / shadow_box_hits << " ";
+      cout << std::endl;
+      for (size_t i=0;i<SIZE_HISTOGRAM;i++) cout << "[" << i << "] " << 100.0f * data.shadow.trav_hit_boxes[i]*i / weighted_shadow_box_hits << " ";
+      cout << std::endl;
+    }
+    cout << std::endl;
+
+    /* print per traversal numbers: "code" counters divided by the "code" traversal count, plus the active/all ratio as a percentage */
+    cout << "--------- PER TRAVERSAL ---------" << std::endl;
+    float active_normal_travs       = float(cntrs.active.normal.travs      )/float(cntrs.all.normal.travs      );
+    float active_normal_trav_nodes  = float(cntrs.active.normal.trav_nodes )/float(cntrs.all.normal.trav_nodes );
+    float active_normal_trav_xfm_nodes  = float(cntrs.active.normal.trav_xfm_nodes )/float(cntrs.all.normal.trav_xfm_nodes );
+    float active_normal_trav_leaves = float(cntrs.active.normal.trav_leaves)/float(cntrs.all.normal.trav_leaves);
+    float active_normal_trav_prims   = float(cntrs.active.normal.trav_prims  )/float(cntrs.all.normal.trav_prims  );
+    float active_normal_trav_prim_hits = float(cntrs.active.normal.trav_prim_hits  )/float(cntrs.all.normal.trav_prim_hits  );
+    float active_normal_trav_stack_pop = float(cntrs.active.normal.trav_stack_pop  )/float(cntrs.all.normal.trav_stack_pop  );
+
+    cout << "  #normal_travs   = " << float(cntrs.code.normal.travs      )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_travs       << "% active" << std::endl;
+    cout << "    #nodes        = " << float(cntrs.code.normal.trav_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_nodes  << "% active" << std::endl;
+    cout << "    #node_xfm     = " << float(cntrs.code.normal.trav_xfm_nodes )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_xfm_nodes  << "% active" << std::endl;
+    cout << "    #leaves       = " << float(cntrs.code.normal.trav_leaves)/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_leaves << "% active" << std::endl;
+    cout << "    #prims        = " << float(cntrs.code.normal.trav_prims  )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prims   << "% active" << std::endl;
+    cout << "    #prim_hits    = " << float(cntrs.code.normal.trav_prim_hits  )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_prim_hits   << "% active" << std::endl;
+    cout << "    #stack_pop    = " << float(cntrs.code.normal.trav_stack_pop  )/float(cntrs.code.normal.travs) << ", " << 100.0f*active_normal_trav_stack_pop   << "% active" << std::endl;
+    /* the shadow section mirrors the normal one but has no stack_pop line */
+    if (cntrs.all.shadow.travs) {
+      float active_shadow_travs       = float(cntrs.active.shadow.travs      )/float(cntrs.all.shadow.travs      );
+      float active_shadow_trav_nodes  = float(cntrs.active.shadow.trav_nodes )/float(cntrs.all.shadow.trav_nodes );
+      float active_shadow_trav_xfm_nodes  = float(cntrs.active.shadow.trav_xfm_nodes )/float(cntrs.all.shadow.trav_xfm_nodes );
+      float active_shadow_trav_leaves = float(cntrs.active.shadow.trav_leaves)/float(cntrs.all.shadow.trav_leaves);
+      float active_shadow_trav_prims   = float(cntrs.active.shadow.trav_prims  )/float(cntrs.all.shadow.trav_prims  );
+      float active_shadow_trav_prim_hits = float(cntrs.active.shadow.trav_prim_hits  )/float(cntrs.all.shadow.trav_prim_hits  );
+
+      cout << "  #shadow_travs = " << float(cntrs.code.shadow.travs      )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_travs       << "% active" << std::endl;
+      cout << "    #nodes      = " << float(cntrs.code.shadow.trav_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_nodes  << "% active" << std::endl;
+      cout << "    #nodes_xfm  = " << float(cntrs.code.shadow.trav_xfm_nodes )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_xfm_nodes  << "% active" << std::endl;
+      cout << "    #leaves     = " << float(cntrs.code.shadow.trav_leaves)/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_leaves << "% active" << std::endl;
+      cout << "    #prims      = " << float(cntrs.code.shadow.trav_prims  )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prims   << "% active" << std::endl;
+      cout << "    #prim_hits  = " << float(cntrs.code.shadow.trav_prim_hits  )/float(cntrs.code.shadow.travs) << ", " << 100.0f*active_shadow_trav_prim_hits   << "% active" << std::endl;
+
+    }
+    cout << std::endl;
+
+     /* print user counters for performance tuning, normalised by the total number of normal and shadow traversals */
+    cout << "--------- USER ---------" << std::endl;
+    for (size_t i=0; i<10; i++)
+      cout << "#user" << i << " = " << float(cntrs.user[i])/float(cntrs.all.normal.travs+cntrs.all.shadow.travs) << " per traversal" << std::endl;
+    /* user counters 5, 6 and 7 are additionally shown as a percentage of user counter 3 */
+    cout << "#user5/user3 " << 100.0f*float(cntrs.user[5])/float(cntrs.user[3]) << "%" << std::endl;
+    cout << "#user6/user3 " << 100.0f*float(cntrs.user[6])/float(cntrs.user[3]) << "%" << std::endl;
+    cout << "#user7/user3 " << 100.0f*float(cntrs.user[7])/float(cntrs.user[3]) << "%" << std::endl;
+    cout << std::endl;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/stat.h b/thirdparty/embree-aarch64/kernels/common/stat.h
new file mode 100644
index 0000000000..3cda2bd014
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/stat.h
@@ -0,0 +1,116 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+/* Macros to gather statistics: they expand to real code only when EMBREE_STAT_COUNTERS is defined and to nothing otherwise. STAT3(s,x,y,z) adds x, y and z to counter s of the code, active and all sets; STAT_USER(i,x) adds x to user counter i. NOTE: STAT3 expands to three separate statements and both STAT3 and STAT_USER already end in ';', so neither is safe as the sole body of an unbraced if/else. */
+#ifdef EMBREE_STAT_COUNTERS
+#  define STAT(x) x
+#  define STAT3(s,x,y,z) \
+  STAT(Stat::get().code  .s+=x);               \
+  STAT(Stat::get().active.s+=y);               \
+  STAT(Stat::get().all   .s+=z);
+#  define STAT_USER(i,x) Stat::get().user[i]+=x;
+#else /* statistics disabled: every macro expands to nothing */
+#  define STAT(x)
+#  define STAT3(s,x,y,z)
+#  define STAT_USER(i,x) 
+#endif /* EMBREE_STAT_COUNTERS */
+
+namespace embree
+{
+  /*! Gathers ray tracing statistics. We count 1) how often a code
+   *  location is reached, 2) how many SIMD lanes are active, 3) how
+   *  many SIMD lanes reach the code location */
+  class Stat
+  { 
+  public:
+    /*! number of trav_hit_boxes histogram buckets that Stat::print iterates over */
+    static const size_t SIZE_HISTOGRAM = 64+1;
+
+    /*! constructs stat counter class */
+    Stat ();
+
+    /*! destructs stat counter class; the definition in stat.cpp prints the report when EMBREE_STAT_COUNTERS is defined */
+    ~Stat ();
+    /*! the complete set of counters: three Data sets (all, active, code) and ten user counters */
+    class Counters 
+    {
+    public:
+      Counters () { 
+        clear(); 
+      }
+      /*! resets every counter to zero */
+      void clear() 
+      { 
+        all.clear();
+        active.clear();
+        code.clear();
+        for (auto& u : user) u.store(0);
+      }
+
+    public:
+
+	/* per packet and per ray statistics */
+	struct Data
+        {
+          void clear () {
+            normal.clear();
+            shadow.clear();
+            point_query.clear();
+          }
+
+	  /* normal, shadow and point query statistics (one instance of this struct each) */
+	  struct 
+          {
+            void clear() 
+            {
+              travs.store(0);
+              trav_nodes.store(0);
+              trav_leaves.store(0);
+              trav_prims.store(0);
+              trav_prim_hits.store(0);
+              for (auto& v : trav_hit_boxes) v.store(0);
+              trav_stack_pop.store(0);
+              trav_stack_nodes.store(0); 
+              trav_xfm_nodes.store(0); 
+            }
+
+          public:
+	    std::atomic<size_t> travs;
+	    std::atomic<size_t> trav_nodes;
+	    std::atomic<size_t> trav_leaves;
+	    std::atomic<size_t> trav_prims;
+	    std::atomic<size_t> trav_prim_hits;
+	    std::atomic<size_t> trav_hit_boxes[SIZE_HISTOGRAM+1]; // NOTE(review): one element more than the SIZE_HISTOGRAM entries Stat::print reads -- confirm intent
+	    std::atomic<size_t> trav_stack_pop;
+	    std::atomic<size_t> trav_stack_nodes; 
+            std::atomic<size_t> trav_xfm_nodes; 
+            
+	  } normal, shadow, point_query;
+	} all, active, code; 
+
+        std::atomic<size_t> user[10]; // free-form counters incremented through STAT_USER
+    };
+
+  public:
+    /*! returns the counters of the global instance */
+    static __forceinline Counters& get() {
+      return instance.cntrs;
+    }
+    /*! resets the counters of the global instance */
+    static void clear() {
+      instance.cntrs.clear();
+    }
+    /*! prints the gathered statistics; the definition in stat.cpp takes a std::ostream& */
+    static void print(embree_ostream cout);
+
+  private: 
+    Counters cntrs;
+
+  private:
+    static Stat instance;
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/state.cpp b/thirdparty/embree-aarch64/kernels/common/state.cpp
new file mode 100644
index 0000000000..51fc9b7826
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/state.cpp
@@ -0,0 +1,543 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "state.h"
+#include "../../common/lexers/streamfilters.h"
+
+namespace embree
+{
+  MutexSys g_printMutex; // definition of the mutex that state.h declares for thread-safe printing to cout
+
+  State::ErrorHandler State::g_errorHandler; // definition of the static error handler member of State
+
+  State::ErrorHandler::ErrorHandler() // creates the TLS slot through which error() finds the calling thread's RTCError
+    : thread_error(createTls()) {}
+
+  State::ErrorHandler::~ErrorHandler()
+  {
+    Lock<MutexSys> lock(errors_mutex);
+    for (size_t i=0; i<thread_errors.size(); i++)
+      delete thread_errors[i];
+    destroyTls(thread_error);
+    thread_errors.clear();
+  }
+
+  RTCError* State::ErrorHandler::error() 
+  {
+    RTCError* stored_error = (RTCError*) getTls(thread_error);
+    if (stored_error) return stored_error;
+
+    Lock<MutexSys> lock(errors_mutex);
+    stored_error = new RTCError(RTC_ERROR_NONE);
+    thread_errors.push_back(stored_error);
+    setTls(thread_error,stored_error);
+    return stored_error;
+  }
+
+  State::State () // installs the built-in defaults; parse() may override them afterwards
+    : enabled_cpu_features(getCPUFeatures()),
+      enabled_builder_cpu_features(enabled_cpu_features),
+      frequency_level(FREQUENCY_SIMD256)
+  {
+    const char* const preset = "default"; // initial value of every accel/builder/traverser selector
+    tri_accel = preset;
+    tri_builder = preset;
+    tri_traverser = preset;
+    tri_accel_mb = preset;
+    tri_builder_mb = preset;
+    tri_traverser_mb = preset;
+
+    quad_accel = preset;
+    quad_builder = preset;
+    quad_traverser = preset;
+
+    quad_accel_mb = preset;
+    quad_builder_mb = preset;
+    quad_traverser_mb = preset;
+
+    line_accel = preset;
+    line_builder = preset;
+    line_traverser = preset;
+
+    line_accel_mb = preset;
+    line_builder_mb = preset;
+    line_traverser_mb = preset;
+
+    hair_accel = preset;
+    hair_builder = preset;
+    hair_traverser = preset;
+
+    hair_accel_mb = preset;
+    hair_builder_mb = preset;
+    hair_traverser_mb = preset;
+
+    object_accel = preset;
+    object_builder = preset;
+    object_accel_min_leaf_size = 1;
+    object_accel_max_leaf_size = 1;
+
+    object_accel_mb = preset;
+    object_builder_mb = preset;
+    object_accel_mb_min_leaf_size = 1;
+    object_accel_mb_max_leaf_size = 1;
+
+    max_spatial_split_replications = 1.2f;
+    useSpatialPreSplits = false;
+
+    tessellation_cache_size = 128*1024*1024; // same 1024*1024 scale that parse() applies to the cache_size value
+
+    subdiv_accel = preset;
+    subdiv_accel_mb = preset;
+
+    grid_accel = preset;
+    grid_builder = preset;
+    grid_accel_mb = preset;
+    grid_builder_mb = preset;
+
+    instancing_open_min = 0;
+    instancing_block_size = 0;
+    instancing_open_factor = 8.0f;
+    instancing_open_max_depth = 32;
+    instancing_open_max = 50000000;
+
+    ignore_config_files = false;
+    float_exceptions = false;
+    quality_flags = -1; // -1 is the initial value; parse() replaces it when a known quality name is given
+    scene_flags = -1;   // -1 is the initial value; parse() replaces it when the scene_flags key is seen
+    verbose = 0;
+    benchmark = 0;
+
+    numThreads = 0;
+    numUserThreads = 0;
+
+#if TASKING_INTERNAL
+    set_affinity = true;
+#else
+    /* per default enable affinity only on KNL */
+    set_affinity = hasISA(AVX512KNL);
+#endif
+
+    /* thread start-up and huge page defaults */
+    start_threads = false;
+    enable_selockmemoryprivilege = false;
+#if defined(__LINUX__)
+    hugepages = true;
+#else
+    hugepages = false;
+#endif
+    hugepages_success = true;
+
+    alloc_main_block_size = 0;
+    alloc_num_main_slots = 0;
+    alloc_thread_block_size = 0;
+    alloc_single_thread_alloc = -1;
+
+    error_function = nullptr;
+    error_function_userptr = nullptr;
+
+    memory_monitor_function = nullptr;
+    memory_monitor_userptr = nullptr;
+  }
+
+  /* no explicit cleanup is performed here */
+  State::~State() = default;
+
+  bool State::hasISA(const int isa) { // true only when every feature bit of isa is enabled
+    return (isa & ~enabled_cpu_features) == 0;
+  }
+
+  bool State::checkISASupport() { // reports whether the enabled feature set can run on this CPU
+    const auto usable = getCPUFeatures() & enabled_cpu_features;
+#if defined(__ARM_NEON)
+    /*
+     * NEON CPU type is a mixture of NEON and SSE2: accept the configuration
+     * when either of those two bits is both enabled and reported by the CPU.
+     * NOTE(review): the original comment says the NEON bit is set by an
+     * `isa=neon` config, but string_to_cpufeatures() in this file has no
+     * "neon" entry and falls back to SSE2 -- confirm where NEON gets enabled.
+     */
+    return (usable & (CPU_FEATURE_SSE2 | CPU_FEATURE_NEON)) != 0;
+#else
+    /* every enabled feature bit must also be reported by the CPU */
+    return usable == enabled_cpu_features;
+#endif
+  }
+  
+  void State::verify() // sanity checks implemented purely with assert
+  {
+    /* verify that calculations stay in range */
+    assert(rcp(min_rcp_input)*FLT_LARGE+FLT_LARGE < 0.01f*FLT_MAX);
+
+    /* in DEBUG builds, verify for each compiled-in target that the
+     * CPP files compiled for that specific ISA only call the same or
+     * a lower ISA version of non-inlined class member functions; the
+     * getISA() of each target namespace must not exceed the target
+     * (the SSE2 check is left out on ARM NEON builds) */
+#if defined(DEBUG)
+#  if defined(EMBREE_TARGET_SSE2) && !defined(__ARM_NEON)
+    assert(sse2::getISA() <= SSE2);
+#  endif
+#  if defined(EMBREE_TARGET_SSE42)
+    assert(sse42::getISA() <= SSE42);
+#  endif
+#  if defined(EMBREE_TARGET_AVX)
+    assert(avx::getISA() <= AVX);
+#  endif
+#  if defined(EMBREE_TARGET_AVX2)
+    assert(avx2::getISA() <= AVX2);
+#  endif
+#  if defined(EMBREE_TARGET_AVX512KNL)
+    assert(avx512knl::getISA() <= AVX512KNL);
+#  endif
+#  if defined(EMBREE_TARGET_AVX512SKX)
+    assert(avx512skx::getISA() <= AVX512SKX);
+#  endif
+#endif
+  }
+
+  const char* symbols[3] = { "=", ",", "|" }; // symbol strings that parseFile/parseString hand to the TokenStream
+
+  bool State::parseFile(const FileName& fileName) // parses a configuration file; returns false when the file cannot be opened
+  {
+    FILE* f = fopen(fileName.c_str(),"r");
+    if (!f) return false;
+    Ref<Stream<int> > file = new FileStream(f,fileName); // NOTE(review): f is handed over to FileStream; presumably it closes the handle -- confirm
+
+    /* copy the symbol table; std::begin/std::end take the element count from the array type itself instead of sizeof(void*) */
+    std::vector<std::string> syms(std::begin(symbols), std::end(symbols));
+
+    /* LineCommentFilter is given "#"; the identifier character set is alpha, ALPHA, numbers plus '_' and '.' */
+    Ref<TokenStream> cin = new TokenStream(new LineCommentFilter(file,"#"),
+                                           TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.",
+                                           TokenStream::separators,syms);
+    parse(cin);
+    return true;
+  }
+
+  void State::parseString(const char* cfg) // parses a configuration string; a null pointer is ignored
+  {
+    if (cfg == nullptr) return;
+
+    /* copy the symbol table; std::begin/std::end take the element count from the array type itself instead of sizeof(void*) */
+    std::vector<std::string> syms(std::begin(symbols), std::end(symbols));
+
+    /* the identifier character set is alpha, ALPHA, numbers plus '_' and '.' */
+    Ref<TokenStream> cin = new TokenStream(new StrStream(cfg),
+                                           TokenStream::alpha+TokenStream::ALPHA+TokenStream::numbers+"_.",
+                                           TokenStream::separators,syms);
+    parse(cin);
+  }
+  
+  int string_to_cpufeatures(const std::string& isa) // maps an ISA name to its feature mask; any other name (including "neon") maps to SSE2
+  {
+    static const struct { const char* name; int features; } known[] = {
+      { "sse",       SSE       }, { "sse2",   SSE2  },
+      { "sse3",      SSE3      }, { "ssse3",  SSSE3 },
+      { "sse41",     SSE41     }, { "sse4.1", SSE41 },
+      { "sse42",     SSE42     }, { "sse4.2", SSE42 },
+      { "avx",       AVX       },
+      { "avxi",      AVXI      },
+      { "avx2",      AVX2      },
+      { "avx512knl", AVX512KNL },
+      { "avx512skx", AVX512SKX },
+    };
+    for (const auto& entry : known)
+      if (isa == entry.name) return entry.features;
+    return SSE2;
+  }
+
+  void State::parse(Ref<TokenStream> cin) // applies every recognised `key = value` setting found in the token stream to this State
+  {
+    /* parse until end of stream; each iteration takes one token, and a token that matches no `key =` pattern below changes nothing */
+    while (cin->peek() != Token::Eof())
+    {
+      const Token tok = cin->get();
+
+      if (tok == Token::Id("threads") && cin->trySymbol("=")) 
+        numThreads = cin->get().Int();
+
+      else if (tok == Token::Id("user_threads")&& cin->trySymbol("=")) 
+        numUserThreads = cin->get().Int();
+
+      else if (tok == Token::Id("set_affinity")&& cin->trySymbol("=")) 
+        set_affinity = cin->get().Int();
+
+      else if (tok == Token::Id("affinity")&& cin->trySymbol("=")) 
+        set_affinity = cin->get().Int();
+      
+      else if (tok == Token::Id("start_threads")&& cin->trySymbol("=")) 
+        start_threads = cin->get().Int();
+      /* ISA selection: "isa" replaces both enabled feature sets, "max_isa" and "max_builder_isa" only mask bits away */
+      else if (tok == Token::Id("isa") && cin->trySymbol("=")) {
+        std::string isa = toLowerCase(cin->get().Identifier());
+        enabled_cpu_features = string_to_cpufeatures(isa);
+        enabled_builder_cpu_features = enabled_cpu_features;
+      }
+
+      else if (tok == Token::Id("max_isa") && cin->trySymbol("=")) {
+        std::string isa = toLowerCase(cin->get().Identifier());
+        enabled_cpu_features &= string_to_cpufeatures(isa);
+        enabled_builder_cpu_features &= enabled_cpu_features;
+      }
+
+      else if (tok == Token::Id("max_builder_isa") && cin->trySymbol("=")) {
+        std::string isa = toLowerCase(cin->get().Identifier());
+        enabled_builder_cpu_features &= string_to_cpufeatures(isa);
+      }
+      /* an unrecognised frequency name leaves frequency_level unchanged */
+      else if (tok == Token::Id("frequency_level") && cin->trySymbol("=")) {
+        std::string freq = cin->get().Identifier();
+        if      (freq == "simd128") frequency_level = FREQUENCY_SIMD128;
+        else if (freq == "simd256") frequency_level = FREQUENCY_SIMD256;
+        else if (freq == "simd512") frequency_level = FREQUENCY_SIMD512;
+      }
+
+      else if (tok == Token::Id("enable_selockmemoryprivilege") && cin->trySymbol("=")) {
+        enable_selockmemoryprivilege = cin->get().Int();
+      }
+      else if (tok == Token::Id("hugepages") && cin->trySymbol("=")) {
+        hugepages = cin->get().Int();
+      }
+
+      else if (tok == Token::Id("ignore_config_files") && cin->trySymbol("="))
+        ignore_config_files = cin->get().Int();
+      else if (tok == Token::Id("float_exceptions") && cin->trySymbol("=")) 
+        float_exceptions = cin->get().Int();
+      /* accel/builder/traverser selectors; the tri_* keys are also accepted without the tri_ prefix */
+      else if ((tok == Token::Id("tri_accel") || tok == Token::Id("accel")) && cin->trySymbol("="))
+        tri_accel = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_builder") || tok == Token::Id("builder")) && cin->trySymbol("="))
+        tri_builder = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_traverser") || tok == Token::Id("traverser")) && cin->trySymbol("="))
+        tri_traverser = cin->get().Identifier();
+     
+      else if ((tok == Token::Id("tri_accel_mb") || tok == Token::Id("accel_mb")) && cin->trySymbol("="))
+        tri_accel_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_builder_mb") || tok == Token::Id("builder_mb")) && cin->trySymbol("="))
+        tri_builder_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("tri_traverser_mb") || tok == Token::Id("traverser_mb")) && cin->trySymbol("="))
+        tri_traverser_mb = cin->get().Identifier();
+
+      else if ((tok == Token::Id("quad_accel")) && cin->trySymbol("="))
+        quad_accel = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_builder")) && cin->trySymbol("="))
+        quad_builder = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_traverser")) && cin->trySymbol("="))
+        quad_traverser = cin->get().Identifier();
+
+      else if ((tok == Token::Id("quad_accel_mb")) && cin->trySymbol("="))
+        quad_accel_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_builder_mb")) && cin->trySymbol("="))
+        quad_builder_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("quad_traverser_mb")) && cin->trySymbol("="))
+        quad_traverser_mb = cin->get().Identifier();
+
+      else if ((tok == Token::Id("line_accel")) && cin->trySymbol("="))
+        line_accel = cin->get().Identifier();
+      else if ((tok == Token::Id("line_builder")) && cin->trySymbol("="))
+        line_builder = cin->get().Identifier();
+      else if ((tok == Token::Id("line_traverser")) && cin->trySymbol("="))
+        line_traverser = cin->get().Identifier();
+
+      else if ((tok == Token::Id("line_accel_mb")) && cin->trySymbol("="))
+        line_accel_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("line_builder_mb")) && cin->trySymbol("="))
+        line_builder_mb = cin->get().Identifier();
+      else if ((tok == Token::Id("line_traverser_mb")) && cin->trySymbol("="))
+        line_traverser_mb = cin->get().Identifier();
+      
+      else if (tok == Token::Id("hair_accel") && cin->trySymbol("="))
+        hair_accel = cin->get().Identifier();
+      else if (tok == Token::Id("hair_builder") && cin->trySymbol("="))
+        hair_builder = cin->get().Identifier();
+      else if (tok == Token::Id("hair_traverser") && cin->trySymbol("="))
+        hair_traverser = cin->get().Identifier();
+
+      else if (tok == Token::Id("hair_accel_mb") && cin->trySymbol("="))
+        hair_accel_mb = cin->get().Identifier();
+      else if (tok == Token::Id("hair_builder_mb") && cin->trySymbol("="))
+        hair_builder_mb = cin->get().Identifier();
+      else if (tok == Token::Id("hair_traverser_mb") && cin->trySymbol("="))
+        hair_traverser_mb = cin->get().Identifier();
+
+      else if (tok == Token::Id("object_accel") && cin->trySymbol("="))
+        object_accel = cin->get().Identifier();
+      else if (tok == Token::Id("object_builder") && cin->trySymbol("="))
+        object_builder = cin->get().Identifier();
+      else if (tok == Token::Id("object_accel_min_leaf_size") && cin->trySymbol("="))
+        object_accel_min_leaf_size = cin->get().Int();
+      else if (tok == Token::Id("object_accel_max_leaf_size") && cin->trySymbol("="))
+        object_accel_max_leaf_size = cin->get().Int();
+
+      else if (tok == Token::Id("object_accel_mb") && cin->trySymbol("="))
+        object_accel_mb = cin->get().Identifier();
+      else if (tok == Token::Id("object_builder_mb") && cin->trySymbol("="))
+        object_builder_mb = cin->get().Identifier();
+      else if (tok == Token::Id("object_accel_mb_min_leaf_size") && cin->trySymbol("="))
+        object_accel_mb_min_leaf_size = cin->get().Int();
+      else if (tok == Token::Id("object_accel_mb_max_leaf_size") && cin->trySymbol("="))
+        object_accel_mb_max_leaf_size = cin->get().Int();
+      /* instancing_block_size and instancing_open_factor exclude each other: setting one zeroes the other */
+      else if (tok == Token::Id("instancing_open_min") && cin->trySymbol("="))
+        instancing_open_min = cin->get().Int();
+      else if (tok == Token::Id("instancing_block_size") && cin->trySymbol("=")) {
+        instancing_block_size = cin->get().Int();
+        instancing_open_factor = 0.0f;
+      }
+      else if (tok == Token::Id("instancing_open_max_depth") && cin->trySymbol("="))
+        instancing_open_max_depth = cin->get().Int();
+      else if (tok == Token::Id("instancing_open_factor") && cin->trySymbol("=")) {
+        instancing_block_size = 0;
+        instancing_open_factor = cin->get().Float();
+      }
+      else if (tok == Token::Id("instancing_open_max") && cin->trySymbol("="))
+        instancing_open_max = cin->get().Int();
+
+      else if (tok == Token::Id("subdiv_accel") && cin->trySymbol("="))
+        subdiv_accel = cin->get().Identifier();
+      else if (tok == Token::Id("subdiv_accel_mb") && cin->trySymbol("="))
+        subdiv_accel_mb = cin->get().Identifier();
+
+      else if (tok == Token::Id("grid_accel") && cin->trySymbol("="))
+        grid_accel = cin->get().Identifier();
+      else if (tok == Token::Id("grid_accel_mb") && cin->trySymbol("="))
+        grid_accel_mb = cin->get().Identifier();
+      
+      else if (tok == Token::Id("verbose") && cin->trySymbol("="))
+        verbose = cin->get().Int();
+      else if (tok == Token::Id("benchmark") && cin->trySymbol("="))
+        benchmark = cin->get().Int();
+      /* an unrecognised quality name leaves quality_flags unchanged */
+      else if (tok == Token::Id("quality")) {
+        if (cin->trySymbol("=")) {
+          Token flag = cin->get();
+          if      (flag == Token::Id("low"))    quality_flags = RTC_BUILD_QUALITY_LOW;
+          else if (flag == Token::Id("medium")) quality_flags = RTC_BUILD_QUALITY_MEDIUM;
+          else if (flag == Token::Id("high"))   quality_flags = RTC_BUILD_QUALITY_HIGH;
+        }
+      }
+      /* scene_flags is reset to 0 as soon as the key is seen; flag names joined with '|' are then OR-ed in */
+      else if (tok == Token::Id("scene_flags")) {
+        scene_flags = 0;
+        if (cin->trySymbol("=")) {
+          do {
+            Token flag = cin->get();
+            if (flag == Token::Id("dynamic") ) scene_flags |= RTC_SCENE_FLAG_DYNAMIC;
+            else if (flag == Token::Id("compact")) scene_flags |= RTC_SCENE_FLAG_COMPACT;
+            else if (flag == Token::Id("robust")) scene_flags |= RTC_SCENE_FLAG_ROBUST;
+          } while (cin->trySymbol("|"));
+        }
+      }
+      
+      else if (tok == Token::Id("max_spatial_split_replications") && cin->trySymbol("="))
+        max_spatial_split_replications = cin->get().Float();
+
+      else if (tok == Token::Id("presplits") && cin->trySymbol("="))
+        useSpatialPreSplits = cin->get().Int() != 0 ? true : false;
+      /* both cache size keys set tessellation_cache_size to the given value times 1024*1024 */
+      else if (tok == Token::Id("tessellation_cache_size") && cin->trySymbol("="))
+        tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f);
+      else if (tok == Token::Id("cache_size") && cin->trySymbol("="))
+        tessellation_cache_size = size_t(cin->get().Float()*1024.0f*1024.0f);
+
+      else if (tok == Token::Id("alloc_main_block_size") && cin->trySymbol("="))
+        alloc_main_block_size = cin->get().Int();
+       else if (tok == Token::Id("alloc_num_main_slots") && cin->trySymbol("="))
+        alloc_num_main_slots = cin->get().Int();
+       else if (tok == Token::Id("alloc_thread_block_size") && cin->trySymbol("="))
+         alloc_thread_block_size = cin->get().Int();
+       else if (tok == Token::Id("alloc_single_thread_alloc") && cin->trySymbol("="))
+         alloc_single_thread_alloc = cin->get().Int();
+
+      cin->trySymbol(","); // optional , separator
+    }
+  }
+
+  bool State::verbosity(size_t N) { // true when the configured verbose level is at least N
+    return verbose >= N;
+  }
+
+  void State::print() // writes the current configuration to std::cout, one section per primitive type
+  {
+    std::ostream& out = std::cout;
+    out << "general:" << std::endl;
+    out << "  build threads      = " << numThreads   << std::endl;
+    out << "  build user threads = " << numUserThreads   << std::endl;
+    out << "  start_threads      = " << start_threads << std::endl;
+    out << "  affinity           = " << set_affinity << std::endl;
+    out << "  frequency_level    = ";
+    switch (frequency_level) {
+    case FREQUENCY_SIMD128: out << "simd128" << std::endl; break;
+    case FREQUENCY_SIMD256: out << "simd256" << std::endl; break;
+    case FREQUENCY_SIMD512: out << "simd512" << std::endl; break;
+    default: out << "error" << std::endl; break;
+    }
+    out << "  hugepages          = ";
+    if (!hugepages) out << "disabled" << std::endl;
+    else if (hugepages_success) out << "enabled" << std::endl;
+    else out << "failed" << std::endl;
+
+    out << "  verbosity          = " << verbose << std::endl;
+    out << "  cache_size         = " << float(tessellation_cache_size)*1E-6 << " MB" << std::endl;
+    out << "  max_spatial_split_replications = " << max_spatial_split_replications << std::endl;
+
+    out << "triangles:" << std::endl;
+    out << "  accel              = " << tri_accel << std::endl;
+    out << "  builder            = " << tri_builder << std::endl;
+    out << "  traverser          = " << tri_traverser << std::endl;
+
+    out << "motion blur triangles:" << std::endl;
+    out << "  accel              = " << tri_accel_mb << std::endl;
+    out << "  builder            = " << tri_builder_mb << std::endl;
+    out << "  traverser          = " << tri_traverser_mb << std::endl;
+
+    out << "quads:" << std::endl;
+    out << "  accel              = " << quad_accel << std::endl;
+    out << "  builder            = " << quad_builder << std::endl;
+    out << "  traverser          = " << quad_traverser << std::endl;
+
+    out << "motion blur quads:" << std::endl;
+    out << "  accel              = " << quad_accel_mb << std::endl;
+    out << "  builder            = " << quad_builder_mb << std::endl;
+    out << "  traverser          = " << quad_traverser_mb << std::endl;
+
+    out << "line segments:" << std::endl;
+    out << "  accel              = " << line_accel << std::endl;
+    out << "  builder            = " << line_builder << std::endl;
+    out << "  traverser          = " << line_traverser << std::endl;
+
+    out << "motion blur line segments:" << std::endl;
+    out << "  accel              = " << line_accel_mb << std::endl;
+    out << "  builder            = " << line_builder_mb << std::endl;
+    out << "  traverser          = " << line_traverser_mb << std::endl;
+
+    out << "hair:" << std::endl;
+    out << "  accel              = " << hair_accel << std::endl;
+    out << "  builder            = " << hair_builder << std::endl;
+    out << "  traverser          = " << hair_traverser << std::endl;
+
+    out << "motion blur hair:" << std::endl;
+    out << "  accel              = " << hair_accel_mb << std::endl;
+    out << "  builder            = " << hair_builder_mb << std::endl;
+    out << "  traverser          = " << hair_traverser_mb << std::endl;
+
+    out << "subdivision surfaces:" << std::endl;
+    out << "  accel              = " << subdiv_accel << std::endl;
+
+    out << "grids:" << std::endl;
+    out << "  accel              = " << grid_accel << std::endl;
+    out << "  builder            = " << grid_builder << std::endl;
+
+    out << "motion blur grids:" << std::endl;
+    out << "  accel              = " << grid_accel_mb << std::endl;
+    out << "  builder            = " << grid_builder_mb << std::endl;
+
+    out << "object_accel:" << std::endl;
+    out << "  min_leaf_size      = " << object_accel_min_leaf_size << std::endl;
+    out << "  max_leaf_size      = " << object_accel_max_leaf_size << std::endl;
+
+    out << "object_accel_mb:" << std::endl;
+    out << "  min_leaf_size      = " << object_accel_mb_min_leaf_size << std::endl;
+    out << "  max_leaf_size      = " << object_accel_mb_max_leaf_size << std::endl;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/state.h b/thirdparty/embree-aarch64/kernels/common/state.h
new file mode 100644
index 0000000000..d0fccc023f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/state.h
@@ -0,0 +1,197 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "default.h"
+
+namespace embree
+{
+  /* mutex to make printing to cout thread safe */
+  extern MutexSys g_printMutex;
+
+  struct State : public RefCount // holds every configuration setting declared below; populated via parseFile/parseString/parse
+  {
+  public:
+    /*! state construction */
+    State ();
+
+    /*! state destruction */
+    ~State();
+
+    /*! verifies that state is correct */
+    void verify();
+
+    /*! parses state from a configuration file */
+    bool parseFile(const FileName& fileName);
+
+    /*! parses the state from a string */
+    void parseString(const char* cfg);
+
+    /*! parses the state from a stream */
+    void parse(Ref<TokenStream> cin);
+
+    /*! prints the state */
+    void print();
+
+    /*! checks if verbosity level is at least N */
+    bool verbosity(size_t N);
+
+    /*! checks if some particular ISA is enabled */
+    bool hasISA(const int isa);
+
+    /*! checks whether the selected ISA is supported by the HW */    
+    bool checkISASupport();
+    
+  public:
+    std::string tri_accel;                 //!< acceleration structure to use for triangles
+    std::string tri_builder;               //!< builder to use for triangles
+    std::string tri_traverser;             //!< traverser to use for triangles
+    
+  public:
+    std::string tri_accel_mb;              //!< acceleration structure to use for motion blur triangles
+    std::string tri_builder_mb;            //!< builder to use for motion blur triangles
+    std::string tri_traverser_mb;          //!< traverser to use for motion blur triangles
+
+  public:
+    std::string quad_accel;                 //!< acceleration structure to use for quads
+    std::string quad_builder;               //!< builder to use for quads
+    std::string quad_traverser;             //!< traverser to use for quads
+
+  public:
+    std::string quad_accel_mb;             //!< acceleration structure to use for motion blur quads
+    std::string quad_builder_mb;           //!< builder to use for motion blur quads
+    std::string quad_traverser_mb;         //!< traverser to use for motion blur quads
+
+  public:
+    std::string line_accel;                 //!< acceleration structure to use for line segments
+    std::string line_builder;               //!< builder to use for line segments
+    std::string line_traverser;             //!< traverser to use for line segments
+
+  public:
+    std::string line_accel_mb;             //!< acceleration structure to use for motion blur line segments
+    std::string line_builder_mb;           //!< builder to use for motion blur line segments
+    std::string line_traverser_mb;         //!< traverser to use for motion blur line segments
+
+  public:
+    std::string hair_accel;                //!< hair acceleration structure to use
+    std::string hair_builder;              //!< builder to use for hair
+    std::string hair_traverser;            //!< traverser to use for hair
+
+  public:
+    std::string hair_accel_mb;             //!< acceleration structure to use for motion blur hair
+    std::string hair_builder_mb;           //!< builder to use for motion blur hair
+    std::string hair_traverser_mb;         //!< traverser to use for motion blur hair
+
+  public:
+    std::string object_accel;               //!< acceleration structure for user geometries
+    std::string object_builder;             //!< builder for user geometries
+    int object_accel_min_leaf_size;         //!< minimum leaf size for object acceleration structure
+    int object_accel_max_leaf_size;         //!< maximum leaf size for object acceleration structure
+
+  public:
+    std::string object_accel_mb;            //!< acceleration structure for motion blur user geometries
+    std::string object_builder_mb;          //!< builder for motion blur user geometries
+    int object_accel_mb_min_leaf_size;      //!< minimum leaf size for mblur object acceleration structure
+    int object_accel_mb_max_leaf_size;      //!< maximum leaf size for mblur object acceleration structure
+
+  public:
+    std::string subdiv_accel;              //!< acceleration structure to use for subdivision surfaces
+    std::string subdiv_accel_mb;           //!< acceleration structure to use for motion blur subdivision surfaces
+
+  public:
+    std::string grid_accel;              //!< acceleration structure to use for grids
+    std::string grid_builder;            //!< builder for grids
+    std::string grid_accel_mb;           //!< acceleration structure to use for motion blur grids
+    std::string grid_builder_mb;         //!< builder for motion blur grids
+
+  public:
+    float max_spatial_split_replications;  //!< maximally replications*N many primitives in accel for spatial splits
+    bool useSpatialPreSplits;              //!< use spatial pre-splits instead of the full spatial split builder
+    size_t tessellation_cache_size;        //!< size of the shared tessellation cache 
+
+  public:
+    size_t instancing_open_min;            //!< instancing opens tree to minimally that number of subtrees
+    size_t instancing_block_size;          //!< instancing opens tree up to average block size of primitives
+    float  instancing_open_factor;         //!< instancing opens tree up to x times the number of instances
+    size_t instancing_open_max_depth;      //!< maximum open depth for geometries
+    size_t instancing_open_max;            //!< instancing opens tree to maximally that number of subtrees
+
+  public:
+    bool ignore_config_files;              //!< if true no more config files get parsed
+    bool float_exceptions;                 //!< enable floating point exceptions
+    int quality_flags;
+    int scene_flags;
+    size_t verbose;                        //!< verbosity of output
+    size_t benchmark;                      //!< benchmark setting; NOTE(review): stored as size_t, exact semantics not visible in this header - confirm
+    
+  public:
+    size_t numThreads;                     //!< number of threads to use in builders
+    size_t numUserThreads;                 //!< number of user provided threads to use in builders
+    bool set_affinity;                     //!< sets affinity for worker threads
+    bool start_threads;                    //!< true when threads should be started at device creation time
+    int enabled_cpu_features;              //!< CPU ISA features to use
+    int enabled_builder_cpu_features;      //!< CPU ISA features to use for builders only
+    enum FREQUENCY_LEVEL {
+      FREQUENCY_SIMD128,
+      FREQUENCY_SIMD256,
+      FREQUENCY_SIMD512
+    } frequency_level;                     //!< frequency level the app wants to run on (default is SIMD256)
+    bool enable_selockmemoryprivilege;     //!< configures the SeLockMemoryPrivilege under Windows to enable huge pages
+    bool hugepages;                        //!< true if huge pages should get used
+    bool hugepages_success;                //!< status for enabling huge pages
+
+  public:
+    size_t alloc_main_block_size;          //!< main allocation block size (shared between threads)
+    int alloc_num_main_slots;              //!< number of such shared blocks to be used to allocate
+    size_t alloc_thread_block_size;        //!< size of thread local allocator block size
+    int alloc_single_thread_alloc;         //!< in single mode nodes and leaves use same thread local allocator
+
+  public:
+
+    /*! AVX is usable when its ISA is enabled and the requested frequency level is not SIMD128 */
+    bool canUseAVX() {
+      return hasISA(AVX) && frequency_level != FREQUENCY_SIMD128;
+    }
+
+    /*! AVX2 is usable when its ISA is enabled and the requested frequency level is not SIMD128 */
+    bool canUseAVX2() {
+      return hasISA(AVX2) && frequency_level != FREQUENCY_SIMD128;
+    }
+    
+    struct ErrorHandler // NOTE(review): looks like per-thread RTCError storage (TLS key plus a list guarded by errors_mutex) - confirm in state.cpp
+    {
+    public:
+      ErrorHandler();
+      ~ErrorHandler();
+      RTCError* error();
+
+    public:
+      tls_t thread_error;
+      std::vector<RTCError*> thread_errors;
+      MutexSys errors_mutex;
+    };
+    ErrorHandler errorHandler;             //!< error handler owned by this State instance
+    static ErrorHandler g_errorHandler;    //!< error handler shared by all State instances (static member)
+
+  public:
+    void setErrorFunction(RTCErrorFunction fptr, void* uptr) 
+    {
+      error_function = fptr;               // callback and its user pointer are stored as a pair
+      error_function_userptr = uptr;
+    }
+
+    RTCErrorFunction error_function;       //!< error callback set through setErrorFunction
+    void* error_function_userptr;          //!< user pointer stored together with error_function
+
+  public:
+    void setMemoryMonitorFunction(RTCMemoryMonitorFunction fptr, void* uptr) 
+    {
+      memory_monitor_function = fptr;      // callback and its user pointer are stored as a pair
+      memory_monitor_userptr = uptr;
+    }
+      
+    RTCMemoryMonitorFunction memory_monitor_function; //!< memory monitor callback set through setMemoryMonitorFunction
+    void* memory_monitor_userptr;                     //!< user pointer stored together with memory_monitor_function
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/common/vector.h b/thirdparty/embree-aarch64/kernels/common/vector.h
new file mode 100644
index 0000000000..b478762240
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/common/vector.h
@@ -0,0 +1,76 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "default.h"
+
+namespace embree
+{
+  /*! interface that receives memory usage notifications; implementers override memoryMonitor */
+  struct MemoryMonitorInterface {
+    virtual void memoryMonitor(ssize_t bytes, bool post) = 0; //!< signed byte delta; aligned_monitored_allocator passes (+size,false) before allocating and (-size,true) after freeing
+  };
+
+  /*! allocator that reports each (de)allocation to a MemoryMonitorInterface; requests of at least 14*PAGE_SIZE_2M bytes use os_malloc/os_free, smaller ones alignedMalloc/alignedFree */
+  template<typename T, size_t alignment = 64>
+    struct aligned_monitored_allocator
+    {
+      typedef T value_type;
+      typedef T* pointer;
+      typedef const T* const_pointer;
+      typedef T& reference;
+      typedef const T& const_reference;
+      typedef std::size_t size_type;
+      typedef std::ptrdiff_t difference_type;
+      /*! device is the monitor that gets notified; hugepages starts out as false */
+      __forceinline aligned_monitored_allocator(MemoryMonitorInterface* device) 
+        : device(device), hugepages(false) {}
+      /*! notifies the monitor with the requested byte count, then returns storage for n objects of type T */
+      __forceinline pointer allocate( size_type n ) 
+      {
+        if (n) {
+          assert(device);
+          device->memoryMonitor(n*sizeof(T),false); // reported before the memory is actually obtained
+        }
+        if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) // large request: must use the same threshold as deallocate
+        {
+          pointer p =  (pointer) os_malloc(n*sizeof(value_type),hugepages);
+          assert(p);
+          return p;
+        }
+        return (pointer) alignedMalloc(n*sizeof(value_type),alignment);
+      }
+      /*! releases storage obtained from allocate(n), then reports the negative byte count to the monitor */
+      __forceinline void deallocate( pointer p, size_type n ) 
+      {
+        if (p)
+        {
+          if (n*sizeof(value_type) >= 14 * PAGE_SIZE_2M) // mirrors the size test in allocate to pick the matching free function
+            os_free(p,n*sizeof(value_type),hugepages); 
+          else
+            alignedFree(p);
+        }
+        else assert(n == 0);
+
+        if (n) {
+          assert(device);
+          device->memoryMonitor(-ssize_t(n*sizeof(T)),true); // multiply in size_t first, then negate as signed: avoids negative-times-unsigned arithmetic
+        }
+      }
+
+      __forceinline void construct( pointer p, const_reference val ) {
+        new (p) T(val); // copy-constructs val in place at p
+      }
+
+      __forceinline void destroy( pointer p ) {
+        p->~T(); // runs the destructor only; storage is released by deallocate
+      }
+
+    private:
+      MemoryMonitorInterface* device; //!< monitor notified by allocate/deallocate
+      bool hugepages;                 //!< passed to os_malloc/os_free for large requests
+    };
+
+  /*! monitored vector: vector_t using aligned_monitored_allocator, with alignment set to alignof(T) instead of the allocator's default of 64 */
+  template<typename T>
+    using mvector = vector_t<T,aligned_monitored_allocator<T,std::alignment_of<T>::value> >;
+}
diff --git a/thirdparty/embree-aarch64/kernels/config.h b/thirdparty/embree-aarch64/kernels/config.h
new file mode 100644
index 0000000000..80a8ab2a56
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/config.h
@@ -0,0 +1,76 @@
+
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+/* #undef EMBREE_RAY_MASK */
+/* #undef EMBREE_STAT_COUNTERS */
+/* #undef EMBREE_BACKFACE_CULLING */
+/* #undef EMBREE_BACKFACE_CULLING_CURVES */
+#define EMBREE_FILTER_FUNCTION // defined: this switch is on in this configuration
+/* #undef EMBREE_IGNORE_INVALID_RAYS */
+#define EMBREE_GEOMETRY_TRIANGLE // the only EMBREE_GEOMETRY_* switch defined here, so only IF_ENABLED_TRIS(x) below expands to x
+/* #undef EMBREE_GEOMETRY_QUAD */
+/* #undef EMBREE_GEOMETRY_CURVE */
+/* #undef EMBREE_GEOMETRY_SUBDIVISION */
+/* #undef EMBREE_GEOMETRY_USER */
+/* #undef EMBREE_GEOMETRY_INSTANCE */
+/* #undef EMBREE_GEOMETRY_GRID */
+/* #undef EMBREE_GEOMETRY_POINT */
+/* #undef EMBREE_RAY_PACKETS */
+/* #undef EMBREE_COMPACT_POLYS */
+/* NOTE(review): the commented-out "#undef" lines above mark switches left undefined in this configuration */
+#define EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR 2.0
+
+#ifdef EMBREE_GEOMETRY_TRIANGLE // each IF_ENABLED_*(x) yields x when its switch is defined, nothing otherwise
+  #define IF_ENABLED_TRIS(x) x
+#else // triangles disabled
+  #define IF_ENABLED_TRIS(x)
+#endif // EMBREE_GEOMETRY_TRIANGLE
+
+#ifdef EMBREE_GEOMETRY_QUAD
+  #define IF_ENABLED_QUADS(x) x
+#else // quads disabled
+  #define IF_ENABLED_QUADS(x)
+#endif // EMBREE_GEOMETRY_QUAD
+
+#if defined(EMBREE_GEOMETRY_CURVE) || defined(EMBREE_GEOMETRY_POINT) // either switch is sufficient
+  #define IF_ENABLED_CURVES_OR_POINTS(x) x
+#else // both curves and points disabled
+  #define IF_ENABLED_CURVES_OR_POINTS(x)
+#endif // EMBREE_GEOMETRY_CURVE || EMBREE_GEOMETRY_POINT
+
+#ifdef EMBREE_GEOMETRY_CURVE
+  #define IF_ENABLED_CURVES(x) x
+#else // curves disabled
+  #define IF_ENABLED_CURVES(x)
+#endif // EMBREE_GEOMETRY_CURVE
+
+#ifdef EMBREE_GEOMETRY_POINT
+  #define IF_ENABLED_POINTS(x) x
+#else // points disabled
+  #define IF_ENABLED_POINTS(x)
+#endif // EMBREE_GEOMETRY_POINT
+
+#ifdef EMBREE_GEOMETRY_SUBDIVISION
+  #define IF_ENABLED_SUBDIV(x) x
+#else // subdivision surfaces disabled
+  #define IF_ENABLED_SUBDIV(x)
+#endif // EMBREE_GEOMETRY_SUBDIVISION
+
+#ifdef EMBREE_GEOMETRY_USER
+  #define IF_ENABLED_USER(x) x
+#else // user geometries disabled
+  #define IF_ENABLED_USER(x)
+#endif // EMBREE_GEOMETRY_USER
+
+#ifdef EMBREE_GEOMETRY_INSTANCE
+  #define IF_ENABLED_INSTANCE(x) x
+#else // instances disabled
+  #define IF_ENABLED_INSTANCE(x)
+#endif // EMBREE_GEOMETRY_INSTANCE
+
+#ifdef EMBREE_GEOMETRY_GRID
+  #define IF_ENABLED_GRIDS(x) x
+#else // grids disabled
+  #define IF_ENABLED_GRIDS(x)
+#endif // EMBREE_GEOMETRY_GRID
diff --git a/thirdparty/embree-aarch64/kernels/geometry/cone.h b/thirdparty/embree-aarch64/kernels/geometry/cone.h
new file mode 100644
index 0000000000..961ef86160
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/cone.h
@@ -0,0 +1,321 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct Cone
+    {
+      const Vec3fa p0; //!< start position of cone
+      const Vec3fa p1; //!< end position of cone
+      const float r0;  //!< start radius of cone
+      const float r1;  //!< end radius of cone
+
+      __forceinline Cone(const Vec3fa& p0, const float r0, const Vec3fa& p1, const float r1) 
+        : p0(p0), p1(p1), r0(r0), r1(r1) {}
+      /*! intersects the ray org+t*dir with the cone; on a hit t_o holds the t interval and, except for the early-return cylinder case, u/Ng are written for both interval ends */
+      __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, 
+                                   BBox1f& t_o, 
+                                   float& u0_o, Vec3fa& Ng0_o, 
+                                   float& u1_o, Vec3fa& Ng1_o) const 
+      {
+        /* set up the quadratic A*t^2 + B*t + C = 0 in the ray parameter t, with all points taken relative to the ray origin */
+        const Vec3fa v0 = p0-org;
+        const Vec3fa v1 = p1-org;
+        
+        const float rl = rcp_length(v1-v0);
+        const Vec3fa P0 = v0, dP = (v1-v0)*rl;
+        const float dr = (r1-r0)*rl;
+        const Vec3fa O = -P0, dO = dir;
+        
+        const float dOdO = dot(dO,dO);
+        const float OdO = dot(dO,O);
+        const float OO = dot(O,O);
+        const float dOz = dot(dP,dO);
+        const float Oz = dot(dP,O);
+
+        const float R = r0 + Oz*dr;          
+        const float A = dOdO - sqr(dOz) * (1.0f+sqr(dr));
+        const float B = 2.0f * (OdO - dOz*(Oz + R*dr));
+        const float C = OO - (sqr(Oz) + sqr(R));
+
+        /* we miss the cone if the discriminant is smaller than zero */
+        const float D = B*B - 4.0f*A*C;
+        if (D < 0.0f) return false;
+
+        /* special case for rays that are "parallel" to the cone (quadratic coefficient A is numerically zero) */
+        const float eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+        if (unlikely(abs(A) < eps))
+        {
+          /* cylinder case: returns early in both branches, the u/Ng outputs are left unwritten */
+          if (abs(dr) < 16.0f*float(ulp)) {
+            if (C <= 0.0f) { t_o = BBox1f(neg_inf,pos_inf); return true; } 
+            else           { t_o = BBox1f(pos_inf,neg_inf); return false; }
+          }
+
+          /* cone case: with A dropped the equation is linear, B*t + C = 0 */
+          else 
+          {
+            /* if we hit the negative cone there cannot be a hit */
+            const float t = -C/B;
+            const float z0 = Oz+t*dOz;
+            const float z0r = r0+z0*dr;
+            if (z0r < 0.0f) return false;
+
+            /* test if we start inside or outside the cone */
+            if (dOz*dr > 0.0f) t_o = BBox1f(t,pos_inf);
+            else               t_o = BBox1f(neg_inf,t);
+          }
+        }
+
+        /* standard case for "non-parallel" rays: both roots of the quadratic */
+        else
+        {
+          const float Q = sqrt(D);
+          const float rcp_2A = rcp(2.0f*A);
+          t_o.lower = (-B-Q)*rcp_2A;
+          t_o.upper = (-B+Q)*rcp_2A;
+          
+          /* standard case where both hits are on same cone */
+          if (likely(A > 0.0f)) {
+            const float z0 = Oz+t_o.lower*dOz;
+            const float z0r = r0+z0*dr;
+            if (z0r < 0.0f) return false;
+          } 
+
+          /* special case where the hits are on the positive and negative cone */
+          else 
+          {
+            /* depending on the ray direction and the open direction
+             * of the cone we have a hit from inside or outside the
+             * cone */
+            if (dOz*dr > 0) t_o.upper = pos_inf;
+            else            t_o.lower = neg_inf;
+          }
+        }
+
+        /* calculates u and Ng for near hit (t_o.lower) */
+        {
+          u0_o = (Oz+t_o.lower*dOz)*rl;
+          const Vec3fa Pr = t_o.lower*dir;
+          const Vec3fa Pl = v0 + u0_o*(v1-v0);
+          const Vec3fa R = normalize(Pr-Pl);
+          const Vec3fa U = (p1-p0)+(r1-r0)*R;
+          const Vec3fa V = cross(p1-p0,R);
+          Ng0_o = cross(V,U);
+        }
+
+        /* calculates u and Ng for far hit (t_o.upper) */
+        {
+          u1_o = (Oz+t_o.upper*dOz)*rl;
+          const Vec3fa Pr = t_o.upper*dir;
+          const Vec3fa Pl = v0 + u1_o*(v1-v0);
+          const Vec3fa R = normalize(Pr-Pl);
+          const Vec3fa U = (p1-p0)+(r1-r0)*R;
+          const Vec3fa V = cross(p1-p0,R);
+          Ng1_o = cross(V,U);
+        }
+        return true;
+      }
+      /*! convenience overload that only reports the t interval; the u/Ng outputs go to discarded locals */
+      __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, BBox1f& t_o) const 
+      {
+        float u0_o; Vec3fa Ng0_o; float u1_o; Vec3fa Ng1_o;
+        return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+      }
+      /*! runs one test case: checks the hit flag and, if a hit is expected, t.lower/t.upper against t0/t1 within 0.001 (t0 == -1E6 / t1 == +1E6 only require the bound to reach that magnitude); prints on failure */
+      static bool verify(const size_t id, const Cone& cone, const Ray& ray, bool shouldhit, const float t0, const float t1)
+      {
+        float eps = 0.001f;
+        BBox1f t; bool hit;
+        hit = cone.intersect(ray.org,ray.dir,t);
+
+        bool failed = hit != shouldhit;
+        if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : (t0 == -1E6) ? t.lower > -1E6f : abs(t0-t.lower) > eps;
+        if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : (t1 == +1E6) ? t.upper < +1E6f : abs(t1-t.upper) > eps;
+        if (!failed) return true;
+        embree_cout << "Cone test " << id << " failed: cone = " << cone << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl; 
+        return false;
+      }
+
+      /* verify cone class: runs the fixed test cases 0-18 below and returns true only if all of them pass */
+      static bool verify()
+      {
+        bool passed = true;
+        const Cone cone0(Vec3fa(0.0f,0.0f,0.0f),0.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f);
+        passed &= verify(0,cone0,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,3.0f,pos_inf);
+        passed &= verify(1,cone0,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f);
+        passed &= verify(2,cone0,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),false,0.0f,0.0f);
+        passed &= verify(3,cone0,Ray(Vec3fa(+1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,3.0f);
+        passed &= verify(4,cone0,Ray(Vec3fa(-1.0f,0.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,1.0f,pos_inf);
+        passed &= verify(5,cone0,Ray(Vec3fa(+1.0f,0.0f,0.0f),Vec3fa(-1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,1.0f);
+        passed &= verify(6,cone0,Ray(Vec3fa(+0.0f,0.0f,1.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,1.0f,1.0f);
+        passed &= verify(7,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f);
+        passed &= verify(8,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(+1.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.5f,+1E6);
+        passed &= verify(9,cone0,Ray(Vec3fa(+0.0f,1.0f,0.0f),Vec3fa(-1.0f,+1.0f,+0.0f),0.0f,float(inf)),true,-1E6,-0.5f);
+        const Cone cone1(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),0.0f);
+        passed &= verify(10,cone1,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa(+1.0f,+0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,2.0f);
+        passed &= verify(11,cone1,Ray(Vec3fa(-1.0f,0.0f,2.0f),Vec3fa(+0.0f,+0.0f,-1.0f),0.0f,float(inf)),true,0.0f,4.0f);
+        const Cone cylinder(Vec3fa(0.0f,0.0f,0.0f),1.0f,Vec3fa(1.0f,0.0f,0.0f),1.0f);
+        passed &= verify(12,cylinder,Ray(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+        passed &= verify(13,cylinder,Ray(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+        passed &= verify(14,cylinder,Ray(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f);
+        passed &= verify(15,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+        passed &= verify(16,cylinder,Ray(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+        passed &= verify(17,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+        passed &= verify(18,cylinder,Ray(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+        return passed;
+      }
+
+      /*! output operator: prints p0, r0, p1 and r1 */
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cone& c) {
+        return cout << "Cone { p0 = " << c.p0 << ", r0 = " << c.r0 << ", p1 = " << c.p1 << ", r1 = " << c.r1 << "}";
+      }
+    };
+
+    template<int N>
+      struct ConeN // N cones stored component-wise; operator[] extracts lane i as a scalar Cone
+    {
+      typedef Vec3<vfloat<N>> Vec3vfN;
+      
+      const Vec3vfN p0;     //!< start position of cone
+      const Vec3vfN p1;     //!< end position of cone
+      const vfloat<N> r0;   //!< start radius of cone
+      const vfloat<N> r1;   //!< end radius of cone
+
+      __forceinline ConeN(const Vec3vfN& p0, const vfloat<N>& r0, const Vec3vfN& p1, const vfloat<N>& r1) 
+        : p0(p0), p1(p1), r0(r0), r1(r1) {}
+
+      __forceinline Cone operator[] (const size_t i) const
+      {
+        assert(i<N);
+        return Cone(Vec3fa(p0.x[i],p0.y[i],p0.z[i]),r0[i],Vec3fa(p1.x[i],p1.y[i],p1.z[i]),r1[i]);
+      }
+      /*! lane-wise counterpart of Cone::intersect: one ray (org,dir) against N cones; returns the mask of lanes that hit, t_o/u/Ng are only meaningful in those lanes */
+      __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, 
+                                       BBox<vfloat<N>>& t_o, 
+                                       vfloat<N>& u0_o, Vec3vfN& Ng0_o, 
+                                       vfloat<N>& u1_o, Vec3vfN& Ng1_o) const
+      {
+        /* set up the quadratic A*t^2 + B*t + C = 0 in the ray parameter t, with all points taken relative to the ray origin */
+        const Vec3vfN v0 = p0-Vec3vfN(org);
+        const Vec3vfN v1 = p1-Vec3vfN(org);
+
+        const vfloat<N> rl = rcp_length(v1-v0);
+        const Vec3vfN P0 = v0, dP = (v1-v0)*rl;
+        const vfloat<N> dr = (r1-r0)*rl;
+        const Vec3vfN O = -P0, dO = dir;
+       
+        const vfloat<N> dOdO = dot(dO,dO);
+        const vfloat<N> OdO = dot(dO,O);
+        const vfloat<N> OO = dot(O,O);
+        const vfloat<N> dOz = dot(dP,dO);
+        const vfloat<N> Oz = dot(dP,O);
+        
+        const vfloat<N> R = r0 + Oz*dr;          
+        const vfloat<N> A = dOdO - sqr(dOz) * (vfloat<N>(1.0f)+sqr(dr));
+        const vfloat<N> B = 2.0f * (OdO - dOz*(Oz + R*dr));
+        const vfloat<N> C = OO - (sqr(Oz) + sqr(R));
+
+        /* we miss the cone if the discriminant is smaller than zero */
+        const vfloat<N> D = B*B - 4.0f*A*C;
+        vbool<N> valid = D >= 0.0f;
+        if (none(valid)) return valid;
+
+        /* special case for rays that are "parallel" to the cone (quadratic coefficient A is numerically zero) */
+        const vfloat<N> eps = float(1<<8)*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+        const vbool<N> validt = valid &  (abs(A) < eps);
+        const vbool<N> validf = valid & !(abs(A) < eps);
+        if (unlikely(any(validt)))
+        {
+          const vbool<N> validtt = validt & (abs(dr) <  16.0f*float(ulp)); // masks must be N wide like every other value here
+          const vbool<N> validtf = validt & (abs(dr) >= 16.0f*float(ulp));
+          
+          /* cylinder case */
+          if (unlikely(any(validtt))) 
+          {
+            t_o.lower = select(validtt, select(C <= 0.0f, vfloat<N>(neg_inf), vfloat<N>(pos_inf)), t_o.lower);
+            t_o.upper = select(validtt, select(C <= 0.0f, vfloat<N>(pos_inf), vfloat<N>(neg_inf)), t_o.upper);
+            valid &= !validtt | (C <= 0.0f);
+          }
+
+          /* cone case: with A dropped the equation is linear, B*t + C = 0 */
+          if (any(validtf)) 
+          {
+            /* if we hit the negative cone there cannot be a hit */
+            const vfloat<N> t = -C/B;
+            const vfloat<N> z0 = Oz+t*dOz;
+            const vfloat<N> z0r = r0+z0*dr;
+            valid &= !validtf | (z0r >= 0.0f);
+
+            /* test if we start inside or outside the cone */
+            t_o.lower = select(validtf, select(dOz*dr > 0.0f, t, vfloat<N>(neg_inf)), t_o.lower);
+            t_o.upper = select(validtf, select(dOz*dr > 0.0f, vfloat<N>(pos_inf), t), t_o.upper);
+          }
+        }
+
+        /* standard case for "non-parallel" rays: both roots of the quadratic */
+        if (likely(any(validf)))
+        {
+          const vfloat<N> Q = sqrt(D);
+          const vfloat<N> rcp_2A = 0.5f*rcp(A);
+          t_o.lower = select(validf, (-B-Q)*rcp_2A, t_o.lower);
+          t_o.upper = select(validf, (-B+Q)*rcp_2A, t_o.upper);
+          
+          /* standard case where both hits are on same cone */
+          const vbool<N> validft = validf &  (A>0.0f);
+          const vbool<N> validff = validf & !(A>0.0f);
+          if (any(validft)) {
+            const vfloat<N> z0 = Oz+t_o.lower*dOz;
+            const vfloat<N> z0r = r0+z0*dr;
+            valid &= !validft | (z0r >= 0.0f);
+          } 
+
+          /* special case where the hits are on the positive and negative cone */
+          if (any(validff)) {
+            /* depending on the ray direction and the open direction
+             * of the cone we have a hit from inside or outside the
+             * cone */
+            t_o.lower = select(validff, select(dOz*dr > 0.0f, t_o.lower, vfloat<N>(neg_inf)), t_o.lower);
+            t_o.upper = select(validff, select(dOz*dr > 0.0f, vfloat<N>(pos_inf), t_o.upper), t_o.upper);
+          }
+        }
+
+        /* calculates u and Ng for near hit (t_o.lower) */
+        {
+          u0_o = (Oz+t_o.lower*dOz)*rl;
+          const Vec3vfN Pr = t_o.lower*Vec3vfN(dir);
+          const Vec3vfN Pl = v0 + u0_o*(v1-v0);
+          const Vec3vfN R = normalize(Pr-Pl);
+          const Vec3vfN U = (p1-p0)+(r1-r0)*R;
+          const Vec3vfN V = cross(p1-p0,R);
+          Ng0_o = cross(V,U);
+        }
+
+        /* calculates u and Ng for far hit (t_o.upper) */
+        {
+          u1_o = (Oz+t_o.upper*dOz)*rl;
+          const Vec3vfN Pr = t_o.upper*Vec3vfN(dir); // far hit point must use t_o.upper, as the scalar Cone::intersect does
+          const Vec3vfN Pl = v0 + u1_o*(v1-v0);
+          const Vec3vfN R = normalize(Pr-Pl);
+          const Vec3vfN U = (p1-p0)+(r1-r0)*R;
+          const Vec3vfN V = cross(p1-p0,R);
+          Ng1_o = cross(V,U);
+        }
+        return valid;
+      }
+      /*! convenience overload that only reports the t interval; the u/Ng outputs go to discarded locals */
+      __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, BBox<vfloat<N>>& t_o) const
+      {
+        vfloat<N> u0_o; Vec3vfN Ng0_o; vfloat<N> u1_o; Vec3vfN Ng1_o;
+        return intersect(org,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+      }
+    };
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h
new file mode 100644
index 0000000000..0902baff7d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/coneline_intersector.h
@@ -0,0 +1,209 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    namespace __coneline_internal 
+    {
+      /* Intersects a ray with M cone segments given by endpoints v0 and v1 (xyz = position,
+       * w = radius); cL/cR enable the cap disk at v0/v1. Up to two hits per segment (nearest
+       * valid hit first) are passed to epilog; the result is true if any epilog call returned
+       * true. ray_tfar is a functor and is evaluated again after the first epilog call,
+       * presumably so a tfar updated by an accepted first hit bounds the second - confirm. */
+      template<int M, typename Epilog, typename ray_tfar_func>
+        static __forceinline bool intersectCone(const vbool<M>& valid_i,
+                                                const Vec3vf<M>& ray_org_in, const Vec3vf<M>& ray_dir,
+                                                const vfloat<M>& ray_tnear, const ray_tfar_func& ray_tfar,
+                                                const Vec4vf<M>& v0, const Vec4vf<M>& v1,
+                                                const vbool<M>& cL, const vbool<M>& cR,
+                                                const Epilog& epilog)
+      {
+        vbool<M> valid = valid_i;
+        /* move ray origin closer to make calculations numerically stable */
+        const vfloat<M> dOdO = sqr(ray_dir);
+        const vfloat<M> rcp_dOdO = rcp(dOdO);
+        const Vec3vf<M> center = vfloat<M>(0.5f)*(v0.xyz()+v1.xyz());
+        const vfloat<M> dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO;
+        const Vec3vf<M> ray_org = ray_org_in + dt*ray_dir;
+
+        const Vec3vf<M> dP = v1.xyz() - v0.xyz();
+        const Vec3vf<M> p0 = ray_org - v0.xyz();
+        const Vec3vf<M> p1 = ray_org - v1.xyz();
+
+        const vfloat<M> dPdP  = sqr(dP);
+        const vfloat<M> dP0   = dot(p0,dP);
+        const vfloat<M> dP1   = dot(p1,dP);
+        const vfloat<M> dOdP  = dot(ray_dir,dP);
+
+        // intersect cone body
+        const vfloat<M> dr  = v0.w - v1.w;
+        const vfloat<M> hy  = dPdP + sqr(dr);
+        const vfloat<M> dO0 = dot(ray_dir,p0);
+        const vfloat<M> OO  = sqr(p0);
+        const vfloat<M> dPdP2 = sqr(dPdP);
+        const vfloat<M> dPdPr0 = dPdP*v0.w;
+
+        const vfloat<M> A = dPdP2     - sqr(dOdP)*hy;
+        const vfloat<M> B = dPdP2*dO0 - dP0*dOdP*hy   + dPdPr0*(dr*dOdP);
+        const vfloat<M> C = dPdP2*OO  - sqr(dP0)*hy   + dPdPr0*(2.0f*dr*dP0 - dPdPr0);
+
+        const vfloat<M> D = B*B - A*C;
+        valid &= D >= 0.0f;
+        if (unlikely(none(valid))) return false;
+
+        /* standard case for "non-parallel" rays */
+        const vfloat<M> Q = sqrt(D);
+        const vfloat<M> rcp_A = rcp(A);
+        /* special case for rays that are "parallel" to the cone - assume miss */
+        const vbool<M> isParallel = abs(A) <= min_rcp_input;
+
+        vfloat<M> t_cone_lower = select (isParallel, neg_inf, (-B-Q)*rcp_A);
+        vfloat<M> t_cone_upper = select (isParallel, pos_inf, (-B+Q)*rcp_A);
+        const vfloat<M> y_lower = dP0 + t_cone_lower*dOdP;
+        const vfloat<M> y_upper = dP0 + t_cone_upper*dOdP;
+        t_cone_lower = select(valid & y_lower > 0.0f & y_lower < dPdP, t_cone_lower, pos_inf);
+        t_cone_upper = select(valid & y_upper > 0.0f & y_upper < dPdP, t_cone_upper, neg_inf);
+
+        const vbool<M> hitDisk0 = valid & cL;
+        const vbool<M> hitDisk1 = valid & cR;
+        const vfloat<M> rcp_dOdP = rcp(dOdP);
+        const vfloat<M> t_disk0 = select (hitDisk0, select (sqr(p0*dOdP-ray_dir*dP0)<(sqr(v0.w)*sqr(dOdP)), -dP0*rcp_dOdP, pos_inf), pos_inf);
+        const vfloat<M> t_disk1 = select (hitDisk1, select (sqr(p1*dOdP-ray_dir*dP1)<(sqr(v1.w)*sqr(dOdP)), -dP1*rcp_dOdP, pos_inf), pos_inf);
+        const vfloat<M> t_disk_lower = min(t_disk0, t_disk1);
+        const vfloat<M> t_disk_upper = max(t_disk0, t_disk1);
+
+        const vfloat<M> t_lower = min(t_cone_lower, t_disk_lower);
+        const vfloat<M> t_upper = max(t_cone_upper, select(t_lower==t_disk_lower,
+                                                      select(t_disk_upper==vfloat<M>(pos_inf),neg_inf,t_disk_upper),
+                                                      select(t_disk_lower==vfloat<M>(pos_inf),neg_inf,t_disk_lower)));
+
+        const vbool<M> valid_lower = valid & ray_tnear <= dt+t_lower & dt+t_lower <= ray_tfar() & t_lower != vfloat<M>(pos_inf);
+        const vbool<M> valid_upper = valid & ray_tnear <= dt+t_upper & dt+t_upper <= ray_tfar() & t_upper != vfloat<M>(neg_inf);
+
+        const vbool<M> valid_first = valid_lower | valid_upper;
+        if (unlikely(none(valid_first))) return false;
+
+        const vfloat<M> t_first = select(valid_lower, t_lower, t_upper);
+        const vfloat<M> y_first = select(valid_lower, y_lower, y_upper);
+
+        const vfloat<M> rcp_dPdP = rcp(dPdP);
+        const Vec3vf<M> dP2drr0dP = dPdP*dr*v0.w*dP;
+        const Vec3vf<M> dPhy = dP*hy;
+        const vbool<M> cone_hit_first = valid & (t_first == t_cone_lower | t_first == t_cone_upper);
+        const vbool<M> disk0_hit_first = valid & (t_first == t_disk0);
+        const Vec3vf<M> Ng_first = select(cone_hit_first, dPdP2*(p0+t_first*ray_dir)+dP2drr0dP-dPhy*y_first, select(disk0_hit_first, -dP, dP));
+        const vfloat<M> u_first = select(cone_hit_first, y_first*rcp_dPdP, select(disk0_hit_first, vfloat<M>(zero), vfloat<M>(one)));
+
+        /* invoke intersection filter for first hit */
+        RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_first,Ng_first);
+        const bool is_hit_first = epilog(valid_first, hit);
+
+        /* check for possible second hits before potentially accepted hit */
+        const vfloat<M> t_second = t_upper;
+        const vfloat<M> y_second = y_upper;
+        const vbool<M> valid_second = valid_lower & valid_upper & (dt+t_upper <= ray_tfar());
+        if (unlikely(none(valid_second))) return is_hit_first;
+        /* classify the second hit on its own: it may lie on a different surface than the first */
+        /* invoke intersection filter for second hit */
+        const vbool<M> cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper;
+        const vbool<M> disk0_hit_second = t_second == t_disk0;
+        const Vec3vf<M> Ng_second = select(cone_hit_second, dPdP2*(p0+t_second*ray_dir)+dP2drr0dP-dPhy*y_second, select(disk0_hit_second, -dP, dP));
+        const vfloat<M> u_second = select(cone_hit_second, y_second*rcp_dPdP, select(disk0_hit_second, vfloat<M>(zero), vfloat<M>(one)));
+
+        hit = RoundLineIntersectorHitM<M>(u_second,zero,dt+t_second,Ng_second);
+        const bool is_hit_second = epilog(valid_second, hit);
+
+        return is_hit_first | is_hit_second;
+      }
+    }
+
+    /* Hit record holding u, v, t and the geometric normal for M lanes, with per-lane accessors. */
+    template<int M>
+      struct ConeLineIntersectorHitM
+      {
+        vfloat<M> vu;
+        vfloat<M> vv;
+        vfloat<M> vt;
+        Vec3vf<M> vNg;
+
+        __forceinline ConeLineIntersectorHitM() {}
+        __forceinline ConeLineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+          : vu(u), vv(v), vt(t), vNg(Ng) {}
+
+        /* intentionally empty: this hit type has nothing to post-process */
+        __forceinline void finalize() {}
+
+        __forceinline Vec2f uv(const size_t i) const { return Vec2f(vu[i], vv[i]); }
+        __forceinline float t(const size_t i) const { return vt[i]; }
+        __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]); }
+      };
+    
+    /* Single-ray front end: broadcasts the ray to M lanes and forwards to __coneline_internal::intersectCone. */
+    template<int M>
+      struct ConeCurveIntersector1
+      {
+        typedef CurvePrecalculations1 Precalculations;
+        /* functor that reads ray.tfar each time it is called */
+        struct ray_tfar {
+          Ray& ray;
+          __forceinline ray_tfar(Ray& r) : ray(r) {}
+          __forceinline vfloat<M> operator() () const { return ray.tfar; }
+        };
+
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            Ray& ray,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const vbool<M>& cL, const vbool<M>& cR,
+                                            const Epilog& epilog)
+        {
+          const Vec3vf<M> org(ray.org.x, ray.org.y, ray.org.z);
+          const Vec3vf<M> dir(ray.dir.x, ray.dir.y, ray.dir.z);
+          const Vec4vf<M> p0 = enlargeRadiusToMinWidth(context,geom,org,v0i);
+          const Vec4vf<M> p1 = enlargeRadiusToMinWidth(context,geom,org,v1i);
+          return __coneline_internal::intersectCone(valid_i,org,dir,vfloat<M>(ray.tnear()),ray_tfar(ray),p0,p1,cL,cR,epilog);
+        }
+      };
+    
+    /* Packet front end: broadcasts ray k of a K-wide packet to M lanes and forwards to __coneline_internal::intersectCone. */
+    template<int M, int K>
+      struct ConeCurveIntersectorK
+      {
+        typedef CurvePrecalculationsK<K> Precalculations;
+        /* functor that reads ray.tfar[k] each time it is called */
+        struct ray_tfar {
+          RayK<K>& ray;
+          size_t k;
+          __forceinline ray_tfar(RayK<K>& r, size_t lane) : ray(r), k(lane) {}
+          __forceinline vfloat<M> operator() () const { return ray.tfar[k]; }
+        };
+
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            RayK<K>& ray, size_t k,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const vbool<M>& cL, const vbool<M>& cR,
+                                            const Epilog& epilog)
+        {
+          const Vec3vf<M> org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+          const Vec3vf<M> dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+          const Vec4vf<M> p0 = enlargeRadiusToMinWidth(context,geom,org,v0i);
+          const Vec4vf<M> p1 = enlargeRadiusToMinWidth(context,geom,org,v1i);
+          return __coneline_internal::intersectCone(valid_i,org,dir,vfloat<M>(ray.tnear()[k]),ray_tfar(ray,k),p0,p1,cL,cR,epilog);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h
new file mode 100644
index 0000000000..d47218eb8b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/conelinei_intersector.h
@@ -0,0 +1,141 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "coneline_intersector.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /* Single-ray cone-segment intersector for LineMi<M> blocks; endpoints are gathered without a time argument. */
+    template<int M, int Mx, bool filter>
+    struct ConeCurveMiIntersector1
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      /* Gathers the block's endpoints and cL/cR flags, then reports hits through Intersect1EpilogM. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom);
+        const vbool<Mx> valid = line.template valid<Mx>();
+        ConeCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      /* Same setup with Occluded1EpilogM; returns the intersector's result directly. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom);
+        const vbool<Mx> valid = line.template valid<Mx>();
+        return ConeCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    /* Single-ray cone-segment intersector for LineMi<M> blocks; endpoints are gathered at ray.time(). */
+    template<int M, int Mx, bool filter>
+    struct ConeCurveMiMBIntersector1
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      /* Gathers the block's endpoints and cL/cR flags for the ray's time, then reports hits through Intersect1EpilogM. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom,ray.time());
+        const vbool<Mx> valid = line.template valid<Mx>();
+        ConeCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      /* Same setup with Occluded1EpilogM; returns the intersector's result directly. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1; vbool<M> cL,cR;
+        line.gather(v0,v1,cL,cR,geom,ray.time());
+        const vbool<Mx> valid = line.template valid<Mx>();
+        return ConeCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,cL,cR,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    /* Cone-segment intersector for LineMi<M> blocks that tests ray k of a K-wide packet; gathers without a time argument. */
+    template<int M, int Mx, int K, bool filter>
+    struct ConeCurveMiIntersectorK
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0,p1; vbool<M> capL,capR;
+        line.gather(p0,p1,capL,capR,segments);
+        const vbool<Mx> active = line.template valid<Mx>();
+        ConeCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,capL,capR,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      /* occlusion variant: identical setup, but uses Occluded1KEpilogM and returns the intersector's result */
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0,p1; vbool<M> capL,capR;
+        line.gather(p0,p1,capL,capR,segments);
+        const vbool<Mx> active = line.template valid<Mx>();
+        return ConeCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,capL,capR,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+
+    /* Cone-segment intersector for LineMi<M> blocks that tests ray k of a K-wide packet; gathers at ray.time()[k]. */
+    template<int M, int Mx, int K, bool filter>
+    struct ConeCurveMiMBIntersectorK
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context,  const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0,p1; vbool<M> capL,capR;
+        line.gather(p0,p1,capL,capR,segments,ray.time()[k]);
+        const vbool<Mx> active = line.template valid<Mx>();
+        ConeCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,capL,capR,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      /* occlusion variant: identical setup, but uses Occluded1KEpilogM and returns the intersector's result */
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0,p1; vbool<M> capL,capR;
+        line.gather(p0,p1,capL,capR,segments,ray.time()[k]);
+        const vbool<Mx> active = line.template valid<Mx>();
+        return ConeCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,capL,capR,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi.h
new file mode 100644
index 0000000000..51384f1959
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi.h
@@ -0,0 +1,222 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  template<int M>
+    struct CurveNi // packed leaf block for up to M curve primitives; per-primitive axes and bounds are stored as int8/short
+  {
+    struct Type : public PrimitiveType {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* Returns maximum number of stored primitives */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; }
+    /* Returns the bytes needed for N primitives: N/M full blocks plus, if N%M != 0, one block sized for the remainder */
+    static __forceinline size_t bytes(size_t N)
+    {
+      const size_t f = N/M, r = N%M;
+      static_assert(sizeof(CurveNi) == 22+25*M, "internal data layout issue");
+      return f*sizeof(CurveNi) + (r!=0)*(22 + 25*r);
+    }
+
+  public:
+
+    /*! Default constructor. */
+    __forceinline CurveNi () {}
+
+    /*! Fills this block from prims[begin,_end): takes at most M entries and advances begin past them. */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene)
+    {  
+      size_t end = min(begin+M,_end);
+      N = (uint8_t)(end-begin);
+      const unsigned int geomID0 = prims[begin].geomID();
+      this->geomID(N) = geomID0;
+      ty = (uint8_t) scene->get(geomID0)->getType();
+
+      /* compute the merged bounds of all primitives in this block (all must share geomID0) */
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<N; i++)
+      {
+        const PrimRef& prim = prims[begin+i];
+        const unsigned int geomID = prim.geomID(); assert(geomID == geomID0);
+        const unsigned int primID = prim.primID();
+        bounds.extend(scene->get(geomID)->vbounds(primID));
+      }
+
+      /* calculate offset and scale; the scale is forced to 0 when the merged bounds have zero size */
+      Vec3fa loffset = bounds.lower;
+      float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f)));
+      if (bounds.size() == Vec3fa(zero)) lscale = 0.0f;
+      *this->offset(N) = loffset;
+      *this->scale(N) = lscale;
+      
+      /* encode all primitives */
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+        const PrimRef& prim = prims[begin];
+        const unsigned int geomID = prim.geomID();
+        const unsigned int primID = prim.primID();
+        const LinearSpace3fa space2 = scene->get(geomID)->computeAlignedSpace(primID);
+        
+        const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz));
+        const BBox3fa bounds = scene->get(geomID)->vbounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID);
+        /* store the truncated axes as int8_t and the bounds as short, lower rounded down and upper rounded up */
+        bounds_vx_x(N)[i] = (int8_t) space3.vx.x;
+        bounds_vx_y(N)[i] = (int8_t) space3.vx.y;
+        bounds_vx_z(N)[i] = (int8_t) space3.vx.z;
+        bounds_vx_lower(N)[i] = (short) clamp(floor(bounds.lower.x),-32767.0f,32767.0f);
+        bounds_vx_upper(N)[i] = (short) clamp(ceil (bounds.upper.x),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.lower.x) && floor(bounds.lower.x) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.upper.x) && ceil (bounds.upper.x) <= 32767.0f);
+
+        bounds_vy_x(N)[i] = (int8_t) space3.vy.x;
+        bounds_vy_y(N)[i] = (int8_t) space3.vy.y;
+        bounds_vy_z(N)[i] = (int8_t) space3.vy.z;
+        bounds_vy_lower(N)[i] = (short) clamp(floor(bounds.lower.y),-32767.0f,32767.0f);
+        bounds_vy_upper(N)[i] = (short) clamp(ceil (bounds.upper.y),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.lower.y) && floor(bounds.lower.y) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.upper.y) && ceil (bounds.upper.y) <= 32767.0f);
+
+        bounds_vz_x(N)[i] = (int8_t) space3.vz.x;
+        bounds_vz_y(N)[i] = (int8_t) space3.vz.y;
+        bounds_vz_z(N)[i] = (int8_t) space3.vz.z;
+        bounds_vz_lower(N)[i] = (short) clamp(floor(bounds.lower.z),-32767.0f,32767.0f);
+        bounds_vz_upper(N)[i] = (short) clamp(ceil (bounds.upper.z),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.lower.z) && floor(bounds.lower.z) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.upper.z) && ceil (bounds.upper.z) <= 32767.0f);
+               
+        this->primID(N)[i] = primID;
+      }
+    }
+    /*! Allocates bytes(set.size()) of storage, fills blocks(set.size()) blocks from the range and returns the encoded leaf. */
+    template<typename BVH, typename Allocator>
+      __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc)
+    {
+      size_t start = set.begin();
+      size_t items = CurveNi::blocks(set.size());
+      size_t numbytes = CurveNi::bytes(set.size());
+      CurveNi* accel = (CurveNi*) alloc.malloc1(numbytes,BVH::byteAlignment);
+      for (size_t i=0; i<items; i++) {
+        accel[i].fill(prims,start,set.end(),bvh->scene);
+      }
+      return bvh->encodeLeaf((int8_t*)accel,items);
+    };
+    
+  public:
+    
+    // 27.6 - 46 bytes per primitive
+    uint8_t ty;
+    uint8_t N;
+    uint8_t data[4+25*M+16];
+
+    /*
+    struct Layout
+    {
+      unsigned int geomID;
+      unsigned int primID[N];
+      
+      int8_t bounds_vx_x[N];
+      int8_t bounds_vx_y[N];
+      int8_t bounds_vx_z[N];
+      short bounds_vx_lower[N];
+      short bounds_vx_upper[N];
+      
+      int8_t bounds_vy_x[N];
+      int8_t bounds_vy_y[N];
+      int8_t bounds_vy_z[N];
+      short bounds_vy_lower[N];
+      short bounds_vy_upper[N];
+      
+      int8_t bounds_vz_x[N];
+      int8_t bounds_vz_y[N];
+      int8_t bounds_vz_z[N];
+      short bounds_vz_lower[N];
+      short bounds_vz_upper[N];
+      
+      Vec3f offset;
+      float scale;
+    };
+    */
+    /* Accessors into the packed bytes. The parameter N (shadowing the member) is the active count; offsets past primID scale with it. */
+    __forceinline       unsigned int& geomID(size_t N)       { return *(unsigned int*)((int8_t*)this+2); }
+    __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); }
+    
+    __forceinline       unsigned int* primID(size_t N)       { return (unsigned int*)((int8_t*)this+6); }
+    __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); }
+    
+    __forceinline       int8_t* bounds_vx_x(size_t N)       { return (int8_t*)((int8_t*)this+6+4*N); }
+    __forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); }
+    
+    __forceinline       int8_t* bounds_vx_y(size_t N)       { return (int8_t*)((int8_t*)this+6+5*N); }
+    __forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); }
+    
+    __forceinline       int8_t* bounds_vx_z(size_t N)       { return (int8_t*)((int8_t*)this+6+6*N); }
+    __forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); }
+    
+    __forceinline       short* bounds_vx_lower(size_t N)       { return (short*)((int8_t*)this+6+7*N); }
+    __forceinline const short* bounds_vx_lower(size_t N) const { return (short*)((int8_t*)this+6+7*N); }
+    
+    __forceinline       short* bounds_vx_upper(size_t N)       { return (short*)((int8_t*)this+6+9*N); }
+    __forceinline const short* bounds_vx_upper(size_t N) const { return (short*)((int8_t*)this+6+9*N); }
+    
+    __forceinline       int8_t* bounds_vy_x(size_t N)       { return (int8_t*)((int8_t*)this+6+11*N); }
+    __forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+11*N); }
+    
+    __forceinline       int8_t* bounds_vy_y(size_t N)       { return (int8_t*)((int8_t*)this+6+12*N); }
+    __forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+12*N); }
+    
+    __forceinline       int8_t* bounds_vy_z(size_t N)       { return (int8_t*)((int8_t*)this+6+13*N); }
+    __forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+13*N); }
+    
+    __forceinline       short* bounds_vy_lower(size_t N)       { return (short*)((int8_t*)this+6+14*N); }
+    __forceinline const short* bounds_vy_lower(size_t N) const { return (short*)((int8_t*)this+6+14*N); }
+    
+    __forceinline       short* bounds_vy_upper(size_t N)       { return (short*)((int8_t*)this+6+16*N); }
+    __forceinline const short* bounds_vy_upper(size_t N) const { return (short*)((int8_t*)this+6+16*N); }
+    
+    __forceinline       int8_t* bounds_vz_x(size_t N)       { return (int8_t*)((int8_t*)this+6+18*N); }
+    __forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+18*N); }
+    
+    __forceinline       int8_t* bounds_vz_y(size_t N)       { return (int8_t*)((int8_t*)this+6+19*N); }
+    __forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+19*N); }
+    
+    __forceinline       int8_t* bounds_vz_z(size_t N)       { return (int8_t*)((int8_t*)this+6+20*N); }
+    __forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+20*N); }
+    
+    __forceinline       short* bounds_vz_lower(size_t N)       { return (short*)((int8_t*)this+6+21*N); }
+    __forceinline const short* bounds_vz_lower(size_t N) const { return (short*)((int8_t*)this+6+21*N); }
+    
+    __forceinline       short* bounds_vz_upper(size_t N)       { return (short*)((int8_t*)this+6+23*N); }
+    __forceinline const short* bounds_vz_upper(size_t N) const { return (short*)((int8_t*)this+6+23*N); }
+    
+    __forceinline       Vec3f* offset(size_t N)       { return (Vec3f*)((int8_t*)this+6+25*N); }
+    __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+25*N); }
+    
+    __forceinline       float* scale(size_t N)       { return (float*)((int8_t*)this+6+25*N+12); }
+    __forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+25*N+12); }
+
+    __forceinline       int8_t* end(size_t N)       { return (int8_t*)this+6+25*N+16; }
+    __forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+25*N+16; }
+  };
+
+  template<int M>
+    typename CurveNi<M>::Type CurveNi<M>::type; // out-of-class definition of the static member declared in CurveNi
+  /* aliases for the two block capacities M = 4 and M = 8 */
+  typedef CurveNi<4> Curve4i;
+  typedef CurveNi<8> Curve8i;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h
new file mode 100644
index 0000000000..0f9038c9fc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi_intersector.h
@@ -0,0 +1,569 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNi.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct CurveNiIntersector1
+    {
+      typedef CurveNi<M> Primitive;
+      typedef Vec3vf<M> Vec3vfM;
+      typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o) // tests ray against the bounds of all curves in prim; returns the passing lanes and writes each lane's entry distance to tNear_o
+      {
+        const size_t N = prim.N; // lanes with index >= N are masked out in the return statement
+        const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); // one 4-float load at prim.offset(N); prim.scale(N) lies 12 bytes after it
+        const Vec3fa offset = Vec3fa(offset_scale);
+        const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); // NOTE(review): presumably broadcasts element 3 (the scale) to all components — confirm
+        const Vec3fa org1 = (ray.org-offset)*scale; // ray origin relative to the block offset, multiplied by scale
+        const Vec3fa dir1 = ray.dir*scale; // ray direction multiplied by the same scale
+        
+        const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)), // nine per-lane components (vx, vy, vz) loaded from the block
+                                    vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+                                    vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+
+        const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); // ray direction transformed by each lane's space
+        const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); // ray origin transformed by each lane's space
+        const Vec3vfM rcp_dir2 = rcp_safe(dir2); // NOTE(review): presumably a reciprocal guarded against zero components — confirm
+       
+        const vfloat<M> t_lower_x = (vfloat<M>::load(prim.bounds_vx_lower(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); // per-axis ray parameters at the stored lower/upper bounds
+        const vfloat<M> t_upper_x = (vfloat<M>::load(prim.bounds_vx_upper(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_lower_y = (vfloat<M>::load(prim.bounds_vy_lower(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_upper_y = (vfloat<M>::load(prim.bounds_vy_upper(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_lower_z = (vfloat<M>::load(prim.bounds_vz_lower(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        const vfloat<M> t_upper_z = (vfloat<M>::load(prim.bounds_vz_upper(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+
+        const vfloat<M> round_up  (1.0f+3.0f*float(ulp)); // factors (1 +/- 3*ulp) applied to tFar / tNear below
+        const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+        const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear())); // max over the per-axis mini() values and ray.tnear()
+        const vfloat<M> tFar  = round_up  *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar)); // min over the per-axis maxi() values and ray.tfar
+        tNear_o = tNear;
+        return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); // NOTE(review): vint<M>(step) presumably holds lane indices 0..M-1 — confirm
+      }
+
+      /* Runs Intersector on every curve of prim whose bounds the ray passes (a0..a3 gathered per curve), prefetching vertices of the next pending curves. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+          /* prefetch for the next one/two pending lanes; bit-scan only inside the guards, since bsf/bscf of an empty mask is not well defined */
+          if (mask) {
+            size_t mask1 = mask;
+            const size_t i1 = bscf(mask1);
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          /* after the curve test, lanes whose entry distance exceeds the current ray.tfar are dropped */
+          Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+
+      /* Returns true as soon as Intersector reports a hit on a curve of prim whose bounds the ray passes (a0..a3 gathered per curve); false if none does. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+          /* prefetch for the next one/two pending lanes; bit-scan only inside the guards, since bsf/bscf of an empty mask is not well defined */
+          if (mask) {
+            size_t mask1 = mask;
+            const size_t i1 = bscf(mask1);
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          /* first reported hit ends the query */
+          if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)))
+            return true;
+          /* otherwise drop lanes whose entry distance exceeds the current ray.tfar */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      /* Runs Intersector on every curve of prim whose bounds the ray passes, gathering a0..a3 plus n0..n3 per curve and prefetching vertices of the next pending curves. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+
+          unsigned int vertexID = geom->curve(primID);
+          Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+          /* prefetch for the next one/two pending lanes; bit-scan only inside the guards, since bsf/bscf of an empty mask is not well defined */
+          if (mask) {
+            size_t mask1 = mask;
+            const size_t i1 = bscf(mask1);
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          /* after the curve test, lanes whose entry distance exceeds the current ray.tfar are dropped */
+          Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+
+      /* Returns true as soon as Intersector reports a hit on a curve of prim whose bounds the ray passes (a0..a3 and n0..n3 gathered per curve); false if none does. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+
+          unsigned int vertexID = geom->curve(primID);
+          Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+          /* prefetch for the next one/two pending lanes; bit-scan only inside the guards, since bsf/bscf of an empty mask is not well defined */
+          if (mask) {
+            size_t mask1 = mask;
+            const size_t i1 = bscf(mask1);
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          /* first reported hit ends the query */
+          if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,context,geomID,primID)))
+            return true;
+          /* otherwise drop lanes whose entry distance exceeds the current ray.tfar */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+
+      /* Runs Intersector on every curve of prim whose bounds the ray passes, using p0,t0,p1,t1 from gather_hermite. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> boxDist;
+        vbool<M> hitLanes = intersect(ray,prim,boxDist);
+        const size_t numCurves = prim.N;
+        /* one pending lane per iteration; afterwards keep only lanes whose entry distance is within the current ray.tfar */
+        for (size_t pending = movemask(hitLanes); pending != 0; pending &= movemask(boxDist <= vfloat<M>(ray.tfar)))
+        {
+          const size_t lane = bscf(pending);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[lane];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1;
+          geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+          Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID));
+        }
+      }
+
+      /* Returns true as soon as Intersector reports a hit on a curve of prim whose bounds the ray passes (p0,t0,p1,t1 from gather_hermite); false otherwise. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> boxDist;
+        vbool<M> hitLanes = intersect(ray,prim,boxDist);
+        const size_t numCurves = prim.N;
+        /* one pending lane per iteration; afterwards keep only lanes whose entry distance is within the current ray.tfar */
+        for (size_t pending = movemask(hitLanes); pending != 0; pending &= movemask(boxDist <= vfloat<M>(ray.tfar)))
+        {
+          const size_t lane = bscf(pending);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[lane];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1;
+          geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+          if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID))) {
+            return true;
+          }
+        }
+        return false;
+      }
+
+      /* Runs Intersector on every curve of prim whose bounds the ray passes, using p/t and n/dn values from gather_hermite. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> boxDist;
+        vbool<M> hitLanes = intersect(ray,prim,boxDist);
+        const size_t numCurves = prim.N;
+        /* one pending lane per iteration; afterwards keep only lanes whose entry distance is within the current ray.tfar */
+        for (size_t pending = movemask(hitLanes); pending != 0; pending &= movemask(boxDist <= vfloat<M>(ray.tfar)))
+        {
+          const size_t lane = bscf(pending);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[lane];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1;
+          geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+          Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID));
+        }
+      }
+
+      /* Returns true as soon as Intersector reports a hit on a curve of prim whose bounds the ray passes (p/t and n/dn values from gather_hermite); false otherwise. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> boxDist;
+        vbool<M> hitLanes = intersect(ray,prim,boxDist);
+        const size_t numCurves = prim.N;
+        /* one pending lane per iteration; afterwards keep only lanes whose entry distance is within the current ray.tfar */
+        for (size_t pending = movemask(hitLanes); pending != 0; pending &= movemask(boxDist <= vfloat<M>(ray.tfar)))
+        {
+          const size_t lane = bscf(pending);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[lane];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1;
+          geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+          if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,context,geomID,primID))) {
+            return true;
+          }
+        }
+        return false;
+      }
+    };
+
+    template<int M, int K>
+      struct CurveNiIntersectorK
+    {
+      typedef CurveNi<M> Primitive;
+      typedef Vec3vf<M> Vec3vfM;
+      typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+      typedef CurvePrecalculationsK<K> Precalculations;
+      
+      static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o) // tests ray k of the packet against the bounds of all curves in prim; returns the passing lanes and writes each lane's entry distance to tNear_o
+      {
+        const size_t N = prim.N; // lanes with index >= N are masked out in the return statement
+        const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); // one 4-float load at prim.offset(N); prim.scale(N) lies 12 bytes after it
+        const Vec3fa offset = Vec3fa(offset_scale);
+        const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); // NOTE(review): presumably broadcasts element 3 (the scale) to all components — confirm
+
+        const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]); // extract origin of ray k from the packet
+        const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]); // extract direction of ray k from the packet
+        const Vec3fa org1 = (ray_org-offset)*scale; // ray origin relative to the block offset, multiplied by scale
+        const Vec3fa dir1 = ray_dir*scale; // ray direction multiplied by the same scale
+        
+        const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)), // nine per-lane components (vx, vy, vz) loaded from the block
+                                    vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+                                    vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+
+        const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1)); // ray direction transformed by each lane's space
+        const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1)); // ray origin transformed by each lane's space
+        const Vec3vfM rcp_dir2 = rcp_safe(dir2); // NOTE(review): presumably a reciprocal guarded against zero components — confirm
+       
+        const vfloat<M> t_lower_x = (vfloat<M>::load(prim.bounds_vx_lower(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x); // per-axis ray parameters at the stored lower/upper bounds
+        const vfloat<M> t_upper_x = (vfloat<M>::load(prim.bounds_vx_upper(N))-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_lower_y = (vfloat<M>::load(prim.bounds_vy_lower(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_upper_y = (vfloat<M>::load(prim.bounds_vy_upper(N))-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_lower_z = (vfloat<M>::load(prim.bounds_vz_lower(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        const vfloat<M> t_upper_z = (vfloat<M>::load(prim.bounds_vz_upper(N))-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+
+        const vfloat<M> round_up  (1.0f+3.0f*float(ulp)); // factors (1 +/- 3*ulp) applied to tFar / tNear below
+        const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+        const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()[k])); // max over the per-axis mini() values and ray.tnear()[k]
+        const vfloat<M> tFar  = round_up  *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar[k])); // min over the per-axis maxi() values and ray.tfar[k]
+        tNear_o = tNear;
+        return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); // NOTE(review): vint<M>(step) presumably holds lane indices 0..M-1 — confirm
+      }
+      
+      /* Runs Intersector for ray k on every curve of prim whose bounds the ray passes (a0..a3 gathered per curve), prefetching vertices of the next pending curves. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+          /* prefetch for the next one/two pending lanes; bit-scan only inside the guards, since bsf/bscf of an empty mask is not well defined */
+          if (mask) {
+            size_t mask1 = mask;
+            const size_t i1 = bscf(mask1);
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          /* after the curve test, lanes whose entry distance exceeds the current ray.tfar[k] are dropped */
+          Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+      }
+      
+      /* Returns true as soon as Intersector reports a hit for ray k on a curve of prim whose bounds the ray passes (a0..a3 gathered per curve); false if none does. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID));
+          /* prefetch for the next one/two pending lanes; bit-scan only inside the guards, since bsf/bscf of an empty mask is not well defined */
+          if (mask) {
+            size_t mask1 = mask;
+            const size_t i1 = bscf(mask1);
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          /* first reported hit ends the query */
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          /* otherwise drop lanes whose entry distance exceeds the current ray.tfar[k] */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+
+      /* Runs Intersector for ray k on every curve of prim whose bounds the ray passes, gathering a0..a3 plus n0..n3 per curve and prefetching vertices of the next pending curves. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+
+          unsigned int vertexID = geom->curve(primID);
+          Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+          /* prefetch for the next one/two pending lanes; bit-scan only inside the guards, since bsf/bscf of an empty mask is not well defined */
+          if (mask) {
+            size_t mask1 = mask;
+            const size_t i1 = bscf(mask1);
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          /* after the curve test, lanes whose entry distance exceeds the current ray.tfar[k] are dropped */
+          Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+      }
+      
+      /* Returns true as soon as Intersector reports a hit for ray k on a curve of prim whose bounds the ray passes (a0..a3 and n0..n3 gathered per curve); false if none does. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,k,prim,tNear);
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+
+          unsigned int vertexID = geom->curve(primID);
+          Vec3ff a0,a1,a2,a3; Vec3fa n0,n1,n2,n3; geom->gather(a0,a1,a2,a3,n0,n1,n2,n3,vertexID);
+          /* prefetch for the next one/two pending lanes; bit-scan only inside the guards, since bsf/bscf of an empty mask is not well defined */
+          if (mask) {
+            size_t mask1 = mask;
+            const size_t i1 = bscf(mask1);
+            const unsigned int primID1 = prim.primID(N)[i1];
+            geom->prefetchL1_vertices(geom->curve(primID1));
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              const unsigned int primID2 = prim.primID(N)[i2];
+              geom->prefetchL2_vertices(geom->curve(primID2));
+            }
+          }
+          /* first reported hit ends the query */
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,n0,n1,n2,n3,Epilog(ray,k,context,geomID,primID)))
+            return true;
+          /* otherwise drop lanes whose entry distance exceeds the current ray.tfar[k] */
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k]));
+        }
+        return false;
+      }
+
+      /* Runs Intersector for ray k on every curve of prim whose bounds the ray passes, using p0,t0,p1,t1 from gather_hermite. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> boxDist;
+        vbool<M> hitLanes = intersect(ray,k,prim,boxDist);
+        const size_t numCurves = prim.N;
+        /* one pending lane per iteration; afterwards keep only lanes whose entry distance is within the current ray.tfar[k] */
+        for (size_t pending = movemask(hitLanes); pending != 0; pending &= movemask(boxDist <= vfloat<M>(ray.tfar[k])))
+        {
+          const size_t lane = bscf(pending);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[lane];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1;
+          geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+          Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID));
+        }
+      }
+      
+      /* Returns true as soon as Intersector reports a hit for ray k on a curve of prim whose bounds the ray passes (p0,t0,p1,t1 from gather_hermite); false otherwise. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> boxDist;
+        vbool<M> hitLanes = intersect(ray,k,prim,boxDist);
+        const size_t numCurves = prim.N;
+        /* one pending lane per iteration; afterwards keep only lanes whose entry distance is within the current ray.tfar[k] */
+        for (size_t pending = movemask(hitLanes); pending != 0; pending &= movemask(boxDist <= vfloat<M>(ray.tfar[k])))
+        {
+          const size_t lane = bscf(pending);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[lane];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1;
+          geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID));
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID))) {
+            return true;
+          }
+        }
+        return false;
+      }
+
+      /* Runs Intersector for ray k on every curve of prim whose bounds the ray passes, using p/t and n/dn values from gather_hermite. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> boxDist;
+        vbool<M> hitLanes = intersect(ray,k,prim,boxDist);
+        const size_t numCurves = prim.N;
+        /* one pending lane per iteration; afterwards keep only lanes whose entry distance is within the current ray.tfar[k] */
+        for (size_t pending = movemask(hitLanes); pending != 0; pending &= movemask(boxDist <= vfloat<M>(ray.tfar[k])))
+        {
+          const size_t lane = bscf(pending);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[lane];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1;
+          geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+          Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID));
+        }
+      }
+      
+      /* Returns true as soon as Intersector reports a hit for ray k on a curve of prim whose bounds the ray passes (p/t and n/dn values from gather_hermite); false otherwise. */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> boxDist;
+        vbool<M> hitLanes = intersect(ray,k,prim,boxDist);
+        const size_t numCurves = prim.N;
+        /* one pending lane per iteration; afterwards keep only lanes whose entry distance is within the current ray.tfar[k] */
+        for (size_t pending = movemask(hitLanes); pending != 0; pending &= movemask(boxDist <= vfloat<M>(ray.tfar[k])))
+        {
+          const size_t lane = bscf(pending);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[lane];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; Vec3fa n0,dn0,n1,dn1;
+          geom->gather_hermite(p0,t0,n0,dn0,p1,t1,n1,dn1,geom->curve(primID));
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,n0,dn0,n1,dn1,Epilog(ray,k,context,geomID,primID))) {
+            return true;
+          }
+        }
+        return false;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h
new file mode 100644
index 0000000000..0cd8f833fd
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb.h
@@ -0,0 +1,278 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  template<int M>
+    struct CurveNiMB // leaf block of up to M motion-blurred curves: ids plus per-curve int8 basis and int16 bounds, packed into one byte array
+  {
+    struct Type : public PrimitiveType {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* Returns maximum number of stored primitives */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; }
+    /* Returns the bytes needed for N primitives: f full blocks plus one tightly packed block for the r leftover primitives */
+    static __forceinline size_t bytes(size_t N)
+    {
+      const size_t f = N/M, r = N%M;
+      static_assert(sizeof(CurveNiMB) == 6+37*M+24, "internal data layout issue");
+      return f*sizeof(CurveNiMB) + (r!=0)*(6+37*r+24);
+    }
+
+  public:
+
+    /*! Default constructor. */
+    __forceinline CurveNiMB () {}
+
+    /*! fills this block with up to M curves taken from prims[begin.._end), advances begin past them and returns their merged linear bounds over time_range */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t _end, Scene* scene, const BBox1f time_range)
+    {
+      size_t end = min(begin+M,_end); // this block takes at most M primitives
+      N = (uint8_t)(end-begin);
+      const unsigned int geomID0 = prims[begin].geomID();
+      this->geomID(N) = geomID0; // a single geomID is stored per block, taken from the first primitive
+      ty = (uint8_t) scene->get(geomID0)->getType();
+
+      /* merge the linear bounds of all primitives of this block */
+      LBBox3fa lbounds = empty;
+      for (size_t i=0; i<N; i++)
+      {
+        const PrimRefMB& prim = prims[begin+i];
+        const unsigned int geomID = prim.geomID(); assert(geomID == geomID0); // debug check: every curve of the block shares geomID0
+        const unsigned int primID = prim.primID();
+        lbounds.extend(scene->get(geomID)->vlinearBounds(primID,time_range));
+      }
+      BBox3fa bounds = lbounds.bounds();
+
+      /* calculate offset and scale: per axis 256/(size*sqrt(3)), minimum over the axes; forced to 0 for a zero-size box */
+      Vec3fa loffset = bounds.lower;
+      float lscale = reduce_min(256.0f/(bounds.size()*sqrt(3.0f)));
+      if (bounds.size() == Vec3fa(zero)) lscale = 0.0f;
+      *this->offset(N) = loffset;
+      *this->scale(N) = lscale;
+      this->time_offset(N) = time_range.lower; // intersectors compute ltime = (ray time - time_offset) * time_scale
+      this->time_scale(N) = 1.0f/time_range.size(); // NOTE(review): not finite for a zero-length time_range - confirm callers never pass one
+      
+      /* encode all primitives: int8 basis vectors and int16 bounds per curve; this loop also advances begin */
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+        const PrimRefMB& prim = prims[begin];
+        const unsigned int geomID = prim.geomID();
+        const unsigned int primID = prim.primID();
+        const LinearSpace3fa space2 = scene->get(geomID)->computeAlignedSpaceMB(primID,time_range);
+        // scale the space by 126 and truncate; the components are stored as int8 below
+        const LinearSpace3fa space3(trunc(126.0f*space2.vx),trunc(126.0f*space2.vy),trunc(126.0f*space2.vz));
+        const LBBox3fa bounds = scene->get(geomID)->vlinearBounds(loffset,lscale,max(length(space3.vx),length(space3.vy),length(space3.vz)),space3.transposed(),primID,time_range); // per-curve bounds; shadows the block-wide `bounds` above
+        
+        // NOTE: this weird (int8_t) (short) cast works around VS2015 Win32 compiler bug
+        bounds_vx_x(N)[i] = (int8_t) (short) space3.vx.x;
+        bounds_vx_y(N)[i] = (int8_t) (short) space3.vx.y;
+        bounds_vx_z(N)[i] = (int8_t) (short) space3.vx.z;
+        bounds_vx_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.x),-32767.0f,32767.0f); // lower bounds are rounded down, upper bounds up, then clamped to the int16 range
+        bounds_vx_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.x),-32767.0f,32767.0f);
+        bounds_vx_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.x),-32767.0f,32767.0f);
+        bounds_vx_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.x),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds0.lower.x) && floor(bounds.bounds0.lower.x) <= 32767.0f); // debug check: the clamp above did not actually change the value
+        assert(-32767.0f <= ceil (bounds.bounds0.upper.x) && ceil (bounds.bounds0.upper.x) <= 32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds1.lower.x) && floor(bounds.bounds1.lower.x) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds1.upper.x) && ceil (bounds.bounds1.upper.x) <= 32767.0f);
+        
+        bounds_vy_x(N)[i] = (int8_t) (short) space3.vy.x;
+        bounds_vy_y(N)[i] = (int8_t) (short) space3.vy.y;
+        bounds_vy_z(N)[i] = (int8_t) (short) space3.vy.z;
+        bounds_vy_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.y),-32767.0f,32767.0f);
+        bounds_vy_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.y),-32767.0f,32767.0f);
+        bounds_vy_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.y),-32767.0f,32767.0f);
+        bounds_vy_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.y),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds0.lower.y) && floor(bounds.bounds0.lower.y) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds0.upper.y) && ceil (bounds.bounds0.upper.y) <= 32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds1.lower.y) && floor(bounds.bounds1.lower.y) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds1.upper.y) && ceil (bounds.bounds1.upper.y) <= 32767.0f);
+
+        bounds_vz_x(N)[i] = (int8_t) (short) space3.vz.x;
+        bounds_vz_y(N)[i] = (int8_t) (short) space3.vz.y;
+        bounds_vz_z(N)[i] = (int8_t) (short) space3.vz.z;
+        bounds_vz_lower0(N)[i] = (short) clamp(floor(bounds.bounds0.lower.z),-32767.0f,32767.0f);
+        bounds_vz_upper0(N)[i] = (short) clamp(ceil (bounds.bounds0.upper.z),-32767.0f,32767.0f);
+        bounds_vz_lower1(N)[i] = (short) clamp(floor(bounds.bounds1.lower.z),-32767.0f,32767.0f);
+        bounds_vz_upper1(N)[i] = (short) clamp(ceil (bounds.bounds1.upper.z),-32767.0f,32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds0.lower.z) && floor(bounds.bounds0.lower.z) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds0.upper.z) && ceil (bounds.bounds0.upper.z) <= 32767.0f);
+        assert(-32767.0f <= floor(bounds.bounds1.lower.z) && floor(bounds.bounds1.lower.z) <= 32767.0f);
+        assert(-32767.0f <= ceil (bounds.bounds1.upper.z) && ceil (bounds.bounds1.upper.z) <= 32767.0f);
+               
+        this->primID(N)[i] = primID;
+      }
+      
+      return lbounds;
+    }
+    /*! allocates storage for all primitives of the set, fills the blocks one after another and returns the leaf node record with the merged linear bounds */
+    template<typename BVH, typename SetMB, typename Allocator>
+    __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc)
+    {
+      size_t start = prims.begin();
+      size_t end   = prims.end();
+      size_t items = CurveNiMB::blocks(prims.size());
+      size_t numbytes = CurveNiMB::bytes(prims.size());
+      CurveNiMB* accel = (CurveNiMB*) alloc.malloc1(numbytes,BVH::byteAlignment);
+      const typename BVH::NodeRef node = bvh->encodeLeaf((int8_t*)accel,items);
+      
+      LBBox3fa bounds = empty;
+      for (size_t i=0; i<items; i++) // fillMB advances `start`, so each block consumes the next run of primitives
+        bounds.extend(accel[i].fillMB(prims.prims->data(),start,end,bvh->scene,prims.time_range));
+      
+      return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range);
+    }; // NOTE(review): the ';' after the function body is redundant
+
+    
+  public:
+    
+    // a block holding N curves occupies 6+37*N+24 bytes (see bytes()), i.e. 37 + 30/N bytes per curve
+    uint8_t ty;   // value of scene->get(geomID)->getType(), truncated to 8 bits by fillMB
+    uint8_t N;    // number of curves actually stored in this block (set by fillMB, at most M)
+    uint8_t data[4+37*M+24]; // geomID, then the per-curve arrays, then offset/scale/time fields; see Layout below
+
+    /*
+    struct Layout
+    {
+      unsigned int geomID;
+      unsigned int primID[N];
+      
+      int8_t bounds_vx_x[N];
+      int8_t bounds_vx_y[N];
+      int8_t bounds_vx_z[N];
+      short bounds_vx_lower0[N];
+      short bounds_vx_upper0[N];
+      short bounds_vx_lower1[N];
+      short bounds_vx_upper1[N];
+      
+      int8_t bounds_vy_x[N];
+      int8_t bounds_vy_y[N];
+      int8_t bounds_vy_z[N];
+      short bounds_vy_lower0[N];
+      short bounds_vy_upper0[N];
+      short bounds_vy_lower1[N];
+      short bounds_vy_upper1[N];
+      
+      int8_t bounds_vz_x[N];
+      int8_t bounds_vz_y[N];
+      int8_t bounds_vz_z[N];
+      short bounds_vz_lower0[N];
+      short bounds_vz_upper0[N];
+      short bounds_vz_lower1[N];
+      short bounds_vz_upper1[N];
+      
+      Vec3f offset;
+      float scale;
+
+      float time_offset;
+      float time_scale;
+    };
+    */
+    // Field accessors. Offsets are byte offsets from `this` and depend on the stored curve count N (not on M), so a partially filled block is packed tightly.
+    __forceinline       unsigned int& geomID(size_t N)       { return *(unsigned int*)((int8_t*)this+2); } // N is unused here and in primID; NOTE(review): 4-byte access at byte offset 2 - confirm all targets tolerate the unaligned access
+    __forceinline const unsigned int& geomID(size_t N) const { return *(unsigned int*)((int8_t*)this+2); }
+    
+    __forceinline       unsigned int* primID(size_t N)       { return (unsigned int*)((int8_t*)this+6); }
+    __forceinline const unsigned int* primID(size_t N) const { return (unsigned int*)((int8_t*)this+6); }
+    // vx group: int8 components of the first basis vector, then int16 lower/upper bounds along it (suffix 0 and 1 are the two LBBox3fa bounds, bounds0 and bounds1)
+    __forceinline       int8_t* bounds_vx_x(size_t N)       { return (int8_t*)((int8_t*)this+6+4*N); }
+    __forceinline const int8_t* bounds_vx_x(size_t N) const { return (int8_t*)((int8_t*)this+6+4*N); }
+    
+    __forceinline       int8_t* bounds_vx_y(size_t N)       { return (int8_t*)((int8_t*)this+6+5*N); }
+    __forceinline const int8_t* bounds_vx_y(size_t N) const { return (int8_t*)((int8_t*)this+6+5*N); }
+    
+    __forceinline       int8_t* bounds_vx_z(size_t N)       { return (int8_t*)((int8_t*)this+6+6*N); }
+    __forceinline const int8_t* bounds_vx_z(size_t N) const { return (int8_t*)((int8_t*)this+6+6*N); }
+    
+    __forceinline       short* bounds_vx_lower0(size_t N)       { return (short*)((int8_t*)this+6+7*N); }
+    __forceinline const short* bounds_vx_lower0(size_t N) const { return (short*)((int8_t*)this+6+7*N); }
+    
+    __forceinline       short* bounds_vx_upper0(size_t N)       { return (short*)((int8_t*)this+6+9*N); }
+    __forceinline const short* bounds_vx_upper0(size_t N) const { return (short*)((int8_t*)this+6+9*N); }
+
+    __forceinline       short* bounds_vx_lower1(size_t N)       { return (short*)((int8_t*)this+6+11*N); }
+    __forceinline const short* bounds_vx_lower1(size_t N) const { return (short*)((int8_t*)this+6+11*N); }
+    
+    __forceinline       short* bounds_vx_upper1(size_t N)       { return (short*)((int8_t*)this+6+13*N); }
+    __forceinline const short* bounds_vx_upper1(size_t N) const { return (short*)((int8_t*)this+6+13*N); }
+    // vy group: same arrangement for the second basis vector
+    __forceinline       int8_t* bounds_vy_x(size_t N)       { return (int8_t*)((int8_t*)this+6+15*N); }
+    __forceinline const int8_t* bounds_vy_x(size_t N) const { return (int8_t*)((int8_t*)this+6+15*N); }
+    
+    __forceinline       int8_t* bounds_vy_y(size_t N)       { return (int8_t*)((int8_t*)this+6+16*N); }
+    __forceinline const int8_t* bounds_vy_y(size_t N) const { return (int8_t*)((int8_t*)this+6+16*N); }
+    
+    __forceinline       int8_t* bounds_vy_z(size_t N)       { return (int8_t*)((int8_t*)this+6+17*N); }
+    __forceinline const int8_t* bounds_vy_z(size_t N) const { return (int8_t*)((int8_t*)this+6+17*N); }
+    
+    __forceinline       short* bounds_vy_lower0(size_t N)       { return (short*)((int8_t*)this+6+18*N); }
+    __forceinline const short* bounds_vy_lower0(size_t N) const { return (short*)((int8_t*)this+6+18*N); }
+    
+    __forceinline       short* bounds_vy_upper0(size_t N)       { return (short*)((int8_t*)this+6+20*N); }
+    __forceinline const short* bounds_vy_upper0(size_t N) const { return (short*)((int8_t*)this+6+20*N); }
+
+    __forceinline       short* bounds_vy_lower1(size_t N)       { return (short*)((int8_t*)this+6+22*N); }
+    __forceinline const short* bounds_vy_lower1(size_t N) const { return (short*)((int8_t*)this+6+22*N); }
+    
+    __forceinline       short* bounds_vy_upper1(size_t N)       { return (short*)((int8_t*)this+6+24*N); }
+    __forceinline const short* bounds_vy_upper1(size_t N) const { return (short*)((int8_t*)this+6+24*N); }
+    // vz group: same arrangement for the third basis vector
+    __forceinline       int8_t* bounds_vz_x(size_t N)       { return (int8_t*)((int8_t*)this+6+26*N); }
+    __forceinline const int8_t* bounds_vz_x(size_t N) const { return (int8_t*)((int8_t*)this+6+26*N); }
+    
+    __forceinline       int8_t* bounds_vz_y(size_t N)       { return (int8_t*)((int8_t*)this+6+27*N); }
+    __forceinline const int8_t* bounds_vz_y(size_t N) const { return (int8_t*)((int8_t*)this+6+27*N); }
+    
+    __forceinline       int8_t* bounds_vz_z(size_t N)       { return (int8_t*)((int8_t*)this+6+28*N); }
+    __forceinline const int8_t* bounds_vz_z(size_t N) const { return (int8_t*)((int8_t*)this+6+28*N); }
+    
+    __forceinline       short* bounds_vz_lower0(size_t N)       { return (short*)((int8_t*)this+6+29*N); }
+    __forceinline const short* bounds_vz_lower0(size_t N) const { return (short*)((int8_t*)this+6+29*N); }
+    
+    __forceinline       short* bounds_vz_upper0(size_t N)       { return (short*)((int8_t*)this+6+31*N); }
+    __forceinline const short* bounds_vz_upper0(size_t N) const { return (short*)((int8_t*)this+6+31*N); }
+
+    __forceinline       short* bounds_vz_lower1(size_t N)       { return (short*)((int8_t*)this+6+33*N); }
+    __forceinline const short* bounds_vz_lower1(size_t N) const { return (short*)((int8_t*)this+6+33*N); }
+    
+    __forceinline       short* bounds_vz_upper1(size_t N)       { return (short*)((int8_t*)this+6+35*N); }
+    __forceinline const short* bounds_vz_upper1(size_t N) const { return (short*)((int8_t*)this+6+35*N); }
+    // block-wide trailer: offset (12 bytes) directly followed by scale, then the time mapping written by fillMB
+    __forceinline       Vec3f* offset(size_t N)       { return (Vec3f*)((int8_t*)this+6+37*N); }
+    __forceinline const Vec3f* offset(size_t N) const { return (Vec3f*)((int8_t*)this+6+37*N); }
+    
+    __forceinline       float* scale(size_t N)       { return (float*)((int8_t*)this+6+37*N+12); }
+    __forceinline const float* scale(size_t N) const { return (float*)((int8_t*)this+6+37*N+12); }
+
+    __forceinline       float& time_offset(size_t N)       { return *(float*)((int8_t*)this+6+37*N+16); }
+    __forceinline const float& time_offset(size_t N) const { return *(float*)((int8_t*)this+6+37*N+16); }
+    
+    __forceinline       float& time_scale(size_t N)       { return *(float*)((int8_t*)this+6+37*N+20); }
+    __forceinline const float& time_scale(size_t N) const { return *(float*)((int8_t*)this+6+37*N+20); }
+    // first byte past a block that stores N curves
+    __forceinline       int8_t* end(size_t N)       { return (int8_t*)this+6+37*N+24; }
+    __forceinline const int8_t* end(size_t N) const { return (int8_t*)this+6+37*N+24; }
+  };
+
+  /* out-of-class definition of the static `type` member declared in CurveNiMB */
+  template<int M> typename CurveNiMB<M>::Type CurveNiMB<M>::type;
+  /* named block widths */
+  using Curve4iMB = CurveNiMB<4>;
+  using Curve8iMB = CurveNiMB<8>;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h
new file mode 100644
index 0000000000..0cbc764668
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNi_mb_intersector.h
@@ -0,0 +1,516 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNi_mb.h"
+#include "../subdiv/linear_bezier_patch.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+      struct CurveNiMBIntersector1 // single-ray intersection and occlusion routines for CurveNiMB<M> blocks
+    {
+      typedef CurveNiMB<M> Primitive;
+      typedef Vec3vf<M> Vec3vfM;
+      typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+      typedef CurvePrecalculations1 Precalculations;
+      /*! slab test of the ray against the time-interpolated bounds of all curves in the block; returns the lane mask of overlapping curves and writes the per-lane entry distance to tNear_o */
+      static __forceinline vbool<M> intersect(Ray& ray, const Primitive& prim, vfloat<M>& tNear_o)
+      {
+        const size_t N = prim.N;
+        const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); // offset (xyz) and scale are adjacent in the block, so one 4-float load fetches both
+        const Vec3fa offset = Vec3fa(offset_scale);
+        const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); // broadcast lane 3 (the scale)
+        const Vec3fa org1 = (ray.org-offset)*scale; // ray expressed relative to the block's offset and scale
+        const Vec3fa dir1 = ray.dir*scale;
+        // per-curve basis, one curve per SIMD lane (components are stored as int8 in the block)
+        const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+                                    vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+                                    vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+
+        const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+        const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+        const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+
+        const vfloat<M> ltime = (ray.time()-prim.time_offset(N))*prim.time_scale(N); // ray time remapped with the block's time offset/scale; blends the *0 and *1 bounds below
+        const vfloat<M> vx_lower0 = vfloat<M>::load(prim.bounds_vx_lower0(N));
+        const vfloat<M> vx_lower1 = vfloat<M>::load(prim.bounds_vx_lower1(N));
+        const vfloat<M> vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0);
+        const vfloat<M> vx_upper0 = vfloat<M>::load(prim.bounds_vx_upper0(N));
+        const vfloat<M> vx_upper1 = vfloat<M>::load(prim.bounds_vx_upper1(N));
+        const vfloat<M> vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0);
+
+        const vfloat<M> vy_lower0 = vfloat<M>::load(prim.bounds_vy_lower0(N));
+        const vfloat<M> vy_lower1 = vfloat<M>::load(prim.bounds_vy_lower1(N));
+        const vfloat<M> vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0);
+        const vfloat<M> vy_upper0 = vfloat<M>::load(prim.bounds_vy_upper0(N));
+        const vfloat<M> vy_upper1 = vfloat<M>::load(prim.bounds_vy_upper1(N));
+        const vfloat<M> vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0);
+        // same interpolation for the third axis
+        const vfloat<M> vz_lower0 = vfloat<M>::load(prim.bounds_vz_lower0(N));
+        const vfloat<M> vz_lower1 = vfloat<M>::load(prim.bounds_vz_lower1(N));
+        const vfloat<M> vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0);
+        const vfloat<M> vz_upper0 = vfloat<M>::load(prim.bounds_vz_upper0(N));
+        const vfloat<M> vz_upper1 = vfloat<M>::load(prim.bounds_vz_upper1(N));
+        const vfloat<M> vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0);
+        // ray distances at which each pair of bounding planes is crossed
+        const vfloat<M> t_lower_x = (vx_lower-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_upper_x = (vx_upper-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_lower_y = (vy_lower-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_upper_y = (vy_upper-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_lower_z = (vz_lower-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        const vfloat<M> t_upper_z = (vz_upper-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+
+        const vfloat<M> round_up  (1.0f+3.0f*float(ulp)); // tNear is scaled by (1 - 3 ulp) and tFar by (1 + 3 ulp) before they are compared
+        const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+        const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()));
+        const vfloat<M> tFar  = round_up  *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar));
+        tNear_o = tNear;
+        return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); // only the first prim.N lanes hold curves
+      }
+      /*! runs Intersector on every curve whose bounds the ray overlaps, using a0..a3 gathered at ray.time(); lanes whose tNear exceeds ray.tfar are dropped after each test */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time());
+
+          Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+      /*! occlusion variant of intersect_t: returns true as soon as Intersector reports a hit, false if no candidate does */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3; geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time());
+
+          if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)))
+              return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+      /*! like intersect_t, but hands Intersector the surface returned by geom->getNormalOrientedCurve for ray.org and ray.time() */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_n(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+          Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+      /*! occlusion variant of intersect_n: returns true on the first hit reported by Intersector */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_n(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+
+          if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)))
+              return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+      /*! like intersect_t, but the curve data p0,t0,p1,t1 comes from geom->gather_hermite at ray.time() */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_h(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time());
+          Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+      /*! occlusion variant of intersect_h: returns true on the first hit reported by Intersector */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_h(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1; geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time());
+          if (Intersector().intersect(pre,ray,context,geom,primID,p0,t0,p1,t1,Epilog(ray,context,geomID,primID)))
+              return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+      /*! like intersect_n, but the surface comes from geom->getNormalOrientedHermiteCurve */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_hn(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+          Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+      }
+      /*! occlusion variant of intersect_hn: returns true on the first hit reported by Intersector */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_hn(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const TensorLinearCubicBezierSurface3fa curve = geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, ray.org, primID,ray.time());
+          if (Intersector().intersect(pre,ray,context,geom,primID,curve,Epilog(ray,context,geomID,primID)))
+              return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar));
+        }
+        return false;
+      }
+    };
+
+    template<int M, int K>
+      struct CurveNiMBIntersectorK
+    {
+      typedef CurveNiMB<M> Primitive;
+      typedef Vec3vf<M> Vec3vfM;
+      typedef LinearSpace3<Vec3vfM>LinearSpace3vfM;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline vbool<M> intersect(RayK<K>& ray, const size_t k, const Primitive& prim, vfloat<M>& tNear_o)
+      { /* slab test of ray k of the packet against the time-interpolated bounds of all curves in the block; returns the overlapping lanes, writes per-lane entry distance to tNear_o */
+        const size_t N = prim.N;
+        const vfloat4 offset_scale = vfloat4::loadu(prim.offset(N)); // prim.scale(N) sits 12 bytes after prim.offset(N), so one 4-float load fetches both
+        const Vec3fa offset = Vec3fa(offset_scale);
+        const Vec3fa scale = Vec3fa(shuffle<3,3,3,3>(offset_scale)); // broadcast lane 3 (the scale)
+        // extract ray k from the packet and express it relative to the block's offset and scale
+        const Vec3fa ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+        const Vec3fa ray_dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+        const Vec3fa org1 = (ray_org-offset)*scale;
+        const Vec3fa dir1 = ray_dir*scale;
+        // per-curve basis loaded from the block, one curve per SIMD lane
+        const LinearSpace3vfM space(vfloat<M>::load(prim.bounds_vx_x(N)), vfloat<M>::load(prim.bounds_vx_y(N)), vfloat<M>::load(prim.bounds_vx_z(N)),
+                                    vfloat<M>::load(prim.bounds_vy_x(N)), vfloat<M>::load(prim.bounds_vy_y(N)), vfloat<M>::load(prim.bounds_vy_z(N)),
+                                    vfloat<M>::load(prim.bounds_vz_x(N)), vfloat<M>::load(prim.bounds_vz_y(N)), vfloat<M>::load(prim.bounds_vz_z(N)));
+
+        const Vec3vfM dir2 = xfmVector(space,Vec3vfM(dir1));
+        const Vec3vfM org2 = xfmPoint (space,Vec3vfM(org1));
+        const Vec3vfM rcp_dir2 = rcp_safe(dir2);
+        // time of ray k remapped with the block's time offset/scale; blends the *0 and *1 bounds below
+        const vfloat<M> ltime = (ray.time()[k]-prim.time_offset(N))*prim.time_scale(N);
+        const vfloat<M> vx_lower0 = vfloat<M>::load(prim.bounds_vx_lower0(N));
+        const vfloat<M> vx_lower1 = vfloat<M>::load(prim.bounds_vx_lower1(N));
+        const vfloat<M> vx_lower = madd(ltime,vx_lower1-vx_lower0,vx_lower0);
+        const vfloat<M> vx_upper0 = vfloat<M>::load(prim.bounds_vx_upper0(N));
+        const vfloat<M> vx_upper1 = vfloat<M>::load(prim.bounds_vx_upper1(N));
+        const vfloat<M> vx_upper = madd(ltime,vx_upper1-vx_upper0,vx_upper0);
+
+        const vfloat<M> vy_lower0 = vfloat<M>::load(prim.bounds_vy_lower0(N));
+        const vfloat<M> vy_lower1 = vfloat<M>::load(prim.bounds_vy_lower1(N));
+        const vfloat<M> vy_lower = madd(ltime,vy_lower1-vy_lower0,vy_lower0);
+        const vfloat<M> vy_upper0 = vfloat<M>::load(prim.bounds_vy_upper0(N));
+        const vfloat<M> vy_upper1 = vfloat<M>::load(prim.bounds_vy_upper1(N));
+        const vfloat<M> vy_upper = madd(ltime,vy_upper1-vy_upper0,vy_upper0);
+        // same interpolation for the third axis
+        const vfloat<M> vz_lower0 = vfloat<M>::load(prim.bounds_vz_lower0(N));
+        const vfloat<M> vz_lower1 = vfloat<M>::load(prim.bounds_vz_lower1(N));
+        const vfloat<M> vz_lower = madd(ltime,vz_lower1-vz_lower0,vz_lower0);
+        const vfloat<M> vz_upper0 = vfloat<M>::load(prim.bounds_vz_upper0(N));
+        const vfloat<M> vz_upper1 = vfloat<M>::load(prim.bounds_vz_upper1(N));
+        const vfloat<M> vz_upper = madd(ltime,vz_upper1-vz_upper0,vz_upper0);
+        // ray distances at which each pair of bounding planes is crossed
+        const vfloat<M> t_lower_x = (vx_lower-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_upper_x = (vx_upper-vfloat<M>(org2.x))*vfloat<M>(rcp_dir2.x);
+        const vfloat<M> t_lower_y = (vy_lower-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_upper_y = (vy_upper-vfloat<M>(org2.y))*vfloat<M>(rcp_dir2.y);
+        const vfloat<M> t_lower_z = (vz_lower-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        const vfloat<M> t_upper_z = (vz_upper-vfloat<M>(org2.z))*vfloat<M>(rcp_dir2.z);
+        // tNear is scaled by (1 - 3 ulp) and tFar by (1 + 3 ulp) before they are compared
+        const vfloat<M> round_up  (1.0f+3.0f*float(ulp));
+        const vfloat<M> round_down(1.0f-3.0f*float(ulp));
+        const vfloat<M> tNear = round_down*max(mini(t_lower_x,t_upper_x),mini(t_lower_y,t_upper_y),mini(t_lower_z,t_upper_z),vfloat<M>(ray.tnear()[k]));
+        const vfloat<M> tFar  = round_up  *min(maxi(t_lower_x,t_upper_x),maxi(t_lower_y,t_upper_y),maxi(t_lower_z,t_upper_z),vfloat<M>(ray.tfar[k]));
+        tNear_o = tNear;
+        return (vint<M>(step) < vint<M>(prim.N)) & (tNear <= tFar); // only the first prim.N lanes hold curves
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        /* slab test of ray k against the time-interpolated bounds of every curve in the block */
+        vfloat<M> boxEntry;
+        vbool<M> boxHit = intersect(ray,k,prim,boxEntry);
+        const size_t numCurves = prim.N;
+
+        /* visit the candidate lanes; after each curve test keep only lanes whose entry distance is still <= ray.tfar[k] */
+        for (size_t pending = movemask(boxHit); pending; pending &= movemask(boxEntry <= vfloat<M>(ray.tfar[k])))
+        {
+          const size_t lane = bscf(pending);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[lane];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          /* gather a0..a3 for this curve at the time of ray k, then run the curve intersector */
+          Vec3ff a0,a1,a2,a3;
+          geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]);
+          Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        /* any-hit query for ray k: returns true as soon as one curve of the block reports an intersection */
+        vfloat<M> boxNear;
+        const vbool<M> boxHit = intersect(ray,k,prim,boxNear);
+
+        const size_t numCurves = prim.N;
+        /* after each curve test, candidates whose box entry lies beyond the current ray.tfar[k] are dropped */
+        for (size_t pending = movemask(boxHit); pending; pending &= movemask(boxNear <= vfloat<M>(ray.tfar[k])))
+        {
+          const size_t slot = bscf(pending);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[slot];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff a0,a1,a2,a3;
+          geom->gather(a0,a1,a2,a3,geom->curve(primID),ray.time()[k]);
+
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)))
+            return true;
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_n(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        /* closest-hit query for ray k against normal-oriented curves, which are fetched from the geometry as surfaces */
+        vfloat<M> boxNear;
+        const vbool<M> boxHit = intersect(ray,k,prim,boxNear);
+
+        const size_t numCurves = prim.N;
+        /* after each curve test, candidates whose box entry lies beyond the current ray.tfar[k] are dropped */
+        for (size_t pending = movemask(boxHit); pending; pending &= movemask(boxNear <= vfloat<M>(ray.tfar[k])))
+        {
+          const size_t slot = bscf(pending);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[slot];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const Vec3fa origin(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+          const TensorLinearCubicBezierSurface3fa surface =
+            geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, origin, primID,ray.time()[k]);
+          Intersector().intersect(pre,ray,k,context,geom,primID,surface,Epilog(ray,k,context,geomID,primID));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_n(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        /* any-hit query for ray k against normal-oriented curves; returns true on the first reported intersection */
+        vfloat<M> boxNear;
+        const vbool<M> boxHit = intersect(ray,k,prim,boxNear);
+
+        const size_t numCurves = prim.N;
+        /* after each curve test, candidates whose box entry lies beyond the current ray.tfar[k] are dropped */
+        for (size_t pending = movemask(boxHit); pending; pending &= movemask(boxNear <= vfloat<M>(ray.tfar[k])))
+        {
+          const size_t slot = bscf(pending);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[slot];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const Vec3fa origin(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+          const TensorLinearCubicBezierSurface3fa surface =
+            geom->getNormalOrientedCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, origin, primID,ray.time()[k]);
+
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,surface,Epilog(ray,k,context,geomID,primID)))
+            return true;
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_h(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        /* closest-hit query for ray k against curves gathered in Hermite form (points p0,p1 and tangents t0,t1) */
+        vfloat<M> boxNear;
+        const vbool<M> boxHit = intersect(ray,k,prim,boxNear);
+
+        const size_t numCurves = prim.N;
+        /* after each curve test, candidates whose box entry lies beyond the current ray.tfar[k] are dropped */
+        for (size_t pending = movemask(boxHit); pending; pending &= movemask(boxNear <= vfloat<M>(ray.tfar[k])))
+        {
+          const size_t slot = bscf(pending);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[slot];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1;
+          geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]);
+          Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_h(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        /* any-hit query for ray k against curves gathered in Hermite form; returns true on the first reported hit */
+        vfloat<M> boxNear;
+        const vbool<M> boxHit = intersect(ray,k,prim,boxNear);
+
+        const size_t numCurves = prim.N;
+        /* after each curve test, candidates whose box entry lies beyond the current ray.tfar[k] are dropped */
+        for (size_t pending = movemask(boxHit); pending; pending &= movemask(boxNear <= vfloat<M>(ray.tfar[k])))
+        {
+          const size_t slot = bscf(pending);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[slot];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          Vec3ff p0,t0,p1,t1;
+          geom->gather_hermite(p0,t0,p1,t1,geom->curve(primID),ray.time()[k]);
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,p0,t0,p1,t1,Epilog(ray,k,context,geomID,primID)))
+            return true;
+        }
+        return false;
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_hn(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        /* closest-hit query for ray k against normal-oriented Hermite curves, fetched from the geometry as surfaces */
+        vfloat<M> boxNear;
+        const vbool<M> boxHit = intersect(ray,k,prim,boxNear);
+
+        const size_t numCurves = prim.N;
+        /* after each curve test, candidates whose box entry lies beyond the current ray.tfar[k] are dropped */
+        for (size_t pending = movemask(boxHit); pending; pending &= movemask(boxNear <= vfloat<M>(ray.tfar[k])))
+        {
+          const size_t slot = bscf(pending);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[slot];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const Vec3fa origin(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+          const TensorLinearCubicBezierSurface3fa surface =
+            geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, origin, primID,ray.time()[k]);
+          Intersector().intersect(pre,ray,k,context,geom,primID,surface,Epilog(ray,k,context,geomID,primID));
+        }
+      }
+
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_hn(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        /* any-hit query for ray k against normal-oriented Hermite curves; returns true on the first reported hit */
+        vfloat<M> boxNear;
+        const vbool<M> boxHit = intersect(ray,k,prim,boxNear);
+
+        const size_t numCurves = prim.N;
+        /* after each curve test, candidates whose box entry lies beyond the current ray.tfar[k] are dropped */
+        for (size_t pending = movemask(boxHit); pending; pending &= movemask(boxNear <= vfloat<M>(ray.tfar[k])))
+        {
+          const size_t slot = bscf(pending);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(numCurves);
+          const unsigned int primID = prim.primID(numCurves)[slot];
+          const CurveGeometry* geom = context->scene->get<CurveGeometry>(geomID);
+          const Vec3fa origin(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+          const TensorLinearCubicBezierSurface3fa surface =
+            geom->getNormalOrientedHermiteCurve<typename Intersector::SourceCurve3ff, typename Intersector::SourceCurve3fa, TensorLinearCubicBezierSurface3fa>(context, origin, primID,ray.time()[k]);
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,surface,Epilog(ray,k,context,geomID,primID)))
+            return true;
+        }
+        return false;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNv.h b/thirdparty/embree-aarch64/kernels/geometry/curveNv.h
new file mode 100644
index 0000000000..6eb5e30b39
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNv.h
@@ -0,0 +1,101 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNi.h"
+
+namespace embree
+{
+  template<int M>
+    struct CurveNv : public CurveNi<M>
+  {
+    using CurveNi<M>::N;
+    /* CurveNi<M> block that additionally stores the four control points of every curve (see data/vertices) */
+    struct Type : public PrimitiveType {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* Returns maximum number of stored primitives */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+M-1)/M; }
+    /* Returns the number of bytes needed for N primitives; a trailing partial block only pays for its r curves */
+    static __forceinline size_t bytes(size_t N)
+    {
+      const size_t f = N/M, r = N%M; // f full blocks, r leftover curves
+      static_assert(sizeof(CurveNv) == 22+25*M+4*16*M, "internal data layout issue");
+      return f*sizeof(CurveNv) + (r!=0)*(22 + 25*r + 4*16*r);
+    }
+
+  public:
+
+    /*! Default constructor. */
+    __forceinline CurveNv () {}
+
+    /*! copies the 4 control points of up to M curves, starting at prims[begin], into this block; begin is not modified */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t _end, Scene* scene)
+    {
+      size_t end = min(begin+M,_end); // a block holds at most M curves
+      size_t N = end-begin;
+
+      /* encode all primitives */
+      for (size_t i=0; i<N; i++)
+      {
+        const PrimRef& prim = prims[begin+i];
+        const unsigned int geomID = prim.geomID();
+        const unsigned int primID = prim.primID();
+        CurveGeometry* mesh = (CurveGeometry*) scene->get(geomID);
+        const unsigned vtxID = mesh->curve(primID); // index of the curve's first control point
+        Vec3fa::storeu(&this->vertices(i,N)[0],mesh->vertex(vtxID+0));
+        Vec3fa::storeu(&this->vertices(i,N)[1],mesh->vertex(vtxID+1));
+        Vec3fa::storeu(&this->vertices(i,N)[2],mesh->vertex(vtxID+2));
+        Vec3fa::storeu(&this->vertices(i,N)[3],mesh->vertex(vtxID+3));
+      }
+    }
+    /* Builds the leaf for the primitives in set; an empty set yields BVH::emptyNode */
+    template<typename BVH, typename Allocator>
+      __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc)
+    {
+      if (set.size() == 0)
+        return BVH::emptyNode;
+      
+      /* fall back to CurveNi for oriented curves and for the Hermite basis (decided by the first primitive's geometry) */
+      unsigned int geomID = prims[set.begin()].geomID();
+      if (bvh->scene->get(geomID)->getCurveType() == Geometry::GTY_SUBTYPE_ORIENTED_CURVE) {
+        return CurveNi<M>::createLeaf(bvh,prims,set,alloc);
+      }
+      if (bvh->scene->get(geomID)->getCurveBasis() == Geometry::GTY_BASIS_HERMITE) {
+        return CurveNi<M>::createLeaf(bvh,prims,set,alloc);
+      }
+      /* allocate all blocks at once; each block first gets its control points, then its CurveNi data */
+      size_t start = set.begin();
+      size_t items = CurveNv::blocks(set.size());
+      size_t numbytes = CurveNv::bytes(set.size());
+      CurveNv* accel = (CurveNv*) alloc.malloc1(numbytes,BVH::byteAlignment);
+      for (size_t i=0; i<items; i++) {
+        accel[i].CurveNv<M>::fill(prims,start,set.end(),bvh->scene); // reads start without changing it
+        accel[i].CurveNi<M>::fill(prims,start,set.end(),bvh->scene); // presumably advances start to the next block - confirm in curveNi.h
+      }
+      return bvh->encodeLeaf((char*)accel,items);
+    };
+    /* vertices(i,N) addresses the 4 control points of curve i, counted from CurveNi<M>::end(N) of an N-curve block */
+  public:
+    unsigned char data[4*16*M]; // room for 4*16 bytes of control points per curve
+    __forceinline       Vec3fa* vertices(size_t i, size_t N)       { return (Vec3fa*)CurveNi<M>::end(N)+4*i; }
+    __forceinline const Vec3fa* vertices(size_t i, size_t N) const { return (Vec3fa*)CurveNi<M>::end(N)+4*i; }
+  };
+
+  template<int M>
+    typename CurveNv<M>::Type CurveNv<M>::type; // out-of-class definition of the static member declared in CurveNv
+
+  typedef CurveNv<4> Curve4v; // blocks of up to 4 curves
+  typedef CurveNv<8> Curve8v; // blocks of up to 8 curves
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h
new file mode 100644
index 0000000000..e20da2882e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curveNv_intersector.h
@@ -0,0 +1,181 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curveNv.h"
+#include "curveNi_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+      struct CurveNvIntersector1 : public CurveNiIntersector1<M>
+    {
+      typedef CurveNv<M> Primitive; // leaf block that carries its own control points
+      typedef CurvePrecalculations1 Precalculations;
+      /* closest-hit query of a single ray: slab-test the block, then intersect each surviving curve using the stored control points */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+          const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]);
+          const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+          const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+          const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+          /* prefetch the control points of the next two candidates; the bit scan only runs on a non-empty mask */
+          if (mask) {
+            size_t mask1 = mask;
+            const size_t i1 = bscf(mask1);
+            prefetchL1(&prim.vertices(i1,N)[0]);
+            prefetchL1(&prim.vertices(i1,N)[4]); // one past the 4 points - presumably touches the following cache line
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              prefetchL2(&prim.vertices(i2,N)[0]);
+              prefetchL2(&prim.vertices(i2,N)[4]);
+            }
+          }
+
+          Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar)); // drop candidates whose box entry lies beyond the current ray.tfar
+        }
+      }
+      /* any-hit query of a single ray: returns true as soon as one curve of the block reports an intersection */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = CurveNiIntersector1<M>::intersect(ray,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+          const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]);
+          const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+          const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+          const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+          /* prefetch the control points of the next two candidates; the bit scan only runs on a non-empty mask */
+          if (mask) {
+            size_t mask1 = mask;
+            const size_t i1 = bscf(mask1);
+            prefetchL1(&prim.vertices(i1,N)[0]);
+            prefetchL1(&prim.vertices(i1,N)[4]); // one past the 4 points - presumably touches the following cache line
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              prefetchL2(&prim.vertices(i2,N)[0]);
+              prefetchL2(&prim.vertices(i2,N)[4]);
+            }
+          }
+          
+          if (Intersector().intersect(pre,ray,context,geom,primID,a0,a1,a2,a3,Epilog(ray,context,geomID,primID)))
+            return true;
+          
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar)); // drop candidates whose box entry lies beyond the current ray.tfar
+        }
+        return false;
+      }
+    };
+
+    template<int M, int K>
+      struct CurveNvIntersectorK : public CurveNiIntersectorK<M,K>
+    {
+      typedef CurveNv<M> Primitive; // leaf block that carries its own control points
+      typedef CurvePrecalculationsK<K> Precalculations;
+      /* closest-hit query for ray k of a packet: slab-test the block, then intersect each surviving curve using the stored control points */
+      template<typename Intersector, typename Epilog>
+        static __forceinline void intersect_t(Precalculations& pre, RayHitK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(normal.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+          const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]);
+          const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+          const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+          const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+          /* prefetch the control points of the next two candidates; the bit scan only runs on a non-empty mask */
+          if (mask) {
+            size_t mask1 = mask;
+            const size_t i1 = bscf(mask1);
+            prefetchL1(&prim.vertices(i1,N)[0]);
+            prefetchL1(&prim.vertices(i1,N)[4]); // one past the 4 points - presumably touches the following cache line
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              prefetchL2(&prim.vertices(i2,N)[0]);
+              prefetchL2(&prim.vertices(i2,N)[4]);
+            }
+          }
+
+          Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID));
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); // drop candidates whose box entry lies beyond the current ray.tfar[k]
+        }
+      }
+      /* any-hit query for ray k of a packet: returns true as soon as one curve of the block reports an intersection */
+      template<typename Intersector, typename Epilog>
+        static __forceinline bool occluded_t(Precalculations& pre, RayK<K>& ray, const size_t k, IntersectContext* context, const Primitive& prim)
+      {
+        vfloat<M> tNear;
+        vbool<M> valid = CurveNiIntersectorK<M,K>::intersect(ray,k,prim,tNear);
+
+        const size_t N = prim.N;
+        size_t mask = movemask(valid);
+        while (mask)
+        {
+          const size_t i = bscf(mask);
+          STAT3(shadow.trav_prims,1,1,1);
+          const unsigned int geomID = prim.geomID(N);
+          const unsigned int primID = prim.primID(N)[i];
+          const CurveGeometry* geom = (CurveGeometry*) context->scene->get(geomID);
+          const Vec3ff a0 = Vec3ff::loadu(&prim.vertices(i,N)[0]);
+          const Vec3ff a1 = Vec3ff::loadu(&prim.vertices(i,N)[1]);
+          const Vec3ff a2 = Vec3ff::loadu(&prim.vertices(i,N)[2]);
+          const Vec3ff a3 = Vec3ff::loadu(&prim.vertices(i,N)[3]);
+          /* prefetch the control points of the next two candidates; the bit scan only runs on a non-empty mask */
+          if (mask) {
+            size_t mask1 = mask;
+            const size_t i1 = bscf(mask1);
+            prefetchL1(&prim.vertices(i1,N)[0]);
+            prefetchL1(&prim.vertices(i1,N)[4]); // one past the 4 points - presumably touches the following cache line
+            if (mask1) {
+              const size_t i2 = bsf(mask1);
+              prefetchL2(&prim.vertices(i2,N)[0]);
+              prefetchL2(&prim.vertices(i2,N)[4]);
+            }
+          }
+
+          if (Intersector().intersect(pre,ray,k,context,geom,primID,a0,a1,a2,a3,Epilog(ray,k,context,geomID,primID)))
+            return true;
+
+          mask &= movemask(tNear <= vfloat<M>(ray.tfar[k])); // drop candidates whose box entry lies beyond the current ray.tfar[k]
+        }
+        return false;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h
new file mode 100644
index 0000000000..204958f7cc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector.h
@@ -0,0 +1,98 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../subdiv/bezier_curve.h"
+#include "../common/primref.h"
+#include "bezier_hair_intersector.h"
+#include "bezier_ribbon_intersector.h"
+#include "bezier_curve_intersector.h"
+#include "oriented_curve_intersector.h"
+#include "../bvh/node_intersector1.h"
+
+// FIXME: this file seems to be a duplicate of curve_intersector_virtual.h
+
+namespace embree
+{
+  namespace isa
+  {
+    struct VirtualCurveIntersector1
+    {
+      typedef unsigned char Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+      /* closest-hit query: the first byte of the leaf selects the handler in the leaf intersector's vtbl */
+      template<int N, int Nx, bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        assert(num == 1);
+        const RTCGeometryType curveType = static_cast<RTCGeometryType>(*prim);
+        assert(This->leafIntersector);
+        VirtualCurvePrimitive* const table = (VirtualCurvePrimitive*) This->leafIntersector;
+        table->vtbl[curveType].intersect<1>(&pre,&ray,context,prim);
+      }
+      /* any-hit query: same dispatch as intersect, forwarding the handler's result */
+      template<int N, int Nx, bool robust>
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        assert(num == 1);
+        const RTCGeometryType curveType = static_cast<RTCGeometryType>(*prim);
+        assert(This->leafIntersector);
+        VirtualCurvePrimitive* const table = (VirtualCurvePrimitive*) This->leafIntersector;
+        return table->vtbl[curveType].occluded<1>(&pre,&ray,context,prim);
+      }
+    };
+
+    template<int K>
+      struct VirtualCurveIntersectorK
+      {
+        typedef unsigned char Primitive;
+        typedef CurvePrecalculationsK<K> Precalculations;
+        /* packet closest-hit query: every active lane of valid_i is forwarded to the handler, one lane at a time */
+        static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        {
+          assert(num == 1);
+          const RTCGeometryType curveType = static_cast<RTCGeometryType>(*prim);
+          assert(This->leafIntersector);
+          VirtualCurvePrimitive::Intersectors& handler = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[curveType];
+          for (size_t lanes = movemask(valid_i); lanes; )
+            handler.intersect<K>(&pre,&ray,bscf(lanes),context,prim);
+        }
+        /* packet any-hit query: returns the mask of active lanes for which the handler reported occlusion */
+        static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        {
+          assert(num == 1);
+          const RTCGeometryType curveType = static_cast<RTCGeometryType>(*prim);
+          assert(This->leafIntersector);
+          VirtualCurvePrimitive::Intersectors& handler = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[curveType];
+          vbool<K> occludedLanes = false;
+          /* visit the active lanes in bit-scan order */
+          for (size_t lanes = movemask(valid_i); lanes; ) {
+            const size_t k = bscf(lanes);
+            if (handler.occluded<K>(&pre,&ray,k,context,prim))
+              set(occludedLanes, k);
+          }
+          return occludedLanes;
+        }
+        /* closest-hit query for the single lane k of a packet */
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        {
+          assert(num == 1);
+          const RTCGeometryType curveType = static_cast<RTCGeometryType>(*prim);
+          assert(This->leafIntersector);
+          VirtualCurvePrimitive::Intersectors& handler = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[curveType];
+          handler.intersect<K>(&pre,&ray,k,context,prim);
+        }
+        /* any-hit query for the single lane k of a packet */
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+        {
+          assert(num == 1);
+          const RTCGeometryType curveType = static_cast<RTCGeometryType>(*prim);
+          assert(This->leafIntersector);
+          VirtualCurvePrimitive::Intersectors& handler = ((VirtualCurvePrimitive*) This->leafIntersector)->vtbl[curveType];
+          return handler.occluded<K>(&pre,&ray,k,context,prim);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h
new file mode 100644
index 0000000000..343cc8ff28
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_distance.h
@@ -0,0 +1,129 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename NativeCurve3fa, int M>
+    struct DistanceCurveHit // hit record for a batch of M curve segments; handed to the Epilog by DistanceCurve1Intersector1
+    {
+      __forceinline DistanceCurveHit() {}
+      /* i is the index of the first segment of the batch, N the total number of segments of the curve */
+      __forceinline DistanceCurveHit(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const int i, const int N,
+                                     const NativeCurve3fa& curve3D)
+        : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {}
+      /* fills vu/vv/vt; vu becomes (step + U + i)/N, where vfloat<M>(step) presumably holds the lane indices 0..M-1 */
+      __forceinline void finalize() 
+      {
+        vu = (vfloat<M>(step)+U+vfloat<M>(float(i)))*(1.0f/float(N));
+        vv = V;
+        vt = T;
+      }
+      /* per-lane accessors; they read vu/vv/vt, which are only written by finalize() */
+      __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+      __forceinline float t  (const size_t i) const { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) const { 
+        return curve3D.eval_du(vu[i]); // eval_du is presumably the curve derivative with respect to u - confirm
+      }
+      
+    public:
+      vfloat<M> U; // per-segment parameter passed to the constructor
+      vfloat<M> V;
+      vfloat<M> T;
+      int i, N;
+      NativeCurve3fa curve3D; // copy of the curve used by Ng()
+      
+    public:
+      vbool<M> valid;
+      vfloat<M> vu; // set by finalize()
+      vfloat<M> vv; // set by finalize()
+      vfloat<M> vt; // set by finalize()
+    };
+    
+    template<typename NativeCurve3fa>
+    struct DistanceCurve1Intersector1 // distance-based ray/curve test on a curve split into N = geom->tessellationRate segments
+    {
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculations1& pre,Ray& ray,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3fa& v0, const Vec3fa& v1, const Vec3fa& v2, const Vec3fa& v3,
+                                   const Epilog& epilog)
+      {
+        const int N = geom->tessellationRate;
+        /* returns true if epilog(valid,hit) returned true for at least one batch of segments */
+        /* transform control points into ray space */
+        const NativeCurve3fa curve3Di(v0,v1,v2,v3);
+        const NativeCurve3fa curve3D = enlargeRadiusToMinWidth(context,geom,ray.org,curve3Di);
+        const NativeCurve3fa curve2D = curve3D.xfm_pr(pre.ray_space,ray.org);
+      
+        /* evaluate the bezier curve for the first VSIZEX segments (p0/p1 are presumably the segment end points) */
+        vboolx valid = vfloatx(step) < vfloatx(float(N)); // mask off lanes past the last segment
+        const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N);
+        const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N);
+
+        /* approximate intersection with cone: project the ray-space origin onto segment p0->p1 in xy, clamped to [0,1] */
+        const Vec4vfx v = p1-p0;
+        const Vec4vfx w = -p0;
+        const vfloatx d0 = madd(w.x,v.x,w.y*v.y);
+        const vfloatx d1 = madd(v.x,v.x,v.y*v.y);
+        const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one));
+        const Vec4vfx p = madd(u,v,p0); // point on the segment at parameter u
+        const vfloatx t = p.z*pre.depth_scale;
+        const vfloatx d2 = madd(p.x,p.x,p.y*p.y); 
+        const vfloatx r = p.w;
+        const vfloatx r2 = r*r;
+        valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar)); // xy distance within r and t inside [tnear,tfar]
+        if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) 
+          valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections
+
+        /* update hit information */
+        bool ishit = false;
+        if (unlikely(any(valid))) {
+          DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,0,N,curve3D);
+          ishit = ishit | epilog(valid,hit);
+        }
+
+        if (unlikely(VSIZEX < N)) 
+        {
+          /* process SIMD-size many segments per iteration */
+          for (int i=VSIZEX; i<N; i+=VSIZEX)
+          {
+            /* evaluate the bezier curve */
+            vboolx valid = vintx(i)+vintx(step) < vintx(N); // mask off lanes past the last segment
+            const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N);
+            const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N);
+            
+            /* approximate intersection with cone (same test as for the first batch of segments) */
+            const Vec4vfx v = p1-p0;
+            const Vec4vfx w = -p0;
+            const vfloatx d0 = madd(w.x,v.x,w.y*v.y);
+            const vfloatx d1 = madd(v.x,v.x,v.y*v.y);
+            const vfloatx u = clamp(d0*rcp(d1),vfloatx(zero),vfloatx(one));
+            const Vec4vfx p = madd(u,v,p0);
+            const vfloatx t = p.z*pre.depth_scale;
+            const vfloatx d2 = madd(p.x,p.x,p.y*p.y); 
+            const vfloatx r = p.w;
+            const vfloatx r2 = r*r;
+            valid &= (d2 <= r2) & (vfloatx(ray.tnear()) <= t) & (t <= vfloatx(ray.tfar));
+            if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+              valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*pre.depth_scale; // ignore self intersections
+
+             /* update hit information */
+            if (unlikely(any(valid))) {
+              DistanceCurveHit<NativeCurve3fa,VSIZEX> hit(valid,u,0.0f,t,i,N,curve3D); // i = index of the first segment of this batch
+              ishit = ishit | epilog(valid,hit);
+            }
+          }
+        }
+        return ishit;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h
new file mode 100644
index 0000000000..47531027fc
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_oriented.h
@@ -0,0 +1,417 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+#include "curve_intersector_sweep.h"
+#include "../subdiv/linear_bezier_patch.h"
+
+#define DBG(x)
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename Ray, typename Epilog>
+      struct TensorLinearCubicBezierSurfaceIntersector
+      {
+        const LinearSpace3fa& ray_space;
+        Ray& ray;
+        TensorLinearCubicBezierSurface3fa curve3d;
+        TensorLinearCubicBezierSurface2fa curve2d;
+        float eps;
+        const Epilog& epilog;
+        bool isHit;
+
+        __forceinline TensorLinearCubicBezierSurfaceIntersector (const LinearSpace3fa& ray_space, Ray& ray, const TensorLinearCubicBezierSurface3fa& curve3d, const Epilog& epilog)
+          : ray_space(ray_space), ray(ray), curve3d(curve3d), epilog(epilog), isHit(false)
+        { // stores the inputs, then derives the 2D surface and the tolerance eps used by the solvers
+          const TensorLinearCubicBezierSurface3fa curve3dray = curve3d.xfm(ray_space,ray.org); // surface transformed by ray_space relative to ray.org
+          curve2d = TensorLinearCubicBezierSurface2fa(CubicBezierCurve2fa(curve3dray.L),CubicBezierCurve2fa(curve3dray.R)); // 2D surface built from the L and R curves of the transformed surface
+          const BBox2fa b2 = curve2d.bounds();
+          eps = 8.0f*float(ulp)*reduce_max(max(abs(b2.lower),abs(b2.upper))); // 8*ulp scaled by the largest absolute coordinate of the 2D bounds
+        }
+        
+        /* Root of the line through (u0,p0) and (u1,p1), returned as an interval in the u parameter. */
+        /* A constant line yields the whole range [u0,u1] when it is identically zero and the empty interval otherwise. */
+        __forceinline Interval1f solve_linear(const float u0, const float u1, const float& p0, const float& p1)
+        {
+          if (p1 != p0) {
+            const float s = -p0/(p1-p0);
+            return Interval1f(lerp(u0,u1,s));
+          }
+          return (p0 == 0.0f) ? Interval1f(u0,u1) : Interval1f(empty);
+        }
+
+        __forceinline void solve_linear(const float u0, const float u1, const Interval1f& p0, const Interval1f& p1, Interval1f& u)
+        { // interval version: extends u by every parameter where the band between the lower and the upper line may reach zero
+          if (sign(p0.lower) != sign(p0.upper)) u.extend(u0); // the interval at u0 has end points of different sign
+          if (sign(p0.lower) != sign(p1.lower)) u.extend(solve_linear(u0,u1,p0.lower,p1.lower)); // lower line changes sign between u0 and u1
+          if (sign(p0.upper) != sign(p1.upper)) u.extend(solve_linear(u0,u1,p0.upper,p1.upper)); // upper line changes sign between u0 and u1
+          if (sign(p1.lower) != sign(p1.upper)) u.extend(u1); // the interval at u1 has end points of different sign
+        }
+
+        /* Conservative parameter range for the roots of a cubic with interval control values, clamped to [0,1]. */
+        __forceinline Interval1f bezier_clipping(const CubicBezierCurve<Interval1f>& curve)
+        {
+          /* the four control values are placed at the parameters 0, 1/3, 2/3 and 1 */
+          const float knots[4] = { 0.0f/3.0f, 1.0f/3.0f, 2.0f/3.0f, 3.0f/3.0f };
+          const Interval1f* const ctrl[4] = { &curve.v0, &curve.v1, &curve.v2, &curve.v3 };
+          Interval1f range = empty;
+          for (int a=0; a<3; a++) // every pair a<b, visited in the order (0,1),(0,2),(0,3),(1,2),(1,3),(2,3)
+            for (int b=a+1; b<4; b++) solve_linear(knots[a],knots[b],*ctrl[a],*ctrl[b],range);
+          return intersect(range,Interval1f(0.0f,1.0f));
+        }
+        
+        /* Linear variant: conservative root range of a line with interval end values, clamped to [0,1]. */
+        __forceinline Interval1f bezier_clipping(const LinearBezierCurve<Interval1f>& curve)
+        {
+          Interval1f range = empty; solve_linear(0.0f,1.0f,curve.v0,curve.v1,range);
+          return intersect(range,Interval1f(0.0f,1.0f));
+        }
+
+        __forceinline void solve_bezier_clipping(BBox1f cu, BBox1f cv, const TensorLinearCubicBezierSurface2fa& curve2)
+        { // recursive solver: cu/cv are the parameter ranges covered by curve2; they are narrowed until a hit can be reported
+          BBox2fa bounds = curve2.bounds(); // the four tests below reject a patch whose 2D bounds exclude the origin
+          if (bounds.upper.x < 0.0f) return;
+          if (bounds.upper.y < 0.0f) return;
+          if (bounds.lower.x > 0.0f) return;
+          if (bounds.lower.y > 0.0f) return;
+          /* terminate once both parameter ranges are shorter than 1E-4 and test the center of the box as hit */
+          if (max(cu.size(),cv.size()) < 1E-4f)
+          {
+            const float u = cu.center();
+            const float v = cv.center();
+            TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org); // surface transformed by row2 of ray_space; its value at (u,v) is used as distance t
+            const float t = curve_z.eval(u,v);
+            if (ray.tnear() <= t && t <= ray.tfar) {
+              const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v)); // normal from the two partial derivatives of the 3D surface
+              BezierCurveHit hit(t,u,v,Ng);
+              isHit |= epilog(hit);
+            }
+            return;
+          }
+          /* clip in v: transform by the v axis, reduce over u and bound the roots of the resulting line */
+          const Vec2fa dv = curve2.axis_v();
+          const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv);
+          LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u();
+          if (!curve0v.hasRoot()) return;
+          
+          const Interval1f v = bezier_clipping(curve0v);
+          if (isEmpty(v)) return;
+          TensorLinearCubicBezierSurface2fa curve2a = curve2.clip_v(v);
+          cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); // map the relative range v into the absolute cv range
+          /* clip in u on the v-clipped patch; NOTE(review): axis_u is taken from curve2, not from curve2a - confirm this is intended */
+          const Vec2fa du = curve2.axis_u();
+          const TensorLinearCubicBezierSurface1f curve1u = curve2a.xfm(du);
+          CubicBezierCurve<Interval1f> curve0u = curve1u.reduce_v();         
+          int roots = curve0u.maxRoots();
+          if (roots == 0) return;
+          
+          if (roots == 1) // maxRoots() reports 1: clip in u, shrink cu accordingly and recurse on the clipped patch
+          {
+            const Interval1f u = bezier_clipping(curve0u);
+            if (isEmpty(u)) return;
+            TensorLinearCubicBezierSurface2fa curve2b = curve2a.clip_u(u);
+            cu = BBox1f(lerp(cu.lower,cu.upper,u.lower),lerp(cu.lower,cu.upper,u.upper));
+            solve_bezier_clipping(cu,cv,curve2b);
+            return;
+          }
+          /* otherwise split the patch in u and recurse into both halves */
+          TensorLinearCubicBezierSurface2fa curve2l, curve2r;
+          curve2a.split_u(curve2l,curve2r);
+          solve_bezier_clipping(BBox1f(cu.lower,cu.center()),cv,curve2l);
+          solve_bezier_clipping(BBox1f(cu.center(),cu.upper),cv,curve2r);
+        }
+        
+        __forceinline bool solve_bezier_clipping() // runs the clipping solver on the whole [0,1]x[0,1] parameter domain
+        {
+          const BBox1f whole(0.0f,1.0f);
+          solve_bezier_clipping(whole,whole,curve2d); return isHit; // isHit accumulates the epilog results
+        }
+
+        /* Newton-Raphson solve started at the center of (cu,cv); the inverse Jacobian is evaluated once at that center. */
+        __forceinline void solve_newton_raphson(BBox1f cu, BBox1f cv)
+        {
+          const Vec2fa start(cu.center(),cv.center());
+          const Vec2fa du = curve2d.eval_du(start.x,start.y), dv = curve2d.eval_dv(start.x,start.y);
+          const LinearSpace2fa inv_jacobian = rcp(LinearSpace2fa(du,dv));
+          solve_newton_raphson_loop(cu,cv,start,du,dv,inv_jacobian);
+        }
+
+        __forceinline void solve_newton_raphson_loop(BBox1f cu, BBox1f cv, const Vec2fa& uv_in, const Vec2fa& dfdu, const Vec2fa& dfdv, const LinearSpace2fa& rcp_J)
+        { // iterates uv -= rcp_J*f(uv) starting at uv_in and reports at most one hit; cu, cv, dfdu and dfdv are not read here
+          Vec2fa uv = uv_in;
+          /* at most 200 steps; rcp_J stays fixed, it is never re-evaluated inside the loop */
+          for (size_t i=0; i<200; i++)
+          {
+            const Vec2fa f = curve2d.eval(uv.x,uv.y);
+            const Vec2fa duv = rcp_J*f;
+            uv -= duv;
+            /* f was evaluated before the update above; the updated uv is accepted once both components of f are below eps */
+            if (max(abs(f.x),abs(f.y)) < eps)
+            {
+              const float u = uv.x;
+              const float v = uv.y;
+              if (!(u >= 0.0f && u <= 1.0f)) return; // rejects NaNs
+              if (!(v >= 0.0f && v <= 1.0f)) return; // rejects NaNs
+              const TensorLinearCubicBezierSurface1f curve_z = curve3d.xfm(ray_space.row2(),ray.org); // its value at (u,v) is used as distance t
+              const float t = curve_z.eval(u,v);
+              if (!(ray.tnear() <= t && t <= ray.tfar)) return; // rejects NaNs
+              const Vec3fa Ng = cross(curve3d.eval_du(u,v),curve3d.eval_dv(u,v)); // normal from the two partial derivatives of the 3D surface
+              BezierCurveHit hit(t,u,v,Ng);
+              isHit |= epilog(hit);
+              return;
+            }
+          }       
+        }
+
+        __forceinline bool clip_v(BBox1f& cu, BBox1f& cv)
+        { // tightens cv in place around the possible roots; returns false when no root can lie in (cu,cv); cu is not modified
+          const Vec2fa dv = curve2d.eval_dv(cu.lower,cv.lower); // v-derivative at the lower corner of the box
+          const TensorLinearCubicBezierSurface1f curve1v = curve2d.xfm(dv).clip(cu,cv);
+          LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u();
+          if (!curve0v.hasRoot()) return false;
+          Interval1f v = bezier_clipping(curve0v);
+          if (isEmpty(v)) return false;
+          v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f)); // widen by 0.1 on both sides, then clamp to [0,1]
+          cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); // map the relative range v into the absolute cv range
+          return true;
+        }
+
+        __forceinline bool solve_krawczyk(bool very_small, BBox1f& cu, BBox1f& cv)
+        { // returns true when the box (cu,cv) is finished (no root possible, or the Newton loop was run), false when the caller has to subdivide further
+          /* perform bezier clipping in v-direction to get tight v-bounds */
+          TensorLinearCubicBezierSurface2fa curve2 = curve2d.clip(cu,cv);
+          const Vec2fa dv = curve2.axis_v();
+          const TensorLinearCubicBezierSurface1f curve1v = curve2.xfm(dv);
+          LinearBezierCurve<Interval1f> curve0v = curve1v.reduce_u();
+          if (unlikely(!curve0v.hasRoot())) return true;
+          Interval1f v = bezier_clipping(curve0v);
+          if (unlikely(isEmpty(v))) return true;
+          v = intersect(v + Interval1f(-0.1f,+0.1f),Interval1f(0.0f,1.0f)); // widen by 0.1 on both sides, then clamp to [0,1]
+          curve2 = curve2.clip_v(v);
+          cv = BBox1f(lerp(cv.lower,cv.upper,v.lower),lerp(cv.lower,cv.upper,v.upper)); // cv is updated in place for the caller
+
+          /* perform one newton raphson iteration */
+          Vec2fa c(cu.center(),cv.center());
+          Vec2fa f,dfdu,dfdv; curve2d.eval(c.x,c.y,f,dfdu,dfdv); // evaluated on the unclipped curve2d at the box center
+          const LinearSpace2fa rcp_J = rcp(LinearSpace2fa(dfdu,dfdv));
+          const Vec2fa c1 = c - rcp_J*f;
+          
+          /* calculate bounds of derivatives */
+          const BBox2fa bounds_du = (1.0f/cu.size())*curve2.derivative_u().bounds(); // NOTE(review): the 1/size factor presumably rescales derivatives of the clipped patch to the original parameter scale - confirm
+          const BBox2fa bounds_dv = (1.0f/cv.size())*curve2.derivative_v().bounds();
+
+          /* calculate krawczyk test */
+          LinearSpace2<Vec2<Interval1f>> I(Interval1f(1.0f), Interval1f(0.0f),
+                                           Interval1f(0.0f), Interval1f(1.0f));
+
+          LinearSpace2<Vec2<Interval1f>> G(Interval1f(bounds_du.lower.x,bounds_du.upper.x), Interval1f(bounds_dv.lower.x,bounds_dv.upper.x),
+                                           Interval1f(bounds_du.lower.y,bounds_du.upper.y), Interval1f(bounds_dv.lower.y,bounds_dv.upper.y));
+
+          const LinearSpace2<Vec2f> rcp_J2(rcp_J);
+          const LinearSpace2<Vec2<Interval1f>> rcp_Ji(rcp_J2);
+          
+          const Vec2<Interval1f> x(cu,cv);
+          const Vec2<Interval1f> K = Vec2<Interval1f>(Vec2f(c1)) + (I - rcp_Ji*G)*(x-Vec2<Interval1f>(Vec2f(c))); // K = c1 + (I - rcp_J*G)*(x - c), evaluated on intervals
+
+          /* test if there is no solution */
+          const Vec2<Interval1f> KK = intersect(K,x);
+          if (unlikely(isEmpty(KK.x) || isEmpty(KK.y))) return true;
+
+          /* exit if convergence cannot get proven, but terminate if we are very small */
+          if (unlikely(!subset(K,x) && !very_small)) return false;
+
+          /* solve using newton raphson iteration if convergence is guaranteed */
+          solve_newton_raphson_loop(cu,cv,c1,dfdu,dfdv,rcp_J);
+          return true;
+        }
+
+        __forceinline void solve_newton_raphson_no_recursion(BBox1f cu, BBox1f cv)
+        {
+          /* tighten cv first; the Newton-Raphson solve only runs if clip_v reports a possible root */
+          if (clip_v(cu,cv)) solve_newton_raphson(cu,cv);
+        }
+        
+        __forceinline void solve_newton_raphson_recursion(BBox1f cu, BBox1f cv)
+        { // iterative subdivision in u with an explicit stack: each entry holds a (cu,cv) box and a bitmask of its sub-segments still to be processed
+          unsigned int sptr = 0;
+          const unsigned int stack_size = 4;
+          unsigned int mask_stack[stack_size];
+          BBox1f cu_stack[stack_size];
+          BBox1f cv_stack[stack_size];
+          goto entry; // the initial box skips the pop and the krawczyk test
+          
+          /* terminate if stack is empty */
+          while (sptr)
+          {
+            /* pop from stack */
+            {
+              sptr--;
+              size_t mask = mask_stack[sptr];
+              cu = cu_stack[sptr];
+              cv = cv_stack[sptr];
+              const size_t i = bscf(mask); // NOTE(review): bscf presumably returns the index of a set bit and clears it in mask - confirm
+              mask_stack[sptr] = mask;
+              if (mask) sptr++; // there are still items on the stack
+              
+              /* process next element recurse into each hit curve segment */
+              const float u0 = float(i+0)*(1.0f/(VSIZEX-1)); // segment i covers [i,i+1]/(VSIZEX-1) of the parent u range
+              const float u1 = float(i+1)*(1.0f/(VSIZEX-1));
+              const BBox1f cui(lerp(cu.lower,cu.upper,u0),lerp(cu.lower,cu.upper,u1));
+              cu = cui;
+            }
+
+#if 0
+            solve_newton_raphson_no_recursion(cu,cv);
+            continue;
+            
+#else
+            /* we assume convergence for small u ranges and verify using krawczyk */
+            if (cu.size() < 1.0f/6.0f) {
+              const bool very_small = cu.size() < 0.001f || sptr >= stack_size; // a full stack is treated like a very small box
+              if (solve_krawczyk(very_small,cu,cv)) {
+                continue;
+              }
+            }
+#endif
+
+          entry:
+          
+            /* split the curve into VSIZEX-1 segments in u-direction */
+            vboolx valid = true;
+            TensorLinearCubicBezierSurface<Vec2vfx> subcurves = curve2d.clip_v(cv).vsplit_u(valid,cu);
+            
+            /* slabs test in u-direction */
+            Vec2vfx ndv = cross(subcurves.axis_v());
+            BBox<vfloatx> boundsv = subcurves.vxfm(ndv).bounds();
+            valid &= boundsv.lower <= eps;
+            valid &= boundsv.upper >= -eps;
+            if (none(valid)) continue; // continue re-evaluates while(sptr); with an empty stack this ends the search
+
+            /* slabs test in v-direction */
+            Vec2vfx ndu = cross(subcurves.axis_u());
+            BBox<vfloatx> boundsu = subcurves.vxfm(ndu).bounds();
+            valid &= boundsu.lower <= eps;
+            valid &= boundsu.upper >= -eps;
+            if (none(valid)) continue;
+
+            /* push valid segments to stack */
+            assert(sptr < stack_size);
+            mask_stack [sptr] = movemask(valid);
+            cu_stack   [sptr] = cu;
+            cv_stack   [sptr] = cv;
+            sptr++;
+          }
+        }
+        
+        /* Entry point of the Newton-Raphson based solver: searches the whole [0,1]x[0,1] parameter domain. */
+        /* Returns isHit, which accumulates the results of the epilog calls. */
+        __forceinline bool solve_newton_raphson_main()
+        {
+          solve_newton_raphson_recursion(BBox1f(0.0f,1.0f),BBox1f(0.0f,1.0f));
+          return isHit;
+        }
+      };
+
+
+    /* Single-ray front end for oriented curves: turns the control points into a tensor surface and runs the Newton-Raphson surface solver on it. */
+    template<template<typename Ty> class SourceCurve>
+      struct OrientedCurve1Intersector1
+    {
+      typedef SourceCurve<Vec3ff> SourceCurve3ff; // curve type built from the Vec3ff control points
+      typedef SourceCurve<Vec3fa> SourceCurve3fa; // curve type built from the Vec3fa control points
+
+      /* both constructors are empty; the ray and pointer arguments are ignored */
+      __forceinline OrientedCurve1Intersector1() {}
+      __forceinline OrientedCurve1Intersector1(const Ray& ray, const void* ptr) {}
+
+      /* intersects a curve given by its center points (v*i) and normal points (n*i); returns the solver result */
+      template<typename Epilog>
+      __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                IntersectContext* context,
+                                const CurveGeometry* geom, const unsigned int primID, 
+                                const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i,
+                                const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i,
+                                const Epilog& epilog) const
+      {
+        typedef TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog> SurfaceSolver;
+        STAT3(normal.trav_prims,1,1,1);
+        SourceCurve3ff centers(v0i,v1i,v2i,v3i);
+        SourceCurve3fa normals(n0i,n1i,n2i,n3i);
+        centers = enlargeRadiusToMinWidth(context,geom,ray.org,centers);
+        const TensorLinearCubicBezierSurface3fa surface = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(centers,normals);
+        return SurfaceSolver(pre.ray_space,ray,surface,epilog).solve_newton_raphson_main(); // solve_bezier_clipping() is the alternative solver
+      }
+
+      /* intersects an already assembled surface */
+      template<typename Epilog>
+      __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                IntersectContext* context,
+                                const CurveGeometry* geom, const unsigned int primID,
+                                const TensorLinearCubicBezierSurface3fa& curve, const Epilog& epilog) const
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        return TensorLinearCubicBezierSurfaceIntersector<Ray,Epilog>(pre.ray_space,ray,curve,epilog).solve_newton_raphson_main();
+      }
+    };
+
+    /* Ray-packet front end for oriented curves: wraps lane k of a RayK<K> in a scalar ray view and runs the Newton-Raphson surface solver on it. */
+    template<template<typename Ty> class SourceCurve, int K>
+      struct OrientedCurve1IntersectorK
+    {
+      typedef SourceCurve<Vec3ff> SourceCurve3ff;
+      typedef SourceCurve<Vec3fa> SourceCurve3fa;
+
+      /* scalar view of lane k: org, dir and _tnear are copies, tfar is a reference into the packet */
+      struct Ray1
+      {
+        Vec3fa org;
+        Vec3fa dir;
+        float _tnear;
+        float& tfar; // bound to ray.tfar[k]
+
+        __forceinline Ray1(RayK<K>& ray, size_t k)
+          : org(ray.org.x[k],ray.org.y[k],ray.org.z[k]),
+            dir(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]),
+            _tnear(ray.tnear()[k]), tfar(ray.tfar[k]) {}
+
+        __forceinline float& tnear() { return _tnear; }
+        __forceinline const float& tnear() const { return _tnear; }
+      };
+
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3ff& v0i, const Vec3ff& v1i, const Vec3ff& v2i, const Vec3ff& v3i,
+                                   const Vec3fa& n0i, const Vec3fa& n1i, const Vec3fa& n2i, const Vec3fa& n3i,
+                                   const Epilog& epilog)
+      {
+        typedef TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog> SurfaceSolver;
+        STAT3(normal.trav_prims,1,1,1);
+        Ray1 ray(vray,k);
+        SourceCurve3ff centers(v0i,v1i,v2i,v3i);
+        SourceCurve3fa normals(n0i,n1i,n2i,n3i);
+        centers = enlargeRadiusToMinWidth(context,geom,ray.org,centers);
+        const TensorLinearCubicBezierSurface3fa surface = TensorLinearCubicBezierSurface3fa::fromCenterAndNormalCurve(centers,normals);
+        return SurfaceSolver(pre.ray_space[k],ray,surface,epilog).solve_newton_raphson_main(); // solve_bezier_clipping() is the alternative solver
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const TensorLinearCubicBezierSurface3fa& curve,
+                                   const Epilog& epilog)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Ray1 ray(vray,k);
+        return TensorLinearCubicBezierSurfaceIntersector<Ray1,Epilog>(pre.ray_space[k],ray,curve,epilog).solve_newton_raphson_main();
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h
new file mode 100644
index 0000000000..6e9fc91925
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_precalculations.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/geometry.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /* Per-ray data for the curve intersectors: a scale derived from the ray direction length and a ray-aligned frame. */
+    struct CurvePrecalculations1
+    {
+      float depth_scale;        // rsqrt(dot(dir,dir))
+      LinearSpace3fa ray_space; // transposed frame around the scaled direction, with vz scaled by depth_scale
+
+      __forceinline CurvePrecalculations1() {} // depth_scale is not initialized by this constructor
+      __forceinline CurvePrecalculations1(const Ray& ray, const void* ptr)
+        : depth_scale(rsqrt(dot(ray.dir,ray.dir)))
+      {
+        LinearSpace3fa basis = frame(depth_scale*ray.dir);
+        basis.vz *= depth_scale;
+        ray_space = basis.transposed();
+      }
+    };
+    
+    /* Packet variant of the curve precalculations: one depth scale per lane and one ray space per active lane. */
+    template<int K>
+      struct CurvePrecalculationsK
+    {
+      vfloat<K> depth_scale;       // rsqrt(dot(dir,dir)) for all lanes
+      LinearSpace3fa ray_space[K]; // assigned only for the lanes set in valid
+      __forceinline CurvePrecalculationsK(const vbool<K>& valid, const RayK<K>& ray)
+        : depth_scale(rsqrt(dot(ray.dir,ray.dir)))
+      {
+        /* visit the active lanes one by one; bscf is expected to return a set bit index and clear it in the mask */
+        for (size_t active = movemask(valid); active != 0; ) {
+          const size_t lane = bscf(active);
+          const Vec3fa dir(ray.dir.x[lane],ray.dir.y[lane],ray.dir.z[lane]);
+          LinearSpace3fa basis = frame(depth_scale[lane]*dir);
+          basis.vz *= depth_scale[lane];
+          ray_space[lane] = basis.transposed();
+        }
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h
new file mode 100644
index 0000000000..a99cf99d56
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_ribbon.h
@@ -0,0 +1,214 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "quad_intersector.h"
+#include "curve_intersector_precalculations.h"
+
+#define Bezier1Intersector1 RibbonCurve1Intersector1
+#define Bezier1IntersectorK RibbonCurve1IntersectorK
+
+namespace embree
+{
+  namespace isa
+  {
+    /* Hit record for a batch of M ribbon segments; finalize() fills the vu/vv/vt values that uv(), t() and Ng() read. */
+    template<typename NativeCurve3ff, int M>
+    struct RibbonHit
+    {
+      __forceinline RibbonHit() {}
+
+      /* i is the index of the first segment of the batch and N the total segment count, both used by finalize() */
+      __forceinline RibbonHit(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const int i, const int N,
+                              const NativeCurve3ff& curve3D)
+        : U(U), V(V), T(T), i(i), N(N), curve3D(curve3D), valid(valid) {}
+
+      /* curve parameter per lane: (step + local U + batch offset i) / N; step presumably enumerates the lanes 0,1,2,... */
+      __forceinline void finalize()
+      {
+        const vfloat<M> segment_pos = vfloat<M>(step)+U+vfloat<M>(float(i));
+        vu = segment_pos*(1.0f/float(N));
+        vv = V; vt = T;
+      }
+
+      __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+      __forceinline float t  (const size_t i) const { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) const { return curve3D.eval_du(vu[i]); } // curve derivative at the hit parameter
+
+      /* values handed to the constructor */
+      vfloat<M> U;
+      vfloat<M> V;
+      vfloat<M> T;
+      int i, N;
+      NativeCurve3ff curve3D;
+
+      vbool<M> valid; // lanes that carry a hit
+      vfloat<M> vu;   // written by finalize()
+      vfloat<M> vv;   // written by finalize()
+      vfloat<M> vt;   // written by finalize()
+    };
+
+    /* squared distance of point p0 to the line p1->p2, returned as numerator (first) and denominator (second) without dividing */
+    __forceinline std::pair<vfloatx,vfloatx> sqr_point_line_distance(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2)
+    {
+      const Vec2vfx line = p2-p1;
+      const vfloatx cross2d = det(line,p1-p0);
+      return std::make_pair(cross2d*cross2d,dot(line,line));
+    }
+    
+    /* cylinder culling: true for lanes where p0 lies within distance r of the line p1->p2 (tested as num <= r*r*den, no division) */
+    __forceinline vboolx cylinder_culling_test(const Vec2vfx& p0, const Vec2vfx& p1, const Vec2vfx& p2, const vfloatx& r)
+    {
+      const std::pair<vfloatx,vfloatx> frac = sqr_point_line_distance(p0,p1,p2);
+      return frac.first <= (r*r)*frac.second;
+    }
+
+    template<typename NativeCurve3ff, typename Epilog>
+    __forceinline bool intersect_ribbon(const Vec3fa& ray_org, const Vec3fa& ray_dir, const float ray_tnear, const float& ray_tfar,
+                                        const LinearSpace3fa& ray_space, const float& depth_scale,
+                                        const NativeCurve3ff& curve3D, const int N,
+                                        const Epilog& epilog)
+    { // tests the ray against the N segments of curve3D in batches of VSIZEX; returns the OR of the epilog results (ray_dir is not read here)
+      /* transform control points into ray space */
+      const NativeCurve3ff curve2D = curve3D.xfm_pr(ray_space,ray_org);
+      float eps = 4.0f*float(ulp)*reduce_max(max(abs(curve2D.v0),abs(curve2D.v1),abs(curve2D.v2),abs(curve2D.v3))); // threshold below which a derivative is replaced by the chord
+      
+      /* evaluate the bezier curve */
+      bool ishit = false;
+      vboolx valid = vfloatx(step) < vfloatx(float(N)); // presumably disables the lanes whose index is not below N - confirm semantics of step
+      const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(0,N);
+      const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(0,N);
+      valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w)); // keep segments whose line passes within the larger w of the origin
+      
+      if (any(valid)) 
+      {
+        Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(0,N);
+        Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(0,N);
+        dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt); // derivative below eps: use the chord p1-p0 instead
+        dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt);
+        const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f); // derivative rotated by 90 degrees in the xy plane
+        const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f);
+        const Vec3vfx nn0 = normalize(n0);
+        const Vec3vfx nn1 = normalize(n1);
+        const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0)); // quad corners: end points offset by +w and -w along the normalized perpendicular
+        const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1));
+        const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0));
+        const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1));
+        /* the quad test below is called with ray origin zero and direction (0,0,1) */
+        vfloatx vu,vv,vt;
+        vboolx valid0 = intersect_quad_backface_culling(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt);
+
+        if (any(valid0))
+        {
+          /* ignore self intersections */
+          if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) {
+            vfloatx r = lerp(p0.w, p1.w, vu);
+            valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale;
+          }
+          
+          if (any(valid0))
+          {
+            vv = madd(2.0f,vv,vfloatx(-1.0f)); // v -> 2*v-1
+            RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,0,N,curve3D);
+            ishit |= epilog(bhit.valid,bhit);
+          }
+        }
+      }
+      /* remaining batches, only entered when N exceeds VSIZEX; same steps as above with batch offset i */
+      if (unlikely(VSIZEX < N)) 
+      {
+        /* process SIMD-size many segments per iteration */
+        for (int i=VSIZEX; i<N; i+=VSIZEX)
+        {
+          /* evaluate the bezier curve */
+          vboolx valid = vintx(i)+vintx(step) < vintx(N);
+          const Vec4vfx p0 = curve2D.template eval0<VSIZEX>(i,N);
+          const Vec4vfx p1 = curve2D.template eval1<VSIZEX>(i,N);
+          valid &= cylinder_culling_test(zero,Vec2vfx(p0.x,p0.y),Vec2vfx(p1.x,p1.y),max(p0.w,p1.w));
+          if (none(valid)) continue;
+          
+          Vec3vfx dp0dt = curve2D.template derivative0<VSIZEX>(i,N);
+          Vec3vfx dp1dt = curve2D.template derivative1<VSIZEX>(i,N);
+          dp0dt = select(reduce_max(abs(dp0dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp0dt); // derivative below eps: use the chord p1-p0 instead
+          dp1dt = select(reduce_max(abs(dp1dt)) < vfloatx(eps),Vec3vfx(p1-p0),dp1dt);
+          const Vec3vfx n0(dp0dt.y,-dp0dt.x,0.0f);
+          const Vec3vfx n1(dp1dt.y,-dp1dt.x,0.0f);
+          const Vec3vfx nn0 = normalize(n0);
+          const Vec3vfx nn1 = normalize(n1);
+          const Vec3vfx lp0 = madd(p0.w,nn0,Vec3vfx(p0));
+          const Vec3vfx lp1 = madd(p1.w,nn1,Vec3vfx(p1));
+          const Vec3vfx up0 = nmadd(p0.w,nn0,Vec3vfx(p0));
+          const Vec3vfx up1 = nmadd(p1.w,nn1,Vec3vfx(p1));
+          
+          vfloatx vu,vv,vt;
+          vboolx valid0 = intersect_quad_backface_culling(valid,zero,Vec3fa(0,0,1),ray_tnear,ray_tfar,lp0,lp1,up1,up0,vu,vv,vt);
+
+          if (any(valid0))
+          {
+            /* ignore self intersections */
+            if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) {
+              vfloatx r = lerp(p0.w, p1.w, vu);
+              valid0 &= vt > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale;
+            }
+            
+            if (any(valid0))
+            {
+              vv = madd(2.0f,vv,vfloatx(-1.0f)); // v -> 2*v-1
+              RibbonHit<NativeCurve3ff,VSIZEX> bhit(valid0,vu,vv,vt,i,N,curve3D);
+              ishit |= epilog(bhit.valid,bhit);
+            }
+          }
+        }
+      }
+      return ishit;
+    }
+        
+    /* Single-ray ribbon intersector: assembles the curve from four control points and forwards to intersect_ribbon. */
+    template<template<typename Ty> class NativeCurve>
+    struct RibbonCurve1Intersector1
+    {
+      typedef NativeCurve<Vec3ff> NativeCurve3ff;
+
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculations1& pre, Ray& ray,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+                                   const Epilog& epilog)
+      {
+        const int num_segments = geom->tessellationRate; // passed to intersect_ribbon as segment count N
+        NativeCurve3ff ribbon(v0,v1,v2,v3);
+        ribbon = enlargeRadiusToMinWidth(context,geom,ray.org,ribbon);
+        /* ray data and precalculated ray space are passed through unchanged */
+        return intersect_ribbon<NativeCurve3ff>(ray.org,ray.dir,ray.tnear(),ray.tfar,pre.ray_space,pre.depth_scale,
+                                                ribbon,num_segments,epilog);
+      }
+    };
+    
+    /* Ray-packet ribbon intersector: extracts lane k of the packet and forwards to intersect_ribbon. */
+    template<template<typename Ty> class NativeCurve, int K>
+    struct RibbonCurve1IntersectorK
+    {
+      typedef NativeCurve<Vec3ff> NativeCurve3ff;
+
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& ray, size_t k,
+                                   IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3,
+                                   const Epilog& epilog)
+      {
+        const int num_segments = geom->tessellationRate; // passed to intersect_ribbon as segment count N
+        const Vec3fa org_k(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+        const Vec3fa dir_k(ray.dir.x[k],ray.dir.y[k],ray.dir.z[k]);
+        NativeCurve3ff ribbon(v0,v1,v2,v3);
+        ribbon = enlargeRadiusToMinWidth(context,geom,org_k,ribbon);
+        /* all per-lane values are read from lane k of the packet and of the precalculations */
+        return intersect_ribbon<NativeCurve3ff>(org_k,dir_k,ray.tnear()[k],ray.tfar[k],pre.ray_space[k],pre.depth_scale[k],
+                                                ribbon,num_segments,epilog);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h
new file mode 100644
index 0000000000..883cedc3d2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_sweep.h
@@ -0,0 +1,362 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "cylinder.h"
+#include "plane.h"
+#include "line_intersector.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    static const size_t numJacobianIterations = 5; // upper bound on the iterations of the loop in intersect_bezier_iterative_jacobian
+#if defined(__AVX__)
+    static const size_t numBezierSubdivisions = 2; // depth limit (maxDepth) of intersect_bezier_recursive_jacobian, which uses 8-wide vectors under __AVX__
+#else
+    static const size_t numBezierSubdivisions = 3; // same limit for the 4-wide path; the recursion stack holds numBezierSubdivisions+1 entries
+#endif
+
+    /* Hit record built by the sweep intersectors in this file and passed to the epilog. */
+    struct BezierCurveHit
+    {
+      float t;   // ray parameter of the hit
+      float u;   // curve parameter of the hit
+      float v;   // 0 unless given explicitly to the four-argument constructor
+      Vec3fa Ng; // normal, stored as passed in (not normalized here)
+
+      __forceinline BezierCurveHit() {} // t, u and v are left uninitialized; Ng is default-constructed
+
+      __forceinline BezierCurveHit(const float t_in, const float u_in, const Vec3fa& Ng_in)
+        : t(t_in), u(u_in), v(0.0f), Ng(Ng_in) {}
+
+      __forceinline BezierCurveHit(const float t_in, const float u_in, const float v_in, const Vec3fa& Ng_in)
+        : t(t_in), u(u_in), v(v_in), Ng(Ng_in) {}
+
+      __forceinline void finalize() {} // no-op
+    };
+    
+    /* Debug stand-in for the Jacobian iteration: reports the entry distance tp.lower[i] of lane i directly as a hit (only referenced from commented-out call sites). */
+    template<typename NativeCurve3ff, typename Ray, typename Epilog>
+    __forceinline bool intersect_bezier_iterative_debug(const Ray& ray, const float dt, const NativeCurve3ff& curve, size_t i, const vfloatx& u,
+                                                        const BBox<vfloatx>& tp, const BBox<vfloatx>& h0, const BBox<vfloatx>& h1, const Vec3vfx& Ng, const Vec4vfx& dP0du, const Vec4vfx& dP3du, const Epilog& epilog)
+    {
+      const float t_entry = tp.lower[i];
+      if (t_entry+dt > ray.tfar) return false; // entry lies beyond the current end of the ray
+      Vec3fa normal(Ng.x[i],Ng.y[i],Ng.z[i]);
+      if (h0.lower[i] == t_entry) normal = -Vec3fa(dP0du.x[i],dP0du.y[i],dP0du.z[i]); // entry coincides with the lower bound of h0
+      if (h1.lower[i] == t_entry) normal = +Vec3fa(dP3du.x[i],dP3du.y[i],dP3du.z[i]); // entry coincides with the lower bound of h1
+      BezierCurveHit hit(t_entry+dt,u[i],normal); // kept as a named object because it is handed to epilog as an lvalue
+      return epilog(hit);
+    }
+    /* Refines one hit candidate: Newton iteration on (u,t) for the system f=0, g=0 defined inside the loop. Calls epilog and returns its result once both residuals are below their error bounds; returns false otherwise. */
+    template<typename NativeCurve3ff, typename Ray, typename Epilog> 
+     __forceinline bool intersect_bezier_iterative_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve, float u, float t, const Epilog& epilog)
+    {
+      const Vec3fa org = zero; // the ray is taken to start at the origin here; dt is added back to t before the range test below
+      const Vec3fa dir = ray.dir;
+      const float length_ray_dir = length(dir);
+
+      /* error of curve evaluations is proportional to largest coordinate */
+      const BBox3ff box = curve.bounds();
+      const float P_err = 16.0f*float(ulp)*reduce_max(max(abs(box.lower),abs(box.upper)));
+     
+      for (size_t i=0; i<numJacobianIterations; i++) 
+      {
+        const Vec3fa Q = madd(Vec3fa(t),dir,org); // point on the ray for the current t
+        //const Vec3fa dQdu = zero;
+        const Vec3fa dQdt = dir;
+        const float Q_err = 16.0f*float(ulp)*length_ray_dir*t; // works as org=zero here
+           
+        Vec3ff P,dPdu,ddPdu; curve.eval(u,P,dPdu,ddPdu); // curve point and first/second u-derivative for the current u
+        //const Vec3fa dPdt = zero;
+
+        const Vec3fa R = Q-P; // vector from the curve point to the ray point
+        const float len_R = length(R); //reduce_max(abs(R));
+        const float R_err = max(Q_err,P_err);
+        const Vec3fa dRdu = /*dQdu*/-dPdu;
+        const Vec3fa dRdt = dQdt;//-dPdt;
+
+        const Vec3fa T = normalize(dPdu); // unit tangent of the curve at u
+        const Vec3fa dTdu = dnormalize(dPdu,ddPdu);
+        //const Vec3fa dTdt = zero;
+        const float cos_err = P_err/length(dPdu);
+
+        /* Error estimate for dot(R,T):
+
+           dot(R,T) = cos(R,T) |R| |T|
+                    = (cos(R,T) +- cos_error) * (|R| +- |R|_err) * (|T| +- |T|_err)
+                    = cos(R,T)*|R|*|T| 
+                      +- cos(R,T)*(|R|*|T|_err + |T|*|R|_err)
+                      +- cos_error*(|R| + |T|)
+                      +- lower order terms
+           with cos(R,T) being in [0,1] and |T| = 1 we get:
+             dot(R,T)_err = |R|*|T|_err + |R|_err + cos_error*(|R|+1)
+        */
+              
+        const float f = dot(R,T); // first equation: f=0 when R is perpendicular to the tangent
+        const float f_err = len_R*P_err + R_err + cos_err*(1.0f+len_R);
+        const float dfdu = dot(dRdu,T) + dot(R,dTdu);
+        const float dfdt = dot(dRdt,T);// + dot(R,dTdt);
+
+        const float K = dot(R,R)-sqr(f); // squared length of the component of R perpendicular to T
+        const float dKdu = /*2.0f*/(dot(R,dRdu)-f*dfdu); // the omitted factor 2.0f cancels against the 0.5f omitted in dgdu/dgdt
+        const float dKdt = /*2.0f*/(dot(R,dRdt)-f*dfdt);
+        const float rsqrt_K = rsqrt(K);
+
+        const float g = sqrt(K)-P.w; // second equation: g=0 when that perpendicular distance equals P.w
+        const float g_err = R_err + f_err + 16.0f*float(ulp)*box.upper.w;
+        const float dgdu = /*0.5f*/dKdu*rsqrt_K-dPdu.w;
+        const float dgdt = /*0.5f*/dKdt*rsqrt_K;//-dPdt.w;
+
+        const LinearSpace2f J = LinearSpace2f(dfdu,dfdt,dgdu,dgdt); // Jacobian of (f,g) with respect to (u,t)
+        const Vec2f dut = rcp(J)*Vec2f(f,g); // Newton step
+        const Vec2f ut = Vec2f(u,t) - dut;
+        u = ut.x; t = ut.y;
+
+        if (abs(f) < f_err && abs(g) < g_err) // convergence is judged on the residuals of the previous (u,t); the updated values are what gets reported
+        {
+          t+=dt;
+          if (!(ray.tnear() <= t && t <= ray.tfar)) return false; // rejects NaNs
+          if (!(u >= 0.0f && u <= 1.0f)) return false; // rejects NaNs
+          const Vec3fa R = normalize(Q-P); // shadows the outer R; Q and P are still those of the previous (u,t)
+          const Vec3fa U = madd(Vec3fa(dPdu.w),R,dPdu);
+          const Vec3fa V = cross(dPdu,R);
+          BezierCurveHit hit(t,u,cross(V,U)); // cross(V,U) is reported as the hit normal
+          return epilog(hit);
+        }
+      }
+      return false; // no convergence within numJacobianIterations iterations
+    }
+    /* Searches the parameter range [u0,u1] of curve for hits with ray: each level evaluates the curve at vfloatx::size parameters, bounds every segment between neighbouring samples by an outer and an inner cylinder plus two cap half-planes, and either pushes the segment for further subdivision or hands it to intersect_bezier_iterative_jacobian. Returns true if any of those calls returned true. */
+    template<typename NativeCurve3ff, typename Ray, typename Epilog>
+    bool intersect_bezier_recursive_jacobian(const Ray& ray, const float dt, const NativeCurve3ff& curve,
+                                             float u0, float u1, unsigned int depth, const Epilog& epilog)
+    {
+#if defined(__AVX__)
+      typedef vbool8 vboolx; // maximally 8-wide to work around KNL issues
+      typedef vint8 vintx; 
+      typedef vfloat8 vfloatx;
+#else
+      typedef vbool4 vboolx;
+      typedef vint4 vintx; 
+      typedef vfloat4 vfloatx;
+#endif
+      typedef Vec3<vfloatx> Vec3vfx; // these local typedefs are the vector types used throughout this function
+      typedef Vec4<vfloatx> Vec4vfx;
+    
+      unsigned int maxDepth = numBezierSubdivisions;
+      bool found = false;
+      const Vec3fa org = zero; // all distances below are relative to this origin; dt is added when comparing against ray.tfar
+      const Vec3fa dir = ray.dir;
+
+      unsigned int sptr = 0; // number of entries currently on the stack
+      const unsigned int stack_size = numBezierSubdivisions+1; // +1 because of unstable workaround below
+      struct StackEntry {
+        vboolx valid;       // sub-segments (lanes) of this entry that still have to be visited
+        vfloatx tlower;     // per-lane entry distance, used for ordering and culling on pop
+        float u0;           // parameter range that was subdivided to produce the lanes
+        float u1;
+        unsigned int depth; // depth assigned to the sub-segments
+      };
+      StackEntry stack[stack_size];
+      goto entry; // first pass: jump into the loop body past the pop, processing [u0,u1] at the incoming depth
+
+       /* terminate if stack is empty */
+      while (sptr)
+      {
+        /* pop from stack */
+        {
+          sptr--;
+          vboolx valid = stack[sptr].valid;
+          const vfloatx tlower = stack[sptr].tlower;
+          valid &= tlower+dt <= ray.tfar; // drop lanes that start beyond the current ray.tfar
+          if (none(valid)) continue;
+          u0 = stack[sptr].u0;
+          u1 = stack[sptr].u1;
+          depth = stack[sptr].depth;
+          const size_t i = select_min(valid,tlower); clear(valid,i); // take the valid lane with the smallest tlower and remove it from the entry
+          stack[sptr].valid = valid;
+          if (any(valid)) sptr++; // there are still items on the stack
+
+          /* process next segment */
+          const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1)));
+          u0 = vu0[i+0]; // lane i covers the range between samples i and i+1 of the stored range
+          u1 = vu0[i+1];
+        }
+      entry:
+
+        /* subdivide curve */
+        const float dscale = (u1-u0)*(1.0f/(3.0f*(vfloatx::size-1))); // one third of the parameter length of a single segment
+        const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(vfloatx::size-1)));
+        Vec4vfx P0, dP0du; curve.veval(vu0,P0,dP0du); dP0du = dP0du * Vec4vfx(dscale);
+        const Vec4vfx P3 = shift_right_1(P0); // presumably lane j receives the sample of lane j+1 (segment end point) - confirm against shift_right_1
+        const Vec4vfx dP3du = shift_right_1(dP0du); 
+        const Vec4vfx P1 = P0 + dP0du; // P0,P1,P2,P3: cubic Bezier control points of each segment built from end points and scaled derivatives
+        const Vec4vfx P2 = P3 - dP3du;
+        
+        /* calculate bounding cylinders */
+        const vfloatx rr1 = sqr_point_to_line_distance(Vec3vfx(dP0du),Vec3vfx(P3-P0));
+        const vfloatx rr2 = sqr_point_to_line_distance(Vec3vfx(dP3du),Vec3vfx(P3-P0));
+        const vfloatx maxr12 = sqrt(max(rr1,rr2));
+        const vfloatx one_plus_ulp  = 1.0f+2.0f*float(ulp);
+        const vfloatx one_minus_ulp = 1.0f-2.0f*float(ulp);
+        vfloatx r_outer = max(P0.w,P1.w,P2.w,P3.w)+maxr12; // largest control-point w plus the deviation bound maxr12
+        vfloatx r_inner = min(P0.w,P1.w,P2.w,P3.w)-maxr12; // smallest control-point w minus the deviation bound (clamped to 0 below)
+        r_outer = one_plus_ulp*r_outer;
+        r_inner = max(0.0f,one_minus_ulp*r_inner);
+        const CylinderN<vfloatx::size> cylinder_outer(Vec3vfx(P0),Vec3vfx(P3),r_outer);
+        const CylinderN<vfloatx::size> cylinder_inner(Vec3vfx(P0),Vec3vfx(P3),r_inner);
+        vboolx valid = true; clear(valid,vfloatx::size-1); // the last lane is masked out: vfloatx::size samples give vfloatx::size-1 segments
+        
+        /* intersect with outer cylinder */
+        BBox<vfloatx> tc_outer; vfloatx u_outer0; Vec3vfx Ng_outer0; vfloatx u_outer1; Vec3vfx Ng_outer1;
+        valid &= cylinder_outer.intersect(org,dir,tc_outer,u_outer0,Ng_outer0,u_outer1,Ng_outer1);
+        if (none(valid)) continue;
+        
+        /* intersect with cap-planes */
+        BBox<vfloatx> tp(ray.tnear()-dt,ray.tfar-dt); // ray interval expressed relative to the shifted origin
+        tp = embree::intersect(tp,tc_outer);
+        BBox<vfloatx> h0 = HalfPlaneN<vfloatx::size>(Vec3vfx(P0),+Vec3vfx(dP0du)).intersect(org,dir);
+        tp = embree::intersect(tp,h0);
+        BBox<vfloatx> h1 = HalfPlaneN<vfloatx::size>(Vec3vfx(P3),-Vec3vfx(dP3du)).intersect(org,dir);
+        tp = embree::intersect(tp,h1);
+        valid &= tp.lower <= tp.upper;
+        if (none(valid)) continue;
+        
+        /* clamp and correct u parameter */
+        u_outer0 = clamp(u_outer0,vfloatx(0.0f),vfloatx(1.0f));
+        u_outer1 = clamp(u_outer1,vfloatx(0.0f),vfloatx(1.0f));
+        u_outer0 = lerp(u0,u1,(vfloatx(step)+u_outer0)*(1.0f/float(vfloatx::size))); // NOTE(review): samples above are spaced 1/(vfloatx::size-1) but this maps with 1/vfloatx::size; the value only seeds the Newton iteration - confirm intended
+        u_outer1 = lerp(u0,u1,(vfloatx(step)+u_outer1)*(1.0f/float(vfloatx::size)));
+        
+        /* intersect with inner cylinder */
+        BBox<vfloatx> tc_inner;
+        vfloatx u_inner0 = zero; Vec3vfx Ng_inner0 = zero; vfloatx u_inner1 = zero; Vec3vfx Ng_inner1 = zero;
+        const vboolx valid_inner = cylinder_inner.intersect(org,dir,tc_inner,u_inner0,Ng_inner0,u_inner1,Ng_inner1);
+        
+        /* at the unstable area we subdivide deeper */
+        const vboolx unstable0 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner0)) < 0.3f); // inner cylinder missed, or |dot(ray.dir, inner normal)| below 0.3
+        const vboolx unstable1 = (!valid_inner) | (abs(dot(Vec3vfx(Vec3fa(ray.dir)),Ng_inner1)) < 0.3f);
+      
+        /* subtract the inner interval from the current hit interval */
+        BBox<vfloatx> tp0, tp1;
+        subtract(tp,tc_inner,tp0,tp1);
+        vboolx valid0 = valid & (tp0.lower <= tp0.upper);
+        vboolx valid1 = valid & (tp1.lower <= tp1.upper);
+        if (none(valid0 | valid1)) continue;
+        
+        /* iterate over all first hits front to back */
+        const vintx termDepth0 = select(unstable0,vintx(maxDepth+1),vintx(maxDepth)); // unstable lanes are allowed one extra level
+        vboolx recursion_valid0 = valid0 & (depth < termDepth0); // lanes above the termination depth are subdivided further
+        valid0 &= depth >= termDepth0; // the remaining lanes are solved iteratively right away
+        
+        while (any(valid0))
+        {
+          const size_t i = select_min(valid0,tp0.lower); clear(valid0,i);
+          found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer0[i],tp0.lower[i],epilog);
+          //found = found | intersect_bezier_iterative_debug   (ray,dt,curve,i,u_outer0,tp0,h0,h1,Ng_outer0,dP0du,dP3du,epilog);
+          valid0 &= tp0.lower+dt <= ray.tfar; // re-test against ray.tfar, which presumably can be shortened by the epilog - confirm
+        }
+        valid1 &= tp1.lower+dt <= ray.tfar;
+        
+        /* iterate over all second hits front to back */
+        const vintx termDepth1 = select(unstable1,vintx(maxDepth+1),vintx(maxDepth));
+        vboolx recursion_valid1 = valid1 & (depth < termDepth1);
+        valid1 &= depth >= termDepth1;
+        while (any(valid1))
+        {
+          const size_t i = select_min(valid1,tp1.lower); clear(valid1,i);
+          found = found | intersect_bezier_iterative_jacobian(ray,dt,curve,u_outer1[i],tp1.upper[i],epilog); // starts from the upper end of tp1 (the first-hit loop starts from tp0.lower)
+          //found = found | intersect_bezier_iterative_debug   (ray,dt,curve,i,u_outer1,tp1,h0,h1,Ng_outer1,dP0du,dP3du,epilog);
+          valid1 &= tp1.lower+dt <= ray.tfar;
+        }
+
+        /* push valid segments to stack */
+        recursion_valid0 &= tp0.lower+dt <= ray.tfar;
+        recursion_valid1 &= tp1.lower+dt <= ray.tfar;
+        const vboolx recursion_valid = recursion_valid0 | recursion_valid1;
+        if (any(recursion_valid))
+        {
+          assert(sptr < stack_size);
+          stack[sptr].valid = recursion_valid;
+          stack[sptr].tlower = select(recursion_valid0,tp0.lower,tp1.lower); // first-hit entry distance where available, otherwise the second-hit one
+          stack[sptr].u0 = u0; // the range just subdivided; the pop code recomputes each lane's sub-range from it
+          stack[sptr].u1 = u1;
+          stack[sptr].depth = depth+1;
+          sptr++;
+        }
+      }
+      return found;
+    }
+
+    /* Single-ray sweep-curve intersector: shifts the curve next to the origin and runs intersect_bezier_recursive_jacobian on it. */
+    template<template<typename Ty> class NativeCurve>
+    struct SweepCurve1Intersector1
+    {
+      typedef NativeCurve<Vec3ff> NativeCurve3ff;
+
+      /* v0..v3 are the control points of the curve; the return value is that of the recursive search over u in [0,1]. */
+      template<typename Epilog>
+      __noinline bool intersect(const CurvePrecalculations1& pre, Ray& ray, IntersectContext* context,
+                                const CurveGeometry* geom, const unsigned int primID,
+                                const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, const Epilog& epilog)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        NativeCurve3ff widened(v0,v1,v2,v3);
+        widened = enlargeRadiusToMinWidth(context,geom,ray.org,widened);
+        /* dt: ray parameter of the point nearest the curve centre; the curve is moved so that point becomes the origin (stability) */
+        const float dirSqr = dot(ray.dir,ray.dir);
+        const float dt = dot(widened.center()-ray.org,ray.dir)*rcp(dirSqr);
+        const Vec3ff pivot(madd(Vec3fa(dt),ray.dir,ray.org),0.0f); // w is 0.0f, so only x,y,z are shifted
+        const NativeCurve3ff shifted = widened-pivot;
+        return intersect_bezier_recursive_jacobian(ray,dt,shifted,0.0f,1.0f,1,epilog);
+      }
+    };
+
+    /* Sweep-curve intersector for one ray (index k) of a K-wide ray packet. */
+    template<template<typename Ty> class NativeCurve, int K>
+    struct SweepCurve1IntersectorK
+    {
+      typedef NativeCurve<Vec3ff> NativeCurve3ff;
+
+      /* Scalar view of one packet ray: org, dir and _tnear are copies, tfar refers to the packet's own storage. */
+      struct Ray1
+      {
+        Vec3fa org;
+        Vec3fa dir;
+        float _tnear;
+        float& tfar; // bound to packet.tfar[lane], so writes through it land in the packet
+
+        __forceinline Ray1(RayK<K>& packet, size_t lane)
+          : org(packet.org.x[lane],packet.org.y[lane],packet.org.z[lane]),
+            dir(packet.dir.x[lane],packet.dir.y[lane],packet.dir.z[lane]),
+            _tnear(packet.tnear()[lane]), tfar(packet.tfar[lane]) {}
+
+        __forceinline float& tnear() { return _tnear; } // tnear is reached through a call, tfar through the member
+        __forceinline const float& tnear() const { return _tnear; }
+      };
+
+      /* Extracts ray k from vray and searches the curve (u in [0,1]) via intersect_bezier_recursive_jacobian. */
+      template<typename Epilog>
+      __forceinline bool intersect(const CurvePrecalculationsK<K>& pre, RayK<K>& vray, size_t k, IntersectContext* context,
+                                   const CurveGeometry* geom, const unsigned int primID,
+                                   const Vec3ff& v0, const Vec3ff& v1, const Vec3ff& v2, const Vec3ff& v3, const Epilog& epilog)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Ray1 ray(vray,k);
+        NativeCurve3ff widened(v0,v1,v2,v3);
+        widened = enlargeRadiusToMinWidth(context,geom,ray.org,widened);
+        /* dt: ray parameter of the point nearest the curve centre; the curve is moved so that point becomes the origin (stability) */
+        const float dirSqr = dot(ray.dir,ray.dir);
+        const float dt = dot(widened.center()-ray.org,ray.dir)*rcp(dirSqr);
+        const Vec3ff pivot(madd(Vec3fa(dt),ray.dir,ray.org),0.0f); // w is 0.0f, so only x,y,z are shifted
+        const NativeCurve3ff shifted = widened-pivot;
+        return intersect_bezier_recursive_jacobian(ray,dt,shifted,0.0f,1.0f,1,epilog);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h
new file mode 100644
index 0000000000..e1f4238130
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual.h
@@ -0,0 +1,671 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../subdiv/bezier_curve.h"
+#include "../common/primref.h"
+#include "curve_intersector_precalculations.h"
+#include "../bvh/node_intersector1.h"
+#include "../bvh/node_intersector_packet.h"
+
+#include "intersector_epilog.h"
+
+#include "../subdiv/bezier_curve.h"
+#include "../subdiv/bspline_curve.h"
+#include "../subdiv/hermite_curve.h"
+#include "../subdiv/catmullrom_curve.h"
+
+#include "spherei_intersector.h"
+#include "disci_intersector.h"
+
+#include "linei_intersector.h"
+#include "roundlinei_intersector.h"
+#include "conelinei_intersector.h"
+
+#include "curveNi_intersector.h"
+#include "curveNv_intersector.h"
+#include "curveNi_mb_intersector.h"
+
+#include "curve_intersector_distance.h"
+#include "curve_intersector_ribbon.h"
+#include "curve_intersector_oriented.h"
+#include "curve_intersector_sweep.h"
+
+namespace embree
+{
+  /* Dispatch table of type-erased curve intersection routines, one Intersectors row per geometry type (see vtbl). */
+  struct VirtualCurveIntersector
+  {
+    /* single-ray entry points */
+    typedef void (*Intersect1Ty)(void* pre, void* ray, IntersectContext* context, const void* primitive);
+    typedef bool (*Occluded1Ty )(void* pre, void* ray, IntersectContext* context, const void* primitive);
+    /* packet entry points: k selects the ray inside the packet; every packet width shares this signature */
+    typedef void (*Intersect4Ty)(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    typedef bool (*Occluded4Ty) (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+    typedef Intersect4Ty Intersect8Ty;
+    typedef Occluded4Ty  Occluded8Ty;
+    typedef Intersect4Ty Intersect16Ty;
+    typedef Occluded4Ty  Occluded16Ty;
+
+    /* One table row: the function pointers registered for a single geometry type. */
+    struct Intersectors
+    {
+      Intersectors() {} // WARNING: Do not zero initialize this, as we otherwise get problems with thread unsafe local static variable initialization (e.g. on VS2013) in curve_intersector_virtual.cpp.
+
+      /* single-ray dispatch (no ray index) */
+      template<int K> void intersect(void* pre, void* ray, IntersectContext* context, const void* primitive);
+      template<int K> bool occluded (void* pre, void* ray, IntersectContext* context, const void* primitive);
+      /* dispatch for ray k of a K-wide packet */
+      template<int K> void intersect(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+      template<int K> bool occluded (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive);
+
+      Intersect1Ty intersect1;
+      Occluded1Ty  occluded1;
+      Intersect4Ty intersect4;
+      Occluded4Ty  occluded4;
+      Intersect8Ty intersect8;
+      Occluded8Ty  occluded8;
+      Intersect16Ty intersect16;
+      Occluded16Ty  occluded16;
+    };
+
+    Intersectors vtbl[Geometry::GTY_END];
+  };
+  /* Width-specific entry points: each specialization asserts that its slot is non-null and forwards all arguments to it. */
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<1> (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(intersect1); (*intersect1)(pre,ray,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<1>  (void* pre, void* ray, IntersectContext* context, const void* primitive) { assert(occluded1); return (*occluded1)(pre,ray,context,primitive); }
+  /* 4-wide packets: compiled unconditionally */
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<4>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect4); (*intersect4)(pre,ray,k,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<4> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded4); return (*occluded4)(pre,ray,k,context,primitive); }
+  /* 8- and 16-wide packets: compiled only when __AVX__ / __AVX512F__ is defined */
+#if defined(__AVX__)
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<8>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect8); (*intersect8)(pre,ray,k,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<8> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded8); return (*occluded8)(pre,ray,k,context,primitive); }
+#endif
+
+#if defined(__AVX512F__)
+  template<> __forceinline void VirtualCurveIntersector::Intersectors::intersect<16>(void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(intersect16); (*intersect16)(pre,ray,k,context,primitive); }
+  template<> __forceinline bool VirtualCurveIntersector::Intersectors::occluded<16> (void* pre, void* ray, size_t k, IntersectContext* context, const void* primitive) { assert(occluded16); return (*occluded16)(pre,ray,k,context,primitive); }
+#endif
+  
+  namespace isa
+  {
+    /* Leaf intersector for single rays: looks up the table row for the primitive's geometry type and forwards the call. */
+    struct VirtualCurveIntersector1
+    {
+      typedef unsigned char Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      /* In both functions the first byte of prim is read as the geometry type; tray and lazy_node are not used. */
+      template<int N, int Nx, bool robust> static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        assert(num == 1);
+        const RTCGeometryType geomType = static_cast<RTCGeometryType>(*prim);
+        assert(This->leafIntersector);
+        VirtualCurveIntersector* const table = (VirtualCurveIntersector*) This->leafIntersector;
+        table->vtbl[geomType].intersect<1>(&pre,&ray,context,prim);
+      }
+
+      template<int N, int Nx, bool robust> static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        assert(num == 1);
+        const RTCGeometryType geomType = static_cast<RTCGeometryType>(*prim);
+        assert(This->leafIntersector);
+        VirtualCurveIntersector* const table = (VirtualCurveIntersector*) This->leafIntersector;
+        return table->vtbl[geomType].occluded<1>(&pre,&ray,context,prim);
+      }
+    };
+
+    /* Leaf intersector for K-wide ray packets: forwards each ray to the table row of the primitive's geometry type. */
+    template<int K>
+      struct VirtualCurveIntersectorK 
+      {
+        typedef unsigned char Primitive;
+        typedef CurvePrecalculationsK<K> Precalculations;
+
+        /* packet intersect: calls the per-ray routine once for every ray whose bit is set in valid_i */
+        template<bool robust> static __forceinline void intersect(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          assert(num == 1);
+          const RTCGeometryType geomType = static_cast<RTCGeometryType>(*prim);
+          assert(This->leafIntersector);
+          VirtualCurveIntersector::Intersectors& row = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[geomType];
+          for (size_t active = movemask(valid_i); active != 0; )
+            row.intersect<K>(&pre,&ray,bscf(active),context,prim);
+        }
+
+        /* packet occlusion test: returns the mask of rays for which the per-ray routine returned true */
+        template<bool robust> static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          assert(num == 1);
+          const RTCGeometryType geomType = static_cast<RTCGeometryType>(*prim);
+          assert(This->leafIntersector);
+          VirtualCurveIntersector::Intersectors& row = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[geomType];
+          vbool<K> occludedMask = false;
+          for (size_t active = movemask(valid_i); active != 0; ) {
+            const size_t k = bscf(active);
+            if (row.occluded<K>(&pre,&ray,k,context,prim))
+              set(occludedMask, k);
+          }
+          return occludedMask;
+        }
+
+        /* intersect for the single ray k of a packet */
+        template<int N, int Nx, bool robust> static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        {
+          assert(num == 1);
+          const RTCGeometryType geomType = static_cast<RTCGeometryType>(*prim);
+          assert(This->leafIntersector);
+          VirtualCurveIntersector::Intersectors& row = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[geomType];
+          row.intersect<K>(&pre,&ray,k,context,prim);
+        }
+
+        /* occlusion test for the single ray k of a packet */
+        template<int N, int Nx, bool robust> static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        {
+          assert(num == 1);
+          const RTCGeometryType geomType = static_cast<RTCGeometryType>(*prim);
+          assert(This->leafIntersector);
+          VirtualCurveIntersector::Intersectors& row = ((VirtualCurveIntersector*) This->leafIntersector)->vtbl[geomType];
+          return row.occluded<K>(&pre,&ray,k,context,prim);
+        }
+      };
+
+    /* Returns a table row whose slots are type-erased pointers to the RoundLinearCurveMiIntersector1/K routines; the 8-/16-wide slots stay unassigned unless __AVX__/__AVX512F__ is defined. */
+    template<int N> static VirtualCurveIntersector::Intersectors LinearRoundConeNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors row;
+      row.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiIntersector1<N,N,true>::intersect;
+      row.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiIntersectorK<N,N,4,true>::intersect;
+      row.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &RoundLinearCurveMiIntersector1<N,N,true>::occluded;
+      row.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &RoundLinearCurveMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      row.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &RoundLinearCurveMiIntersectorK<N,N,8,true>::intersect;
+      row.occluded8  = (VirtualCurveIntersector::Occluded8Ty)  &RoundLinearCurveMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      row.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &RoundLinearCurveMiIntersectorK<N,N,16,true>::intersect;
+      row.occluded16  = (VirtualCurveIntersector::Occluded16Ty)  &RoundLinearCurveMiIntersectorK<N,N,16,true>::occluded;
+#endif
+      return row;
+    }
+
+    /* Returns a table row whose slots are type-erased pointers to the ConeCurveMiIntersector1/K routines; the 8-/16-wide slots stay unassigned unless __AVX__/__AVX512F__ is defined. */
+    template<int N> static VirtualCurveIntersector::Intersectors LinearConeNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors row;
+      row.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiIntersector1<N,N,true>::intersect;
+      row.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiIntersectorK<N,N,4,true>::intersect;
+      row.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &ConeCurveMiIntersector1<N,N,true>::occluded;
+      row.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &ConeCurveMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      row.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &ConeCurveMiIntersectorK<N,N,8,true>::intersect;
+      row.occluded8  = (VirtualCurveIntersector::Occluded8Ty)  &ConeCurveMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      row.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &ConeCurveMiIntersectorK<N,N,16,true>::intersect;
+      row.occluded16  = (VirtualCurveIntersector::Occluded16Ty)  &ConeCurveMiIntersectorK<N,N,16,true>::occluded;
+#endif
+      return row;
+    }
+
+    /* Returns a table row whose slots are type-erased pointers to the RoundLinearCurveMiMBIntersector1/K routines; the 8-/16-wide slots stay unassigned unless __AVX__/__AVX512F__ is defined. */
+    template<int N> static VirtualCurveIntersector::Intersectors LinearRoundConeNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors row;
+      row.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &RoundLinearCurveMiMBIntersector1<N,N,true>::intersect;
+      row.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &RoundLinearCurveMiMBIntersectorK<N,N,4,true>::intersect;
+      row.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &RoundLinearCurveMiMBIntersector1<N,N,true>::occluded;
+      row.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &RoundLinearCurveMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      row.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &RoundLinearCurveMiMBIntersectorK<N,N,8,true>::intersect;
+      row.occluded8  = (VirtualCurveIntersector::Occluded8Ty)  &RoundLinearCurveMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      row.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &RoundLinearCurveMiMBIntersectorK<N,N,16,true>::intersect;
+      row.occluded16  = (VirtualCurveIntersector::Occluded16Ty)  &RoundLinearCurveMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+      return row;
+    }
+
+    /* Returns a table row whose slots are type-erased pointers to the ConeCurveMiMBIntersector1/K routines; the 8-/16-wide slots stay unassigned unless __AVX__/__AVX512F__ is defined. */
+    template<int N> static VirtualCurveIntersector::Intersectors LinearConeNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors row;
+      row.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &ConeCurveMiMBIntersector1<N,N,true>::intersect;
+      row.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &ConeCurveMiMBIntersectorK<N,N,4,true>::intersect;
+      row.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &ConeCurveMiMBIntersector1<N,N,true>::occluded;
+      row.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &ConeCurveMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      row.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &ConeCurveMiMBIntersectorK<N,N,8,true>::intersect;
+      row.occluded8  = (VirtualCurveIntersector::Occluded8Ty)  &ConeCurveMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      row.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &ConeCurveMiMBIntersectorK<N,N,16,true>::intersect;
+      row.occluded16  = (VirtualCurveIntersector::Occluded16Ty)  &ConeCurveMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+      return row;
+    }
+
+
+    /* Returns a table row whose slots are type-erased pointers to the FlatLinearCurveMiIntersector1/K routines; the 8-/16-wide slots stay unassigned unless __AVX__/__AVX512F__ is defined. */
+    template<int N> static VirtualCurveIntersector::Intersectors LinearRibbonNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors row;
+      row.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiIntersector1<N,N,true>::intersect;
+      row.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiIntersectorK<N,N,4,true>::intersect;
+      row.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &FlatLinearCurveMiIntersector1<N,N,true>::occluded;
+      row.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &FlatLinearCurveMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      row.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &FlatLinearCurveMiIntersectorK<N,N,8,true>::intersect;
+      row.occluded8  = (VirtualCurveIntersector::Occluded8Ty)  &FlatLinearCurveMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      row.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &FlatLinearCurveMiIntersectorK<N,N,16,true>::intersect;
+      row.occluded16  = (VirtualCurveIntersector::Occluded16Ty)  &FlatLinearCurveMiIntersectorK<N,N,16,true>::occluded;
+#endif
+      return row;
+    }
+    
+    /* Returns a table row whose slots are type-erased pointers to the FlatLinearCurveMiMBIntersector1/K routines; the 8-/16-wide slots stay unassigned unless __AVX__/__AVX512F__ is defined. */
+    template<int N> static VirtualCurveIntersector::Intersectors LinearRibbonNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors row;
+      row.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &FlatLinearCurveMiMBIntersector1<N,N,true>::intersect;
+      row.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &FlatLinearCurveMiMBIntersectorK<N,N,4,true>::intersect;
+      row.occluded1  = (VirtualCurveIntersector::Occluded1Ty)  &FlatLinearCurveMiMBIntersector1<N,N,true>::occluded;
+      row.occluded4  = (VirtualCurveIntersector::Occluded4Ty)  &FlatLinearCurveMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      row.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &FlatLinearCurveMiMBIntersectorK<N,N,8,true>::intersect;
+      row.occluded8  = (VirtualCurveIntersector::Occluded8Ty)  &FlatLinearCurveMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      row.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &FlatLinearCurveMiMBIntersectorK<N,N,16,true>::intersect;
+      row.occluded16  = (VirtualCurveIntersector::Occluded16Ty)  &FlatLinearCurveMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+      return row;
+    }
+    
+    // Returns an Intersectors table wired to the SphereMiIntersector1 and SphereMiIntersectorK intersect/occluded entry points; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<int N> static VirtualCurveIntersector::Intersectors SphereNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiIntersector1<N,N,true>::intersect;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &SphereMiIntersector1<N,N,true>::occluded;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiIntersectorK<N,N,4,true>::intersect;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &SphereMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &SphereMiIntersectorK<N,N,8,true>::intersect;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &SphereMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &SphereMiIntersectorK<N,N,16,true>::intersect;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &SphereMiIntersectorK<N,N,16,true>::occluded;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to the SphereMiMBIntersector1 and SphereMiMBIntersectorK intersect/occluded entry points; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<int N> static VirtualCurveIntersector::Intersectors SphereNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &SphereMiMBIntersector1<N,N,true>::intersect;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &SphereMiMBIntersector1<N,N,true>::occluded;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &SphereMiMBIntersectorK<N,N,4,true>::intersect;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &SphereMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &SphereMiMBIntersectorK<N,N,8,true>::intersect;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &SphereMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &SphereMiMBIntersectorK<N,N,16,true>::intersect;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &SphereMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to the DiscMiIntersector1 and DiscMiIntersectorK intersect/occluded entry points; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<int N> static VirtualCurveIntersector::Intersectors DiscNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiIntersector1<N,N,true>::intersect;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &DiscMiIntersector1<N,N,true>::occluded;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiIntersectorK<N,N,4,true>::intersect;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &DiscMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &DiscMiIntersectorK<N,N,8,true>::intersect;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &DiscMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &DiscMiIntersectorK<N,N,16,true>::intersect;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &DiscMiIntersectorK<N,N,16,true>::occluded;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to the DiscMiMBIntersector1 and DiscMiMBIntersectorK intersect/occluded entry points; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<int N> static VirtualCurveIntersector::Intersectors DiscNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &DiscMiMBIntersector1<N,N,true>::intersect;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &DiscMiMBIntersector1<N,N,true>::occluded;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &DiscMiMBIntersectorK<N,N,4,true>::intersect;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &DiscMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &DiscMiMBIntersectorK<N,N,8,true>::intersect;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &DiscMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &DiscMiMBIntersectorK<N,N,16,true>::intersect;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &DiscMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to the OrientedDiscMiIntersector1 and OrientedDiscMiIntersectorK intersect/occluded entry points; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<int N> static VirtualCurveIntersector::Intersectors OrientedDiscNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiIntersector1<N,N,true>::intersect;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &OrientedDiscMiIntersector1<N,N,true>::occluded;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiIntersectorK<N,N,4,true>::intersect;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &OrientedDiscMiIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &OrientedDiscMiIntersectorK<N,N,8,true>::intersect;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &OrientedDiscMiIntersectorK<N,N,16,true>::intersect;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiIntersectorK<N,N,16,true>::occluded;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to the OrientedDiscMiMBIntersector1 and OrientedDiscMiMBIntersectorK intersect/occluded entry points; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<int N> static VirtualCurveIntersector::Intersectors OrientedDiscNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &OrientedDiscMiMBIntersector1<N,N,true>::intersect;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &OrientedDiscMiMBIntersector1<N,N,true>::occluded;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &OrientedDiscMiMBIntersectorK<N,N,4,true>::intersect;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &OrientedDiscMiMBIntersectorK<N,N,4,true>::occluded;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &OrientedDiscMiMBIntersectorK<N,N,8,true>::intersect;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &OrientedDiscMiMBIntersectorK<N,N,8,true>::occluded;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &OrientedDiscMiMBIntersectorK<N,N,16,true>::intersect;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &OrientedDiscMiMBIntersectorK<N,N,16,true>::occluded;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNiIntersector1<N> and CurveNiIntersectorK<N,K> intersect_t/occluded_t, instantiated with the RibbonCurve1 intersectors over Curve and the EpilogMU epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors RibbonNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_t<RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_t<RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNiIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_t<RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNiIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_t<RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNvIntersector1<N> and CurveNvIntersectorK<N,K> intersect_t/occluded_t, instantiated with the RibbonCurve1 intersectors over Curve and the EpilogMU epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors RibbonNvIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1<N>::template intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNvIntersector1<N>::template occluded_t<RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNvIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNvIntersectorK<N,4>::template occluded_t<RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNvIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK<N,8>::template occluded_t<RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNvIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNvIntersectorK<N,16>::template occluded_t<RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNiMBIntersector1<N> and CurveNiMBIntersectorK<N,K> intersect_t/occluded_t, instantiated with the RibbonCurve1 intersectors over Curve and the EpilogMU epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors RibbonNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_t<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_t<RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiMBIntersectorK<N,4>::template intersect_t<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_t<RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNiMBIntersectorK<N,8>::template intersect_t<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_t<RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNiMBIntersectorK<N,16>::template intersect_t<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_t<RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNiIntersector1<N> and CurveNiIntersectorK<N,K> intersect_t/occluded_t, instantiated with the SweepCurve1 intersectors over Curve and the Epilog1 epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors CurveNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_t<SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_t<SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNiIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_t<SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNiIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_t<SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNvIntersector1<N> and CurveNvIntersectorK<N,K> intersect_t/occluded_t, instantiated with the SweepCurve1 intersectors over Curve and the Epilog1 epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors CurveNvIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNvIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNvIntersector1<N>::template occluded_t<SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNvIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNvIntersectorK<N,4>::template occluded_t<SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNvIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNvIntersectorK<N,8>::template occluded_t<SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNvIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNvIntersectorK<N,16>::template occluded_t<SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNiMBIntersector1<N> and CurveNiMBIntersectorK<N,K> intersect_t/occluded_t, instantiated with the SweepCurve1 intersectors over Curve and the Epilog1 epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors CurveNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_t<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_t<SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiMBIntersectorK<N,4>::template intersect_t<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_t<SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNiMBIntersectorK<N,8>::template intersect_t<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_t<SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNiMBIntersectorK<N,16>::template intersect_t<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_t<SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNiIntersector1<N> and CurveNiIntersectorK<N,K> intersect_n/occluded_n, instantiated with the OrientedCurve1 intersectors over Curve and the Epilog1 epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors OrientedCurveNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_n<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_n<OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiIntersectorK<N,4>::template intersect_n<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_n<OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNiIntersectorK<N,8>::template intersect_n<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_n<OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNiIntersectorK<N,16>::template intersect_n<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_n<OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNiMBIntersector1<N> and CurveNiMBIntersectorK<N,K> intersect_n/occluded_n, instantiated with the OrientedCurve1 intersectors over Curve and the Epilog1 epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors OrientedCurveNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_n<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_n<OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiMBIntersectorK<N,4>::template intersect_n<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_n<OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNiMBIntersectorK<N,8>::template intersect_n<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_n<OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNiMBIntersectorK<N,16>::template intersect_n<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_n<OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNiIntersector1<N> and CurveNiIntersectorK<N,K> intersect_h/occluded_h, instantiated with the RibbonCurve1 intersectors over Curve and the EpilogMU epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors HermiteRibbonNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_h<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_h<RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiIntersectorK<N,4>::template intersect_h<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_h<RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNiIntersectorK<N,8>::template intersect_h<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_h<RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNiIntersectorK<N,16>::template intersect_h<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_h<RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNiMBIntersector1<N> and CurveNiMBIntersectorK<N,K> intersect_h/occluded_h, instantiated with the RibbonCurve1 intersectors over Curve and the EpilogMU epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors HermiteRibbonNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_h<RibbonCurve1Intersector1<Curve>, Intersect1EpilogMU<VSIZEX,true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_h<RibbonCurve1Intersector1<Curve>, Occluded1EpilogMU<VSIZEX,true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiMBIntersectorK<N,4>::template intersect_h<RibbonCurve1IntersectorK<Curve,4>, Intersect1KEpilogMU<VSIZEX,4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_h<RibbonCurve1IntersectorK<Curve,4>, Occluded1KEpilogMU<VSIZEX,4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNiMBIntersectorK<N,8>::template intersect_h<RibbonCurve1IntersectorK<Curve,8>, Intersect1KEpilogMU<VSIZEX,8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_h<RibbonCurve1IntersectorK<Curve,8>, Occluded1KEpilogMU<VSIZEX,8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNiMBIntersectorK<N,16>::template intersect_h<RibbonCurve1IntersectorK<Curve,16>, Intersect1KEpilogMU<VSIZEX,16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_h<RibbonCurve1IntersectorK<Curve,16>, Occluded1KEpilogMU<VSIZEX,16,true> >;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNiIntersector1<N> and CurveNiIntersectorK<N,K> intersect_h/occluded_h, instantiated with the SweepCurve1 intersectors over Curve and the Epilog1 epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors HermiteCurveNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_h<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_h<SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiIntersectorK<N,4>::template intersect_h<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_h<SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNiIntersectorK<N,8>::template intersect_h<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_h<SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNiIntersectorK<N,16>::template intersect_h<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_h<SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNiMBIntersector1<N> and CurveNiMBIntersectorK<N,K> intersect_h/occluded_h, instantiated with the SweepCurve1 intersectors over Curve and the Epilog1 epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors HermiteCurveNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_h<SweepCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_h<SweepCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiMBIntersectorK<N,4>::template intersect_h<SweepCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_h<SweepCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNiMBIntersectorK<N,8>::template intersect_h<SweepCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_h<SweepCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNiMBIntersectorK<N,16>::template intersect_h<SweepCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_h<SweepCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNiIntersector1<N> and CurveNiIntersectorK<N,K> intersect_hn/occluded_hn, instantiated with the OrientedCurve1 intersectors over Curve and the Epilog1 epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiIntersector1<N>::template intersect_hn<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiIntersector1<N>::template occluded_hn<OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiIntersectorK<N,4>::template intersect_hn<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiIntersectorK<N,4>::template occluded_hn<OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNiIntersectorK<N,8>::template intersect_hn<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiIntersectorK<N,8>::template occluded_hn<OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNiIntersectorK<N,16>::template intersect_hn<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiIntersectorK<N,16>::template occluded_hn<OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return result;
+    }
+    
+    // Returns an Intersectors table wired to CurveNiMBIntersector1<N> and CurveNiMBIntersectorK<N,K> intersect_hn/occluded_hn, instantiated with the OrientedCurve1 intersectors over Curve and the Epilog1 epilogs; the *8 slots are set only under __AVX__, the *16 slots only under __AVX512F__.
+    template<template<typename Ty> class Curve, int N> static VirtualCurveIntersector::Intersectors HermiteOrientedCurveNiMBIntersectors()
+    {
+      VirtualCurveIntersector::Intersectors result;
+      result.intersect1 = (VirtualCurveIntersector::Intersect1Ty) &CurveNiMBIntersector1<N>::template intersect_hn<OrientedCurve1Intersector1<Curve>, Intersect1Epilog1<true> >;
+      result.occluded1 = (VirtualCurveIntersector::Occluded1Ty) &CurveNiMBIntersector1<N>::template occluded_hn<OrientedCurve1Intersector1<Curve>, Occluded1Epilog1<true> >;
+      result.intersect4 = (VirtualCurveIntersector::Intersect4Ty) &CurveNiMBIntersectorK<N,4>::template intersect_hn<OrientedCurve1IntersectorK<Curve,4>, Intersect1KEpilog1<4,true> >;
+      result.occluded4 = (VirtualCurveIntersector::Occluded4Ty) &CurveNiMBIntersectorK<N,4>::template occluded_hn<OrientedCurve1IntersectorK<Curve,4>, Occluded1KEpilog1<4,true> >;
+#if defined(__AVX__)
+      result.intersect8 = (VirtualCurveIntersector::Intersect8Ty) &CurveNiMBIntersectorK<N,8>::template intersect_hn<OrientedCurve1IntersectorK<Curve,8>, Intersect1KEpilog1<8,true> >;
+      result.occluded8 = (VirtualCurveIntersector::Occluded8Ty) &CurveNiMBIntersectorK<N,8>::template occluded_hn<OrientedCurve1IntersectorK<Curve,8>, Occluded1KEpilog1<8,true> >;
+#endif
+#if defined(__AVX512F__)
+      result.intersect16 = (VirtualCurveIntersector::Intersect16Ty) &CurveNiMBIntersectorK<N,16>::template intersect_hn<OrientedCurve1IntersectorK<Curve,16>, Intersect1KEpilog1<16,true> >;
+      result.occluded16 = (VirtualCurveIntersector::Occluded16Ty) &CurveNiMBIntersectorK<N,16>::template occluded_hn<OrientedCurve1IntersectorK<Curve,16>, Occluded1KEpilog1<16,true> >;
+#endif
+      return result;
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h
new file mode 100644
index 0000000000..69cf612275
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bezier_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    void AddVirtualCurveBezierCurveInterector4i(VirtualCurveIntersector &prim);   // declaration only; takes the VirtualCurveIntersector by non-const reference
+    void AddVirtualCurveBezierCurveInterector4v(VirtualCurveIntersector &prim);   // NOTE(review): "Interector" (sic) is part of the symbol name; presumably it has to match the definition, which is not in this header
+    void AddVirtualCurveBezierCurveInterector4iMB(VirtualCurveIntersector &prim); // NOTE(review): "MB" presumably stands for motion blur - confirm
+#if defined(__AVX__) // the 8-suffixed declarations below are only visible when __AVX__ is defined
+    void AddVirtualCurveBezierCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBezierCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBezierCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif // defined(__AVX__)
+  } // namespace isa
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h
new file mode 100644
index 0000000000..d37e41098e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_bspline_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    void AddVirtualCurveBSplineCurveInterector4i(VirtualCurveIntersector &prim);   // declaration only; takes the VirtualCurveIntersector by non-const reference
+    void AddVirtualCurveBSplineCurveInterector4v(VirtualCurveIntersector &prim);   // NOTE(review): "Interector" (sic) is part of the symbol name; presumably it has to match the definition, which is not in this header
+    void AddVirtualCurveBSplineCurveInterector4iMB(VirtualCurveIntersector &prim); // NOTE(review): "MB" presumably stands for motion blur - confirm
+#if defined(__AVX__) // the 8-suffixed declarations below are only visible when __AVX__ is defined
+    void AddVirtualCurveBSplineCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBSplineCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveBSplineCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif // defined(__AVX__)
+  } // namespace isa
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h
new file mode 100644
index 0000000000..a133a11d63
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_catmullrom_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    void AddVirtualCurveCatmullRomCurveInterector4i(VirtualCurveIntersector &prim);   // declaration only; takes the VirtualCurveIntersector by non-const reference
+    void AddVirtualCurveCatmullRomCurveInterector4v(VirtualCurveIntersector &prim);   // NOTE(review): "Interector" (sic) is part of the symbol name; presumably it has to match the definition, which is not in this header
+    void AddVirtualCurveCatmullRomCurveInterector4iMB(VirtualCurveIntersector &prim); // NOTE(review): "MB" presumably stands for motion blur - confirm
+#if defined(__AVX__) // the 8-suffixed declarations below are only visible when __AVX__ is defined
+    void AddVirtualCurveCatmullRomCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveCatmullRomCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveCatmullRomCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif // defined(__AVX__)
+  } // namespace isa
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h
new file mode 100644
index 0000000000..9aec35da45
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_hermite_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    void AddVirtualCurveHermiteCurveInterector4i(VirtualCurveIntersector &prim);   // declaration only; takes the VirtualCurveIntersector by non-const reference
+    void AddVirtualCurveHermiteCurveInterector4v(VirtualCurveIntersector &prim);   // NOTE(review): "Interector" (sic) is part of the symbol name; presumably it has to match the definition, which is not in this header
+    void AddVirtualCurveHermiteCurveInterector4iMB(VirtualCurveIntersector &prim); // NOTE(review): "MB" presumably stands for motion blur - confirm
+#if defined(__AVX__) // the 8-suffixed declarations below are only visible when __AVX__ is defined
+    void AddVirtualCurveHermiteCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveHermiteCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveHermiteCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif // defined(__AVX__)
+  } // namespace isa
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h
new file mode 100644
index 0000000000..dd37d194f5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_linear_curve.h
@@ -0,0 +1,21 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    void AddVirtualCurveLinearCurveInterector4i(VirtualCurveIntersector &prim);   // declaration only; takes the VirtualCurveIntersector by non-const reference
+    void AddVirtualCurveLinearCurveInterector4v(VirtualCurveIntersector &prim);   // NOTE(review): "Interector" (sic) is part of the symbol name; presumably it has to match the definition, which is not in this header
+    void AddVirtualCurveLinearCurveInterector4iMB(VirtualCurveIntersector &prim); // NOTE(review): "MB" presumably stands for motion blur - confirm
+#if defined(__AVX__) // the 8-suffixed declarations below are only visible when __AVX__ is defined
+    void AddVirtualCurveLinearCurveInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurveLinearCurveInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurveLinearCurveInterector8iMB(VirtualCurveIntersector &prim);
+#endif // defined(__AVX__)
+  } // namespace isa
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h
new file mode 100644
index 0000000000..fe5ceed840
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/curve_intersector_virtual_point.h
@@ -0,0 +1,22 @@
+// Copyright 2020 Light Transport Entertainment Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "curve_intersector_virtual.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    void AddVirtualCurvePointInterector4i(VirtualCurveIntersector &prim);   // declaration only; takes the VirtualCurveIntersector by non-const reference
+    void AddVirtualCurvePointInterector4v(VirtualCurveIntersector &prim);   // NOTE(review): "Interector" (sic) is part of the symbol name; presumably it has to match the definition, which is not in this header
+    void AddVirtualCurvePointInterector4iMB(VirtualCurveIntersector &prim); // NOTE(review): "MB" presumably stands for motion blur - confirm
+
+#if defined (__AVX__) // the 8-suffixed declarations below are only visible when __AVX__ is defined
+    void AddVirtualCurvePointInterector8i(VirtualCurveIntersector &prim);
+    void AddVirtualCurvePointInterector8v(VirtualCurveIntersector &prim);
+    void AddVirtualCurvePointInterector8iMB(VirtualCurveIntersector &prim);
+#endif // defined (__AVX__)
+  } // namespace isa
+} // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/cylinder.h b/thirdparty/embree-aarch64/kernels/geometry/cylinder.h
new file mode 100644
index 0000000000..39a582864c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/cylinder.h
@@ -0,0 +1,223 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Unbounded cylinder around the axis through p0 and p1; the radius is kept squared. */
+    struct Cylinder
+    {
+      const Vec3fa p0;  //!< start location
+      const Vec3fa p1;  //!< end position
+      const float rr;   //!< squared radius of cylinder
+
+      /*! Constructs the cylinder from its radius r (stored as sqr(r)). */
+      __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float r) : p0(p0), p1(p1), rr(sqr(r)) {}
+
+      /*! Constructs the cylinder from an already squared radius rr; the unnamed bool only selects this overload. */
+      __forceinline Cylinder(const Vec3fa& p0, const Vec3fa& p1, const float rr, bool) : p0(p0), p1(p1), rr(rr) {}
+
+      /*! Intersects the line org + t*dir with the cylinder (the end points p0/p1 do not clip it).
+       *  Regular hit: t_o = [t0,t1]; u*_o is the axis parameter of the foot point p0 + u*(p1-p0) and
+       *  Ng*_o the unnormalized vector from that foot point to the near/far hit point; returns true.
+       *  Line parallel to the axis: [-inf,+inf] and true if inside (u*_o, Ng*_o untouched). Miss: [+inf,-inf], false. */
+      __forceinline bool intersect(const Vec3fa& org, const Vec3fa& dir, BBox1f& t_o,
+                                   float& u0_o, Vec3fa& Ng0_o,
+                                   float& u1_o, Vec3fa& Ng1_o) const
+      {
+        /* calculate quadratic equation to solve */
+        const float rl = rcp_length(p1-p0);
+        const Vec3fa P0 = p0, dP = (p1-p0)*rl;
+        const Vec3fa O = org-P0, dO = dir;      // ray origin relative to p0
+
+        const float dOdO = dot(dO,dO);
+        const float OdO = dot(dO,O);
+        const float OO = dot(O,O);
+        const float dOz = dot(dP,dO);  // ray direction projected onto the scaled axis dP
+        const float Oz = dot(dP,O);    // relative ray origin projected onto the scaled axis dP
+
+        const float A = dOdO - sqr(dOz);
+        const float B = 2.0f * (OdO - dOz*Oz);
+        const float C = OO - sqr(Oz) - rr;
+
+        /* we miss the cylinder if the discriminant is smaller than zero */
+        const float D = B*B - 4.0f*A*C;
+        if (D < 0.0f) {
+          t_o = BBox1f(pos_inf,neg_inf);
+          return false;
+        }
+
+        /* special case for rays that are parallel to the cylinder */
+        const float eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+        if (abs(A) < eps)
+        {
+          const bool inside = C <= 0.0f;  // C <= 0: the origin (and so the whole parallel line) is within the radius
+          t_o = inside ? BBox1f(neg_inf,pos_inf) : BBox1f(pos_inf,neg_inf);
+          return inside;
+        }
+
+        /* standard case for rays that are not parallel to the cylinder */
+        const float Q = sqrt(D);
+        const float rcp_2A = rcp(2.0f*A);
+        const float t0 = (-B-Q)*rcp_2A;
+        const float t1 = (-B+Q)*rcp_2A;
+
+        /* calculates u and Ng for near hit; the hit point is org + t0*dir, so org must be part of Pr */
+        {
+          u0_o = madd(t0,dOz,Oz)*rl;
+          const Vec3fa Pr = madd(t0,dir,org);
+          const Vec3fa Pl = madd(u0_o,p1-p0,p0);
+          Ng0_o = Pr-Pl;
+        }
+
+        /* calculates u and Ng for far hit (same construction with t1) */
+        {
+          u1_o = madd(t1,dOz,Oz)*rl;
+          const Vec3fa Pr = madd(t1,dir,org);
+          const Vec3fa Pl = madd(u1_o,p1-p0,p0);
+          Ng1_o = Pr-Pl;
+        }
+
+        t_o.lower = t0;
+        t_o.upper = t1;
+        return true;
+      }
+
+      /*! Convenience overload that only reports the parametric range t_o. */
+      __forceinline bool intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox1f& t_o) const
+      {
+        float u0_o, u1_o; Vec3fa Ng0_o, Ng1_o;  // outputs of the full overload, discarded here
+        return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+      }
+
+      /*! Runs one self-test case; prints a message and returns false on a mismatch. */
+      static bool verify(const size_t id, const Cylinder& cylinder, const RayHit& ray, bool shouldhit, const float t0, const float t1)
+      {
+        float eps = 0.001f;
+        BBox1f t; bool hit;
+        hit = cylinder.intersect(ray.org,ray.dir,t);
+
+        bool failed = hit != shouldhit;
+        if (shouldhit) failed |= std::isinf(t0) ? t0 != t.lower : abs(t0-t.lower) > eps;
+        if (shouldhit) failed |= std::isinf(t1) ? t1 != t.upper : abs(t1-t.upper) > eps;
+        if (!failed) return true;
+        embree_cout << "Cylinder test " << id << " failed: cylinder = " << cylinder << ", ray = " << ray << ", hit = " << hit << ", t = " << t << embree_endl;
+        return false;
+      }
+
+      /* verify cylinder class */
+      static bool verify()
+      {
+        bool passed = true;
+        const Cylinder cylinder(Vec3fa(0.0f,0.0f,0.0f),Vec3fa(1.0f,0.0f,0.0f),1.0f);
+        passed &= verify(0,cylinder,RayHit(Vec3fa(-2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+        passed &= verify(1,cylinder,RayHit(Vec3fa(+2.0f,1.0f,0.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),true,0.0f,2.0f);
+        passed &= verify(2,cylinder,RayHit(Vec3fa(+2.0f,1.0f,2.0f),Vec3fa( 0.0f,-1.0f,+0.0f),0.0f,float(inf)),false,0.0f,0.0f);
+        passed &= verify(3,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+        passed &= verify(4,cylinder,RayHit(Vec3fa(+0.0f,0.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),true,neg_inf,pos_inf);
+        passed &= verify(5,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa( 1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+        passed &= verify(6,cylinder,RayHit(Vec3fa(+0.0f,2.0f,0.0f),Vec3fa(-1.0f, 0.0f,+0.0f),0.0f,float(inf)),false,pos_inf,neg_inf);
+        return passed;
+      }
+
+      /*! output operator; prints the radius as sqrtf(rr) */
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const Cylinder& c) {
+        return cout << "Cylinder { p0 = " << c.p0 << ", p1 = " << c.p1 << ", r = " << sqrtf(c.rr) << "}";
+      }
+    };
+
+    /*! N cylinders in SIMD layout, one unbounded cylinder per lane; radii are kept squared. */
+    template<int N>
+      struct CylinderN
+    {
+      const Vec3vf<N> p0;     //!< start location
+      const Vec3vf<N> p1;     //!< end position
+      const vfloat<N> rr;   //!< squared radius of cylinder
+
+      /*! Constructs the cylinders from their radii r (stored as sqr(r)). */
+      __forceinline CylinderN(const Vec3vf<N>& p0, const Vec3vf<N>& p1, const vfloat<N>& r) : p0(p0), p1(p1), rr(sqr(r)) {}
+
+      /*! Constructs the cylinders from already squared radii rr; the unnamed bool only selects this overload. */
+      __forceinline CylinderN(const Vec3vf<N>& p0, const Vec3vf<N>& p1, const vfloat<N>& rr, bool) : p0(p0), p1(p1), rr(rr) {}
+
+      /*! Intersects the single line org + t*dir with all N cylinders and returns the mask of hit lanes.
+       *  Missed lanes get t_o = [+inf,-inf]; lanes whose axis is parallel to the line get [-inf,+inf] when inside.
+       *  u*_o is the axis parameter of the foot point p0 + u*(p1-p0), Ng*_o the unnormalized vector from it to the hit point. */
+      __forceinline vbool<N> intersect(const Vec3fa& org, const Vec3fa& dir, BBox<vfloat<N>>& t_o,
+                                       vfloat<N>& u0_o, Vec3vf<N>& Ng0_o,
+                                       vfloat<N>& u1_o, Vec3vf<N>& Ng1_o) const
+      {
+        /* calculate quadratic equation to solve */
+        const vfloat<N> rl = rcp_length(p1-p0);
+        const Vec3vf<N> P0 = p0, dP = (p1-p0)*rl;
+        const Vec3vf<N> O = Vec3vf<N>(org)-P0, dO = dir;
+        const vfloat<N> dOdO = dot(dO,dO);
+        const vfloat<N> OdO = dot(dO,O);
+        const vfloat<N> OO = dot(O,O);
+        const vfloat<N> dOz = dot(dP,dO);
+        const vfloat<N> Oz = dot(dP,O);
+
+        const vfloat<N> A = dOdO - sqr(dOz);
+        const vfloat<N> B = 2.0f * (OdO - dOz*Oz);
+        const vfloat<N> C = OO - sqr(Oz) - rr;
+        /* we miss the cylinder if the discriminant is smaller than zero */
+        const vfloat<N> D = B*B - 4.0f*A*C;
+        vbool<N> valid = D >= 0.0f;
+        if (none(valid)) {
+          t_o = BBox<vfloat<N>>(empty);
+          return valid;
+        }
+
+        /* standard case for rays that are not parallel to the cylinder */
+        const vfloat<N> Q = sqrt(D);
+        const vfloat<N> rcp_2A = rcp(2.0f*A);
+        const vfloat<N> t0 = (-B-Q)*rcp_2A;
+        const vfloat<N> t1 = (-B+Q)*rcp_2A;
+
+        /* calculates u and Ng for near hit; the hit point is org + t0*dir, so org must be part of Pr */
+        {
+          u0_o = madd(t0,dOz,Oz)*rl;
+          const Vec3vf<N> Pr = madd(t0,Vec3vf<N>(dir),Vec3vf<N>(org));
+          const Vec3vf<N> Pl = madd(u0_o,p1-p0,p0);
+          Ng0_o = Pr-Pl;
+        }
+
+        /* calculates u and Ng for far hit (same construction with t1) */
+        {
+          u1_o = madd(t1,dOz,Oz)*rl;
+          const Vec3vf<N> Pr = madd(t1,Vec3vf<N>(dir),Vec3vf<N>(org));
+          const Vec3vf<N> Pl = madd(u1_o,p1-p0,p0);
+          Ng1_o = Pr-Pl;
+        }
+
+        t_o.lower = select(valid, t0, vfloat<N>(pos_inf));
+        t_o.upper = select(valid, t1, vfloat<N>(neg_inf));
+
+        /* special case for rays that are parallel to the cylinder */
+        const vfloat<N> eps = 16.0f*float(ulp)*max(abs(dOdO),abs(sqr(dOz)));
+        vbool<N> validt = valid & (abs(A) < eps);
+        if (unlikely(any(validt)))
+        {
+          vbool<N> inside = C <= 0.0f;
+          t_o.lower = select(validt,select(inside,vfloat<N>(neg_inf),vfloat<N>(pos_inf)),t_o.lower);
+          t_o.upper = select(validt,select(inside,vfloat<N>(pos_inf),vfloat<N>(neg_inf)),t_o.upper);
+          valid &= !validt | inside;  // parallel lanes only stay valid when inside
+        }
+        return valid;
+      }
+
+      /*! Convenience overload that only reports the parametric ranges t_o. */
+      __forceinline vbool<N> intersect(const Vec3fa& org_i, const Vec3fa& dir, BBox<vfloat<N>>& t_o) const
+      {
+        vfloat<N> u0_o, u1_o; Vec3vf<N> Ng0_o, Ng1_o;  // outputs of the full overload, discarded here
+        return intersect(org_i,dir,t_o,u0_o,Ng0_o,u1_o,Ng1_o);
+      }
+    };
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h
new file mode 100644
index 0000000000..e8305780e5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/disc_intersector.h
@@ -0,0 +1,216 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_points.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Hit record for M lanes: u/v coordinates, t value and normal, each stored as a SIMD vector. */
+    template<int M>
+    struct DiscIntersectorHitM
+    {
+      /*! Default constructor; assigns nothing explicitly. */
+      __forceinline DiscIntersectorHitM() {}
+
+      /*! Copies the given per-lane values into the record. */
+      __forceinline DiscIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+          : vu(u), vv(v), vt(t), vNg(Ng)
+      {
+      }
+
+      /*! No-op; the stored values need no post-processing. */
+      __forceinline void finalize() {}
+
+      /*! (u,v) pair of lane i. */
+      __forceinline Vec2f uv(const size_t i) const { return Vec2f(vu[i], vv[i]); }
+
+      /*! t value of lane i. */
+      __forceinline float t(const size_t i) const { return vt[i]; }
+
+      /*! Normal of lane i, assembled from the x/y/z component vectors. */
+      __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]); }
+
+     public:
+      vfloat<M> vu;   //!< u coordinates
+      vfloat<M> vv;   //!< v coordinates
+      vfloat<M> vt;   //!< t values
+      Vec3vf<M> vNg;  //!< normals
+    };
+
+    /*! Tests one ray against M discs, each passed as a Vec4 lane (xyz = center, w = radius). */
+    template<int M>
+    struct DiscIntersector1
+    {
+      typedef CurvePrecalculations1 Precalculations;
+
+      /*! Disc without its own normal: a lane hits when the projection of its center onto the ray lies in
+       *  [tnear,tfar] and the center is no farther than the radius from the ray. The hit reports that
+       *  projection as t, -ray.dir as normal and u = v = 0. Returns false if no lane is left, else the epilog result. */
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid_i, Ray& ray, IntersectContext* context,
+                                          const Points* geom, const Precalculations& pre,
+                                          const Vec4vf<M>& v0i, const Epilog& epilog)
+      {
+        vbool<M> active = valid_i;
+
+        const Vec3vf<M> org(ray.org.x, ray.org.y, ray.org.z);
+        const Vec3vf<M> dir(ray.dir.x, ray.dir.y, ray.dir.z);
+        const vfloat<M> rcpDirLen2 = rcp(dot(dir, dir));
+        /* center and radius as returned by enlargeRadiusToMinWidth */
+        const Vec4vf<M> disc = enlargeRadiusToMinWidth(context,geom,org,v0i);
+        const Vec3vf<M> center = disc.xyz();
+        const vfloat<M> radius = disc.w;
+        /* ray parameter of the center projected onto the ray */
+        const Vec3vf<M> toCenter = center - org;
+        const vfloat<M> tCenter = dot(toCenter, dir) * rcpDirLen2;
+
+        active &= (vfloat<M>(ray.tnear()) <= tCenter) & (tCenter <= vfloat<M>(ray.tfar));
+        if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+          active &= tCenter > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale;  // ignore self intersections
+        if (unlikely(none(active)))
+          return false;
+        /* squared distance between center and ray, compared with the squared radius */
+        const Vec3vf<M> offset = toCenter - tCenter * dir;
+        const vfloat<M> dist2 = dot(offset, offset);
+        const vfloat<M> radius2 = radius * radius;
+        active &= (dist2 <= radius2);
+        if (unlikely(none(active)))
+          return false;
+
+        DiscIntersectorHitM<M> hit(zero, zero, tCenter, -dir);
+        return epilog(active, hit);
+      }
+
+      /*! Disc with an explicit normal: a lane hits when the ray is not parallel to the disc plane, the
+       *  plane hit t lies in [tnear,tfar] and the hit point is closer to the center than the radius.
+       *  The hit reports t, the given normal and u = v = 0. */
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid_i, Ray& ray, IntersectContext* context,
+                                          const Points* geom, const Precalculations& pre,
+                                          const Vec4vf<M>& v0i, const Vec3vf<M>& normal, const Epilog& epilog)
+      {
+        vbool<M> active = valid_i;
+        const Vec3vf<M> org(ray.org.x, ray.org.y, ray.org.z);
+        const Vec3vf<M> dir((Vec3fa)ray.dir);
+
+        const Vec4vf<M> disc = enlargeRadiusToMinWidth(context,geom,org,v0i);
+        const Vec3vf<M> center = disc.xyz();
+        const vfloat<M> radius = disc.w;
+
+        /* ray/plane intersection; lanes with a zero denominator are masked out */
+        vfloat<M> denom = dot(dir, normal);
+        const vbool<M> parallel = denom == vfloat<M>(0.f);
+        active &= !parallel;
+        denom = select(parallel, 1.f, denom);  // prevent divide by zero
+
+        vfloat<M> t = dot(center - org, normal) / denom;
+
+        active &= (vfloat<M>(ray.tnear()) <= t) & (t <= vfloat<M>(ray.tfar));
+        if (unlikely(none(active)))
+          return false;
+
+        /* the plane hit has to lie strictly within the radius around the center */
+        Vec3vf<M> hitPoint = org + dir * t;
+        vfloat<M> dist2 = dot(hitPoint - center, hitPoint - center);
+        active &= dist2 < radius * radius;
+        if (unlikely(none(active)))
+          return false;
+
+        DiscIntersectorHitM<M> hit(zero, zero, t, normal);
+        return epilog(active, hit);
+      }
+    };
+
+    /*! Tests ray k of a K-wide packet against M discs, each passed as a Vec4 lane (xyz = center, w = radius). */
+    template<int M, int K>
+    struct DiscIntersectorK
+    {
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      /*! Disc without its own normal: a lane hits when the projection of its center onto ray k lies in
+       *  [tnear,tfar] and the center is no farther than the radius from the ray. The hit reports that
+       *  projection as t, the negated ray direction as normal and u = v = 0. */
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid_i, RayK<K>& ray, size_t k,
+                                          IntersectContext* context, const Points* geom,
+                                          const Precalculations& pre, const Vec4vf<M>& v0i,
+                                          const Epilog& epilog)
+      {
+        vbool<M> active = valid_i;
+
+        const Vec3vf<M> org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+        const Vec3vf<M> dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+        const vfloat<M> rcpDirLen2 = rcp(dot(dir, dir));
+        /* center and radius as returned by enlargeRadiusToMinWidth */
+        const Vec4vf<M> disc = enlargeRadiusToMinWidth(context,geom,org,v0i);
+        const Vec3vf<M> center = disc.xyz();
+        const vfloat<M> radius = disc.w;
+        /* ray parameter of the center projected onto the ray */
+        const Vec3vf<M> toCenter = center - org;
+        const vfloat<M> tCenter = dot(toCenter, dir) * rcpDirLen2;
+
+        active &= (vfloat<M>(ray.tnear()[k]) <= tCenter) & (tCenter <= vfloat<M>(ray.tfar[k]));
+        if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f)
+          active &= tCenter > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR) * radius * pre.depth_scale[k];  // ignore self intersections
+        if (unlikely(none(active)))
+          return false;
+        /* squared distance between center and ray, compared with the squared radius */
+        const Vec3vf<M> offset = toCenter - tCenter * dir;
+        const vfloat<M> dist2 = dot(offset, offset);
+        const vfloat<M> radius2 = radius * radius;
+        active &= (dist2 <= radius2);
+        if (unlikely(none(active)))
+          return false;
+
+        DiscIntersectorHitM<M> hit(zero, zero, tCenter, -dir);
+        return epilog(active, hit);
+      }
+
+      /*! Disc with an explicit normal: a lane hits when ray k is not parallel to the disc plane, the
+       *  plane hit t lies in [tnear,tfar] and the hit point is closer to the center than the radius.
+       *  The hit reports t, the given normal and u = v = 0. */
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid_i, RayK<K>& ray, size_t k,
+                                          IntersectContext* context, const Points* geom,
+                                          const Precalculations& pre, const Vec4vf<M>& v0i,
+                                          const Vec3vf<M>& normal, const Epilog& epilog)
+      {
+        vbool<M> active = valid_i;
+        const Vec3vf<M> org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+        const Vec3vf<M> dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+
+        const Vec4vf<M> disc = enlargeRadiusToMinWidth(context,geom,org,v0i);
+        const Vec3vf<M> center = disc.xyz();
+        const vfloat<M> radius = disc.w;
+
+        /* ray/plane intersection; lanes with a zero denominator are masked out */
+        vfloat<M> denom = dot(dir, normal);
+        const vbool<M> parallel = denom == vfloat<M>(0.f);
+        active &= !parallel;
+        denom = select(parallel, 1.f, denom);  // prevent divide by zero
+
+        vfloat<M> t = dot(center - org, normal) / denom;
+
+        active &= (vfloat<M>(ray.tnear()[k]) <= t) & (t <= vfloat<M>(ray.tfar[k]));
+        if (unlikely(none(active)))
+          return false;
+
+        /* the plane hit has to lie strictly within the radius around the center */
+        Vec3vf<M> hitPoint = org + dir * t;
+        vfloat<M> dist2 = dot(hitPoint - center, hitPoint - center);
+        active &= dist2 < radius * radius;
+        if (unlikely(none(active)))
+          return false;
+
+        DiscIntersectorHitM<M> hit(zero, zero, t, normal);
+        return epilog(active, hit);
+      }
+    };
+  }  // namespace isa
+}  // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h
new file mode 100644
index 0000000000..e1dc3aa98e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/disci_intersector.h
@@ -0,0 +1,277 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "disc_intersector.h"
+#include "intersector_epilog.h"
+#include "pointi.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Single-ray adapter from PointMi<M> primitives to the normal-free DiscIntersector1 test; points are gathered without a time value. */
+    template<int M, int Mx, bool filter>
+    struct DiscMiIntersector1
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      /*! Gathers the points of Disc and runs DiscIntersector1 with an Intersect1EpilogM epilog built on ray. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray,
+                                          IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> discs; Disc.gather(discs, points);
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        // NOTE(review): keep the epilog a temporary of the call expression - it may hold references to its ID arguments; confirm before hoisting
+        DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, discs,
+          Intersect1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+
+      /*! Same test with an Occluded1EpilogM epilog; returns the result of DiscIntersector1::intersect. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray,
+                                         IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> discs; Disc.gather(discs, points);
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        return DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, discs,
+          Occluded1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    /*! Single-ray adapter from PointMi<M> primitives to the normal-free DiscIntersector1 test; points are gathered at ray.time(). */
+    template<int M, int Mx, bool filter>
+    struct DiscMiMBIntersector1
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      /*! Gathers the points of Disc at ray.time() and runs DiscIntersector1 with an Intersect1EpilogM epilog built on ray. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray,
+                                          IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> discs; Disc.gather(discs, points, ray.time());
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        // NOTE(review): keep the epilog a temporary of the call expression - it may hold references to its ID arguments; confirm before hoisting
+        DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, discs,
+          Intersect1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+
+      /*! Same test with an Occluded1EpilogM epilog; returns the result of DiscIntersector1::intersect. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray,
+                                         IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> discs; Disc.gather(discs, points, ray.time());
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        return DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, discs,
+          Occluded1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct DiscMiIntersectorK  // packet adapter: PointMi<M> -> normal-free DiscIntersectorK test for ray k; points gathered without a time value
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      /*! Gathers the points of Disc and runs DiscIntersectorK for ray k with an Intersect1KEpilogM epilog (kept as a call-expression temporary). */
+      static __forceinline void intersect(
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> discs; Disc.gather(discs, points);
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        DiscIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, discs,
+          Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+
+      /*! Same test with an Occluded1KEpilogM epilog; returns the result of DiscIntersectorK::intersect. */
+      static __forceinline bool occluded(
+          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> discs; Disc.gather(discs, points);
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        return DiscIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, discs,
+          Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct DiscMiMBIntersectorK  // packet adapter: PointMi<M> -> normal-free DiscIntersectorK test for ray k; points gathered at ray.time()[k]
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+      /*! Gathers the points of Disc at ray.time()[k] and runs DiscIntersectorK with an Intersect1KEpilogM epilog (kept as a call-expression temporary). */
+      static __forceinline void intersect(
+          const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> discs; Disc.gather(discs, points, ray.time()[k]);
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        DiscIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, discs,
+          Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+
+      /*! Same test with an Occluded1KEpilogM epilog; returns the result of DiscIntersectorK::intersect. */
+      static __forceinline bool occluded(
+          const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> discs; Disc.gather(discs, points, ray.time()[k]);
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        return DiscIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, discs,
+          Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    /*! Single-ray adapter from PointMi<M> primitives to the DiscIntersector1 test that takes per-point normals; data is gathered without a time value. */
+    template<int M, int Mx, bool filter>
+    struct OrientedDiscMiIntersector1
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      /*! Gathers points and normals of Disc and runs DiscIntersector1 with an Intersect1EpilogM epilog built on ray. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray,
+                                          IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> discs; Vec3vf<M> normals;
+        Disc.gather(discs, normals, points);
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        // NOTE(review): keep the epilog a temporary of the call expression - it may hold references to its ID arguments; confirm before hoisting
+        DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, discs, normals,
+          Intersect1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+
+      /*! Same test with an Occluded1EpilogM epilog; returns the result of DiscIntersector1::intersect. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray,
+                                         IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> discs; Vec3vf<M> normals;
+        Disc.gather(discs, normals, points);
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        return DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, discs, normals,
+          Occluded1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    /*! Single-ray adapter from PointMi<M> primitives to the DiscIntersector1 test that takes per-point normals; data is gathered at ray.time(). */
+    template<int M, int Mx, bool filter>
+    struct OrientedDiscMiMBIntersector1
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      /*! Gathers points and normals of Disc at ray.time() and runs DiscIntersector1 with an Intersect1EpilogM epilog built on ray. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray,
+                                          IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> discs; Vec3vf<M> normals;
+        Disc.gather(discs, normals, points, ray.time());
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        // NOTE(review): keep the epilog a temporary of the call expression - it may hold references to its ID arguments; confirm before hoisting
+        DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, discs, normals,
+          Intersect1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+
+      /*! Same test with an Occluded1EpilogM epilog; returns the result of DiscIntersector1::intersect. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray,
+                                         IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        Vec4vf<M> discs; Vec3vf<M> normals;
+        Disc.gather(discs, normals, points, ray.time());
+        const vbool<Mx> active = Disc.template valid<Mx>();
+        return DiscIntersector1<Mx>::intersect(active, ray, context, points, pre, discs, normals,
+          Occluded1EpilogM<M, Mx, filter>(ray, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct OrientedDiscMiIntersectorK
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      /*! Intersects ray k of the packet with the discs of \p Disc, passing an Intersect1KEpilogM as hit epilog. */
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        const vbool<Mx> active = Disc.template valid<Mx>(); // lane mask supplied by the primitive block
+        Vec4vf<M> v; Vec3vf<M> n;
+        Disc.gather(v, n, points);
+        DiscIntersectorK<Mx, K>::intersect(
+            active, ray, k, context, points, pre, v, n,
+            Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+
+      /*! Occlusion test for ray k: returns the result of DiscIntersectorK<Mx, K>::intersect run with an Occluded1KEpilogM. */
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        const vbool<Mx> active = Disc.template valid<Mx>(); // lane mask supplied by the primitive block
+        Vec4vf<M> v; Vec3vf<M> n;
+        Disc.gather(v, n, points);
+        return DiscIntersectorK<Mx, K>::intersect(
+            active, ray, k, context, points, pre, v, n,
+            Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct OrientedDiscMiMBIntersectorK
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      /*! Intersects ray k of the packet with the discs of \p Disc, gathering the disc data at ray.time()[k]. */
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        const vbool<Mx> active = Disc.template valid<Mx>(); // lane mask supplied by the primitive block
+        Vec4vf<M> v; Vec3vf<M> n;
+        Disc.gather(v, n, points, ray.time()[k]);
+        DiscIntersectorK<Mx, K>::intersect(
+            active, ray, k, context, points, pre, v, n,
+            Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+
+      /*! Occlusion test for ray k at ray.time()[k]: returns the result of DiscIntersectorK<Mx, K>::intersect run with an Occluded1KEpilogM. */
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& Disc)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(Disc.geomID());
+        const vbool<Mx> active = Disc.template valid<Mx>(); // lane mask supplied by the primitive block
+        Vec4vf<M> v; Vec3vf<M> n;
+        Disc.gather(v, n, points, ray.time()[k]);
+        return DiscIntersectorK<Mx, K>::intersect(
+            active, ray, k, context, points, pre, v, n,
+            Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, Disc.geomID(), Disc.primID()));
+      }
+    };
+  }  // namespace isa
+}  // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/filter.h b/thirdparty/embree-aarch64/kernels/geometry/filter.h
new file mode 100644
index 0000000000..4cdf7a395a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/filter.h
@@ -0,0 +1,204 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/geometry.h"
+#include "../common/ray.h"
+#include "../common/hit.h"
+#include "../common/context.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Runs the geometry filter, then the context filter, on a single hit; the hit is copied into the ray only if neither clears args->valid[0]. */
+    __forceinline bool runIntersectionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    {
+      /* per-geometry filter: a zeroed valid[0] afterwards rejects the hit */
+      if (geometry->intersectionFilterN) {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->intersectionFilterN(args);
+        if (args->valid[0] == 0) return false;
+      }
+
+      /* per-context filter: same rejection rule */
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(args);
+        if (args->valid[0] == 0) return false;
+      }
+
+      /* accepted by every installed filter: commit the hit to the ray */
+      RayHit& acceptedRay = *(RayHit*)args->ray;
+      copyHitToRay(acceptedRay,*(Hit*)args->hit);
+      return true;
+    }
+    
+    /*! Packs one ray/hit pair into N=1 filter arguments (valid lane preset to -1) and runs the intersection filters. */
+    __forceinline bool runIntersectionFilter1(const Geometry* const geometry, RayHit& ray, IntersectContext* context, Hit& hit) {
+      int accept = -1; // the helper treats a value of 0 here as "hit rejected"
+      RTCFilterFunctionNArguments args;
+      args.N = 1;
+      args.valid = &accept;
+      args.ray = (RTCRayN*)&ray;
+      args.hit = (RTCHitN*)&hit;
+      args.context = context->user;
+      args.geometryUserPtr = geometry->userPtr;
+      return runIntersectionFilter1Helper(&args,geometry,context);
+    }
+
+    /*! Calls the geometry and context intersection filters with \p filter_args; the body is compiled in only with EMBREE_FILTER_FUNCTION. */
+    __forceinline void reportIntersection1(IntersectFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
+    {
+#if defined(EMBREE_FILTER_FUNCTION)
+      const Geometry* const geom = args->geometry;
+      IntersectContext* MAYBE_UNUSED context = args->internal_context;
+
+      /* per-geometry filter first */
+      if (geom->intersectionFilterN) {
+        assert(context->scene->hasGeometryFilterFunction());
+        geom->intersectionFilterN(filter_args);
+      }
+      /* the validity mask is not inspected here: the context filter runs whatever the geometry filter decided */
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(filter_args);
+      }
+#endif
+    }
+    
+    /*! Runs the geometry occlusion filter, then the context filter, on a single hit; returns false as soon as one clears args->valid[0]. */
+    __forceinline bool runOcclusionFilter1Helper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    {
+      /* per-geometry filter: a zeroed valid[0] afterwards rejects the hit */
+      if (geometry->occlusionFilterN) {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->occlusionFilterN(args);
+        if (args->valid[0] == 0) return false;
+      }
+
+      /* per-context filter: same rejection rule */
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(args);
+        if (args->valid[0] == 0) return false;
+      }
+
+      /* no installed filter rejected the hit */
+      return true;
+    }
+
+    /*! Packs one ray/hit pair into N=1 filter arguments (valid lane preset to -1) and runs the occlusion filters. */
+    __forceinline bool runOcclusionFilter1(const Geometry* const geometry, Ray& ray, IntersectContext* context, Hit& hit) {
+      int accept = -1; // the helper treats a value of 0 here as "hit rejected"
+      RTCFilterFunctionNArguments args;
+      args.N = 1;
+      args.valid = &accept;
+      args.ray = (RTCRayN*)&ray;
+      args.hit = (RTCHitN*)&hit;
+      args.context = context->user;
+      args.geometryUserPtr = geometry->userPtr;
+      return runOcclusionFilter1Helper(&args,geometry,context);
+    }
+
+    /*! Calls the geometry and context occlusion filters with \p filter_args; the body is compiled in only with EMBREE_FILTER_FUNCTION. */
+    __forceinline void reportOcclusion1(OccludedFunctionNArguments* args, const RTCFilterFunctionNArguments* filter_args)
+    {
+#if defined(EMBREE_FILTER_FUNCTION)
+      const Geometry* const geom = args->geometry;
+      IntersectContext* MAYBE_UNUSED context = args->internal_context;
+
+      /* per-geometry filter first */
+      if (geom->occlusionFilterN) {
+        assert(context->scene->hasGeometryFilterFunction());
+        geom->occlusionFilterN(filter_args);
+      }
+      /* the validity mask is not inspected here: the context filter runs whatever the geometry filter decided */
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(filter_args);
+      }
+#endif
+    }
+
+    /*! K-wide variant: runs the geometry filter, then the context filter, returning early once no lane is valid; remaining lanes get their hit copied into the ray. */
+    template<int K>
+      __forceinline vbool<K> runIntersectionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    {
+      vint<K>* lanes = (vint<K>*) args->valid;
+
+      if (geometry->intersectionFilterN) {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->intersectionFilterN(args);
+      }
+      vbool<K> alive = *lanes != vint<K>(zero);
+      if (none(alive)) return alive;
+
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(args);
+      }
+      alive = *lanes != vint<K>(zero);
+      if (none(alive)) return alive;
+
+      /* commit the hit for the lanes both filters left valid */
+      copyHitToRay(alive,*(RayHitK<K>*)args->ray,*(HitK<K>*)args->hit);
+      return alive;
+    }
+    
+    /*! Packs a K-wide ray/hit packet into filter arguments (lane mask taken from \p valid) and runs the intersection filters. */
+    template<int K>
+    __forceinline vbool<K> runIntersectionFilter(const vbool<K>& valid, const Geometry* const geometry, RayHitK<K>& ray, IntersectContext* context, HitK<K>& hit) {
+      vint<K> lanes = valid.mask32();
+      RTCFilterFunctionNArguments args;
+      args.N = K;
+      args.valid = (int*)&lanes;
+      args.ray = (RTCRayN*)&ray;
+      args.hit = (RTCHitN*)&hit;
+      args.context = context->user;
+      args.geometryUserPtr = geometry->userPtr;
+      return runIntersectionFilterHelper<K>(&args,geometry,context);
+    }
+
+    /*! K-wide variant: runs the geometry occlusion filter, then the context filter (skipped once no lane is valid), and returns the lanes still valid. */
+    template<int K>
+      __forceinline vbool<K> runOcclusionFilterHelper(RTCFilterFunctionNArguments* args, const Geometry* const geometry, IntersectContext* context)
+    {
+      vint<K>* lanes = (vint<K>*) args->valid;
+
+      if (geometry->occlusionFilterN) {
+        assert(context->scene->hasGeometryFilterFunction());
+        geometry->occlusionFilterN(args);
+      }
+
+      vbool<K> alive = *lanes != vint<K>(zero);
+      if (none(alive)) return alive;
+
+      if (context->user->filter) {
+        assert(context->scene->hasContextFilterFunction());
+        context->user->filter(args);
+      }
+      alive = *lanes != vint<K>(zero);
+
+      /* tfar is replaced by neg_inf through select() for the lanes in alive */
+      RayK<K>* rayK = (RayK<K>*) args->ray;
+      rayK->tfar = select(alive, vfloat<K>(neg_inf), rayK->tfar);
+      return alive;
+    }
+
+    /*! Packs a K-wide ray/hit packet into filter arguments (lane mask taken from \p valid) and runs the occlusion filters. */
+    template<int K>
+      __forceinline vbool<K> runOcclusionFilter(const vbool<K>& valid, const Geometry* const geometry, RayK<K>& ray, IntersectContext* context, HitK<K>& hit) {
+      vint<K> lanes = valid.mask32();
+      RTCFilterFunctionNArguments args;
+      args.N = K;
+      args.valid = (int*)&lanes;
+      args.ray = (RTCRayN*)&ray;
+      args.hit = (RTCHitN*)&hit;
+      args.context = context->user;
+      args.geometryUserPtr = geometry->userPtr;
+      return runOcclusionFilterHelper<K>(&args,geometry,context);
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h
new file mode 100644
index 0000000000..46a0af0827
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/grid_intersector.h
@@ -0,0 +1,99 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "grid_soa.h"
+#include "grid_soa_intersector1.h"
+#include "grid_soa_intersector_packet.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Thin wrapper deriving from T; its constructor only forwards (ray, ptr) to T's constructor. */
+    template<typename T>
+      class SubdivPatch1Precalculations : public T
+    { 
+    public:
+      __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr) : T(ray,ptr) {}
+    };
+
+    /*! Packet counterpart: derives from T; its constructor only forwards (valid, ray) to T's constructor. */
+    template<int K, typename T>
+      class SubdivPatch1PrecalculationsK : public T
+    { 
+    public:
+      __forceinline SubdivPatch1PrecalculationsK (const vbool<K>& valid, RayK<K>& ray) : T(valid,ray) {}
+    };
+
+    class Grid1Intersector1
+    {
+    public:
+      typedef GridSOA Primitive;
+      typedef Grid1Precalculations<GridSOAIntersector1::Precalculations> Precalculations;
+      /* NOTE(review): Grid1Precalculations is not declared in this header (SubdivPatch1Precalculations above has the same shape) - confirm where it is defined. */
+      /*! Intersect a ray with the primitive; forwards to GridSOAIntersector1::intersect (ty is not used). */
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) 
+      {
+        GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node);
+      }
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        intersect(pre,ray,context,prim,ty,lazy_node);
+      }
+      
+      /*! Test if the ray is occluded by the primitive; returns the result of GridSOAIntersector1::occluded (ty is not used). */
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      {
+        return GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node); // the result was previously discarded, leaving this bool function without a return value
+      }
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        return occluded(pre,ray,context,prim,ty,lazy_node);
+      }
+      /*! Point queries are not implemented here: both overloads assert(false) and return false. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        assert(false && "not implemented");
+        return false;
+      }
+
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, size_t& lazy_node) {
+        assert(false && "not implemented");
+        return false;
+      }
+    };
+
+    template <int K>
+      struct GridIntersectorK
+    {
+      typedef GridSOA Primitive;
+      typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations;
+      
+      /*! Packet intersection; forwards to GridSOAIntersectorK<K>::intersect (ty is not used). */
+      static __forceinline void intersect(const vbool<K>& valid, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      {
+        GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
+      }
+      /*! Packet occlusion test; returns the lane mask produced by GridSOAIntersectorK<K>::occluded (previously discarded). */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      {
+        return GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
+      }
+      /*! Intersection for ray k of the packet; forwards to GridSOAIntersectorK<K>::intersect (ty is not used). */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      {
+        GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
+      }
+      /*! Occlusion test for ray k of the packet; returns the result of GridSOAIntersectorK<K>::occluded (previously discarded). */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, size_t& lazy_node)
+      {
+        return GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
+      }
+    };
+
+    typedef Grid1IntersectorK<4>  SubdivPatch1Intersector4;  // NOTE(review): no Grid1IntersectorK is declared in this header (the template above is GridIntersectorK) - confirm the intended name
+    typedef Grid1IntersectorK<8>  SubdivPatch1Intersector8;
+    typedef Grid1IntersectorK<16> SubdivPatch1Intersector16;
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h
new file mode 100644
index 0000000000..d3b275586c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/grid_soa.h
@@ -0,0 +1,275 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_subdiv_mesh.h"
+#include "../bvh/bvh.h"
+#include "../subdiv/tessellation.h"
+#include "../subdiv/tessellation_cache.h"
+#include "subdivpatch1.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    class GridSOA
+    {
+    public:
+      /* Instances are placement-constructed by create() into a single allocation: this header followed by the data[] payload. */
+      /*! GridSOA constructor (declared only; the definition is not part of this header) */
+      GridSOA(const SubdivPatch1Base* patches, const unsigned time_steps,
+              const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight,
+              const SubdivMesh* const geom, const size_t totalBvhBytes, const size_t gridBytes, BBox3fa* bounds_o = nullptr);
+
+      /*! Subgrid creation: sizes BVH, grid and root storage for the range [x0,x1]x[y0,y1] and placement-constructs the GridSOA in memory obtained from alloc */
+      template<typename Allocator>
+        static GridSOA* create(const SubdivPatch1Base* patches, const unsigned time_steps,
+                               unsigned x0, unsigned x1, unsigned y0, unsigned y1, 
+                               const Scene* scene, Allocator& alloc, BBox3fa* bounds_o = nullptr)
+      {
+        const unsigned width = x1-x0+1;  
+        const unsigned height = y1-y0+1; 
+        const GridRange range(0,width-1,0,height-1);
+        size_t bvhBytes = 0;
+        if (time_steps == 1) // single time step: sized with AABBNode nodes
+          bvhBytes = getBVHBytes(range,sizeof(BVH4::AABBNode),0);
+        else { // several time steps: (time_steps-1) BVHs sized with AABBNodeMB nodes plus a temporal BVH of AABBNodeMB4D nodes
+          bvhBytes = (time_steps-1)*getBVHBytes(range,sizeof(BVH4::AABBNodeMB),0);
+          bvhBytes += getTemporalBVHBytes(make_range(0,int(time_steps-1)),sizeof(BVH4::AABBNodeMB4D));
+        }
+        const size_t gridBytes = 4*size_t(width)*size_t(height)*sizeof(float);  // four floats per grid point, for one time step
+        size_t rootBytes = time_steps*sizeof(BVH4::NodeRef); // one NodeRef per time step
+#if !defined(__X86_64__) && !defined(__aarch64__)
+        rootBytes += 4; // We read 2 elements behind the grid. As we store at least 8 root bytes after the grid we are fine in 64 bit mode. But in 32 bit mode we have to do additional padding.
+#endif
+        void* data = alloc(offsetof(GridSOA,data)+bvhBytes+time_steps*gridBytes+rootBytes);
+        assert(data);
+        return new (data) GridSOA(patches,time_steps,x0,x1,y0,y1,patches->grid_u_res,patches->grid_v_res,scene->get<SubdivMesh>(patches->geomID()),bvhBytes,gridBytes,bounds_o);
+      }
+
+      /*! Grid creation: covers the whole patch, i.e. the range [0,grid_u_res-1]x[0,grid_v_res-1] */
+      template<typename Allocator>
+        static GridSOA* create(const SubdivPatch1Base* const patches, const unsigned time_steps,
+                               const Scene* scene, const Allocator& alloc, BBox3fa* bounds_o = nullptr) 
+      {
+        return create(patches,time_steps,0,patches->grid_u_res-1,0,patches->grid_v_res-1,scene,alloc,bounds_o);
+      }
+
+       /*! returns reference to the root stored at byte offset rootOffset + t*sizeof(BVH4::NodeRef) inside data */
+      __forceinline       BVH4::NodeRef& root(size_t t = 0)       { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; }
+      __forceinline const BVH4::NodeRef& root(size_t t = 0) const { return (BVH4::NodeRef&)data[rootOffset + t*sizeof(BVH4::NodeRef)]; }
+
+      /*! returns pointer to BVH array (the start of data) */
+      __forceinline       int8_t* bvhData()       { return &data[0]; }
+      __forceinline const int8_t* bvhData() const { return &data[0]; }
+
+      /*! returns pointer to Grid array of time step t (byte offset gridOffset + t*gridBytes inside data) */
+      __forceinline       float* gridData(size_t t = 0)       { return (float*) &data[gridOffset + t*gridBytes]; }
+      __forceinline const float* gridData(size_t t = 0) const { return (float*) &data[gridOffset + t*gridBytes]; }
+      
+      __forceinline void* encodeLeaf(size_t u, size_t v) {
+        return (void*) (16*(v * width + u + 1)); // +1 to not create empty leaf
+      }
+      __forceinline float* decodeLeaf(size_t t, const void* ptr) { // inverse of encodeLeaf: >>4 undoes the *16, -1 undoes the +1
+        return gridData(t) + (((size_t) (ptr) >> 4) - 1);
+      }
+
+      /*! returns the size of the BVH over the grid in bytes */
+      static size_t getBVHBytes(const GridRange& range, const size_t nodeBytes, const size_t leafBytes);
+
+      /*! returns the size of the temporal BVH over the time range BVHs */
+      static size_t getTemporalBVHBytes(const range<int> time_range, const size_t nodeBytes);
+
+      /*! calculates bounding box of grid range for the grid data of the given time step */
+      __forceinline BBox3fa calculateBounds(size_t time, const GridRange& range) const
+      {
+        const float* const grid_array = gridData(time);
+        const float* const grid_x_array = grid_array + 0 * dim_offset;
+        const float* const grid_y_array = grid_array + 1 * dim_offset;
+        const float* const grid_z_array = grid_array + 2 * dim_offset;
+        
+        /* compute the bounds just for the range! */
+        BBox3fa bounds( empty );
+        for (unsigned v = range.v_start; v<=range.v_end; v++) 
+        {
+          for (unsigned u = range.u_start; u<=range.u_end; u++)
+          {
+            const float x = grid_x_array[ v * width + u];
+            const float y = grid_y_array[ v * width + u];
+            const float z = grid_z_array[ v * width + u];
+            bounds.extend( Vec3fa(x,y,z) );
+          }
+        }
+        assert(is_finite(bounds));
+        return bounds;
+      }
+
+      /*! Evaluates grid over patch and builds BVH4 tree over the grid. */
+      std::pair<BVH4::NodeRef,BBox3fa> buildBVH(BBox3fa* bounds_o);
+      
+      /*! Create BVH4 tree over grid. */
+      std::pair<BVH4::NodeRef,BBox3fa> buildBVH(const GridRange& range, size_t& allocator);
+
+      /*! Evaluates grid over patch and builds MSMBlur BVH4 tree over the grid. */
+      std::pair<BVH4::NodeRef,LBBox3fa> buildMSMBlurBVH(const range<int> time_range, BBox3fa* bounds_o);
+      
+      /*! Create MBlur BVH4 tree over grid. */
+      std::pair<BVH4::NodeRef,LBBox3fa> buildMBlurBVH(size_t time, const GridRange& range, size_t& allocator);
+
+      /*! Create MSMBlur BVH4 tree over grid. */
+      std::pair<BVH4::NodeRef,LBBox3fa> buildMSMBlurBVH(const range<int> time_range, size_t& allocator, BBox3fa* bounds_o);
+
+      template<typename Loader>
+        struct MapUV // functor: replaces (u,v) by the values interpolated from three uv entries gathered from grid_uv
+      {
+        typedef typename Loader::vfloat vfloat;
+        const float* const grid_uv;
+        size_t line_offset;
+        size_t lines;
+
+        __forceinline MapUV(const float* const grid_uv, size_t line_offset, const size_t lines)
+          : grid_uv(grid_uv), line_offset(line_offset), lines(lines) {}
+
+        __forceinline void operator() (vfloat& u, vfloat& v) const {
+          const Vec3<vfloat> tri_v012_uv = Loader::gather(grid_uv,line_offset,lines);	
+          const Vec2<vfloat> uv0 = GridSOA::decodeUV(tri_v012_uv[0]);
+          const Vec2<vfloat> uv1 = GridSOA::decodeUV(tri_v012_uv[1]);
+          const Vec2<vfloat> uv2 = GridSOA::decodeUV(tri_v012_uv[2]);        
+          const Vec2<vfloat> uv = u * uv1 + v * uv2 + (1.0f-u-v) * uv0;        
+          u = uv[0];v = uv[1]; 
+        }
+      };
+
+      struct Gather2x3 // 4-wide loader reading two grid rows (see the lane layout comments below)
+      {
+        enum { M = 4 };
+        typedef vbool4 vbool;
+        typedef vint4 vint;
+        typedef vfloat4 vfloat;
+        
+        static __forceinline const Vec3vf4 gather(const float* const grid, const size_t line_offset, const size_t lines)
+        {
+          vfloat4 r0 = vfloat4::loadu(grid + 0*line_offset);
+          vfloat4 r1 = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid
+          if (unlikely(line_offset == 2))
+          {
+            r0 = shuffle<0,1,1,1>(r0);
+            r1 = shuffle<0,1,1,1>(r1);
+          }
+          return Vec3vf4(unpacklo(r0,r1),       // r00, r10, r01, r11
+                         shuffle<1,1,2,2>(r0),  // r01, r01, r02, r02
+                         shuffle<0,1,1,2>(r1)); // r10, r11, r11, r12
+        }
+
+        static __forceinline void gather(const float* const grid_x, 
+                                         const float* const grid_y, 
+                                         const float* const grid_z, 
+                                         const size_t line_offset,
+                                         const size_t lines,
+                                         Vec3vf4& v0_o,
+                                         Vec3vf4& v1_o,
+                                         Vec3vf4& v2_o)
+        {
+          const Vec3vf4 tri_v012_x = gather(grid_x,line_offset,lines);
+          const Vec3vf4 tri_v012_y = gather(grid_y,line_offset,lines);
+          const Vec3vf4 tri_v012_z = gather(grid_z,line_offset,lines);
+          v0_o = Vec3vf4(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]);
+          v1_o = Vec3vf4(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]);
+          v2_o = Vec3vf4(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]);
+        }
+      };
+      
+#if defined (__AVX__)
+      struct Gather3x3 // 8-wide loader reading up to three grid rows; the second row is reused when lines <= 2
+      {
+        enum { M = 8 };
+        typedef vbool8 vbool;
+        typedef vint8 vint;
+        typedef vfloat8 vfloat;
+        
+        static __forceinline const Vec3vf8 gather(const float* const grid, const size_t line_offset, const size_t lines)
+        {
+          vfloat4 ra = vfloat4::loadu(grid + 0*line_offset);
+          vfloat4 rb = vfloat4::loadu(grid + 1*line_offset); // this accesses 2 elements too much in case of 2x2 grid, but this is ok as we ensure enough padding after the grid
+          vfloat4 rc;
+          if (likely(lines > 2)) 
+            rc = vfloat4::loadu(grid + 2*line_offset);
+          else                   
+            rc = rb;
+
+          if (unlikely(line_offset == 2))
+          {
+            ra = shuffle<0,1,1,1>(ra);
+            rb = shuffle<0,1,1,1>(rb);
+            rc = shuffle<0,1,1,1>(rc);
+          }
+          
+          const vfloat8 r0 = vfloat8(ra,rb);
+          const vfloat8 r1 = vfloat8(rb,rc);
+          return Vec3vf8(unpacklo(r0,r1),         // r00, r10, r01, r11, r10, r20, r11, r21
+                         shuffle<1,1,2,2>(r0),    // r01, r01, r02, r02, r11, r11, r12, r12
+                         shuffle<0,1,1,2>(r1));   // r10, r11, r11, r12, r20, r21, r21, r22
+        }
+
+        static __forceinline void gather(const float* const grid_x, 
+                                         const float* const grid_y, 
+                                         const float* const grid_z, 
+                                         const size_t line_offset,
+                                         const size_t lines,
+                                         Vec3vf8& v0_o,
+                                         Vec3vf8& v1_o,
+                                         Vec3vf8& v2_o)
+        {
+          const Vec3vf8 tri_v012_x = gather(grid_x,line_offset,lines);
+          const Vec3vf8 tri_v012_y = gather(grid_y,line_offset,lines);
+          const Vec3vf8 tri_v012_z = gather(grid_z,line_offset,lines);
+          v0_o = Vec3vf8(tri_v012_x[0],tri_v012_y[0],tri_v012_z[0]);
+          v1_o = Vec3vf8(tri_v012_x[1],tri_v012_y[1],tri_v012_z[1]);
+          v2_o = Vec3vf8(tri_v012_x[2],tri_v012_y[2],tri_v012_z[2]);
+        }
+      };
+#endif
+      /*! Unpacks a uv pair from the bit pattern of uv: the low 16 bits give u, the bits shifted down by 16 give v, both scaled by 8/0x10000 */
+      template<typename vfloat>
+      static __forceinline Vec2<vfloat> decodeUV(const vfloat& uv)
+      {
+        typedef typename vfloat::Int vint;
+        const vint iu  = asInt(uv) & 0xffff;
+        const vint iv  = srl(asInt(uv),16);
+	const vfloat u = (vfloat)iu * vfloat(8.0f/0x10000);
+	const vfloat v = (vfloat)iv * vfloat(8.0f/0x10000);
+	return Vec2<vfloat>(u,v);
+      }
+      
+      __forceinline unsigned int geomID() const  {
+        return _geomID;
+      } 
+      
+      __forceinline unsigned int primID() const  {
+        return _primID;
+      } 
+
+    public:
+      BVH4::NodeRef troot;
+#if !defined(__X86_64__) && !defined(__aarch64__)
+      unsigned align1;
+#endif
+      unsigned time_steps;
+      unsigned width; // row stride used by encodeLeaf() and calculateBounds()
+
+      unsigned height;
+      unsigned dim_offset; // distance in floats between the x, y and z arrays of one grid (see calculateBounds())
+      unsigned _geomID;
+      unsigned _primID;
+
+      unsigned align2;
+      unsigned gridOffset; // byte offset of the grid arrays inside data (see gridData())
+      unsigned gridBytes; // byte size of the grid arrays of one time step (see gridData())
+      unsigned rootOffset; // byte offset of the root array inside data (see root())
+
+      int8_t data[1];      //!< after the struct we first store the BVH, then the grid, and finally the roots
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h
new file mode 100644
index 0000000000..2ed922a5ae
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector1.h
@@ -0,0 +1,207 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "grid_soa.h"
+#include "../common/ray.h"
+#include "triangle_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    class GridSOAIntersector1 // single-ray intersector for GridSOA leaves; always decodes time step 0
+    {
+    public:
+      typedef void Primitive;
+      /*! Per-ray state, also used by GridSOAMBIntersector1. grid starts as nullptr and is not assigned in this class; presumably the traversal code sets it before intersect/occluded run - TODO confirm. */
+      class Precalculations
+      {
+      public:
+        __forceinline Precalculations (const Ray& ray, const void* ptr)
+          : grid(nullptr), itime(0), ftime(0.0f) {} // ray and ptr are unused; itime/ftime start at defined values instead of staying uninitialized
+
+      public:
+        GridSOA* grid; //!< grid whose layout (width, height, dim_offset) and IDs the intersectors read
+        int itime;     //!< read by GridSOAMBIntersector1 as time step index for decodeLeaf
+        float ftime;   //!< read by GridSOAMBIntersector1 as blend factor between its two vertex sets
+      };
+      /*! Gathers the triangle vertices selected by Loader starting at grid_x and runs the Pluecker test against the ray, passing an Intersect1EpilogMU built from the grid's geomID/primID. */
+      template<typename Loader>
+        static __forceinline void intersect(RayHit& ray,
+                                            IntersectContext* context,
+                                            const float* const grid_x,
+                                            const size_t line_offset,
+                                            const size_t lines,
+                                            Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset; // the y, z and uv arrays follow x at multiples of dim_offset floats
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+        Vec3<vfloat> v0, v1, v2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);
+        GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+        PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+        intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+      /*! Same gather and Pluecker test as intersect<Loader>, but with an Occluded1EpilogMU; returns the result of intersector.intersect. */
+      template<typename Loader>
+        static __forceinline bool occluded(Ray& ray,
+                                           IntersectContext* context,
+                                           const float* const grid_x,
+                                           const size_t line_offset,
+                                           const size_t lines,
+                                           Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset; // the y, z and uv arrays follow x at multiples of dim_offset floats
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> v0, v1, v2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);
+
+        GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+        PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+        return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+
+      /*! Intersect a ray with the grid leaf referenced by prim (time step 0 of pre.grid); lazy_node is not used. */
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim);
+
+#if defined(__AVX__)
+        intersect<GridSOA::Gather3x3>( ray, context, grid_x, line_offset, lines, pre);
+#else
+        intersect<GridSOA::Gather2x3>(ray, context, grid_x            , line_offset, lines, pre);
+        if (likely(lines > 2)) // a second pass, starting one grid line further, is only needed for more than 2 lines
+          intersect<GridSOA::Gather2x3>(ray, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+      }
+
+      /*! Test if the ray is occluded by the grid leaf referenced by prim (time step 0 of pre.grid); lazy_node is not used. */
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim);
+
+#if defined(__AVX__)
+        return occluded<GridSOA::Gather3x3>( ray, context, grid_x, line_offset, lines, pre);
+#else
+        if (occluded<GridSOA::Gather2x3>(ray, context, grid_x            , line_offset, lines, pre)) return true;
+        if (likely(lines > 2)) // a second pass, starting one grid line further, is only needed for more than 2 lines
+          if (occluded<GridSOA::Gather2x3>(ray, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+        return false;
+      }
+    };
+
+    class GridSOAMBIntersector1 // single-ray intersector for GridSOA leaves that blends two vertex sets with pre.ftime
+    {
+    public:
+      typedef void Primitive;
+      typedef GridSOAIntersector1::Precalculations Precalculations;
+      /*! Gathers the Loader's triangles at grid_x and again grid_offset floats later, lerps the two vertex sets with ftime and runs the Pluecker test with an Intersect1EpilogMU. */
+      template<typename Loader>
+        static __forceinline void intersect(RayHit& ray, const float ftime,
+                                            IntersectContext* context,
+                                            const float* const grid_x,
+                                            const size_t line_offset,
+                                            const size_t lines,
+                                            Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t grid_offset   = pre.grid->gridBytes >> 2; // gridBytes/4: float offset from the first to the second vertex set
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> a0, a1, a2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+        Vec3<vfloat> b0, b1, b2; // presumably the vertices of the following time step - TODO confirm against GridSOA's layout
+        Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+        Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime));
+        Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+        Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+        GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+        PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+        intersector.intersect(ray,v0,v1,v2,mapUV,Intersect1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+      /*! Same gather, lerp and Pluecker test as intersect<Loader>, but with an Occluded1EpilogMU; returns the result of intersector.intersect. */
+      template<typename Loader>
+        static __forceinline bool occluded(Ray& ray, const float ftime,
+                                           IntersectContext* context,
+                                           const float* const grid_x,
+                                           const size_t line_offset,
+                                           const size_t lines,
+                                           Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t grid_offset   = pre.grid->gridBytes >> 2; // gridBytes/4: float offset from the first to the second vertex set
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> a0, a1, a2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+        Vec3<vfloat> b0, b1, b2; // presumably the vertices of the following time step - TODO confirm against GridSOA's layout
+        Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+        Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime));
+        Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+        Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+        GridSOA::MapUV<Loader> mapUV(grid_uv,line_offset,lines);
+        PlueckerIntersector1<Loader::M> intersector(ray,nullptr);
+        return intersector.intersect(ray,v0,v1,v2,mapUV,Occluded1EpilogMU<Loader::M,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+
+      /*! Intersect a ray with the grid leaf referenced by prim, decoded for time step pre.itime and blended with pre.ftime; lazy_node is not used. */
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(pre.itime,prim);
+
+#if defined(__AVX__)
+        intersect<GridSOA::Gather3x3>( ray, pre.ftime, context, grid_x, line_offset, lines, pre);
+#else
+        intersect<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x, line_offset, lines, pre);
+        if (likely(lines > 2)) // a second pass, starting one grid line further, is only needed for more than 2 lines
+          intersect<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+      }
+
+      /*! Test if the ray is occluded by the grid leaf referenced by prim, decoded for time step pre.itime and blended with pre.ftime; lazy_node is not used. */
+      static __forceinline bool occluded(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(pre.itime,prim);
+
+#if defined(__AVX__)
+        return occluded<GridSOA::Gather3x3>( ray, pre.ftime, context, grid_x, line_offset, lines, pre);
+#else
+        if (occluded<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x            , line_offset, lines, pre)) return true;
+        if (likely(lines > 2)) // a second pass, starting one grid line further, is only needed for more than 2 lines
+          if (occluded<GridSOA::Gather2x3>(ray, pre.ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+        return false;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h
new file mode 100644
index 0000000000..41d66e1e28
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/grid_soa_intersector_packet.h
@@ -0,0 +1,445 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "grid_soa.h"
+#include "../common/ray.h"
+#include "triangle_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int K>
+      struct MapUV0 // UV functor for the (00,01,10) triangle of a grid quad, i.e. the one intersected with p00,p01,p10
+    {
+      const float* const grid_uv;
+      size_t ofs00, ofs01, ofs10, ofs11;
+      /*! Stores the uv array and the offsets of the four quad corners; ofs11 is accepted so that MapUV0 and MapUV1 share one constructor signature. */
+      __forceinline MapUV0(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11)
+        : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {}
+      /*! Replaces the triangle-local (u,v) by u*uv01 + v*uv10 + (1-u-v)*uv00 of the decoded corner UVs (assuming madd(a,b,c) computes a*b+c). */
+      __forceinline void operator() (vfloat<K>& u, vfloat<K>& v) const {
+        const vfloat<K> uv00(grid_uv[ofs00]);
+        const vfloat<K> uv01(grid_uv[ofs01]);
+        const vfloat<K> uv10(grid_uv[ofs10]);
+        /* corner 11 does not belong to this triangle, so its packed UV is not loaded */
+        const Vec2vf<K> uv0 = GridSOA::decodeUV(uv00);
+        const Vec2vf<K> uv1 = GridSOA::decodeUV(uv01);
+        const Vec2vf<K> uv2 = GridSOA::decodeUV(uv10);
+        const Vec2vf<K> uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0));
+        u = uv[0]; v = uv[1];
+      }
+    };
+    
+    template<int K>
+      struct MapUV1 // UV functor for the (10,01,11) triangle of a grid quad, i.e. the one intersected with p10,p01,p11
+    {
+      const float* const grid_uv;
+      size_t ofs00, ofs01, ofs10, ofs11;
+      /*! Stores the uv array and the offsets of the four quad corners; ofs00 is accepted so that MapUV0 and MapUV1 share one constructor signature. */
+      __forceinline MapUV1(const float* const grid_uv, size_t ofs00, size_t ofs01, size_t ofs10, size_t ofs11)
+        : grid_uv(grid_uv), ofs00(ofs00), ofs01(ofs01), ofs10(ofs10), ofs11(ofs11) {}
+      /*! Replaces the triangle-local (u,v) by u*uv01 + v*uv11 + (1-u-v)*uv10 of the decoded corner UVs (assuming madd(a,b,c) computes a*b+c). */
+      __forceinline void operator() (vfloat<K>& u, vfloat<K>& v) const {
+        /* corner 00 does not belong to this triangle, so its packed UV is not loaded */
+        const vfloat<K> uv01(grid_uv[ofs01]);
+        const vfloat<K> uv10(grid_uv[ofs10]);
+        const vfloat<K> uv11(grid_uv[ofs11]);
+        const Vec2vf<K> uv0 = GridSOA::decodeUV(uv10);
+        const Vec2vf<K> uv1 = GridSOA::decodeUV(uv01);
+        const Vec2vf<K> uv2 = GridSOA::decodeUV(uv11);
+        const Vec2vf<K> uv = madd(u,uv1,madd(v,uv2,(1.0f-u-v)*uv0));
+        u = uv[0]; v = uv[1];
+      }
+    };
+    
+    template<int K>
+      class GridSOAIntersectorK // ray-packet (K rays) intersector for GridSOA leaves; always decodes time step 0
+    {
+    public:
+      typedef void Primitive;
+      /*! Per-packet state: the grid pointer (starts as nullptr, not assigned in this class) and a Pluecker intersector built from the packet. */
+      class Precalculations
+      {
+#if defined(__AVX__)
+        static const int M = 8;
+#else
+        static const int M = 4;
+#endif
+
+      public:
+        __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray)
+          : grid(nullptr), intersector(valid,ray) {}
+
+      public:
+        GridSOA* grid;
+        PlueckerIntersectorK<M,K> intersector; // FIXME: use quad intersector
+      };
+
+      /*! Intersect the rays enabled in valid_i with every quad of the grid leaf referenced by prim; each quad is tested as the two triangles (00,01,10) and (10,01,11). lazy_node is not used. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t line_offset   = pre.grid->width;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim);
+        const float* const grid_y  = grid_x + 1 * dim_offset; // the y, z and uv arrays follow x at multiples of dim_offset floats
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        const size_t max_x = pre.grid->width  == 2 ? 1 : 2; // quads per line: 1 for a width of 2, otherwise 2
+        const size_t max_y = pre.grid->height == 2 ? 1 : 2; // quad rows: 1 for a height of 2, otherwise 2
+        for (size_t y=0; y<max_y; y++)
+        {
+          for (size_t x=0; x<max_x; x++)
+          {
+            const size_t ofs00 = (y+0)*line_offset+(x+0);
+            const size_t ofs01 = (y+0)*line_offset+(x+1);
+            const size_t ofs10 = (y+1)*line_offset+(x+0);
+            const size_t ofs11 = (y+1)*line_offset+(x+1);
+            const Vec3vf<K> p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+
+            pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+            pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+          }
+        }
+      }
+
+      /*! Occlusion test for the rays enabled in valid_i against the grid leaf referenced by prim; returns the negation of the working mask that is handed to OccludedKEpilogMU. lazy_node is not used. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t line_offset   = pre.grid->width;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim);
+        const float* const grid_y  = grid_x + 1 * dim_offset; // the y, z and uv arrays follow x at multiples of dim_offset floats
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        vbool<K> valid = valid_i; // working mask; presumably the epilog clears the lanes it finds occluded - TODO confirm
+        const size_t max_x = pre.grid->width  == 2 ? 1 : 2; // quads per line: 1 for a width of 2, otherwise 2
+        const size_t max_y = pre.grid->height == 2 ? 1 : 2; // quad rows: 1 for a height of 2, otherwise 2
+        for (size_t y=0; y<max_y; y++)
+        {
+          for (size_t x=0; x<max_x; x++)
+          {
+            const size_t ofs00 = (y+0)*line_offset+(x+0);
+            const size_t ofs01 = (y+0)*line_offset+(x+1);
+            const size_t ofs10 = (y+1)*line_offset+(x+0);
+            const size_t ofs11 = (y+1)*line_offset+(x+1);
+            const Vec3vf<K> p00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> p01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> p10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> p11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+
+            pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID()));
+            if (none(valid)) break; // note: this only leaves the inner x loop
+            pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID()));
+            if (none(valid)) break; // note: this only leaves the inner x loop
+          }
+        }
+        return !valid;
+      }
+      /*! Single-ray path: gathers the Loader's triangles at grid_x and intersects them with ray k of the packet, passing an Intersect1KEpilogMU built from the grid's geomID/primID. */
+      template<typename Loader>
+        static __forceinline void intersect(RayHitK<K>& ray, size_t k,
+                                            IntersectContext* context,
+                                            const float* const grid_x,
+                                            const size_t line_offset,
+                                            const size_t lines,
+                                            Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+        Vec3<vfloat> v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);
+        pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Intersect1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+      /*! Single-ray path: same gather as intersect<Loader>, but with an Occluded1KEpilogMU; returns the result of pre.intersector.intersect for ray k. */
+      template<typename Loader>
+        static __forceinline bool occluded(RayK<K>& ray, size_t k,
+                                           IntersectContext* context,
+                                           const float* const grid_x,
+                                           const size_t line_offset,
+                                           const size_t lines,
+                                           Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+        Vec3<vfloat> v0, v1, v2; Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,v0,v1,v2);
+        return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Occluded1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+
+      /*! Intersect ray k of the packet with the grid leaf referenced by prim (time step 0 of pre.grid); lazy_node is not used. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim);
+#if defined(__AVX__)
+        intersect<GridSOA::Gather3x3>( ray, k, context, grid_x, line_offset, lines, pre);
+#else
+        intersect<GridSOA::Gather2x3>(ray, k, context, grid_x            , line_offset, lines, pre);
+        if (likely(lines > 2)) // a second pass, starting one grid line further, is only needed for more than 2 lines
+          intersect<GridSOA::Gather2x3>(ray, k, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+      }
+
+      /*! Test if ray k of the packet is occluded by the grid leaf referenced by prim (time step 0 of pre.grid); lazy_node is not used. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(0,prim);
+
+#if defined(__AVX__)
+        return occluded<GridSOA::Gather3x3>( ray, k, context, grid_x, line_offset, lines, pre);
+#else
+        if (occluded<GridSOA::Gather2x3>(ray, k, context, grid_x            , line_offset, lines, pre)) return true;
+        if (likely(lines > 2)) // a second pass, starting one grid line further, is only needed for more than 2 lines
+          if (occluded<GridSOA::Gather2x3>(ray, k, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+        return false;
+      }
+    };
+
+    template<int K>
+    class GridSOAMBIntersectorK // ray-packet (K rays) intersector for GridSOA leaves that blends two vertex sets per time segment
+    {
+    public:
+      typedef void Primitive;
+      typedef typename GridSOAIntersectorK<K>::Precalculations Precalculations;
+
+      /*! Intersect the rays enabled in valid_i with the grid leaf: getTimeSegment yields a segment index and blend factor per ray, then the rays are processed in groups that share the same segment index. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        vfloat<K> vftime;
+        vint<K> vitime = getTimeSegment(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime);
+
+        vbool<K> valid1 = valid_i; // rays not yet processed
+        while (any(valid1)) {
+          const size_t j = bsf(movemask(valid1));
+          const int itime = vitime[j];
+          const vbool<K> valid2 = valid1 & (itime == vitime); // all pending rays with the same segment as ray j
+          valid1 = valid1 & !valid2;
+          intersect(valid2,pre,ray,vftime,itime,context,prim,lazy_node);
+        }
+      }
+
+      /*! Intersect the rays enabled in valid_i, which the caller selected for the common time step index itime, with every quad of the leaf; vertices are lerped per ray with ftime. lazy_node is not used. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t grid_offset   = pre.grid->gridBytes >> 2; // gridBytes/4: float offset from the first to the second vertex set
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t line_offset   = pre.grid->width;
+        const float* const grid_x  = pre.grid->decodeLeaf(itime,prim);
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        const size_t max_x = pre.grid->width  == 2 ? 1 : 2; // quads per line: 1 for a width of 2, otherwise 2
+        const size_t max_y = pre.grid->height == 2 ? 1 : 2; // quad rows: 1 for a height of 2, otherwise 2
+        for (size_t y=0; y<max_y; y++)
+        {
+          for (size_t x=0; x<max_x; x++)
+          {
+            size_t ofs00 = (y+0)*line_offset+(x+0);
+            size_t ofs01 = (y+0)*line_offset+(x+1);
+            size_t ofs10 = (y+1)*line_offset+(x+0);
+            size_t ofs11 = (y+1)*line_offset+(x+1);
+            const Vec3vf<K> a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+            ofs00 += grid_offset; // from here on the offsets address the second vertex set; MapUV0/MapUV1 below receive these shifted offsets too
+            ofs01 += grid_offset;
+            ofs10 += grid_offset;
+            ofs11 += grid_offset;
+            const Vec3vf<K> b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+            const Vec3vf<K> p00 = lerp(a00,b00,ftime);
+            const Vec3vf<K> p01 = lerp(a01,b01,ftime);
+            const Vec3vf<K> p10 = lerp(a10,b10,ftime);
+            const Vec3vf<K> p11 = lerp(a11,b11,ftime);
+
+            pre.intersector.intersectK(valid_i,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+            pre.intersector.intersectK(valid_i,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),IntersectKEpilogMU<1,K,true>(ray,context,pre.grid->geomID(),pre.grid->primID()));
+          }
+        }
+      }
+
+      /*! Occlusion test for the rays enabled in valid_i: rays are processed in groups sharing a time segment; valid_o keeps the lanes the per-segment helper returns as still set, and its negation is returned. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        vfloat<K> vftime;
+        vint<K> vitime = getTimeSegment(ray.time(), vfloat<K>((float)(pre.grid->time_steps-1)), vftime);
+
+        vbool<K> valid_o = valid_i;
+        vbool<K> valid1 = valid_i; // rays not yet processed
+        while (any(valid1)) {
+          const int j = int(bsf(movemask(valid1)));
+          const int itime = vitime[j];
+          const vbool<K> valid2 = valid1 & (itime == vitime); // all pending rays with the same segment as ray j
+          valid1 = valid1 & !valid2;
+          valid_o &= !valid2 | occluded(valid2,pre,ray,vftime,itime,context,prim,lazy_node); // lanes outside the group are left untouched
+        }
+        return !valid_o;
+      }
+
+      /*! Per-time-segment occlusion helper. NOTE: unlike the other occluded overloads it returns the working mask `valid` itself (not its negation); the caller above ANDs it into valid_o. Presumably OccludedKEpilogMU clears occluded lanes - TODO confirm. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, const vfloat<K>& ftime, int itime, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        const size_t grid_offset   = pre.grid->gridBytes >> 2; // gridBytes/4: float offset from the first to the second vertex set
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const size_t line_offset   = pre.grid->width;
+        const float* const grid_x  = pre.grid->decodeLeaf(itime,prim);
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        vbool<K> valid = valid_i;
+        const size_t max_x = pre.grid->width  == 2 ? 1 : 2; // quads per line: 1 for a width of 2, otherwise 2
+        const size_t max_y = pre.grid->height == 2 ? 1 : 2; // quad rows: 1 for a height of 2, otherwise 2
+        for (size_t y=0; y<max_y; y++)
+        {
+          for (size_t x=0; x<max_x; x++)
+          {
+            size_t ofs00 = (y+0)*line_offset+(x+0);
+            size_t ofs01 = (y+0)*line_offset+(x+1);
+            size_t ofs10 = (y+1)*line_offset+(x+0);
+            size_t ofs11 = (y+1)*line_offset+(x+1);
+            const Vec3vf<K> a00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> a01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> a10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> a11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+            ofs00 += grid_offset; // from here on the offsets address the second vertex set; MapUV0/MapUV1 below receive these shifted offsets too
+            ofs01 += grid_offset;
+            ofs10 += grid_offset;
+            ofs11 += grid_offset;
+            const Vec3vf<K> b00(grid_x[ofs00],grid_y[ofs00],grid_z[ofs00]);
+            const Vec3vf<K> b01(grid_x[ofs01],grid_y[ofs01],grid_z[ofs01]);
+            const Vec3vf<K> b10(grid_x[ofs10],grid_y[ofs10],grid_z[ofs10]);
+            const Vec3vf<K> b11(grid_x[ofs11],grid_y[ofs11],grid_z[ofs11]);
+            const Vec3vf<K> p00 = lerp(a00,b00,ftime);
+            const Vec3vf<K> p01 = lerp(a01,b01,ftime);
+            const Vec3vf<K> p10 = lerp(a10,b10,ftime);
+            const Vec3vf<K> p11 = lerp(a11,b11,ftime);
+
+            pre.intersector.intersectK(valid,ray,p00,p01,p10,MapUV0<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID()));
+            if (none(valid)) break; // note: this only leaves the inner x loop
+            pre.intersector.intersectK(valid,ray,p10,p01,p11,MapUV1<K>(grid_uv,ofs00,ofs01,ofs10,ofs11),OccludedKEpilogMU<1,K,true>(valid,ray,context,pre.grid->geomID(),pre.grid->primID()));
+            if (none(valid)) break; // note: this only leaves the inner x loop
+          }
+        }
+        return valid;
+      }
+      /*! Single-ray path: gathers the Loader's triangles at grid_x and again grid_offset floats later, lerps them with ftime and intersects them with ray k using an Intersect1KEpilogMU. */
+      template<typename Loader>
+        static __forceinline void intersect(RayHitK<K>& ray, size_t k,
+                                            const float ftime,
+                                            IntersectContext* context,
+                                            const float* const grid_x,
+                                            const size_t line_offset,
+                                            const size_t lines,
+                                            Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t grid_offset   = pre.grid->gridBytes >> 2; // gridBytes/4: float offset from the first to the second vertex set
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> a0, a1, a2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+        Vec3<vfloat> b0, b1, b2;
+        Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+        Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime));
+        Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+        Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+        pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Intersect1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+      /*! Single-ray path: same gather and lerp as intersect<Loader>, but with an Occluded1KEpilogMU; returns the result of pre.intersector.intersect for ray k. */
+      template<typename Loader>
+        static __forceinline bool occluded(RayK<K>& ray, size_t k,
+                                           const float ftime,
+                                           IntersectContext* context,
+                                           const float* const grid_x,
+                                           const size_t line_offset,
+                                           const size_t lines,
+                                           Precalculations& pre)
+      {
+        typedef typename Loader::vfloat vfloat;
+        const size_t grid_offset   = pre.grid->gridBytes >> 2; // gridBytes/4: float offset from the first to the second vertex set
+        const size_t dim_offset    = pre.grid->dim_offset;
+        const float* const grid_y  = grid_x + 1 * dim_offset;
+        const float* const grid_z  = grid_x + 2 * dim_offset;
+        const float* const grid_uv = grid_x + 3 * dim_offset;
+
+        Vec3<vfloat> a0, a1, a2;
+        Loader::gather(grid_x,grid_y,grid_z,line_offset,lines,a0,a1,a2);
+
+        Vec3<vfloat> b0, b1, b2;
+        Loader::gather(grid_x+grid_offset,grid_y+grid_offset,grid_z+grid_offset,line_offset,lines,b0,b1,b2);
+
+        Vec3<vfloat> v0 = lerp(a0,b0,vfloat(ftime));
+        Vec3<vfloat> v1 = lerp(a1,b1,vfloat(ftime));
+        Vec3<vfloat> v2 = lerp(a2,b2,vfloat(ftime));
+
+        return pre.intersector.intersect(ray,k,v0,v1,v2,GridSOA::MapUV<Loader>(grid_uv,line_offset,lines),Occluded1KEpilogMU<Loader::M,K,true>(ray,k,context,pre.grid->geomID(),pre.grid->primID()));
+      }
+
+      /*! Intersect ray k of the packet with the grid leaf referenced by prim; the time step index and blend factor come from getTimeSegment on the ray's time. lazy_node is not used. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        float ftime;
+        int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime);
+
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(itime,prim);
+
+#if defined(__AVX__)
+        intersect<GridSOA::Gather3x3>( ray, k, ftime, context, grid_x, line_offset, lines, pre);
+#else
+        intersect<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x, line_offset, lines, pre);
+        if (likely(lines > 2)) // a second pass, starting one grid line further, is only needed for more than 2 lines
+          intersect<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre);
+#endif
+      }
+
+      /*! Test if ray k of the packet is occluded by the grid leaf referenced by prim; the time step index and blend factor come from getTimeSegment on the ray's time. lazy_node is not used. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t& lazy_node)
+      {
+        float ftime;
+        int itime = getTimeSegment(ray.time()[k], float(pre.grid->time_steps-1), ftime);
+
+        const size_t line_offset   = pre.grid->width;
+        const size_t lines         = pre.grid->height;
+        const float* const grid_x  = pre.grid->decodeLeaf(itime,prim);
+
+#if defined(__AVX__)
+        return occluded<GridSOA::Gather3x3>( ray, k, ftime, context, grid_x, line_offset, lines, pre);
+#else
+        if (occluded<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x, line_offset, lines, pre)) return true;
+        if (likely(lines > 2)) // a second pass, starting one grid line further, is only needed for more than 2 lines
+          if (occluded<GridSOA::Gather2x3>(ray, k, ftime, context, grid_x+line_offset, line_offset, lines, pre)) return true;
+#endif
+        return false;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/instance.h b/thirdparty/embree-aarch64/kernels/geometry/instance.h
new file mode 100644
index 0000000000..66893d581f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/instance.h
@@ -0,0 +1,78 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../common/scene_instance.h"
+
+namespace embree
+{
+  struct InstancePrimitive // primitive record holding a pointer to one Instance plus the ID stored in instID_
+  {
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    }; // the four member functions are only declared in this header
+    static Type type;
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored primitives (always 1) */
+    static __forceinline size_t max_size() { return 1; }
+
+    /* Returns required number of primitive blocks for N primitives (one block per primitive) */
+    static __forceinline size_t blocks(size_t N) { return N; }
+
+  public:
+    /* Stores the instance pointer and the ID; fill/fillMB below pass the geomID as instID */
+    InstancePrimitive (const Instance* instance, unsigned int instID) 
+    : instance(instance) 
+    , instID_(instID)
+    {}
+    /* Re-constructs *this in place from prims[i] and advances i; asserts that [i,end) holds exactly one entry */
+    __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene)
+    {
+      assert(end-i == 1);
+      const PrimRef& prim = prims[i]; i++;
+      const unsigned int geomID = prim.geomID();
+      const Instance* instance = scene->get<Instance>(geomID);
+      new (this) InstancePrimitive(instance, geomID); // placement new over the current object
+    }
+    /* Same in-place construction as fill; additionally returns instance->linearBounds(0,itime) */
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime)
+    {
+      assert(end-i == 1);
+      const PrimRef& prim = prims[i]; i++;
+      const unsigned int geomID = prim.geomID();
+      const Instance* instance = scene->get<Instance>(geomID);
+      new (this) InstancePrimitive(instance,geomID); // placement new over the current object
+      return instance->linearBounds(0,itime);
+    }
+    /* PrimRefMB overload: same in-place construction; returns instance->linearBounds(0,time_range) */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      assert(end-i == 1);
+      const PrimRefMB& prim = prims[i]; i++;
+      const unsigned int geomID = prim.geomID();
+      const Instance* instance = scene->get<Instance>(geomID);
+      new (this) InstancePrimitive(instance,geomID); // placement new over the current object
+      return instance->linearBounds(0,time_range);
+    }
+
+    /* Returns instance->bounds(0) of the passed instance; no member of *this is written */
+    __forceinline BBox3fa update(Instance* instance) {
+      return instance->bounds(0);
+    }
+
+  public:
+    const Instance* instance; // set by the constructor; fill/fillMB obtain it via scene->get<Instance>(geomID)
+    const unsigned int instID_ = std::numeric_limits<unsigned int>::max (); // default is replaced by the constructor argument
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h
new file mode 100644
index 0000000000..91731a39c5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/instance_intersector.h
@@ -0,0 +1,84 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "instance.h"
+#include "../common/ray.h"
+#include "../common/point_query.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct InstanceIntersector1 // single-ray entry points for InstancePrimitive
+    {
+      typedef InstancePrimitive Primitive;
+      /* nothing is precomputed: the constructor ignores both arguments */
+      struct Precalculations {
+        __forceinline Precalculations (const Ray& ray, const void *ptr) {}
+      };
+      /* the three entry points are only declared in this header */
+      static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim);
+      static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim);
+      static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim);
+    };
+
+    struct InstanceIntersector1MB // same interface as InstanceIntersector1; MB presumably stands for motion blur - TODO confirm
+    {
+      typedef InstancePrimitive Primitive;
+      /* nothing is precomputed: the constructor ignores both arguments */
+      struct Precalculations {
+        __forceinline Precalculations (const Ray& ray, const void *ptr) {}
+      };
+      /* the three entry points are only declared in this header */
+      static void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim);
+      static bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim);
+      static bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim);
+    };
+
+    template<int K> // K: width of the RayK<K> / vbool<K> packet types used below
+      struct InstanceIntersectorK
+    {
+      typedef InstancePrimitive Primitive;
+      /* nothing is precomputed: the constructor ignores both arguments */
+      struct Precalculations {
+        __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
+      };
+      /* packet entry points, only declared in this header */
+      static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim);
+      static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim);
+      /* single-lane wrapper: forwards to the packet version with a mask built from 1<<k */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);
+      }
+      /* single-lane wrapper: the returned mask is discarded, the result is read back from ray.tfar[k] */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);
+        return ray.tfar[k] < 0.0f; // assumes the packet routine leaves a negative tfar on occluded lanes - TODO confirm in its definition
+      }
+    };
+
+    template<int K> // K: width of the RayK<K> / vbool<K> packet types used below
+      struct InstanceIntersectorKMB
+    {
+      typedef InstancePrimitive Primitive;
+      /* nothing is precomputed: the constructor ignores both arguments */
+      struct Precalculations {
+        __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
+      };
+      /* packet entry points, only declared in this header */
+      static void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim);
+      static vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim);
+      /* single-lane wrapper: forwards to the packet version with a mask built from 1<<k */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);
+      }
+      /* single-lane wrapper: the returned mask is discarded, the result is read back from ray.tfar[k] */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);
+        return ray.tfar[k] < 0.0f; // assumes the packet routine leaves a negative tfar on occluded lanes - TODO confirm in its definition
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/intersector_epilog.h b/thirdparty/embree-aarch64/kernels/geometry/intersector_epilog.h
new file mode 100644
index 0000000000..0df49dd6e9
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/intersector_epilog.h
@@ -0,0 +1,1074 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/context.h"
+#include "filter.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M> // M: width of the vfloat<M> arguments
+    struct UVIdentity { // functor with an empty body: u and v are left unchanged
+      __forceinline void operator() (vfloat<M>& u, vfloat<M>& v) const {}
+    };
+
+
+    template<bool filter> // filter: compile-time switch for the intersection-filter branch below
+    struct Intersect1Epilog1 // accepts or rejects one candidate hit (fixed geomID/primID) for a single RayHit
+    {
+      RayHit& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Intersect1Epilog1(RayHit& ray,
+                                      IntersectContext* context,
+                                      const unsigned int geomID,
+                                      const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+      /* Returns true if the hit was accepted, false if the mask test or the filter rejected it */
+      template<typename Hit>
+      __forceinline bool operator() (Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+        hit.finalize();
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+            HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+            const float old_t = ray.tfar;
+            ray.tfar = hit.t; // candidate distance is written before the filter runs
+            bool found = runIntersectionFilter1(geometry,ray,context,h);
+            if (!found) ray.tfar = old_t; // rejected: restore the previous tfar
+            return found; // the assignments below are skipped on this path
+          }
+        }
+#endif
+
+        /* update hit information */
+        ray.tfar = hit.t;
+        ray.Ng = hit.Ng;
+        ray.u = hit.u;
+        ray.v = hit.v;
+        ray.primID = primID;
+        ray.geomID = geomID;
+        instance_id_stack::copy(context->user->instID, ray.instID); // copy instID from the user context into the ray
+        return true;
+      }
+    };
+
+    template<bool filter> // filter: compile-time switch for the occlusion-filter branch below
+    struct Occluded1Epilog1 // decides whether one candidate hit (fixed geomID/primID) occludes a single Ray
+    {
+      Ray& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Occluded1Epilog1(Ray& ray,
+                                     IntersectContext* context,
+                                     const unsigned int geomID,
+                                     const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+      /* Returns true if the hit counts as an occluder, false if the mask test or the filter rejected it */
+      template<typename Hit>
+      __forceinline bool operator() (Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+        hit.finalize();
+
+        /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) {
+            HitK<1> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+            const float old_t = ray.tfar;
+            ray.tfar = hit.t; // candidate distance is written before the filter runs
+            const bool found = runOcclusionFilter1(geometry,ray,context,h);
+            if (!found) ray.tfar = old_t; // rejected: restore the previous tfar
+            return found;
+          }
+        }
+#endif
+        return true; // accepted; this path does not write to the ray
+      }
+    };
+
+    template<int K, bool filter> // K: packet width; filter: compile-time switch for the filter branch
+    struct Intersect1KEpilog1 // accepts or rejects one candidate hit for lane k of a RayHitK<K> packet
+    {
+      RayHitK<K>& ray;
+      size_t k;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Intersect1KEpilog1(RayHitK<K>& ray, size_t k,
+                                       IntersectContext* context,
+                                       const unsigned int geomID,
+                                       const unsigned int primID)
+        : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+      /* Returns true if the hit was accepted for lane k; only lane k of the packet is written here */
+      template<typename Hit>
+      __forceinline bool operator() (Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask[k]) == 0)
+          return false;
+#endif
+        hit.finalize();
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+            HitK<K> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+            const float old_t = ray.tfar[k];
+            ray.tfar[k] = hit.t; // candidate distance is written before the filter runs
+            const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h)); // packet filter called with a mask built from 1<<k
+            if (!found) ray.tfar[k] = old_t; // rejected: restore the previous tfar of lane k
+            return found; // the assignments below are skipped on this path
+          }
+        }
+#endif
+
+        /* update hit information */
+        ray.tfar[k] = hit.t;
+        ray.Ng.x[k] = hit.Ng.x;
+        ray.Ng.y[k] = hit.Ng.y;
+        ray.Ng.z[k] = hit.Ng.z;
+        ray.u[k] = hit.u;
+        ray.v[k] = hit.v;
+        ray.primID[k] = primID;
+        ray.geomID[k] = geomID;
+        instance_id_stack::copy<const unsigned*, vuint<K>*, const size_t&>(context->user->instID, ray.instID, k);
+        return true;
+      }
+    };
+    
+    template<int K, bool filter> // K: packet width; filter: compile-time switch for the filter branch
+    struct Occluded1KEpilog1 // decides whether one candidate hit occludes lane k of a RayK<K> packet
+    {
+      RayK<K>& ray;
+      size_t k;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Occluded1KEpilog1(RayK<K>& ray, size_t k,
+                                      IntersectContext* context,
+                                      const unsigned int geomID,
+                                      const unsigned int primID)
+        : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+      /* Returns true if the hit counts as an occluder for lane k */
+      template<typename Hit>
+      __forceinline bool operator() (Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask[k]) == 0)
+          return false;
+#endif
+
+        /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter())) {
+            hit.finalize(); // only called on the filter path in this struct
+            HitK<K> h(context->user,geomID,primID,hit.u,hit.v,hit.Ng);
+            const float old_t = ray.tfar[k];
+            ray.tfar[k] = hit.t; // candidate distance is written before the filter runs
+            const bool found = any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h));
+            if (!found) ray.tfar[k] = old_t; // rejected: restore the previous tfar of lane k
+            return found;
+          }
+        }
+#endif 
+        return true; // accepted; this path does not write to the ray
+      }
+    };
+    
+    template<int M, int Mx, bool filter> // M: entries in geomIDs/primIDs; Mx: width of the incoming mask; filter: filter switch
+    struct Intersect1EpilogM // picks the select_min candidate among the valid lanes and commits it to a single RayHit
+    {
+      RayHit& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+
+      __forceinline Intersect1EpilogM(RayHit& ray,
+                                      IntersectContext* context,
+                                      const vuint<M>& geomIDs,
+                                      const vuint<M>& primIDs)
+        : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+      /* Returns true if any candidate was accepted */
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        vbool<Mx> valid = valid_i;
+        if (Mx > M) valid &= (1<<M)-1; // keep only the low M lanes when the mask is wider than M
+        hit.finalize();
+        size_t i = select_min(valid,hit.vt);
+        unsigned int geomID = geomIDs[i];
+
+        /* ray mask and intersection filter tests; rejected candidates are cleared and the next one is tried */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        bool foundhit = false;
+        goto entry; // i and geomID are already set, so the first pass skips the re-selection at the loop top
+        while (true)
+        {
+          if (unlikely(none(valid))) return foundhit;
+          i = select_min(valid,hit.vt);
+
+          geomID = geomIDs[i];
+        entry:
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask) == 0) {
+            clear(valid,i);
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION) 
+          /* call intersection filter function */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+              const Vec2f uv = hit.uv(i);
+              HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              const float old_t = ray.tfar;
+              ray.tfar = hit.t(i); // candidate distance is written before the filter runs
+              const bool found = runIntersectionFilter1(geometry,ray,context,h);
+              if (!found) ray.tfar = old_t; // rejected: restore the previous tfar
+              foundhit |= found;
+              clear(valid,i);
+              valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value
+              continue; // keep testing the remaining candidates even after an accepted one
+            }
+          }
+#endif
+          break; // candidate i passed the mask test and needs no filter call: commit it below
+        }
+#endif
+
+        /* update hit information */
+        const Vec2f uv = hit.uv(i);
+        ray.tfar = hit.vt[i];
+        ray.Ng.x = hit.vNg.x[i];
+        ray.Ng.y = hit.vNg.y[i];
+        ray.Ng.z = hit.vNg.z[i];
+        ray.u = uv.x;
+        ray.v = uv.y;
+        ray.primID = primIDs[i];
+        ray.geomID = geomID;
+        instance_id_stack::copy(context->user->instID, ray.instID);
+        return true;
+
+      }
+    };
+
+#if 0 && defined(__AVX512F__) // compiled out by the leading 0; do not enable, this reduced frequency for BVH4
+    template<int M, bool filter>
+    struct Intersect1EpilogM<M,16,filter> // partial specialization for a 16-wide mask
+    {
+      static const size_t Mx = 16;
+      RayHit& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+
+      __forceinline Intersect1EpilogM(RayHit& ray,
+                                      IntersectContext* context,
+                                      const vuint<M>& geomIDs,
+                                      const vuint<M>& primIDs)
+        : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+      {
+        Scene* MAYBE_UNUSED scene = context->scene;
+        vbool<Mx> valid = valid_i;
+        if (Mx > M) valid &= (1<<M)-1; // keep only the low M lanes when the mask is wider than M
+        hit.finalize();
+        size_t i = select_min(valid,hit.vt);
+        unsigned int geomID = geomIDs[i];
+
+        /* ray mask and intersection filter tests; rejected candidates are cleared and the next one is tried */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        bool foundhit = false;
+        goto entry; // i and geomID are already set, so the first pass skips the re-selection at the loop top
+        while (true)
+        {
+          if (unlikely(none(valid))) return foundhit;
+          i = select_min(valid,hit.vt);
+
+          geomID = geomIDs[i];
+        entry:
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask) == 0) {
+            clear(valid,i);
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION) 
+          /* call intersection filter function */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+              const Vec2f uv = hit.uv(i);
+              HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              const float old_t = ray.tfar;
+              ray.tfar = hit.t(i);
+              const bool found = runIntersectionFilter1(geometry,ray,context,h);
+              if (!found) ray.tfar = old_t;
+              foundhit |= found;
+              clear(valid,i);
+              valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value
+              continue;
+            }
+          }
+#endif
+          break;
+        }
+#endif
+
+        vbool<Mx> finalMask(((unsigned int)1 << i)); // mask built from bit i only
+        ray.update(finalMask,hit.vt,hit.vu,hit.vv,hit.vNg.x,hit.vNg.y,hit.vNg.z,geomID,primIDs);
+        instance_id_stack::foreach([&](unsigned level)
+        {
+          ray.instID[level] = context->user->instID[level];
+          return (context->user->instID[level] != RTC_INVALID_GEOMETRY_ID);
+        });
+        return true;
+
+      }
+    };
+#endif    
+    
+    template<int M, int Mx, bool filter> // M: entries in geomIDs/primIDs; Mx: width of the incoming mask; filter: filter switch
+    struct Occluded1EpilogM // reports whether any valid candidate occludes a single Ray
+    {
+      Ray& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+
+      __forceinline Occluded1EpilogM(Ray& ray,
+                                     IntersectContext* context,
+                                     const vuint<M>& geomIDs,
+                                     const vuint<M>& primIDs)
+        : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+      /* Returns true on the first accepted candidate; without mask/filter support compiled in it always returns true */
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        /* ray mask and occlusion filter tests */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        if (unlikely(filter))
+          hit.finalize(); /* called only once */
+
+        vbool<Mx> valid = valid_i;
+        if (Mx > M) valid &= (1<<M)-1; // keep only the low M lanes when the mask is wider than M
+        size_t m=movemask(valid);
+        goto entry; // the first pass skips the m == 0 check at the loop top
+        while (true)
+        {
+          if (unlikely(m == 0)) return false;
+        entry:
+          size_t i=bsf(m); // presumably the index of a set bit of m (bit scan) - TODO confirm bsf semantics
+
+          const unsigned int geomID = geomIDs[i];
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask) == 0) {
+            m=btc(m,i); // presumably removes bit i from m - TODO confirm btc semantics
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+          /* if we have no filter then the test passed */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+            {
+              const Vec2f uv = hit.uv(i);
+              HitK<1> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              const float old_t = ray.tfar;
+              ray.tfar = hit.t(i); // candidate distance is written before the filter runs
+              if (runOcclusionFilter1(geometry,ray,context,h)) return true;
+              ray.tfar = old_t; // rejected: restore the previous tfar and try the next candidate
+              m=btc(m,i);
+              continue;
+            }
+          }
+#endif
+          break; // candidate i passed the mask test and needs no filter call
+        }
+#endif
+
+        return true;
+      }
+    };
+
+    template<int M, bool filter> // NOTE(review): filter is never tested in this struct, unlike the sibling epilogs - confirm this is intended
+    struct Intersect1EpilogMU // M candidates sharing one geomID/primID, committed to a single RayHit
+    {
+      RayHit& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Intersect1EpilogMU(RayHit& ray,
+                                       IntersectContext* context,
+                                       const unsigned int geomID,
+                                       const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+      /* Returns true if a candidate was accepted */
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+
+        vbool<M> valid = valid_i;
+        hit.finalize();
+
+        size_t i = select_min(valid,hit.vt);
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter()))
+        {
+          bool foundhit = false;
+          while (true)
+          {
+            /* call intersection filter function */
+            Vec2f uv = hit.uv(i);
+            const float old_t = ray.tfar;
+            ray.tfar = hit.t(i); // candidate distance is written before the filter runs
+            HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+            const bool found = runIntersectionFilter1(geometry,ray,context,h);
+            if (!found) ray.tfar = old_t; // rejected: restore the previous tfar
+            foundhit |= found;
+            clear(valid,i);
+            valid &= hit.vt <= ray.tfar; // intersection filters may modify tfar value
+            if (unlikely(none(valid))) break;
+            i = select_min(valid,hit.vt);
+          }
+          return foundhit; // the assignments below are skipped on this path
+        }
+#endif
+
+        /* update hit information */
+        const Vec2f uv = hit.uv(i);
+        const Vec3fa Ng = hit.Ng(i);
+        ray.tfar = hit.t(i);
+        ray.Ng.x = Ng.x;
+        ray.Ng.y = Ng.y;
+        ray.Ng.z = Ng.z;
+        ray.u = uv.x;
+        ray.v = uv.y;
+        ray.primID = primID;
+        ray.geomID = geomID;
+        instance_id_stack::copy(context->user->instID, ray.instID);
+        return true;
+      }
+    };
+    
+    template<int M, bool filter> // NOTE(review): filter is never tested in this struct, unlike the sibling epilogs - confirm this is intended
+    struct Occluded1EpilogMU // M candidates sharing one geomID/primID, tested for occlusion of a single Ray
+    {
+      Ray& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline Occluded1EpilogMU(Ray& ray,
+                                      IntersectContext* context,
+                                      const unsigned int geomID,
+                                      const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+      /* Returns true if a candidate counts as an occluder */
+      template<typename Hit>
+      __forceinline bool operator() (const vbool<M>& valid, Hit& hit) const
+      {
+        /* ray mask test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        if ((geometry->mask & ray.mask) == 0) return false;
+#endif
+
+        /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+        {
+          hit.finalize(); // only called on the filter path in this struct
+          for (size_t m=movemask(valid), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) // visits each set bit of the mask once
+          {
+            const Vec2f uv = hit.uv(i);
+            const float old_t = ray.tfar;
+            ray.tfar = hit.t(i); // candidate distance is written before the filter runs
+            HitK<1> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+            if (runOcclusionFilter1(geometry,ray,context,h)) return true;
+            ray.tfar = old_t; // rejected: restore the previous tfar
+          }
+          return false; // every candidate was rejected by the filter
+        }
+#endif
+        return true; // no filter applies: accepted without writing to the ray
+      }
+    };
+        
+    template<int M, int K, bool filter> // M: entries in geomIDs/primIDs; K: packet width; filter: filter switch
+    struct IntersectKEpilogM // commits candidate slot i (fixed at construction) to the valid lanes of a RayHitK<K> packet
+    {
+      RayHitK<K>& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+      const size_t i;
+
+      __forceinline IntersectKEpilogM(RayHitK<K>& ray,
+                                      IntersectContext* context,
+                                     const vuint<M>& geomIDs,
+                                     const vuint<M>& primIDs,
+                                     size_t i)
+        : ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {}
+      /* Returns the mask of lanes for which the hit was accepted */
+      template<typename Hit>
+      __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+
+        vfloat<K> u, v, t;
+        Vec3vf<K> Ng;
+        vbool<K> valid = valid_i;
+
+        std::tie(u,v,t,Ng) = hit(); // hit is invoked as a callable that yields (u,v,t,Ng)
+
+        const unsigned int geomID = geomIDs[i];
+        const unsigned int primID = primIDs[i];
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+        /* ray masking test */
+#if defined(EMBREE_RAY_MASK)
+        valid &= (geometry->mask & ray.mask) != 0;
+        if (unlikely(none(valid))) return false;
+#endif
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+            HitK<K> h(context->user,geomID,primID,u,v,Ng);
+            const vfloat<K> old_t = ray.tfar;
+            ray.tfar = select(valid,t,ray.tfar); // candidate distance is written to the valid lanes before the filter runs
+            const vbool<K> m_accept = runIntersectionFilter(valid,geometry,ray,context,h);
+            ray.tfar = select(m_accept,ray.tfar,old_t); // lanes that were not accepted get their previous tfar back
+            return m_accept; // the stores below are skipped on this path
+          }
+        }
+#endif
+
+        /* update hit information */
+        vfloat<K>::store(valid,&ray.tfar,t);
+        vfloat<K>::store(valid,&ray.Ng.x,Ng.x);
+        vfloat<K>::store(valid,&ray.Ng.y,Ng.y);
+        vfloat<K>::store(valid,&ray.Ng.z,Ng.z);
+        vfloat<K>::store(valid,&ray.u,u);
+        vfloat<K>::store(valid,&ray.v,v);
+        vuint<K>::store(valid,&ray.primID,primID);
+        vuint<K>::store(valid,&ray.geomID,geomID);
+        instance_id_stack::copy<const unsigned*, vuint<K>*, const vbool<K>&>(context->user->instID, ray.instID, valid);
+        return valid;
+      }
+    };
+    
+    template<int M, int K, bool filter> // M: entries in geomIDs/primIDs; K: packet width; filter: filter switch
+    struct OccludedKEpilogM // tests candidate slot i (fixed at construction) for occlusion of a RayK<K> packet
+    {
+      vbool<K>& valid0;
+      RayK<K>& ray;
+      IntersectContext* context;
+      const vuint<M>& geomIDs;
+      const vuint<M>& primIDs;
+      const size_t i;
+
+      __forceinline OccludedKEpilogM(vbool<K>& valid0,
+                                     RayK<K>& ray,
+                                     IntersectContext* context,
+                                     const vuint<M>& geomIDs,
+                                     const vuint<M>& primIDs,
+                                     size_t i)
+        : valid0(valid0), ray(ray), context(context), geomIDs(geomIDs), primIDs(primIDs), i(i) {}
+      /* Returns the mask of lanes found occluded and removes those lanes from valid0 */
+      template<typename Hit>
+      __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const
+      {
+        vbool<K> valid = valid_i;
+
+        /* ray masking test */
+        Scene* scene MAYBE_UNUSED = context->scene;
+        const unsigned int geomID = geomIDs[i];
+        const unsigned int primID = primIDs[i];
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        valid &= (geometry->mask & ray.mask) != 0;
+        if (unlikely(none(valid))) return valid;
+#endif
+
+        /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+          {
+            vfloat<K> u, v, t;
+            Vec3vf<K> Ng;
+            std::tie(u,v,t,Ng) = hit(); // hit is only evaluated on the filter path
+            HitK<K> h(context->user,geomID,primID,u,v,Ng);
+            const vfloat<K> old_t = ray.tfar;
+            ray.tfar = select(valid,t,ray.tfar); // candidate distance is written to the valid lanes before the filter runs
+            valid = runOcclusionFilter(valid,geometry,ray,context,h);
+            ray.tfar = select(valid,ray.tfar,old_t); // lanes that were not accepted get their previous tfar back
+          }
+        }
+#endif
+
+        /* update occlusion */
+        valid0 = valid0 & !valid;
+        return valid;
+      }
+    };
+    
+    template<int M, int K, bool filter> // K: packet width; filter: filter switch; M is not referenced in this struct body
+    struct IntersectKEpilogMU // commits a hit with one fixed geomID/primID to the valid lanes of a RayHitK<K> packet
+    {
+      RayHitK<K>& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline IntersectKEpilogMU(RayHitK<K>& ray,
+                                       IntersectContext* context,
+                                       const unsigned int geomID,
+                                       const unsigned int primID)
+        : ray(ray), context(context), geomID(geomID), primID(primID) {}
+      /* Returns the mask of lanes for which the hit was accepted */
+      template<typename Hit>
+      __forceinline vbool<K> operator() (const vbool<K>& valid_org, const Hit& hit) const
+      {
+        vbool<K> valid = valid_org;
+        vfloat<K> u, v, t;
+        Vec3vf<K> Ng;
+        std::tie(u,v,t,Ng) = hit(); // hit is invoked as a callable that yields (u,v,t,Ng)
+
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+        /* ray masking test */
+#if defined(EMBREE_RAY_MASK)
+        valid &= (geometry->mask & ray.mask) != 0;
+        if (unlikely(none(valid))) return false;
+#endif
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+            HitK<K> h(context->user,geomID,primID,u,v,Ng);
+            const vfloat<K> old_t = ray.tfar;
+            ray.tfar = select(valid,t,ray.tfar); // candidate distance is written to the valid lanes before the filter runs
+            const vbool<K> m_accept = runIntersectionFilter(valid,geometry,ray,context,h);
+            ray.tfar = select(m_accept,ray.tfar,old_t); // lanes that were not accepted get their previous tfar back
+            return m_accept; // the stores below are skipped on this path
+          }
+        }
+#endif
+
+        /* update hit information */
+        vfloat<K>::store(valid,&ray.tfar,t);
+        vfloat<K>::store(valid,&ray.Ng.x,Ng.x);
+        vfloat<K>::store(valid,&ray.Ng.y,Ng.y);
+        vfloat<K>::store(valid,&ray.Ng.z,Ng.z);
+        vfloat<K>::store(valid,&ray.u,u);
+        vfloat<K>::store(valid,&ray.v,v);
+        vuint<K>::store(valid,&ray.primID,primID);
+        vuint<K>::store(valid,&ray.geomID,geomID);
+        instance_id_stack::copy<const unsigned*, vuint<K>*, const vbool<K>&>(context->user->instID, ray.instID, valid);
+
+        return valid;
+      }
+    };
+    
+    template<int M, int K, bool filter> // K: packet width; filter: filter switch; M is not referenced in this struct body
+    struct OccludedKEpilogMU // tests a hit with one fixed geomID/primID for occlusion of a RayK<K> packet
+    {
+      vbool<K>& valid0;
+      RayK<K>& ray;
+      IntersectContext* context;
+      const unsigned int geomID;
+      const unsigned int primID;
+
+      __forceinline OccludedKEpilogMU(vbool<K>& valid0,
+                                      RayK<K>& ray,
+                                      IntersectContext* context,
+                                      const unsigned int geomID,
+                                      const unsigned int primID)
+        : valid0(valid0), ray(ray), context(context), geomID(geomID), primID(primID) {}
+      /* Returns the mask of lanes found occluded and removes those lanes from valid0 */
+      template<typename Hit>
+      __forceinline vbool<K> operator() (const vbool<K>& valid_i, const Hit& hit) const
+      {
+        vbool<K> valid = valid_i;
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+        valid &= (geometry->mask & ray.mask) != 0;
+        if (unlikely(none(valid))) return false;
+#endif
+
+        /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+          {
+            vfloat<K> u, v, t;
+            Vec3vf<K> Ng;
+            std::tie(u,v,t,Ng) = hit(); // hit is only evaluated on the filter path
+            HitK<K> h(context->user,geomID,primID,u,v,Ng);
+            const vfloat<K> old_t = ray.tfar;
+            ray.tfar = select(valid,t,ray.tfar); // candidate distance is written to the valid lanes before the filter runs
+            valid = runOcclusionFilter(valid,geometry,ray,context,h);
+            ray.tfar = select(valid,ray.tfar,old_t); // lanes that were not accepted get their previous tfar back
+          }
+        }
+#endif
+
+        /* update occlusion */
+        valid0 = valid0 & !valid;
+        return valid;
+      }
+    };
+    
+    template<int M, int Mx, int K, bool filter>
+    struct Intersect1KEpilogM // intersect epilog: ray k of a K-wide packet vs. M candidate hits
+    {
+      RayHitK<K>& ray;
+      size_t k;                  // index of the ray inside the packet that is read and updated
+      IntersectContext* context;
+      const vuint<M>& geomIDs;   // geometry ID of each candidate
+      const vuint<M>& primIDs;   // primitive ID of each candidate
+
+      __forceinline Intersect1KEpilogM(RayHitK<K>& ray, size_t k,
+                                       IntersectContext* context,
+                                       const vuint<M>& geomIDs,
+                                       const vuint<M>& primIDs)
+        : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+      template<typename Hit> // Hit must provide finalize(), vt, vNg, uv(i), t(i) and Ng(i), all used below
+      __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        vbool<Mx> valid = valid_i;
+        hit.finalize();
+        if (Mx > M) valid &= (1<<M)-1; // keep only the low M lanes when the mask is wider than M
+        size_t i = select_min(valid,hit.vt); // presumably the valid lane with the smallest vt - confirm in select_min
+        assert(i<M);
+        unsigned int geomID = geomIDs[i];
+
+        /* ray mask / intersection filter test: walk candidates until one passes or none is left */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        bool foundhit = false;
+        goto entry; // the first candidate was selected above, so skip the re-selection at the loop head
+        while (true)
+        {
+          if (unlikely(none(valid))) return foundhit;
+          i = select_min(valid,hit.vt);
+          assert(i<M);
+          geomID = geomIDs[i];
+        entry:
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask[k]) == 0) {
+            clear(valid,i);
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION) 
+          /* call intersection filter function */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter())) {
+              assert(i<M);
+              const Vec2f uv = hit.uv(i);
+              HitK<K> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              const float old_t = ray.tfar[k];
+              ray.tfar[k] = hit.t(i);
+              const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h)); // filter runs with only lane k active
+              if (!found) ray.tfar[k] = old_t; // rejected: restore the previous tfar
+              foundhit = foundhit | found;
+              clear(valid,i);
+              valid &= hit.vt <= ray.tfar[k]; // intersection filters may modify tfar value
+              continue;
+            }
+          }
+#endif
+          break; // candidate i passed the mask test and needs no filter: commit it below
+        }
+#endif
+        assert(i<M);
+        /* update hit information: copy candidate i into lane k of the ray packet */
+#if 0 && defined(__AVX512F__) // do not enable, this reduced frequency for BVH4
+        ray.updateK(i,k,hit.vt,hit.vu,hit.vv,vfloat<Mx>(hit.vNg.x),vfloat<Mx>(hit.vNg.y),vfloat<Mx>(hit.vNg.z),geomID,vuint<Mx>(primIDs));
+#else
+        const Vec2f uv = hit.uv(i);
+        ray.tfar[k] = hit.t(i);
+        ray.Ng.x[k] = hit.vNg.x[i];
+        ray.Ng.y[k] = hit.vNg.y[i];
+        ray.Ng.z[k] = hit.vNg.z[i];
+        ray.u[k] = uv.x;
+        ray.v[k] = uv.y;
+        ray.primID[k] = primIDs[i];
+        ray.geomID[k] = geomID;
+        instance_id_stack::copy<const unsigned*, vuint<K>*, const size_t&>(context->user->instID, ray.instID, k);
+#endif
+        return true;
+      }
+    };
+    
+    template<int M, int Mx, int K, bool filter>
+    struct Occluded1KEpilogM // occlusion epilog: ray k of a K-wide packet vs. M candidate hits
+    {
+      RayK<K>& ray;
+      size_t k;                  // index of the ray inside the packet that is tested
+      IntersectContext* context;
+      const vuint<M>& geomIDs;   // geometry ID of each candidate
+      const vuint<M>& primIDs;   // primitive ID of each candidate
+
+      __forceinline Occluded1KEpilogM(RayK<K>& ray, size_t k,
+                                      IntersectContext* context,
+                                      const vuint<M>& geomIDs,
+                                      const vuint<M>& primIDs)
+        : ray(ray), k(k), context(context), geomIDs(geomIDs), primIDs(primIDs) {}
+
+      template<typename Hit> // returns true when some candidate occludes ray k (always true if no mask/filter support is compiled in)
+      __forceinline bool operator() (const vbool<Mx>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+
+        /* ray mask / occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION) || defined(EMBREE_RAY_MASK)
+        if (unlikely(filter))
+          hit.finalize(); /* called only once */
+
+        vbool<Mx> valid = valid_i;
+        if (Mx > M) valid &= (1<<M)-1; // keep only the low M lanes when the mask is wider than M
+        size_t m=movemask(valid);
+        goto entry; // enter the loop body directly; the emptiness check only guards later iterations
+        while (true)
+        {
+          if (unlikely(m == 0)) return false;
+        entry:
+          size_t i=bsf(m); // NOTE(review): assumes m != 0 on first entry - confirm callers never pass an empty mask
+
+          const unsigned int geomID = geomIDs[i];
+          Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+
+#if defined(EMBREE_RAY_MASK)
+          /* goto next hit if mask test fails */
+          if ((geometry->mask & ray.mask[k]) == 0) {
+            m=btc(m,i);
+            continue;
+          }
+#endif
+
+#if defined(EMBREE_FILTER_FUNCTION)
+          /* execute occlusion filter */
+          if (filter) {
+            if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+            {
+              const Vec2f uv = hit.uv(i);
+              const float old_t = ray.tfar[k];
+              ray.tfar[k] = hit.t(i);
+              HitK<K> h(context->user,geomID,primIDs[i],uv.x,uv.y,hit.Ng(i));
+              if (any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h))) return true;
+              ray.tfar[k] = old_t; // filter rejected candidate i: restore tfar and try the next one
+              m=btc(m,i);
+              continue;
+            }
+          }
+#endif
+          break; // candidate i passed the mask test and needs no filter: ray k is occluded
+        }
+#endif
+        return true;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct Intersect1KEpilogMU // intersect epilog: ray k of a K-wide packet vs. M candidates sharing one geomID/primID
+    {
+      RayHitK<K>& ray;
+      size_t k;                   // index of the ray inside the packet that is read and updated
+      IntersectContext* context;
+      const unsigned int geomID;  // geometry ID shared by all M candidates
+      const unsigned int primID;  // primitive ID shared by all M candidates
+
+      __forceinline Intersect1KEpilogMU(RayHitK<K>& ray, size_t k,
+                                        IntersectContext* context,
+                                        const unsigned int geomID,
+                                        const unsigned int primID)
+        : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit> // Hit must provide finalize(), vt, uv(i), t(i) and Ng(i), all used below
+      __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        /* ray mask test (one geometry for all candidates, so a single test rejects everything) */
+        if ((geometry->mask & ray.mask[k]) == 0)
+          return false;
+#endif
+
+        /* finalize hit calculation */
+        vbool<M> valid = valid_i;
+        hit.finalize();
+        size_t i = select_min(valid,hit.vt); // presumably the valid lane with the smallest vt - confirm in select_min
+
+        /* intersection filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasIntersectionFilter()))
+          {
+            bool foundhit = false;
+            while (true) // offer candidates to the filter one by one until none remains
+            {
+              const Vec2f uv = hit.uv(i);
+              const float old_t = ray.tfar[k];
+              ray.tfar[k] = hit.t(i);
+              HitK<K> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+              const bool found = any(runIntersectionFilter(vbool<K>(1<<k),geometry,ray,context,h)); // filter runs with only lane k active
+              if (!found) ray.tfar[k] = old_t; // rejected: restore the previous tfar
+              foundhit = foundhit | found;
+              clear(valid,i);
+              valid &= hit.vt <= ray.tfar[k]; // intersection filters may modify tfar value
+              if (unlikely(none(valid))) break;
+              i = select_min(valid,hit.vt);
+            }
+            return foundhit; // this path does not execute the hit update below
+          }
+        }
+#endif
+
+        /* update hit information: copy candidate i into lane k of the ray packet */
+#if 0 && defined(__AVX512F__) // do not enable, this reduced frequency for BVH4
+        const Vec3fa Ng = hit.Ng(i);
+        ray.updateK(i,k,hit.vt,hit.vu,hit.vv,vfloat<M>(Ng.x),vfloat<M>(Ng.y),vfloat<M>(Ng.z),geomID,vuint<M>(primID));
+#else
+        const Vec2f uv = hit.uv(i);
+        const Vec3fa Ng = hit.Ng(i);
+        ray.tfar[k] = hit.t(i);
+        ray.Ng.x[k] = Ng.x;
+        ray.Ng.y[k] = Ng.y;
+        ray.Ng.z[k] = Ng.z;
+        ray.u[k] = uv.x;
+        ray.v[k] = uv.y;
+        ray.primID[k] = primID;
+        ray.geomID[k] = geomID;
+        instance_id_stack::copy<const unsigned*, vuint<K>*, const size_t&>(context->user->instID, ray.instID, k);
+#endif
+        return true;
+      }
+    };
+    
+    template<int M, int K, bool filter>
+    struct Occluded1KEpilogMU // occlusion epilog: ray k of a K-wide packet vs. M candidates sharing one geomID/primID
+    {
+      RayK<K>& ray;
+      size_t k;                   // index of the ray inside the packet that is tested
+      IntersectContext* context;
+      const unsigned int geomID;  // geometry ID shared by all M candidates
+      const unsigned int primID;  // primitive ID shared by all M candidates
+
+      __forceinline Occluded1KEpilogMU(RayK<K>& ray, size_t k,
+                                       IntersectContext* context,
+                                       const unsigned int geomID,
+                                       const unsigned int primID)
+        : ray(ray), k(k), context(context), geomID(geomID), primID(primID) {}
+
+      template<typename Hit> // returns true when ray k is considered occluded by one of the candidates
+      __forceinline bool operator() (const vbool<M>& valid_i, Hit& hit) const
+      {
+        Scene* scene MAYBE_UNUSED = context->scene;
+        Geometry* geometry MAYBE_UNUSED = scene->get(geomID);
+#if defined(EMBREE_RAY_MASK)
+        /* ray mask test (one geometry for all candidates, so a single test rejects everything) */
+        if ((geometry->mask & ray.mask[k]) == 0)
+          return false;
+#endif
+
+        /* occlusion filter test */
+#if defined(EMBREE_FILTER_FUNCTION)
+        if (filter) {
+          if (unlikely(context->hasContextFilter() || geometry->hasOcclusionFilter()))
+          {
+            hit.finalize();
+            for (size_t m=movemask(valid_i), i=bsf(m); m!=0; m=btc(m,i), i=bsf(m)) // visit every set lane of valid_i once
+            {
+              const Vec2f uv = hit.uv(i);
+              const float old_t = ray.tfar[k];
+              ray.tfar[k] = hit.t(i);
+              HitK<K> h(context->user,geomID,primID,uv.x,uv.y,hit.Ng(i));
+              if (any(runOcclusionFilter(vbool<K>(1<<k),geometry,ray,context,h))) return true;
+              ray.tfar[k] = old_t; // filter rejected candidate i: restore tfar before trying the next one
+            }
+            return false;
+          }
+        }
+#endif 
+        return true; // no filter had to run: any candidate in valid_i occludes the ray
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/intersector_iterators.h b/thirdparty/embree-aarch64/kernels/geometry/intersector_iterators.h
new file mode 100644
index 0000000000..5c1ba5cb61
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/intersector_iterators.h
@@ -0,0 +1,172 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/scene.h"
+#include "../common/ray.h"
+#include "../common/point_query.h"
+#include "../bvh/node_intersector1.h"
+#include "../bvh/node_intersector_packet.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename Intersector>
+    struct ArrayIntersector1
+    {
+      using Primitive = typename Intersector::Primitive;
+      using Precalculations = typename Intersector::Precalculations;
+      /* Forwards the single ray to Intersector::intersect for each of the num primitives at prim. */
+      template<int N, int Nx, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        for (const Primitive* cur = prim, *last = prim+num; cur != last; ++cur)
+          Intersector::intersect(pre,ray,context,*cur);
+      }
+      /* Returns true at the first primitive that reports occlusion, false when none of them does. */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        for (const Primitive* cur = prim, *last = prim+num; cur != last; ++cur) {
+          /* the first occluding primitive ends the search */
+          if (Intersector::occluded(pre,ray,context,*cur)) return true;
+        }
+        return false;
+      }
+      /* Runs the point query on every primitive (no early exit) and reports whether any call returned true. */
+      template<int N>
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+      {
+        bool anyChanged = false;
+        for (const Primitive* cur = prim, *last = prim+num; cur != last; ++cur)
+          anyChanged |= Intersector::pointQuery(query, context, *cur);
+        return anyChanged;
+      }
+      /* Packet intersection entry point of this single-ray iterator: does nothing. */
+      template<int K>
+      static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      { /* intentionally empty */
+      }
+      /* Packet occlusion entry point of this single-ray iterator: hands the input mask back unchanged. */
+      template<int K>
+      static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      {
+        return valid;
+      }
+    };
+
+    template<int K, typename Intersector>
+    struct ArrayIntersectorK_1
+    {
+      using Primitive = typename Intersector::Primitive;
+      using Precalculations = typename Intersector::Precalculations;
+      /* Packet path: forwards the K-wide ray packet and its valid mask to every primitive in [prim, prim+num). */
+      template<bool robust>
+      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        for (const Primitive* cur = prim, *last = prim+num; cur != last; ++cur) {
+          Intersector::intersect(valid,pre,ray,context,*cur);
+        }
+      }
+      /* Packet path: returns the complement of the lanes that stayed unoccluded after testing the primitives. */
+      template<bool robust>
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+      {
+        vbool<K> active = valid; /* lanes not yet found occluded */
+        for (const Primitive* cur = prim, *last = prim+num; cur != last; ++cur) {
+          active &= !Intersector::occluded(active,pre,ray,context,*cur);
+          if (none(active)) break;
+        }
+        return !active;
+      }
+      /* Single-ray path: intersects ray k of the packet with every primitive in [prim, prim+num). */
+      template<int N, int Nx, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        for (const Primitive* cur = prim, *last = prim+num; cur != last; ++cur) {
+          Intersector::intersect(pre,ray,k,context,*cur);
+        }
+      }
+      /* Single-ray path: true at the first primitive that reports ray k as occluded, false otherwise. */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        for (const Primitive* cur = prim, *last = prim+num; cur != last; ++cur) {
+          /* the first occluding primitive ends the search */
+          if (Intersector::occluded(pre,ray,k,context,*cur)) return true;
+        }
+        return false;
+      }
+    };
+
+    // =============================================================================================
+
+    template<int K, typename IntersectorK>
+    struct ArrayIntersectorKStream // primitive-array iterator that builds its own precalculations per call
+    {
+      typedef typename IntersectorK::Primitive PrimitiveK;
+      typedef typename IntersectorK::Precalculations PrecalculationsK;
+      /* Packet path: intersects the lanes in valid with each of the num primitives at prim. */
+      static __forceinline void intersectK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      {
+        PrecalculationsK pre(valid,ray); // FIXME: might cause trouble
+
+        for (size_t i=0; i<num; i++) {
+          IntersectorK::intersect(valid,pre,ray,context,prim[i]);
+        }
+      }
+      /* Packet path: returns the complement of the lanes that stayed unoccluded after testing the primitives. */
+      static __forceinline vbool<K> occludedK(const vbool<K>& valid, const Accel::Intersectors* This, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      {
+        PrecalculationsK pre(valid,ray); // FIXME: might cause trouble
+        vbool<K> valid0 = valid; // lanes not yet found occluded
+        for (size_t i=0; i<num; i++) {
+          valid0 &= !IntersectorK::occluded(valid0,pre,ray,context,prim[i]);
+          if (none(valid0)) break; // every lane is occluded: remaining primitives cannot change the result
+        }
+        return !valid0;
+      }
+      /* Single-ray path: intersects ray k of the packet with each primitive. */
+      static __forceinline void intersect(const Accel::Intersectors* This, RayHitK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      {
+        PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
+        for (size_t i=0; i<num; i++) {
+          IntersectorK::intersect(pre,ray,k,context,prim[i]);
+        }
+      }
+      /* Single-ray path: true at the first primitive that reports ray k as occluded, false otherwise. */
+      static __forceinline bool occluded(const Accel::Intersectors* This, RayK<K>& ray, size_t k, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      {
+        PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
+        for (size_t i=0; i<num; i++) {
+          if (IntersectorK::occluded(pre,ray,k,context,prim[i]))
+            return true;
+        }
+        return false;
+      }
+      /* Stream path: bit r of cur_mask selects lane r%K of inputPackets[r/K]; returns the bits of the rays found occluded. */
+      static __forceinline size_t occluded(const Accel::Intersectors* This, size_t cur_mask, RayK<K>** __restrict__ inputPackets, IntersectContext* context, const PrimitiveK* prim, size_t num, size_t& lazy_node)
+      {
+        size_t m_occluded = 0;
+        for (size_t i=0; i<num; i++) {
+          size_t bits = cur_mask & (~m_occluded); // rays already found occluded are not tested again
+          for (; bits!=0; )
+          {
+            const size_t rayID = bscf(bits); // presumably returns the lowest set bit index and clears it in bits
+            RayK<K> &ray = *inputPackets[rayID / K]; // same type as the inputPackets elements; occlusion needs no hit part
+            const size_t k = rayID % K;
+            PrecalculationsK pre(ray.tnear() <= ray.tfar,ray); // FIXME: might cause trouble
+            if (IntersectorK::occluded(pre,ray,k,context,prim[i]))
+            {
+              m_occluded |= (size_t)1 << rayID;
+              ray.tfar[k] = neg_inf; // NOTE(review): looks like the "occluded" marker for this lane - confirm
+            }
+          }
+        }
+        return m_occluded;
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h
new file mode 100644
index 0000000000..eef5b0b1fd
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/line_intersector.h
@@ -0,0 +1,141 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+      struct LineIntersectorHitM // M-wide hit record (u, v, t, Ng per lane) handed to the epilogs
+      {
+        __forceinline LineIntersectorHitM() {} // does not assign any member
+
+        __forceinline LineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+          : vu(u), vv(v), vt(t), vNg(Ng) {}
+        
+        __forceinline void finalize() {} // nothing to finalize: all values are stored at construction
+        
+        __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } // (u,v) of lane i
+        __forceinline float t  (const size_t i) const { return vt[i]; } // t of lane i
+        __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); } // Ng of lane i
+        
+      public:
+        vfloat<M> vu;  // per-lane u
+        vfloat<M> vv;  // per-lane v
+        vfloat<M> vt;  // per-lane t
+        Vec3vf<M> vNg; // per-lane Ng, stored component-wise
+      };
+    
+    template<int M>
+      struct FlatLinearCurveIntersector1 // tests one ray against M line segments given by end points v0i/v1i
+      {
+        typedef CurvePrecalculations1 Precalculations;
+        
+        template<typename Epilog> // epilog(valid,hit) is called with the surviving lanes; its result is returned
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            Ray& ray,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const Epilog& epilog)
+        {
+          /* transform end points into ray space */
+          vbool<M> valid = valid_i;
+          vfloat<M> depth_scale = pre.depth_scale;
+          LinearSpace3<Vec3vf<M>> ray_space = pre.ray_space;
+
+          const Vec3vf<M> ray_org ((Vec3fa)ray.org);
+          const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+          const Vec4vf<M> v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i);
+          
+          Vec4vf<M> p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w); // xyz in ray space, w carried over unchanged
+          Vec4vf<M> p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w);
+          
+          /* approximate intersection with cone */
+          const Vec4vf<M> v = p1-p0;
+          const Vec4vf<M> w = -p0;
+          const vfloat<M> d0 = madd(w.x,v.x,w.y*v.y); // xy dot product of w and v
+          const vfloat<M> d1 = madd(v.x,v.x,v.y*v.y); // squared xy length of v
+          const vfloat<M> u = clamp(d0*rcp(d1),vfloat<M>(zero),vfloat<M>(one)); // segment parameter, clamped to [0,1]
+          const Vec4vf<M> p = madd(u,v,p0); // point on the segment at parameter u (w is interpolated as well)
+          const vfloat<M> t = p.z;
+          const vfloat<M> d2 = madd(p.x,p.x,p.y*p.y); // squared xy distance of p from the ray-space origin
+          const vfloat<M> r = p.w;
+          const vfloat<M> r2 = r*r;
+          valid &= (d2 <= r2) & (vfloat<M>(ray.tnear()) <= t) & (t <= vfloat<M>(ray.tfar));
+          if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) 
+            valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections
+          if (unlikely(none(valid))) return false;
+          
+          /* ignore degenerate segments (both end points identical, i.e. T == 0) */
+          const Vec3vf<M> T = v1.xyz()-v0.xyz();
+          valid &= (T.x != vfloat<M>(zero)) | (T.y != vfloat<M>(zero)) | (T.z != vfloat<M>(zero));
+          if (unlikely(none(valid))) return false;
+          
+          /* update hit information: v is reported as zero and the segment direction T is passed as Ng */
+          LineIntersectorHitM<M> hit(u,zero,t,T);
+          return epilog(valid,hit);
+        }
+      };
+    
+    template<int M, int K>
+      struct FlatLinearCurveIntersectorK // tests ray k of a K-wide packet against M line segments given by v0i/v1i
+      {
+        typedef CurvePrecalculationsK<K> Precalculations;
+        
+        template<typename Epilog> // epilog(valid,hit) is called with the surviving lanes; its result is returned
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            RayK<K>& ray, size_t k,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const Epilog& epilog)
+        {
+          /* transform end points into ray space */
+          vbool<M> valid = valid_i;
+          vfloat<M> depth_scale = pre.depth_scale[k];
+          LinearSpace3<Vec3vf<M>> ray_space = pre.ray_space[k];
+          const Vec3vf<M> ray_org(ray.org.x[k],ray.org.y[k],ray.org.z[k]);
+
+          /* only the origin of ray k is needed here; its direction enters through pre.ray_space[k] */
+          const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+          const Vec4vf<M> v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i);
+          
+          Vec4vf<M> p0(xfmVector(ray_space,v0.xyz()-ray_org), v0.w); // xyz in ray space, w carried over unchanged
+          Vec4vf<M> p1(xfmVector(ray_space,v1.xyz()-ray_org), v1.w);
+          
+          /* approximate intersection with cone */
+          const Vec4vf<M> v = p1-p0;
+          const Vec4vf<M> w = -p0;
+          const vfloat<M> d0 = madd(w.x,v.x,w.y*v.y); // xy dot product of w and v
+          const vfloat<M> d1 = madd(v.x,v.x,v.y*v.y); // squared xy length of v
+          const vfloat<M> u = clamp(d0*rcp(d1),vfloat<M>(zero),vfloat<M>(one)); // segment parameter, clamped to [0,1]
+          const Vec4vf<M> p = madd(u,v,p0); // point on the segment at parameter u (w is interpolated as well)
+          const vfloat<M> t = p.z;
+          const vfloat<M> d2 = madd(p.x,p.x,p.y*p.y); // squared xy distance of p from the ray-space origin
+          const vfloat<M> r = p.w;
+          const vfloat<M> r2 = r*r;
+          valid &= (d2 <= r2) & (vfloat<M>(ray.tnear()[k]) <= t) & (t <= vfloat<M>(ray.tfar[k]));
+          if (EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR != 0.0f) 
+            valid &= t > float(EMBREE_CURVE_SELF_INTERSECTION_AVOIDANCE_FACTOR)*r*depth_scale; // ignore self intersections
+          if (unlikely(none(valid))) return false;
+          
+          /* ignore degenerate segments (both end points identical, i.e. T == 0) */
+          const Vec3vf<M> T = v1.xyz()-v0.xyz();
+          valid &= (T.x != vfloat<M>(zero)) | (T.y != vfloat<M>(zero)) | (T.z != vfloat<M>(zero));
+          if (unlikely(none(valid))) return false;
+          
+          /* update hit information: v is reported as zero and the segment direction T is passed as Ng */
+          LineIntersectorHitM<M> hit(u,zero,t,T);
+          return epilog(valid,hit);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/linei.h b/thirdparty/embree-aarch64/kernels/geometry/linei.h
new file mode 100644
index 0000000000..a72029ca53
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/linei.h
@@ -0,0 +1,709 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  template<int M>  // M: number of line-segment slots in one block (see max_size())
+  struct LineMi    // block of line segments referenced by start-vertex index, with one geometry ID shared by all slots
+  {
+    /* Virtual interface to query information about the line segment type */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;      
+    };
+    static Type type;  // declared only; the definition follows the class
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored line segments */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N line segments (N divided by max_size(), rounded up) */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+    /* Returns required number of bytes for N line segments (whole blocks only) */
+    static __forceinline size_t bytes(size_t N) { return blocks(N)*sizeof(LineMi); }
+
+  public:
+
+    /* Default constructor; performs no member initialization of its own */
+    __forceinline LineMi() {  }
+
+    /* Construction from vertices and IDs: m counts the slots whose primID is not -1, and geomIDs[0] is stored as the shared geometry ID */
+    __forceinline LineMi(const vuint<M>& v0, unsigned short leftExists, unsigned short rightExists, const vuint<M>& geomIDs, const vuint<M>& primIDs, Geometry::GType gtype)
+      : gtype((unsigned char)gtype), m((unsigned char)popcnt(vuint<M>(primIDs) != vuint<M>(-1))), sharedGeomID(geomIDs[0]), leftExists (leftExists), rightExists(rightExists), v0(v0), primIDs(primIDs)
+    {
+      assert(all(vuint<M>(geomID()) == geomIDs)); // all slots must refer to the same geometry
+    }
+
+    /* Returns a mask that tells which line segments are valid (primID differs from -1) */
+    __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); }
+
+    /* Returns the same validity mask, computed at vector width Mx */
+    template<int Mx>
+    __forceinline vbool<Mx> valid() const { return vuint<Mx>(primIDs) != vuint<Mx>(-1); }
+
+    /* Returns if the specified line segment is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; }
+
+    /* Returns the number of stored line segments: bsf over the inverted validity mask, i.e. presumably the index of the first invalid slot */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry ID (the commented-out lines around it are a disabled per-slot variant) */
+    //template<class T>
+    //static __forceinline T unmask(T &index) { return index & 0x3fffffff; }
+
+    __forceinline     unsigned int geomID(unsigned int i = 0) const { return sharedGeomID; } // i is ignored: every slot shares one geometry ID
+    //__forceinline       vuint<M> geomID()       { return unmask(geomIDs); }
+    //__forceinline const vuint<M> geomID() const { return unmask(geomIDs); }
+    //__forceinline unsigned int geomID(const size_t i) const { assert(i<M); return unmask(geomIDs[i]); }
+
+    /* Returns the primitive IDs */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* gather the line segments: p0/p1 receive the vertices v0 and v0+1 of every slot (static, at time step itime, or blended at float time) */
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              const LineSegments* geom) const;
+
+    __forceinline void gatheri(Vec4vf<M>& p0,
+                               Vec4vf<M>& p1,
+                               const LineSegments* geom,
+                               const int itime) const;
+
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              const LineSegments* geom,
+                              float time) const;
+
+    /* gather the line segments with lateral info: pL/pR receive the vertices v0-1 and v0+2, or inf where the leftExists/rightExists bit is clear */
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              Vec4vf<M>& pL,
+                              Vec4vf<M>& pR,
+                              const LineSegments* geom) const;
+
+    __forceinline void gatheri(Vec4vf<M>& p0,
+                               Vec4vf<M>& p1,
+                               Vec4vf<M>& pL,
+                               Vec4vf<M>& pR,
+                               const LineSegments* geom,
+                               const int itime) const;
+
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              Vec4vf<M>& pL,
+                              Vec4vf<M>& pR,
+                              const LineSegments* geom,
+                              float time) const;
+    /* gather the line segments with neighbour flags: cL/cR receive the negated leftExists/rightExists masks */
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              vbool<M>& cL,
+                              vbool<M>& cR,
+                              const LineSegments* geom) const;
+
+    __forceinline void gatheri(Vec4vf<M>& p0,
+                               Vec4vf<M>& p1,
+                               vbool<M>& cL,
+                               vbool<M>& cR,
+                               const LineSegments* geom,
+                               const int itime) const;
+
+    __forceinline void gather(Vec4vf<M>& p0,
+                              Vec4vf<M>& p1,
+                              vbool<M>& cL,
+                              vbool<M>& cR,
+                              const LineSegments* geom,
+                              float time) const;
+
+    /* Calculate the bounds of the line segments: union over the valid slots of the box around both vertices, enlarged by the larger w component */
+    __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M && valid(i); i++) // stops at the first invalid slot
+      {
+        const LineSegments* geom = scene->get<LineSegments>(geomID(i));
+        const Vec3ff& p0 = geom->vertex(v0[i]+0,itime);
+        const Vec3ff& p1 = geom->vertex(v0[i]+1,itime);
+        BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1));
+        b = enlarge(b,Vec3fa(max(p0.w,p1.w)));
+        bounds.extend(b);
+      }
+      return bounds;
+    }
+
+    /* Calculate the linear bounds of the primitive from its bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) {
+      return LBBox3fa(bounds(scene,itime+0), bounds(scene,itime+1));
+    }
+    /* Merges geom->linearBounds(primID, itime, numTimeSteps) over the valid slots */
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) {
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const LineSegments* geom = scene->get<LineSegments>(geomID(i));
+        allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps));
+      }
+      return allBounds;
+    }
+    /* Merges geom->linearBounds(primID, time_range) over the valid slots */
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) 
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const LineSegments* geom = scene->get<LineSegments>(geomID((unsigned int)i));
+        allBounds.extend(geom->linearBounds(primID(i), time_range));
+      }
+      return allBounds;
+    }
+
+    /* Fill line segment from line segment list: consumes up to M entries of prims[begin,end) and advances begin; leftover slots are padded as invalid */
+    template<typename PrimRefT>
+    __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      Geometry::GType gty = scene->get(prims[begin].geomID())->getType(); // geometry type of the first primitive, stored for the whole block
+      vuint<M> geomID, primID;
+      vuint<M> v0;
+      unsigned short leftExists = 0;
+      unsigned short rightExists = 0;
+      const PrimRefT* prim = &prims[begin];
+
+      for (size_t i=0; i<M; i++)
+      {
+        const LineSegments* geom = scene->get<LineSegments>(prim->geomID());
+        if (begin<end) { // a primitive is left: store it in slot i
+          geomID[i] = prim->geomID();
+          primID[i] = prim->primID();
+          v0[i] = geom->segment(prim->primID());
+          leftExists |= geom->segmentLeftExists(primID[i]) << i; // bit i records the flag for slot i
+          rightExists |= geom->segmentRightExists(primID[i]) << i;         
+          begin++;
+        } else { // list exhausted: pad slot i
+          assert(i); // the first slot must always receive a primitive
+          if (i>0) {
+            geomID[i] = geomID[i-1];
+            primID[i] = -1; // -1 marks the slot as invalid (see valid())
+            v0[i] = v0[i-1];
+          }
+        }
+        if (begin<end) prim = &prims[begin]; // FIXME: remove this line
+      }
+      new (this) LineMi(v0,leftExists,rightExists,geomID,primID,gty); // FIXME: use non temporal store
+    }
+    /* Allocates blocks(set.size()) blocks, fills them from prims[set.begin(),set.end()) and returns the encoded leaf */
+     template<typename BVH, typename Allocator>
+      __forceinline static typename BVH::NodeRef createLeaf (BVH* bvh, const PrimRef* prims, const range<size_t>& set, const Allocator& alloc)
+    {
+      size_t start = set.begin();
+      size_t items = LineMi::blocks(set.size());
+      size_t numbytes = LineMi::bytes(set.size());
+      LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float));
+      for (size_t i=0; i<items; i++) {
+        accel[i].fill(prims,start,set.end(),bvh->scene); // fill() advances start past the consumed primitives
+      }
+      return bvh->encodeLeaf((char*)accel,items);
+    };
+    /* Fills the block, then returns its linear bounds between time steps itime and itime+1 */
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+    {
+      fill(prims,begin,end,scene);
+      return linearBounds(scene,itime);
+    }
+    /* Fills the block, then returns its linear bounds over time_range */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      fill(prims,begin,end,scene);
+      return linearBounds(scene,time_range);
+    }
+    /* Time-range variant of createLeaf: returns the leaf together with the merged linear bounds of its blocks and prims.time_range */
+      template<typename BVH, typename SetMB, typename Allocator>
+    __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc)
+    {
+      size_t start = prims.begin();
+      size_t end   = prims.end();
+      size_t items = LineMi::blocks(prims.size());
+      size_t numbytes = LineMi::bytes(prims.size());
+      LineMi* accel = (LineMi*) alloc.malloc1(numbytes,M*sizeof(float));
+      const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel,items);
+      
+      LBBox3fa bounds = empty;
+      for (size_t i=0; i<items; i++)
+        bounds.extend(accel[i].fillMB(prims.prims->data(),start,end,bvh->scene,prims.time_range));
+      
+      return typename BVH::NodeRecordMB4D(node,bounds,prims.time_range);
+    };
+
+    /* Updates the primitive: recomputes and returns the bounds of the valid segments from geom's vertices; no member is written */
+    __forceinline BBox3fa update(LineSegments* geom)
+    {
+      BBox3fa bounds = empty;
+      for (size_t i=0; i<M && valid(i); i++)
+      {
+        const Vec3ff& p0 = geom->vertex(v0[i]+0);
+        const Vec3ff& p1 = geom->vertex(v0[i]+1);
+        BBox3fa b = merge(BBox3fa(p0),BBox3fa(p1));
+        b = enlarge(b,Vec3fa(max(p0.w,p1.w)));
+        bounds.extend(b);
+      }
+      return bounds;
+    }
+
+    /*! output operator: prints the start-vertex indices, the shared geometry ID and the primitive IDs */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const LineMi& line) {
+      return cout << "Line" << M << "i {" << line.v0 << ", " << line.geomID() << ", " << line.primID() << "}";
+    }
+    
+  public:
+    unsigned char gtype; // Geometry::GType given to the constructor, narrowed to one byte
+    unsigned char m; // number of slots whose primID is not -1
+    unsigned int sharedGeomID; // geometry ID shared by all slots
+    unsigned short leftExists, rightExists; // bit i: result of segmentLeftExists/segmentRightExists for slot i
+    vuint<M> v0;      // index of start vertex
+  private:
+    vuint<M> primIDs; // primitive ID per slot; -1 marks an unused slot
+  };
+
+  template<>
+  __forceinline void LineMi<4>::gather(Vec4vf4& p0, Vec4vf4& p1, const LineSegments* geom) const
+  {
+    /* vertex v0 of each of the four slots, transposed into the x/y/z/w lanes of p0 */
+    const vfloat4 head0 = vfloat4::loadu(geom->vertexPtr(v0[0]));
+    const vfloat4 head1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+    const vfloat4 head2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+    const vfloat4 head3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+    transpose(head0,head1,head2,head3,p0.x,p0.y,p0.z,p0.w);
+
+    /* vertex v0+1 of each slot, transposed into p1 */
+    const vfloat4 tail0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1));
+    const vfloat4 tail1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+    const vfloat4 tail2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+    const vfloat4 tail3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+    transpose(tail0,tail1,tail2,tail3,p1.x,p1.y,p1.z,p1.w);
+  }
+
+  template<>
+  __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, Vec4vf4& p1,
+                                        const LineSegments* geom, const int itime) const
+  {
+    /* vertex v0 of each of the four slots at time step itime, transposed into p0 */
+    const vfloat4 head0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime));
+    const vfloat4 head1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+    const vfloat4 head2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+    const vfloat4 head3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+    transpose(head0,head1,head2,head3,p0.x,p0.y,p0.z,p0.w);
+
+    /* vertex v0+1 of each slot at time step itime, transposed into p1 */
+    const vfloat4 tail0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime));
+    const vfloat4 tail1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+    const vfloat4 tail2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+    const vfloat4 tail3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+    transpose(tail0,tail1,tail2,tail3,p1.x,p1.y,p1.z,p1.w);
+  }
+
+  template<>
+  __forceinline void LineMi<4>::gather(Vec4vf4& p0, Vec4vf4& p1,
+                                       const LineSegments* geom, float time) const
+  {
+    /* timeSegment returns the time step index and writes the weight later passed to lerp */
+    float weight;
+    const int step = geom->timeSegment(time, weight);
+
+    /* vertices at time steps step and step+1, blended by weight */
+    Vec4vf4 lo0,lo1;
+    gatheri(lo0,lo1,geom,step);
+    Vec4vf4 hi0,hi1;
+    gatheri(hi0,hi1,geom,step+1);
+    p0 = lerp(lo0,hi0,vfloat4(weight));
+    p1 = lerp(lo1,hi1,vfloat4(weight));
+  }
+
+  template<>
+  __forceinline void LineMi<4>::gather(Vec4vf4& p0, Vec4vf4& p1,
+                                       vbool4& cL, vbool4& cR,
+                                       const LineSegments* geom) const
+  {
+    /* vertices come from the plain gather; cL/cR are the negated neighbour masks */
+    gather(p0,p1,geom);
+    const vbool4 hasLeft(leftExists);
+    const vbool4 hasRight(rightExists);
+    cL = !hasLeft; cR = !hasRight;
+  }
+
+  template<>
+  __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, Vec4vf4& p1,
+                                        vbool4& cL, vbool4& cR,
+                                        const LineSegments* geom, const int itime) const
+  {
+    /* vertices at time step itime come from the plain gatheri */
+    gatheri(p0,p1,geom,itime);
+    /* cL/cR are the negated neighbour masks */
+    const vbool4 hasLeft(leftExists);
+    const vbool4 hasRight(rightExists);
+    cL = !hasLeft; cR = !hasRight;
+  }
+
+  template<>
+  __forceinline void LineMi<4>::gather(Vec4vf4& p0, Vec4vf4& p1,
+                                       vbool4& cL, vbool4& cR,
+                                       const LineSegments* geom, float time) const
+  {
+    /* timeSegment returns the time step index and writes the weight later passed to lerp */
+    float weight;
+    const int step = geom->timeSegment(time, weight);
+    /* vertices at time steps step and step+1, blended by weight */
+    Vec4vf4 lo0,lo1;
+    gatheri(lo0,lo1,geom,step);
+    Vec4vf4 hi0,hi1;
+    gatheri(hi0,hi1,geom,step+1);
+    p0 = lerp(lo0,hi0,vfloat4(weight));
+    p1 = lerp(lo1,hi1,vfloat4(weight));
+    /* cL/cR are the negated neighbour masks; they do not depend on time */
+    const vbool4 hasLeft(leftExists);
+    const vbool4 hasRight(rightExists);
+    cL = !hasLeft; cR = !hasRight;
+  }
+
+  template<>
+  __forceinline void LineMi<4>::gather(Vec4vf4& p0, Vec4vf4& p1,
+                                       Vec4vf4& pL, Vec4vf4& pR,
+                                       const LineSegments* geom) const
+  {
+    const vfloat4 head0 = vfloat4::loadu(geom->vertexPtr(v0[0]));
+    const vfloat4 head1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+    const vfloat4 head2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+    const vfloat4 head3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+    transpose(head0,head1,head2,head3,p0.x,p0.y,p0.z,p0.w);
+
+    const vfloat4 tail0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1));
+    const vfloat4 tail1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+    const vfloat4 tail2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+    const vfloat4 tail3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+    transpose(tail0,tail1,tail2,tail3,p1.x,p1.y,p1.z,p1.w);
+
+    /* vertex v0-1 where the slot's leftExists bit is set, inf otherwise */
+    const vfloat4 prev0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf);
+    const vfloat4 prev1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf);
+    const vfloat4 prev2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf);
+    const vfloat4 prev3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf);
+    transpose(prev0,prev1,prev2,prev3,pL.x,pL.y,pL.z,pL.w);
+
+    /* vertex v0+2 where the slot's rightExists bit is set, inf otherwise */
+    const vfloat4 next0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf);
+    const vfloat4 next1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf);
+    const vfloat4 next2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf);
+    const vfloat4 next3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf);
+    transpose(next0,next1,next2,next3,pR.x,pR.y,pR.z,pR.w);
+  }
+  
+  template<>
+  __forceinline void LineMi<4>::gatheri(Vec4vf4& p0, Vec4vf4& p1,
+                                        Vec4vf4& pL, Vec4vf4& pR,
+                                        const LineSegments* geom, const int itime) const
+  {
+    /* all loads below read the vertices of time step itime */
+    const vfloat4 head0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime));
+    const vfloat4 head1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+    const vfloat4 head2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+    const vfloat4 head3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+    transpose(head0,head1,head2,head3,p0.x,p0.y,p0.z,p0.w);
+
+    const vfloat4 tail0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime));
+    const vfloat4 tail1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+    const vfloat4 tail2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+    const vfloat4 tail3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+    transpose(tail0,tail1,tail2,tail3,p1.x,p1.y,p1.z,p1.w);
+
+    /* vertex v0-1 where the slot's leftExists bit is set, inf otherwise */
+    const vfloat4 prev0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf);
+    const vfloat4 prev1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf);
+    const vfloat4 prev2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf);
+    const vfloat4 prev3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf);
+    transpose(prev0,prev1,prev2,prev3,pL.x,pL.y,pL.z,pL.w);
+
+    /* vertex v0+2 where the slot's rightExists bit is set, inf otherwise */
+    const vfloat4 next0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf);
+    const vfloat4 next1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf);
+    const vfloat4 next2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf);
+    const vfloat4 next3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf);
+    transpose(next0,next1,next2,next3,pR.x,pR.y,pR.z,pR.w);
+  }
+  
+  template<>
+  __forceinline void LineMi<4>::gather(Vec4vf4& p0, Vec4vf4& p1,
+                                       Vec4vf4& pL, Vec4vf4& pR,
+                                       const LineSegments* geom, float time) const
+  {
+    /* timeSegment returns the time step index and writes the weight later passed to lerp */
+    float weight;
+    const int step = geom->timeSegment(time, weight);
+
+    /* NOTE(review): slots without a neighbour hold inf in both loL/hiL (or loR/hiR);
+       confirm that lerp keeps them inf when weight is exactly 0 or 1 */
+    Vec4vf4 lo0,lo1,loL,loR;
+    gatheri(lo0,lo1,loL,loR,geom,step);
+    Vec4vf4 hi0,hi1,hiL,hiR;
+    gatheri(hi0,hi1,hiL,hiR,geom,step+1);
+    p0 = lerp(lo0,hi0,vfloat4(weight));
+    p1 = lerp(lo1,hi1,vfloat4(weight));
+    pL = lerp(loL,hiL,vfloat4(weight));
+    pR = lerp(loR,hiR,vfloat4(weight));
+  }
+
+#if defined(__AVX__)
+
+  template<>
+  __forceinline void LineMi<8>::gather(Vec4vf8& p0, Vec4vf8& p1, const LineSegments* geom) const
+  {
+    /* vertex v0 of each of the eight slots, transposed into the x/y/z/w lanes of p0 */
+    const vfloat4 head0 = vfloat4::loadu(geom->vertexPtr(v0[0]));
+    const vfloat4 head1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+    const vfloat4 head2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+    const vfloat4 head3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+    const vfloat4 head4 = vfloat4::loadu(geom->vertexPtr(v0[4]));
+    const vfloat4 head5 = vfloat4::loadu(geom->vertexPtr(v0[5]));
+    const vfloat4 head6 = vfloat4::loadu(geom->vertexPtr(v0[6]));
+    const vfloat4 head7 = vfloat4::loadu(geom->vertexPtr(v0[7]));
+    transpose(head0,head1,head2,head3,head4,head5,head6,head7,p0.x,p0.y,p0.z,p0.w);
+
+    /* vertex v0+1 of each slot, transposed into p1 */
+    const vfloat4 tail0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1));
+    const vfloat4 tail1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+    const vfloat4 tail2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+    const vfloat4 tail3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+    const vfloat4 tail4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1));
+    const vfloat4 tail5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1));
+    const vfloat4 tail6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1));
+    const vfloat4 tail7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1));
+    transpose(tail0,tail1,tail2,tail3,tail4,tail5,tail6,tail7,p1.x,p1.y,p1.z,p1.w);
+  }
+
+  template<>
+  __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, Vec4vf8& p1,
+                                        const LineSegments* geom, const int itime) const
+  {
+    /* vertex v0 of each of the eight slots at time step itime, transposed into p0 */
+    const vfloat4 head0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime));
+    const vfloat4 head1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+    const vfloat4 head2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+    const vfloat4 head3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+    const vfloat4 head4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime));
+    const vfloat4 head5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime));
+    const vfloat4 head6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime));
+    const vfloat4 head7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime));
+    transpose(head0,head1,head2,head3,head4,head5,head6,head7,p0.x,p0.y,p0.z,p0.w);
+
+    /* vertex v0+1 of each slot at time step itime, transposed into p1 */
+    const vfloat4 tail0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime));
+    const vfloat4 tail1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+    const vfloat4 tail2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+    const vfloat4 tail3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+    const vfloat4 tail4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime));
+    const vfloat4 tail5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime));
+    const vfloat4 tail6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime));
+    const vfloat4 tail7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime));
+    transpose(tail0,tail1,tail2,tail3,tail4,tail5,tail6,tail7,p1.x,p1.y,p1.z,p1.w);
+  }
+
+  template<>
+  __forceinline void LineMi<8>::gather(Vec4vf8& p0, Vec4vf8& p1,
+                                       const LineSegments* geom, float time) const
+  {
+    /* timeSegment returns the time step index and writes the weight later passed to lerp */
+    float weight;
+    const int step = geom->timeSegment(time, weight);
+
+    /* vertices at time steps step and step+1, blended by weight */
+    Vec4vf8 lo0,lo1;
+    gatheri(lo0,lo1,geom,step);
+    Vec4vf8 hi0,hi1;
+    gatheri(hi0,hi1,geom,step+1);
+    p0 = lerp(lo0,hi0,vfloat8(weight));
+    p1 = lerp(lo1,hi1,vfloat8(weight));
+  }
+  
+  template<>
+  __forceinline void LineMi<8>::gather(Vec4vf8& p0, Vec4vf8& p1,
+                                       Vec4vf8& pL, Vec4vf8& pR,
+                                       const LineSegments* geom) const
+  {
+    const vfloat4 head0 = vfloat4::loadu(geom->vertexPtr(v0[0]));
+    const vfloat4 head1 = vfloat4::loadu(geom->vertexPtr(v0[1]));
+    const vfloat4 head2 = vfloat4::loadu(geom->vertexPtr(v0[2]));
+    const vfloat4 head3 = vfloat4::loadu(geom->vertexPtr(v0[3]));
+    const vfloat4 head4 = vfloat4::loadu(geom->vertexPtr(v0[4]));
+    const vfloat4 head5 = vfloat4::loadu(geom->vertexPtr(v0[5]));
+    const vfloat4 head6 = vfloat4::loadu(geom->vertexPtr(v0[6]));
+    const vfloat4 head7 = vfloat4::loadu(geom->vertexPtr(v0[7]));
+    transpose(head0,head1,head2,head3,head4,head5,head6,head7,p0.x,p0.y,p0.z,p0.w);
+
+    const vfloat4 tail0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1));
+    const vfloat4 tail1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1));
+    const vfloat4 tail2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1));
+    const vfloat4 tail3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1));
+    const vfloat4 tail4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1));
+    const vfloat4 tail5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1));
+    const vfloat4 tail6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1));
+    const vfloat4 tail7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1));
+    transpose(tail0,tail1,tail2,tail3,tail4,tail5,tail6,tail7,p1.x,p1.y,p1.z,p1.w);
+
+    /* vertex v0-1 where the slot's leftExists bit is set, inf otherwise */
+    const vfloat4 prev0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1)) : vfloat4(inf);
+    const vfloat4 prev1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1)) : vfloat4(inf);
+    const vfloat4 prev2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1)) : vfloat4(inf);
+    const vfloat4 prev3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1)) : vfloat4(inf);
+    const vfloat4 prev4 = (leftExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]-1)) : vfloat4(inf);
+    const vfloat4 prev5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1)) : vfloat4(inf);
+    const vfloat4 prev6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1)) : vfloat4(inf);
+    const vfloat4 prev7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1)) : vfloat4(inf);
+    transpose(prev0,prev1,prev2,prev3,prev4,prev5,prev6,prev7,pL.x,pL.y,pL.z,pL.w);
+
+    /* vertex v0+2 where the slot's rightExists bit is set, inf otherwise */
+    const vfloat4 next0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2)) : vfloat4(inf);
+    const vfloat4 next1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2)) : vfloat4(inf);
+    const vfloat4 next2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2)) : vfloat4(inf);
+    const vfloat4 next3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2)) : vfloat4(inf);
+    const vfloat4 next4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2)) : vfloat4(inf);
+    const vfloat4 next5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2)) : vfloat4(inf);
+    const vfloat4 next6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2)) : vfloat4(inf);
+    const vfloat4 next7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2)) : vfloat4(inf);
+    transpose(next0,next1,next2,next3,next4,next5,next6,next7,pR.x,pR.y,pR.z,pR.w);
+  }
+  
+  template<>
+  __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, Vec4vf8& p1,
+                                        Vec4vf8& pL, Vec4vf8& pR,
+                                        const LineSegments* geom, const int itime) const
+  {
+    /* all loads below read the vertices of time step itime */
+    const vfloat4 head0 = vfloat4::loadu(geom->vertexPtr(v0[0],itime));
+    const vfloat4 head1 = vfloat4::loadu(geom->vertexPtr(v0[1],itime));
+    const vfloat4 head2 = vfloat4::loadu(geom->vertexPtr(v0[2],itime));
+    const vfloat4 head3 = vfloat4::loadu(geom->vertexPtr(v0[3],itime));
+    const vfloat4 head4 = vfloat4::loadu(geom->vertexPtr(v0[4],itime));
+    const vfloat4 head5 = vfloat4::loadu(geom->vertexPtr(v0[5],itime));
+    const vfloat4 head6 = vfloat4::loadu(geom->vertexPtr(v0[6],itime));
+    const vfloat4 head7 = vfloat4::loadu(geom->vertexPtr(v0[7],itime));
+    transpose(head0,head1,head2,head3,head4,head5,head6,head7,p0.x,p0.y,p0.z,p0.w);
+
+    const vfloat4 tail0 = vfloat4::loadu(geom->vertexPtr(v0[0]+1,itime));
+    const vfloat4 tail1 = vfloat4::loadu(geom->vertexPtr(v0[1]+1,itime));
+    const vfloat4 tail2 = vfloat4::loadu(geom->vertexPtr(v0[2]+1,itime));
+    const vfloat4 tail3 = vfloat4::loadu(geom->vertexPtr(v0[3]+1,itime));
+    const vfloat4 tail4 = vfloat4::loadu(geom->vertexPtr(v0[4]+1,itime));
+    const vfloat4 tail5 = vfloat4::loadu(geom->vertexPtr(v0[5]+1,itime));
+    const vfloat4 tail6 = vfloat4::loadu(geom->vertexPtr(v0[6]+1,itime));
+    const vfloat4 tail7 = vfloat4::loadu(geom->vertexPtr(v0[7]+1,itime));
+    transpose(tail0,tail1,tail2,tail3,tail4,tail5,tail6,tail7,p1.x,p1.y,p1.z,p1.w);
+
+    /* vertex v0-1 where the slot's leftExists bit is set, inf otherwise */
+    const vfloat4 prev0 = (leftExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]-1,itime)) : vfloat4(inf);
+    const vfloat4 prev1 = (leftExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]-1,itime)) : vfloat4(inf);
+    const vfloat4 prev2 = (leftExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]-1,itime)) : vfloat4(inf);
+    const vfloat4 prev3 = (leftExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]-1,itime)) : vfloat4(inf);
+    const vfloat4 prev4 = (leftExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]-1,itime)) : vfloat4(inf);
+    const vfloat4 prev5 = (leftExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]-1,itime)) : vfloat4(inf);
+    const vfloat4 prev6 = (leftExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]-1,itime)) : vfloat4(inf);
+    const vfloat4 prev7 = (leftExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]-1,itime)) : vfloat4(inf);
+    transpose(prev0,prev1,prev2,prev3,prev4,prev5,prev6,prev7,pL.x,pL.y,pL.z,pL.w);
+
+    /* vertex v0+2 where the slot's rightExists bit is set, inf otherwise */
+    const vfloat4 next0 = (rightExists & (1<<0)) ? vfloat4::loadu(geom->vertexPtr(v0[0]+2,itime)) : vfloat4(inf);
+    const vfloat4 next1 = (rightExists & (1<<1)) ? vfloat4::loadu(geom->vertexPtr(v0[1]+2,itime)) : vfloat4(inf);
+    const vfloat4 next2 = (rightExists & (1<<2)) ? vfloat4::loadu(geom->vertexPtr(v0[2]+2,itime)) : vfloat4(inf);
+    const vfloat4 next3 = (rightExists & (1<<3)) ? vfloat4::loadu(geom->vertexPtr(v0[3]+2,itime)) : vfloat4(inf);
+    const vfloat4 next4 = (rightExists & (1<<4)) ? vfloat4::loadu(geom->vertexPtr(v0[4]+2,itime)) : vfloat4(inf);
+    const vfloat4 next5 = (rightExists & (1<<5)) ? vfloat4::loadu(geom->vertexPtr(v0[5]+2,itime)) : vfloat4(inf);
+    const vfloat4 next6 = (rightExists & (1<<6)) ? vfloat4::loadu(geom->vertexPtr(v0[6]+2,itime)) : vfloat4(inf);
+    const vfloat4 next7 = (rightExists & (1<<7)) ? vfloat4::loadu(geom->vertexPtr(v0[7]+2,itime)) : vfloat4(inf);
+    transpose(next0,next1,next2,next3,next4,next5,next6,next7,pR.x,pR.y,pR.z,pR.w);
+  }
+  
+  template<>
+  __forceinline void LineMi<8>::gather(Vec4vf8& p0, Vec4vf8& p1,
+                                       Vec4vf8& pL, Vec4vf8& pR,
+                                       const LineSegments* geom, float time) const
+  {
+    /* timeSegment returns the time step index and writes the weight later passed to lerp */
+    float weight;
+    const int step = geom->timeSegment(time, weight);
+
+    /* NOTE(review): slots without a neighbour hold inf in both loL/hiL (or loR/hiR);
+       confirm that lerp keeps them inf when weight is exactly 0 or 1 */
+    Vec4vf8 lo0,lo1,loL,loR;
+    gatheri(lo0,lo1,loL,loR,geom,step);
+    Vec4vf8 hi0,hi1,hiL,hiR;
+    gatheri(hi0,hi1,hiL,hiR,geom,step+1);
+    p0 = lerp(lo0,hi0,vfloat8(weight));
+    p1 = lerp(lo1,hi1,vfloat8(weight));
+    pL = lerp(loL,hiL,vfloat8(weight));
+    pR = lerp(loR,hiR,vfloat8(weight));
+  }
+
+  template<>
+  __forceinline void LineMi<8>::gather(Vec4vf8& p0, Vec4vf8& p1,
+                                       vbool8& cL, vbool8& cR,
+                                       const LineSegments* geom) const
+  {
+    /* vertices come from the plain gather; cL/cR are the negated neighbour masks */
+    gather(p0,p1,geom);
+    const vbool8 hasLeft(leftExists);
+    const vbool8 hasRight(rightExists);
+    cL = !hasLeft; cR = !hasRight;
+  }
+  
+  template<>
+  __forceinline void LineMi<8>::gatheri(Vec4vf8& p0, Vec4vf8& p1,
+                                        vbool8& cL, vbool8& cR,
+                                        const LineSegments* geom, const int itime) const
+  {
+    /* vertices at time step itime come from the plain gatheri */
+    gatheri(p0,p1,geom,itime);
+    /* cL/cR are the negated neighbour masks */
+    const vbool8 hasLeft(leftExists);
+    const vbool8 hasRight(rightExists);
+    cL = !hasLeft; cR = !hasRight;
+  }
+  
+  template<>
+  __forceinline void LineMi<8>::gather(Vec4vf8& p0, Vec4vf8& p1,
+                                       vbool8& cL, vbool8& cR,
+                                       const LineSegments* geom, float time) const
+  {
+    /* timeSegment returns the time step index and writes the weight later passed to lerp */
+    float weight;
+    const int step = geom->timeSegment(time, weight);
+    /* vertices at time steps step and step+1, blended by weight */
+    Vec4vf8 lo0,lo1;
+    gatheri(lo0,lo1,geom,step);
+    Vec4vf8 hi0,hi1;
+    gatheri(hi0,hi1,geom,step+1);
+    p0 = lerp(lo0,hi0,vfloat8(weight));
+    p1 = lerp(lo1,hi1,vfloat8(weight));
+    /* cL/cR are the negated neighbour masks; they do not depend on time */
+    const vbool8 hasLeft(leftExists);
+    const vbool8 hasRight(rightExists);
+    cL = !hasLeft; cR = !hasRight;
+  }
+  
+#endif
+  
+  template<int M>
+  typename LineMi<M>::Type LineMi<M>::type; // out-of-class definition of the static Type member
+  /* short names for the 4-wide and 8-wide blocks */
+  using Line4i = LineMi<4>;
+  using Line8i = LineMi<8>;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h
new file mode 100644
index 0000000000..a431796a88
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/linei_intersector.h
@@ -0,0 +1,124 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "linei.h"
+#include "line_intersector.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, int Mx, bool filter>
+    struct FlatLinearCurveMiIntersector1
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+      /*! intersect query: gathers the end points of the block and forwards them to FlatLinearCurveIntersector1<Mx> */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const vbool<Mx> active = line.template valid<Mx>();
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0,p1; line.gather(p0,p1,segments);
+        FlatLinearCurveIntersector1<Mx>::intersect(active,ray,context,segments,pre,p0,p1,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      /*! occlusion query: same gather, but run with the Occluded1EpilogM epilog; returns what the curve intersector returns */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const vbool<Mx> active = line.template valid<Mx>();
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0,p1; line.gather(p0,p1,segments);
+        return FlatLinearCurveIntersector1<Mx>::intersect(active,ray,context,segments,pre,p0,p1,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      /*! point query: delegated unchanged to the generic per-primitive helper */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, int Mx, bool filter>
+    struct FlatLinearCurveMiMBIntersector1
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+      /*! intersect query: end points are gathered at ray.time() before being handed to FlatLinearCurveIntersector1<Mx> */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const vbool<Mx> active = line.template valid<Mx>();
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0,p1; line.gather(p0,p1,segments,ray.time());
+        FlatLinearCurveIntersector1<Mx>::intersect(active,ray,context,segments,pre,p0,p1,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      /*! occlusion query: same time-dependent gather, run with the Occluded1EpilogM epilog */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const vbool<Mx> active = line.template valid<Mx>();
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0,p1; line.gather(p0,p1,segments,ray.time());
+        return FlatLinearCurveIntersector1<Mx>::intersect(active,ray,context,segments,pre,p0,p1,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      /*! point query: delegated unchanged to the generic per-primitive helper */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct FlatLinearCurveMiIntersectorK
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+      /*! intersect query for ray k of the packet: gathers the end points and forwards them to FlatLinearCurveIntersectorK<Mx,K> */
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const vbool<Mx> active = line.template valid<Mx>();
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0,p1; line.gather(p0,p1,segments);
+        FlatLinearCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+      /*! occlusion query for ray k: same gather, run with the Occluded1KEpilogM epilog */
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const vbool<Mx> active = line.template valid<Mx>();
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0,p1; line.gather(p0,p1,segments);
+        return FlatLinearCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct FlatLinearCurveMiMBIntersectorK
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+      /*! intersect query for ray k of the packet: end points are gathered at that ray's time, ray.time()[k] */
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const vbool<Mx> active = line.template valid<Mx>();
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0,p1; line.gather(p0,p1,segments,ray.time()[k]);
+        FlatLinearCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+      /*! occlusion query for ray k: same time-dependent gather, run with the Occluded1KEpilogM epilog */
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const vbool<Mx> active = line.template valid<Mx>();
+        const LineSegments* segments = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> p0,p1; line.gather(p0,p1,segments,ray.time()[k]);
+        return FlatLinearCurveIntersectorK<Mx,K>::intersect(active,ray,k,context,segments,pre,p0,p1,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/object.h b/thirdparty/embree-aarch64/kernels/geometry/object.h
new file mode 100644
index 0000000000..f26391de52
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/object.h
@@ -0,0 +1,84 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  struct Object
+  {
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* an Object may cover several time segments */
+    static const bool singleTimeSegment = false;
+
+    /* every block stores exactly one primitive ... */
+    static __forceinline size_t max_size() { return 1; }
+
+    /* ... so N primitives need N blocks */
+    static __forceinline size_t blocks(size_t N) { return N; }
+
+  public:
+
+    /*! stores the geometry ID and primitive ID this object refers to */
+    Object (unsigned geomID, unsigned primID)
+    : _geomID(geomID), _primID(primID) {}
+
+    __forceinline unsigned geomID() const {
+      return _geomID;  // ID passed to the constructor
+    }
+
+    __forceinline unsigned primID() const {
+      return _primID;  // ID passed to the constructor
+    }
+
+    /*! re-initializes this object from prims[i] and advances i by one */
+    __forceinline void fill(const PrimRef* prims, size_t& i, size_t end, Scene* scene)
+    {
+      const PrimRef& ref = prims[i++];
+      new (this) Object(ref.geomID(), ref.primID());
+    }
+
+    /*! like fill(); returns the linear bounds the referenced AccelSet reports for time step itime */
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& i, size_t end, Scene* scene, size_t itime)
+    {
+      const PrimRef& ref = prims[i++];
+      const unsigned gid = ref.geomID();
+      const unsigned pid = ref.primID();
+      new (this) Object(gid, pid);
+      AccelSet* const set = (AccelSet*) scene->get(gid);
+      return set->linearBounds(pid,itime);
+    }
+
+    /*! like fill(); returns the linear bounds the referenced AccelSet reports over time_range */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& i, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      const PrimRefMB& ref = prims[i++];
+      const unsigned gid = ref.geomID();
+      const unsigned pid = ref.primID();
+      new (this) Object(gid, pid);
+      AccelSet* const set = (AccelSet*) scene->get(gid);
+      return set->linearBounds(pid,time_range);
+    }
+
+    /* returns the bounds the given AccelSet currently reports for this primitive */
+    __forceinline BBox3fa update(AccelSet* mesh) {
+      return mesh->bounds(_primID);
+    }
+
+  private:
+    unsigned int _geomID;  //!< geometry ID
+    unsigned int _primID;  //!< primitive ID
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h
new file mode 100644
index 0000000000..97882e0e59
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/object_intersector.h
@@ -0,0 +1,127 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "object.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<bool mblur>
+    struct ObjectIntersector1
+    {
+      typedef Object Primitive;
+
+      static const bool validIntersectorK = false;
+
+      struct Precalculations {
+        __forceinline Precalculations() {}
+        __forceinline Precalculations (const Ray& ray, const void *ptr) {}
+      };
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& prim)
+      {
+        AccelSet* const set = (AccelSet*) context->scene->get(prim.geomID());
+
+        /* skip the geometry when ray mask and geometry mask share no bit */
+#if defined(EMBREE_RAY_MASK)
+        if ((ray.mask & set->mask) == 0) return;
+#endif
+
+        /* the AccelSet performs the actual intersection */
+        set->intersect(ray,prim.geomID(),prim.primID(),context,reportIntersection1);
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& prim)
+      {
+        AccelSet* const set = (AccelSet*) context->scene->get(prim.geomID());
+        /* skip the geometry when ray mask and geometry mask share no bit */
+#if defined(EMBREE_RAY_MASK)
+        if ((ray.mask & set->mask) == 0) return false;
+#endif
+
+        set->occluded(ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
+        /* a negative tfar after the call is reported as "occluded" */
+        return ray.tfar < 0.0f;
+      }
+
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim)
+      {
+        AccelSet* const set = (AccelSet*)context->scene->get(prim.geomID());
+        context->primID = prim.primID();
+        context->geomID = prim.geomID();
+        return set->pointQuery(query, context);
+      }
+
+      template<int K>
+      static __forceinline void intersectK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      {
+        assert(false);  // no packet path here (validIntersectorK is false)
+      }
+
+      template<int K>
+      static __forceinline vbool<K> occludedK(const vbool<K>& valid, /* PrecalculationsK& pre, */ RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, size_t& lazy_node)
+      {
+        assert(false);  // no packet path here (validIntersectorK is false)
+        return valid;
+      }
+    };
+
+    template<int K, bool mblur>
+    struct ObjectIntersectorK
+    {
+      typedef Object Primitive;
+
+      struct Precalculations {
+        __forceinline Precalculations (const vbool<K>& valid, const RayK<K>& ray) {}
+      };
+
+      static __forceinline void intersect(const vbool<K>& valid_i, const Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& prim)
+      {
+        AccelSet* const set = (AccelSet*) context->scene->get(prim.geomID());
+        vbool<K> active = valid_i;
+
+        /* drop rays whose mask shares no bit with the geometry mask */
+#if defined(EMBREE_RAY_MASK)
+        active &= (ray.mask & set->mask) != 0;
+        if (none(active)) return;
+#endif
+        set->intersect(active,ray,prim.geomID(),prim.primID(),context,&reportIntersection1);
+      }
+
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, const Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& prim)
+      {
+        AccelSet* const set = (AccelSet*) context->scene->get(prim.geomID());
+        vbool<K> active = valid_i;
+
+        /* drop rays whose mask shares no bit with the geometry mask */
+#if defined(EMBREE_RAY_MASK)
+        active &= (ray.mask & set->mask) != 0;
+        if (none(active)) return false;
+#endif
+        set->occluded(active,ray,prim.geomID(),prim.primID(),context,&reportOcclusion1);
+        return ray.tfar < 0.0f;  // lanes with negative tfar are reported as occluded
+      }
+
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        intersect(vbool<K>(1<<int(k)),pre,ray,context,prim);  // packet version with a mask built from 1<<k
+      }
+
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& prim) {
+        occluded(vbool<K>(1<<int(k)),pre,ray,context,prim);  // packet result ignored; lane k is re-tested below
+        return ray.tfar[k] < 0.0f;
+      }
+    };
+
+    typedef ObjectIntersectorK<4,false>  ObjectIntersector4;
+    typedef ObjectIntersectorK<8,false>  ObjectIntersector8;
+    typedef ObjectIntersectorK<16,false> ObjectIntersector16;
+
+    typedef ObjectIntersectorK<4,true>  ObjectIntersector4MB;
+    typedef ObjectIntersectorK<8,true>  ObjectIntersector8MB;
+    typedef ObjectIntersectorK<16,true> ObjectIntersector16MB;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/plane.h b/thirdparty/embree-aarch64/kernels/geometry/plane.h
new file mode 100644
index 0000000000..ebe45db558
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/plane.h
@@ -0,0 +1,57 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    struct HalfPlane
+    {
+      const Vec3fa P;  //!< plane origin
+      const Vec3fa N;  //!< plane normal
+
+      __forceinline HalfPlane(const Vec3fa& P, const Vec3fa& N)
+        : P(P), N(N) {}
+
+      __forceinline BBox1f intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const
+      {
+        const Vec3fa rel = Vec3fa(ray_org) - P;   // ray origin relative to the plane origin
+        const Vec3fa dir = Vec3fa(ray_dir);
+        const float relN = dot(rel,N);
+        const float dirN = dot(dir,N);
+        const bool tiny = abs(dirN) < min_rcp_input;   // |dir.N| too small for rcp(): interval left unbounded
+        const float t = -relN*rcp(dirN);               // ray parameter where (org + t*dir - P).N == 0
+        const bool openBelow = tiny || dirN < 0.0f;
+        const bool openAbove = tiny || dirN > 0.0f;
+        return BBox1f(select(openBelow, float(neg_inf), t), select(openAbove, float(pos_inf), t));
+      }
+    };
+
+    template<int M>
+      struct HalfPlaneN
+      {
+        const Vec3vf<M> P;  //!< plane origin
+        const Vec3vf<M> N;  //!< plane normal
+
+        __forceinline HalfPlaneN(const Vec3vf<M>& P, const Vec3vf<M>& N)
+          : P(P), N(N) {}
+
+        __forceinline BBox<vfloat<M>> intersect(const Vec3fa& ray_org, const Vec3fa& ray_dir) const
+        {
+          const Vec3vf<M> rel = Vec3vf<M>((Vec3fa)ray_org) - P;   // ray origin relative to each plane origin
+          const Vec3vf<M> dir = Vec3vf<M>((Vec3fa)ray_dir);
+          const vfloat<M> relN = dot(rel,N);
+          const vfloat<M> dirN = dot(dir,N);
+          const vbool<M> tiny = abs(dirN) < min_rcp_input;        // lanes where |dir.N| is too small for rcp()
+          const vfloat<M> t = -relN*rcp(dirN);
+          const vbool<M> openBelow = tiny | (dirN < 0.0f);
+          const vbool<M> openAbove = tiny | (dirN > 0.0f);
+          return BBox<vfloat<M>>(select(openBelow, vfloat<M>(neg_inf), t), select(openAbove, vfloat<M>(pos_inf), t));
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/pointi.h b/thirdparty/embree-aarch64/kernels/geometry/pointi.h
new file mode 100644
index 0000000000..4ba298e86b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/pointi.h
@@ -0,0 +1,417 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  template<int M>
+  struct PointMi
+  {
+    /* Virtual interface to query information about the point primitive type */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+   public:
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of points stored in one block */
+    static __forceinline size_t max_size()
+    {
+      return M;
+    }
+
+    /* Returns required number of primitive blocks for N points (N/M rounded up) */
+    static __forceinline size_t blocks(size_t N)
+    {
+      return (N + max_size() - 1) / max_size();
+    }
+
+    /* Returns required number of bytes for N points */
+    static __forceinline size_t bytes(size_t N)
+    {
+      return blocks(N) * sizeof(PointMi);
+    }
+
+   public:
+    /* Default constructor, leaves all members uninitialized */
+    __forceinline PointMi() {}
+
+    /* Construction from IDs; only geomIDs[0] is stored, all lanes must carry the same geometry ID */
+    __forceinline PointMi(const vuint<M>& geomIDs, const vuint<M>& primIDs, Geometry::GType gtype, uint32_t numPrimitives)
+        : gtype((unsigned char)gtype),
+          numPrimitives(numPrimitives),
+          sharedGeomID(geomIDs[0]),
+          primIDs(primIDs)
+    {
+      assert(all(vuint<M>(geomID()) == geomIDs));
+    }
+
+    /* Returns a mask that tells which points are valid (the first numPrimitives lanes) */
+    __forceinline vbool<M> valid() const {
+      return vint<M>(step) < vint<M>(numPrimitives);
+    }
+
+    /* Same mask, computed for a vector width of Mx lanes */
+    template<int Mx> __forceinline vbool<Mx> valid() const {
+      return vint<Mx>(step) < vint<Mx>(numPrimitives);
+    }
+
+    /* Returns if the specified point is valid */
+    __forceinline bool valid(const size_t i) const
+    {
+      assert(i < M);
+      return i < numPrimitives;
+    }
+
+    /* Returns the number of stored points */
+    __forceinline size_t size() const {
+      return numPrimitives;
+    }
+
+    __forceinline unsigned int geomID(unsigned int i = 0) const {
+      return sharedGeomID;  // the index is ignored: one geometry ID is shared by the whole block
+    }
+
+    __forceinline vuint<M>& primID() {
+      return primIDs;
+    }
+    __forceinline const vuint<M>& primID() const {
+      return primIDs;
+    }
+    __forceinline unsigned int primID(const size_t i) const {
+      assert(i < M);
+      return primIDs[i];
+    }
+
+    /* gather the point vertices (and optionally normals); specialized per M below */
+    __forceinline void gather(Vec4vf<M>& p0, const Points* geom) const;
+    __forceinline void gather(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom) const;
+
+    __forceinline void gatheri(Vec4vf<M>& p0, const Points* geom, const int itime) const;
+    __forceinline void gatheri(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom, const int itime) const;
+
+    __forceinline void gather(Vec4vf<M>& p0, const Points* geom, float time) const;
+    __forceinline void gather(Vec4vf<M>& p0, Vec3vf<M>& n0, const Points* geom, float time) const;
+
+    /* Calculate the bounds of the valid points at time step itime */
+    __forceinline const BBox3fa bounds(const Scene* scene, size_t itime = 0) const
+    {
+      BBox3fa bounds = empty;
+      for (size_t i = 0; i < M && valid(i); i++) {
+        const Points* geom = scene->get<Points>(geomID(i));
+        bounds.extend(geom->bounds(primID(i),itime));
+      }
+      return bounds;
+    }
+
+    /* Calculate the linear bounds of the primitive from time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(const Scene* scene, size_t itime) {
+      return LBBox3fa(bounds(scene, itime + 0), bounds(scene, itime + 1));
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene* const scene, size_t itime, size_t numTimeSteps)
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i = 0; i < M && valid(i); i++) {
+        const Points* geom = scene->get<Points>(geomID(i));
+        allBounds.extend(geom->linearBounds(primID(i), itime, numTimeSteps));
+      }
+      return allBounds;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene* const scene, const BBox1f time_range)
+    {
+      LBBox3fa allBounds = empty;
+      for (size_t i = 0; i < M && valid(i); i++) {
+        const Points* geom = scene->get<Points>(geomID((unsigned int)i));
+        allBounds.extend(geom->linearBounds(primID(i), time_range));
+      }
+      return allBounds;
+    }
+
+    /* Fill block with up to M points from prims[begin..end), advancing begin past the consumed entries */
+    template<typename PrimRefT>
+    __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      Geometry::GType gty = scene->get(prims[begin].geomID())->getType();
+      vuint<M> geomID, primID;
+      /* lanes past `end` repeat the previous lane's IDs, so every lane holds a loadable ID */
+      const PrimRefT* prim = &prims[begin];
+
+      int numPrimitives = 0;
+      for (size_t i = 0; i < M; i++) {
+        if (begin < end) {
+          geomID[i] = prim->geomID();
+          primID[i] = prim->primID();
+          begin++;
+          numPrimitives++;
+        } else {
+          assert(i);
+          if (i > 0) {
+            geomID[i] = geomID[i - 1];
+            primID[i] = primID[i - 1];
+          }
+        }
+        if (begin < end)
+          prim = &prims[begin];  // FIXME: remove this line
+      }
+      new (this) PointMi(geomID, primID, gty, numPrimitives);  // FIXME: use non temporal store
+    }
+
+    template<typename BVH, typename Allocator>
+    __forceinline static typename BVH::NodeRef createLeaf(BVH* bvh,
+                                                          const PrimRef* prims,
+                                                          const range<size_t>& set,
+                                                          const Allocator& alloc)
+    {
+      size_t start    = set.begin();
+      size_t items    = PointMi::blocks(set.size());
+      size_t numbytes = PointMi::bytes(set.size());
+      PointMi* accel  = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float));
+      for (size_t i = 0; i < items; i++) {
+        accel[i].fill(prims, start, set.end(), bvh->scene);
+      }
+      return bvh->encodeLeaf((char*)accel, items);
+    };
+
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, itime);
+    }
+
+    __forceinline LBBox3fa fillMB(
+        const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, time_range);
+    }
+
+    template<typename BVH, typename SetMB, typename Allocator>
+    __forceinline static typename BVH::NodeRecordMB4D createLeafMB(BVH* bvh, const SetMB& prims, const Allocator& alloc)
+    {
+      size_t start                     = prims.object_range.begin();
+      size_t end                       = prims.object_range.end();
+      size_t items                     = PointMi::blocks(prims.object_range.size());
+      size_t numbytes                  = PointMi::bytes(prims.object_range.size());
+      PointMi* accel                   = (PointMi*)alloc.malloc1(numbytes, M * sizeof(float));
+      const typename BVH::NodeRef node = bvh->encodeLeaf((char*)accel, items);
+
+      LBBox3fa bounds = empty;
+      for (size_t i = 0; i < items; i++)
+        bounds.extend(accel[i].fillMB(prims.prims->data(), start, end, bvh->scene, prims.time_range));
+
+      return typename BVH::NodeRecordMB4D(node, bounds, prims.time_range);
+    };
+
+    /*! output operator; prints only members PointMi has (it stores no vertex data such as v0) */
+    friend __forceinline embree_ostream operator<<(embree_ostream cout, const PointMi& point)
+    {
+      return cout << "Point" << M << "i {" << point.geomID() << ", " << point.primID() << "}";
+    }
+
+   public:
+    unsigned char gtype;          // geometry type, narrowed from Geometry::GType in the constructor
+    unsigned char numPrimitives;  // number of valid points in this block
+    unsigned int sharedGeomID;    // geometry ID shared by all points of the block
+
+   private:
+    vuint<M> primIDs;  // primitive ID
+  };
+
+  template<>
+  __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom) const
+  {
+    const vfloat4 vtx0 = vfloat4::loadu(geom->vertexPtr(primID(0)));
+    const vfloat4 vtx1 = vfloat4::loadu(geom->vertexPtr(primID(1)));
+    const vfloat4 vtx2 = vfloat4::loadu(geom->vertexPtr(primID(2)));
+    const vfloat4 vtx3 = vfloat4::loadu(geom->vertexPtr(primID(3)));
+    transpose(vtx0, vtx1, vtx2, vtx3, p0.x, p0.y, p0.z, p0.w);  // one vector per vertex -> one vector per component
+  }
+
+  template<>
+  __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom) const
+  {
+    const vfloat4 vtx0 = vfloat4::loadu(geom->vertexPtr(primID(0)));
+    const vfloat4 vtx1 = vfloat4::loadu(geom->vertexPtr(primID(1)));
+    const vfloat4 vtx2 = vfloat4::loadu(geom->vertexPtr(primID(2)));
+    const vfloat4 vtx3 = vfloat4::loadu(geom->vertexPtr(primID(3)));
+    transpose(vtx0, vtx1, vtx2, vtx3, p0.x, p0.y, p0.z, p0.w);  // vertices: per-point -> per-component
+    const vfloat4 nrm0 = vfloat4(geom->normal(primID(0)));
+    const vfloat4 nrm1 = vfloat4(geom->normal(primID(1)));
+    const vfloat4 nrm2 = vfloat4(geom->normal(primID(2)));
+    const vfloat4 nrm3 = vfloat4(geom->normal(primID(3)));
+    transpose(nrm0, nrm1, nrm2, nrm3, n0.x, n0.y, n0.z);  // normals: only three components are kept
+  }
+
+  template<>
+  __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, const Points* geom, const int itime) const
+  {
+    const vfloat4 vtx0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));
+    const vfloat4 vtx1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+    const vfloat4 vtx2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+    const vfloat4 vtx3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+    transpose(vtx0, vtx1, vtx2, vtx3, p0.x, p0.y, p0.z, p0.w);  // one vector per vertex -> one vector per component
+  }
+
+  template<>
+  __forceinline void PointMi<4>::gatheri(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, const int itime) const
+  {
+    const vfloat4 vtx0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));
+    const vfloat4 vtx1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+    const vfloat4 vtx2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+    const vfloat4 vtx3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+    transpose(vtx0, vtx1, vtx2, vtx3, p0.x, p0.y, p0.z, p0.w);  // vertices: per-point -> per-component
+    const vfloat4 nrm0 = vfloat4(geom->normal(primID(0), itime));
+    const vfloat4 nrm1 = vfloat4(geom->normal(primID(1), itime));
+    const vfloat4 nrm2 = vfloat4(geom->normal(primID(2), itime));
+    const vfloat4 nrm3 = vfloat4(geom->normal(primID(3), itime));
+    transpose(nrm0, nrm1, nrm2, nrm3, n0.x, n0.y, n0.z);  // normals: only three components are kept
+  }
+
+  template<>
+  __forceinline void PointMi<4>::gather(Vec4vf4& p0, const Points* geom, float time) const
+  {
+    float blend;
+    const int segment = geom->timeSegment(time, blend);  // segment index; blend weight written to `blend`
+    Vec4vf4 atBegin, atEnd;
+    gatheri(atBegin, geom, segment);
+    gatheri(atEnd, geom, segment + 1);
+    p0 = lerp(atBegin, atEnd, vfloat4(blend));  // blend the two bounding time steps
+  }
+
+  template<>
+  __forceinline void PointMi<4>::gather(Vec4vf4& p0, Vec3vf4& n0, const Points* geom, float time) const
+  {
+    float blend;
+    const int segment = geom->timeSegment(time, blend);  // segment index; blend weight written to `blend`
+    const vfloat4 t = vfloat4(blend);
+    /* vertices and normals at the two time steps bounding the segment */
+    Vec4vf4 posBegin, posEnd;
+    Vec3vf4 nrmBegin, nrmEnd;
+    gatheri(posBegin, nrmBegin, geom, segment);
+    gatheri(posEnd, nrmEnd, geom, segment + 1);
+    p0 = lerp(posBegin, posEnd, t); n0 = lerp(nrmBegin, nrmEnd, t);
+  }
+
+#if defined(__AVX__)
+
+  template<>
+  __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom) const
+  {
+    const vfloat4 vtx0 = vfloat4::loadu(geom->vertexPtr(primID(0)));
+    const vfloat4 vtx1 = vfloat4::loadu(geom->vertexPtr(primID(1)));
+    const vfloat4 vtx2 = vfloat4::loadu(geom->vertexPtr(primID(2)));
+    const vfloat4 vtx3 = vfloat4::loadu(geom->vertexPtr(primID(3)));
+    const vfloat4 vtx4 = vfloat4::loadu(geom->vertexPtr(primID(4)));
+    const vfloat4 vtx5 = vfloat4::loadu(geom->vertexPtr(primID(5)));
+    const vfloat4 vtx6 = vfloat4::loadu(geom->vertexPtr(primID(6)));
+    const vfloat4 vtx7 = vfloat4::loadu(geom->vertexPtr(primID(7)));
+    transpose(vtx0, vtx1, vtx2, vtx3, vtx4, vtx5, vtx6, vtx7, p0.x, p0.y, p0.z, p0.w);  // per-point -> per-component
+  }
+
+  template<>
+  __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom) const
+  {
+    const vfloat4 vtx0 = vfloat4::loadu(geom->vertexPtr(primID(0)));
+    const vfloat4 vtx1 = vfloat4::loadu(geom->vertexPtr(primID(1)));
+    const vfloat4 vtx2 = vfloat4::loadu(geom->vertexPtr(primID(2)));
+    const vfloat4 vtx3 = vfloat4::loadu(geom->vertexPtr(primID(3)));
+    const vfloat4 vtx4 = vfloat4::loadu(geom->vertexPtr(primID(4)));
+    const vfloat4 vtx5 = vfloat4::loadu(geom->vertexPtr(primID(5)));
+    const vfloat4 vtx6 = vfloat4::loadu(geom->vertexPtr(primID(6)));
+    const vfloat4 vtx7 = vfloat4::loadu(geom->vertexPtr(primID(7)));
+    transpose(vtx0, vtx1, vtx2, vtx3, vtx4, vtx5, vtx6, vtx7, p0.x, p0.y, p0.z, p0.w);  // vertices: per-point -> per-component
+    const vfloat4 nrm0 = vfloat4(geom->normal(primID(0)));
+    const vfloat4 nrm1 = vfloat4(geom->normal(primID(1)));
+    const vfloat4 nrm2 = vfloat4(geom->normal(primID(2)));
+    const vfloat4 nrm3 = vfloat4(geom->normal(primID(3)));
+    const vfloat4 nrm4 = vfloat4(geom->normal(primID(4)));
+    const vfloat4 nrm5 = vfloat4(geom->normal(primID(5)));
+    const vfloat4 nrm6 = vfloat4(geom->normal(primID(6)));
+    const vfloat4 nrm7 = vfloat4(geom->normal(primID(7)));
+    transpose(nrm0, nrm1, nrm2, nrm3, nrm4, nrm5, nrm6, nrm7, n0.x, n0.y, n0.z);  // normals: only three components are kept
+  }
+
+  template<>
+  __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, const Points* geom, const int itime) const
+  {
+    const vfloat4 vtx0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));
+    const vfloat4 vtx1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+    const vfloat4 vtx2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+    const vfloat4 vtx3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+    const vfloat4 vtx4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime));
+    const vfloat4 vtx5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime));
+    const vfloat4 vtx6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime));
+    const vfloat4 vtx7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime));
+    transpose(vtx0, vtx1, vtx2, vtx3, vtx4, vtx5, vtx6, vtx7, p0.x, p0.y, p0.z, p0.w);  // per-point -> per-component
+  }
+
+  template<>
+  __forceinline void PointMi<8>::gatheri(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, const int itime) const
+  {
+    const vfloat4 vtx0 = vfloat4::loadu(geom->vertexPtr(primID(0), itime));
+    const vfloat4 vtx1 = vfloat4::loadu(geom->vertexPtr(primID(1), itime));
+    const vfloat4 vtx2 = vfloat4::loadu(geom->vertexPtr(primID(2), itime));
+    const vfloat4 vtx3 = vfloat4::loadu(geom->vertexPtr(primID(3), itime));
+    const vfloat4 vtx4 = vfloat4::loadu(geom->vertexPtr(primID(4), itime));
+    const vfloat4 vtx5 = vfloat4::loadu(geom->vertexPtr(primID(5), itime));
+    const vfloat4 vtx6 = vfloat4::loadu(geom->vertexPtr(primID(6), itime));
+    const vfloat4 vtx7 = vfloat4::loadu(geom->vertexPtr(primID(7), itime));
+    transpose(vtx0, vtx1, vtx2, vtx3, vtx4, vtx5, vtx6, vtx7, p0.x, p0.y, p0.z, p0.w);  // vertices: per-point -> per-component
+    const vfloat4 nrm0 = vfloat4(geom->normal(primID(0), itime));
+    const vfloat4 nrm1 = vfloat4(geom->normal(primID(1), itime));
+    const vfloat4 nrm2 = vfloat4(geom->normal(primID(2), itime));
+    const vfloat4 nrm3 = vfloat4(geom->normal(primID(3), itime));
+    const vfloat4 nrm4 = vfloat4(geom->normal(primID(4), itime));
+    const vfloat4 nrm5 = vfloat4(geom->normal(primID(5), itime));
+    const vfloat4 nrm6 = vfloat4(geom->normal(primID(6), itime));
+    const vfloat4 nrm7 = vfloat4(geom->normal(primID(7), itime));
+    transpose(nrm0, nrm1, nrm2, nrm3, nrm4, nrm5, nrm6, nrm7, n0.x, n0.y, n0.z);  // normals: only three components are kept
+  }
+
+  template<>
+  __forceinline void PointMi<8>::gather(Vec4vf8& p0, const Points* geom, float time) const
+  {
+    float blend;
+    const int segment = geom->timeSegment(time, blend);  // segment index; blend weight written to `blend`
+
+    /* vertices at the two time steps bounding the segment */
+    Vec4vf8 atBegin, atEnd;
+    gatheri(atBegin, geom, segment);
+    gatheri(atEnd, geom, segment + 1);
+    p0 = lerp(atBegin, atEnd, vfloat8(blend));
+  }
+
+  template<>
+  __forceinline void PointMi<8>::gather(Vec4vf8& p0, Vec3vf8& n0, const Points* geom, float time) const
+  {
+    float blend;
+    const int segment = geom->timeSegment(time, blend);  // segment index; blend weight written to `blend`
+    const vfloat8 t = vfloat8(blend);
+    /* vertices and normals at the two time steps bounding the segment */
+    Vec4vf8 posBegin, posEnd;
+    Vec3vf8 nrmBegin, nrmEnd;
+    gatheri(posBegin, nrmBegin, geom, segment);
+    gatheri(posEnd, nrmEnd, geom, segment + 1);
+    p0 = lerp(posBegin, posEnd, t); n0 = lerp(nrmBegin, nrmEnd, t);
+  }
+#endif
+
+  template<int M>
+  typename PointMi<M>::Type PointMi<M>::type;
+
+  typedef PointMi<4> Point4i;
+  typedef PointMi<8> Point8i;
+  
+}  // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/primitive.h b/thirdparty/embree-aarch64/kernels/geometry/primitive.h
new file mode 100644
index 0000000000..41e5b2b304
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/primitive.h
@@ -0,0 +1,49 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/scene.h"
+#include "../../common/simd/simd.h"
+#include "../common/primref.h"
+#include "../common/primref_mb.h"
+
+namespace embree
+{
+  struct PrimitiveType  // abstract interface: all members are pure virtual; a block is passed as a raw char pointer
+  {
+    /*! Returns the name of this primitive type. */
+    virtual const char* name() const = 0;
+    
+    /*! Returns the number of stored active primitives in the block pointed to by This. */
+    virtual size_t sizeActive(const char* This) const = 0;
+
+    /*! Returns the number of stored active and inactive primitives in the block pointed to by This. */
+    virtual size_t sizeTotal(const char* This) const = 0;
+
+    /*! Returns the size in bytes of the block pointed to by This. */
+    virtual size_t getBytes(const char* This) const = 0;
+  };
+  
+  template<typename Primitive>
+  struct PrimitivePointQuery1  // static point-query helpers operating on one Primitive block
+  {
+    static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& prim)  // true if any per-primitive query returned true
+    {
+      bool changed = false;
+      for (size_t i = 0; i < Primitive::max_size(); i++)
+      {
+        if (!prim.valid(i)) break;  // stop at the first invalid slot; later slots are never visited
+        STAT3(point_query.trav_prims,1,1,1);
+        AccelSet* accel = (AccelSet*)context->scene->get(prim.geomID(i));  // geometry fetched by ID and cast to AccelSet without a check
+        context->geomID = prim.geomID(i);  // the current IDs are written to the context before the query call
+        context->primID = prim.primID(i);
+        changed |= accel->pointQuery(query, context);  // OR-accumulates the results; the loop keeps going after a true
+      }
+      return changed;
+    }
+    /*! Variant with the same parameters that does nothing and returns nothing. */
+    static __forceinline void pointQueryNoop(PointQuery* query, PointQueryContext* context, const Primitive& prim) { }
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp b/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp
new file mode 100644
index 0000000000..f93574c9c8
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/primitive4.cpp
@@ -0,0 +1,379 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "primitive.h"
+#include "curveNv.h"
+#include "curveNi.h"
+#include "curveNi_mb.h"
+#include "linei.h"
+#include "triangle.h"
+#include "trianglev.h"
+#include "trianglev_mb.h"
+#include "trianglei.h"
+#include "quadv.h"
+#include "quadi.h"
+#include "subdivpatch1.h"
+#include "object.h"
+#include "instance.h"
+#include "subgrid.h"
+
+namespace embree
+{
+  /********************** Curve4v **************************/
+
+  /*! Name reported for Curve4v blocks. */
+  template<>
+  const char* Curve4v::Type::name () const { return "curve4v"; }
+
+  /*! Active primitives: Line4i::size() when the first byte encodes a linear basis, the stored N otherwise. */
+  template<>
+  size_t Curve4v::Type::sizeActive(const char* This) const
+  {
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (linear) return ((Line4i*)This)->size();
+    return ((Curve4v*)This)->N;
+  }
+
+  /*! Total slots: always 4 for a linear basis, the stored N otherwise. */
+  template<>
+  size_t Curve4v::Type::sizeTotal(const char* This) const
+  {
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (linear) return 4;
+    return ((Curve4v*)This)->N;
+  }
+
+  /*! Block size in bytes: the active count fed to bytes() of the layout matching the basis. */
+  template<>
+  size_t Curve4v::Type::getBytes(const char* This) const
+  {
+    const size_t active = sizeActive(This);
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (linear) return Line4i::bytes(active);
+    return Curve4v::bytes(active);
+  }
+
+  /********************** Curve4i **************************/
+
+  /*! Name reported for Curve4i blocks. */
+  template<>
+  const char* Curve4i::Type::name () const { return "curve4i"; }
+
+  /*! Active primitives: Line4i::size() when the first byte encodes a linear basis, the stored N otherwise. */
+  template<>
+  size_t Curve4i::Type::sizeActive(const char* This) const
+  {
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (linear) return ((Line4i*)This)->size();
+    return ((Curve4i*)This)->N;
+  }
+
+  /*! Total slots: always 4 for a linear basis, the stored N otherwise. */
+  template<>
+  size_t Curve4i::Type::sizeTotal(const char* This) const
+  {
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (linear) return 4;
+    return ((Curve4i*)This)->N;
+  }
+
+  /*! Block size in bytes: the active count fed to bytes() of the layout matching the basis. */
+  template<>
+  size_t Curve4i::Type::getBytes(const char* This) const
+  {
+    const size_t active = sizeActive(This);
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (linear) return Line4i::bytes(active);
+    return Curve4i::bytes(active);
+  }
+
+  /********************** Curve4iMB **************************/
+
+  /*! Name reported for Curve4iMB blocks. */
+  template<>
+  const char* Curve4iMB::Type::name () const { return "curve4imb"; }
+
+  /*! Active primitives: Line4i::size() when the first byte encodes a linear basis, the stored N otherwise. */
+  template<>
+  size_t Curve4iMB::Type::sizeActive(const char* This) const
+  {
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (linear) return ((Line4i*)This)->size();
+    return ((Curve4iMB*)This)->N;
+  }
+
+  /*! Total slots: always 4 for a linear basis, the stored N otherwise. */
+  template<>
+  size_t Curve4iMB::Type::sizeTotal(const char* This) const
+  {
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (linear) return 4;
+    return ((Curve4iMB*)This)->N;
+  }
+
+  /*! Block size in bytes: the active count fed to bytes() of the layout matching the basis. */
+  template<>
+  size_t Curve4iMB::Type::getBytes(const char* This) const
+  {
+    const size_t active = sizeActive(This);
+    const bool linear = (*This & Geometry::GType::GTY_BASIS_MASK) == Geometry::GType::GTY_BASIS_LINEAR;
+    if (linear) return Line4i::bytes(active);
+    return Curve4iMB::bytes(active);
+  }
+
+  /********************** Line4i **************************/
+
+  template<>
+  const char* Line4i::Type::name () const {  // name string reported for this primitive type
+    return "line4i";
+  }
+
+  template<>
+  size_t Line4i::Type::sizeActive(const char* This) const {  // active count is read from the block itself
+    return ((Line4i*)This)->size();  // the C-style cast also drops the const qualifier of This
+  }
+
+  template<>
+  size_t Line4i::Type::sizeTotal(const char* This) const {  // constant result; This is not inspected
+    return 4;
+  }
+
+  template<>
+  size_t Line4i::Type::getBytes(const char* This) const {  // constant result; This is not inspected
+    return sizeof(Line4i);
+  }
+
+  /********************** Triangle4 **************************/
+
+  template<>
+  const char* Triangle4::Type::name () const {  // name string reported for this primitive type
+    return "triangle4";
+  }
+
+  template<>
+  size_t Triangle4::Type::sizeActive(const char* This) const {  // active count is read from the block itself
+    return ((Triangle4*)This)->size();  // the C-style cast also drops the const qualifier of This
+  }
+
+  template<>
+  size_t Triangle4::Type::sizeTotal(const char* This) const {  // constant result; This is not inspected
+    return 4;
+  }
+
+  template<>
+  size_t Triangle4::Type::getBytes(const char* This) const {  // constant result; This is not inspected
+    return sizeof(Triangle4);
+  }
+
+  /********************** Triangle4v **************************/
+
+  template<>
+  const char* Triangle4v::Type::name () const {  // name string reported for this primitive type
+    return "triangle4v";
+  }
+
+  template<>
+  size_t Triangle4v::Type::sizeActive(const char* This) const {  // active count is read from the block itself
+    return ((Triangle4v*)This)->size();  // the C-style cast also drops the const qualifier of This
+  }
+
+  template<>
+  size_t Triangle4v::Type::sizeTotal(const char* This) const {  // constant result; This is not inspected
+    return 4;
+  }
+
+  template<>
+  size_t Triangle4v::Type::getBytes(const char* This) const {  // constant result; This is not inspected
+    return sizeof(Triangle4v);
+  }
+
+  /********************** Triangle4i **************************/
+
+  template<>
+  const char* Triangle4i::Type::name () const {  // name string reported for this primitive type
+    return "triangle4i";
+  }
+
+  template<>
+  size_t Triangle4i::Type::sizeActive(const char* This) const {  // active count is read from the block itself
+    return ((Triangle4i*)This)->size();  // the C-style cast also drops the const qualifier of This
+  }
+
+  template<>
+  size_t Triangle4i::Type::sizeTotal(const char* This) const {  // constant result; This is not inspected
+    return 4;
+  }
+
+  template<>
+  size_t Triangle4i::Type::getBytes(const char* This) const {  // constant result; This is not inspected
+    return sizeof(Triangle4i);
+  }
+
+  /********************** Triangle4vMB **************************/
+
+  template<>
+  const char* Triangle4vMB::Type::name () const {  // name string reported for this primitive type
+    return  "triangle4vmb";
+  }
+
+  template<>
+  size_t Triangle4vMB::Type::sizeActive(const char* This) const {  // active count is read from the block itself
+    return ((Triangle4vMB*)This)->size();  // the C-style cast also drops the const qualifier of This
+  }
+
+  template<>
+  size_t Triangle4vMB::Type::sizeTotal(const char* This) const {  // constant result; This is not inspected
+    return 4;
+  }
+
+  template<>
+  size_t Triangle4vMB::Type::getBytes(const char* This) const {  // constant result; This is not inspected
+    return sizeof(Triangle4vMB);
+  }
+
+  /********************** Quad4v **************************/
+
+  template<>
+  const char* Quad4v::Type::name () const {  // name string reported for this primitive type
+    return "quad4v";
+  }
+
+  template<>
+  size_t Quad4v::Type::sizeActive(const char* This) const {  // active count is read from the block itself
+    return ((Quad4v*)This)->size();  // the C-style cast also drops the const qualifier of This
+  }
+
+  template<>
+  size_t Quad4v::Type::sizeTotal(const char* This) const {  // constant result; This is not inspected
+    return 4;
+  }
+
+  template<>
+  size_t Quad4v::Type::getBytes(const char* This) const {  // constant result; This is not inspected
+    return sizeof(Quad4v);
+  }
+
+  /********************** Quad4i **************************/
+
+  template<>
+  const char* Quad4i::Type::name () const {  // name string reported for this primitive type
+    return "quad4i";
+  }
+
+  template<>
+  size_t Quad4i::Type::sizeActive(const char* This) const {  // active count is read from the block itself
+    return ((Quad4i*)This)->size();  // the C-style cast also drops the const qualifier of This
+  }
+
+  template<>
+  size_t Quad4i::Type::sizeTotal(const char* This) const {  // constant result; This is not inspected
+    return 4;
+  }
+
+  template<>
+  size_t Quad4i::Type::getBytes(const char* This) const {  // constant result; This is not inspected
+    return sizeof(Quad4i);
+  }
+
+  /********************** SubdivPatch1 **************************/
+
+  const char* SubdivPatch1::Type::name () const {  // name string reported for this primitive type
+    return "subdivpatch1";
+  }
+
+  size_t SubdivPatch1::Type::sizeActive(const char* This) const {  // always 1; This is not inspected
+    return 1;
+  }
+
+  size_t SubdivPatch1::Type::sizeTotal(const char* This) const {  // always 1; This is not inspected
+    return 1;
+  }
+
+  size_t SubdivPatch1::Type::getBytes(const char* This) const {  // constant result; This is not inspected
+    return sizeof(SubdivPatch1);
+  }
+
+  SubdivPatch1::Type SubdivPatch1::type;  // definition of the static member 'type'
+
+  /********************** Virtual Object **************************/
+
+  const char* Object::Type::name () const {  // name string reported for this primitive type
+    return "object";
+  }
+
+  size_t Object::Type::sizeActive(const char* This) const {  // always 1; This is not inspected
+    return 1;
+  }
+
+  size_t Object::Type::sizeTotal(const char* This) const {  // always 1; This is not inspected
+    return 1;
+  }
+
+  size_t Object::Type::getBytes(const char* This) const {  // constant result; This is not inspected
+    return sizeof(Object);
+  }
+
+  Object::Type Object::type;  // definition of the static member 'type'
+
+  /********************** Instance **************************/
+
+  const char* InstancePrimitive::Type::name () const {  // name string reported for this primitive type
+    return "instance";
+  }
+
+  size_t InstancePrimitive::Type::sizeActive(const char* This) const {  // always 1; This is not inspected
+    return 1;
+  }
+
+  size_t InstancePrimitive::Type::sizeTotal(const char* This) const {  // always 1; This is not inspected
+    return 1;
+  }
+
+  size_t InstancePrimitive::Type::getBytes(const char* This) const {  // constant result; This is not inspected
+    return sizeof(InstancePrimitive);
+  }
+
+  InstancePrimitive::Type InstancePrimitive::type;  // definition of the static member 'type'
+
+  /********************** SubGrid **************************/
+
+  const char* SubGrid::Type::name () const {  // name string reported for this primitive type
+    return "subgrid";
+  }
+
+  size_t SubGrid::Type::sizeActive(const char* This) const {  // always 1; This is not inspected
+    return 1;
+  }
+
+  size_t SubGrid::Type::sizeTotal(const char* This) const {  // always 1; This is not inspected
+    return 1;
+  }
+
+  size_t SubGrid::Type::getBytes(const char* This) const {  // constant result; This is not inspected
+    return sizeof(SubGrid);
+  }
+
+  SubGrid::Type SubGrid::type;  // definition of the static member 'type'
+  
+  /********************** SubGridQBVH4 **************************/
+
+  template<>
+  const char* SubGridQBVH4::Type::name () const {  // name string reported for this primitive type (mixed case, unlike the others in this file)
+    return "SubGridQBVH4";
+  }
+
+  template<>
+  size_t SubGridQBVH4::Type::sizeActive(const char* This) const {  // always 1; This is not inspected
+    return 1;
+  }
+
+  template<>
+  size_t SubGridQBVH4::Type::sizeTotal(const char* This) const {  // always 1; This is not inspected
+    return 1;
+  }
+
+  template<>
+  size_t SubGridQBVH4::Type::getBytes(const char* This) const {  // constant result; This is not inspected
+    return sizeof(SubGridQBVH4);
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h
new file mode 100644
index 0000000000..57ff4e60e5
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector.h
@@ -0,0 +1,76 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects one ray with N quads with backface culling enabled.
+     *  Each quad v0,v1,v2,v3 is split into the two triangles v0,v1,v3
+     *  and v2,v3,v1. The sign of WW, computed from v1, v3 and the ray direction,
+     *  decides per lane which triangle is tested. Returns the hit mask; u_o, v_o, t_o are written only when a lane hits. */
+    template<int N>
+    __forceinline vbool<N> intersect_quad_backface_culling(const vbool<N>& valid0,
+                                                           const Vec3fa& ray_org,
+                                                           const Vec3fa& ray_dir,
+                                                           const float ray_tnear,
+                                                           const float ray_tfar,
+                                                           const Vec3vf<N>& quad_v0,
+                                                           const Vec3vf<N>& quad_v1,
+                                                           const Vec3vf<N>& quad_v2,
+                                                           const Vec3vf<N>& quad_v3,
+                                                           vfloat<N>& u_o,
+                                                           vfloat<N>& v_o,
+                                                           vfloat<N>& t_o)
+    {
+      /* calculate vertices relative to ray origin */
+      vbool<N> valid = valid0;
+      const Vec3vf<N> O = Vec3vf<N>(ray_org);
+      const Vec3vf<N> D = Vec3vf<N>(ray_dir);
+      const Vec3vf<N> va = quad_v0-O;
+      const Vec3vf<N> vb = quad_v1-O;
+      const Vec3vf<N> vc = quad_v2-O;
+      const Vec3vf<N> vd = quad_v3-O;
+      /* select the triangle per lane: WW <= 0 picks (v0,v1,v3), otherwise (v2,v3,v1) */
+      const Vec3vf<N> edb = vb-vd;
+      const vfloat<N> WW = dot(cross(vd,edb),D);
+      const Vec3vf<N> v0 = select(WW <= 0.0f,va,vc);
+      const Vec3vf<N> v1 = select(WW <= 0.0f,vb,vd);
+      const Vec3vf<N> v2 = select(WW <= 0.0f,vd,vb);
+
+      /* calculate edges */
+      const Vec3vf<N> e0 = v2-v0;
+      const Vec3vf<N> e1 = v0-v1;
+
+      /* perform edge tests: a lane stays valid only if both U and V are <= 0 */
+      const vfloat<N> U = dot(cross(v0,e0),D);
+      const vfloat<N> V = dot(cross(v1,e1),D);
+      valid &= max(U,V) <= 0.0f;
+      if (unlikely(none(valid))) return false;  // bool converts to vbool<N>; presumably an all-false mask
+
+      /* calculate geometry normal and denominator */
+      const Vec3vf<N> Ng = cross(e1,e0);
+      const vfloat<N> den = dot(Ng,D);
+      const vfloat<N> rcpDen = rcp(den);  // computed before the den != 0 check below; those lanes are masked out later
+
+      /* perform depth test: keep lanes with ray_tnear <= t <= ray_tfar */
+      const vfloat<N> t = rcpDen*dot(v0,Ng);
+      valid &= vfloat<N>(ray_tnear) <= t & t <= vfloat<N>(ray_tfar);  // <= binds tighter than &, so this is (a <= t) & (t <= b)
+      if (unlikely(none(valid))) return false;
+
+      /* avoid division by 0 */
+      valid &= den != vfloat<N>(zero);
+      if (unlikely(none(valid))) return false;
+
+      /* update hit information: all lanes are written, callers must apply the returned mask */
+      t_o = t;
+      u_o = U * rcpDen;
+      v_o = V * rcpDen;
+      u_o = select(WW <= 0.0f,u_o,1.0f-u_o);  // lanes that used the second triangle get u -> 1-u
+      v_o = select(WW <= 0.0f,v_o,1.0f-v_o);  // and v -> 1-v
+      return valid;
+    }
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h
new file mode 100644
index 0000000000..74e8c7720c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_moeller.h
@@ -0,0 +1,566 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quadv.h"
+#include "triangle_intersector_moeller.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct QuadHitM  // hit record for M lanes; vu, vv, vt and vNg are only assigned by finalize()
+    {
+      __forceinline QuadHitM() {}
+      /*! Stores the unnormalized U, V, T together with absDen, the normal, the valid mask and the per-lane flags. */
+      __forceinline QuadHitM(const vbool<M>& valid,
+                             const vfloat<M>& U,
+                             const vfloat<M>& V,
+                             const vfloat<M>& T,
+                             const vfloat<M>& absDen,
+                             const Vec3vf<M>& Ng,
+                             const vbool<M>& flags)
+        : U(U), V(V), T(T), absDen(absDen), tri_Ng(Ng), valid(valid), flags(flags) {}
+      /*! Scales U, V, T by 1/absDen and remaps u, v (and, in the #else branch, the normal) on lanes where flags is set. */
+      __forceinline void finalize()
+      {
+        const vfloat<M> rcpAbsDen = rcp(absDen);
+        vt = T * rcpAbsDen;
+        const vfloat<M> u = min(U * rcpAbsDen,1.0f);  // clamped to at most 1
+        const vfloat<M> v = min(V * rcpAbsDen,1.0f);  // clamped to at most 1
+        const vfloat<M> u1 = vfloat<M>(1.0f) - u;
+        const vfloat<M> v1 = vfloat<M>(1.0f) - v;
+#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING)
+        vu = select(flags,u1,u);  // flagged lanes: u -> 1-u
+        vv = select(flags,v1,v);  // flagged lanes: v -> 1-v
+        vNg = Vec3vf<M>(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+#else
+        const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f));
+        vv = select(flags,u1,v);  // flagged lanes: v takes 1-u (u and v are swapped as well as flipped)
+        vu = select(flags,v1,u);  // flagged lanes: u takes 1-v
+        vNg = Vec3vf<M>(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z);  // flagged lanes: normal negated
+#endif
+      }
+
+      __forceinline Vec2f uv(const size_t i)  // lane i of (vu, vv); meaningful only after finalize()
+      {
+        const float u = vu[i];
+        const float v = vv[i];
+        return Vec2f(u,v);
+      }
+
+      __forceinline float   t(const size_t i) { return vt[i]; }  // lane i of vt
+      __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }  // lane i of vNg
+
+    private:
+      vfloat<M> U;  // inputs of finalize(), not yet divided by absDen
+      vfloat<M> V;
+      vfloat<M> T;
+      vfloat<M> absDen;
+      Vec3vf<M> tri_Ng;
+
+    public:
+      vbool<M> valid;
+      vfloat<M> vu;  // outputs of finalize()
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+
+    public:
+      const vbool<M> flags;  // lanes whose u, v are remapped in finalize()
+    };
+
+    template<int K>
+    struct QuadHitK  // immutable hit record for K lanes; operator() produces the normalized (u, v, t, Ng)
+    {
+      __forceinline QuadHitK(const vfloat<K>& U,
+                             const vfloat<K>& V,
+                             const vfloat<K>& T,
+                             const vfloat<K>& absDen,
+                             const Vec3vf<K>& Ng,
+                             const vbool<K>& flags)
+        : U(U), V(V), T(T), absDen(absDen), flags(flags), tri_Ng(Ng) {}
+      /*! Returns (u, v, t, Ng): U, V, T scaled by 1/absDen, with u -> 1-u and v -> 1-v on lanes where flags is set. */
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+      {
+        const vfloat<K> rcpAbsDen = rcp(absDen);
+        const vfloat<K> t = T * rcpAbsDen;
+        const vfloat<K> u0 = min(U * rcpAbsDen,1.0f);  // clamped to at most 1
+        const vfloat<K> v0 = min(V * rcpAbsDen,1.0f);  // clamped to at most 1
+        const vfloat<K> u1 = vfloat<K>(1.0f) - u0;
+        const vfloat<K> v1 = vfloat<K>(1.0f) - v0;
+        const vfloat<K> u = select(flags,u1,u0);
+        const vfloat<K> v = select(flags,v1,v0);
+        const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z);  // the normal is returned unchanged, also on flagged lanes
+        return std::make_tuple(u,v,t,Ng);
+      }
+
+    private:
+      const vfloat<K> U;  // unnormalized inputs, divided by absDen in operator()
+      const vfloat<K> V;
+      const vfloat<K> T;
+      const vfloat<K> absDen;
+      const vbool<K> flags;
+      const Vec3vf<K> tri_Ng;
+    };
+
+    /* ----------------------------- */
+    /* -- single ray intersectors -- */
+    /* ----------------------------- */
+
+
+    template<int M, bool filter>
+    struct QuadMIntersector1MoellerTrumbore;
+
+    /*! Intersects M quads with 1 ray by testing the triangles (v0,v1,v3) and (v2,v3,v1) one after the other. */
+    template<int M, bool filter>
+    struct QuadMIntersector1MoellerTrumbore
+    {
+      __forceinline QuadMIntersector1MoellerTrumbore() {}
+
+      __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}  // both arguments are ignored
+      /*! Runs the intersect epilog for every triangle of the quads that reports a hit. */
+      __forceinline void intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                   const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        MoellerTrumboreHitM<M> hit;
+        MoellerTrumboreIntersector1<M> intersector(ray,nullptr);
+        Intersect1EpilogM<M,M,filter> epilog(ray,context,geomID,primID);
+
+        /* intersect first triangle */
+        if (intersector.intersect(ray,v0,v1,v3,hit)) 
+          epilog(hit.valid,hit);
+
+        /* intersect second triangle; U and V are replaced by absDen-U and absDen-V before the epilog */
+        if (intersector.intersect(ray,v2,v3,v1,hit)) 
+        {
+          hit.U = hit.absDen - hit.U;
+          hit.V = hit.absDen - hit.V;
+          epilog(hit.valid,hit);
+        }
+      }
+      /*! Returns true as soon as the occlusion epilog accepts a hit on either triangle, false otherwise. */
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                  const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        MoellerTrumboreHitM<M> hit;
+        MoellerTrumboreIntersector1<M> intersector(ray,nullptr);
+        Occluded1EpilogM<M,M,filter> epilog(ray,context,geomID,primID);
+
+        /* intersect first triangle */
+        if (intersector.intersect(ray,v0,v1,v3,hit)) 
+        {
+          if (epilog(hit.valid,hit))
+            return true;
+        }
+
+        /* intersect second triangle; U and V are replaced by absDen-U and absDen-V before the epilog */
+        if (intersector.intersect(ray,v2,v3,v1,hit)) 
+        {
+          hit.U = hit.absDen - hit.U;
+          hit.V = hit.absDen - hit.V;
+          if (epilog(hit.valid,hit))
+            return true;
+        }
+        return false;
+      }
+    };
+
+#if defined(__AVX512ER__) // KNL
+
+    /*! Intersects 4 quads with 1 ray using AVX512: both triangles of every quad are tested in one 16-wide call. */
+    template<bool filter>
+    struct QuadMIntersector1MoellerTrumbore<4,filter>
+    {
+      __forceinline QuadMIntersector1MoellerTrumbore() {}
+
+      __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}  // both arguments are ignored
+      /*! Builds 16-wide triangle vertices from the 4-wide quad vertices, intersects once and forwards any hit to epilog. */
+      template<typename Epilog>
+      __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)),  // mask 0x0f0f mixes v0 and v2 lanes - NOTE(review): presumably v0 where a mask bit is set; confirm select() argument order
+                            select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)),
+                            select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z)));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z));  // without culling v1 and v3 are used unmixed for all lanes
+        const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z));
+#else
+        const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)),  // with culling v1 and v3 are mixed per lane, in opposite order for vtx1 and vtx2
+                            select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)),
+                            select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z)));
+        const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)),
+                            select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)),
+                            select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z)));
+#endif
+        const vbool16 flags(0xf0f0);  // complement of the 0x0f0f mask above: the lanes whose U, V are remapped below
+
+        MoellerTrumboreHitM<16> hit;
+        MoellerTrumboreIntersector1<16> intersector(ray,nullptr);
+        if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit))) 
+        {
+          vfloat16 U = hit.U, V = hit.V, absDen = hit.absDen;
+#if !defined(EMBREE_BACKFACE_CULLING)
+          hit.U = select(flags,absDen-V,U);  // flagged lanes: U and V are swapped and flipped around absDen
+          hit.V = select(flags,absDen-U,V);
+          hit.vNg *= select(flags,vfloat16(-1.0f),vfloat16(1.0f)); // FIXME: use XOR
+#else
+          hit.U = select(flags,absDen-U,U);  // flagged lanes: U and V are flipped around absDen, not swapped
+          hit.V = select(flags,absDen-V,V);
+#endif
+          if (likely(epilog(hit.valid,hit)))
+            return true;
+        }
+        return false;
+      }
+      /*! Intersection entry point: wraps the templated intersect() with an Intersect1EpilogM<8,16,filter>. */
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion entry point: wraps the templated intersect() with an Occluded1EpilogM<8,16,filter>. */
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                  const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#elif defined(__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX: both triangles of every quad are tested in one 8-wide call. */
+    template<bool filter>
+    struct QuadMIntersector1MoellerTrumbore<4,filter>
+    {
+      __forceinline QuadMIntersector1MoellerTrumbore() {}
+
+      __forceinline QuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}  // both arguments are ignored
+      /*! Builds 8-wide triangle vertices from the 4-wide quad vertices, intersects once and forwards any hit to epilog. */
+      template<typename Epilog>
+      __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z));  // v0 and v2 packed into one 8-wide vector - presumably v0 in the lower half; confirm
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z));  // without culling v1 and v3 are used unmixed
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z));        
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z));  // with culling v1 and v3 are packed in opposite order for vtx1 and vtx2
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        MoellerTrumboreHitM<8> hit;
+        MoellerTrumboreIntersector1<8> intersector(ray,nullptr);
+        const vbool8 flags(0,0,0,0,1,1,1,1);  // four unset and four set lanes - presumably the set ones are the half built from v2; confirm lane order
+        if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit)))
+        {
+          vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen;
+          /* remap the barycentrics (and, without culling, the normal) on flagged lanes */
+#if !defined(EMBREE_BACKFACE_CULLING)
+          hit.U = select(flags,absDen-V,U);  // flagged lanes: U and V are swapped and flipped around absDen
+          hit.V = select(flags,absDen-U,V);
+          hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); // FIXME: use XOR
+#else
+          hit.U = select(flags,absDen-U,U);  // flagged lanes: U and V are flipped around absDen, not swapped
+          hit.V = select(flags,absDen-V,V);
+#endif
+          if (unlikely(epilog(hit.valid,hit)))
+            return true;
+        }
+        return false;
+      }
+      /*! Intersection entry point: wraps the templated intersect() with an Intersect1EpilogM<8,8,filter>. */
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion entry point: wraps the templated intersect() with an Occluded1EpilogM<8,8,filter>. */
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                  const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#endif
+
+    /* ----------------------------- */
+    /* -- ray packet intersectors -- */
+    /* ----------------------------- */
+
+
+    struct MoellerTrumboreIntersector1KTriangleM  // static helpers: ray k of a K-wide packet against M triangles
+    {
+      /*! Intersect k'th ray from ray packet of size K with M triangles given as v0, edges e1/e2 and normal Ng; flags is passed on to QuadHitM. */
+      template<int M, int K, typename Epilog>
+      static  __forceinline bool intersect(RayK<K>& ray,
+                                           size_t k,
+                                           const Vec3vf<M>& tri_v0,
+                                           const Vec3vf<M>& tri_e1,
+                                           const Vec3vf<M>& tri_e2,
+                                           const Vec3vf<M>& tri_Ng,
+                                           const vbool<M>& flags,
+                                           const Epilog& epilog)
+      {
+        /* calculate denominator */
+        const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+        const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+        const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O;
+        const Vec3vf<M> R = cross(C,D);
+        const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D);
+        const vfloat<M> absDen = abs(den);
+        const vfloat<M> sgnDen = signmsk(den);
+        
+        /* perform edge tests; the XOR with sgnDen presumably makes U and V comparable against absDen */
+        const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen;
+        const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen;
+        
+        /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);  // culling build: only den < 0 passes
+#else
+        vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);  // otherwise only den == 0 is rejected
+#endif
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test in unnormalized form: absDen*tnear < T <= absDen*tfar */
+        const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+        valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+        if (likely(none(valid))) return false;
+        
+        /* calculate hit information; U, V, T are handed over without dividing by absDen */
+        QuadHitM<M> hit(valid,U,V,T,absDen,tri_Ng,flags);
+        return epilog(valid,hit);
+      }
+      /*! Vertex-based variant: derives e1 = v0-v1, e2 = v2-v0 and Ng = cross(e2,e1), then calls intersect(). */
+      template<int M, int K, typename Epilog>
+      static __forceinline bool intersect1(RayK<K>& ray,
+                                           size_t k,
+                                           const Vec3vf<M>& v0,
+                                           const Vec3vf<M>& v1,
+                                           const Vec3vf<M>& v2,
+                                           const vbool<M>& flags,
+                                           const Epilog& epilog)
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        const Vec3vf<M> Ng = cross(e2,e1);
+        return intersect(ray,k,v0,e1,e2,Ng,flags,epilog);
+      }
+    };
+
+    template<int M, int K, bool filter>
+    struct QuadMIntersectorKMoellerTrumboreBase  // K-ray packet intersection of one triangle or one quad
+    {
+      __forceinline QuadMIntersectorKMoellerTrumboreBase(const vbool<K>& valid, const RayK<K>& ray) {}  // both arguments are ignored
+            
+      /*! Intersects K rays with one of M triangles, given as v0, edges e1/e2 and normal Ng. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_e1,
+                                        const Vec3vf<K>& tri_e2,
+                                        const Vec3vf<K>& tri_Ng,
+                                        const vbool<K>& flags,
+                                        const Epilog& epilog) const
+      { 
+        /* calculate denominator */
+        vbool<K> valid = valid0;
+        const Vec3vf<K> C = tri_v0 - ray.org;
+        const Vec3vf<K> R = cross(C,ray.dir);
+        const vfloat<K> den = dot(tri_Ng,ray.dir);
+        const vfloat<K> absDen = abs(den);
+        const vfloat<K> sgnDen = signmsk(den);
+        
+        /* test against edge p2 p0 */
+        const vfloat<K> U = dot(R,tri_e2) ^ sgnDen;
+        valid &= U >= 0.0f;
+        if (likely(none(valid))) return false;  // bool converts to vbool<K>; presumably an all-false mask
+        
+        /* test against edge p0 p1 */
+        const vfloat<K> V = dot(R,tri_e1) ^ sgnDen;
+        valid &= V >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p1 p2 */
+        const vfloat<K> W = absDen-U-V;
+        valid &= W >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test in unnormalized form: absDen*tnear < T <= absDen*tfar */
+        const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+        valid &= (absDen*ray.tnear() < T) & (T <= absDen*ray.tfar);
+        if (unlikely(none(valid))) return false;
+        
+        /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= den < vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#else
+        valid &= den != vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#endif
+        
+        /* calculate hit information; U, V, T are handed over without dividing by absDen */
+        QuadHitK<K> hit(U,V,T,absDen,tri_Ng,flags);
+        return epilog(valid,hit);
+      }
+      
+      /*! Intersects K rays with one triangle given by its three vertices; derives e1, e2 and Ng and forwards. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const vbool<K>& flags,
+                                        const Epilog& epilog) const
+      {
+        const Vec3vf<K> e1 = tri_v0-tri_v1;
+        const Vec3vf<K> e2 = tri_v2-tri_v0;
+        const Vec3vf<K> Ng = cross(e2,e1);
+        return intersectK(valid0,ray,tri_v0,e1,e2,Ng,flags,epilog);
+      }
+
+      /*! Intersects K rays with one of M quads, split into the triangles (v0,v1,v3) and (v2,v3,v1). Returns none(valid0). */
+      template<typename Epilog>
+      __forceinline bool intersectK(const vbool<K>& valid0, 
+                                    RayK<K>& ray,
+                                    const Vec3vf<K>& v0,
+                                    const Vec3vf<K>& v1,
+                                    const Vec3vf<K>& v2,
+                                    const Vec3vf<K>& v3,
+                                    const Epilog& epilog) const
+      {
+        intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),epilog);  // first triangle; the returned mask is discarded
+        if (none(valid0)) return true;  // NOTE(review): valid0 is a const reference, so this only changes if the caller's mask is updated through another alias (e.g. by the epilog) - confirm
+        intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),epilog);  // second triangle; flags = true makes QuadHitK return 1-u, 1-v
+        return none(valid0);
+      }
+    };
+
+    template<int M, int K, bool filter> // M quads per block, K rays per packet; filter is passed on to the epilog types
+    struct QuadMIntersectorKMoellerTrumbore : public QuadMIntersectorKMoellerTrumboreBase<M,K,filter>
+    {
+      __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {}
+      /*! Intersects ray k of the packet with M quads; each quad is tested as triangles (v0,v1,v3) and (v2,v3,v1) using one shared epilog. */
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                    const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Intersect1KEpilogM<M,M,K,filter> epilog(ray,k,context,geomID,primID);
+        MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool<M>(false),epilog); // first triangle, flags=false
+        MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool<M>(true ),epilog); // second triangle, flags=true
+      }
+      /*! Occlusion test of ray k against M quads; returns true as soon as one of the two triangle tests reports true. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                   const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Occluded1KEpilogM<M,M,K,filter> epilog(ray,k,context,geomID,primID);
+        if (MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool<M>(false),epilog)) return true;
+        if (MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool<M>(true ),epilog)) return true; // second triangle is skipped when the first already returned true
+        return false;
+      }
+    };
+
+
+#if defined(__AVX512ER__) // KNL
+
+    /*! Intersects 4 quads with 1 ray of a K-wide packet using AVX512: both triangles of every quad are packed into one 16-wide triangle test. */
+    template<int K, bool filter>
+    struct QuadMIntersectorKMoellerTrumbore<4,K,filter> : public QuadMIntersectorKMoellerTrumboreBase<4,K,filter>
+    {
+      __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {}
+      /*! Builds the 16-wide vertex sets and runs a single triangle test with the given epilog; returns that test's result. */
+      template<typename Epilog>
+      __forceinline bool intersect1(RayK<K>& ray, size_t k,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), // first vertex: v0 or v2 chosen per lane by mask 0x0f0f
+                            select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)), // NOTE(review): presumably v0 where the mask bit is set, v2 elsewhere - confirm select() semantics
+                            select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z)));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); // without culling all lanes use v1 ...
+        const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); // ... and v3 in the same order
+#else
+        const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), // with culling v1 and v3 trade places between the two lane groups
+                            select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)),
+                            select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z)));
+        const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)),
+                            select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)),
+                            select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z)));
+#endif
+        const vbool16 flags(0xf0f0); // bitwise complement of the 0x0f0f vertex-select mask
+        return MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog);
+      }
+      /*! Intersection query for ray k: wraps the packed test with an Intersect1KEpilogM built from geomID/primID widened to vuint8. */
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                    const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion query for ray k: same packed test, but with an Occluded1KEpilogM. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#elif defined(__AVX__)
+
+    /*! Intersects 4 quads with 1 ray of a K-wide packet using AVX: both triangles of every quad are packed into one 8-wide triangle test. */
+    template<int K, bool filter>
+    struct QuadMIntersectorKMoellerTrumbore<4,K,filter> : public QuadMIntersectorKMoellerTrumboreBase<4,K,filter>
+    {
+      __forceinline QuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {}
+      /*! Builds the 8-wide vertex sets and runs a single triangle test with the given epilog; returns that test's result. */
+      template<typename Epilog>
+      __forceinline bool intersect1(RayK<K>& ray, size_t k,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); // first vertex: v0 paired with v2 (presumably low/high halves - confirm vfloat8 ctor)
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); // without culling both halves use v1 ...
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); // ... and v3 in the same order
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); // with culling v1 and v3 trade places between the halves
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        const vbool8 flags(0,0,0,0,1,1,1,1); // four 0 entries followed by four 1 entries; presumably marks the second-triangle half - confirm lane order
+        return MoellerTrumboreIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog); 
+      }
+      /*! Intersection query for ray k: wraps the packed test with an Intersect1KEpilogM built from geomID/primID widened to vuint8. */
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                    const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion query for ray k: same packed test, but with an Occluded1KEpilogM. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h
new file mode 100644
index 0000000000..7ca3aed0a0
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quad_intersector_pluecker.h
@@ -0,0 +1,529 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quad_intersector_moeller.h"
+
+/*! Modified Pluecker ray/triangle intersector. The test first shifts
+ *  the ray origin into the origin of the coordinate system and then
+ *  uses Pluecker coordinates for the intersection. Due to the shift,
+ *  the Pluecker coordinate calculation simplifies and the tests get
+ *  numerically stable. The edge equations are watertight along the
+ *  edge for neighboring triangles. */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M> // hit record for M triangle lanes of the Pluecker test; finalize() derives the reported vu/vv/vNg
+    struct QuadHitPlueckerM
+    {
+      __forceinline QuadHitPlueckerM() {}
+      /*! Stores the raw edge values U and V, their sum UVW, the hit distance t, the triangle normal and the per-lane flags. vu, vv and vNg are not set here. */
+      __forceinline QuadHitPlueckerM(const vbool<M>& valid,
+                                     const vfloat<M>& U,
+                                     const vfloat<M>& V,
+                                     const vfloat<M>& UVW,
+                                     const vfloat<M>& t,
+                                     const Vec3vf<M>& Ng,
+                                     const vbool<M>& flags)
+        : U(U), V(V), UVW(UVW), tri_Ng(Ng), valid(valid), vt(t), flags(flags) {}
+      /*! Converts the raw edge values into the u/v coordinates and normal stored in vu, vv and vNg. */
+      __forceinline void finalize()
+      {
+        const vbool<M> invalid = abs(UVW) < min_rcp_input; // lanes whose |UVW| is below min_rcp_input
+        const vfloat<M> rcpUVW = select(invalid,vfloat<M>(0.0f),rcp(UVW)); // presumably 0 for those lanes and rcp(UVW) otherwise - confirm select() semantics
+        const vfloat<M> u = min(U * rcpUVW,1.0f); // scale by 1/UVW and cap at 1
+        const vfloat<M> v = min(V * rcpUVW,1.0f);
+        const vfloat<M> u1 = vfloat<M>(1.0f) - u; // complementary coordinates used for flagged lanes
+        const vfloat<M> v1 = vfloat<M>(1.0f) - v;
+#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING)
+        vu = select(flags,u1,u); // flagged lanes report (1-u, 1-v); the normal is passed through unchanged
+        vv = select(flags,v1,v);
+        vNg = Vec3vf<M>(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+#else
+        const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f)); // sign factor for the normal, chosen per lane by flags
+        vv = select(flags,u1,v); // flagged lanes swap roles here: vv takes 1-u ...
+        vu = select(flags,v1,u); // ... and vu takes 1-v
+        vNg = Vec3vf<M>(flip*tri_Ng.x,flip*tri_Ng.y,flip*tri_Ng.z);
+#endif
+      }
+      /*! Returns (vu,vv) of lane i as a Vec2f; reads the values written by finalize(). */
+      __forceinline Vec2f uv(const size_t i)
+      {
+        const float u = vu[i];
+        const float v = vv[i];
+        return Vec2f(u,v);
+      }
+      /* per-lane accessors for the hit distance and the normal */
+      __forceinline float   t(const size_t i) { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+      /* raw inputs kept from the constructor and consumed by finalize() */
+    private:
+      vfloat<M> U;
+      vfloat<M> V;
+      vfloat<M> UVW;
+      Vec3vf<M> tri_Ng;
+      /* public results: valid and vt come from the constructor, vu/vv/vNg from finalize() */
+    public:
+      vbool<M> valid;
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+      /* per-lane flags as passed to the constructor */
+    public:
+      const vbool<M> flags;
+    };
+
+    template<int K> // hit record for K rays against one triangle of a quad; operator() yields the tuple (u,v,t,Ng)
+    struct QuadHitPlueckerK
+    {
+      __forceinline QuadHitPlueckerK(const vfloat<K>& U,
+                                     const vfloat<K>& V,
+                                     const vfloat<K>& UVW,
+                                     const vfloat<K>& t,
+                                     const Vec3vf<K>& Ng,
+                                     const vbool<K>& flags)
+        : U(U), V(V), UVW(UVW), t(t), flags(flags), tri_Ng(Ng) {}
+      /*! Computes u and v from the raw edge values and returns them together with the stored t and normal. */
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+      {
+        const vbool<K> invalid = abs(UVW) < min_rcp_input; // lanes whose |UVW| is below min_rcp_input
+        const vfloat<K> rcpUVW = select(invalid,vfloat<K>(0.0f),rcp(UVW)); // presumably 0 for those lanes and rcp(UVW) otherwise - confirm select() semantics
+        const vfloat<K> u0 = min(U * rcpUVW,1.0f); // scale by 1/UVW and cap at 1
+        const vfloat<K> v0 = min(V * rcpUVW,1.0f);
+        const vfloat<K> u1 = vfloat<K>(1.0f) - u0; // complementary coordinates used for flagged lanes
+        const vfloat<K> v1 = vfloat<K>(1.0f) - v0;
+        const vfloat<K> u = select(flags,u1,u0); // flagged lanes report (1-u0, 1-v0)
+        const vfloat<K> v = select(flags,v1,v0);
+        const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z); // normal is returned unchanged
+        return std::make_tuple(u,v,t,Ng);
+      }
+      /* all members are const and set once by the constructor */
+    private:
+      const vfloat<K> U;
+      const vfloat<K> V;
+      const vfloat<K> UVW;
+      const vfloat<K> t;
+      const vbool<K> flags;
+      const Vec3vf<K> tri_Ng;
+    };
+
+    struct PlueckerIntersectorTriangle1 // Pluecker-style test of one ray against M triangles at once
+    {
+      template<int M, typename Epilog>
+      static __forceinline bool intersect(Ray& ray, // returns false when no lane survives, otherwise the result of epilog(valid,hit)
+                                          const Vec3vf<M>& tri_v0,
+                                          const Vec3vf<M>& tri_v1,
+                                          const Vec3vf<M>& tri_v2,
+                                          const vbool<M>& flags,
+                                          const Epilog& epilog)
+      {
+        /* calculate vertices relative to ray origin */
+        const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+        const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+        const Vec3vf<M> v0 = tri_v0-O;
+        const Vec3vf<M> v1 = tri_v1-O;
+        const Vec3vf<M> v2 = tri_v2-O;
+
+        /* calculate triangle edges */
+        const Vec3vf<M> e0 = v2-v0;
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v1-v2;
+
+        /* perform edge tests: one signed value per edge, compared against a tolerance proportional to |U+V+W| */
+        const vfloat<M> U = dot(cross(e0,v2+v0),D);
+        const vfloat<M> V = dot(cross(e1,v0+v1),D);
+        const vfloat<M> W = dot(cross(e2,v1+v2),D);
+        const vfloat<M> UVW = U+V+W;
+        const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = max(U,V,W) <= eps; // with culling: all three values must be <= eps
+#else
+        vbool<M> valid =  (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); // without culling: all >= -eps or all <= eps
+#endif
+        if (unlikely(none(valid))) return false;
+
+        /* calculate geometry normal and denominator */
+        const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+        const vfloat<M> den = twice(dot(Ng,D));
+
+         /* perform depth test: keep lanes with tnear <= t <= tfar */
+        const vfloat<M> T = twice(dot(v0,Ng));
+        const vfloat<M> t = rcp(den)*T;
+        valid &= vfloat<M>(ray.tnear()) <= t & t <= vfloat<M>(ray.tfar); // <= binds tighter than &, so this is (tnear <= t) & (t <= tfar)
+        valid &= den != vfloat<M>(zero); // drop lanes whose denominator is exactly zero
+        if (unlikely(none(valid))) return false;
+
+        /* update hit information */
+        QuadHitPlueckerM<M> hit(valid,U,V,UVW,t,Ng,flags);
+        return epilog(valid,hit);
+      }
+    };
+
+    /*! Intersects M quads with 1 ray using the Pluecker triangle test; each quad is tested as triangles (v0,v1,v3) and (v2,v3,v1). */
+    template<int M, bool filter>
+    struct QuadMIntersector1Pluecker
+    {
+      __forceinline QuadMIntersector1Pluecker() {}
+      /* the ray and ptr arguments are not used by this constructor */
+      __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+      /*! Intersection query: runs both triangle tests with one shared Intersect1EpilogM. */
+      __forceinline void intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                   const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Intersect1EpilogM<M,M,filter> epilog(ray,context,geomID,primID);
+        PlueckerIntersectorTriangle1::intersect(ray,v0,v1,v3,vbool<M>(false),epilog); // first triangle, flags=false
+        PlueckerIntersectorTriangle1::intersect(ray,v2,v3,v1,vbool<M>(true),epilog); // second triangle, flags=true
+      }
+      /*! Occlusion query: returns true as soon as one of the two triangle tests reports true. */
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                  const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Occluded1EpilogM<M,M,filter> epilog(ray,context,geomID,primID);
+        if (PlueckerIntersectorTriangle1::intersect(ray,v0,v1,v3,vbool<M>(false),epilog)) return true;
+        if (PlueckerIntersectorTriangle1::intersect(ray,v2,v3,v1,vbool<M>(true ),epilog)) return true; // second triangle is skipped when the first already returned true
+        return false;
+      }
+    };
+
+#if defined(__AVX512ER__) // KNL
+
+    /*! Intersects 4 quads with 1 ray using AVX512: both triangles of every quad are packed into one 16-wide Pluecker test. */
+    template<bool filter>
+    struct QuadMIntersector1Pluecker<4,filter>
+    {
+      __forceinline QuadMIntersector1Pluecker() {}
+      /* the ray and ptr arguments are not used by this constructor */
+      __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+      /*! Builds the 16-wide vertex sets and runs a single triangle test with the given epilog; returns that test's result. */
+      template<typename Epilog>
+      __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), // first vertex: v0 or v2 chosen per lane by mask 0x0f0f
+                            select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)), // NOTE(review): presumably v0 where the mask bit is set, v2 elsewhere - confirm select() semantics
+                            select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z)));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); // without culling all lanes use v1 ...
+        const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); // ... and v3 in the same order
+#else
+        const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), // with culling v1 and v3 trade places between the two lane groups
+                            select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)),
+                            select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z)));
+        const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)),
+                            select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)),
+                            select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z)));
+#endif
+        const vbool16 flags(0xf0f0); // bitwise complement of the 0x0f0f vertex-select mask
+        return PlueckerIntersectorTriangle1::intersect(ray,vtx0,vtx1,vtx2,flags,epilog);
+      }
+      /*! Intersection query: wraps the packed test with an Intersect1EpilogM built from geomID/primID widened to vuint8. */
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion query: same packed test, but with an Occluded1EpilogM. */
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                  const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,16,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#elif defined(__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX: both triangles of every quad are packed into one 8-wide Pluecker test. */
+    template<bool filter>
+    struct QuadMIntersector1Pluecker<4,filter>
+    {
+      __forceinline QuadMIntersector1Pluecker() {}
+      /* the ray and ptr arguments are not used by this constructor */
+      __forceinline QuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+      /*! Builds the 8-wide vertex sets and runs a single triangle test with the given epilog; returns that test's result. */
+      template<typename Epilog>
+      __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); // first vertex: v0 paired with v2 (presumably low/high halves - confirm vfloat8 ctor)
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); // without culling both halves use v1 ...
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); // ... and v3 in the same order
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); // with culling v1 and v3 trade places between the halves
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        const vbool8 flags(0,0,0,0,1,1,1,1); // four 0 entries followed by four 1 entries; presumably marks the second-triangle half - confirm lane order
+        return PlueckerIntersectorTriangle1::intersect(ray,vtx0,vtx1,vtx2,flags,epilog); 
+      }
+      /*! Intersection query: wraps the packed test with an Intersect1EpilogM built from geomID/primID widened to vuint8. */
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Intersect1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion query: same packed test, but with an Occluded1EpilogM. */
+      __forceinline bool occluded(Ray& ray, IntersectContext* context, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3,
+                                  const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect(ray,v0,v1,v2,v3,Occluded1EpilogM<8,8,filter>(ray,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#endif
+
+
+    /* ----------------------------- */
+    /* -- ray packet intersectors -- */
+    /* ----------------------------- */
+
+    struct PlueckerIntersector1KTriangleM // Pluecker-style test of one ray taken from a K-wide packet against M triangles
+    {
+      /*! Intersect k'th ray from ray packet of size K with M triangles. Returns false when no lane survives, otherwise the result of epilog(valid,hit). */
+      template<int M, int K, typename Epilog>
+      static  __forceinline bool intersect1(RayK<K>& ray,
+                                            size_t k,
+                                            const Vec3vf<M>& tri_v0,
+                                            const Vec3vf<M>& tri_v1,
+                                            const Vec3vf<M>& tri_v2,
+                                            const vbool<M>& flags,
+                                            const Epilog& epilog)
+      {
+        /* calculate vertices relative to ray origin; origin and direction of ray k are taken via broadcast */
+          const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+          const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+          const Vec3vf<M> v0 = tri_v0-O;
+          const Vec3vf<M> v1 = tri_v1-O;
+          const Vec3vf<M> v2 = tri_v2-O;
+          
+          /* calculate triangle edges */
+          const Vec3vf<M> e0 = v2-v0;
+          const Vec3vf<M> e1 = v0-v1;
+          const Vec3vf<M> e2 = v1-v2;
+          
+          /* perform edge tests: one signed value per edge, compared against a tolerance proportional to |U+V+W| */
+          const vfloat<M> U = dot(cross(e0,v2+v0),D);
+          const vfloat<M> V = dot(cross(e1,v0+v1),D);
+          const vfloat<M> W = dot(cross(e2,v1+v2),D);
+          const vfloat<M> UVW = U+V+W;
+          const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+          vbool<M> valid = max(U,V,W) <= eps; // with culling: all three values must be <= eps
+#else
+          vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); // without culling: all >= -eps or all <= eps
+#endif
+          if (unlikely(none(valid))) return false;
+          
+          /* calculate geometry normal and denominator */
+          const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+          const vfloat<M> den = twice(dot(Ng,D));
+
+          /* perform depth test: keep lanes with tnear[k] <= t <= tfar[k] */
+          const vfloat<M> T = twice(dot(v0,Ng));
+          const vfloat<M> t = rcp(den)*T;
+          valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]); // <= binds tighter than &, so this is (tnear <= t) & (t <= tfar)
+          if (unlikely(none(valid))) return false;
+          
+          /* avoid division by 0: drop lanes whose denominator is exactly zero */
+          valid &= den != vfloat<M>(zero);
+          if (unlikely(none(valid))) return false;
+          
+          /* update hit information */
+          QuadHitPlueckerM<M> hit(valid,U,V,UVW,t,Ng,flags);
+          return epilog(valid,hit);
+      }
+    };
+
+    template<int M, int K, bool filter> // packet (K rays) Pluecker intersector shared by the QuadMIntersectorKPluecker variants
+    struct QuadMIntersectorKPlueckerBase
+    {
+      __forceinline QuadMIntersectorKPlueckerBase(const vbool<K>& valid, const RayK<K>& ray) {} // both arguments are unused
+            
+      /*! Intersects K rays with one of M triangles. Only lanes set in valid0 are considered; returns the result of epilog(valid,hit), or false when no lane survives. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const vbool<K>& flags,
+                                        const Epilog& epilog) const
+      {
+        /* calculate vertices relative to ray origin */
+          vbool<K> valid = valid0;
+          const Vec3vf<K> O = ray.org;
+          const Vec3vf<K> D = ray.dir;
+          const Vec3vf<K> v0 = tri_v0-O;
+          const Vec3vf<K> v1 = tri_v1-O;
+          const Vec3vf<K> v2 = tri_v2-O;
+          
+          /* calculate triangle edges */
+          const Vec3vf<K> e0 = v2-v0;
+          const Vec3vf<K> e1 = v0-v1;
+          const Vec3vf<K> e2 = v1-v2;
+           
+          /* perform edge tests: one signed value per edge, compared against a tolerance proportional to |U+V+W| */
+          const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D);
+          const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D);
+          const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D);
+          const vfloat<K> UVW = U+V+W;
+          const vfloat<K> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+          valid &= max(U,V,W) <= eps; // with culling: all three values must be <= eps
+#else
+          valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); // without culling: all >= -eps or all <= eps
+#endif
+          if (unlikely(none(valid))) return false; // the bool is converted to the vbool<K> return type
+          
+           /* calculate geometry normal and denominator */
+          const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2);
+          const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D));
+
+          /* perform depth test: keep lanes with tnear <= t <= tfar */
+          const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng)));
+          const vfloat<K> t = rcp(den)*T;
+          valid &= ray.tnear() <= t & t <= ray.tfar; // <= binds tighter than &, so this is (tnear <= t) & (t <= tfar)
+          valid &= den != vfloat<K>(zero); // drop lanes whose denominator is exactly zero
+          if (unlikely(none(valid))) return false;
+          
+          /* calculate hit information */
+          QuadHitPlueckerK<K> hit(U,V,UVW,t,Ng,flags);
+          return epilog(valid,hit);
+      }
+      
+      /*! Intersects K rays with one of M quads, tested as the two triangles (v0,v1,v3) and (v2,v3,v1). Returns none(valid0). */
+      template<typename Epilog>
+      __forceinline bool intersectK(const vbool<K>& valid0, 
+                                    RayK<K>& ray,
+                                    const Vec3vf<K>& v0,
+                                    const Vec3vf<K>& v1,
+                                    const Vec3vf<K>& v2,
+                                    const Vec3vf<K>& v3,
+                                    const Epilog& epilog) const
+      {
+        intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),epilog); // first triangle, flags all false; returned mask is discarded
+        if (none(valid0)) return true; // NOTE(review): valid0 is a const reference here; presumably the epilog clears lanes through its own reference to the same mask - confirm against callers
+        intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),epilog); // second triangle, flags all true
+        return none(valid0);
+      }
+    };
+
+    template<int M, int K, bool filter> // M quads per block, K rays per packet; filter is passed on to the epilog types
+      struct QuadMIntersectorKPluecker : public QuadMIntersectorKPlueckerBase<M,K,filter>
+    {
+      __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {}
+      /*! Intersects ray k of the packet with M quads; each quad is tested as triangles (v0,v1,v3) and (v2,v3,v1) using one shared epilog. */
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                    const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Intersect1KEpilogM<M,M,K,filter> epilog(ray,k,context,geomID,primID);
+        PlueckerIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool<M>(false),epilog); // first triangle, flags=false
+        PlueckerIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool<M>(true ),epilog); // second triangle, flags=true
+      }
+      /*! Occlusion test of ray k against M quads; returns true as soon as one of the two triangle tests reports true. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                   const vuint<M>& geomID, const vuint<M>& primID) const
+      {
+        Occluded1KEpilogM<M,M,K,filter> epilog(ray,k,context,geomID,primID);
+        if (PlueckerIntersector1KTriangleM::intersect1(ray,k,v0,v1,v3,vbool<M>(false),epilog)) return true;
+        if (PlueckerIntersector1KTriangleM::intersect1(ray,k,v2,v3,v1,vbool<M>(true ),epilog)) return true; // second triangle is skipped when the first already returned true
+        return false;
+      }
+    };
+
+#if defined(__AVX512ER__) // KNL
+
+    /*! Intersects 4 quads with 1 ray of a K-wide packet using AVX512: both triangles of every quad are packed into one 16-wide Pluecker test. */
+    template<int K, bool filter>
+    struct QuadMIntersectorKPluecker<4,K,filter> : public QuadMIntersectorKPlueckerBase<4,K,filter>
+    {
+      __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {}
+      /*! Builds the 16-wide vertex sets and runs a single triangle test with the given epilog; returns that test's result. */
+      template<typename Epilog>
+      __forceinline bool intersect1(RayK<K>& ray, size_t k, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf16 vtx0(select(0x0f0f,vfloat16(v0.x),vfloat16(v2.x)), // first vertex: v0 or v2 chosen per lane by mask 0x0f0f
+                            select(0x0f0f,vfloat16(v0.y),vfloat16(v2.y)), // NOTE(review): presumably v0 where the mask bit is set, v2 elsewhere - confirm select() semantics
+                            select(0x0f0f,vfloat16(v0.z),vfloat16(v2.z)));
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf16 vtx1(vfloat16(v1.x),vfloat16(v1.y),vfloat16(v1.z)); // without culling all lanes use v1 ...
+        const Vec3vf16 vtx2(vfloat16(v3.x),vfloat16(v3.y),vfloat16(v3.z)); // ... and v3 in the same order
+#else
+        const Vec3vf16 vtx1(select(0x0f0f,vfloat16(v1.x),vfloat16(v3.x)), // with culling v1 and v3 trade places between the two lane groups
+                            select(0x0f0f,vfloat16(v1.y),vfloat16(v3.y)),
+                            select(0x0f0f,vfloat16(v1.z),vfloat16(v3.z)));
+        const Vec3vf16 vtx2(select(0x0f0f,vfloat16(v3.x),vfloat16(v1.x)),
+                            select(0x0f0f,vfloat16(v3.y),vfloat16(v1.y)),
+                            select(0x0f0f,vfloat16(v3.z),vfloat16(v1.z)));
+#endif
+        /* flags is the bitwise complement of the 0x0f0f vertex-select mask */
+        const vbool16 flags(0xf0f0);
+        return PlueckerIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog);
+      }
+      /*! Intersection query for ray k: wraps the packed test with an Intersect1KEpilogM built from geomID/primID widened to vuint8. */
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context, 
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                    const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion query for ray k: same packed test, but with an Occluded1KEpilogM. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,16,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#elif defined(__AVX__)
+
+    /*! Intersects 4 quads with 1 ray of a K-wide packet using AVX: both triangles of every quad are packed into one 8-wide Pluecker test. */
+    template<int K, bool filter>
+    struct QuadMIntersectorKPluecker<4,K,filter> : public QuadMIntersectorKPlueckerBase<4,K,filter>
+    {
+      __forceinline QuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+        : QuadMIntersectorKPlueckerBase<4,K,filter>(valid,ray) {}
+      /*! Builds the 8-wide vertex sets and runs a single triangle test with the given epilog; returns that test's result. */
+      template<typename Epilog>
+      __forceinline bool intersect1(RayK<K>& ray, size_t k, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); // first vertex: v0 paired with v2 (presumably low/high halves - confirm vfloat8 ctor)
+        const vbool8 flags(0,0,0,0,1,1,1,1); // four 0 entries followed by four 1 entries; presumably marks the second-triangle half - confirm lane order
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); // without culling both halves use v1 ...
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); // ... and v3 in the same order
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); // with culling v1 and v3 trade places between the halves
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        return PlueckerIntersector1KTriangleM::intersect1(ray,k,vtx0,vtx1,vtx2,flags,epilog); 
+      }
+      /*! Intersection query for ray k: wraps the packed test with an Intersect1KEpilogM built from geomID/primID widened to vuint8. */
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                    const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Intersect1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+      /*! Occlusion query for ray k: same packed test, but with an Occluded1KEpilogM. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const vuint4& geomID, const vuint4& primID) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,Occluded1KEpilogM<8,8,K,filter>(ray,k,context,vuint8(geomID),vuint8(primID)));
+      }
+    };
+
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadi.h b/thirdparty/embree-aarch64/kernels/geometry/quadi.h
new file mode 100644
index 0000000000..741ec519ab
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quadi.h
@@ -0,0 +1,483 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../common/scene.h"
+
+namespace embree
+{
+  /* Stores M quads from an indexed face set */
+  template <int M>
+  struct QuadMi
+  {
+    /* Virtual interface to query information about the quad type */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored quads */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+  public:
+
+    /* Default constructor */
+    __forceinline QuadMi() {  }
+
+    /* Construction from vertices and IDs */
+    __forceinline QuadMi(const vuint<M>& v0,
+                         const vuint<M>& v1,
+                         const vuint<M>& v2,
+                         const vuint<M>& v3,
+                         const vuint<M>& geomIDs,
+                         const vuint<M>& primIDs)
+#if defined(EMBREE_COMPACT_POLYS)
+      : geomIDs(geomIDs), primIDs(primIDs) {}
+#else
+     : v0_(v0),v1_(v1), v2_(v2), v3_(v3), geomIDs(geomIDs), primIDs(primIDs) {}
+#endif
+
+    /* Returns a mask that tells which quads are valid */
+    __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); }
+
+    /* Returns if the specified quad is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; }
+
+    /* Returns the number of stored quads */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs */
+    __forceinline       vuint<M>& geomID()       { return geomIDs; }
+    __forceinline const vuint<M>& geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); assert(geomIDs[i] != -1); return geomIDs[i]; }
+
+    /* Returns the primitive IDs */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Merges the bounds of every stored quad at time step itime */
+    __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const
+    {
+      BBox3fa box = empty;
+      for (size_t slot=0; slot<M; slot++) {
+        if (!valid(slot)) break; // stop at the first invalid slot
+        box.extend(scene->get<QuadMesh>(geomID(slot))->bounds(primID(slot),itime));
+      }
+      return box;
+    }
+
+    /* Calculate the linear bounds of the primitive from its bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime) {
+      return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1));
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps)
+    {
+      LBBox3fa merged = empty; // accumulates the linear bounds of every valid quad
+      for (size_t slot=0; slot<M; slot++)
+      {
+        if (!valid(slot)) break; // stop at the first invalid slot
+        merged.extend(scene->get<QuadMesh>(geomID(slot))->linearBounds(primID(slot), itime, numTimeSteps));
+      }
+      return merged;
+    }
+
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range)
+    {
+      LBBox3fa merged = empty; // accumulates the linear bounds of every valid quad
+      for (size_t slot=0; slot<M; slot++)
+      {
+        if (!valid(slot)) break; // stop at the first invalid slot
+        merged.extend(scene->get<QuadMesh>(geomID(slot))->linearBounds(primID(slot), time_range));
+      }
+      return merged;
+    }
+
+    /* Fill quad from quad list: takes up to M references from prims[begin,end), advancing begin past the ones consumed */
+    template<typename PrimRefT>
+    __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> geomID = -1, primID = -1;
+      const PrimRefT* prim = &prims[begin];
+      vuint<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero;
+
+      for (size_t i=0; i<M; i++)
+      {
+        if (begin<end) {
+          geomID[i] = prim->geomID();
+          primID[i] = prim->primID();
+#if !defined(EMBREE_COMPACT_POLYS)
+          const QuadMesh* mesh = scene->get<QuadMesh>(prim->geomID());
+          const QuadMesh::Quad& q = mesh->quad(prim->primID());
+          unsigned int_stride = mesh->vertices0.getStride()/4; // vertex stride in 4-byte units (assuming getStride() returns bytes)
+          v0[i] = q.v[0] * int_stride; // vertex index -> offset in 4-byte units
+          v1[i] = q.v[1] * int_stride;
+          v2[i] = q.v[2] * int_stride;
+          v3[i] = q.v[3] * int_stride;
+#endif
+          begin++;
+        } else {
+          assert(i); // slot 0 must always receive a real primitive
+          if (likely(i > 0)) {
+            geomID[i] = geomID[0]; // always valid geomIDs
+            primID[i] = -1;        // indicates invalid data
+            v0[i] = v0[0];         // padding slot: all four corners reuse slot 0's
+            v1[i] = v0[0];         // first vertex offset, giving a degenerate quad
+            v2[i] = v0[0];
+            v3[i] = v0[0];
+          }
+        }
+        if (begin<end) prim = &prims[begin]; // only re-point prim while references remain
+      }
+      new (this) QuadMi(v0,v1,v2,v3,geomID,primID); // placement-new over *this; FIXME: use non temporal store
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, itime);
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      fill(prims, begin, end, scene);
+      return linearBounds(scene, time_range);
+    }
+
+    friend embree_ostream operator<<(embree_ostream cout, const QuadMi& quad) {
+      return cout << "QuadMi<" << M << ">( "
+#if !defined(EMBREE_COMPACT_POLYS)
+                  << "v0 = " << quad.v0_ << ", v1 = " << quad.v1_ << ", v2 = " << quad.v2_ << ", v3 = " << quad.v3_ << ", "
+#endif
+                  << "geomID = " << quad.geomIDs << ", primID = " << quad.primIDs << " )";
+    }
+
+  protected:
+#if !defined(EMBREE_COMPACT_POLYS)
+    vuint<M> v0_;         // 4 byte offset of 1st vertex
+    vuint<M> v1_;         // 4 byte offset of 2nd vertex
+    vuint<M> v2_;         // 4 byte offset of 3rd vertex
+    vuint<M> v3_;         // 4 byte offset of 4th vertex
+#endif
+    vuint<M> geomIDs;    // geometry ID of mesh
+    vuint<M> primIDs;    // primitive ID of primitive inside mesh
+  };
+
+  namespace isa
+  {
+    
+  template<int M>
+    struct QuadMi : public embree::QuadMi<M>
+  {
+#if !defined(EMBREE_COMPACT_POLYS)
+    using embree::QuadMi<M>::v0_;
+    using embree::QuadMi<M>::v1_;
+    using embree::QuadMi<M>::v2_;
+    using embree::QuadMi<M>::v3_;
+#endif
+    using embree::QuadMi<M>::geomIDs;
+    using embree::QuadMi<M>::primIDs;
+    using embree::QuadMi<M>::geomID;
+    using embree::QuadMi<M>::primID;
+    using embree::QuadMi<M>::valid;
+    
+    template<int vid>
+    __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const
+    {
+#if defined(EMBREE_COMPACT_POLYS)
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+      const QuadMesh::Quad& quad = mesh->quad(primID(index));
+      return (Vec3f) mesh->vertices[0][quad.v[vid]];
+#else
+      const vuint<M>& v = getVertexOffset<vid>();
+      const float* vertices = scene->vertices[geomID(index)];
+      return (Vec3f&) vertices[v[index]];
+#endif
+    }
+
+    template<int vid, typename T>
+    __forceinline Vec3<T> getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const
+    {
+#if defined(EMBREE_COMPACT_POLYS)
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+      const QuadMesh::Quad& quad = mesh->quad(primID(index));
+      const Vec3fa v0 = mesh->vertices[itime+0][quad.v[vid]];
+      const Vec3fa v1 = mesh->vertices[itime+1][quad.v[vid]];
+#else
+      const vuint<M>& v = getVertexOffset<vid>();
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+      const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0);
+      const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1);
+      const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+      const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+      const Vec3<T> p0(v0.x,v0.y,v0.z);
+      const Vec3<T> p1(v1.x,v1.y,v1.z);
+      return lerp(p0,p1,ftime);
+    }
+
+    template<int vid, int K, typename T> // vertex vid of quad index, blended per ray lane using that lane's own time segment
+    __forceinline Vec3<T> getVertex(const vbool<K>& valid, const size_t index, const Scene *const scene, const vint<K>& itime, const T& ftime) const
+    {
+      Vec3<T> p0, p1; // only the lanes set in valid are written below
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+
+      for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask)) // visit each set lane i of valid
+      {
+#if defined(EMBREE_COMPACT_POLYS)
+        const QuadMesh::Quad& quad = mesh->quad(primID(index));
+        const Vec3fa v0 = mesh->vertices[itime[i]+0][quad.v[vid]];
+        const Vec3fa v1 = mesh->vertices[itime[i]+1][quad.v[vid]];
+#else
+        const vuint<M>& v = getVertexOffset<vid>();
+        const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0);
+        const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1);
+        const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+        const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+        p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z; // vertex at time step itime[i]
+        p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z; // vertex at time step itime[i]+1
+      }
+      return (T(one)-ftime)*p0 + ftime*p1; // linear blend between the two time steps
+    }
+
+    struct Quad {
+      vfloat4 v0,v1,v2,v3;
+    };
+
+#if defined(EMBREE_COMPACT_POLYS)
+    
+    __forceinline Quad loadQuad(const int i, const Scene* const scene) const 
+    {
+      const unsigned int geomID = geomIDs[i];
+      const unsigned int primID = primIDs[i];
+      if (unlikely(primID == -1)) return { zero, zero, zero, zero };
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID);
+      const QuadMesh::Quad& quad = mesh->quad(primID);
+      const vfloat4 v0 = (vfloat4) mesh->vertices0[quad.v[0]];
+      const vfloat4 v1 = (vfloat4) mesh->vertices0[quad.v[1]];
+      const vfloat4 v2 = (vfloat4) mesh->vertices0[quad.v[2]];
+      const vfloat4 v3 = (vfloat4) mesh->vertices0[quad.v[3]];
+      return { v0, v1, v2, v3 };
+    }
+
+    __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const 
+    {
+      const unsigned int geomID = geomIDs[i];
+      const unsigned int primID = primIDs[i];
+      if (unlikely(primID == -1)) return { zero, zero, zero, zero };
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID);
+      const QuadMesh::Quad& quad = mesh->quad(primID);
+      const vfloat4 v0 = (vfloat4) mesh->vertices[itime][quad.v[0]];
+      const vfloat4 v1 = (vfloat4) mesh->vertices[itime][quad.v[1]];
+      const vfloat4 v2 = (vfloat4) mesh->vertices[itime][quad.v[2]];
+      const vfloat4 v3 = (vfloat4) mesh->vertices[itime][quad.v[3]];
+      return { v0, v1, v2, v3 };
+    }
+    
+#else
+
+    __forceinline Quad loadQuad(const int i, const Scene* const scene) const 
+    {
+      const float* vertices = scene->vertices[geomID(i)];
+      const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]);
+      const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+      const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+      const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]);
+      return { v0, v1, v2, v3 };
+    }
+
+    __forceinline Quad loadQuad(const int i, const int itime, const Scene* const scene) const 
+    {
+      const unsigned int geomID = geomIDs[i];
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID);
+      const float* vertices = (const float*) mesh->vertexPtr(0,itime);
+      const vfloat4 v0 = vfloat4::loadu(vertices + v0_[i]);
+      const vfloat4 v1 = vfloat4::loadu(vertices + v1_[i]);
+      const vfloat4 v2 = vfloat4::loadu(vertices + v2_[i]);
+      const vfloat4 v3 = vfloat4::loadu(vertices + v3_[i]);
+      return { v0, v1, v2, v3 };
+    }
+    
+#endif
+
+    /* Gather the quads */
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              Vec3vf<M>& p3,
+                              const Scene *const scene) const;
+
+#if defined(__AVX512F__)
+    __forceinline void gather(Vec3vf16& p0,
+                              Vec3vf16& p1,
+                              Vec3vf16& p2,
+                              Vec3vf16& p3,
+                              const Scene *const scene) const;
+#endif
+
+    template<int K>
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019
+    __noinline
+#else
+    __forceinline
+#endif
+    void gather(const vbool<K>& valid,
+      Vec3vf<K>& p0,
+      Vec3vf<K>& p1,
+      Vec3vf<K>& p2,
+      Vec3vf<K>& p3,
+      const size_t index,
+      const Scene* const scene,
+      const vfloat<K>& time) const
+    {
+      const QuadMesh* mesh = scene->get<QuadMesh>(geomID(index));
+
+      vfloat<K> ftime;
+      const vint<K> itime = mesh->timeSegment(time, ftime);
+
+      const size_t first = bsf(movemask(valid));
+      if (likely(all(valid,itime[first] == itime)))
+      {
+        p0 = getVertex<0>(index, scene, itime[first], ftime);
+        p1 = getVertex<1>(index, scene, itime[first], ftime);
+        p2 = getVertex<2>(index, scene, itime[first], ftime);
+        p3 = getVertex<3>(index, scene, itime[first], ftime);
+      }
+      else
+      {
+        p0 = getVertex<0>(valid, index, scene, itime, ftime);
+        p1 = getVertex<1>(valid, index, scene, itime, ftime);
+        p2 = getVertex<2>(valid, index, scene, itime, ftime);
+        p3 = getVertex<3>(valid, index, scene, itime, ftime);
+      }
+    }
+
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              Vec3vf<M>& p3,
+                              const QuadMesh* mesh,
+                              const Scene *const scene,
+                              const int itime) const;
+
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              Vec3vf<M>& p3,
+                              const Scene *const scene,
+                              const float time) const;
+
+    /* Recomputes the bounding box of all valid quads from the vertices of mesh */
+    __forceinline BBox3fa update(QuadMesh* mesh)
+    {
+      BBox3fa box = empty;
+      for (size_t slot=0; slot<M && valid(slot); slot++)
+      {
+        const QuadMesh::Quad& quad = mesh->quad(primID(slot));
+        const BBox3fa b0(mesh->vertex(quad.v[0]));
+        const BBox3fa b1(mesh->vertex(quad.v[1]));
+        const BBox3fa b2(mesh->vertex(quad.v[2]));
+        const BBox3fa b3(mesh->vertex(quad.v[3]));
+
+        /* grow the running box by the merged box of the four corners */
+        box.extend(merge(b0,b1,b2,b3));
+      }
+      return box;
+    }
+
+  private:
+#if !defined(EMBREE_COMPACT_POLYS)
+    template<int N> const vuint<M>& getVertexOffset() const;
+#endif
+  };
+
+#if !defined(EMBREE_COMPACT_POLYS)
+  template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<0>() const { return v0_; }
+  template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<1>() const { return v1_; }
+  template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<2>() const { return v2_; }
+  template<> template<> __forceinline const vuint<4>& QuadMi<4>::getVertexOffset<3>() const { return v3_; }
+#endif
+
+  template<>
+  __forceinline void QuadMi<4>::gather(Vec3vf4& p0,
+                                       Vec3vf4& p1,
+                                       Vec3vf4& p2,
+                                       Vec3vf4& p3,
+                                       const Scene *const scene) const
+  {
+    /* prefetch this block at byte offsets 0 and 64 before loading the vertices */
+    char* const self = (char*)this; prefetchL1(self+0*64); prefetchL1(self+1*64);
+    const Quad q0 = loadQuad(0,scene), q1 = loadQuad(1,scene);
+    const Quad q2 = loadQuad(2,scene), q3 = loadQuad(3,scene);
+
+    /* transpose the four loaded quads corner by corner into p0..p3 */
+    transpose(q0.v0,q1.v0,q2.v0,q3.v0,p0.x,p0.y,p0.z);
+    transpose(q0.v1,q1.v1,q2.v1,q3.v1,p1.x,p1.y,p1.z);
+    transpose(q0.v2,q1.v2,q2.v2,q3.v2,p2.x,p2.y,p2.z);
+    transpose(q0.v3,q1.v3,q2.v3,q3.v3,p3.x,p3.y,p3.z);
+  }
+
+  template<>
+  __forceinline void QuadMi<4>::gather(Vec3vf4& p0,
+                                       Vec3vf4& p1,
+                                       Vec3vf4& p2,
+                                       Vec3vf4& p3,
+                                       const QuadMesh* mesh,
+                                       const Scene *const scene,
+                                       const int itime) const
+  {
+    // FIXME: for trianglei all geometries in a block are identical; is that the case here too?
+    // NOTE: mesh is not used here; loadQuad looks up the mesh of each slot from its geomID
+    const Quad q0 = loadQuad(0,itime,scene), q1 = loadQuad(1,itime,scene);
+    const Quad q2 = loadQuad(2,itime,scene), q3 = loadQuad(3,itime,scene);
+
+    /* transpose the four loaded quads corner by corner into p0..p3 */
+    transpose(q0.v0,q1.v0,q2.v0,q3.v0,p0.x,p0.y,p0.z);
+    transpose(q0.v1,q1.v1,q2.v1,q3.v1,p1.x,p1.y,p1.z);
+    transpose(q0.v2,q1.v2,q2.v2,q3.v2,p2.x,p2.y,p2.z);
+    transpose(q0.v3,q1.v3,q2.v3,q3.v3,p3.x,p3.y,p3.z);
+  }
+
+  template<>
+  __forceinline void QuadMi<4>::gather(Vec3vf4& p0,
+                                       Vec3vf4& p1,
+                                       Vec3vf4& p2,
+                                       Vec3vf4& p3,
+                                       const Scene *const scene,
+                                       const float time) const
+  {
+    /* in mblur mode all geometries are identical, so slot 0 selects the mesh */
+    const QuadMesh* mesh = scene->get<QuadMesh>(geomID(0));
+    float frac; const int seg = mesh->timeSegment(time, frac);
+    const vfloat4 t(frac);
+
+    Vec3vf4 lo0,lo1,lo2,lo3; gather(lo0,lo1,lo2,lo3,mesh,scene,seg);   // vertices at time step seg
+    Vec3vf4 hi0,hi1,hi2,hi3; gather(hi0,hi1,hi2,hi3,mesh,scene,seg+1); // vertices at time step seg+1
+    p0 = lerp(lo0,hi0,t);
+    p1 = lerp(lo1,hi1,t);
+    p2 = lerp(lo2,hi2,t);
+    p3 = lerp(lo3,hi3,t);
+  }
+  }
+
+  template<int M>
+  typename QuadMi<M>::Type QuadMi<M>::type;
+
+  typedef QuadMi<4> Quad4i;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h
new file mode 100644
index 0000000000..96cf7f1ca2
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quadi_intersector.h
@@ -0,0 +1,350 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quadi.h"
+#include "quad_intersector_moeller.h"
+#include "quad_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M quads with 1 ray */
+    template<int M, bool filter>
+    struct QuadMiIntersector1Moeller
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
+
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M quads with K rays. */
+    template<int M, int K, bool filter>
+    struct QuadMiIntersectorKMoeller
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        Scene* scene = context->scene;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+          const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+          const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+          const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        Scene* scene = context->scene;
+        vbool<K> valid0 = valid_i;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+          const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+          const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+          const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersects ray k of the packet with the M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); // sized by M like the sibling intersectors (same type as Vec3vf4 for M=4)
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Tests if ray k of the packet is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); // sized by M like the sibling intersectors (same type as Vec3vf4 for M=4)
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+    };
+
+    /*! Intersects M quads with 1 ray */
+    template<int M, bool filter>
+    struct QuadMiIntersector1Pluecker
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
+
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M quads with K rays. */
+    template<int M, int K, bool filter>
+    struct QuadMiIntersectorKPluecker
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        Scene* scene = context->scene;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+          const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+          const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+          const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        Scene* scene = context->scene;
+        vbool<K> valid0 = valid_i;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = quad.template getVertex<0>(i,scene);
+          const Vec3vf<K> p1 = quad.template getVertex<1>(i,scene);
+          const Vec3vf<K> p2 = quad.template getVertex<2>(i,scene);
+          const Vec3vf<K> p3 = quad.template getVertex<3>(i,scene);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersects ray k of the packet with the M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); // sized by M like the sibling intersectors (same type as Vec3vf4 for M=4)
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Tests if ray k of the packet is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene); // sized by M like the sibling intersectors (same type as Vec3vf4 for M=4)
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+    };
+
+    /*! Intersects M motion blur quads with 1 ray */
+    template<int M, bool filter>
+    struct QuadMiMBIntersector1Moeller
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
+
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
+        pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
+        return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M motion blur quads with K rays. */
+    template<int M, int K, bool filter>
+    struct QuadMiMBIntersectorKMoeller
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          Vec3vf<K> v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time());
+          pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        vbool<K> valid0 = valid_i;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          Vec3vf<K> v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time());
+          if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect a ray with M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+    };
+
+    /*! Intersects M motion blur quads with 1 ray */
+    template<int M, bool filter>
+    struct QuadMiMBIntersector1Pluecker
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
+
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
+        pre.intersect(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time());
+        return pre.occluded(ray,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M motion blur quads with K rays. */
+    template<int M, int K, bool filter>
+    struct QuadMiMBIntersectorKPluecker
+    {
+      typedef QuadMi<M> Primitive;
+      typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          Vec3vf<K> v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time());
+          pre.intersectK(valid_i,ray,v0,v1,v2,v3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        vbool<K> valid0 = valid_i;
+        for (size_t i=0; i<QuadMi<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          Vec3vf<K> v0,v1,v2,v3; quad.gather(valid_i,v0,v1,v2,v3,i,context->scene,ray.time());
+          if (pre.intersectK(valid0,ray,v0,v1,v2,v3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect a ray with M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if the ray is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMi<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0,v1,v2,v3; quad.gather(v0,v1,v2,v3,context->scene,ray.time()[k]);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,quad.geomID(),quad.primID());
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadv.h b/thirdparty/embree-aarch64/kernels/geometry/quadv.h
new file mode 100644
index 0000000000..0a1fe4d128
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quadv.h
@@ -0,0 +1,165 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  /* Stores the vertices of M quads in struct of array layout, plus one geometry ID and one primitive ID per quad. Slots that hold no quad carry the ID -1. */
+  template <int M>
+  struct QuadMv
+  { 
+  public:
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* Returns maximum number of stored quads */
+    static __forceinline size_t max_size() { return M; }
+    
+    /* Returns required number of primitive blocks for N primitives (N/M rounded up) */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+   
+  public:
+
+    /* Default constructor */
+    __forceinline QuadMv() {}
+
+    /* Construction from vertices and IDs */
+    __forceinline QuadMv(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const vuint<M>& geomIDs, const vuint<M>& primIDs)
+      : v0(v0), v1(v1), v2(v2), v3(v3), geomIDs(geomIDs), primIDs(primIDs) {}
+    
+    /* Returns a mask that tells which quads are valid (geometry ID different from -1) */
+    __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+    /* Returns true if the specified quad is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+
+    /* Returns the number of stored quads */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs */
+    __forceinline       vuint<M>& geomID()       { return geomIDs; }
+    __forceinline const vuint<M>& geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs */
+    __forceinline       vuint<M> primID()       { return primIDs; }
+    __forceinline const vuint<M> primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the bounds of the quads; invalid slots are replaced by +inf/-inf so they do not affect the reduction */
+    __forceinline BBox3fa bounds() const 
+    {
+      Vec3vf<M> lower = min(v0,v1,v2,v3);
+      Vec3vf<M> upper = max(v0,v1,v2,v3);
+      vbool<M> mask = valid();
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+                     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+    
+    /* Non temporal store of all vertex components and both ID vectors from src into dst */
+    __forceinline static void store_nt(QuadMv* dst, const QuadMv& src)
+    {
+      vfloat<M>::store_nt(&dst->v0.x,src.v0.x);
+      vfloat<M>::store_nt(&dst->v0.y,src.v0.y);
+      vfloat<M>::store_nt(&dst->v0.z,src.v0.z);
+      vfloat<M>::store_nt(&dst->v1.x,src.v1.x);
+      vfloat<M>::store_nt(&dst->v1.y,src.v1.y);
+      vfloat<M>::store_nt(&dst->v1.z,src.v1.z);
+      vfloat<M>::store_nt(&dst->v2.x,src.v2.x);
+      vfloat<M>::store_nt(&dst->v2.y,src.v2.y);
+      vfloat<M>::store_nt(&dst->v2.z,src.v2.z);
+      vfloat<M>::store_nt(&dst->v3.x,src.v3.x);
+      vfloat<M>::store_nt(&dst->v3.y,src.v3.y);
+      vfloat<M>::store_nt(&dst->v3.z,src.v3.z);
+      vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+      vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+    }
+
+    /* Fill quad from quad list: consumes up to M primitives from prims[begin,end) and advances begin; slots that are not filled keep ID -1 and zero vertices */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero;
+      
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+        const PrimRef& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const QuadMesh* __restrict__ const mesh = scene->get<QuadMesh>(geomID);
+        const QuadMesh::Quad& quad = mesh->quad(primID);
+        const Vec3fa& p0 = mesh->vertex(quad.v[0]);
+        const Vec3fa& p1 = mesh->vertex(quad.v[1]);
+        const Vec3fa& p2 = mesh->vertex(quad.v[2]);
+        const Vec3fa& p3 = mesh->vertex(quad.v[3]);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+        v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z;
+      }
+      QuadMv::store_nt(this,QuadMv(v0,v1,v2,v3,vgeomID,vprimID));
+    }
+
+    /* Updates the primitive: re-reads the vertices of every stored quad from mesh, rebuilds this block in place and returns the merged bounds of the quads read */
+    __forceinline BBox3fa update(QuadMesh* mesh)
+    {
+      BBox3fa bounds = empty;
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero, v3 = zero; // v3 must be a local too: otherwise the loop writes the member and the placement new below copies v3 from itself
+
+      for (size_t i=0; i<M; i++)
+      {
+        if (primID(i) == -1) break; // first unused slot: everything after it stays at ID -1 / zero vertices
+        const unsigned geomId = geomID(i);
+        const unsigned primId = primID(i);
+        const QuadMesh::Quad& quad = mesh->quad(primId);
+        const Vec3fa p0 = mesh->vertex(quad.v[0]);
+        const Vec3fa p1 = mesh->vertex(quad.v[1]);
+        const Vec3fa p2 = mesh->vertex(quad.v[2]);
+        const Vec3fa p3 = mesh->vertex(quad.v[3]);
+        bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2),BBox3fa(p3)));
+        vgeomID [i] = geomId;
+        vprimID [i] = primId;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+        v3.x[i] = p3.x; v3.y[i] = p3.y; v3.z[i] = p3.z;
+      }
+      new (this) QuadMv(v0,v1,v2,v3,vgeomID,vprimID); // all arguments are locals, so nothing is read from the storage being overwritten
+      return bounds;
+    }
+   
+  public:
+    Vec3vf<M> v0;      // 1st vertex of the quads
+    Vec3vf<M> v1;      // 2nd vertex of the quads
+    Vec3vf<M> v2;      // 3rd vertex of the quads
+    Vec3vf<M> v3;      // 4th vertex of the quads
+  private:
+    vuint<M> geomIDs; // geometry ID
+    vuint<M> primIDs; // primitive ID
+  };
+
+  template<int M>
+  typename QuadMv<M>::Type QuadMv<M>::type;
+
+  typedef QuadMv<4> Quad4v;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h
new file mode 100644
index 0000000000..30a24b291a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/quadv_intersector.h
@@ -0,0 +1,181 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "quadv.h"
+#include "quad_intersector_moeller.h"
+#include "quad_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M quads with 1 ray, delegating to QuadMIntersector1MoellerTrumbore. The quad vertices are read directly from the QuadMv block. */
+    template<int M, bool filter>
+    struct QuadMvIntersector1Moeller
+    {
+      typedef QuadMv<M> Primitive;
+      typedef QuadMIntersector1MoellerTrumbore<M,filter> Precalculations;
+        
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+        
+      /*! Test if the ray is occluded by one of M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+      /*! Forwards the point query for this primitive block to PrimitivePointQuery1<Primitive> and returns its result. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M quads with K rays, delegating to QuadMIntersectorKMoellerTrumbore. */
+    template<int M, int K, bool filter>
+    struct QuadMvIntersectorKMoeller
+    {
+      typedef QuadMv<M> Primitive;
+      typedef QuadMIntersectorKMoellerTrumbore<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads, one quad at a time. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); // vertices of quad i as K-wide vectors
+          const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+          const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+          const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads. Returns !valid0, where valid0 starts out as valid_i. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        vbool<K> valid0 = valid_i; // working mask; presumably OccludedKEpilogM clears the lanes of occluded rays -- TODO confirm
+
+        for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); // vertices of quad i as K-wide vectors
+          const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+          const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+          const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break; // intersectK returned true: the remaining quads are skipped
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect ray k of the packet with M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if ray k of the packet is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+    };
+
+    /*! Intersects M quads with 1 ray, delegating to QuadMIntersector1Pluecker. The quad vertices are read directly from the QuadMv block. */
+    template<int M, bool filter>
+    struct QuadMvIntersector1Pluecker
+    {
+      typedef QuadMv<M> Primitive;
+      typedef QuadMIntersector1Pluecker<M,filter> Precalculations;
+        
+      /*! Intersect a ray with the M quads and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+        
+      /*! Test if the ray is occluded by one of M quads. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.occluded(ray,context, quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+      /*! Forwards the point query for this primitive block to PrimitivePointQuery1<Primitive> and returns its result. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& quad)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, quad);
+      }
+    };
+
+    /*! Intersects M quads with K rays, delegating to QuadMIntersectorKPluecker. */
+    template<int M, int K, bool filter>
+    struct QuadMvIntersectorKPluecker
+    {
+      typedef QuadMv<M> Primitive;
+      typedef QuadMIntersectorKPluecker<M,K,filter> Precalculations;
+
+      /*! Intersects K rays with M quads, one quad at a time. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); // vertices of quad i as K-wide vectors
+          const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+          const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+          const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,IntersectKEpilogM<M,K,filter>(ray,context,quad.geomID(),quad.primID(),i));
+        }
+      }
+
+      /*! Test for K rays if they are occluded by any of the M quads. Returns !valid0, where valid0 starts out as valid_i. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        vbool<K> valid0 = valid_i; // working mask; presumably OccludedKEpilogM clears the lanes of occluded rays -- TODO confirm
+
+        for (size_t i=0; i<QuadMv<M>::max_size(); i++)
+        {
+          if (!quad.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> p0 = broadcast<vfloat<K>>(quad.v0,i); // vertices of quad i as K-wide vectors
+          const Vec3vf<K> p1 = broadcast<vfloat<K>>(quad.v1,i);
+          const Vec3vf<K> p2 = broadcast<vfloat<K>>(quad.v2,i);
+          const Vec3vf<K> p3 = broadcast<vfloat<K>>(quad.v3,i);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,OccludedKEpilogM<M,K,filter>(valid0,ray,context,quad.geomID(),quad.primID(),i)))
+            break; // intersectK returned true: the remaining quads are skipped
+        }
+        return !valid0;
+      }
+      
+      /*! Intersect ray k of the packet with M quads and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+
+      /*! Test if ray k of the packet is occluded by one of the M quads. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const QuadMv<M>& quad)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.occluded1(ray,k,context,quad.v0,quad.v1,quad.v2,quad.v3,quad.geomID(),quad.primID());
+      }
+    };
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h
new file mode 100644
index 0000000000..cdf68f486b
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/roundline_intersector.h
@@ -0,0 +1,710 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "curve_intersector_precalculations.h"
+
+
+/*
+  
+  This file implements the intersection of a ray with a round linear
+  curve segment. We define the geometry of such a round linear curve
+  segment from point p0 with radius r0 to point p1 with radius r1
+  using the cone that touches spheres p0/r0 and p1/r1 tangentially
+  plus the sphere p1/r1. We denote the tangentially touching cone from
+  p0/r0 to p1/r1 with cone(p0,r0,p1,r1) and the cone plus the ending
+  sphere with cone_sphere(p0,r0,p1,r1).
+
+  For multiple connected round linear curve segments this construction
+  yields a proper shape when viewed from the outside. Using the
+  following CSG we can also handle the interior in most common cases:
+
+     round_linear_curve(pl,rl,p0,r0,p1,r1,pr,rr) =
+       cone_sphere(p0,r0,p1,r1) - cone(pl,rl,p0,r0) - cone(p1,r1,pr,rr)
+
+  Thus by subtracting the neighboring cone geometries, we cut away
+  parts of the center cone_sphere surface which lie inside the
+  combined curve. This approach works as long as geometry of the
+  current cone_sphere penetrates into direct neighbor segments only,
+  and not into segments further away.
+  
+  To construct a cone that touches two spheres at p0 and p1 with r0
+  and r1, one has to increase the cone radius at r0 and r1 to obtain
+  larger radii w0 and w1, such that the infinite cone properly touches
+  the spheres.  From the paper "Ray Tracing Generalized Tube
+  Primitives: Method and Applications"
+  (https://www.researchgate.net/publication/334378683_Ray_Tracing_Generalized_Tube_Primitives_Method_and_Applications)
+  one can derive the following equations for these increased
+  radii:
+
+     sr = 1.0f / sqrt(1-sqr(dr)/sqr(p1-p0))
+     w0 = sr*r0
+     w1 = sr*r1
+
+  Further, we want the cone to start where it touches the sphere at p0
+  and to end where it touches sphere at p1.  Therefore, we need to
+  construct clipping locations y0 and y1 for the start and end of the
+  cone. These start and end clipping location of the cone can get
+  calculated as:
+
+     Y0 =               - r0 * (r1-r0) / length(p1-p0)
+     Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0)
+
+  Where the cone starts a distance Y0 and ends a distance Y1 away of
+  point p0 along the cone center. The distance between Y1-Y0 can get
+  calculated as:
+
+    dY = length(p1-p0) - (r1-r0)^2 / length(p1-p0)
+
+  In the code below, Y will always be scaled by length(p1-p0) to
+  obtain y and you will find the terms r0*(r1-r0) and
+  (p1-p0)^2-(r1-r0)^2.
+
+ */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+      struct RoundLineIntersectorHitM // hit record for M lanes: u, v, t and geometry normal, one vector per quantity
+      {
+        __forceinline RoundLineIntersectorHitM() {}
+        /*! Stores the given u, v, t and geometry normal vectors unchanged. */
+        __forceinline RoundLineIntersectorHitM(const vfloat<M>& u, const vfloat<M>& v, const vfloat<M>& t, const Vec3vf<M>& Ng)
+          : vu(u), vv(v), vt(t), vNg(Ng) {}
+        /*! No-op; presumably present because callers expect a finalize() on hit records -- TODO confirm. */
+        __forceinline void finalize() {}
+        /*! Scalar accessors: return the values stored in lane i. */
+        __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); }
+        __forceinline float t  (const size_t i) const { return vt[i]; }
+        __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+        /* Hit data, one entry per lane. */
+      public:
+        vfloat<M> vu;
+        vfloat<M> vv;
+        vfloat<M> vt;
+        Vec3vf<M> vNg;
+      };
+    
+    namespace __roundline_internal
+    {
+      template<int M>
+        struct ConeGeometry // precomputed terms of the cone between sphere a (center p0, radius r0) and sphere b (center p1, radius r1), for M lanes
+        {
+          ConeGeometry (const Vec4vf<M>& a, const Vec4vf<M>& b) // xyz holds the sphere center, w its radius
+          : p0(a.xyz()), p1(b.xyz()), dP(p1-p0), dPdP(dot(dP,dP)), r0(a.w), sqr_r0(sqr(r0)), r1(b.w), dr(r1-r0), drdr(dr*dr), r0dr (r0*dr), g(dPdP - drdr) {}
+          
+          /* 
+             
+             This function tests if a point is accepted by first cone
+             clipping plane.
+
+             First, we need to project the point onto the line p0->p1:
+             
+               Y = (p-p0)*(p1-p0)/length(p1-p0)
+             
+             This value Y is the distance to the projection point from
+             p0. The clip distances are calculated as:
+             
+               Y0 =               - r0 * (r1-r0) / length(p1-p0)
+               Y1 = length(p1-p0) - r1 * (r1-r0) / length(p1-p0)
+             
+             Thus to test if the point p is accepted by the first
+             clipping plane we need to test Y > Y0 and to test if it
+             is accepted by the second clipping plane we need to test
+             Y < Y1.
+             
+             By multiplying the calculations with length(p1-p0) these
+             calculation can get simplified to:
+             
+               y = (p-p0)*(p1-p0)
+               y0 =           - r0 * (r1-r0)
+               y1 = (p1-p0)^2 - r1 * (r1-r0)
+
+             and the test y > y0 and y < y1.
+             
+          */
+          
+          __forceinline vbool<M> isClippedByPlane (const vbool<M>& valid_i, const Vec3vf<M>& p) const
+          {
+            const Vec3vf<M> p0p = p - p0;
+            const vfloat<M> y = dot(p0p,dP);
+            const vfloat<M> cap0 = -r0dr; // y0 = -r0*(r1-r0)
+            const vbool<M> inside_cone = y > cap0; // only the first clipping plane is tested here
+            return valid_i & (p0.x != vfloat<M>(inf)) & (p1.x != vfloat<M>(inf)) & inside_cone; // lanes with an end point at x == inf never report true; presumably inf marks a missing segment -- TODO confirm
+          }
+          
+          /* 
+             
+             This function tests whether a point lies inside the capped cone
+             tangential to its ending spheres.
+
+             Therefore one has to check if the point is inside the
+             region defined by the cone clipping planes, which is
+             performed similar as in the previous function.
+             
+             To perform the inside cone test we need to project the
+             point onto the line p0->p1:
+             
+               dP = p1-p0
+               Y = (p-p0)*dP/length(dP)
+                           
+             This value Y is the distance to the projection point from
+             p0. To obtain a parameter value u going from 0 to 1 along
+             the line p0->p1 we calculate:
+             
+               U = Y/length(dP)
+             
+             The radii to use at points p0 and p1 are:
+             
+               w0 = sr * r0
+               w1 = sr * r1
+               dw = w1-w0
+             
+             Using these radii and u one can directly test if the point
+             lies inside the cone using the formula py*py < wy*wy with:
+             
+               wy = w0 + u*dw
+               py = p0 + u*dP - p
+                          
+             By multiplying the calculations with length(p1-p0) and
+             inserting the definition of w one can obtain simpler equations:
+             
+               y = (p-p0)*dP
+               ry = r0 + y/dP^2 * dr
+               wy = sr*ry        
+               py = p0 + y/dP^2*dP - p
+               y0 =      - r0 * dr
+               y1 = dP^2 - r1 * dr
+             
+             Thus for the in-cone test we get:
+             
+                    py^2 < wy^2
+               <=>  py^2 < sr^2 * ry^2
+               <=>  py^2 * ( dP^2 - dr^2 ) < dP^2 * ry^2
+             
+             This can further get simplified to:
+             
+               (p0-p)^2 * (dP^2 - dr^2) - y^2 < dP^2 * r0^2 + 2.0f*r0*dr*y;            
+                      
+          */
+          
+          __forceinline vbool<M> isInsideCappedCone (const vbool<M>& valid_i, const Vec3vf<M>& p) const
+          {
+            const Vec3vf<M> p0p = p - p0;
+            const vfloat<M> y = dot(p0p,dP);
+            const vfloat<M> cap0 = -r0dr+vfloat<M>(ulp); // y0 = -r0*dr, offset by ulp
+            const vfloat<M> cap1 = -r1*dr + dPdP; // y1 = dP^2 - r1*dr
+            
+            vbool<M> inside_cone = valid_i & (p0.x != vfloat<M>(inf)) & (p1.x != vfloat<M>(inf)); // lanes with an end point at x == inf are rejected
+            inside_cone &= y > cap0;  // start clipping plane
+            inside_cone &= y < cap1;  // end clipping plane 
+            inside_cone &= sqr(p0p)*g - sqr(y) < dPdP * sqr_r0 + 2.0f*r0dr*y; // in cone test
+            return inside_cone;
+          }
+          
+        protected:
+          Vec3vf<M> p0; // center of the start sphere (a.xyz)
+          Vec3vf<M> p1; // center of the end sphere (b.xyz)
+          Vec3vf<M> dP; // p1-p0
+          vfloat<M> dPdP; // dot(dP,dP)
+          vfloat<M> r0; // radius of the start sphere (a.w)
+          vfloat<M> sqr_r0; // r0^2
+          vfloat<M> r1; // radius of the end sphere (b.w)
+          vfloat<M> dr; // r1-r0
+          vfloat<M> drdr; // dr^2
+          vfloat<M> r0dr; // r0*dr
+          vfloat<M> g; // dP^2 - dr^2
+        };
+      
+      template<int M>
+        struct ConeGeometryIntersector : public ConeGeometry<M>
+      {
+        using ConeGeometry<M>::p0;
+        using ConeGeometry<M>::p1;
+        using ConeGeometry<M>::dP;
+        using ConeGeometry<M>::dPdP;
+        using ConeGeometry<M>::r0;
+        using ConeGeometry<M>::sqr_r0;
+        using ConeGeometry<M>::r1;
+        using ConeGeometry<M>::dr;
+        using ConeGeometry<M>::r0dr;
+        using ConeGeometry<M>::g;
+        
+        ConeGeometryIntersector (const Vec3vf<M>& ray_org, const Vec3vf<M>& ray_dir, const vfloat<M>& dOdO, const vfloat<M>& rcp_dOdO, const Vec4vf<M>& a, const Vec4vf<M>& b)
+          : ConeGeometry<M>(a,b), org(ray_org), O(ray_org-p0), dO(ray_dir),  dOdO(dOdO), rcp_dOdO(rcp_dOdO), OdP(dot(dP,O)), dOdP(dot(dP,dO)),  yp(OdP + r0dr) {}
+        
+        /*
+          
+          This function intersects a ray with a cone that touches a
+          start sphere p0/r0 and end sphere p1/r1.
+          
+          To find this ray/cone intersections one could just
+          calculate radii w0 and w1 as described above and use a
+          standard ray/cone intersection routine with these
+          radii. However, it turns out that calculations can get
+          simplified when deriving a specialized ray/cone
+          intersection for this special case. We perform
+          calculations relative to the cone origin p0 and define:
+            
+            O  = ray_org - p0
+            dO = ray_dir
+            dP = p1-p0
+            dr = r1-r0
+            dw = w1-w0
+            
+          For some t we can compute the potential hit point h = O + t*dO and
+          project it onto the cone vector dP to obtain u = (h*dP)/(dP*dP). In
+          case of an intersection, the squared distance from the hit point
+          projected onto the cone center line to the hit point should be equal
+          to the squared cone radius at u:
+            
+            (u*dP - h)^2 = (w0 + u*dw)^2
+           
+          Inserting the definition of h, u, w0, and dw into this formula, then
+          factoring out all terms, and sorting by t^2, t^1, and t^0 terms
+          yields a quadratic equation to solve.
+            
+          Inserting u:
+            ( (h*dP)*dP/dP^2 - h )^2 = ( w0 + (h*dP)*dw/dP^2 )^2
+            
+          Multiplying by dP^4:
+            ( (h*dP)*dP - h*dP^2 )^2 = ( w0*dP^2 + (h*dP)*dw )^2
+            
+          Inserting w0 and dw:
+            ( (h*dP)*dP - h*dP^2 )^2 = ( r0*dP^2 + (h*dP)*dr )^2 / (1-dr^2/dP^2)
+            ( (h*dP)*dP - h*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (h*dP)*dr )^2
+            
+          Now one can insert the definition of h, factor out, and presort by t:
+            ( ((O + t*dO)*dP)*dP - (O + t*dO)*dP^2 )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + ((O + t*dO)*dP)*dr )^2
+            ( (O*dP)*dP-O*dP^2 + t*( (dO*dP)*dP - dO*dP^2 ) )^2 *(dP^2 - dr^2) = dP^2 * ( r0*dP^2 + (O*dP)*dr + t*(dO*dP)*dr )^2
+            
+          Factoring out further and sorting by t^2, t^1 and t^0 yields:
+            
+            0 =   t^2 * [ ((dO*dP)*dP - dO*dP^2)^2 * (dP^2 - dr^2) - dP^2*(dO*dP)^2*dr^2 ]
+              + 2*t^1 * [ ((O*dP)*dP - O*dP^2) * ((dO*dP)*dP - dO*dP^2) * (dP^2 - dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)*(dO*dP)*dr ]
+              +   t^0 * [ ( (O*dP)*dP - O*dP^2)^2 * (dP^2-dr^2) - dP^2*(r0*dP^2 + (O*dP)*dr)^2 ]
+            
+          This can be simplified to:
+            
+             0 =   t^2 * [ (dP^2 - dr^2)*dO^2 - (dO*dP)^2 ]
+               + 2*t^1 * [ (dP^2 - dr^2)*(O*dO) - (dO*dP)*(O*dP + r0*dr) ]
+               +   t^0 * [ (dP^2 - dr^2)*O^2 - (O*dP)^2 - r0^2*dP^2 - 2.0f*r0*dr*(O*dP) ]
+            
+          Solving this quadratic equation yields the values for t at which the
+          ray intersects the cone.
+          
+        */
+        
+        __forceinline bool intersectCone(vbool<M>& valid, vfloat<M>& lower, vfloat<M>& upper)
+        {
+          /* return no hit by default */
+          lower = pos_inf;
+          upper = neg_inf;
+          
+          /* compute quadratic equation A*t^2 + B*t + C = 0 */
+          const vfloat<M> OO = dot(O,O);
+          const vfloat<M> OdO = dot(dO,O);
+          const vfloat<M> A = g * dOdO - sqr(dOdP);
+          const vfloat<M> B = 2.0f * (g*OdO - dOdP*yp);
+          const vfloat<M> C = g*OO - sqr(OdP) - sqr_r0*dPdP - 2.0f*r0dr*OdP;
+          
+          /* we miss the cone if determinant is smaller than zero */
+          const vfloat<M> D = B*B - 4.0f*A*C;
+          valid &= (D >= 0.0f & g > 0.0f);  // if g <= 0 then the cone is inside a sphere end
+          
+          /* When rays are parallel to the cone surface, then the
+           * ray may be inside or outside the cone. We just assume a
+           * miss in that case, which is fine as rays inside the
+           * cone would anyway hit the ending spheres in that
+           * case. */
+          valid &= abs(A) > min_rcp_input;
+          if (unlikely(none(valid))) {
+            return false;
+          }
+          
+          /* compute distance to front and back hit */
+          const vfloat<M> Q = sqrt(D);
+          const vfloat<M> rcp_2A = rcp(2.0f*A);
+          t_cone_front = (-B-Q)*rcp_2A;
+          y_cone_front = yp + t_cone_front*dOdP;
+          lower = select( (y_cone_front > -(float)ulp) & (y_cone_front <= g) & (g > 0.0f), t_cone_front, vfloat<M>(pos_inf));
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+          t_cone_back = (-B+Q)*rcp_2A;
+          y_cone_back  = yp + t_cone_back *dOdP;
+          upper = select( (y_cone_back  > -(float)ulp) & (y_cone_back  <= g) & (g > 0.0f), t_cone_back , vfloat<M>(neg_inf));
+#endif          
+          return true;
+        }
+        
+        /* 
+           This function intersects the ray with the end sphere at
+           p1. We already clip away hits that are inside the
+           neighboring cone segment.
+           
+        */
+        
+        __forceinline void intersectEndSphere(vbool<M>& valid, 
+                                              const ConeGeometry<M>& coneR, 
+                                              vfloat<M>& lower, vfloat<M>& upper)
+        {
+          /* calculate front and back hit with end sphere */
+          const Vec3vf<M> O1 = org - p1;
+          const vfloat<M> O1dO = dot(O1,dO);
+          const vfloat<M> h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r1));
+          const vfloat<M> rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat<M>(neg_inf) );
+          
+          /* clip away front hit if it is inside next cone segment */
+          t_sph1_front = (-O1dO - rhs1)*rcp_dOdO;
+          const Vec3vf<M> hit_front = org + t_sph1_front*dO;
+          vbool<M> valid_sph1_front = h2 >= 0.0f & yp + t_sph1_front*dOdP > g & !coneR.isClippedByPlane (valid, hit_front);
+          lower = select(valid_sph1_front, t_sph1_front, vfloat<M>(pos_inf));
+          
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          /* clip away back hit if it is inside next cone segment */
+          t_sph1_back  = (-O1dO + rhs1)*rcp_dOdO;
+          const Vec3vf<M> hit_back = org + t_sph1_back*dO;
+          vbool<M> valid_sph1_back  = h2 >= 0.0f & yp + t_sph1_back*dOdP > g & !coneR.isClippedByPlane (valid, hit_back);
+          upper = select(valid_sph1_back, t_sph1_back,  vfloat<M>(neg_inf));
+#else
+          upper = vfloat<M>(neg_inf);
+#endif
+        }
+
+        __forceinline void intersectBeginSphere(const vbool<M>& valid, 
+                                                vfloat<M>& lower, vfloat<M>& upper)
+        {
+          /* calculate front and back hit with begin sphere */
+          const Vec3vf<M> O1 = org - p0;
+          const vfloat<M> O1dO = dot(O1,dO);
+          const vfloat<M> h2 = sqr(O1dO) - dOdO*(sqr(O1) - sqr(r0));
+          const vfloat<M> rhs1 = select( h2 >= 0.0f, sqrt(h2), vfloat<M>(neg_inf) );
+          
+          /* clip away front hit if it is inside next cone segment */
+          t_sph0_front = (-O1dO - rhs1)*rcp_dOdO;
+          vbool<M> valid_sph1_front = valid & h2 >= 0.0f & yp + t_sph0_front*dOdP < 0;
+          lower = select(valid_sph1_front, t_sph0_front, vfloat<M>(pos_inf));
+
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          /* clip away back hit if it is inside next cone segment */
+          t_sph0_back  = (-O1dO + rhs1)*rcp_dOdO;
+          vbool<M> valid_sph1_back  = valid & h2 >= 0.0f & yp + t_sph0_back*dOdP < 0;
+          upper = select(valid_sph1_back, t_sph0_back,  vfloat<M>(neg_inf));
+#else   
+          upper = vfloat<M>(neg_inf);
+#endif
+        }
+        
+        /* 
+           
+           This function calculates the geometry normal of some cone hit.
+           
+           For a given hit point h (relative to p0) with a cone
+           starting at p0 with radius w0 and ending at p1 with
+           radius w1 one normally calculates the geometry normal by
+           first calculating the parametric u hit location along the
+           cone:
+           
+             u = dot(h,dP)/dP^2
+           
+           Using this value one can now directly calculate the
+           geometry normal by bending the connection vector (h-u*dP)
+           from hit to projected hit with some cone dependent value
+           dw/sqrt(dP^2) * normalize(dP):
+           
+             Ng = normalize(h-u*dP) - dw/length(dP) * normalize(dP)
+           
+           The length of the vector (h-u*dP) can also get calculated
+           by interpolating the radii as w0+u*dw which yields:
+           
+             Ng = (h-u*dP)/(w0+u*dw) - dw/dP^2 * dP
+           
+           Multiplying with (w0+u*dw) yields a scaled Ng':
+           
+             Ng' = (h-u*dP) - (w0+u*dw)*dw/dP^2*dP
+           
+           Inserting the definition of w0 and dw and refactoring
+           yields a further scaled Ng'':
+           
+             Ng'' = (dP^2 - dr^2) (h-q) - (r0+u*dr)*dr*dP
+           
+           Now inserting the definition of u and multiplying
+           with the denominator yields:
+           
+             Ng''' = (dP^2-dr^2)*(dP^2*h-dot(h,dP)*dP) - (dP^2*r0+dot(h,dP)*dr)*dr*dP
+           
+           Factoring out, cancelling terms, dividing by dP^2, and
+           factoring again yields finally:
+           
+             Ng'''' = (dP^2-dr^2)*h - dP*(dot(h,dP) + r0*dr)
+           
+        */
+        
+        __forceinline Vec3vf<M> Ng_cone(const vbool<M>& front_hit) const  // returns the scaled (not normalized) cone normal g*h - dP*y; front_hit selects front or back hit per lane
+        {
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          const vfloat<M> y = select(front_hit, y_cone_front, y_cone_back);
+          const vfloat<M> t = select(front_hit, t_cone_front, t_cone_back);
+          const Vec3vf<M> h = O + t*dO;  // hit point h; presumably relative to p0 as in the derivation comment -- confirm how O is set
+          return g*h-dP*y;
+#else
+          const Vec3vf<M> h = O + t_cone_front*dO;  // only the front hit is stored in this configuration; front_hit is unused
+          return g*h-dP*y_cone_front;
+#endif
+        }
+        
+        /* compute geometry normal of an end sphere hit as the difference
+         * vector from the sphere center p1 to the hit point (not normalized) */
+        
+        __forceinline Vec3vf<M> Ng_sphere1(const vbool<M>& front_hit) const
+        {
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          const vfloat<M> t_sph1 = select(front_hit, t_sph1_front, t_sph1_back);  // per lane: front or back hit parameter
+          return org+t_sph1*dO-p1;
+#else 
+          return org+t_sph1_front*dO-p1;  // only the front hit is stored in this configuration; front_hit is unused
+#endif
+        }
+
+        __forceinline Vec3vf<M> Ng_sphere0(const vbool<M>& front_hit) const  // begin sphere normal: hit point minus sphere center p0 (not normalized)
+        {
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          const vfloat<M> t_sph0 = select(front_hit, t_sph0_front, t_sph0_back);  // per lane: front or back hit parameter
+          return org+t_sph0*dO-p0;
+#else
+          return org+t_sph0_front*dO-p0;  // only the front hit is stored in this configuration; front_hit is unused
+#endif
+        }
+        
+        /* 
+           Calculates the u coordinate of a cone hit: the hit's y
+           value (which is zero at the first cone clipping plane)
+           divided by g (the distance between the clipping planes),
+           passed through clamp(). front_hit selects the front or back
+           hit per lane; with backface culling only the front hit is used.
+        */
+        
+        __forceinline vfloat<M> u_cone(const vbool<M>& front_hit) const
+        {
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+          const vfloat<M> y = select(front_hit, y_cone_front, y_cone_back);
+          return clamp(y*rcp(g));  // single-argument clamp; presumably clamps to [0,1] -- confirm in the math headers
+#else
+          return clamp(y_cone_front*rcp(g));
+#endif
+        }
+        
+      private:
+        Vec3vf<M> org;  // origin used for sphere hit points (org + t*dO)
+        Vec3vf<M> O;  // origin used for cone hit points (O + t*dO); presumably org relative to p0 -- confirm in constructor
+        Vec3vf<M> dO;  // direction along which hits are parameterized (hit = origin + t*dO)
+        vfloat<M> dOdO;  // presumably dot(dO,dO); used as the quadratic's leading coefficient in the sphere tests
+        vfloat<M> rcp_dOdO;  // presumably 1/dOdO; multiplies the quadratic roots in the sphere tests
+        vfloat<M> OdP;  // presumably dot(O,dP) -- set outside the code shown here
+        vfloat<M> dOdP;  // presumably dot(dO,dP); slope of y along the ray (y = yp + t*dOdP)
+        
+        /* for ray/cone intersection */
+      private:
+        vfloat<M> yp;  // offset of y at t = 0 (y = yp + t*dOdP in the sphere clipping tests)
+        vfloat<M> y_cone_front;  // y value of the front cone hit (read by Ng_cone and u_cone)
+        vfloat<M> t_cone_front;  // ray parameter of the front cone hit
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+        vfloat<M> y_cone_back;  // y value of the back cone hit
+        vfloat<M> t_cone_back;  // ray parameter of the back cone hit
+#endif
+        
+        /* for ray/sphere intersection */
+      private:
+        vfloat<M> t_sph1_front;  // ray parameter of the front hit with the end sphere (center p1)
+        vfloat<M> t_sph0_front;  // ray parameter of the front hit with the begin sphere (center p0)
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+        vfloat<M> t_sph1_back;  // ray parameter of the back hit with the end sphere
+        vfloat<M> t_sph0_back;  // ray parameter of the back hit with the begin sphere
+#endif
+      };
+      
+      
+      template<int M, typename Epilog, typename ray_tfar_func>
+        static __forceinline bool intersectConeSphere(const vbool<M>& valid_i,  // lanes to test
+                                                      const Vec3vf<M>& ray_org_in, const Vec3vf<M>& ray_dir, 
+                                                      const vfloat<M>& ray_tnear, const ray_tfar_func& ray_tfar,  // ray_tfar is a functor; it is called each time the current tfar is needed
+                                                      const Vec4vf<M>& v0, const Vec4vf<M>& v1,  // segment end points (xyz used as position; w presumably the radius)
+                                                      const Vec4vf<M>& vL, const Vec4vf<M>& vR,  // neighboring points on the v0 and v1 side, used for clipping
+                                                      const Epilog& epilog)  // invoked once per reported hit set; its bool results are OR-ed into the return value
+      {         
+        vbool<M> valid = valid_i;
+        /* Intersects the ray with the union of the cone v0->v1, its end sphere and (for begin points) its begin sphere. */
+        /* move ray origin closer to make calculations numerically stable */
+        const vfloat<M> dOdO = sqr(ray_dir);
+        const vfloat<M> rcp_dOdO = rcp(dOdO);
+        const Vec3vf<M> center = vfloat<M>(0.5f)*(v0.xyz()+v1.xyz());
+        const vfloat<M> dt = dot(center-ray_org_in,ray_dir)*rcp_dOdO;  // ray parameter of the projection of the segment midpoint onto the ray
+        const Vec3vf<M> ray_org = ray_org_in + dt*ray_dir;  // all t values below are relative to this shifted origin, hence the dt+t terms later
+        
+        /* intersect with cone from v0 to v1 */
+        vfloat<M> t_cone_lower, t_cone_upper;
+        ConeGeometryIntersector<M> cone (ray_org, ray_dir, dOdO, rcp_dOdO, v0, v1);
+        vbool<M> validCone = valid;
+        cone.intersectCone(validCone, t_cone_lower, t_cone_upper);
+
+        valid &= (validCone | (cone.g <= 0.0f));  // if cone is entirely in sphere end - check sphere
+        if (unlikely(none(valid)))
+          return false;
+        
+        /* cone hits inside the neighboring capped cones are inside the geometry and thus ignored */
+        const ConeGeometry<M> coneL (v0, vL);
+        const ConeGeometry<M> coneR (v1, vR);
+#if !defined(EMBREE_BACKFACE_CULLING_CURVES)
+        const Vec3vf<M> hit_lower = ray_org + t_cone_lower*ray_dir;
+        const Vec3vf<M> hit_upper = ray_org + t_cone_upper*ray_dir;
+        t_cone_lower = select (!coneL.isInsideCappedCone (validCone, hit_lower) & !coneR.isInsideCappedCone (validCone, hit_lower), t_cone_lower, vfloat<M>(pos_inf));
+        t_cone_upper = select (!coneL.isInsideCappedCone (validCone, hit_upper) & !coneR.isInsideCappedCone (validCone, hit_upper), t_cone_upper, vfloat<M>(neg_inf));
+#endif
+
+        /* intersect ending sphere */
+        vfloat<M> t_sph1_lower, t_sph1_upper;
+        vfloat<M> t_sph0_lower = vfloat<M>(pos_inf);  // pos_inf/neg_inf mean "no begin sphere hit" unless overwritten below
+        vfloat<M> t_sph0_upper = vfloat<M>(neg_inf);
+        cone.intersectEndSphere(valid, coneR, t_sph1_lower, t_sph1_upper);
+
+        const vbool<M> isBeginPoint = valid & (vL[0] == vfloat<M>(pos_inf));  // vL[0] == pos_inf marks a begin point; presumably written by gather() when there is no left neighbor -- confirm
+        if (unlikely(any(isBeginPoint))) {
+          cone.intersectBeginSphere (isBeginPoint, t_sph0_lower, t_sph0_upper);
+        }
+        
+        /* CSG union of cone and end sphere */
+        vfloat<M> t_sph_lower = min(t_sph0_lower, t_sph1_lower);
+        vfloat<M> t_cone_sphere_lower = min(t_cone_lower, t_sph_lower);
+#if !defined (EMBREE_BACKFACE_CULLING_CURVES)
+        vfloat<M> t_sph_upper = max(t_sph0_upper, t_sph1_upper);
+        vfloat<M> t_cone_sphere_upper = max(t_cone_upper, t_sph_upper);
+        
+        /* filter out hits that are not in tnear/tfar range */
+        const vbool<M> valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat<M>(pos_inf);
+        const vbool<M> valid_upper = valid & ray_tnear <= dt+t_cone_sphere_upper & dt+t_cone_sphere_upper <= ray_tfar() & t_cone_sphere_upper != vfloat<M>(neg_inf);
+        
+        /* check if there is a first hit */
+        const vbool<M> valid_first = valid_lower | valid_upper;
+        if (unlikely(none(valid_first)))
+          return false;
+        
+        /* construct first hit */
+        const vfloat<M> t_first = select(valid_lower, t_cone_sphere_lower, t_cone_sphere_upper);  // lower hit where it is in range, otherwise the upper hit
+        const vbool<M> cone_hit_first = t_first == t_cone_lower | t_first == t_cone_upper;  // the source of the hit is recovered by exact comparison with the candidate t values
+        const vbool<M> sph0_hit_first = t_first == t_sph0_lower | t_first == t_sph0_upper;
+        const Vec3vf<M> Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower)));
+        const vfloat<M> u_first  = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat<M>(zero), vfloat<M>(one)));  // u is 0 on the begin sphere and 1 on the end sphere
+
+        /* invoke intersection filter for first hit */
+        RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_first,Ng_first);
+        const bool is_hit_first = epilog(valid_first, hit);
+        
+        /* check for possible second hits before potentially accepted hit */
+        const vfloat<M> t_second = t_cone_sphere_upper;
+        const vbool<M> valid_second = valid_lower & valid_upper & (dt+t_cone_sphere_upper <= ray_tfar());  // ray_tfar() is read again here, after the first epilog call
+        if (unlikely(none(valid_second)))
+          return is_hit_first;
+        
+        /* invoke intersection filter for second hit */
+        const vbool<M> cone_hit_second = t_second == t_cone_lower | t_second == t_cone_upper;
+        const vbool<M> sph0_hit_second = t_second == t_sph0_lower | t_second == t_sph0_upper;
+        const Vec3vf<M> Ng_second = select(cone_hit_second, cone.Ng_cone(false), select (sph0_hit_second, cone.Ng_sphere0(false), cone.Ng_sphere1(false)));  // false selects the back hit data
+        const vfloat<M> u_second  = select(cone_hit_second, cone.u_cone(false), select (sph0_hit_second, vfloat<M>(zero), vfloat<M>(one)));
+
+        hit = RoundLineIntersectorHitM<M>(u_second,zero,dt+t_second,Ng_second);
+        const bool is_hit_second = epilog(valid_second, hit);
+        
+        return is_hit_first | is_hit_second;
+#else
+        /* filter out hits that are not in tnear/tfar range */
+        const vbool<M> valid_lower = valid & ray_tnear <= dt+t_cone_sphere_lower & dt+t_cone_sphere_lower <= ray_tfar() & t_cone_sphere_lower != vfloat<M>(pos_inf);
+        
+        /* check if there is a valid hit */
+        if (unlikely(none(valid_lower)))
+          return false;
+        
+        /* construct first hit */
+        const vbool<M> cone_hit_first = t_cone_sphere_lower == t_cone_lower | t_cone_sphere_lower == t_cone_upper;
+        const vbool<M> sph0_hit_first = t_cone_sphere_lower == t_sph0_lower | t_cone_sphere_lower == t_sph0_upper;
+        const Vec3vf<M> Ng_first = select(cone_hit_first, cone.Ng_cone(valid_lower), select (sph0_hit_first, cone.Ng_sphere0(valid_lower), cone.Ng_sphere1(valid_lower)));
+        const vfloat<M> u_first  = select(cone_hit_first, cone.u_cone(valid_lower), select (sph0_hit_first, vfloat<M>(zero), vfloat<M>(one)));
+
+        /* invoke intersection filter for first hit */
+        RoundLineIntersectorHitM<M> hit(u_first,zero,dt+t_cone_sphere_lower,Ng_first);
+        const bool is_hit_first = epilog(valid_lower, hit);
+        
+        return is_hit_first;  // only the front (lower) hit is reported in the backface-culling configuration
+#endif
+      }
+      
+    } // end namespace __roundline_internal
+    
+    template<int M>
+      struct RoundLinearCurveIntersector1  // single-ray front end: broadcasts one Ray to M lanes and calls intersectConeSphere
+      {
+        typedef CurvePrecalculations1 Precalculations;
+        /* functor handed to intersectConeSphere; it reads ray.tfar at call time, not at construction time */
+        struct ray_tfar {
+          Ray& ray;
+          __forceinline ray_tfar(Ray& ray) : ray(ray) {}
+          __forceinline vfloat<M> operator() () const { return ray.tfar; };
+        };
+
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            Ray& ray,
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,  // not used in this function
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const Vec4vf<M>& vLi, const Vec4vf<M>& vRi,
+                                            const Epilog& epilog)
+        {
+          const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);  // scalar ray components broadcast to all lanes
+          const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z);
+          const vfloat<M> ray_tnear(ray.tnear());
+          const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);  // each control point goes through enlargeRadiusToMinWidth before intersection
+          const Vec4vf<M> v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i);
+          const Vec4vf<M> vL = enlargeRadiusToMinWidth(context,geom,ray_org,vLi);
+          const Vec4vf<M> vR = enlargeRadiusToMinWidth(context,geom,ray_org,vRi);
+          return  __roundline_internal::intersectConeSphere(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray),v0,v1,vL,vR,epilog);
+        }
+      };
+    
+    template<int M, int K>
+      struct RoundLinearCurveIntersectorK  // ray-packet front end: takes ray k out of a RayK<K> and calls intersectConeSphere
+      {
+        typedef CurvePrecalculationsK<K> Precalculations;
+        /* functor handed to intersectConeSphere; it reads ray.tfar[k] at call time, not at construction time */
+        struct ray_tfar {
+          RayK<K>& ray;
+          size_t k;
+          __forceinline ray_tfar(RayK<K>& ray, size_t k) : ray(ray), k(k) {}
+          __forceinline vfloat<M> operator() () const { return ray.tfar[k]; };
+        };
+        
+        template<typename Epilog>
+        static __forceinline bool intersect(const vbool<M>& valid_i,
+                                            RayK<K>& ray, size_t k,  // k is the index of the ray inside the packet
+                                            IntersectContext* context,
+                                            const LineSegments* geom,
+                                            const Precalculations& pre,  // not used in this function
+                                            const Vec4vf<M>& v0i, const Vec4vf<M>& v1i,
+                                            const Vec4vf<M>& vLi, const Vec4vf<M>& vRi,
+                                            const Epilog& epilog)
+        {
+          const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);  // components of ray k broadcast to all lanes
+          const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+          const vfloat<M> ray_tnear = ray.tnear()[k];
+          const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);  // each control point goes through enlargeRadiusToMinWidth before intersection
+          const Vec4vf<M> v1 = enlargeRadiusToMinWidth(context,geom,ray_org,v1i);
+          const Vec4vf<M> vL = enlargeRadiusToMinWidth(context,geom,ray_org,vLi);
+          const Vec4vf<M> vR = enlargeRadiusToMinWidth(context,geom,ray_org,vRi);
+          return __roundline_internal::intersectConeSphere(valid_i,ray_org,ray_dir,ray_tnear,ray_tfar(ray,k),v0,v1,vL,vR,epilog);
+        }
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h
new file mode 100644
index 0000000000..079817335e
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/roundlinei_intersector.h
@@ -0,0 +1,136 @@
+// ======================================================================== //
+// Copyright 2009-2020 Intel Corporation                                    //
+//                                                                          //
+// Licensed under the Apache License, Version 2.0 (the "License");          //
+// you may not use this file except in compliance with the License.         //
+// You may obtain a copy of the License at                                  //
+//                                                                          //
+//     http://www.apache.org/licenses/LICENSE-2.0                           //
+//                                                                          //
+// Unless required by applicable law or agreed to in writing, software      //
+// distributed under the License is distributed on an "AS IS" BASIS,        //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and      //
+// limitations under the License.                                           //
+// ======================================================================== //
+
+#pragma once
+
+#include "roundline_intersector.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, int Mx, bool filter>
+    struct RoundLinearCurveMiIntersector1  // single-ray intersector for LineMi<M> primitives, implemented on RoundLinearCurveIntersector1<Mx>
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)  // hits are recorded through Intersect1EpilogM
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom);  // gather the segment end points and their left/right neighbors
+        const vbool<Mx> valid = line.template valid<Mx>();  // mask of lanes that hold a valid primitive
+        RoundLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)  // same test with Occluded1EpilogM; returns its result
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom);
+        const vbool<Mx> valid = line.template valid<Mx>();
+        return RoundLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)  // delegates to PrimitivePointQuery1
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, int Mx, bool filter>
+    struct RoundLinearCurveMiMBIntersector1  // as RoundLinearCurveMiIntersector1, but gather() also receives ray.time()
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& line)  // hits are recorded through Intersect1EpilogM
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time());  // time-dependent gather (presumably motion-blur interpolation -- see LineMi::gather)
+        const vbool<Mx> valid = line.template valid<Mx>();  // mask of lanes that hold a valid primitive
+        RoundLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Intersect1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& line)  // same test with Occluded1EpilogM; returns its result
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time());
+        const vbool<Mx> valid = line.template valid<Mx>();
+        return RoundLinearCurveIntersector1<Mx>::intersect(valid,ray,context,geom,pre,v0,v1,vL,vR,Occluded1EpilogM<M,Mx,filter>(ray,context,line.geomID(),line.primID()));
+      }
+      
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& line)  // delegates to PrimitivePointQuery1
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, line);
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct RoundLinearCurveMiIntersectorK  // packet intersector for LineMi<M> primitives; handles ray k of a K-wide packet
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)  // hits are recorded through Intersect1KEpilogM
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom);  // gather the segment end points and their left/right neighbors
+        const vbool<Mx> valid = line.template valid<Mx>();  // mask of lanes that hold a valid primitive
+        RoundLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)  // same test with Occluded1KEpilogM; returns its result
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom);
+        const vbool<Mx> valid = line.template valid<Mx>();
+        return RoundLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+
+    template<int M, int Mx, int K, bool filter>
+    struct RoundLinearCurveMiMBIntersectorK  // as RoundLinearCurveMiIntersectorK, but gather() also receives ray.time()[k]
+    {
+      typedef LineMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context,  const Primitive& line)  // hits are recorded through Intersect1KEpilogM
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]);  // time-dependent gather for ray k (presumably motion-blur interpolation -- see LineMi::gather)
+        const vbool<Mx> valid = line.template valid<Mx>();  // mask of lanes that hold a valid primitive
+        RoundLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& line)  // same test with Occluded1KEpilogM; returns its result
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const LineSegments* geom = context->scene->get<LineSegments>(line.geomID());
+        Vec4vf<M> v0,v1,vL,vR; line.gather(v0,v1,vL,vR,geom,ray.time()[k]);
+        const vbool<Mx> valid = line.template valid<Mx>();
+        return RoundLinearCurveIntersectorK<Mx,K>::intersect(valid,ray,k,context,geom,pre,v0,v1,vL,vR,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,line.geomID(),line.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h
new file mode 100644
index 0000000000..3ab90c29ef
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/sphere_intersector.h
@@ -0,0 +1,183 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_points.h"
+#include "curve_intersector_precalculations.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct SphereIntersectorHitM  // M-wide hit record for sphere intersection: hit distance and geometry normal per lane
+    {
+      __forceinline SphereIntersectorHitM() {}  // leaves vt and vNg uninitialized
+
+      __forceinline SphereIntersectorHitM(const vfloat<M>& t, const Vec3vf<M>& Ng)
+        : vt(t), vNg(Ng) {}
+
+      __forceinline void finalize() {}  // nothing to do for sphere hits
+
+      __forceinline Vec2f uv(const size_t i) const {  // sphere hits always report uv = (0,0); i is ignored
+        return Vec2f(0.0f, 0.0f);
+      }
+      __forceinline float t(const size_t i) const {  // hit distance of lane i
+        return vt[i];
+      }
+      __forceinline Vec3fa Ng(const size_t i) const {  // geometry normal of lane i
+        return Vec3fa(vNg.x[i], vNg.y[i], vNg.z[i]);
+      }
+
+     public:
+      vfloat<M> vt;  // per-lane hit distance
+      Vec3vf<M> vNg;  // per-lane geometry normal (stored as given, not normalized here)
+    };
+
+    template<int M>
+    struct SphereIntersector1  // intersects one ray with M spheres (v0.xyz = center, v0.w = radius)
+    {
+      typedef CurvePrecalculations1 Precalculations;
+
+      template<typename Epilog>
+      static __forceinline bool intersect(
+          const vbool<M>& valid_i, Ray& ray,
+          const Precalculations& pre, const Vec4vf<M>& v0, const Epilog& epilog)  // returns the OR of the epilog results; pre is not used in this function
+      {
+        vbool<M> valid = valid_i;
+
+        const vfloat<M> rd2    = rcp(dot(ray.dir, ray.dir));  // 1 / |dir|^2
+        const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+        const Vec3vf<M> ray_dir(ray.dir.x, ray.dir.y, ray.dir.z);
+        const Vec3vf<M> center = v0.xyz();
+        const vfloat<M> radius = v0.w;
+
+        const Vec3vf<M> c0     = center - ray_org;
+        const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;  // ray parameter of the point on the ray closest to the center
+        const Vec3vf<M> perp   = c0 - projC0 * ray_dir;  // vector from that closest point to the center
+        const vfloat<M> l2     = dot(perp, perp);  // squared distance from the ray's line to the center
+        const vfloat<M> r2     = radius * radius;
+        valid &= (l2 <= r2);  // the line misses the sphere where l2 > r2
+        if (unlikely(none(valid)))
+          return false;
+
+        const vfloat<M> td      = sqrt((r2 - l2) * rd2);  // half chord length expressed in ray parameter units
+        const vfloat<M> t_front = projC0 - td;
+        const vfloat<M> t_back  = projC0 + td;
+
+        const vbool<M> valid_front = valid & (ray.tnear() <= t_front) & (t_front <= ray.tfar);
+        const vbool<M> valid_back  = valid & (ray.tnear() <= t_back ) & (t_back  <= ray.tfar);
+
+        /* check if there is a first hit */
+        const vbool<M> valid_first = valid_front | valid_back;
+        if (unlikely(none(valid_first)))
+          return false;
+
+        /* construct first hit */
+        const vfloat<M> td_front = -td;
+        const vfloat<M> td_back  = +td;
+        const vfloat<M> t_first  = select(valid_front, t_front, t_back);  // front hit where it is in range, otherwise the back hit
+        const Vec3vf<M> Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp;  // equals hit point minus center (not normalized)
+        SphereIntersectorHitM<M> hit(t_first, Ng_first);
+
+        /* invoke intersection filter for first hit */
+        const bool is_hit_first = epilog(valid_first, hit);
+                
+        /* check for possible second hits before potentially accepted hit */
+        const vfloat<M> t_second = t_back;
+        const vbool<M> valid_second = valid_front & valid_back & (t_second <= ray.tfar);  // ray.tfar is read again here, after the first epilog call
+        if (unlikely(none(valid_second)))
+          return is_hit_first;
+
+        /* invoke intersection filter for second hit */
+        const Vec3vf<M> Ng_second = td_back * ray_dir - perp;
+        hit = SphereIntersectorHitM<M> (t_second, Ng_second);
+        const bool is_hit_second = epilog(valid_second, hit);
+        
+        return is_hit_first | is_hit_second;
+      }
+
+      template<typename Epilog>
+      static __forceinline bool intersect(
+        const vbool<M>& valid_i, Ray& ray, IntersectContext* context, const Points* geom,
+        const Precalculations& pre, const Vec4vf<M>& v0i, const Epilog& epilog)  // overload that first passes v0i through enlargeRadiusToMinWidth
+      {
+        const Vec3vf<M> ray_org(ray.org.x, ray.org.y, ray.org.z);
+        const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+        return intersect(valid_i,ray,pre,v0,epilog);
+      }
+    };
+
+    template<int M, int K>
+    struct SphereIntersectorK  // intersects ray k of a K-wide packet with M spheres (v0.xyz = center, v0.w = radius)
+    {
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid_i,
+                                          RayK<K>& ray, size_t k,  // k is the index of the ray inside the packet
+                                          IntersectContext* context,
+                                          const Points* geom,
+                                          const Precalculations& pre,  // not used in this function
+                                          const Vec4vf<M>& v0i,
+                                          const Epilog& epilog)  // the return value is the OR of the epilog results
+      {
+        vbool<M> valid = valid_i;
+
+        const Vec3vf<M> ray_org(ray.org.x[k], ray.org.y[k], ray.org.z[k]);
+        const Vec3vf<M> ray_dir(ray.dir.x[k], ray.dir.y[k], ray.dir.z[k]);
+        const vfloat<M> rd2 = rcp(dot(ray_dir, ray_dir));  // 1 / |dir|^2
+
+        const Vec4vf<M> v0 = enlargeRadiusToMinWidth(context,geom,ray_org,v0i);
+        const Vec3vf<M> center = v0.xyz();
+        const vfloat<M> radius = v0.w;
+
+        const Vec3vf<M> c0     = center - ray_org;
+        const vfloat<M> projC0 = dot(c0, ray_dir) * rd2;  // ray parameter of the point on the ray closest to the center
+        const Vec3vf<M> perp   = c0 - projC0 * ray_dir;  // vector from that closest point to the center
+        const vfloat<M> l2     = dot(perp, perp);  // squared distance from the ray's line to the center
+        const vfloat<M> r2     = radius * radius;
+        valid &= (l2 <= r2);  // the line misses the sphere where l2 > r2
+        if (unlikely(none(valid)))
+          return false;
+
+        const vfloat<M> td      = sqrt((r2 - l2) * rd2);  // half chord length expressed in ray parameter units
+        const vfloat<M> t_front = projC0 - td;
+        const vfloat<M> t_back  = projC0 + td;
+
+        const vbool<M> valid_front = valid & (ray.tnear()[k] <= t_front) & (t_front <= ray.tfar[k]);
+        const vbool<M> valid_back  = valid & (ray.tnear()[k] <= t_back ) & (t_back  <= ray.tfar[k]);
+
+        /* check if there is a first hit */
+        const vbool<M> valid_first = valid_front | valid_back;
+        if (unlikely(none(valid_first)))
+          return false;
+
+        /* construct first hit */
+        const vfloat<M> td_front = -td;
+        const vfloat<M> td_back  = +td;
+        const vfloat<M> t_first  = select(valid_front, t_front, t_back);  // front hit where it is in range, otherwise the back hit
+        const Vec3vf<M> Ng_first = select(valid_front, td_front, td_back) * ray_dir - perp;  // equals hit point minus center (not normalized)
+        SphereIntersectorHitM<M> hit(t_first, Ng_first);
+
+        /* invoke intersection filter for first hit */
+        const bool is_hit_first = epilog(valid_first, hit);
+                
+        /* check for possible second hits before potentially accepted hit */
+        const vfloat<M> t_second = t_back;
+        const vbool<M> valid_second = valid_front & valid_back & (t_second <= ray.tfar[k]);  // ray.tfar[k] is read again here, after the first epilog call
+        if (unlikely(none(valid_second)))
+          return is_hit_first;
+
+        /* invoke intersection filter for second hit */
+        const Vec3vf<M> Ng_second = td_back * ray_dir - perp;
+        hit = SphereIntersectorHitM<M> (t_second, Ng_second);
+        const bool is_hit_second = epilog(valid_second, hit);
+        
+        return is_hit_first | is_hit_second;
+      }
+    };
+  }  // namespace isa
+}  // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h
new file mode 100644
index 0000000000..1146847602
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/spherei_intersector.h
@@ -0,0 +1,156 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "intersector_epilog.h"
+#include "pointi.h"
+#include "sphere_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Single-ray intersector for a PointMi<M> block of sphere primitives. */
+    template<int M, int Mx, bool filter>
+    struct SphereMiIntersector1
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      /*! Runs SphereIntersector1 on the gathered block; hits are handled by Intersect1EpilogM. */
+      static __forceinline void intersect(
+          const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres;
+        sphere.gather(spheres, points);
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        SphereIntersector1<Mx>::intersect(active, ray, context, points, pre, spheres,
+          Intersect1EpilogM<M, Mx, filter>(ray, context, sphere.geomID(), sphere.primID()));
+      }
+
+      /*! Same steps with Occluded1EpilogM; returns what SphereIntersector1::intersect returns. */
+      static __forceinline bool occluded(
+          const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres;
+        sphere.gather(spheres, points);
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        return SphereIntersector1<Mx>::intersect(active, ray, context, points, pre, spheres,
+          Occluded1EpilogM<M, Mx, filter>(ray, context, sphere.geomID(), sphere.primID()));
+      }
+
+      /*! Point queries are delegated to PrimitivePointQuery1<Primitive>. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& sphere)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, sphere);
+      }
+    };
+
+    /*! Motion-blur variant of the single-ray sphere intersector: spheres are gathered at ray.time(). */
+    template<int M, int Mx, bool filter>
+    struct SphereMiMBIntersector1
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculations1 Precalculations;
+
+      /*! Runs SphereIntersector1 on the block gathered at ray.time(); hits go to Intersect1EpilogM. */
+      static __forceinline void intersect(
+          const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres;
+        sphere.gather(spheres, points, ray.time());
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        SphereIntersector1<Mx>::intersect(active, ray, context, points, pre, spheres,
+          Intersect1EpilogM<M, Mx, filter>(ray, context, sphere.geomID(), sphere.primID()));
+      }
+
+      /*! Same steps with Occluded1EpilogM; returns what SphereIntersector1::intersect returns. */
+      static __forceinline bool occluded(
+          const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres;
+        sphere.gather(spheres, points, ray.time());
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        return SphereIntersector1<Mx>::intersect(active, ray, context, points, pre, spheres,
+          Occluded1EpilogM<M, Mx, filter>(ray, context, sphere.geomID(), sphere.primID()));
+      }
+
+      /*! Point queries are delegated to PrimitivePointQuery1<Primitive>. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& sphere)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, sphere);
+      }
+    };
+
+    /*! Intersector for ray k of a RayK<K> packet against a PointMi<M> block of sphere
+     *  primitives; the work is done by SphereIntersectorK<Mx, K>. */
+    template<int M, int Mx, int K, bool filter>
+    struct SphereMiIntersectorK
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      /*! Intersects ray k with the gathered block; hits are handled by Intersect1KEpilogM. */
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres; sphere.gather(spheres, points);
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        SphereIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, spheres,
+          Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, sphere.geomID(), sphere.primID()));
+      }
+
+      /*! Same steps with Occluded1KEpilogM; returns what SphereIntersectorK::intersect returns. */
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres; sphere.gather(spheres, points);
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        return SphereIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, spheres,
+          Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, sphere.geomID(), sphere.primID()));
+      }
+    };
+
+    /*! Motion-blur variant of SphereMiIntersectorK: the sphere block is gathered at the
+     *  time of ray k (ray.time()[k]) before SphereIntersectorK<Mx, K> is invoked. */
+    template<int M, int Mx, int K, bool filter>
+    struct SphereMiMBIntersectorK
+    {
+      typedef PointMi<M> Primitive;
+      typedef CurvePrecalculationsK<K> Precalculations;
+
+      /*! Intersects ray k with the gathered block; hits are handled by Intersect1KEpilogM. */
+      static __forceinline void intersect(const Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(normal.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres; sphere.gather(spheres, points, ray.time()[k]);
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        SphereIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, spheres,
+          Intersect1KEpilogM<M, Mx, K, filter>(ray, k, context, sphere.geomID(), sphere.primID()));
+      }
+
+      /*! Same steps with Occluded1KEpilogM; returns what SphereIntersectorK::intersect returns. */
+      static __forceinline bool occluded(const Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& sphere)
+      {
+        STAT3(shadow.trav_prims, 1, 1, 1);
+        const Points* points = context->scene->get<Points>(sphere.geomID());
+        Vec4vf<M> spheres; sphere.gather(spheres, points, ray.time()[k]);
+        const vbool<Mx> active = sphere.template valid<Mx>();
+        return SphereIntersectorK<Mx, K>::intersect(active, ray, k, context, points, pre, spheres,
+          Occluded1KEpilogM<M, Mx, K, filter>(ray, k, context, sphere.geomID(), sphere.primID()));
+      }
+    };
+  }  // namespace isa
+}  // namespace embree
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h
new file mode 100644
index 0000000000..94ad46ad87
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1.h
@@ -0,0 +1,38 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../geometry/primitive.h"
+#include "../subdiv/subdivpatch1base.h"
+
+namespace embree
+{
+
+  /*! Subdivision patch primitive: SubdivPatch1Base plus a primitive Type descriptor.
+   *  The struct is declared with 64-byte alignment (__aligned(64)). */
+  struct __aligned(64) SubdivPatch1 : public SubdivPatch1Base
+  {
+    /*! Type descriptor derived from PrimitiveType; its members are only declared here. */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+
+    /*! Descriptor instance shared by all SubdivPatch1 objects (static member). */
+    static Type type;
+
+    /*! constructor for cached subdiv patch;
+     *  every argument is passed through unchanged to SubdivPatch1Base */
+    SubdivPatch1 (const unsigned int gID, const unsigned int pID, const unsigned int subPatch,
+                  const SubdivMesh *const mesh, const size_t time,
+                  const Vec2f uv[4], const float edge_level[4], const int subdiv[4],
+                  const int simd_width)
+      : SubdivPatch1Base(gID, pID, subPatch, mesh, time, uv, edge_level, subdiv, simd_width)
+    {
+    }
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h
new file mode 100644
index 0000000000..74ec1de258
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subdivpatch1_intersector.h
@@ -0,0 +1,237 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subdivpatch1.h"
+#include "grid_soa.h"
+#include "grid_soa_intersector1.h"
+#include "grid_soa_intersector_packet.h"
+#include "../common/ray.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Precalculations type that derives from T and forwards (ray, ptr) to T's constructor. */
+    template<typename T>
+    class SubdivPatch1Precalculations : public T
+    {
+    public:
+      __forceinline SubdivPatch1Precalculations (const Ray& ray, const void* ptr) : T(ray, ptr) {}
+    };
+
+    /*! Packet precalculations type that derives from T and forwards (valid, ray) to T's constructor. */
+    template<int K, typename T>
+    class SubdivPatch1PrecalculationsK : public T
+    {
+    public:
+      __forceinline SubdivPatch1PrecalculationsK (const vbool<K>& valid, RayK<K>& ray) : T(valid, ray) {}
+    };
+
+    /*! Single-ray intersector for subdivision patches; its Primitive is a GridSOA. */
+    class SubdivPatch1Intersector1
+    {
+    public:
+      typedef GridSOA Primitive;
+      typedef SubdivPatch1Precalculations<GridSOAIntersector1::Precalculations> Precalculations;
+
+      /*! Lazy-node path: stores prim->root(0) in lazy_node, records the grid in pre, returns false. */
+      static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node) {
+        lazy_node = prim->root(0);
+        pre.grid = const_cast<Primitive*>(prim);
+        return false;
+      }
+
+      /*! Intersect a ray with the primitive (ty == 0); any other ty takes the lazy-node path. */
+      template<int N, int Nx, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        if (unlikely(ty != 0)) { processLazyNode(pre,context,prim,lazy_node); return; }
+        GridSOAIntersector1::intersect(pre,ray,context,prim,lazy_node);
+      }
+
+      /*! Overload with an additional ty0 argument, which is ignored. */
+      template<int N, int Nx, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        intersect(This,pre,ray,context,prim,ty,tray,lazy_node);
+      }
+
+      /*! Test if the ray is occluded by the primitive (ty == 0); any other ty takes the lazy-node path. */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        if (unlikely(ty != 0)) return processLazyNode(pre,context,prim,lazy_node);
+        return GridSOAIntersector1::occluded(pre,ray,context,prim,lazy_node);
+      }
+
+      /*! Overload with an additional ty0 argument, which is ignored. */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        return occluded(This,pre,ray,context,prim,ty,tray,lazy_node);
+      }
+
+      /*! TODO: point queries are not implemented; this fires an assert and returns false. */
+      template<int N>
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) {
+        assert(false && "not implemented");
+        return false;
+      }
+
+      template<int N>  // ty0 overload: ty0 is ignored, forwards to the version above
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) {
+        return pointQuery(This,query,context,prim,ty,tquery,lazy_node);
+      }
+    };
+
+    /*! Single-ray motion-blur intersector for SubdivPatch1; delegates to GridSOAMBIntersector1. */
+    class SubdivPatch1MBIntersector1
+    {
+    public:
+      typedef SubdivPatch1 Primitive;
+      typedef GridSOAMBIntersector1::Precalculations Precalculations;
+
+      /*! Lazy-node path: picks the time segment for ray.time(), stores that root in lazy_node and the grid in pre; returns false. */
+      static __forceinline bool processLazyNode(Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node) {
+        Primitive* prim = (Primitive*) prim_i;
+        GridSOA* grid = (GridSOA*) prim->root_ref.get();
+        pre.itime = getTimeSegment(ray.time(), float(grid->time_steps-1), pre.ftime);
+        lazy_node = grid->root(pre.itime);
+        pre.grid = grid;
+        return false;
+      }
+
+      /*! Intersect a ray with the primitive (ty == 0); any other ty takes the lazy-node path. */
+      template<int N, int Nx, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        if (likely(ty == 0)) GridSOAMBIntersector1::intersect(pre,ray,context,prim,lazy_node);
+        else                 processLazyNode(pre,ray,context,prim,lazy_node);
+      }
+
+      /*! Overload with an additional ty0 argument, which is ignored. */
+      template<int N, int Nx, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        intersect(This,pre,ray,context,prim,ty,tray,lazy_node);
+      }
+
+      /*! Test if the ray is occluded by the primitive (ty == 0); any other ty takes the lazy-node path. */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        if (likely(ty == 0)) return GridSOAMBIntersector1::occluded(pre,ray,context,prim,lazy_node);
+        else                 return processLazyNode(pre,ray,context,prim,lazy_node);
+      }
+
+      /*! Overload with an additional ty0 argument, which is ignored. */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        return occluded(This,pre,ray,context,prim,ty,tray,lazy_node);
+      }
+
+      /*! TODO: point queries are not implemented; this fires an assert and returns false. */
+      template<int N>
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) {
+        assert(false && "not implemented");
+        return false;
+      }
+
+      /*! ty0 overload (ty0 ignored). Nx/robust cannot be deduced from the arguments, hence the defaults. */
+      template<int N, int Nx = N, bool robust = false>
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, size_t ty0, const Primitive* prim, size_t ty, const TravPointQuery<N> &tquery, size_t& lazy_node) {
+        return pointQuery(This,query,context,prim,ty,tquery,lazy_node);
+      }
+    };
+
+    /*! Packet (RayK<K>) intersector for subdivision patches; its Primitive is a GridSOA. */
+    template <int K> struct SubdivPatch1IntersectorK
+    {
+      typedef GridSOA Primitive;
+      typedef SubdivPatch1PrecalculationsK<K,typename GridSOAIntersectorK<K>::Precalculations> Precalculations;
+
+      /*! Lazy-node path: stores prim->root(0) in lazy_node, records the grid in pre, returns false. */
+      static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim, size_t& lazy_node) {
+        lazy_node = prim->root(0);
+        pre.grid = const_cast<Primitive*>(prim);
+        return false;
+      }
+
+      /*! Intersects the valid rays of the packet (ty == 0); any other ty takes the lazy-node path. */
+      template<bool robust>
+      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node) {
+        if (unlikely(ty != 0)) { processLazyNode(pre,context,prim,lazy_node); return; }
+        GridSOAIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
+      }
+
+      /*! Packet occlusion test; on the lazy-node path the bool result (false) is converted to vbool<K>. */
+      template<bool robust>
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node) {
+        if (unlikely(ty != 0)) return processLazyNode(pre,context,prim,lazy_node);
+        return GridSOAIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
+      }
+
+      /*! Intersects ray k of the packet (ty == 0); any other ty takes the lazy-node path. */
+      template<int N, int Nx, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        if (unlikely(ty != 0)) { processLazyNode(pre,context,prim,lazy_node); return; }
+        GridSOAIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
+      }
+
+      /*! Occlusion test for ray k of the packet (ty == 0); any other ty takes the lazy-node path. */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        if (unlikely(ty != 0)) return processLazyNode(pre,context,prim,lazy_node);
+        return GridSOAIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
+      }
+    };
+
+    typedef SubdivPatch1IntersectorK<4>  SubdivPatch1Intersector4;   // instantiation for K = 4  (RayK<4>)
+    typedef SubdivPatch1IntersectorK<8>  SubdivPatch1Intersector8;   // instantiation for K = 8  (RayK<8>)
+    typedef SubdivPatch1IntersectorK<16> SubdivPatch1Intersector16;  // instantiation for K = 16 (RayK<16>)
+
+    /*! Packet (RayK<K>) motion-blur intersector for SubdivPatch1; delegates to GridSOAMBIntersectorK<K>. */
+    template <int K>
+    struct SubdivPatch1MBIntersectorK
+    {
+      typedef SubdivPatch1 Primitive;
+      typedef SubdivPatch1PrecalculationsK<K,typename GridSOAMBIntersectorK<K>::Precalculations> Precalculations;
+
+      /*! Lazy-node path: stores the grid's troot in lazy_node, records the grid in pre, returns false. */
+      static __forceinline bool processLazyNode(Precalculations& pre, IntersectContext* context, const Primitive* prim_i, size_t& lazy_node) {
+        Primitive* patch = const_cast<Primitive*>(prim_i);
+        GridSOA* soa = (GridSOA*) patch->root_ref.get();
+        lazy_node = soa->troot;
+        pre.grid = soa;
+        return false;
+      }
+
+      /*! Intersects the valid rays of the packet (ty == 0); any other ty takes the lazy-node path. */
+      template<bool robust>
+      static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node) {
+        if (unlikely(ty != 0)) { processLazyNode(pre,context,prim,lazy_node); return; }
+        GridSOAMBIntersectorK<K>::intersect(valid,pre,ray,context,prim,lazy_node);
+      }
+
+      /*! Packet occlusion test; on the lazy-node path the bool result (false) is converted to vbool<K>. */
+      template<bool robust>
+      static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t ty, const TravRayK<K, robust> &tray, size_t& lazy_node) {
+        if (unlikely(ty != 0)) return processLazyNode(pre,context,prim,lazy_node);
+        return GridSOAMBIntersectorK<K>::occluded(valid,pre,ray,context,prim,lazy_node);
+      }
+
+      /*! Intersects ray k of the packet (ty == 0); any other ty takes the lazy-node path. */
+      template<int N, int Nx, bool robust>
+      static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        if (unlikely(ty != 0)) { processLazyNode(pre,context,prim,lazy_node); return; }
+        GridSOAMBIntersectorK<K>::intersect(pre,ray,k,context,prim,lazy_node);
+      }
+
+      /*! Occlusion test for ray k of the packet (ty == 0); any other ty takes the lazy-node path. */
+      template<int N, int Nx, bool robust>
+      static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t ty, const TravRay<N,Nx,robust> &tray, size_t& lazy_node) {
+        if (unlikely(ty != 0)) return processLazyNode(pre,context,prim,lazy_node);
+        return GridSOAMBIntersectorK<K>::occluded(pre,ray,k,context,prim,lazy_node);
+      }
+    };
+
+    typedef SubdivPatch1MBIntersectorK<4>  SubdivPatch1MBIntersector4;   // instantiation for K = 4  (RayK<4>)
+    typedef SubdivPatch1MBIntersectorK<8>  SubdivPatch1MBIntersector8;   // instantiation for K = 8  (RayK<8>)
+    typedef SubdivPatch1MBIntersectorK<16> SubdivPatch1MBIntersector16;  // instantiation for K = 16 (RayK<16>)
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid.h
new file mode 100644
index 0000000000..39fa6fb0f0
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid.h
@@ -0,0 +1,517 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/ray.h"
+#include "../common/scene_grid_mesh.h"
+#include "../bvh/bvh.h"
+
+namespace embree
+{
+    /* References a subgrid of a grid primitive: an (x, y) offset plus geomID/primID, from which gather() reads a 3x3 vertex block (four quads) */
+      struct SubGrid
+      {
+        /* Type descriptor (derived from PrimitiveType) for querying the name and sizes of this primitive type */
+        struct Type : public PrimitiveType
+        {
+          const char* name() const;
+          size_t sizeActive(const char* This) const;
+          size_t sizeTotal(const char* This) const;
+          size_t getBytes(const char* This) const;
+        };
+        static Type type;
+
+      public:
+
+        /* primitive supports multiple time segments */
+        static const bool singleTimeSegment = false;
+
+        /* Returns the capacity that blocks() divides by; always 1 here */
+        static __forceinline size_t max_size() { return 1; }
+
+        /* Returns required number of primitive blocks for N primitives: N / max_size(), rounded up */
+        static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+      public:
+
+        /* Default constructor; its body is empty, so it assigns nothing to the members itself */
+        __forceinline SubGrid() {  }
+
+        /* Construction from an (x, y) position and the geometry/primitive IDs; the four
+         * values initialize _x, _y, _geomID and _primID (presumably x/y are vertex offsets in the grid) */
+        __forceinline SubGrid(const unsigned int x, const unsigned int y,
+                              const unsigned int geomID, const unsigned int primID)
+          : _x(x),
+            _y(y),
+            _geomID(geomID),
+            _primID(primID) {}
+
+        __forceinline bool invalid3x3X() const { return ((unsigned int)_x & (1<<15)) != 0; }  // bit 15 of _x set: the gather functions use a column step of 0
+        __forceinline bool invalid3x3Y() const { return ((unsigned int)_y & (1<<15)) != 0; }  // bit 15 of _y set: the gather functions use a row step of 0
+
+        /* Gather the quads: reads the 3x3 vertex block addressed by x()/y() inside grid g and
+         * passes it to transpose() as four quads (00,01,11,10), (01,02,12,11), (11,12,22,21),
+         * (10,11,21,20); p0..p3 receive the 1st..4th vertex of each quad.
+         * A column/row flagged invalid repeats the previous column/row instead. */
+        __forceinline void gather(Vec3vf4& p0, Vec3vf4& p1,
+                                  Vec3vf4& p2, Vec3vf4& p3,
+                                  const GridMesh* const mesh,
+                                  const GridMesh::Grid &g) const
+        {
+          /* step to the third column/row; zero when that column/row is flagged invalid */
+          const size_t stepX = invalid3x3X() ? 0 : 1;
+          const size_t stepY = invalid3x3Y() ? 0 : g.lineVtxOffset;
+
+          /* vertex indices id<row><column>; the first quad (rows/columns 0 and 1) is always valid */
+          const size_t id00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+          const size_t id01 = id00 + 1;
+          const size_t id02 = id01 + stepX;
+          const size_t id10 = id00 + g.lineVtxOffset;
+          const size_t id11 = id01 + g.lineVtxOffset;
+          const size_t id12 = id11 + stepX;
+          const size_t id20 = id10 + stepY;
+          const size_t id21 = id11 + stepY;
+          const size_t id22 = id11 + stepX + stepY;
+
+          /* load the nine vertices */
+          const vfloat4 v00 = vfloat4::loadu(mesh->vertexPtr(id00));
+          const vfloat4 v01 = vfloat4::loadu(mesh->vertexPtr(id01));
+          const vfloat4 v02 = vfloat4::loadu(mesh->vertexPtr(id02));
+          const vfloat4 v10 = vfloat4::loadu(mesh->vertexPtr(id10));
+          const vfloat4 v11 = vfloat4::loadu(mesh->vertexPtr(id11));
+          const vfloat4 v12 = vfloat4::loadu(mesh->vertexPtr(id12));
+          const vfloat4 v20 = vfloat4::loadu(mesh->vertexPtr(id20));
+          const vfloat4 v21 = vfloat4::loadu(mesh->vertexPtr(id21));
+          const vfloat4 v22 = vfloat4::loadu(mesh->vertexPtr(id22));
+
+          /* one transpose per output: four vertices in, x/y/z components out */
+          transpose(v00,v01,v11,v10,p0.x,p0.y,p0.z);
+          transpose(v01,v02,v12,v11,p1.x,p1.y,p1.z);
+          transpose(v11,v12,v22,v21,p2.x,p2.y,p2.z);
+          transpose(v10,v11,v21,v20,p3.x,p3.y,p3.z);
+        }
+
+        /* Loads vertex `offset` at time steps itime and itime+1 and blends the two with lerp(.,.,ftime) */
+        template<typename T>
+        __forceinline vfloat4 getVertexMB(const GridMesh* const mesh, const size_t offset, const size_t itime, const float ftime) const {
+          const T atStep0 = T::loadu(mesh->vertexPtr(offset,itime+0));
+          const T atStep1 = T::loadu(mesh->vertexPtr(offset,itime+1));
+          return lerp(atStep0,atStep1,ftime);  // NOTE(review): returned as vfloat4 whatever T is
+        }
+
+        /* Gather the quads (motion blur): like the static gather, but every vertex comes from
+         * getVertexMB(), i.e. is interpolated between time steps itime and itime+1 with ftime.
+         * Quads passed to transpose(): (00,01,11,10), (01,02,12,11), (11,12,22,21), (10,11,21,20);
+         * a column/row flagged invalid repeats the previous column/row instead. */
+        __forceinline void gatherMB(Vec3vf4& p0, Vec3vf4& p1,
+                                    Vec3vf4& p2, Vec3vf4& p3,
+                                    const GridMesh* const mesh,
+                                    const GridMesh::Grid &g,
+                                    const size_t itime,
+                                    const float ftime) const
+        {
+          /* step to the third column/row; zero when that column/row is flagged invalid */
+          const size_t stepX = invalid3x3X() ? 0 : 1;
+          const size_t stepY = invalid3x3Y() ? 0 : g.lineVtxOffset;
+
+          /* vertex indices id<row><column>; the first quad (rows/columns 0 and 1) is always valid */
+          const size_t id00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+          const size_t id01 = id00 + 1;
+          const size_t id02 = id01 + stepX;
+          const size_t id10 = id00 + g.lineVtxOffset;
+          const size_t id11 = id01 + g.lineVtxOffset;
+          const size_t id12 = id11 + stepX;
+          const size_t id20 = id10 + stepY;
+          const size_t id21 = id11 + stepY;
+          const size_t id22 = id11 + stepX + stepY;
+
+          /* fetch the nine time-interpolated vertices */
+          const vfloat4 v00 = getVertexMB<vfloat4>(mesh,id00,itime,ftime);
+          const vfloat4 v01 = getVertexMB<vfloat4>(mesh,id01,itime,ftime);
+          const vfloat4 v02 = getVertexMB<vfloat4>(mesh,id02,itime,ftime);
+          const vfloat4 v10 = getVertexMB<vfloat4>(mesh,id10,itime,ftime);
+          const vfloat4 v11 = getVertexMB<vfloat4>(mesh,id11,itime,ftime);
+          const vfloat4 v12 = getVertexMB<vfloat4>(mesh,id12,itime,ftime);
+          const vfloat4 v20 = getVertexMB<vfloat4>(mesh,id20,itime,ftime);
+          const vfloat4 v21 = getVertexMB<vfloat4>(mesh,id21,itime,ftime);
+          const vfloat4 v22 = getVertexMB<vfloat4>(mesh,id22,itime,ftime);
+
+          /* one transpose per output: four vertices in, x/y/z components out */
+          transpose(v00,v01,v11,v10,p0.x,p0.y,p0.z);
+          transpose(v01,v02,v12,v11,p1.x,p1.y,p1.z);
+          transpose(v11,v12,v22,v21,p2.x,p2.y,p2.z);
+          transpose(v10,v11,v21,v20,p3.x,p3.y,p3.z);
+        }
+
+
+
+        /* Gather the quads: looks up the GridMesh for geomID() in the scene and the grid
+         * for primID() in that mesh, then forwards to the gather() overload that takes
+         * the mesh and grid explicitly */
+        __forceinline void gather(Vec3vf4& p0, Vec3vf4& p1,
+                                  Vec3vf4& p2, Vec3vf4& p3,
+                                  const Scene *const scene) const
+        {
+          const GridMesh* const gridMesh = scene->get<GridMesh>(geomID());
+          const GridMesh::Grid &grid     = gridMesh->grid(primID());
+          gather(p0,p1,p2,p3,gridMesh,grid);
+        }
+
+        /* Gather the quads in the motion blur case: looks up the GridMesh for geomID() and
+         * the grid for primID(), then forwards to the gatherMB() overload that takes the
+         * mesh and grid explicitly, passing itime/ftime through unchanged */
+        __forceinline void gatherMB(Vec3vf4& p0, Vec3vf4& p1,
+                                    Vec3vf4& p2, Vec3vf4& p3,
+                                    const Scene *const scene,
+                                    const size_t itime,
+                                    const float ftime) const
+        {
+          const GridMesh* const gridMesh = scene->get<GridMesh>(geomID());
+          const GridMesh::Grid &grid     = gridMesh->grid(primID());
+          gatherMB(p0,p1,p2,p3,gridMesh,grid,itime,ftime);
+        }
+
+        /* Gather the quads into vtx[16], four consecutive vertices per quad: (00,01,11,10), (01,02,12,11),
+         * (10,11,21,20), (11,12,22,21). NOTE(review): last two quads are swapped vs. the Vec3vf4 gather - confirm */
+        __forceinline void gather(Vec3fa vtx[16], const Scene *const scene) const
+        {
+          const GridMesh* mesh     = scene->get<GridMesh>(geomID());
+          const GridMesh::Grid &g  = mesh->grid(primID());
+
+          /* step to the third column/row; zero when that column/row is flagged invalid */
+          const size_t stepX = invalid3x3X() ? 0 : 1;
+          const size_t stepY = invalid3x3Y() ? 0 : g.lineVtxOffset;
+
+          /* vertex indices id<row><column>; the first quad (rows/columns 0 and 1) is always valid */
+          const size_t id00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+          const size_t id01 = id00 + 1;
+          const size_t id02 = id01 + stepX;
+          const size_t id10 = id00 + g.lineVtxOffset;
+          const size_t id11 = id01 + g.lineVtxOffset;
+          const size_t id12 = id11 + stepX;
+          const size_t id20 = id10 + stepY;
+          const size_t id21 = id11 + stepY;
+          const size_t id22 = id11 + stepX + stepY;
+
+          /* load the nine vertices */
+          const Vec3fa v00 = Vec3fa::loadu(mesh->vertexPtr(id00));
+          const Vec3fa v01 = Vec3fa::loadu(mesh->vertexPtr(id01));
+          const Vec3fa v02 = Vec3fa::loadu(mesh->vertexPtr(id02));
+          const Vec3fa v10 = Vec3fa::loadu(mesh->vertexPtr(id10));
+          const Vec3fa v11 = Vec3fa::loadu(mesh->vertexPtr(id11));
+          const Vec3fa v12 = Vec3fa::loadu(mesh->vertexPtr(id12));
+          const Vec3fa v20 = Vec3fa::loadu(mesh->vertexPtr(id20));
+          const Vec3fa v21 = Vec3fa::loadu(mesh->vertexPtr(id21));
+          const Vec3fa v22 = Vec3fa::loadu(mesh->vertexPtr(id22));
+
+          /* write the four quads, one per line */
+          vtx[ 0] = v00; vtx[ 1] = v01; vtx[ 2] = v11; vtx[ 3] = v10;
+          vtx[ 4] = v01; vtx[ 5] = v02; vtx[ 6] = v12; vtx[ 7] = v11;
+          vtx[ 8] = v10; vtx[ 9] = v11; vtx[10] = v21; vtx[11] = v20;
+          vtx[12] = v11; vtx[13] = v12; vtx[14] = v22; vtx[15] = v21;
+        }
+
+        /* Gather the quads (motion blur) into vtx[16], four consecutive vertices per quad, each vertex interpolated by
+         * getVertexMB(). NOTE(review): quads 3 and 4 are stored in the opposite order to the Vec3vf4 gatherMB - confirm */
+        __forceinline void gatherMB(vfloat4 vtx[16], const Scene *const scene, const size_t itime, const float ftime) const
+        {
+          const GridMesh* mesh     = scene->get<GridMesh>(geomID());
+          const GridMesh::Grid &g  = mesh->grid(primID());
+
+          /* step to the third column/row; zero when that column/row is flagged invalid */
+          const size_t stepX = invalid3x3X() ? 0 : 1;
+          const size_t stepY = invalid3x3Y() ? 0 : g.lineVtxOffset;
+
+          /* vertex indices id<row><column>; the first quad (rows/columns 0 and 1) is always valid */
+          const size_t id00 = g.startVtxID + x() + y() * g.lineVtxOffset;
+          const size_t id01 = id00 + 1;
+          const size_t id02 = id01 + stepX;
+          const size_t id10 = id00 + g.lineVtxOffset;
+          const size_t id11 = id01 + g.lineVtxOffset;
+          const size_t id12 = id11 + stepX;
+          const size_t id20 = id10 + stepY;
+          const size_t id21 = id11 + stepY;
+          const size_t id22 = id11 + stepX + stepY;
+
+          /* fetch the nine time-interpolated vertices */
+          const vfloat4 v00 = getVertexMB<vfloat4>(mesh,id00,itime,ftime);
+          const vfloat4 v01 = getVertexMB<vfloat4>(mesh,id01,itime,ftime);
+          const vfloat4 v02 = getVertexMB<vfloat4>(mesh,id02,itime,ftime);
+          const vfloat4 v10 = getVertexMB<vfloat4>(mesh,id10,itime,ftime);
+          const vfloat4 v11 = getVertexMB<vfloat4>(mesh,id11,itime,ftime);
+          const vfloat4 v12 = getVertexMB<vfloat4>(mesh,id12,itime,ftime);
+          const vfloat4 v20 = getVertexMB<vfloat4>(mesh,id20,itime,ftime);
+          const vfloat4 v21 = getVertexMB<vfloat4>(mesh,id21,itime,ftime);
+          const vfloat4 v22 = getVertexMB<vfloat4>(mesh,id22,itime,ftime);
+
+          /* write the four quads, one per line */
+          vtx[ 0] = v00; vtx[ 1] = v01; vtx[ 2] = v11; vtx[ 3] = v10;
+          vtx[ 4] = v01; vtx[ 5] = v02; vtx[ 6] = v12; vtx[ 7] = v11;
+          vtx[ 8] = v10; vtx[ 9] = v11; vtx[10] = v21; vtx[11] = v20;
+          vtx[12] = v11; vtx[13] = v12; vtx[14] = v22; vtx[15] = v21;
+        }
+          
+
+        /* Bounds of the subgrid at time step itime -- placeholder that only raises FATAL("not implemented yet") */
+        __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const
+        {
+          BBox3fa bounds = empty;
+          FATAL("not implemented yet");
+          return bounds; // presumably unreachable (FATAL is expected not to return -- confirm); otherwise an empty box
+        }
+
+        /* Linear bounds of the primitive, built from bounds() at time steps itime and itime+1 (bounds() itself is still a FATAL stub) */
+        __forceinline LBBox3fa linearBounds(const Scene* const scene, const size_t itime)
+        {
+          return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1));
+        }
+
+        __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps) // placeholder: not implemented
+        {
+          LBBox3fa allBounds = empty;
+          FATAL("not implemented yet");
+          return allBounds; // presumably unreachable (FATAL is expected not to return -- confirm)
+        }
+
+        __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range) // placeholder: not implemented
+        {
+          LBBox3fa allBounds = empty;
+          FATAL("not implemented yet");
+          return allBounds; // presumably unreachable (FATAL is expected not to return -- confirm)
+        }
+
+
+        friend embree_ostream operator<<(embree_ostream cout, const SubGrid& sg) { // debug printer: position and IDs of the subgrid
+          return cout << "SubGrid " << " ( x = " << sg.x() << ", y = " << sg.y() << ", geomID = " << sg.geomID() << ", primID = " << sg.primID() << " )";
+        }
+
+        __forceinline unsigned int geomID() const { return _geomID; } // geometry ID of the mesh
+        __forceinline unsigned int primID() const { return _primID; } // ID of the grid primitive inside the mesh
+        __forceinline unsigned int x() const { return (unsigned int)_x & 0x7fff; } // low 15 bits of _x
+        __forceinline unsigned int y() const { return (unsigned int)_y & 0x7fff; } // low 15 bits of _y
+
+      private:
+        unsigned short _x;       // x() reads bits 0..14; bit 15 is masked off -- presumably the invalid3x3X flag, confirm
+        unsigned short _y;       // y() reads bits 0..14; bit 15 is masked off -- presumably the invalid3x3Y flag, confirm
+        unsigned int _geomID;    // geometry ID of mesh
+        unsigned int _primID;    // primitive ID of primitive inside mesh
+      };
+
+      struct SubGridID {  // compact record: (x,y) position of a subgrid plus the primID of its grid primitive
+        unsigned short x;
+        unsigned short y;
+        unsigned int primID;
+        /* the default constructor leaves all members uninitialized; the other one narrows x/y to 16 bit */
+        __forceinline SubGridID() {}
+        __forceinline SubGridID(const unsigned int x_, const unsigned int y_, const unsigned int primID_)
+          : x((unsigned short)x_), y((unsigned short)y_), primID(primID_) {}
+      };
+
+      /* Leaf type storing up to N subgrids of one grid mesh together with a quantized node of their bounds */
+      template<int N>
+      struct SubGridQBVHN
+      {
+        /* Virtual interface to query information about the subgrid leaf type */
+        struct Type : public PrimitiveType
+        {
+          const char* name() const;
+          size_t sizeActive(const char* This) const;
+          size_t sizeTotal(const char* This) const;
+          size_t getBytes(const char* This) const;
+        };
+        static Type type;
+
+      public:
+        /* Number of used slots: index of the first slot whose primID is the unused marker, N if all are used */
+        __forceinline size_t size() const
+        {
+          for (size_t i=0;i<N;i++)
+            if (primID(i) == (unsigned int)-1) return i;
+          return N;
+        }
+
+        __forceinline void clear() {  // marks all N slots as unused (primID = (unsigned int)-1) and clears the quantized node
+          for (size_t i=0;i<N;i++)
+            subgridIDs[i] = SubGridID(0,0,(unsigned int)-1);
+          qnode.clear();
+        }
+
+        /* Default constructor, performs no initialization itself */
+        __forceinline SubGridQBVHN() {  }
+
+        /* Construction from subgrid positions, primitive IDs and per-subgrid bounds; only the first `items` entries are read */
+        __forceinline SubGridQBVHN(const unsigned int x[N],
+                                   const unsigned int y[N],
+                                   const unsigned int primID[N],
+                                   const BBox3fa * const subGridBounds,
+                                   const unsigned int geomID,
+                                   const unsigned int items)
+        {
+          assert(items <= (unsigned int)N); // more items would write past subgridIDs[N]
+          clear();
+          _geomID = geomID;
+
+          /* collect the bounds in a full-precision node, then quantize them into qnode */
+          __aligned(64) typename BVHN<N>::AABBNode node;
+          node.clear();
+          for (size_t i=0;i<items;i++)
+          {
+            subgridIDs[i] = SubGridID(x[i],y[i],primID[i]);
+            node.setBounds(i,subGridBounds[i]);
+          }
+          qnode.init_dim(node);
+        }
+
+        __forceinline unsigned int geomID() const { return _geomID; }
+        __forceinline unsigned int primID(const size_t i) const { assert(i < N); return subgridIDs[i].primID; }
+        __forceinline unsigned int x(const size_t i) const { assert(i < N); return subgridIDs[i].x; }
+        __forceinline unsigned int y(const size_t i) const { assert(i < N); return subgridIDs[i].y; }
+
+        /* Expands slot i into a full SubGrid; the slot must be in use */
+        __forceinline SubGrid subgrid(const size_t i) const {
+          assert(i < N);
+          assert(primID(i) != (unsigned int)-1);
+          return SubGrid(x(i),y(i),geomID(),primID(i));
+        }
+
+      public:
+        SubGridID subgridIDs[N];
+
+        typename BVHN<N>::QuantizedBaseNode qnode;
+
+        unsigned int _geomID;    // geometry ID of mesh
+
+        friend embree_ostream operator<<(embree_ostream cout, const SubGridQBVHN& sg) {
+          cout << "SubGridQBVHN " << embree_endl;
+          for (size_t i=0;i<N;i++)
+            cout << i << " ( x = " << sg.subgridIDs[i].x << ", y = " << sg.subgridIDs[i].y << ", primID = " << sg.subgridIDs[i].primID << " )" << embree_endl;
+          cout << "geomID " << sg._geomID << embree_endl;
+          cout << "lowerX " << sg.qnode.dequantizeLowerX() << embree_endl;
+          cout << "upperX " << sg.qnode.dequantizeUpperX() << embree_endl;
+          cout << "lowerY " << sg.qnode.dequantizeLowerY() << embree_endl;
+          cout << "upperY " << sg.qnode.dequantizeUpperY() << embree_endl;
+          cout << "lowerZ " << sg.qnode.dequantizeLowerZ() << embree_endl;
+          cout << "upperZ " << sg.qnode.dequantizeUpperZ() << embree_endl;
+          return cout;
+        }
+
+      };
+
+      template<int N>
+        typename SubGridQBVHN<N>::Type SubGridQBVHN<N>::type; // out-of-class definition of the static member declared in SubGridQBVHN
+      /* shorthand names for the N=4 and N=8 instantiations */
+      typedef SubGridQBVHN<4> SubGridQBVH4;
+      typedef SubGridQBVHN<8> SubGridQBVH8;
+
+
+      /* Motion-blur leaf type: up to N subgrids of one grid mesh, two quantized nodes of their bounds (qnode.node0/node1) and a time mapping */
+      template<int N>
+      struct SubGridMBQBVHN
+      {
+        /* Virtual interface to query information about the subgrid leaf type */
+        struct Type : public PrimitiveType
+        {
+          const char* name() const;
+          size_t sizeActive(const char* This) const;
+          size_t sizeTotal(const char* This) const;
+          size_t getBytes(const char* This) const;
+        };
+        static Type type;
+
+      public:
+        /* Number of used slots: index of the first slot whose primID is the unused marker, N if all are used */
+        __forceinline size_t size() const
+        {
+          for (size_t i=0;i<N;i++)
+            if (primID(i) == (unsigned int)-1) return i;
+          return N;
+        }
+
+        __forceinline void clear() {  // marks all N slots as unused (primID = (unsigned int)-1) and clears the quantized node
+          for (size_t i=0;i<N;i++)
+            subgridIDs[i] = SubGridID(0,0,(unsigned int)-1);
+          qnode.clear();
+        }
+
+        /* Default constructor, performs no initialization itself */
+        __forceinline SubGridMBQBVHN() {  }
+
+        /* Construction from subgrid positions, primitive IDs, two sets of per-subgrid bounds and the time mapping; only the first `items` entries are read */
+        __forceinline SubGridMBQBVHN(const unsigned int x[N],
+                                     const unsigned int y[N],
+                                     const unsigned int primID[N],
+                                     const BBox3fa * const subGridBounds0,
+                                     const BBox3fa * const subGridBounds1,
+                                     const unsigned int geomID,
+                                     const float toffset,
+                                     const float tscale,
+                                     const unsigned int items)
+        {
+          assert(items <= (unsigned int)N); // more items would write past subgridIDs[N]
+          clear();
+          _geomID = geomID;
+          time_offset = toffset;
+          time_scale  = tscale;
+
+          /* collect both bound sets in full-precision nodes, then quantize them into qnode.node0/node1 */
+          __aligned(64) typename BVHN<N>::AABBNode node0,node1;
+          node0.clear();
+          node1.clear();
+          for (size_t i=0;i<items;i++)
+          {
+            subgridIDs[i] = SubGridID(x[i],y[i],primID[i]);
+            node0.setBounds(i,subGridBounds0[i]);
+            node1.setBounds(i,subGridBounds1[i]);
+          }
+          qnode.node0.init_dim(node0);
+          qnode.node1.init_dim(node1);
+        }
+
+        __forceinline unsigned int geomID() const { return _geomID; }
+        __forceinline unsigned int primID(const size_t i) const { assert(i < N); return subgridIDs[i].primID; }
+        __forceinline unsigned int x(const size_t i) const { assert(i < N); return subgridIDs[i].x; }
+        __forceinline unsigned int y(const size_t i) const { assert(i < N); return subgridIDs[i].y; }
+
+        /* Expands slot i into a full SubGrid; the slot must be in use */
+        __forceinline SubGrid subgrid(const size_t i) const {
+          assert(i < N);
+          assert(primID(i) != (unsigned int)-1);
+          return SubGrid(x(i),y(i),geomID(),primID(i));
+        }
+
+        /* Returns time_scale * (t - time_offset), for a scalar or a K-wide time value */
+        __forceinline float adjustTime(const float t) const { return time_scale * (t-time_offset); }
+
+        template<int K>
+        __forceinline vfloat<K> adjustTime(const vfloat<K> &t) const { return time_scale * (t-time_offset); }
+
+      public:
+        SubGridID subgridIDs[N];
+
+        typename BVHN<N>::QuantizedBaseNodeMB qnode;
+
+        float time_offset;
+        float time_scale;
+        unsigned int _geomID;    // geometry ID of mesh
+
+        friend embree_ostream operator<<(embree_ostream cout, const SubGridMBQBVHN& sg) {
+          cout << "SubGridMBQBVHN " << embree_endl;
+          for (size_t i=0;i<N;i++)
+            cout << i << " ( x = " << sg.subgridIDs[i].x << ", y = " << sg.subgridIDs[i].y << ", primID = " << sg.subgridIDs[i].primID << " )" << embree_endl;
+          cout << "geomID      " << sg._geomID << embree_endl;
+          cout << "time_offset " << sg.time_offset << embree_endl;
+          cout << "time_scale  " << sg.time_scale << embree_endl;
+          cout << "lowerX " << sg.qnode.node0.dequantizeLowerX() << embree_endl; // first six lines: node0
+          cout << "upperX " << sg.qnode.node0.dequantizeUpperX() << embree_endl;
+          cout << "lowerY " << sg.qnode.node0.dequantizeLowerY() << embree_endl;
+          cout << "upperY " << sg.qnode.node0.dequantizeUpperY() << embree_endl;
+          cout << "lowerZ " << sg.qnode.node0.dequantizeLowerZ() << embree_endl;
+          cout << "upperZ " << sg.qnode.node0.dequantizeUpperZ() << embree_endl;
+          cout << "lowerX " << sg.qnode.node1.dequantizeLowerX() << embree_endl; // next six lines: node1 (same labels)
+          cout << "upperX " << sg.qnode.node1.dequantizeUpperX() << embree_endl;
+          cout << "lowerY " << sg.qnode.node1.dequantizeLowerY() << embree_endl;
+          cout << "upperY " << sg.qnode.node1.dequantizeUpperY() << embree_endl;
+          cout << "lowerZ " << sg.qnode.node1.dequantizeLowerZ() << embree_endl;
+          cout << "upperZ " << sg.qnode.node1.dequantizeUpperZ() << embree_endl;
+          return cout;
+        }
+
+      };
+
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector.h
new file mode 100644
index 0000000000..045eee4329
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector.h
@@ -0,0 +1,518 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid.h"
+#include "subgrid_intersector_moeller.h"
+#include "subgrid_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+
+    // =======================================================================================
+    // =================================== SubGridIntersectors ===============================
+    // =======================================================================================
+
+
+    template<int N, bool filter>
+    struct SubGridIntersector1Moeller  // single-ray intersector for SubGridQBVHN<N> leaves, quad test done by SubGridQuadMIntersector1MoellerTrumbore
+    {
+      typedef SubGridQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersector1MoellerTrumbore<4,filter> Precalculations;
+      /* Intersects the ray with one subgrid: gathers its vertices as four Vec3vf4 and hands them to pre.intersect() */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+      /* Occlusion test of the ray against one subgrid; returns the result of pre.occluded() */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+      /* Point query for one subgrid: stores its geomID/primID in the context and delegates to the geometry's pointQuery() */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid)
+      {
+        STAT3(point_query.trav_prims,1,1,1);
+        AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID());
+        assert(accel);
+        context->geomID = subgrid.geomID();
+        context->primID = subgrid.primID();
+        return accel->pointQuery(query, context);
+      }
+      /* Leaf intersection: for each of the num leaf entries, tests the ray against the quantized node and intersects every subgrid whose box was hit */
+      template<int Nx, bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<Nx> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); // bit mask of hit boxes; dist is filled per box
+#if defined(__AVX__)
+          STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); // presumably returns the lowest set bit index and clears it in mask (the loop relies on this to terminate) -- confirm
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+            if (unlikely(dist[ID] > ray.tfar)) continue; // box lies beyond the current ray.tfar: skip its subgrid
+            intersect(pre,ray,context,prim[i].subgrid(ID));
+          }
+        }
+      }
+      template<int Nx, bool robust>        // leaf occlusion test: returns true as soon as one subgrid reports occlusion
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<Nx> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); // see intersect(): expected to remove the returned bit from mask
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+            if (occluded(pre,ray,context,prim[i].subgrid(ID)))
+              return true;
+          }
+        }
+        return false;
+      }
+      /* Leaf point query: selects the sphere or AABB node test by context->query_type and ORs the results of the per-subgrid queries */
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+      {
+        bool changed = false;
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<N> dist;
+          size_t mask;
+          if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) {
+            mask = BVHNQuantizedBaseNodePointQuerySphere1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+          } else {
+            mask = BVHNQuantizedBaseNodePointQueryAABB1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+          }
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); 
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+            changed |= pointQuery(query, context, prim[i].subgrid(ID));
+          }
+        }
+        return changed;
+      }
+    };
+
+    template<int N, bool filter>
+    struct SubGridIntersector1Pluecker  // single-ray intersector for SubGridQBVHN<N> leaves, quad test done by SubGridQuadMIntersector1Pluecker
+    {
+      typedef SubGridQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersector1Pluecker<4,filter> Precalculations;
+      /* Intersects the ray with one subgrid: gathers its vertices as four Vec3vf4 and hands them to pre.intersect() */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+      /* Occlusion test of the ray against one subgrid; returns the result of pre.occluded() */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded(ray,context,v0,v1,v2,v3,g,subgrid);
+      }
+      /* Point query for one subgrid: stores its geomID/primID in the context and delegates to the geometry's pointQuery() */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid)
+      {
+        STAT3(point_query.trav_prims,1,1,1);
+        AccelSet* accel = (AccelSet*)context->scene->get(subgrid.geomID()); // NOTE(review): unlike SubGridIntersector1Moeller::pointQuery there is no assert(accel) here
+        context->geomID = subgrid.geomID();
+        context->primID = subgrid.primID();
+        return accel->pointQuery(query, context);
+      }
+      /* Leaf intersection: for each of the num leaf entries, tests the ray against the quantized node and intersects every subgrid whose box was hit */
+      template<int Nx, bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<Nx> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); // bit mask of hit boxes; dist is filled per box
+#if defined(__AVX__)
+          STAT3(normal.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); // presumably returns the lowest set bit index and clears it in mask (the loop relies on this to terminate) -- confirm
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+            if (unlikely(dist[ID] > ray.tfar)) continue; // box lies beyond the current ray.tfar: skip its subgrid
+            intersect(pre,ray,context,prim[i].subgrid(ID));
+          }
+        }
+      }
+      /* Leaf occlusion test: returns true as soon as one subgrid reports occlusion */
+      template<int Nx, bool robust>        
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<Nx> dist;
+          size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); // see intersect(): expected to remove the returned bit from mask
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+            if (occluded(pre,ray,context,prim[i].subgrid(ID)))
+              return true;
+          }
+        }
+        return false;
+      }
+      /* Leaf point query: selects the sphere or AABB node test by context->query_type and ORs the results of the per-subgrid queries */
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+      {
+        bool changed = false;
+        for (size_t i=0;i<num;i++)
+        {
+          vfloat<N> dist;
+          size_t mask;
+          if (likely(context->query_type == POINT_QUERY_TYPE_SPHERE)) {
+            mask = BVHNQuantizedBaseNodePointQuerySphere1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+          } else {
+            mask = BVHNQuantizedBaseNodePointQueryAABB1<N>::pointQuery(&prim[i].qnode,tquery,dist);
+          }
+#if defined(__AVX__)
+          STAT3(point_query.trav_hit_boxes[popcnt(mask)],1,1,1);
+#endif
+          while(mask != 0)
+          {
+            const size_t ID = bscf(mask); 
+            assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+            changed |= pointQuery(query, context, prim[i].subgrid(ID));
+          }
+        }
+        return changed;
+      }
+    };
+
+    template<int N, int K, bool filter>
+    struct SubGridIntersectorKMoeller  // K-wide ray packet intersector for SubGridQBVHN<N> leaves, quad test done by SubGridQuadMIntersectorKMoellerTrumbore
+    {
+      typedef SubGridQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> Precalculations;
+      /* Packet vs. one subgrid: gathers the 16 quad corners and intersects the packet with each of the 4 quads in turn */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        Vec3fa vtx[16];
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        subgrid.gather(vtx,context->scene);
+        for (unsigned int i=0; i<4; i++)
+        {
+          const Vec3vf<K> p0 = vtx[i*4+0]; // corners of quad i, each converted from Vec3fa to Vec3vf<K>
+          const Vec3vf<K> p1 = vtx[i*4+1];
+          const Vec3vf<K> p2 = vtx[i*4+2];
+          const Vec3vf<K> p3 = vtx[i*4+3];
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i));
+        }
+      }
+      /* Packet occlusion vs. one subgrid: stops at the first quad for which pre.intersectK() returns true and returns !valid0 */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        vbool<K> valid0 = valid_i; // passed to the epilog, which presumably clears the lanes found occluded -- confirm
+        Vec3fa vtx[16];
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        subgrid.gather(vtx,context->scene);
+        for (unsigned int i=0; i<4; i++)
+        {
+          const Vec3vf<K> p0 = vtx[i*4+0];
+          const Vec3vf<K> p1 = vtx[i*4+1];
+          const Vec3vf<K> p2 = vtx[i*4+2];
+          const Vec3vf<K> p3 = vtx[i*4+3];
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
+            break;
+        }
+        return !valid0;
+      }
+      /* Ray k of the packet vs. one subgrid: gathers its vertices as four Vec3vf4 and hands them to pre.intersect1() */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+      /* Occlusion test of ray k against one subgrid; returns the result of pre.occluded1() */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+      /* Packet leaf intersection: walks the valid boxes of each quantized node and intersects the subgrid of every box hit by an active ray */
+        template<bool robust>
+          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid); // presumably returns the lowest set bit index and clears it in m_valid (the loop relies on this to terminate) -- confirm
+              if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; // no active ray hits box i
+              intersect(valid,pre,ray,context,prim[j].subgrid(i));
+            }
+          }
+        }
+        /* Packet leaf occlusion: valid0 tracks the rays not yet occluded; returns !valid0 */
+        template<bool robust>        
+        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          vbool<K> valid0 = valid;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue;
+              valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i));
+              if (none(valid0)) break; // NOTE(review): leaves only the inner loop; the outer loop still visits the remaining leaf entries
+            }
+          }
+          return !valid0;
+        }
+        /* Leaf intersection for the single ray k of the packet, using the single-ray node test */
+        template<int Nx, bool robust>        
+          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<Nx> dist;
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+              if (unlikely(dist[ID] > ray.tfar[k])) continue; // box lies beyond the current tfar of ray k: skip its subgrid
+              intersect(pre,ray,k,context,prim[i].subgrid(ID));
+            }
+          }
+        }
+        /* Leaf occlusion test for the single ray k: returns true as soon as one subgrid reports occlusion */
+        template<int Nx, bool robust>
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<Nx> dist;
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+              if (occluded(pre,ray,k,context,prim[i].subgrid(ID)))
+                return true;
+            }
+          }
+          return false;
+        }
+    };
+
+
+    template<int N, int K, bool filter>
+    struct SubGridIntersectorKPluecker  // K-wide ray packet intersector for SubGridQBVHN<N> leaves, quad test done by SubGridQuadMIntersectorKPluecker
+    {
+      typedef SubGridQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations;
+      /* Packet vs. one subgrid: gathers the 16 quad corners and intersects the packet with each of the 4 quads in turn */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        Vec3fa vtx[16];
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        subgrid.gather(vtx,context->scene);
+        for (unsigned int i=0; i<4; i++)
+        {
+          const Vec3vf<K> p0 = vtx[i*4+0]; // corners of quad i, each converted from Vec3fa to Vec3vf<K>
+          const Vec3vf<K> p1 = vtx[i*4+1];
+          const Vec3vf<K> p2 = vtx[i*4+2];
+          const Vec3vf<K> p3 = vtx[i*4+3];
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          pre.intersectK(valid_i,ray,p0,p1,p2,p3,g,subgrid,i,IntersectKEpilogM<4,K,filter>(ray,context,subgrid.geomID(),subgrid.primID(),i));
+        }
+      }
+      /* Packet occlusion vs. one subgrid: stops at the first quad for which pre.intersectK() returns true and returns !valid0 */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        vbool<K> valid0 = valid_i; // passed to the epilog, which presumably clears the lanes found occluded -- confirm
+        Vec3fa vtx[16];
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        subgrid.gather(vtx,context->scene);
+        for (unsigned int i=0; i<4; i++)
+        {
+          const Vec3vf<K> p0 = vtx[i*4+0];
+          const Vec3vf<K> p1 = vtx[i*4+1];
+          const Vec3vf<K> p2 = vtx[i*4+2];
+          const Vec3vf<K> p3 = vtx[i*4+3];
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          if (pre.intersectK(valid0,ray,p0,p1,p2,p3,g,subgrid,i,OccludedKEpilogM<4,K,filter>(valid0,ray,context,subgrid.geomID(),subgrid.primID(),i)))
+            break;
+        }
+        return !valid0;
+      }
+      /* Ray k of the packet vs. one subgrid: gathers its vertices as four Vec3vf4 and hands them to pre.intersect1() */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+      /* Occlusion test of ray k against one subgrid; returns the result of pre.occluded1() */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+        Vec3vf4 v0,v1,v2,v3; subgrid.gather(v0,v1,v2,v3,context->scene);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+      /* Packet leaf intersection: walks the valid boxes of each quantized node and intersects the subgrid of every box hit by an active ray */
+        template<bool robust>
+          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid); // presumably returns the lowest set bit index and clears it in m_valid (the loop relies on this to terminate) -- confirm
+              if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue; // no active ray hits box i
+              intersect(valid,pre,ray,context,prim[j].subgrid(i));
+            }
+          }
+        }
+        /* Packet leaf occlusion: valid0 tracks the rays not yet occluded; returns !valid0 */
+        template<bool robust>        
+        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          vbool<K> valid0 = valid;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,dist))) continue;
+              valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i));
+              if (none(valid0)) break; // NOTE(review): leaves only the inner loop; the outer loop still visits the remaining leaf entries
+            }
+          }
+          return !valid0;
+        }
+        /* Leaf intersection for the single ray k of the packet, using the single-ray node test */
+        template<int Nx, bool robust>        
+          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<Nx> dist;
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+              if (unlikely(dist[ID] > ray.tfar[k])) continue; // box lies beyond the current tfar of ray k: skip its subgrid
+              intersect(pre,ray,k,context,prim[i].subgrid(ID));
+            }
+          }
+        }
+        /* Leaf occlusion test for the single ray k: returns true as soon as one subgrid reports occlusion */
+        template<int Nx, bool robust>
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<Nx> dist;
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              assert(((size_t)1 << ID) & movemask(prim[i].qnode.validMask()));
+
+              if (occluded(pre,ray,k,context,prim[i].subgrid(ID)))
+                return true;
+            }
+          }
+          return false;
+        }
+    };
+
+
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h
new file mode 100644
index 0000000000..f65b4abf61
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_moeller.h
@@ -0,0 +1,493 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid.h"
+#include "quad_intersector_moeller.h"
+
+namespace embree
+{
+  namespace isa
+  {
+
+    /* ----------------------------- */
+    /* -- single ray intersectors -- */
+    /* ----------------------------- */
+
+    template<int M>
+      __forceinline void interpolateUV(MoellerTrumboreHitM<M> &hit,const GridMesh::Grid &g, const SubGrid& subgrid) 
+    {
+      /* Rebase the per-quad U,V of a hit onto the whole grid: add each lane's quad corner */
+      /* (subgrid origin plus a fixed per-lane step) and divide by resX-1 / resY-1. */
+      /* The corner offset is multiplied by absDen because U,V are not yet divided by it here. */
+      const vint<M> cornerX(vint<M>((int)subgrid.x()) + vint<M>(0,1,1,0));
+      const vint<M> cornerY(vint<M>((int)subgrid.y()) + vint<M>(0,0,1,1));
+      const float rcpCellsX = rcp((float)((int)g.resX-1));
+      const float rcpCellsY = rcp((float)((int)g.resY-1));
+      hit.U = (hit.U + (vfloat<M>)cornerX * hit.absDen) * rcpCellsX;
+      hit.V = (hit.V + (vfloat<M>)cornerY * hit.absDen) * rcpCellsY;
+    }
+    
+    template<int M, bool filter>
+      struct SubGridQuadMIntersector1MoellerTrumbore;
+
+    template<int M, bool filter>
+      struct SubGridQuadMIntersector1MoellerTrumbore
+      {
+        __forceinline SubGridQuadMIntersector1MoellerTrumbore() {}
+        /* stateless: the ray and pointer arguments are accepted but not used */
+        __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
+        /* closest-hit query: tests both triangles of M quads and hands every hit to Intersect1EpilogMU */
+        __forceinline void intersect(RayHit& ray, IntersectContext* context,
+                                     const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                     const GridMesh::Grid &g, const SubGrid& subgrid) const
+        {
+          MoellerTrumboreHitM<M> triHit;
+          MoellerTrumboreIntersector1<M> triIsec(ray,nullptr);
+          Intersect1EpilogMU<M,filter> report(ray,context,subgrid.geomID(),subgrid.primID());
+
+          /* first triangle of each quad: (v0,v1,v3) */
+          if (triIsec.intersect(ray,v0,v1,v3,triHit))
+          {
+            interpolateUV<M>(triHit,g,subgrid);
+            report(triHit.valid,triHit);
+          }
+
+          /* second triangle of each quad: (v2,v3,v1); its U,V are replaced by absDen-U, absDen-V before rebasing */
+          if (triIsec.intersect(ray,v2,v3,v1,triHit))
+          {
+            triHit.U = triHit.absDen - triHit.U;
+            triHit.V = triHit.absDen - triHit.V;
+            interpolateUV<M>(triHit,g,subgrid);
+            report(triHit.valid,triHit);
+          }
+        }
+        /* occlusion query: same two triangle tests, returns true as soon as Occluded1EpilogMU accepts a hit */
+        __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                    const GridMesh::Grid &g, const SubGrid& subgrid) const
+        {
+          MoellerTrumboreHitM<M> triHit;
+          MoellerTrumboreIntersector1<M> triIsec(ray,nullptr);
+          Occluded1EpilogMU<M,filter> blocked(ray,context,subgrid.geomID(),subgrid.primID());
+
+          /* first triangle of each quad: (v0,v1,v3) */
+          if (triIsec.intersect(ray,v0,v1,v3,triHit))
+          {
+            interpolateUV<M>(triHit,g,subgrid);
+            if (blocked(triHit.valid,triHit))
+              return true;
+          }
+
+          /* second triangle of each quad: (v2,v3,v1); its U,V are replaced by absDen-U, absDen-V before rebasing */
+          if (triIsec.intersect(ray,v2,v3,v1,triHit))
+          {
+            triHit.U = triHit.absDen - triHit.U;
+            triHit.V = triHit.absDen - triHit.V;
+            interpolateUV<M>(triHit,g,subgrid);
+            if (blocked(triHit.valid,triHit))
+              return true;
+          }
+          return false;
+        }
+      };
+
+#if defined (__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX */
+    template<bool filter>
+      struct SubGridQuadMIntersector1MoellerTrumbore<4,filter>
+    {
+      __forceinline SubGridQuadMIntersector1MoellerTrumbore() {}
+
+      __forceinline SubGridQuadMIntersector1MoellerTrumbore(const Ray& ray, const void* ptr) {}
+      /* shared kernel: tests both triangles of 4 quads in one 8-wide pass; returns true only when the epilog returns true */
+      template<typename Epilog>
+        __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); // v0 of the first triangle packed with v2 of the second
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); // v1 reused for both triangles, so the second one is (v2,v1,v3)
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); // v3 reused for both triangles
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); // second triangle keeps the order (v2,v3,v1)
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        MoellerTrumboreHitM<8> hit;
+        MoellerTrumboreIntersector1<8> intersector(ray,nullptr);
+        const vbool8 flags(0,0,0,0,1,1,1,1); // marks the lanes that receive the second-triangle corrections below
+        if (unlikely(intersector.intersect(ray,vtx0,vtx1,vtx2,hit)))
+        {
+          vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen;
+          /* the flagged lanes were fed a different vertex order in each configuration, so the U,V fix-up differs as well */
+#if !defined(EMBREE_BACKFACE_CULLING)
+          hit.U = select(flags,absDen-V,U);
+          hit.V = select(flags,absDen-U,V);
+          hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); // negate the normal of the flagged lanes
+#else
+          hit.U = select(flags,absDen-U,U);
+          hit.V = select(flags,absDen-V,V);
+#endif
+          /* correct U,V interpolation across the entire grid */
+          const vint8 sx((int)subgrid.x());
+          const vint8 sy((int)subgrid.y());
+          const vint8 sx8(sx + vint8(0,1,1,0,0,1,1,0));
+          const vint8 sy8(sy + vint8(0,0,1,1,0,0,1,1));
+          const float inv_resX = rcp((float)((int)g.resX-1));
+          const float inv_resY = rcp((float)((int)g.resY-1));          
+          hit.U = (hit.U + (vfloat8)sx8 * absDen) * inv_resX; // offset is scaled by absDen because U is not yet divided by it
+          hit.V = (hit.V + (vfloat8)sy8 * absDen) * inv_resY;          
+
+          if (unlikely(epilog(hit.valid,hit)))
+            return true;
+        }
+        return false;
+      }
+      /* closest-hit entry point: runs the shared kernel with Intersect1EpilogMU<8,filter> */
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const GridMesh::Grid &g, const SubGrid& subgrid) const
+      {
+          return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+      }
+      /* occlusion entry point: runs the shared kernel with Occluded1EpilogMU<8,filter> */
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                  const GridMesh::Grid &g, const SubGrid& subgrid) const
+      {
+          return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+      }
+    };
+
+#endif
+
+    // ============================================================================================================================
+    // ============================================================================================================================
+    // ============================================================================================================================
+
+
+    /* ----------------------------- */
+    /* -- ray packet intersectors -- */
+    /* ----------------------------- */
+
+    template<int K>
+      struct SubGridQuadHitK
+      {
+        __forceinline SubGridQuadHitK(const vfloat<K>& U,
+                                      const vfloat<K>& V,
+                                      const vfloat<K>& T,
+                                      const vfloat<K>& absDen,
+                                      const Vec3vf<K>& Ng,
+                                      const vbool<K>& flags,
+                                      const GridMesh::Grid &g, 
+                                      const SubGrid& subgrid,
+                                      const unsigned int i)
+        : U(U), V(V), T(T), absDen(absDen), flags(flags), tri_Ng(Ng), g(g), subgrid(subgrid), i(i) {}
+        /* computes the final hit values from the stored raw ones and returns them as the tuple (u,v,t,Ng) */
+        __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+        {
+          const vfloat<K> rcpAbsDen = rcp(absDen);
+          const vfloat<K> t = T * rcpAbsDen;
+          const vfloat<K> u0 = min(U * rcpAbsDen,1.0f); // normalized and clamped to at most 1
+          const vfloat<K> v0 = min(V * rcpAbsDen,1.0f);
+          const vfloat<K> u1 = vfloat<K>(1.0f) - u0;
+          const vfloat<K> v1 = vfloat<K>(1.0f) - v0;
+          const vfloat<K> uu = select(flags,u1,u0); // flagged lanes use the mirrored coordinate 1-u0
+          const vfloat<K> vv = select(flags,v1,v0);
+          const unsigned int sx = subgrid.x() + (unsigned int)(i % 2); // i selects the quad: low bit steps in x
+          const unsigned int sy = subgrid.y() + (unsigned int)(i >>1); // remaining bits step in y
+          const float inv_resX = rcp((float)(int)(g.resX-1));
+          const float inv_resY = rcp((float)(int)(g.resY-1));
+          const vfloat<K> u = (uu + (float)(int)sx) * inv_resX; // rebase onto the whole grid
+          const vfloat<K> v = (vv + (float)(int)sy) * inv_resY;
+          const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+          return std::make_tuple(u,v,t,Ng);
+        }
+
+      private:
+        const vfloat<K> U;
+        const vfloat<K> V;
+        const vfloat<K> T;
+        const vfloat<K> absDen;
+        const vbool<K> flags;
+        const Vec3vf<K> tri_Ng;
+        /* held by reference: the grid and subgrid passed to the constructor must outlive this object */
+        const GridMesh::Grid &g;
+        const SubGrid& subgrid;
+        const size_t i;
+      };
+
+    template<int M, int K, bool filter>
+      struct SubGridQuadMIntersectorKMoellerTrumboreBase
+      {
+        __forceinline SubGridQuadMIntersectorKMoellerTrumboreBase(const vbool<K>& valid, const RayK<K>& ray) {}
+        /* K rays against one triangle given as vertex, two edges and normal; returns whatever the epilog returns for the surviving lanes */
+        template<typename Epilog>
+        __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                          RayK<K>& ray,
+                                          const Vec3vf<K>& tri_v0,
+                                          const Vec3vf<K>& tri_e1,
+                                          const Vec3vf<K>& tri_e2,
+                                          const Vec3vf<K>& tri_Ng,
+                                          const vbool<K>& flags,
+                                          const GridMesh::Grid &g, 
+                                          const SubGrid &subgrid,
+                                          const unsigned int i,
+                                          const Epilog& epilog) const
+        { 
+          /* calculate denominator */
+          vbool<K> valid = valid0;
+          const Vec3vf<K> C = tri_v0 - ray.org;
+          const Vec3vf<K> R = cross(C,ray.dir);
+          const vfloat<K> den = dot(tri_Ng,ray.dir);
+          const vfloat<K> absDen = abs(den);
+          const vfloat<K> sgnDen = signmsk(den); // presumably the sign bit of den; XOR-ing with it below avoids a division
+        
+          /* test against edge p2 p0 */
+          const vfloat<K> U = dot(R,tri_e2) ^ sgnDen;
+          valid &= U >= 0.0f;
+          if (likely(none(valid))) return false;
+        
+          /* test against edge p0 p1 */
+          const vfloat<K> V = dot(R,tri_e1) ^ sgnDen;
+          valid &= V >= 0.0f;
+          if (likely(none(valid))) return false;
+        
+          /* test against edge p1 p2 */
+          const vfloat<K> W = absDen-U-V;
+          valid &= W >= 0.0f;
+          if (likely(none(valid))) return false;
+        
+          /* perform depth test */
+          const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+          valid &= (absDen*ray.tnear() < T) & (T <= absDen*ray.tfar); // compares against t scaled by absDen instead of dividing T
+          if (unlikely(none(valid))) return false;
+        
+          /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+          valid &= den < vfloat<K>(zero);
+          if (unlikely(none(valid))) return false;
+#else
+          valid &= den != vfloat<K>(zero);
+          if (unlikely(none(valid))) return false;
+#endif
+        
+          /* calculate hit information */
+          SubGridQuadHitK<K> hit(U,V,T,absDen,tri_Ng,flags,g,subgrid,i); // U,V,T stay un-normalized; the hit object divides by absDen later
+          return epilog(valid,hit);
+        }
+        /* K rays against one triangle given by its three vertices: derives edges and normal, then calls the overload above */
+        template<typename Epilog>
+        __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                          RayK<K>& ray,
+                                          const Vec3vf<K>& tri_v0,
+                                          const Vec3vf<K>& tri_v1,
+                                          const Vec3vf<K>& tri_v2,
+                                          const vbool<K>& flags,
+                                          const GridMesh::Grid &g, 
+                                          const SubGrid &subgrid,
+                                          const unsigned int i,
+                                          const Epilog& epilog) const
+        {
+          const Vec3vf<K> e1 = tri_v0-tri_v1;
+          const Vec3vf<K> e2 = tri_v2-tri_v0;
+          const Vec3vf<K> Ng = cross(e2,e1);
+          return intersectK(valid0,ray,tri_v0,e1,e2,Ng,flags,g,subgrid,i,epilog);
+        }
+        /* K rays against quad i of the subgrid, split into triangles (v0,v1,v3) and (v2,v3,v1); the second one is flagged */
+        template<typename Epilog>
+        __forceinline bool intersectK(const vbool<K>& valid0, 
+                                      RayK<K>& ray,
+                                      const Vec3vf<K>& v0,
+                                      const Vec3vf<K>& v1,
+                                      const Vec3vf<K>& v2,
+                                      const Vec3vf<K>& v3,
+                                      const GridMesh::Grid &g, 
+                                      const SubGrid &subgrid,
+                                      const unsigned int i,
+                                      const Epilog& epilog) const
+        {
+          intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),g,subgrid,i,epilog);
+          if (none(valid0)) return true; // NOTE(review): valid0 is a const reference here, so this only changes if the epilog updates the referenced mask - confirm
+          intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),g,subgrid,i,epilog);
+          return none(valid0);
+        }
+        /* ray k of the packet against M triangles given as vertex, two edges and normal; on success constructs the result into hit */
+        static  __forceinline bool intersect1(RayK<K>& ray,
+                                              size_t k,
+                                              const Vec3vf<M>& tri_v0,
+                                              const Vec3vf<M>& tri_e1,
+                                              const Vec3vf<M>& tri_e2,
+                                              const Vec3vf<M>& tri_Ng,
+                                              MoellerTrumboreHitM<M> &hit)
+        {
+          /* calculate denominator */
+          const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+          const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+          const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O;
+          const Vec3vf<M> R = cross(C,D);
+          const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D);
+          const vfloat<M> absDen = abs(den);
+          const vfloat<M> sgnDen = signmsk(den);
+        
+          /* perform edge tests */
+          const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen;
+          const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen;
+        
+          /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+          vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+          vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+          if (likely(none(valid))) return false;
+        
+          /* perform depth test */
+          const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+          valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+          if (likely(none(valid))) return false;
+        
+          /* calculate hit information */
+          new (&hit) MoellerTrumboreHitM<M>(valid,U,V,T,absDen,tri_Ng); // placement-new overwrites the caller's hit object in place
+          return true;
+        }
+        /* ray k of the packet against M triangles given by their vertices: derives edges and normal, then calls the overload above */
+        static __forceinline bool intersect1(RayK<K>& ray,
+                                             size_t k,
+                                             const Vec3vf<M>& v0,
+                                             const Vec3vf<M>& v1,
+                                             const Vec3vf<M>& v2,
+                                             MoellerTrumboreHitM<M> &hit)
+        {
+          const Vec3vf<M> e1 = v0-v1;
+          const Vec3vf<M> e2 = v2-v0;
+          const Vec3vf<M> Ng = cross(e2,e1);
+          return intersect1(ray,k,v0,e1,e2,Ng,hit);
+        }
+
+      };
+
+    template<int M, int K, bool filter>
+      struct SubGridQuadMIntersectorKMoellerTrumbore : public SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>
+    {
+      __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>(valid,ray) {}
+      /* closest-hit query for ray k of the packet against M quads; every hit is handed to Intersect1KEpilogMU */
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        Intersect1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+        /* the hit width and the base class follow M, matching the Vec3vf<M> vertices and interpolateUV<M> */
+        MoellerTrumboreHitM<M> hit;
+        if (SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>::intersect1(ray,k,v0,v1,v3,hit))
+        {
+          interpolateUV<M>(hit,g,subgrid);
+          epilog(hit.valid,hit);
+        }
+        /* second triangle (v2,v3,v1): U,V are replaced by absDen-U, absDen-V before rebasing onto the grid */
+        if (SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>::intersect1(ray,k,v2,v3,v1,hit))
+        {
+          hit.U = hit.absDen - hit.U;
+          hit.V = hit.absDen - hit.V;
+          interpolateUV<M>(hit,g,subgrid);
+          epilog(hit.valid,hit);
+        }
+
+      }
+      /* occlusion query for ray k of the packet; returns true as soon as Occluded1KEpilogMU accepts a hit */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        Occluded1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+        /* first triangle (v0,v1,v3) */
+        MoellerTrumboreHitM<M> hit;
+        if (SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>::intersect1(ray,k,v0,v1,v3,hit))
+        {
+          interpolateUV<M>(hit,g,subgrid);
+          if (epilog(hit.valid,hit)) return true;
+        }
+        /* second triangle (v2,v3,v1): U,V are replaced by absDen-U, absDen-V before rebasing onto the grid */
+        if (SubGridQuadMIntersectorKMoellerTrumboreBase<M,K,filter>::intersect1(ray,k,v2,v3,v1,hit))
+        {
+          hit.U = hit.absDen - hit.U;
+          hit.V = hit.absDen - hit.V;
+          interpolateUV<M>(hit,g,subgrid);
+          if (epilog(hit.valid,hit)) return true;
+        }
+        return false;
+      }
+    };
+
+
+#if defined (__AVX__)
+
+    /*! Intersects 4 quads with ray k of a K-wide ray packet using AVX (both triangles of each quad in one 8-wide pass) */
+    template<int K, bool filter>
+      struct SubGridQuadMIntersectorKMoellerTrumbore<4,K,filter> : public SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>
+    {
+      __forceinline SubGridQuadMIntersectorKMoellerTrumbore(const vbool<K>& valid, const RayK<K>& ray)
+        : SubGridQuadMIntersectorKMoellerTrumboreBase<4,K,filter>(valid,ray) {}
+      /* shared kernel: returns true only when the epilog returns true */
+      template<typename Epilog>
+        __forceinline bool intersect1(RayK<K>& ray, size_t k,const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                      const GridMesh::Grid &g, const SubGrid &subgrid, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); // v0 of the first triangle packed with v2 of the second
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); // v1 reused for both triangles, so the second one is (v2,v1,v3)
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); // v3 reused for both triangles
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); // second triangle keeps the order (v2,v3,v1)
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        const vbool8 flags(0,0,0,0,1,1,1,1); // marks the lanes that receive the second-triangle corrections below
+        /* the 8-wide test comes from the <8,K,filter> instantiation of the base, not from this struct's own <4,K,filter> base */
+        MoellerTrumboreHitM<8> hit;
+        if (SubGridQuadMIntersectorKMoellerTrumboreBase<8,K,filter>::intersect1(ray,k,vtx0,vtx1,vtx2,hit))
+        {
+          vfloat8 U = hit.U, V = hit.V, absDen = hit.absDen;
+#if !defined(EMBREE_BACKFACE_CULLING)
+          hit.U = select(flags,absDen-V,U);
+          hit.V = select(flags,absDen-U,V);
+          hit.vNg *= select(flags,vfloat8(-1.0f),vfloat8(1.0f)); // negate the normal of the flagged lanes
+#else
+          hit.U = select(flags,absDen-U,U);
+          hit.V = select(flags,absDen-V,V);
+#endif
+
+          /* correct U,V interpolation across the entire grid */
+          const vint8 sx((int)subgrid.x());
+          const vint8 sy((int)subgrid.y());
+          const vint8 sx8(sx + vint8(0,1,1,0,0,1,1,0));
+          const vint8 sy8(sy + vint8(0,0,1,1,0,0,1,1));
+          const float inv_resX = rcp((float)((int)g.resX-1));
+          const float inv_resY = rcp((float)((int)g.resY-1));          
+          hit.U = (hit.U + (vfloat8)sx8 * absDen) * inv_resX; // offset is scaled by absDen because U is not yet divided by it
+          hit.V = (hit.V + (vfloat8)sy8 * absDen) * inv_resY;          
+          if (unlikely(epilog(hit.valid,hit)))
+            return true;
+
+        }
+        return false;
+      }
+      /* closest-hit entry point: runs the shared kernel with Intersect1KEpilogMU<8,K,filter> */
+      __forceinline bool intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Intersect1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
+      }
+      /* occlusion entry point: runs the shared kernel with Occluded1KEpilogMU<8,K,filter> */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        return intersect1(ray,k,v0,v1,v2,v3,g,subgrid,Occluded1KEpilogMU<8,K,filter>(ray,k,context,subgrid.geomID(),subgrid.primID()));
+      }
+    };
+
+#endif
+
+
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h
new file mode 100644
index 0000000000..1cd88aa799
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_intersector_pluecker.h
@@ -0,0 +1,508 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid.h"
+#include "quad_intersector_moeller.h"
+#include "quad_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+
+    template<int M>
+    struct SubGridQuadHitPlueckerM
+    {
+      __forceinline SubGridQuadHitPlueckerM() {}
+      /* builds the per-lane hit (vu,vv,vt,vNg) from the raw edge values U,V and their normalizer UVW */
+      __forceinline SubGridQuadHitPlueckerM(const vbool<M>& valid,
+                                            const vfloat<M>& U,
+                                            const vfloat<M>& V,
+                                            const vfloat<M>& UVW,
+                                            const vfloat<M>& t,
+                                            const Vec3vf<M>& Ng,
+                                            const vbool<M>& flags) : valid(valid), vt(t)
+      {
+        const vbool<M> invalid = abs(UVW) < min_rcp_input;
+        const vfloat<M> rcpUVW = select(invalid,vfloat<M>(0.0f),rcp(UVW)); // lanes with |UVW| below min_rcp_input use 0 instead of the reciprocal
+        const vfloat<M> u = min(U * rcpUVW,1.0f); // normalized and clamped to at most 1
+        const vfloat<M> v = min(V * rcpUVW,1.0f);
+        const vfloat<M> u1 = vfloat<M>(1.0f) - u;
+        const vfloat<M> v1 = vfloat<M>(1.0f) - v;
+#if !defined(__AVX__) || defined(EMBREE_BACKFACE_CULLING)
+        vu = select(flags,u1,u); // flagged lanes use the mirrored coordinates 1-u, 1-v
+        vv = select(flags,v1,v);
+        vNg = Vec3vf<M>(Ng.x,Ng.y,Ng.z);
+#else
+        const vfloat<M> flip = select(flags,vfloat<M>(-1.0f),vfloat<M>(1.0f)); // NOTE(review): this branch swaps u/v and negates Ng for flagged lanes, which matches the (v2,v1,v3) order of the 8-wide AVX path - confirm no caller passes flags=true with (v2,v3,v1) in this configuration
+        vv = select(flags,u1,v);
+        vu = select(flags,v1,u);
+        vNg = Vec3vf<M>(flip*Ng.x,flip*Ng.y,flip*Ng.z);
+#endif
+      }
+      /* intentionally empty; presumably required by the epilog interface - TODO confirm */
+      __forceinline void finalize()
+      {
+      }
+      /* per-lane accessors: i is the lane index */
+      __forceinline Vec2f uv(const size_t i)
+      {
+        const float u = vu[i];
+        const float v = vv[i];
+        return Vec2f(u,v);
+      }
+
+      __forceinline float   t(const size_t i) { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+
+    public:
+      vbool<M> valid;
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+    };
+
+    template<int M>
+      __forceinline void interpolateUV(SubGridQuadHitPlueckerM<M> &hit,const GridMesh::Grid &g, const SubGrid& subgrid, const vint<M> &stepX, const vint<M> &stepY) 
+    {
+      /* Rebase the per-quad vu,vv of a hit onto the whole grid: add each lane's quad position */
+      /* (subgrid origin plus the caller-supplied stepX/stepY) and divide by resX-1 / resY-1. */
+      /* vu,vv are used as-is; the hit constructor in this file has already divided them by UVW. */
+      const vint<M> cellX(vint<M>((int)subgrid.x()) + stepX);
+      const vint<M> cellY(vint<M>((int)subgrid.y()) + stepY);
+      const float rcpCellsX = rcp((float)((int)g.resX-1));
+      const float rcpCellsY = rcp((float)((int)g.resY-1));
+      hit.vu = (hit.vu + vfloat<M>(cellX)) * rcpCellsX;
+      hit.vv = (hit.vv + vfloat<M>(cellY)) * rcpCellsY;
+    }
+
+    template<int M>
+    __forceinline static bool intersectPluecker(Ray& ray,
+                                                const Vec3vf<M>& tri_v0,
+                                                const Vec3vf<M>& tri_v1,
+                                                const Vec3vf<M>& tri_v2,
+                                                const vbool<M>& flags,
+                                                SubGridQuadHitPlueckerM<M> &hit)
+    {
+      /* Tests one ray against M triangles; on success constructs the result into hit and returns true */
+      const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+      const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+      const Vec3vf<M> v0 = tri_v0-O; // vertices relative to the ray origin
+      const Vec3vf<M> v1 = tri_v1-O;
+      const Vec3vf<M> v2 = tri_v2-O;
+
+      /* calculate triangle edges */
+      const Vec3vf<M> e0 = v2-v0;
+      const Vec3vf<M> e1 = v0-v1;
+      const Vec3vf<M> e2 = v1-v2;
+
+      /* perform edge tests */
+      const vfloat<M> U = dot(cross(e0,v2+v0),D);
+      const vfloat<M> V = dot(cross(e1,v0+v1),D);
+      const vfloat<M> W = dot(cross(e2,v1+v2),D);
+      const vfloat<M> UVW = U+V+W;
+      const vfloat<M> eps = float(ulp)*abs(UVW); // tolerance proportional to |UVW|
+#if defined(EMBREE_BACKFACE_CULLING)
+      vbool<M> valid = max(U,V,W) <= eps;
+#else
+      vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); // accept either orientation: all three non-negative or all non-positive
+#endif
+      if (unlikely(none(valid))) return false;
+
+      /* calculate geometry normal and denominator */
+      const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+      const vfloat<M> den = twice(dot(Ng,D));
+
+      /* perform depth test (both comparisons parenthesized explicitly, as in the Moeller intersectors) */
+      const vfloat<M> T = twice(dot(v0,Ng));
+      const vfloat<M> t = rcp(den)*T; // lanes with den == 0 are masked out two lines below
+      valid &= (vfloat<M>(ray.tnear()) <= t) & (t <= vfloat<M>(ray.tfar));
+      valid &= den != vfloat<M>(zero);
+      if (unlikely(none(valid))) return false;
+
+      /* update hit information */
+      new (&hit) SubGridQuadHitPlueckerM<M>(valid,U,V,UVW,t,Ng,flags); // placement-new overwrites the caller's hit object in place
+      return true;
+    }
+
+    template<int M, bool filter>
+      struct SubGridQuadMIntersector1Pluecker;
+
+    template<int M, bool filter>
+      struct SubGridQuadMIntersector1Pluecker
+      {
+        __forceinline SubGridQuadMIntersector1Pluecker() {}
+        /* stateless: the ray and pointer arguments are accepted but not used */
+        __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+        /* closest-hit query: tests both triangles of M quads and hands every hit to Intersect1EpilogMU */
+        __forceinline void intersect(RayHit& ray, IntersectContext* context,
+                                     const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                     const GridMesh::Grid &g, const SubGrid& subgrid) const
+        {
+          SubGridQuadHitPlueckerM<M> quadHit;
+          Intersect1EpilogMU<M,filter> report(ray,context,subgrid.geomID(),subgrid.primID());
+          const vint<M> stepX(0,1,1,0), stepY(0,0,1,1); // per-lane quad offsets inside the subgrid
+          /* first triangle of each quad: (v0,v1,v3), unflagged */
+          if (intersectPluecker(ray,v0,v1,v3,vbool<M>(false),quadHit))
+          {
+            interpolateUV<M>(quadHit,g,subgrid,stepX,stepY);
+            report(quadHit.valid,quadHit);
+          }
+
+          /* second triangle of each quad: (v2,v3,v1), flagged so the hit constructor adjusts u,v */
+          if (intersectPluecker(ray,v2,v3,v1,vbool<M>(true),quadHit))
+          {
+            interpolateUV<M>(quadHit,g,subgrid,stepX,stepY);
+            report(quadHit.valid,quadHit);
+          }
+        }
+        /* occlusion query: same two triangle tests, returns true as soon as Occluded1EpilogMU accepts a hit */
+        __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3,
+                                    const GridMesh::Grid &g, const SubGrid& subgrid) const
+        {
+          SubGridQuadHitPlueckerM<M> quadHit;
+          Occluded1EpilogMU<M,filter> blocked(ray,context,subgrid.geomID(),subgrid.primID());
+          const vint<M> stepX(0,1,1,0), stepY(0,0,1,1); // per-lane quad offsets inside the subgrid
+          /* first triangle of each quad: (v0,v1,v3), unflagged */
+          if (intersectPluecker(ray,v0,v1,v3,vbool<M>(false),quadHit))
+          {
+            interpolateUV<M>(quadHit,g,subgrid,stepX,stepY);
+            if (blocked(quadHit.valid,quadHit))
+              return true;
+          }
+
+          /* second triangle of each quad: (v2,v3,v1), flagged so the hit constructor adjusts u,v */
+          if (intersectPluecker(ray,v2,v3,v1,vbool<M>(true),quadHit))
+          {
+            interpolateUV<M>(quadHit,g,subgrid,stepX,stepY);
+            if (blocked(quadHit.valid,quadHit))
+              return true;
+          }
+
+          return false;
+        }
+      };
+
+#if defined (__AVX__)
+
+    /*! Intersects 4 quads with 1 ray using AVX */
+    template<bool filter>
+      struct SubGridQuadMIntersector1Pluecker<4,filter>
+    {
+      __forceinline SubGridQuadMIntersector1Pluecker() {}
+
+      __forceinline SubGridQuadMIntersector1Pluecker(const Ray& ray, const void* ptr) {}
+      /* shared kernel: tests both triangles of 4 quads in one 8-wide pass; returns true only when the epilog returns true */
+      template<typename Epilog>
+        __forceinline bool intersect(Ray& ray, const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, const GridMesh::Grid &g, const SubGrid& subgrid, const Epilog& epilog) const
+      {
+        const Vec3vf8 vtx0(vfloat8(v0.x,v2.x),vfloat8(v0.y,v2.y),vfloat8(v0.z,v2.z)); // v0 of the first triangle packed with v2 of the second
+#if !defined(EMBREE_BACKFACE_CULLING)
+        const Vec3vf8 vtx1(vfloat8(v1.x),vfloat8(v1.y),vfloat8(v1.z)); // v1 reused for both triangles, so the second one is (v2,v1,v3)
+        const Vec3vf8 vtx2(vfloat8(v3.x),vfloat8(v3.y),vfloat8(v3.z)); // v3 reused for both triangles
+#else
+        const Vec3vf8 vtx1(vfloat8(v1.x,v3.x),vfloat8(v1.y,v3.y),vfloat8(v1.z,v3.z)); // second triangle keeps the order (v2,v3,v1)
+        const Vec3vf8 vtx2(vfloat8(v3.x,v1.x),vfloat8(v3.y,v1.y),vfloat8(v3.z,v1.z));
+#endif
+        SubGridQuadHitPlueckerM<8> hit;
+        const vbool8 flags(0,0,0,0,1,1,1,1); // marks the second-triangle lanes; the hit constructor applies the u,v (and normal) fix-up for them
+        if (unlikely(intersectPluecker(ray,vtx0,vtx1,vtx2,flags,hit)))
+        {
+          /* correct U,V interpolation across the entire grid */
+          interpolateUV<8>(hit,g,subgrid,vint<8>(0,1,1,0,0,1,1,0),vint<8>(0,0,1,1,0,0,1,1));            
+          if (unlikely(epilog(hit.valid,hit)))
+            return true;
+        }
+        return false;
+      }
+      /* closest-hit entry point: runs the shared kernel with Intersect1EpilogMU<8,filter> */
+      __forceinline bool intersect(RayHit& ray, IntersectContext* context,
+                                   const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                   const GridMesh::Grid &g, const SubGrid& subgrid) const
+      {
+          return intersect(ray,v0,v1,v2,v3,g,subgrid,Intersect1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+      }
+      /* occlusion entry point: runs the shared kernel with Occluded1EpilogMU<8,filter> */
+      __forceinline bool occluded(Ray& ray, IntersectContext* context,
+                                  const Vec3vf4& v0, const Vec3vf4& v1, const Vec3vf4& v2, const Vec3vf4& v3, 
+                                  const GridMesh::Grid &g, const SubGrid& subgrid) const
+      {
+          return intersect(ray,v0,v1,v2,v3,g,subgrid,Occluded1EpilogMU<8,filter>(ray,context,subgrid.geomID(),subgrid.primID()));
+      }
+    };
+
+#endif
+
+
+    /* ----------------------------- */
+    /* -- ray packet intersectors -- */
+    /* ----------------------------- */
+
+    /*! Hit record for the K-wide packet Pluecker path: stores the raw edge values and computes (u,v,t,Ng) in operator(). */
+    template<int K>
+      struct SubGridQuadHitPlueckerK
+      {
+        __forceinline SubGridQuadHitPlueckerK(const vfloat<K>& U,
+                                              const vfloat<K>& V,
+                                              const vfloat<K>& UVW,
+                                              const vfloat<K>& t,
+                                              const Vec3vf<K>& Ng,
+                                              const vbool<K>& flags,
+                                              const GridMesh::Grid &g, 
+                                              const SubGrid& subgrid,
+                                              const unsigned int i)
+        : U(U), V(V), UVW(UVW), t(t), flags(flags), tri_Ng(Ng), g(g), subgrid(subgrid), i(i) {}
+        /* computes the final hit values from the stored raw ones and returns them as the tuple (u,v,t,Ng) */
+        __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+        {
+          const vbool<K> invalid = abs(UVW) < min_rcp_input;
+          const vfloat<K> rcpUVW = select(invalid,vfloat<K>(0.0f),rcp(UVW)); // lanes with |UVW| below min_rcp_input use 0 instead of the reciprocal
+          const vfloat<K> u0 = min(U * rcpUVW,1.0f); // normalized and clamped to at most 1
+          const vfloat<K> v0 = min(V * rcpUVW,1.0f);
+          const vfloat<K> u1 = vfloat<K>(1.0f) - u0;
+          const vfloat<K> v1 = vfloat<K>(1.0f) - v0;
+          const vfloat<K> uu = select(flags,u1,u0); // flagged lanes use the mirrored coordinate 1-u0
+          const vfloat<K> vv = select(flags,v1,v0);
+          const unsigned int sx = subgrid.x() + (unsigned int)(i % 2); // i selects the quad: low bit steps in x
+          const unsigned int sy = subgrid.y() + (unsigned int)(i >>1); // remaining bits step in y
+          const float inv_resX = rcp((float)(int)(g.resX-1));
+          const float inv_resY = rcp((float)(int)(g.resY-1));
+          const vfloat<K> u = (uu + (float)(int)sx) * inv_resX; // rebase onto the whole grid
+          const vfloat<K> v = (vv + (float)(int)sy) * inv_resY;
+          const Vec3vf<K> Ng(tri_Ng.x,tri_Ng.y,tri_Ng.z);
+          return std::make_tuple(u,v,t,Ng);
+        }
+
+      private:
+        const vfloat<K> U;
+        const vfloat<K> V;
+        const vfloat<K> UVW;
+        const vfloat<K> t;
+        const vbool<K> flags;
+        const Vec3vf<K> tri_Ng;
+        /* held by reference: the grid and subgrid passed to the constructor must outlive this object */
+        const GridMesh::Grid &g;
+        const SubGrid& subgrid;
+        const size_t i;
+      };
+
+
+    template<int M, int K, bool filter>
+      struct SubGridQuadMIntersectorKPlueckerBase // edge-function tests for subgrid quads: K-ray packets (intersectK) and one packet ray vs. M triangles (intersect1)
+      {
+        __forceinline SubGridQuadMIntersectorKPlueckerBase(const vbool<K>& valid, const RayK<K>& ray) {} // nothing is precalculated per packet
+        /*! Tests the active rays against one triangle; tri_Ng is only forwarded to the hit record, the depth test uses a locally computed normal. */
+        template<typename Epilog>
+        __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                          RayK<K>& ray,
+                                          const Vec3vf<K>& tri_v0,
+                                          const Vec3vf<K>& tri_v1,
+                                          const Vec3vf<K>& tri_v2,
+                                          const Vec3vf<K>& tri_Ng,
+                                          const vbool<K>& flags,
+                                          const GridMesh::Grid &g, 
+                                          const SubGrid &subgrid,
+                                          const unsigned int i,
+                                          const Epilog& epilog) const
+        { 
+          /* rays are culled on a local copy of the input mask */
+          /* calculate vertices relative to ray origin */
+          vbool<K> valid = valid0;
+          const Vec3vf<K> O = ray.org;
+          const Vec3vf<K> D = ray.dir;
+          const Vec3vf<K> v0 = tri_v0-O;
+          const Vec3vf<K> v1 = tri_v1-O;
+          const Vec3vf<K> v2 = tri_v2-O;
+          
+          /* calculate triangle edges */
+          const Vec3vf<K> e0 = v2-v0;
+          const Vec3vf<K> e1 = v0-v1;
+          const Vec3vf<K> e2 = v1-v2;
+           
+          /* perform edge tests */
+          const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D);
+          const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D);
+          const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D);
+          const vfloat<K> UVW = U+V+W;
+          const vfloat<K> eps = float(ulp)*abs(UVW); // tolerance relative to |U+V+W| for the sign tests below
+#if defined(EMBREE_BACKFACE_CULLING)
+          valid &= max(U,V,W) <= eps;
+#else
+          valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+          if (unlikely(none(valid))) return false;
+          
+          /* calculate geometry normal and denominator */
+          const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2);
+          const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D));
+
+          /* perform depth test */
+          const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng)));
+          const vfloat<K> t = rcp(den)*T; // lanes with den == 0 are removed from the mask two lines below
+          valid &= ray.tnear() <= t & t <= ray.tfar;
+          valid &= den != vfloat<K>(zero);
+          if (unlikely(none(valid))) return false;
+          
+          /* calculate hit information */
+          SubGridQuadHitPlueckerK<K> hit(U,V,UVW,t,tri_Ng,flags,g,subgrid,i);
+          return epilog(valid,hit);
+        }
+        /*! Overload that derives the normal as cross(v2-v0,v0-v1) and forwards to the test above. */
+        template<typename Epilog>
+        __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                          RayK<K>& ray,
+                                          const Vec3vf<K>& v0,
+                                          const Vec3vf<K>& v1,
+                                          const Vec3vf<K>& v2,
+                                          const vbool<K>& flags,
+                                          const GridMesh::Grid &g, 
+                                          const SubGrid &subgrid,
+                                          const unsigned int i,
+                                          const Epilog& epilog) const
+        {
+          const Vec3vf<K> e1 = v0-v1;
+          const Vec3vf<K> e2 = v2-v0;
+          const Vec3vf<K> Ng = cross(e2,e1);
+          return intersectK(valid0,ray,v0,v1,v2,Ng,flags,g,subgrid,i,epilog);
+        }
+        /*! Tests a quad as the two triangles (v0,v1,v3) and (v2,v3,v1); the second one is passed flags = true. */
+        template<typename Epilog>
+        __forceinline bool intersectK(const vbool<K>& valid0, 
+                                      RayK<K>& ray,
+                                      const Vec3vf<K>& v0,
+                                      const Vec3vf<K>& v1,
+                                      const Vec3vf<K>& v2,
+                                      const Vec3vf<K>& v3,
+                                      const GridMesh::Grid &g, 
+                                      const SubGrid &subgrid,
+                                      const unsigned int i,
+                                      const Epilog& epilog) const
+        {
+          intersectK(valid0,ray,v0,v1,v3,vbool<K>(false),g,subgrid,i,epilog);
+          if (none(valid0)) return true; // NOTE(review): the per-triangle result is discarded and valid0 is a const reference; this fires only for an empty mask or if the epilog clears it through an alias - confirm
+          intersectK(valid0,ray,v2,v3,v1,vbool<K>(true ),g,subgrid,i,epilog);
+          return none(valid0);
+        }
+        /*! Tests ray k of the packet against M triangles; on success constructs the result into hit and returns true. tri_Ng is only stored in hit. */
+        static  __forceinline bool intersect1(RayK<K>& ray,
+                                              size_t k,
+                                              const Vec3vf<M>& tri_v0,
+                                              const Vec3vf<M>& tri_v1,
+                                              const Vec3vf<M>& tri_v2,
+                                              const Vec3vf<M>& tri_Ng,
+                                              const vbool<M>& flags,
+                                              SubGridQuadHitPlueckerM<M> &hit)
+        {
+          /* calculate vertices relative to ray origin */
+          const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+          const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+          const Vec3vf<M> v0 = tri_v0-O;
+          const Vec3vf<M> v1 = tri_v1-O;
+          const Vec3vf<M> v2 = tri_v2-O;
+          
+          /* calculate triangle edges */
+          const Vec3vf<M> e0 = v2-v0;
+          const Vec3vf<M> e1 = v0-v1;
+          const Vec3vf<M> e2 = v1-v2;
+          
+          /* perform edge tests */
+          const vfloat<M> U = dot(cross(e0,v2+v0),D);
+          const vfloat<M> V = dot(cross(e1,v0+v1),D);
+          const vfloat<M> W = dot(cross(e2,v1+v2),D);
+          const vfloat<M> UVW = U+V+W;
+          const vfloat<M> eps = float(ulp)*abs(UVW); // tolerance relative to |U+V+W| for the sign tests below
+#if defined(EMBREE_BACKFACE_CULLING)
+          vbool<M> valid = max(U,V,W) <= eps ;
+#else
+          vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps);
+#endif
+          if (unlikely(none(valid))) return false;
+          
+          /* calculate geometry normal and denominator */
+          const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+          const vfloat<M> den = twice(dot(Ng,D));
+
+          /* perform depth test */
+          const vfloat<M> T = twice(dot(v0,Ng));
+          const vfloat<M> t = rcp(den)*T;
+          valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]);
+          if (unlikely(none(valid))) return false;
+          
+          /* avoid division by 0 */
+          valid &= den != vfloat<M>(zero);
+          if (unlikely(none(valid))) return false;
+          
+          /* update hit information */
+          new (&hit) SubGridQuadHitPlueckerM<M>(valid,U,V,UVW,t,tri_Ng,flags);
+          return true;
+        }
+        /*! Overload that derives the normal as cross(v2-v0,v0-v1) and forwards to the test above. */
+        static __forceinline bool intersect1(RayK<K>& ray,
+                                             size_t k,
+                                             const Vec3vf<M>& v0,
+                                             const Vec3vf<M>& v1,
+                                             const Vec3vf<M>& v2,
+                                             const vbool<M>& flags,
+                                             SubGridQuadHitPlueckerM<M> &hit)
+        {
+          const Vec3vf<M> e1 = v0-v1;
+          const Vec3vf<M> e2 = v2-v0;
+          const Vec3vf<M> Ng = cross(e2,e1); // FIXME: optimize!!!
+          return intersect1(ray,k,v0,v1,v2,Ng,flags,hit);
+        }
+
+      };
+
+    template<int M, int K, bool filter>
+      struct SubGridQuadMIntersectorKPluecker : public SubGridQuadMIntersectorKPlueckerBase<M,K,filter>
+    {
+      __forceinline SubGridQuadMIntersectorKPluecker(const vbool<K>& valid, const RayK<K>& ray)
+        : SubGridQuadMIntersectorKPlueckerBase<M,K,filter>(valid,ray) {}
+
+      /*! Intersects ray k with the quads (v0,v1,v2,v3), each split into the triangles (v0,v1,v3) and (v2,v3,v1). */
+      __forceinline void intersect1(RayHitK<K>& ray, size_t k, IntersectContext* context,
+                                    const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        Intersect1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+        SubGridQuadHitPlueckerM<M> hit; // M lanes, consistent with interpolateUV<M> and the inherited base class
+        /* first triangle of each quad (flags = false) */
+        if (SubGridQuadMIntersectorKPlueckerBase<M,K,filter>::intersect1(ray,k,v0,v1,v3,vbool<M>(false),hit))
+        {
+          interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+          epilog(hit.valid,hit);
+        }
+        /* second triangle of each quad (flags = true) */
+        if (SubGridQuadMIntersectorKPlueckerBase<M,K,filter>::intersect1(ray,k,v2,v3,v1,vbool<M>(true),hit))
+        {
+          interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+          epilog(hit.valid,hit);
+        }
+      }
+
+      /*! Occlusion variant of intersect1(): returns true as soon as the epilog accepts a hit on either triangle. */
+      __forceinline bool occluded1(RayK<K>& ray, size_t k, IntersectContext* context,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const Vec3vf<M>& v3, const GridMesh::Grid &g, const SubGrid &subgrid) const
+      {
+        Occluded1KEpilogMU<M,K,filter> epilog(ray,k,context,subgrid.geomID(),subgrid.primID());
+        SubGridQuadHitPlueckerM<M> hit; // M lanes, consistent with interpolateUV<M> and the inherited base class
+        if (SubGridQuadMIntersectorKPlueckerBase<M,K,filter>::intersect1(ray,k,v0,v1,v3,vbool<M>(false),hit))
+        {
+          interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+          if (epilog(hit.valid,hit)) return true;
+        }
+
+        if (SubGridQuadMIntersectorKPlueckerBase<M,K,filter>::intersect1(ray,k,v2,v3,v1,vbool<M>(true),hit))
+        {
+          interpolateUV<M>(hit,g,subgrid,vint<M>(0,1,1,0),vint<M>(0,0,1,1));
+          if (epilog(hit.valid,hit)) return true;
+        }
+        return false;
+      }
+    };
+
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h
new file mode 100644
index 0000000000..400a88b985
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/subgrid_mb_intersector.h
@@ -0,0 +1,236 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "subgrid_intersector.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int N, bool filter>
+    struct SubGridMBIntersector1Pluecker
+    {
+      using Primitive = SubGridMBQBVHN<N>;
+      using Precalculations = SubGridQuadMIntersector1Pluecker<4,filter>;
+
+      /*! Intersects the ray with one subgrid; vertices are gathered for the ray's time segment first. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* const geom = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid& grid = geom->grid(subgrid.primID());
+        float tfrac;
+        const int tseg = geom->timeSegment(ray.time(), tfrac);
+        Vec3vf4 p0,p1,p2,p3;
+        subgrid.gatherMB(p0,p1,p2,p3,context->scene,tseg,tfrac);
+        pre.intersect(ray,context,p0,p1,p2,p3,grid,subgrid);
+      }
+
+      /*! Occlusion variant: same vertex gather, but forwards to pre.occluded and returns its result. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* const geom = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid& grid = geom->grid(subgrid.primID());
+        float tfrac;
+        const int tseg = geom->timeSegment(ray.time(), tfrac);
+        Vec3vf4 p0,p1,p2,p3;
+        subgrid.gatherMB(p0,p1,p2,p3,context->scene,tseg,tfrac);
+        return pre.occluded(ray,context,p0,p1,p2,p3,grid,subgrid);
+      }
+
+      /*! Point query on a single subgrid, delegated to PrimitivePointQuery1. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const SubGrid& subgrid) { return PrimitivePointQuery1<Primitive>::pointQuery(query, context, subgrid); }
+
+      /*! Visits num primitives: box-tests each quantized node at the adjusted ray time and intersects the subgrids that pass. */
+      template<int Nx, bool robust>
+        static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> nodeIsec;
+        for (const Primitive* leaf=prim; leaf<prim+num; leaf++)
+        {
+          const float time = leaf->adjustTime(ray.time());
+          assert(time <= 1.0f);
+          vfloat<Nx> dist;
+          size_t hits = nodeIsec.intersect(&leaf->qnode,tray,time,dist); 
+#if defined(__AVX__)
+          STAT3(normal.trav_hit_boxes[popcnt(hits)],1,1,1);
+#endif
+          while (hits)
+          {
+            const size_t slot = bscf(hits); 
+            if (unlikely(dist[slot] > ray.tfar)) continue;
+            intersect(pre,ray,context,leaf->subgrid(slot));
+          }
+        }
+      }
+
+      /*! Occlusion traversal over num primitives; stops at the first subgrid reported as occluding. */
+      template<int Nx, bool robust>        
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+      {
+        BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> nodeIsec;
+        for (const Primitive* leaf=prim; leaf<prim+num; leaf++)
+        {
+          const float time = leaf->adjustTime(ray.time());
+          assert(time <= 1.0f);
+          vfloat<Nx> dist;
+          size_t hits = nodeIsec.intersect(&leaf->qnode,tray,time,dist); 
+          while (hits)
+          {
+            const size_t slot = bscf(hits); 
+            if (occluded(pre,ray,context,leaf->subgrid(slot))) return true;
+          }
+        }
+        return false;
+      }
+
+      /*! Point queries over a primitive list are not implemented here; asserts and returns false. */
+      static __forceinline bool pointQuery(const Accel::Intersectors* This, PointQuery* query, PointQueryContext* context, const Primitive* prim, size_t num, const TravPointQuery<N> &tquery, size_t& lazy_node)
+      {
+        assert(false && "not implemented");
+        return false;
+      }
+    };
+
+
+    /*! K-ray packet intersector for time-interpolated (MB) subgrid primitives of type SubGridMBQBVHN<N>. */
+    template<int N, int K, bool filter>
+    struct SubGridMBIntersectorKPluecker
+    {
+      typedef SubGridMBQBVHN<N> Primitive;
+      typedef SubGridQuadMIntersectorKPluecker<4,K,filter> Precalculations;
+      /*! Intersects every active ray of the packet with one subgrid, one ray at a time. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        size_t m_valid = movemask(valid_i);
+        while(m_valid)
+        {
+          size_t ID = bscf(m_valid);
+          intersect(pre,ray,ID,context,subgrid);
+        }
+      }
+      /*! Occlusion test per active ray; returns !valid0, i.e. every lane that is occluded or was inactive on entry. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const SubGrid& subgrid)
+      {
+        vbool<K> valid0 = valid_i;
+        size_t m_valid = movemask(valid_i);
+        while(m_valid)
+        {
+          size_t ID = bscf(m_valid);
+          if (occluded(pre,ray,ID,context,subgrid))
+            clear(valid0,ID);
+        }
+        return !valid0;
+      }
+      /*! Intersects ray k with one subgrid; vertices are gathered for that ray's time segment. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+        vfloat<K> ftime;
+        const vint<K> itime = mesh->timeSegment(ray.time(), ftime);
+        Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]);
+        pre.intersect1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+      /*! Occlusion variant for ray k: same gather, returns the result of pre.occluded1. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const SubGrid& subgrid)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const GridMesh* mesh    = context->scene->get<GridMesh>(subgrid.geomID());
+        const GridMesh::Grid &g = mesh->grid(subgrid.primID());
+
+        vfloat<K> ftime;
+        const vint<K> itime = mesh->timeSegment(ray.time(), ftime);
+        Vec3vf4 v0,v1,v2,v3; subgrid.gatherMB(v0,v1,v2,v3,context->scene,itime[k],ftime[k]);
+        return pre.occluded1(ray,k,context,v0,v1,v2,v3,g,subgrid);
+      }
+        /*! Packet traversal: for each valid node slot, box-test all rays and intersect the subgrid if any active ray hits. */
+        template<bool robust>
+          static __forceinline void intersect(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            const vfloat<K> time = prim[j].adjustTime(ray.time());
+
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue;
+              intersect(valid,pre,ray,context,prim[j].subgrid(i));
+            }
+          }
+        }
+        /*! Packet occlusion traversal; valid0 tracks rays still unoccluded and the result is its complement. */
+        template<bool robust>        
+        static __forceinline vbool<K> occluded(const vbool<K>& valid, const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive* prim, size_t num, const TravRayK<K, robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersectorK<N,K,robust> isecK;
+
+          vbool<K> valid0 = valid;
+          for (size_t j=0;j<num;j++)
+          {
+            size_t m_valid = movemask(prim[j].qnode.validMask());
+            const vfloat<K> time = prim[j].adjustTime(ray.time());
+            vfloat<K> dist;
+            while(m_valid)
+            {
+              const size_t i = bscf(m_valid);
+              if (none(valid0 & isecK.intersectK(&prim[j].qnode,i,tray,time,dist))) continue;
+              valid0 &= !occluded(valid0,pre,ray,context,prim[j].subgrid(i));
+              if (none(valid0)) break;
+            }
+          }
+          return !valid0;
+        }
+        /*! Single-ray traversal for ray k of the packet; subgrids whose box distance exceeds ray.tfar[k] are skipped. */
+        template<int Nx, bool robust>        
+          static __forceinline void intersect(const Accel::Intersectors* This, Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<Nx> dist; // Nx lanes, as SubGridMBIntersector1Pluecker uses with the same node intersector
+            const float time = prim[i].adjustTime(ray.time()[k]);
+            assert(time <= 1.0f);
+
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              if (unlikely(dist[ID] > ray.tfar[k])) continue;
+              intersect(pre,ray,k,context,prim[i].subgrid(ID));
+            }
+          }
+        }
+        /*! Single-ray occlusion traversal for ray k; returns true at the first subgrid reported as occluding. */
+        template<int Nx, bool robust>
+        static __forceinline bool occluded(const Accel::Intersectors* This, Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive* prim, size_t num, const TravRay<N,Nx,robust> &tray, size_t& lazy_node)
+        {
+          BVHNQuantizedBaseNodeIntersector1<N,Nx,robust> isec1;
+          
+          for (size_t i=0;i<num;i++)
+          {
+            vfloat<Nx> dist; // Nx lanes, as SubGridMBIntersector1Pluecker uses with the same node intersector
+            const float time = prim[i].adjustTime(ray.time()[k]);
+            assert(time <= 1.0f);
+
+            size_t mask = isec1.intersect(&prim[i].qnode,tray,time,dist); 
+            while(mask != 0)
+            {
+              const size_t ID = bscf(mask); 
+              if (occluded(pre,ray,k,context,prim[i].subgrid(ID)))
+                return true;
+            }
+          }
+          return false;
+        }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle.h b/thirdparty/embree-aarch64/kernels/geometry/triangle.h
new file mode 100644
index 0000000000..0dedf6dc4c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle.h
@@ -0,0 +1,162 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  /* Precalculated representation for M triangles. Stores for each
+     triangle a base vertex and two edges (no geometry normal is
+     kept), plus the geometry and primitive IDs */
+  template<int M>
+  struct TriangleM
+  {
+  public:
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+    
+  public:
+
+    /* Returns maximum number of stored triangles */
+    static __forceinline size_t max_size() { return M; }
+    
+    /* Returns required number of primitive blocks for N primitives */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+  public:
+
+    /* Default constructor */
+    __forceinline TriangleM() {}
+
+    /* Construction from vertices and IDs; edges are stored as e1 = v0-v1 and e2 = v2-v0 */
+    __forceinline TriangleM(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const vuint<M>& geomIDs, const vuint<M>& primIDs)
+      : v0(v0), e1(v0-v1), e2(v2-v0), geomIDs(geomIDs), primIDs(primIDs) {}
+
+    /* Returns a mask that tells which triangles are valid (geometry ID different from -1) */
+    __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+    /* Returns true if the specified triangle is valid */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != (unsigned int)-1; }
+    
+    /* Returns the number of stored triangles, i.e. the index of the first invalid lane */
+    __forceinline size_t size() const { return bsf(~movemask(valid()));  }
+
+    /* Returns the geometry IDs */
+    __forceinline       vuint<M>& geomID()       { return geomIDs;  }
+    __forceinline const vuint<M>& geomID() const { return geomIDs;  }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the merged bounds of all valid triangles; invalid lanes contribute +inf/-inf */
+    __forceinline BBox3fa bounds() const 
+    {
+      Vec3vf<M> p0 = v0;
+      Vec3vf<M> p1 = v0-e1;
+      Vec3vf<M> p2 = v0+e2;
+      Vec3vf<M> lower = min(p0,p1,p2);
+      Vec3vf<M> upper = max(p0,p1,p2);
+      vbool<M> mask = valid();
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+                     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+
+    /* Non temporal store */
+    __forceinline static void store_nt(TriangleM* dst, const TriangleM& src)
+    {
+      vfloat<M>::store_nt(&dst->v0.x,src.v0.x);
+      vfloat<M>::store_nt(&dst->v0.y,src.v0.y);
+      vfloat<M>::store_nt(&dst->v0.z,src.v0.z);
+      vfloat<M>::store_nt(&dst->e1.x,src.e1.x);
+      vfloat<M>::store_nt(&dst->e1.y,src.e1.y);
+      vfloat<M>::store_nt(&dst->e1.z,src.e1.z);
+      vfloat<M>::store_nt(&dst->e2.x,src.e2.x);
+      vfloat<M>::store_nt(&dst->e2.y,src.e2.y);
+      vfloat<M>::store_nt(&dst->e2.z,src.e2.z);
+      vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+      vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+    }
+
+    /* Fill triangle from triangle list; consumes up to M entries of prims and advances begin */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+      
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+        const PrimRef& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+        const Vec3fa& p0 = mesh->vertex(tri.v[0]);
+        const Vec3fa& p1 = mesh->vertex(tri.v[1]);
+        const Vec3fa& p2 = mesh->vertex(tri.v[2]);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+      }
+      TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID));
+    }
+
+    /* Updates the primitive: re-reads the vertices from mesh and returns their bounds */
+    __forceinline BBox3fa update(TriangleMesh* mesh)
+    {
+      BBox3fa bounds = empty;
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> v0 = zero, v1 = zero, v2 = zero;
+
+      for (size_t i=0; i<M; i++)
+      {
+        const unsigned geomId = geomID(i);
+        if (unlikely(geomId == (unsigned)-1)) break; // the first invalid slot ends the block
+        const unsigned primId = primID(i);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primId);
+        const Vec3fa p0 = mesh->vertex(tri.v[0]);
+        const Vec3fa p1 = mesh->vertex(tri.v[1]);
+        const Vec3fa p2 = mesh->vertex(tri.v[2]);
+        bounds.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2)));
+        vgeomID [i] = geomId;
+        vprimID [i] = primId;
+        v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
+        v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
+        v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
+      }
+      TriangleM::store_nt(this,TriangleM(v0,v1,v2,vgeomID,vprimID));
+      return bounds;
+    }
+
+  public:
+    Vec3vf<M> v0;      // base vertex of the triangles
+    Vec3vf<M> e1;      // 1st edge of the triangles (v0-v1)
+    Vec3vf<M> e2;      // 2nd edge of the triangles (v2-v0)
+  private:
+    vuint<M> geomIDs; // geometry IDs
+    vuint<M> primIDs; // primitive IDs
+  };
+
+  template<int M>
+  typename TriangleM<M>::Type TriangleM<M>::type;
+
+  typedef TriangleM<4> Triangle4;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h
new file mode 100644
index 0000000000..125a42c5fe
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector.h
@@ -0,0 +1,96 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "triangle_intersector_moeller.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Single-ray intersector for a block of M triangles, built on MoellerTrumboreIntersector1<Mx>. */
+    template<int M, int Mx, bool filter>
+    struct TriangleMIntersector1Moeller
+    {
+      using Primitive = TriangleM<M>;
+      using Precalculations = MoellerTrumboreIntersector1<Mx>;
+
+      /*! Runs the edge-based test on the M triangles; hits are handled by an Intersect1EpilogM. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Intersect1EpilogM<M,Mx,filter> epilog(ray,context,tri.geomID(),tri.primID());
+        pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,epilog);
+      }
+
+      /*! Occlusion variant: same test with an Occluded1EpilogM; returns what intersectEdge returns. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Occluded1EpilogM<M,Mx,filter> epilog(ray,context,tri.geomID(),tri.primID());
+        return pre.intersectEdge(ray,tri.v0,tri.e1,tri.e2,epilog);
+      }
+
+      /*! Point query on the triangle block, delegated to PrimitivePointQuery1. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri) { return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri); }
+
+    };
+
+    /*! Packet intersector: K rays against a block of M triangles, built on MoellerTrumboreIntersectorK<Mx,K>. */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMIntersectorKMoeller
+    {
+      using Primitive = TriangleM<M>;
+      using Precalculations = MoellerTrumboreIntersectorK<Mx,K>;
+      /*! Tests the K rays against each valid triangle in turn, stopping at the first invalid slot. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT_USER(0,TriangleM<M>::max_size());
+        for (size_t i=0; i<TriangleM<M>::max_size() && tri.valid(i); i++)
+        {
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> base  = broadcast<vfloat<K>>(tri.v0,i);
+          const Vec3vf<K> edge1 = broadcast<vfloat<K>>(tri.e1,i);
+          const Vec3vf<K> edge2 = broadcast<vfloat<K>>(tri.e2,i);
+          IntersectKEpilogM<M,K,filter> epilog(ray,context,tri.geomID(),tri.primID(),i);
+          pre.intersectEdgeK(valid_i,ray,base,edge1,edge2,epilog);
+        }
+      }
+
+      /*! Occlusion test for K rays; the epilog receives the running mask and the result is its complement. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        vbool<K> active = valid_i;
+        for (size_t i=0; i<TriangleM<M>::max_size() && tri.valid(i); i++)
+        {
+          STAT3(shadow.trav_prims,1,popcnt(active),K);
+          const Vec3vf<K> base  = broadcast<vfloat<K>>(tri.v0,i);
+          const Vec3vf<K> edge1 = broadcast<vfloat<K>>(tri.e1,i);
+          const Vec3vf<K> edge2 = broadcast<vfloat<K>>(tri.e2,i);
+          OccludedKEpilogM<M,K,filter> epilog(active,ray,context,tri.geomID(),tri.primID(),i);
+          pre.intersectEdgeK(active,ray,base,edge1,edge2,epilog);
+          if (none(active)) break;
+        }
+        return !active;
+      }
+
+      /*! Tests ray k of the packet against all M triangles at once and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Intersect1KEpilogM<M,Mx,K,filter> epilog(ray,k,context,tri.geomID(),tri.primID());
+        pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,epilog);
+      }
+
+      /*! Occlusion variant for ray k; returns what intersectEdge returns. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleM<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Occluded1KEpilogM<M,Mx,K,filter> epilog(ray,k,context,tri.geomID(),tri.primID());
+        return pre.intersectEdge(ray,k,tri.v0,tri.e1,tri.e2,epilog);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h
new file mode 100644
index 0000000000..b5a8519236
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_moeller.h
@@ -0,0 +1,403 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "intersector_epilog.h"
+
+/*! This intersector implements a modified version of the Moeller
+ *  Trumbore intersector from the paper "Fast, Minimum Storage
+ *  Ray-Triangle Intersection". In contrast to the paper we
+ *  precalculate some factors and factor the calculations differently
+ *  to allow precalculating the cross product e1 x e2. The resulting
+ *  algorithm is similar to the fastest one of the paper "Optimizing
+ *  Ray-Triangle Intersection via Automated Search". */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct MoellerTrumboreHitM // hit record for M triangles vs. one ray; U, V, T stay scaled by absDen until finalize()
+    {
+      __forceinline MoellerTrumboreHitM() {} // initializes no member explicitly
+
+      __forceinline MoellerTrumboreHitM(const vbool<M>& valid, const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& T, const vfloat<M>& absDen, const Vec3vf<M>& Ng)
+        : U(U), V(V), T(T), absDen(absDen), valid(valid), vNg(Ng) {} // vu, vv, vt are not set here; call finalize() first
+      
+      __forceinline void finalize() // converts the scaled values into vt, vu, vv
+      {
+        const vfloat<M> rcpAbsDen = rcp(absDen); // one reciprocal shared by all three products
+        vt = T * rcpAbsDen;
+        vu = U * rcpAbsDen;
+        vv = V * rcpAbsDen;
+      }
+
+      __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } // per-lane accessors; read the finalized members
+      __forceinline float t  (const size_t i) const { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+      
+    public: // values as passed to the constructor (still multiplied by absDen)
+      vfloat<M> U;
+      vfloat<M> V;
+      vfloat<M> T;
+      vfloat<M> absDen;
+      
+    public: // lane mask, results written by finalize(), and the normal passed to the constructor
+      vbool<M> valid;
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+    };
+    
+    template<int M>
+    struct MoellerTrumboreIntersector1 // Moeller-Trumbore test of one ray against M triangles at once
+    {
+      __forceinline MoellerTrumboreIntersector1() {}
+
+      __forceinline MoellerTrumboreIntersector1(const Ray& ray, const void* ptr) {} // both arguments are unused
+
+      __forceinline bool intersect(const vbool<M>& valid0, // core test: caller supplies edges e1, e2 and normal Ng
+                                   Ray& ray,
+                                   const Vec3vf<M>& tri_v0,
+                                   const Vec3vf<M>& tri_e1,
+                                   const Vec3vf<M>& tri_e2,
+                                   const Vec3vf<M>& tri_Ng,
+                                   MoellerTrumboreHitM<M>& hit) const
+      {
+        /* calculate denominator */
+        vbool<M> valid = valid0;
+        const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+        const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+        const Vec3vf<M> C = Vec3vf<M>(tri_v0) - O;
+        const Vec3vf<M> R = cross(C,D);
+        const vfloat<M> den = dot(Vec3vf<M>(tri_Ng),D);
+
+        const vfloat<M> absDen = abs(den);
+        const vfloat<M> sgnDen = signmsk(den);
+        
+        /* perform edge tests (XOR with sgnDen presumably flips the sign where den is negative, so U and V compare against absDen) */
+        const vfloat<M> U = dot(R,Vec3vf<M>(tri_e2)) ^ sgnDen;
+        const vfloat<M> V = dot(R,Vec3vf<M>(tri_e1)) ^ sgnDen;
+        
+        /* perform backface culling */        
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+        valid &= (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+        if (likely(none(valid))) return false;
+
+        /* perform depth test (both sides scaled by absDen, so no division is needed here) */
+        const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+        valid &= (absDen*vfloat<M>(ray.tnear()) < T) & (T <= absDen*vfloat<M>(ray.tfar));
+        if (likely(none(valid))) return false;
+   
+        
+        /* update hit information (hit is only written when at least one lane survived) */
+        new (&hit) MoellerTrumboreHitM<M>(valid,U,V,T,absDen,tri_Ng);
+
+        return true;
+      }
+
+      __forceinline bool intersectEdge(Ray& ray, // unmasked test from v0 and edges; derives Ng = cross(e2,e1)
+                                       const Vec3vf<M>& tri_v0,
+                                       const Vec3vf<M>& tri_e1,
+                                       const Vec3vf<M>& tri_e2,
+                                       MoellerTrumboreHitM<M>& hit) const
+      {
+        vbool<M> valid = true;
+        const Vec3<vfloat<M>> tri_Ng = cross(tri_e2,tri_e1);
+        return intersect(valid,ray,tri_v0,tri_e1,tri_e2,tri_Ng,hit);
+      }
+      
+      __forceinline bool intersect(Ray& ray, // unmasked test from three vertices
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   MoellerTrumboreHitM<M>& hit) const
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersectEdge(ray,v0,e1,e2,hit);
+      }
+
+      __forceinline bool intersect(const vbool<M>& valid, // masked test from three vertices
+                                   Ray& ray,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   MoellerTrumboreHitM<M>& hit) const
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersect(valid,ray,v0,e1,e2,cross(e2,e1),hit); // no intersectEdge overload takes a mask; call the core test with Ng = cross(e2,e1)
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersectEdge(Ray& ray, // edge-based test, then epilog(hit.valid,hit) if any lane hit
+                                       const Vec3vf<M>& v0,
+                                       const Vec3vf<M>& e1,
+                                       const Vec3vf<M>& e2,
+                                       const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M> hit;
+        if (likely(intersectEdge(ray,v0,e1,e2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersect(Ray& ray, // vertex-based test, then epilog(hit.valid,hit) if any lane hit
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M> hit;
+        if (likely(intersect(ray,v0,v1,v2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersect(const vbool<M>& valid, // masked vertex-based test, then epilog(hit.valid,hit)
+                                   Ray& ray,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M> hit;
+        if (likely(intersect(valid,ray,v0,v1,v2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+    };
+    
+    template<int K>
+    struct MoellerTrumboreHitK // deferred hit for K rays vs. one triangle; the division by absDen happens only in operator()
+    {
+      __forceinline MoellerTrumboreHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& T, const vfloat<K>& absDen, const Vec3vf<K>& Ng)
+        : U(U), V(V), T(T), absDen(absDen), Ng(Ng) {}
+      
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const // returns (u, v, t, Ng)
+      {
+        const vfloat<K> rcpAbsDen = rcp(absDen); // one reciprocal shared by all three products
+        const vfloat<K> t = T * rcpAbsDen;
+        const vfloat<K> u = U * rcpAbsDen;
+        const vfloat<K> v = V * rcpAbsDen;
+        return std::make_tuple(u,v,t,Ng);
+      }
+      
+    private: // copies of the constructor arguments (U, V, T still multiplied by absDen)
+      const vfloat<K> U;
+      const vfloat<K> V;
+      const vfloat<K> T;
+      const vfloat<K> absDen;
+      const Vec3vf<K> Ng;
+    };
+    
+    template<int M, int K>
+    struct MoellerTrumboreIntersectorK // Moeller-Trumbore tests for ray packets: K rays vs. one triangle, or ray k vs. M triangles
+    {
+      __forceinline MoellerTrumboreIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {} // both arguments are unused
+      
+      /*! Intersects K rays with one of M triangles. Core test on explicit ray data, edges and normal; returns the epilog result, or false when no lane survives. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        //RayK<K>& ray,
+                                        const Vec3vf<K>& ray_org,
+                                        const Vec3vf<K>& ray_dir,
+                                        const vfloat<K>& ray_tnear,
+                                        const vfloat<K>& ray_tfar,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_e1,
+                                        const Vec3vf<K>& tri_e2,
+                                        const Vec3vf<K>& tri_Ng,
+                                        const Epilog& epilog) const
+      { 
+        /* calculate denominator */
+        vbool<K> valid = valid0;
+        const Vec3vf<K> C = tri_v0 - ray_org;
+        const Vec3vf<K> R = cross(C,ray_dir);
+        const vfloat<K> den = dot(tri_Ng,ray_dir);
+        const vfloat<K> absDen = abs(den);
+        const vfloat<K> sgnDen = signmsk(den);
+        
+        /* test against edge p2 p0 */
+        const vfloat<K> U = dot(tri_e2,R) ^ sgnDen;
+        valid &= U >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p0 p1 */
+        const vfloat<K> V = dot(tri_e1,R) ^ sgnDen;
+        valid &= V >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p1 p2 */
+        const vfloat<K> W = absDen-U-V;
+        valid &= W >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test (both sides scaled by absDen, so no division is needed here) */
+        const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+        valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar);
+        if (unlikely(none(valid))) return false;
+        
+        /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= den < vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#else
+        valid &= den != vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#endif
+        
+        /* calculate hit information (u, v, t are computed lazily by MoellerTrumboreHitK::operator()) */
+        MoellerTrumboreHitK<K> hit(U,V,T,absDen,tri_Ng);
+        return epilog(valid,hit);
+      }
+      
+      /*! Intersects K rays with one of M triangles. Vertex-based wrapper: derives e1 = v0-v1, e2 = v2-v0 and Ng = cross(e2,e1). */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const Epilog& epilog) const
+      {
+        const Vec3vf<K> e1 = tri_v0-tri_v1;
+        const Vec3vf<K> e2 = tri_v2-tri_v0;
+        const Vec3vf<K> Ng = cross(e2,e1);
+        return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,e1,e2,Ng,epilog);
+      }
+
+      /*! Intersects K rays with one of M triangles. Edge-based wrapper: derives Ng = cross(e2,e1). */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectEdgeK(const vbool<K>& valid0, 
+                                            RayK<K>& ray,
+                                            const Vec3vf<K>& tri_v0, 
+                                            const Vec3vf<K>& tri_e1, 
+                                            const Vec3vf<K>& tri_e2, 
+                                            const Epilog& epilog) const
+      {
+        const Vec3vf<K> tri_Ng = cross(tri_e2,tri_e1);
+        return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,tri_Ng,epilog);
+      }
+      
+      /*! Intersect k'th ray from ray packet of size K with M triangles. Writes hit and returns true only if a lane survives. */
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,
+                                       const Vec3vf<M>& tri_v0,
+                                       const Vec3vf<M>& tri_e1,
+                                       const Vec3vf<M>& tri_e2,
+                                       MoellerTrumboreHitM<M>& hit) const
+      {
+        /* calculate denominator */
+        typedef Vec3vf<M> Vec3vfM;
+        const Vec3vf<M> tri_Ng = cross(tri_e2,tri_e1);
+
+        const Vec3vfM O = broadcast<vfloat<M>>(ray.org,k);
+        const Vec3vfM D = broadcast<vfloat<M>>(ray.dir,k);
+        const Vec3vfM C = Vec3vfM(tri_v0) - O;
+        const Vec3vfM R = cross(C,D);
+        const vfloat<M> den = dot(Vec3vfM(tri_Ng),D);
+        const vfloat<M> absDen = abs(den);
+        const vfloat<M> sgnDen = signmsk(den);
+        
+        /* perform edge tests */
+        const vfloat<M> U = dot(Vec3vf<M>(tri_e2),R) ^ sgnDen;
+        const vfloat<M> V = dot(Vec3vf<M>(tri_e1),R) ^ sgnDen;
+        
+        /* perform backface culling */
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+        vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test */
+        const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+        valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+        if (likely(none(valid))) return false;
+        
+        /* calculate hit information */
+        new (&hit) MoellerTrumboreHitM<M>(valid,U,V,T,absDen,tri_Ng);
+        return true;
+      }
+
+      __forceinline bool intersectEdge(RayK<K>& ray, // as above, additionally requiring time_range.lower <= time < time_range.upper
+                                       size_t k,
+                                       const BBox<vfloat<M>>& time_range,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2, 
+                                       MoellerTrumboreHitM<M>& hit) const
+      {
+        if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) // edge-based overload; intersect() would treat e1,e2 as vertices and hit as an epilog
+        {
+          hit.valid &= time_range.lower <= vfloat<M>(ray.time[k]); // NOTE(review): assumes RayK exposes time as an indexable member -- confirm
+          hit.valid &= vfloat<M>(ray.time[k]) < time_range.upper;
+          return any(hit.valid);
+        }
+        return false;
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersectEdge(RayK<K>& ray, // edge-based test of ray k, then epilog(hit.valid,hit)
+                                       size_t k,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2, 
+                                       const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M> hit;
+        if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersectEdge(RayK<K>& ray, // time-range variant, then epilog(hit.valid,hit)
+                                       size_t k,                           
+                                       const BBox<vfloat<M>>& time_range,
+                                       const Vec3vf<M>& tri_v0, 
+                                       const Vec3vf<M>& tri_e1, 
+                                       const Vec3vf<M>& tri_e2, 
+                                       const Epilog& epilog) const
+      {
+        MoellerTrumboreHitM<M> hit;
+        if (likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,hit))) return epilog(hit.valid,hit);
+        return false;
+      }
+      
+      template<typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray, // vertex-based wrapper: derives e1 = v0-v1, e2 = v2-v0
+                                   size_t k,
+                                   const Vec3vf<M>& v0, 
+                                   const Vec3vf<M>& v1, 
+                                   const Vec3vf<M>& v2, 
+                                   const Epilog& epilog) const      
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersectEdge(ray,k,v0,e1,e2,epilog);
+      }
+
+      template<typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray, // vertex-based wrapper for the time-range variant
+                                   size_t k,
+                                   const BBox<vfloat<M>>& time_range,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   const Epilog& epilog) const
+      {
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v2-v0;
+        return intersectEdge(ray,k,time_range,v0,e1,e2,epilog);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h
new file mode 100644
index 0000000000..f1de99d208
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_pluecker.h
@@ -0,0 +1,247 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "trianglev.h"
+#include "trianglev_mb.h"
+#include "intersector_epilog.h"
+
+/*! Modified Pluecker ray/triangle intersector. The test first shifts
+ *  the ray origin into the origin of the coordinate system and then
+ *  uses Pluecker coordinates for the intersection. Due to the shift,
+ *  the Pluecker coordinate calculation simplifies and the tests get
+ *  numerically stable. The edge equations are watertight along the
+ *  edge for neighboring triangles. */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M, typename UVMapper>
+    struct PlueckerHitM // hit record for M triangles vs. one ray; u/v are normalized and remapped in finalize()
+    {
+      __forceinline PlueckerHitM(const vfloat<M>& U, const vfloat<M>& V, const vfloat<M>& UVW, const vfloat<M>& t, const Vec3vf<M>& Ng, const UVMapper& mapUV)
+        : U(U), V(V), UVW(UVW), mapUV(mapUV), vt(t), vNg(Ng) {} // vu, vv are not set here; mapUV is kept by reference
+      
+      __forceinline void finalize() // computes vu = U/UVW, vv = V/UVW, then applies mapUV
+      {
+        const vbool<M> invalid = abs(UVW) < min_rcp_input; // lanes whose |UVW| is below min_rcp_input skip the reciprocal
+        const vfloat<M> rcpUVW = select(invalid,vfloat<M>(0.0f),rcp(UVW)); // those lanes use 0, giving u = v = 0 before mapping
+        vu = U * rcpUVW;
+        vv = V * rcpUVW;
+        mapUV(vu,vv);
+      }
+      
+      __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } // per-lane accessors
+      __forceinline float t  (const size_t i) const { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+      
+    private: // unnormalized inputs; mapUV is a reference, so the mapper must outlive this object
+      const vfloat<M> U;
+      const vfloat<M> V;
+      const vfloat<M> UVW;
+      const UVMapper& mapUV;
+      
+    public: // vu, vv are written by finalize(); vt and vNg are set by the constructor
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+    };
+
+    template<int M>
+    struct PlueckerIntersector1 // Pluecker-style test of one ray against M triangles at once
+    {
+      __forceinline PlueckerIntersector1() {}
+
+      __forceinline PlueckerIntersector1(const Ray& ray, const void* ptr) {} // both arguments are unused
+
+      template<typename UVMapper, typename Epilog>
+      __forceinline bool intersect(Ray& ray, // returns epilog(valid,hit), or false when no lane passes the edge/depth tests
+                                   const Vec3vf<M>& tri_v0,
+                                   const Vec3vf<M>& tri_v1,
+                                   const Vec3vf<M>& tri_v2,
+                                   const UVMapper& mapUV,
+                                   const Epilog& epilog) const
+      {
+        /* calculate vertices relative to ray origin */
+        const Vec3vf<M> O = Vec3vf<M>((Vec3fa)ray.org);
+	const Vec3vf<M> D = Vec3vf<M>((Vec3fa)ray.dir);
+        const Vec3vf<M> v0 = tri_v0-O;
+        const Vec3vf<M> v1 = tri_v1-O;
+        const Vec3vf<M> v2 = tri_v2-O;
+
+        /* calculate triangle edges */
+        const Vec3vf<M> e0 = v2-v0;
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v1-v2;
+
+        /* perform edge tests (eps is a tolerance relative to |U+V+W|) */
+        const vfloat<M> U = dot(cross(e0,v2+v0),D);
+        const vfloat<M> V = dot(cross(e1,v0+v1),D);
+        const vfloat<M> W = dot(cross(e2,v1+v2),D);
+        const vfloat<M> UVW = U+V+W;
+        const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = max(U,V,W) <= eps; // culling build: only the all-non-positive case (within eps) is accepted
+#else
+        vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); // accept when U, V, W all share one sign (within eps)
+#endif
+        if (unlikely(none(valid))) return false;
+
+        /* calculate geometry normal and denominator */
+        const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+        const vfloat<M> den = twice(dot(Ng,D));
+        
+        /* perform depth test */
+        const vfloat<M> T = twice(dot(v0,Ng));
+        const vfloat<M> t = rcp(den)*T;
+        valid &= vfloat<M>(ray.tnear()) <= t & t <= vfloat<M>(ray.tfar); // parses as (tnear <= t) & (t <= tfar)
+        valid &= den != vfloat<M>(zero); // drop lanes where the reciprocal above was taken of zero
+        if (unlikely(none(valid))) return false;
+
+        /* update hit information */
+        PlueckerHitM<M,UVMapper> hit(U,V,UVW,t,Ng,mapUV);
+        return epilog(valid,hit);
+      }
+    };
+
+    template<int K, typename UVMapper>
+    struct PlueckerHitK // deferred hit for K rays vs. one triangle; u/v are normalized only in operator()
+    {
+      __forceinline PlueckerHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& UVW, const vfloat<K>& t, const Vec3vf<K>& Ng, const UVMapper& mapUV)
+        : U(U), V(V), UVW(UVW), t(t), Ng(Ng), mapUV(mapUV) {}
+      
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const // returns (u, v, t, Ng)
+      {
+        const vbool<K> invalid = abs(UVW) < min_rcp_input; // lanes whose |UVW| is below min_rcp_input skip the reciprocal
+        const vfloat<K> rcpUVW = select(invalid,vfloat<K>(0.0f),rcp(UVW)); // those lanes use 0, giving u = v = 0 before mapping
+        vfloat<K> u = U * rcpUVW;
+        vfloat<K> v = V * rcpUVW;
+        mapUV(u,v);
+        return std::make_tuple(u,v,t,Ng);
+      }
+      
+    private: // copies of the constructor arguments; mapUV is a reference, so the mapper must outlive this object
+      const vfloat<K> U;
+      const vfloat<K> V;
+      const vfloat<K> UVW;
+      const vfloat<K> t;
+      const Vec3vf<K> Ng;
+      const UVMapper& mapUV;
+    };
+    
+    template<int M, int K>
+    struct PlueckerIntersectorK // Pluecker-style tests for ray packets: K rays vs. one triangle, or ray k vs. M triangles
+    {
+      __forceinline PlueckerIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {} // both arguments are unused
+
+      /*! Intersects K rays with one of M triangles. Returns the epilog result, or false when no lane survives. */
+      template<typename UVMapper, typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_v1,
+                                        const Vec3vf<K>& tri_v2,
+                                        const UVMapper& mapUV,
+                                        const Epilog& epilog) const
+      {
+        /* calculate vertices relative to ray origin */
+        vbool<K> valid = valid0;
+        const Vec3vf<K> O = ray.org;
+        const Vec3vf<K> D = ray.dir;
+        const Vec3vf<K> v0 = tri_v0-O;
+        const Vec3vf<K> v1 = tri_v1-O;
+        const Vec3vf<K> v2 = tri_v2-O;
+
+        /* calculate triangle edges */
+        const Vec3vf<K> e0 = v2-v0;
+        const Vec3vf<K> e1 = v0-v1;
+        const Vec3vf<K> e2 = v1-v2;
+
+        /* perform edge tests (eps is a tolerance relative to |U+V+W|) */
+        const vfloat<K> U = dot(Vec3vf<K>(cross(e0,v2+v0)),D);
+        const vfloat<K> V = dot(Vec3vf<K>(cross(e1,v0+v1)),D);
+        const vfloat<K> W = dot(Vec3vf<K>(cross(e2,v1+v2)),D);
+        const vfloat<K> UVW = U+V+W;
+        const vfloat<K> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= max(U,V,W) <= eps; // culling build: only the all-non-positive case (within eps) is accepted
+#else
+        valid &= (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); // accept when U, V, W all share one sign (within eps)
+#endif
+        if (unlikely(none(valid))) return false;
+
+         /* calculate geometry normal and denominator */
+        const Vec3vf<K> Ng = stable_triangle_normal(e0,e1,e2);
+        const vfloat<K> den = twice(dot(Vec3vf<K>(Ng),D));
+
+        /* perform depth test */
+        const vfloat<K> T = twice(dot(v0,Vec3vf<K>(Ng)));
+        const vfloat<K> t = rcp(den)*T;
+        valid &= ray.tnear() <= t & t <= ray.tfar; // parses as (tnear <= t) & (t <= tfar)
+        valid &= den != vfloat<K>(zero); // drop lanes where the reciprocal above was taken of zero
+        if (unlikely(none(valid))) return false;
+        
+        /* calculate hit information (u, v are computed lazily by PlueckerHitK::operator()) */
+        PlueckerHitK<K,UVMapper> hit(U,V,UVW,t,Ng,mapUV);
+        return epilog(valid,hit);
+      }
+
+      /*! Intersect k'th ray from ray packet of size K with M triangles. Returns the epilog result, or false when no lane survives. */
+      template<typename UVMapper, typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray, size_t k,
+                                   const Vec3vf<M>& tri_v0,
+                                   const Vec3vf<M>& tri_v1,
+                                   const Vec3vf<M>& tri_v2,
+                                   const UVMapper& mapUV,
+                                   const Epilog& epilog) const
+      {
+        /* calculate vertices relative to ray origin */
+        const Vec3vf<M> O = broadcast<vfloat<M>>(ray.org,k);
+        const Vec3vf<M> D = broadcast<vfloat<M>>(ray.dir,k);
+        const Vec3vf<M> v0 = tri_v0-O;
+        const Vec3vf<M> v1 = tri_v1-O;
+        const Vec3vf<M> v2 = tri_v2-O;
+
+        /* calculate triangle edges */
+        const Vec3vf<M> e0 = v2-v0;
+        const Vec3vf<M> e1 = v0-v1;
+        const Vec3vf<M> e2 = v1-v2;
+
+        /* perform edge tests (eps is a tolerance relative to |U+V+W|) */
+        const vfloat<M> U = dot(cross(e0,v2+v0),D);
+        const vfloat<M> V = dot(cross(e1,v0+v1),D);
+        const vfloat<M> W = dot(cross(e2,v1+v2),D);
+        const vfloat<M> UVW = U+V+W;
+        const vfloat<M> eps = float(ulp)*abs(UVW);
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = max(U,V,W) <= eps; // culling build: only the all-non-positive case (within eps) is accepted
+#else
+        vbool<M> valid = (min(U,V,W) >= -eps) | (max(U,V,W) <= eps); // accept when U, V, W all share one sign (within eps)
+#endif
+        if (unlikely(none(valid))) return false;
+
+        /* calculate geometry normal and denominator */
+        const Vec3vf<M> Ng = stable_triangle_normal(e0,e1,e2);
+        const vfloat<M> den = twice(dot(Ng,D));
+        
+        /* perform depth test */
+        const vfloat<M> T = twice(dot(v0,Ng));
+        const vfloat<M> t = rcp(den)*T;
+        valid &= vfloat<M>(ray.tnear()[k]) <= t & t <= vfloat<M>(ray.tfar[k]); // parses as (tnear <= t) & (t <= tfar)
+        if (unlikely(none(valid))) return false;
+
+        /* avoid division by 0 (t above was already computed with rcp(den); such lanes are masked out here) */
+        valid &= den != vfloat<M>(zero);
+        if (unlikely(none(valid))) return false;
+
+        /* update hit information */
+        PlueckerHitM<M,UVMapper> hit(U,V,UVW,t,Ng,mapUV);
+        return epilog(valid,hit);
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h
new file mode 100644
index 0000000000..63e649d8fb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_intersector_woop.h
@@ -0,0 +1,418 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "intersector_epilog.h"
+
+/*! This intersector implements a modified version of Woop's ray-triangle intersection test. */
+
+namespace embree
+{
+  namespace isa
+  {
+    template<int M>
+    struct WoopHitM // hit record for M triangles vs. one ray; u/v are scaled by inv_det in finalize()
+    {
+      __forceinline WoopHitM() {} // initializes no member explicitly
+
+      __forceinline WoopHitM(const vbool<M>& valid, 
+                             const vfloat<M>& U, 
+                             const vfloat<M>& V, 
+                             const vfloat<M>& T, 
+                             const vfloat<M>& inv_det,                              
+                             const Vec3vf<M>& Ng)
+        : U(U), V(V), T(T), inv_det(inv_det), valid(valid), vNg(Ng) {} // vu, vv, vt are not set here; call finalize() first
+      
+      __forceinline void finalize() // vt = T unchanged, vu = U*inv_det, vv = V*inv_det
+      {
+        vt = T; // copied as-is: T is not rescaled here
+        vu = U*inv_det;
+        vv = V*inv_det;
+      }
+
+      __forceinline Vec2f uv (const size_t i) const { return Vec2f(vu[i],vv[i]); } // per-lane accessors; read the finalized members
+      __forceinline float t  (const size_t i) const { return vt[i]; }
+      __forceinline Vec3fa Ng(const size_t i) const { return Vec3fa(vNg.x[i],vNg.y[i],vNg.z[i]); }
+      
+    private: // values as passed to the constructor
+      const vfloat<M> U;
+      const vfloat<M> V;
+      const vfloat<M> T;
+      const vfloat<M> inv_det;
+      
+    public: // lane mask, results written by finalize(), and the normal passed to the constructor
+      const vbool<M> valid;
+      vfloat<M> vu;
+      vfloat<M> vv;
+      vfloat<M> vt;
+      Vec3vf<M> vNg;
+    };
+
+    template<int M>
+    struct WoopPrecalculations1 // per-ray setup: axis permutation (kx,ky,kz), shear/scale factors S and permuted origin
+    {
+      unsigned int kx,ky,kz; // indices into the ray/vertex components; kz comes from maxDim(abs(ray.dir))
+      Vec3vf<M> org; // ray origin reordered to (kx,ky,kz)
+      Vec3fa S; // S.x = dir[kx]/dir[kz], S.y = dir[ky]/dir[kz], S.z = 1/dir[kz]
+      __forceinline WoopPrecalculations1() {} // initializes no member explicitly
+
+      __forceinline WoopPrecalculations1(const Ray& ray, const void* ptr) // ptr is unused
+      {
+        kz = maxDim(abs(ray.dir));
+        kx = (kz+1) % 3;
+        ky = (kx+1) % 3;
+        const float inv_dir_kz = rcp(ray.dir[kz]);
+        if (ray.dir[kz] < 0.0f) std::swap(kx,ky); // swap only for negative dir[kz], so the triangle winding is preserved
+        S.x = ray.dir[kx] * inv_dir_kz;
+        S.y = ray.dir[ky] * inv_dir_kz;
+        S.z = inv_dir_kz;
+        org = Vec3vf<M>(ray.org[kx],ray.org[ky],ray.org[kz]);
+      }
+    };
+
+    
+    template<int M>
+    struct WoopIntersector1
+    {
+
+        typedef WoopPrecalculations1<M> Precalculations;
+
+      __forceinline WoopIntersector1() {}
+
+      __forceinline WoopIntersector1(const Ray& ray, const void* ptr) {}
+
+      static __forceinline bool intersect(const vbool<M>& valid0,
+                                          Ray& ray,
+                                          const Precalculations& pre,
+                                          const Vec3vf<M>& tri_v0,
+                                          const Vec3vf<M>& tri_v1,
+                                          const Vec3vf<M>& tri_v2,
+                                          WoopHitM<M>& hit)
+      {       
+        vbool<M> valid = valid0;
+
+        /* vertices relative to ray origin */
+        const Vec3vf<M> org = Vec3vf<M>(pre.org.x,pre.org.y,pre.org.z);
+        const Vec3vf<M> A = Vec3vf<M>(tri_v0[pre.kx],tri_v0[pre.ky],tri_v0[pre.kz]) - org;
+        const Vec3vf<M> B = Vec3vf<M>(tri_v1[pre.kx],tri_v1[pre.ky],tri_v1[pre.kz]) - org;
+        const Vec3vf<M> C = Vec3vf<M>(tri_v2[pre.kx],tri_v2[pre.ky],tri_v2[pre.kz]) - org;
+
+        /* shear and scale vertices */
+        const vfloat<M> Ax = nmadd(A.z,pre.S.x,A.x);
+        const vfloat<M> Ay = nmadd(A.z,pre.S.y,A.y);
+        const vfloat<M> Bx = nmadd(B.z,pre.S.x,B.x);
+        const vfloat<M> By = nmadd(B.z,pre.S.y,B.y);
+        const vfloat<M> Cx = nmadd(C.z,pre.S.x,C.x);
+        const vfloat<M> Cy = nmadd(C.z,pre.S.y,C.y);
+
+        /* scaled barycentric */
+        const vfloat<M> U0 = Cx*By;
+        const vfloat<M> U1 = Cy*Bx;
+        const vfloat<M> V0 = Ax*Cy;
+        const vfloat<M> V1 = Ay*Cx;
+        const vfloat<M> W0 = Bx*Ay;
+        const vfloat<M> W1 = By*Ax;
+#if !defined(__AVX512F__)
+        valid &= (U0 >= U1) & (V0 >= V1) & (W0 >= W1) |
+          (U0 <= U1) & (V0 <= V1) & (W0 <= W1);
+#else
+        valid &= ge(ge(U0 >= U1,V0,V1),W0,W1) | le(le(U0 <= U1,V0,V1),W0,W1);
+#endif
+
+        if (likely(none(valid))) return false;
+        const vfloat<M> U = U0-U1;
+        const vfloat<M> V = V0-V1;
+        const vfloat<M> W = W0-W1;
+
+        const vfloat<M> det = U+V+W;
+
+        valid &= det != 0.0f;
+        const vfloat<M> inv_det = rcp(det);
+
+        const vfloat<M> Az = pre.S.z * A.z;
+        const vfloat<M> Bz = pre.S.z * B.z;
+        const vfloat<M> Cz = pre.S.z * C.z;
+        const vfloat<M> T  = madd(U,Az,madd(V,Bz,W*Cz)); 
+        const vfloat<M> t  = T * inv_det;
+        /* perform depth test */
+        valid &= (vfloat<M>(ray.tnear()) < t) & (t <= vfloat<M>(ray.tfar));
+        if (likely(none(valid))) return false;
+        
+        const Vec3vf<M> tri_Ng = cross(tri_v2-tri_v0,tri_v0-tri_v1);
+
+        /* update hit information */
+        new (&hit) WoopHitM<M>(valid,U,V,t,inv_det,tri_Ng);
+        return true;
+      }
+      
+      static __forceinline bool intersect(Ray& ray,
+                                   const Precalculations& pre,
+                                   const Vec3vf<M>& v0,
+                                   const Vec3vf<M>& v1,
+                                   const Vec3vf<M>& v2,
+                                   WoopHitM<M>& hit)
+      {
+        vbool<M> valid = true;
+        return intersect(valid,ray,pre,v0,v1,v2,hit);
+      }
+
+
+      /*! Intersects all M triangles and, if any lane reports a hit, calls the
+       *  epilog with the hit mask and the hit record; returns false otherwise. */
+      template<typename Epilog>
+      static __forceinline bool intersect(Ray& ray,
+                                     const Precalculations& pre,
+                                     const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2,
+                                     const Epilog& epilog)
+      {
+        WoopHitM<M> result;
+        if (!likely(intersect(ray,pre,v0,v1,v2,result))) return false;
+        return epilog(result.valid,result);
+      }
+
+      /*! Masked variant: the lane mask 'valid' is forwarded to the hit-producing
+       *  overload; on a hit the epilog receives the hit mask and the hit record. */
+      template<typename Epilog>
+      static __forceinline bool intersect(const vbool<M>& valid,
+                                   Ray& ray,
+                                   const Precalculations& pre,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2,
+                                   const Epilog& epilog)
+      {
+        WoopHitM<M> result;
+        if (!likely(intersect(valid,ray,pre,v0,v1,v2,result))) return false;
+        return epilog(result.valid,result);
+      }
+    };
+    
+#if 0
+    /*! Hit record for a packet of K rays. Stores U, V, T and absDen as passed
+     *  in; operator() scales U, V and T by rcp(absDen) when it is invoked. */
+    template<int K>
+    struct WoopHitK
+    {
+      __forceinline WoopHitK(const vfloat<K>& U, const vfloat<K>& V, const vfloat<K>& T, const vfloat<K>& absDen, const Vec3vf<K>& Ng)
+        : U(U), V(V), T(T), absDen(absDen), Ng(Ng) {}
+      /*! Returns the tuple (u, v, t, Ng) with u, v and t multiplied by rcp(absDen). */
+      __forceinline std::tuple<vfloat<K>,vfloat<K>,vfloat<K>,Vec3vf<K>> operator() () const
+      {
+        const vfloat<K> scale = rcp(absDen);
+        const vfloat<K> t = T * scale;
+        return std::make_tuple(U * scale,V * scale,t,Ng);
+      }
+
+    private:
+      const vfloat<K> U;
+      const vfloat<K> V;
+      const vfloat<K> T;
+      const vfloat<K> absDen;
+      const Vec3vf<K> Ng;
+    };
+    
+    template<int M, int K>
+    struct WoopIntersectorK
+    {
+      __forceinline WoopIntersectorK(const vbool<K>& valid, const RayK<K>& ray) {} // empty constructor: neither argument is used and nothing is precomputed
+      
+      /*! Intersects K rays with the triangle given by vertex tri_v0, edge vectors tri_e1/tri_e2 and normal tri_Ng; lanes that survive all tests are passed to epilog(valid,hit). */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0,
+                                        //RayK<K>& ray,
+                                        const Vec3vf<K>& ray_org,
+                                        const Vec3vf<K>& ray_dir,
+                                        const vfloat<K>& ray_tnear,
+                                        const vfloat<K>& ray_tfar,
+                                        const Vec3vf<K>& tri_v0,
+                                        const Vec3vf<K>& tri_e1, // v0-v1 when called from the vertex-based overload
+                                        const Vec3vf<K>& tri_e2, // v2-v0 when called from the vertex-based overload
+                                        const Vec3vf<K>& tri_Ng, // cross(tri_e2,tri_e1) in both callers inside this struct
+                                        const Epilog& epilog) const
+      { 
+        /* calculate denominator; signmsk(den) is xor-ed into U, V and T below (presumably flipping their sign where den is negative) */
+        vbool<K> valid = valid0;
+        const Vec3vf<K> C = tri_v0 - ray_org;
+        const Vec3vf<K> R = cross(C,ray_dir);
+        const vfloat<K> den = dot(tri_Ng,ray_dir);
+        const vfloat<K> absDen = abs(den);
+        const vfloat<K> sgnDen = signmsk(den);
+        
+        /* test against edge p2 p0 (U is not divided by absDen here; the hit record normalizes later) */
+        const vfloat<K> U = dot(tri_e2,R) ^ sgnDen;
+        valid &= U >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p0 p1 */
+        const vfloat<K> V = dot(tri_e1,R) ^ sgnDen;
+        valid &= V >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* test against edge p1 p2: W is what remains of absDen after U and V */
+        const vfloat<K> W = absDen-U-V;
+        valid &= W >= 0.0f;
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test on the unnormalized distance: absDen*tnear < T <= absDen*tfar */
+        const vfloat<K> T = dot(tri_Ng,C) ^ sgnDen;
+        valid &= (absDen*ray_tnear < T) & (T <= absDen*ray_tfar);
+        if (unlikely(none(valid))) return false;
+        
+        /* perform backface culling; without culling only a zero denominator is rejected */
+#if defined(EMBREE_BACKFACE_CULLING)
+        valid &= den < vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#else
+        valid &= den != vfloat<K>(zero);
+        if (unlikely(none(valid))) return false;
+#endif
+        
+        /* calculate hit information: WoopHitK divides U, V and T by absDen when it is evaluated */
+        WoopHitK<K> hit(U,V,T,absDen,tri_Ng);
+        return epilog(valid,hit);
+      }
+      
+      /*! Intersects K rays with the triangle (tri_v0,tri_v1,tri_v2): derives the
+       *  edge vectors and the normal, then forwards to the edge-based overload. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectK(const vbool<K>& valid0, 
+                                        RayK<K>& ray,
+                                        const Vec3vf<K>& tri_v0, const Vec3vf<K>& tri_v1, const Vec3vf<K>& tri_v2,
+                                        const Epilog& epilog) const
+      {
+        /* edge orientation used by the edge-based overload: v0-v1 and v2-v0 */
+        const Vec3vf<K> edge1 = tri_v0-tri_v1;
+        const Vec3vf<K> edge2 = tri_v2-tri_v0;
+        const Vec3vf<K> normal = cross(edge2,edge1);
+        return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,edge1,edge2,normal,epilog);
+      }
+
+      /*! Intersects K rays with the triangle given by vertex tri_v0 and the
+       *  edge vectors tri_e1 and tri_e2. */
+      template<typename Epilog>
+      __forceinline vbool<K> intersectEdgeK(const vbool<K>& valid0, 
+                                            RayK<K>& ray,
+                                            const Vec3vf<K>& tri_v0, const Vec3vf<K>& tri_e1, const Vec3vf<K>& tri_e2, 
+                                            const Epilog& epilog) const
+      {
+        /* only the normal still has to be derived before the full test can run */
+        const Vec3vf<K> normal = cross(tri_e2,tri_e1);
+        return intersectK(valid0,ray.org,ray.dir,ray.tnear(),ray.tfar,tri_v0,tri_e1,tri_e2,normal,epilog);
+      }
+      
+      /*! Intersect k'th ray from ray packet of size K with M triangles given by vertex tri_v0 and edge vectors tri_e1, tri_e2; constructs 'hit' and returns true if at least one triangle passes all tests. */
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,
+                                       const Vec3vf<M>& tri_v0,
+                                       const Vec3vf<M>& tri_e1,
+                                       const Vec3vf<M>& tri_e2,
+                                       WoopHitM<M>& hit) const
+      {
+        /* calculate denominator; ray k's origin and direction are broadcast to all M lanes first */
+        typedef Vec3vf<M> Vec3vfM;
+        const Vec3vf<M> tri_Ng = cross(tri_e2,tri_e1);
+
+        const Vec3vfM O = broadcast<vfloat<M>>(ray.org,k);
+        const Vec3vfM D = broadcast<vfloat<M>>(ray.dir,k);
+        const Vec3vfM C = Vec3vfM(tri_v0) - O;
+        const Vec3vfM R = cross(C,D);
+        const vfloat<M> den = dot(Vec3vfM(tri_Ng),D);
+        const vfloat<M> absDen = abs(den);
+        const vfloat<M> sgnDen = signmsk(den);
+        
+        /* perform edge tests (U and V are compared against absDen without being divided by it) */
+        const vfloat<M> U = dot(Vec3vf<M>(tri_e2),R) ^ sgnDen;
+        const vfloat<M> V = dot(Vec3vf<M>(tri_e1),R) ^ sgnDen;
+        
+        /* perform backface culling; without culling only a zero denominator is rejected */
+#if defined(EMBREE_BACKFACE_CULLING)
+        vbool<M> valid = (den < vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#else
+        vbool<M> valid = (den != vfloat<M>(zero)) & (U >= 0.0f) & (V >= 0.0f) & (U+V<=absDen);
+#endif
+        if (likely(none(valid))) return false;
+        
+        /* perform depth test on the unnormalized distance: absDen*tnear < T <= absDen*tfar */
+        const vfloat<M> T = dot(Vec3vf<M>(tri_Ng),C) ^ sgnDen;
+        valid &= (absDen*vfloat<M>(ray.tnear()[k]) < T) & (T <= absDen*vfloat<M>(ray.tfar[k]));
+        if (likely(none(valid))) return false;
+        
+        /* calculate hit information; NOTE(review): the single-ray intersector of this file builds WoopHitM from (t,inv_det) while this passes (T,absDen) - confirm what WoopHitM expects before enabling this #if 0 code */
+        new (&hit) WoopHitM<M>(valid,U,V,T,absDen,tri_Ng);
+        return true;
+      }
+
+      /*! Intersects the k'th ray with M triangles (vertex + edge form) and keeps only
+       *  the lanes whose time range contains the ray time: lower <= time < upper. */
+      __forceinline bool intersectEdge(RayK<K>& ray, size_t k,
+                                       const BBox<vfloat<M>>& time_range,
+                                       const Vec3vf<M>& tri_v0, const Vec3vf<M>& tri_e1, const Vec3vf<M>& tri_e2,
+                                       WoopHitM<M>& hit) const
+      {
+        /* the arguments are edges, so the edge-based overload has to fill 'hit' */
+        if (likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,hit))) 
+        {
+          hit.valid &= time_range.lower <= vfloat<M>(ray.time[k]);
+          hit.valid &= vfloat<M>(ray.time[k]) < time_range.upper;
+          return any(hit.valid);
+        }
+        return false;
+      }
+
+      /*! Intersects the k'th ray with M triangles (vertex + edge form) and
+       *  passes the resulting hit to the epilog; returns false on a miss. */
+      template<typename Epilog>
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,
+                                       const Vec3vf<M>& tri_v0, const Vec3vf<M>& tri_e1, const Vec3vf<M>& tri_e2, 
+                                       const Epilog& epilog) const
+      {
+        WoopHitM<M> result;
+        if (!likely(intersectEdge(ray,k,tri_v0,tri_e1,tri_e2,result))) return false;
+        return epilog(result.valid,result);
+      }
+
+      /*! Time-range variant: runs the hit-producing intersectEdge() overload that
+       *  takes time_range and passes the resulting hit to the epilog. */
+      template<typename Epilog>
+      __forceinline bool intersectEdge(RayK<K>& ray,
+                                       size_t k,                           
+                                       const BBox<vfloat<M>>& time_range,
+                                       const Vec3vf<M>& tri_v0, const Vec3vf<M>& tri_e1, const Vec3vf<M>& tri_e2, 
+                                       const Epilog& epilog) const
+      {
+        WoopHitM<M> result;
+        if (!likely(intersectEdge(ray,k,time_range,tri_v0,tri_e1,tri_e2,result))) return false;
+        return epilog(result.valid,result);
+      }
+      
+      /*! Intersects the k'th ray with M triangles given by their vertices: builds
+       *  the edge vectors v0-v1 and v2-v0 and forwards to intersectEdge(). */
+      template<typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray,
+                                   size_t k,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, 
+                                   const Epilog& epilog) const      
+      {
+        const Vec3vf<M> edge1 = v0-v1;
+        const Vec3vf<M> edge2 = v2-v0;
+        return intersectEdge(ray,k,v0,edge1,edge2,epilog);
+      }
+
+      /*! Time-range variant for triangles given by their vertices: builds the edge
+       *  vectors v0-v1 and v2-v0 and forwards, with time_range, to intersectEdge(). */
+      template<typename Epilog>
+      __forceinline bool intersect(RayK<K>& ray,
+                                   size_t k,
+                                   const BBox<vfloat<M>>& time_range,
+                                   const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2,
+                                   const Epilog& epilog) const
+      {
+        const Vec3vf<M> edge1 = v0-v1;
+        const Vec3vf<M> edge2 = v2-v0;
+        return intersectEdge(ray,k,time_range,v0,edge1,edge2,epilog);
+      }
+    };
+#endif
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h
new file mode 100644
index 0000000000..91b35c36f3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/triangle_triangle_intersector.h
@@ -0,0 +1,132 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "primitive.h"
+
+namespace embree
+{ 
+  namespace isa
+  {
+    struct TriangleTriangleIntersector
+    {
+      __forceinline static float T(float pa0, float pa1, float da0, float da1) { // value on the line pa0..pa1 at the parameter where the line da0..da1 reaches zero
+        return pa0 + (pa1-pa0)*da0/(da0-da1); // divides by da0-da1: the callers in this file only call it when abs(da0-da1) > 0
+      }
+      
+      __forceinline static bool point_line_side(const Vec2f& p, const Vec2f& a0, const Vec2f& a1) { // classifies p against the line through a0 and a1 by the sign of det(p-a0,a0-a1)
+        return det(p-a0,a0-a1) >= 0.0f; // a zero determinant is counted as the 'true' side
+      }
+      
+      __forceinline static bool point_inside_triangle(const Vec2f& p, const Vec2f& a, const Vec2f& b, const Vec2f& c) 
+      {
+        /* inside iff p gets the same side classification for all three edges (either winding order) */
+        const bool side_ab = point_line_side(p,a,b);
+        if (side_ab != point_line_side(p,b,c)) return false;
+        return side_ab == point_line_side(p,c,a);
+      }
+      
+      __forceinline static bool intersect_line_line(const Vec2f& a0, const Vec2f& a1, const Vec2f& b0, const Vec2f& b1)
+      {
+        /* the segments cross iff the end points of each one are classified on different sides of the other */
+        if (point_line_side(b0,a0,a1) == point_line_side(b1,a0,a1)) return false;
+        return point_line_side(a0,b0,b1) != point_line_side(a1,b0,b1);
+      }
+      
+      /*! 2D overlap test of the triangles (a0,a1,a2) and (b0,b1,b2): they overlap if a pair
+       *  of edges crosses or if all vertices of one triangle lie inside the other one. */
+      __forceinline static bool intersect_triangle_triangle (const Vec2f& a0, const Vec2f& a1, const Vec2f& a2, 
+                                                             const Vec2f& b0, const Vec2f& b1, const Vec2f& b2)
+      {
+        /* edge a0-a1 against the three edges of b */
+        if (intersect_line_line(a0,a1,b0,b1) ||
+            intersect_line_line(a0,a1,b1,b2) ||
+            intersect_line_line(a0,a1,b2,b0))
+          return true;
+
+        /* edge a1-a2 against the three edges of b */
+        if (intersect_line_line(a1,a2,b0,b1) ||
+            intersect_line_line(a1,a2,b1,b2) ||
+            intersect_line_line(a1,a2,b2,b0))
+          return true;
+
+        /* edge a2-a0 against the three edges of b */
+        if (intersect_line_line(a2,a0,b0,b1) ||
+            intersect_line_line(a2,a0,b1,b2) ||
+            intersect_line_line(a2,a0,b2,b0))
+          return true;
+
+        /* no edges cross: all vertices of a inside b ... */
+        const bool a_inside_b = point_inside_triangle(a0,b0,b1,b2) && point_inside_triangle(a1,b0,b1,b2) && point_inside_triangle(a2,b0,b1,b2);
+        if (a_inside_b) return true;
+
+        /* ... or all vertices of b inside a */
+        return point_inside_triangle(b0,a0,a1,a2) && point_inside_triangle(b1,a0,a1,a2) && point_inside_triangle(b2,a0,a1,a2);
+      }
+      
+      static bool intersect_triangle_triangle (const Vec3fa& a0, const Vec3fa& a1, const Vec3fa& a2, // 3D test: triangle A = (a0,a1,a2)
+                                               const Vec3fa& b0, const Vec3fa& b1, const Vec3fa& b2) // against triangle B = (b0,b1,b2)
+      {
+        const float eps = 1E-5f; // absolute tolerance; the plane distances below are not divided by the normal's length
+        
+        /* calculate triangle planes: normal N and offset C = dot(N,vertex) for each triangle */
+        const Vec3fa Na = cross(a1-a0,a2-a0);
+        const float  Ca = dot(Na,a0);
+        const Vec3fa Nb = cross(b1-b0,b2-b0);
+        const float  Cb = dot(Nb,b0);
+        
+        /* signed (unnormalized) distances of A's vertices to plane B; all clearly on one side means no intersection */
+        const float da0 = dot(Nb,a0)-Cb;
+        const float da1 = dot(Nb,a1)-Cb;
+        const float da2 = dot(Nb,a2)-Cb;
+        if (max(da0,da1,da2) < -eps) return false;
+        if (min(da0,da1,da2) > +eps) return false;
+        //CSTAT(bvh_collide_prim_intersections4++);
+        
+        /* signed (unnormalized) distances of B's vertices to plane A; same rejection test */
+        const float db0 = dot(Na,b0)-Ca;
+        const float db1 = dot(Na,b1)-Ca;
+        const float db2 = dot(Na,b2)-Ca;
+        if (max(db0,db1,db2) < -eps) return false;
+        if (min(db0,db1,db2) > +eps) return false;
+        //CSTAT(bvh_collide_prim_intersections5++);
+        
+        if (unlikely((std::fabs(da0) < eps && std::fabs(da1) < eps && std::fabs(da2) < eps) || // all of A within eps of plane B
+                     (std::fabs(db0) < eps && std::fabs(db1) < eps && std::fabs(db2) < eps)))  // or all of B within eps of plane A
+        {
+          const size_t dz = maxDim(Na); // axis selected by maxDim(Na) is dropped; the other two (dx,dy) form the 2D projection
+          const size_t dx = (dz+1)%3;
+          const size_t dy = (dx+1)%3;
+          const Vec2f A0(a0[dx],a0[dy]);
+          const Vec2f A1(a1[dx],a1[dy]);
+          const Vec2f A2(a2[dx],a2[dy]);
+          const Vec2f B0(b0[dx],b0[dy]);
+          const Vec2f B1(b1[dx],b1[dy]);
+          const Vec2f B2(b2[dx],b2[dy]);
+          return intersect_triangle_triangle(A0,A1,A2,B0,B1,B2);
+        }
+        
+        const Vec3fa D = cross(Na,Nb); // perpendicular to both normals; every vertex is projected onto it below
+        const float pa0 = dot(D,a0);
+        const float pa1 = dot(D,a1);
+        const float pa2 = dot(D,a2);
+        const float pb0 = dot(D,b0);
+        const float pb1 = dot(D,b1);
+        const float pb2 = dot(D,b2);
+        
+        BBox1f ba = empty; // interval along D collected from the edges of A whose end points straddle (or touch) plane B
+        if (min(da0,da1) <= 0.0f && max(da0,da1) >= 0.0f && abs(da0-da1) > 0.0f) ba.extend(T(pa0,pa1,da0,da1));
+        if (min(da1,da2) <= 0.0f && max(da1,da2) >= 0.0f && abs(da1-da2) > 0.0f) ba.extend(T(pa1,pa2,da1,da2));
+        if (min(da2,da0) <= 0.0f && max(da2,da0) >= 0.0f && abs(da2-da0) > 0.0f) ba.extend(T(pa2,pa0,da2,da0));
+        
+        BBox1f bb = empty; // same interval for the edges of B against plane A
+        if (min(db0,db1) <= 0.0f && max(db0,db1) >= 0.0f && abs(db0-db1) > 0.0f) bb.extend(T(pb0,pb1,db0,db1));
+        if (min(db1,db2) <= 0.0f && max(db1,db2) >= 0.0f && abs(db1-db2) > 0.0f) bb.extend(T(pb1,pb2,db1,db2));
+        if (min(db2,db0) <= 0.0f && max(db2,db0) >= 0.0f && abs(db2-db0) > 0.0f) bb.extend(T(pb2,pb0,db2,db0));
+        
+        return conjoint(ba,bb); // presumably true when the two intervals overlap - TODO confirm against BBox's conjoint()
+      }
+    };
+  }
+}
+
+  
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglei.h b/thirdparty/embree-aarch64/kernels/geometry/trianglei.h
new file mode 100644
index 0000000000..4f3118cc0c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglei.h
@@ -0,0 +1,442 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+#include "../common/scene.h"
+
+namespace embree
+{
+  /* Stores M triangles from an indexed face set */
+  template <int M>
+  struct TriangleMi
+  {
+    /* Virtual interface to query information about the triangle type; the member functions are only declared in this header */
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* primitive supports multiple time segments */
+    static const bool singleTimeSegment = false;
+
+    /* Returns maximum number of stored triangles, i.e. the template parameter M */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Returns required number of primitive blocks for N primitives: N divided by max_size(), rounded up */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+
+  public:
+
+    /* Default constructor; performs no explicit member initialization */
+    __forceinline TriangleMi() {  }
+
+    /* Construction from vertex offsets and IDs; with EMBREE_COMPACT_POLYS the offsets v0,v1,v2 are ignored and only the IDs are stored */
+    __forceinline TriangleMi(const vuint<M>& v0,
+                             const vuint<M>& v1,
+                             const vuint<M>& v2,
+                             const vuint<M>& geomIDs,
+                             const vuint<M>& primIDs)
+#if defined(EMBREE_COMPACT_POLYS)
+      : geomIDs(geomIDs), primIDs(primIDs) {}
+#else
+    : v0_(v0), v1_(v1), v2_(v2), geomIDs(geomIDs), primIDs(primIDs) {}
+#endif
+
+    /* Returns a mask that tells which triangles are valid: a slot is valid when its primID is not -1 */
+    __forceinline vbool<M> valid() const { return primIDs != vuint<M>(-1); }
+
+    /* Returns if the specified triangle is valid (i must be smaller than M) */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return primIDs[i] != -1; }
+
+    /* Returns the number of stored triangles, computed as bsf of the inverted valid mask (presumably the index of the first invalid slot - confirm bsf semantics) */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs, either of all slots or of slot i */
+    __forceinline vuint<M> geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs, either of all slots or of slot i */
+    __forceinline vuint<M> primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Merges the bounds (at time step itime) of the leading valid triangles */
+    __forceinline const BBox3fa bounds(const Scene *const scene, const size_t itime=0) const
+    {
+      BBox3fa box = empty;
+      for (size_t i=0; i<M; i++) {
+        if (!valid(i)) break; // stop at the first unused slot
+        box.extend(scene->get<TriangleMesh>(geomID(i))->bounds(primID(i),itime));
+      }
+      return box;
+    }
+
+    /* Calculate the linear bounds of the primitive from its bounds at time steps itime and itime+1 */
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime) {
+      return LBBox3fa(bounds(scene,itime+0),bounds(scene,itime+1));
+    }
+
+    /* Merges the per-triangle linear bounds, forwarding itime and numTimeSteps to TriangleMesh::linearBounds */
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, size_t itime, size_t numTimeSteps)
+    {
+      LBBox3fa merged = empty;
+      for (size_t i=0; i<M; i++) {
+        if (!valid(i)) break; // stop at the first unused slot
+        merged.extend(scene->get<TriangleMesh>(geomID(i))->linearBounds(primID(i), itime, numTimeSteps));
+      }
+      return merged;
+    }
+
+    /* Merges the per-triangle linear bounds, forwarding time_range to TriangleMesh::linearBounds */
+    __forceinline LBBox3fa linearBounds(const Scene *const scene, const BBox1f time_range)
+    {
+      LBBox3fa merged = empty;
+      for (size_t i=0; i<M; i++) {
+        if (!valid(i)) break; // stop at the first unused slot
+        merged.extend(scene->get<TriangleMesh>(geomID(i))->linearBounds(primID(i), time_range));
+      }
+      return merged;
+    }
+    
+    /* Non-temporal store: copies src to dst member by member through vuint<M>::store_nt */
+    __forceinline static void store_nt(TriangleMi* dst, const TriangleMi& src)
+    {
+#if !defined(EMBREE_COMPACT_POLYS)
+      vuint<M>::store_nt(&dst->v0_,src.v0_); // the vertex offsets only exist without EMBREE_COMPACT_POLYS
+      vuint<M>::store_nt(&dst->v1_,src.v1_);
+      vuint<M>::store_nt(&dst->v2_,src.v2_);
+#endif
+      vuint<M>::store_nt(&dst->geomIDs,src.geomIDs);
+      vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+    }
+
+    /* Fill triangle from triangle list: takes up to M primitives from prims starting at begin (which is advanced) and pads the remaining slots */
+    template<typename PrimRefT>
+    __forceinline void fill(const PrimRefT* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> v0 = zero, v1 = zero, v2 = zero;
+      vuint<M> geomID = -1, primID = -1;
+      const PrimRefT* prim = &prims[begin];
+
+      for (size_t i=0; i<M; i++)
+      {
+        if (begin<end) {
+          geomID[i] = prim->geomID();
+          primID[i] = prim->primID();
+#if !defined(EMBREE_COMPACT_POLYS)
+          const TriangleMesh* mesh = scene->get<TriangleMesh>(prim->geomID());
+          const TriangleMesh::Triangle& tri = mesh->triangle(prim->primID());
+          unsigned int int_stride = mesh->vertices0.getStride()/4; // presumably getStride() is in bytes, making this the stride in 4-byte units - confirm
+          v0[i] = tri.v[0] * int_stride;
+          v1[i] = tri.v[1] * int_stride;
+          v2[i] = tri.v[2] * int_stride;
+#endif
+          begin++;
+        } else {
+          assert(i); // slot 0 is expected to always receive a real primitive
+          if (likely(i > 0)) {
+            geomID[i] = geomID[0];
+            primID[i] = -1; // marks the slot as unused (see valid())
+            v0[i] = v0[0];
+            v1[i] = v0[0]; // all three offsets reference the same vertex, i.e. a degenerate padding triangle
+            v2[i] = v0[0];
+          }
+        }
+        if (begin<end) prim = &prims[begin]; // only move on while still inside [begin,end)
+      }
+      new (this) TriangleMi(v0,v1,v2,geomID,primID); // FIXME: use non temporal store
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime) // motion-blur fill from a PrimRef list
+    {
+      fill(prims, begin, end, scene); // packs up to M primitives and advances begin
+      return linearBounds(scene, itime); // linear bounds built from the bounds at time steps itime and itime+1
+    }
+
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range) // motion-blur fill from a PrimRefMB list
+    {
+      fill(prims, begin, end, scene); // packs up to M primitives and advances begin
+      return linearBounds(scene, time_range); // linear bounds of the packed triangles for time_range
+    }
+
+    /* Updates the primitive: recomputes the merged bounds of the stored
+     * triangles from the vertices of the given mesh */
+    __forceinline BBox3fa update(TriangleMesh* mesh)
+    {
+      BBox3fa box = empty;
+      for (size_t i=0; i<M && primID(i) != -1; i++) // primID -1 marks the first unused slot
+      {
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID(i));
+        const Vec3fa a = mesh->vertex(tri.v[0]);
+        const Vec3fa b = mesh->vertex(tri.v[1]);
+        const Vec3fa c = mesh->vertex(tri.v[2]);
+        /* one point box per vertex, merged into the bounding box of the triangle */
+        box.extend(merge(BBox3fa(a),BBox3fa(b),BBox3fa(c)));
+      }
+      return box;
+    }
+
+  protected:
+#if !defined(EMBREE_COMPACT_POLYS)
+    vuint<M> v0_;         // offset of 1st vertex; fill() stores vertex index times (stride/4), so it indexes a float array
+    vuint<M> v1_;         // offset of 2nd vertex, same encoding
+    vuint<M> v2_;         // offset of 3rd vertex, same encoding
+#endif
+    vuint<M> geomIDs;    // geometry ID of mesh
+    vuint<M> primIDs;    // primitive ID of primitive inside mesh; -1 marks an unused slot
+  };
+
+  namespace isa
+  {
+    
+  template<int M>
+    struct TriangleMi : public embree::TriangleMi<M>
+  {
+#if !defined(EMBREE_COMPACT_POLYS)
+    using embree::TriangleMi<M>::v0_;
+    using embree::TriangleMi<M>::v1_;
+    using embree::TriangleMi<M>::v2_;
+#endif
+    using embree::TriangleMi<M>::geomIDs;
+    using embree::TriangleMi<M>::primIDs;
+    using embree::TriangleMi<M>::geomID;
+    using embree::TriangleMi<M>::primID;
+    using embree::TriangleMi<M>::valid;
+        
+    /* loads a single vertex: vertex number vid of the triangle stored in slot index */
+    template<int vid>
+    __forceinline Vec3f getVertex(const size_t index, const Scene *const scene) const
+    {
+#if defined(EMBREE_COMPACT_POLYS)
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index)); // compact layout: look the vertex up through the mesh's index buffer
+      const TriangleMesh::Triangle& tri = mesh->triangle(primID(index));
+      return (Vec3f) mesh->vertices[0][tri.v[vid]];
+#else
+      const vuint<M>& v = getVertexOffset<vid>(); // stored offsets of vertex vid for all M slots
+      const float* vertices = scene->vertices[geomID(index)];
+      return (Vec3f&) vertices[v[index]]; // reinterprets the floats starting at the stored offset as a Vec3f
+#endif
+    }
+
+    template<int vid, typename T> // motion-blur variant: interpolates vertex vid of slot index between time steps itime and itime+1
+    __forceinline Vec3<T> getVertex(const size_t index, const Scene *const scene, const size_t itime, const T& ftime) const
+    {
+#if defined(EMBREE_COMPACT_POLYS)
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+      const TriangleMesh::Triangle& tri = mesh->triangle(primID(index));
+      const Vec3fa v0 = mesh->vertices[itime+0][tri.v[vid]];
+      const Vec3fa v1 = mesh->vertices[itime+1][tri.v[vid]];
+#else
+      const vuint<M>& v = getVertexOffset<vid>();
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+      const float* vertices0 = (const float*) mesh->vertexPtr(0,itime+0); // vertex 0 of each time step serves as base pointer for the stored offsets
+      const float* vertices1 = (const float*) mesh->vertexPtr(0,itime+1);
+      const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+      const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+      const Vec3<T> p0(v0.x,v0.y,v0.z); // convert both end points to the caller's scalar/vector type T
+      const Vec3<T> p1(v1.x,v1.y,v1.z);
+      return lerp(p0,p1,ftime);
+    }
+
+    template<int vid, int K, typename T> // per-lane variant: every active lane i uses its own time step itime[i]
+    __forceinline Vec3<T> getVertex(const vbool<K>& valid, const size_t index, const Scene *const scene, const vint<K>& itime, const T& ftime) const
+    {
+      Vec3<T> p0, p1; // lanes that are not set in valid are never written below
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+      
+      for (size_t mask=movemask(valid), i=bsf(mask); mask; mask=btc(mask,i), i=bsf(mask)) // visits the set bits of valid one by one
+      {
+#if defined(EMBREE_COMPACT_POLYS)
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID(index));
+        const Vec3fa v0 = mesh->vertices[itime[i]+0][tri.v[vid]];
+        const Vec3fa v1 = mesh->vertices[itime[i]+1][tri.v[vid]];
+#else
+        const vuint<M>& v = getVertexOffset<vid>();
+        const float* vertices0 = (const float*) mesh->vertexPtr(0,itime[i]+0);
+        const float* vertices1 = (const float*) mesh->vertexPtr(0,itime[i]+1);
+        const Vec3fa v0 = Vec3fa::loadu(vertices0+v[index]);
+        const Vec3fa v1 = Vec3fa::loadu(vertices1+v[index]);
+#endif
+        p0.x[i] = v0.x; p0.y[i] = v0.y; p0.z[i] = v0.z; // scatter the two end points into lane i
+        p1.x[i] = v1.x; p1.y[i] = v1.y; p1.z[i] = v1.z;
+      }
+      return (T(one)-ftime)*p0 + ftime*p1; // linear interpolation between the two time steps
+    }
+
+    struct Triangle { // the three vertices of one triangle, as returned by loadTriangle()
+      vfloat4 v0,v1,v2;
+    };
+    
+#if defined(EMBREE_COMPACT_POLYS)
+    
+    __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const // compact layout: loads triangle i through the mesh's index buffer
+    {
+      const unsigned int geomID = geomIDs[i];
+      const unsigned int primID = primIDs[i];
+      if (unlikely(primID == -1)) return { zero, zero, zero }; // unused slot: return an all-zero triangle instead of touching the mesh
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID);
+      const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+      const vfloat4 v0 = (vfloat4) mesh->vertices0[tri.v[0]];
+      const vfloat4 v1 = (vfloat4) mesh->vertices0[tri.v[1]];
+      const vfloat4 v2 = (vfloat4) mesh->vertices0[tri.v[2]];
+      return { v0, v1, v2 };
+    }
+
+    __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const // compact layout: loads triangle i at time step itime
+    {
+      const unsigned int primID = primIDs[i];
+      if (unlikely(primID == -1)) return { zero, zero, zero }; // unused slot: return an all-zero triangle instead of touching the mesh
+      const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+      const vfloat4 v0 = (vfloat4) mesh->vertices[itime][tri.v[0]];
+      const vfloat4 v1 = (vfloat4) mesh->vertices[itime][tri.v[1]];
+      const vfloat4 v2 = (vfloat4) mesh->vertices[itime][tri.v[2]];
+      return { v0, v1, v2 };
+    }
+    
+#else
+
+    /* Loads the three vertices of triangle i through its stored vertex offsets */
+    __forceinline Triangle loadTriangle(const int i, const Scene* const scene) const 
+    {
+      const float* base = scene->vertices[geomID(i)];
+      return { vfloat4::loadu(base + v0_[i]),
+               vfloat4::loadu(base + v1_[i]),
+               vfloat4::loadu(base + v2_[i]) };
+    }
+
+    /* Loads the three vertices of triangle i at time step itime through its stored vertex offsets */
+    __forceinline Triangle loadTriangle(const int i, const int itime, const TriangleMesh* const mesh) const 
+    {
+      const float* base = (const float*) mesh->vertexPtr(0,itime);
+      return { vfloat4::loadu(base + v0_[i]),
+               vfloat4::loadu(base + v1_[i]),
+               vfloat4::loadu(base + v2_[i]) };
+    }
+    
+#endif
+
+    /* Gather the triangles */
+    __forceinline void gather(Vec3vf<M>& p0, Vec3vf<M>& p1, Vec3vf<M>& p2, const Scene* const scene) const;
+
+    template<int K> // gathers the triangle in slot 'index' for K rays, each at its own time
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 2000) // workaround for compiler bug in ICC 2019
+    __noinline
+#else
+    __forceinline
+#endif
+    void gather(const vbool<K>& valid,
+                Vec3vf<K>& p0,
+                Vec3vf<K>& p1,
+                Vec3vf<K>& p2,
+                const size_t index,
+                const Scene* const scene,
+                const vfloat<K>& time) const
+    {
+      const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(index));
+
+      vfloat<K> ftime; // written by timeSegment() together with the returned time step index
+      const vint<K> itime = mesh->timeSegment(time, ftime);
+
+      const size_t first = bsf(movemask(valid)); // assumes at least one lane of valid is set - TODO confirm with callers
+      if (likely(all(valid,itime[first] == itime))) // fast path: all active lanes share one time step
+      {
+        p0 = getVertex<0>(index, scene, itime[first], ftime);
+        p1 = getVertex<1>(index, scene, itime[first], ftime);
+        p2 = getVertex<2>(index, scene, itime[first], ftime);
+      } else { // slow path: fetch the vertices lane by lane
+        p0 = getVertex<0>(valid, index, scene, itime, ftime);
+        p1 = getVertex<1>(valid, index, scene, itime, ftime);
+        p2 = getVertex<2>(valid, index, scene, itime, ftime);
+      }
+    }
+
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              const TriangleMesh* mesh,
+                              const Scene *const scene,
+                              const int itime) const;
+
+    __forceinline void gather(Vec3vf<M>& p0,
+                              Vec3vf<M>& p1,
+                              Vec3vf<M>& p2,
+                              const Scene *const scene,
+                              const float time) const;
+
+
+#if !defined(EMBREE_COMPACT_POLYS)
+    template<int N> const vuint<M>& getVertexOffset() const;
+#endif
+  };
+
+#if !defined(EMBREE_COMPACT_POLYS)
+  template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<0>() const { return v0_; } // offsets of the 1st vertex
+  template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<1>() const { return v1_; } // offsets of the 2nd vertex
+  template<> template<> __forceinline const vuint<4>& TriangleMi<4>::getVertexOffset<2>() const { return v2_; } // offsets of the 3rd vertex
+#endif
+  
+  /* Loads the four stored triangles and transposes their vertices into
+   * the x, y and z components of p0, p1 and p2. */
+  template<>
+  __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, Vec3vf4& p1, Vec3vf4& p2,
+                                           const Scene* const scene) const
+  {
+    const Triangle t0 = loadTriangle(0,scene);
+    const Triangle t1 = loadTriangle(1,scene);
+    const Triangle t2 = loadTriangle(2,scene);
+    const Triangle t3 = loadTriangle(3,scene);
+    transpose(t0.v0,t1.v0,t2.v0,t3.v0,p0.x,p0.y,p0.z);
+    transpose(t0.v1,t1.v1,t2.v1,t3.v1,p1.x,p1.y,p1.z);
+    transpose(t0.v2,t1.v2,t2.v2,t3.v2,p2.x,p2.y,p2.z);
+  }
+
+  /* Loads the four stored triangles at time step itime from the given mesh and
+   * transposes their vertices into the x, y and z components of p0, p1 and p2. */
+  template<>
+  __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, Vec3vf4& p1, Vec3vf4& p2,
+                                           const TriangleMesh* mesh,
+                                           const Scene *const scene,
+                                           const int itime) const
+  {
+    const Triangle t0 = loadTriangle(0,itime,mesh);
+    const Triangle t1 = loadTriangle(1,itime,mesh);
+    const Triangle t2 = loadTriangle(2,itime,mesh);
+    const Triangle t3 = loadTriangle(3,itime,mesh);
+    transpose(t0.v0,t1.v0,t2.v0,t3.v0,p0.x,p0.y,p0.z);
+    transpose(t0.v1,t1.v1,t2.v1,t3.v1,p1.x,p1.y,p1.z);
+    transpose(t0.v2,t1.v2,t2.v2,t3.v2,p2.x,p2.y,p2.z);
+  }
+
+  /* Motion-blur gather: loads the four triangles at the time step returned by
+   * mesh->timeSegment() and at the following one, then interpolates linearly. */
+  template<>
+  __forceinline void TriangleMi<4>::gather(Vec3vf4& p0, Vec3vf4& p1, Vec3vf4& p2,
+                                           const Scene *const scene,
+                                           const float time) const
+  {
+    const TriangleMesh* mesh = scene->get<TriangleMesh>(geomID(0)); // in mblur mode all geometries are identical
+    float frac;
+    const int segment = mesh->timeSegment(time, frac);
+    const vfloat4 weight(frac);
+
+    Vec3vf4 lo0,lo1,lo2; gather(lo0,lo1,lo2,mesh,scene,segment);
+    Vec3vf4 hi0,hi1,hi2; gather(hi0,hi1,hi2,mesh,scene,segment+1);
+    p0 = lerp(lo0,hi0,weight);
+    p1 = lerp(lo1,hi1,weight);
+    p2 = lerp(lo2,hi2,weight);
+  }
+  }
+
+  template<int M>
+  typename TriangleMi<M>::Type TriangleMi<M>::type;
+
+  typedef TriangleMi<4> Triangle4i;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h
new file mode 100644
index 0000000000..e2f106a62c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglei_intersector.h
@@ -0,0 +1,336 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "trianglei.h"
+#include "triangle_intersector_moeller.h"
+#include "triangle_intersector_pluecker.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Moeller-Trumbore intersector: M gathered triangles against 1 ray (no UVIdentity argument, unlike Pluecker) */
+    template<int M, int Mx, bool filter>
+    struct TriangleMiIntersector1Moeller
+    {
+      using Primitive = TriangleMi<M>;
+      using Precalculations = MoellerTrumboreIntersector1<Mx>;
+      /*! Gathers the vertices from the scene and intersects the ray with the M triangles. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> p0, p1, p2; tri.gather(p0,p1,p2,context->scene);
+        pre.intersect(ray,p0,p1,p2,Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      /*! Same vertex gather; returns the result of the occlusion test against the M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> p0, p1, p2; tri.gather(p0,p1,p2,context->scene);
+        return pre.intersect(ray,p0,p1,p2,Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      /*! Delegates point queries to PrimitivePointQuery1 for this primitive type. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Intersects M triangles with K rays (Moeller-Trumbore; vertices are fetched through the scene) */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMiIntersectorKMoeller
+    {
+      typedef TriangleMi<M> Primitive;
+      typedef MoellerTrumboreIntersectorK<Mx,K> Precalculations;
+      /*! Intersects the K rays selected by valid_i with every valid triangle of the block, one triangle at a time. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        const Scene* scene = context->scene;
+        for (size_t i=0; i<Primitive::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+          const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene);
+          const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+          const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+          pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity<K>(),*/IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+      /*! Occlusion test for K rays; returns the complement of the mask of rays that are still active at the end. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        vbool<K> valid0 = valid_i; // working mask; handed by reference to the epilog
+        const Scene* scene = context->scene;
+
+        for (size_t i=0; i<Primitive::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid0),RayHitK<K>::size()); // count the mask actually tested below, not the entry mask
+          const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene);
+          const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+          const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+          pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity<K>(),*/OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break; // no ray left to test
+        }
+        return !valid0;
+      }
+      /*! Intersects ray k of the packet with all M triangles at once and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+        pre.intersect(ray,k,v0,v1,v2,/*UVIdentity<Mx>(),*/Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+      /*! Tests whether ray k of the packet is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+        return pre.intersect(ray,k,v0,v1,v2,/*UVIdentity<Mx>(),*/Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+
+    /*! Pluecker intersector: M gathered triangles against 1 ray, using the UVIdentity mapping */
+    template<int M, int Mx, bool filter>
+    struct TriangleMiIntersector1Pluecker
+    {
+      using Primitive = TriangleMi<M>;
+      using Precalculations = PlueckerIntersector1<Mx>;
+      /*! Gathers the vertices from the scene and intersects the ray with the M triangles. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> p0, p1, p2; tri.gather(p0,p1,p2,context->scene);
+        pre.intersect(ray,p0,p1,p2,UVIdentity<Mx>(),Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      /*! Same vertex gather; returns the result of the occlusion test against the M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> p0, p1, p2; tri.gather(p0,p1,p2,context->scene);
+        return pre.intersect(ray,p0,p1,p2,UVIdentity<Mx>(),Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      /*! Delegates point queries to PrimitivePointQuery1 for this primitive type. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Intersects M triangles with K rays (Pluecker test; vertices are fetched through the scene) */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMiIntersectorKPluecker
+    {
+      typedef TriangleMi<M> Primitive;
+      typedef PlueckerIntersectorK<Mx,K> Precalculations;
+      /*! Intersects the K rays selected by valid_i with every valid triangle of the block, one triangle at a time. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        const Scene* scene = context->scene;
+        for (size_t i=0; i<Primitive::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),RayHitK<K>::size());
+          const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene);
+          const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+          const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+          pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+      /*! Occlusion test for K rays; returns the complement of the mask of rays that are still active at the end. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        vbool<K> valid0 = valid_i; // working mask; handed by reference to the epilog
+        const Scene* scene = context->scene;
+
+        for (size_t i=0; i<Primitive::max_size(); i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid0),RayHitK<K>::size()); // count the mask actually tested below, not the entry mask
+          const Vec3vf<K> v0 = tri.template getVertex<0>(i,scene);
+          const Vec3vf<K> v1 = tri.template getVertex<1>(i,scene);
+          const Vec3vf<K> v2 = tri.template getVertex<2>(i,scene);
+          pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break; // no ray left to test
+        }
+        return !valid0;
+      }
+      /*! Intersects ray k of the packet with all M triangles at once and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+        pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+      /*! Tests whether ray k of the packet is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> v0, v1, v2; tri.gather(v0,v1,v2,context->scene);
+        return pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+
+    /*! Moeller-Trumbore intersector: M motion blur triangles against 1 ray (no UVIdentity argument, unlike Pluecker) */
+    template<int M, int Mx, bool filter>
+    struct TriangleMiMBIntersector1Moeller
+    {
+      using Primitive = TriangleMi<M>;
+      using Precalculations = MoellerTrumboreIntersector1<Mx>;
+
+      /*! Gathers the vertices at ray.time(), intersects the ray with the M triangles and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> p0,p1,p2; tri.gather(p0,p1,p2,context->scene,ray.time());
+        pre.intersect(ray,p0,p1,p2,Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Gathers the vertices at ray.time() and tests whether the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> p0,p1,p2; tri.gather(p0,p1,p2,context->scene,ray.time());
+        return pre.intersect(ray,p0,p1,p2,Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      /*! Delegates point queries to PrimitivePointQuery1 for this primitive type. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Moeller-Trumbore intersector: M motion blur triangles against K rays (no UVIdentity argument, unlike Pluecker) */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMiMBIntersectorKMoeller
+    {
+      using Primitive = TriangleMi<M>;
+      using Precalculations = MoellerTrumboreIntersectorK<Mx,K>;
+
+      /*! Packet intersect: every valid triangle is gathered at the ray times and tested against the K rays. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        for (size_t slot=0; slot<Primitive::max_size(); slot++)
+        {
+          if (!tri.valid(slot)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          Vec3vf<K> p0,p1,p2; tri.gather(valid_i,p0,p1,p2,slot,context->scene,ray.time());
+          pre.intersectK(valid_i,ray,p0,p1,p2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),slot));
+        }
+      }
+
+      /*! Packet occlusion test; returns the complement of the mask of rays that are still active at the end. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        vbool<K> active = valid_i;
+        for (size_t slot=0; slot<Primitive::max_size(); slot++)
+        {
+          if (!tri.valid(slot)) break;
+          STAT3(shadow.trav_prims,1,popcnt(active),K);
+          Vec3vf<K> p0,p1,p2; tri.gather(valid_i,p0,p1,p2,slot,context->scene,ray.time());
+          pre.intersectK(active,ray,p0,p1,p2,OccludedKEpilogM<M,K,filter>(active,ray,context,tri.geomID(),tri.primID(),slot));
+          if (none(active)) break;
+        }
+        return !active;
+      }
+
+      /*! Single ray k of the packet: gathers the vertices at ray.time()[k], intersects and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> p0,p1,p2; tri.gather(p0,p1,p2,context->scene,ray.time()[k]);
+        pre.intersect(ray,k,p0,p1,p2,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Single ray k of the packet: tests whether it is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> p0,p1,p2; tri.gather(p0,p1,p2,context->scene,ray.time()[k]);
+        return pre.intersect(ray,k,p0,p1,p2,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+
+    /*! Pluecker intersector: M motion blur triangles against 1 ray, using the UVIdentity mapping */
+    template<int M, int Mx, bool filter>
+    struct TriangleMiMBIntersector1Pluecker
+    {
+      using Primitive = TriangleMi<M>;
+      using Precalculations = PlueckerIntersector1<Mx>;
+
+      /*! Gathers the vertices at ray.time(), intersects the ray with the M triangles and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> p0,p1,p2; tri.gather(p0,p1,p2,context->scene,ray.time());
+        pre.intersect(ray,p0,p1,p2,UVIdentity<Mx>(),Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Gathers the vertices at ray.time() and tests whether the ray is occluded by one of the M triangles. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> p0,p1,p2; tri.gather(p0,p1,p2,context->scene,ray.time());
+        return pre.intersect(ray,p0,p1,p2,UVIdentity<Mx>(),Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      /*! Delegates point queries to PrimitivePointQuery1 for this primitive type. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Pluecker intersector: M motion blur triangles against K rays, using the UVIdentity mapping */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMiMBIntersectorKPluecker
+    {
+      using Primitive = TriangleMi<M>;
+      using Precalculations = PlueckerIntersectorK<Mx,K>;
+
+      /*! Packet intersect: every valid triangle is gathered at the ray times and tested against the K rays. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        for (size_t slot=0; slot<Primitive::max_size(); slot++)
+        {
+          if (!tri.valid(slot)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          Vec3vf<K> p0,p1,p2; tri.gather(valid_i,p0,p1,p2,slot,context->scene,ray.time());
+          pre.intersectK(valid_i,ray,p0,p1,p2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),slot));
+        }
+      }
+
+      /*! Packet occlusion test; returns the complement of the mask of rays that are still active at the end. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        vbool<K> active = valid_i;
+        for (size_t slot=0; slot<Primitive::max_size(); slot++)
+        {
+          if (!tri.valid(slot)) break;
+          STAT3(shadow.trav_prims,1,popcnt(active),K);
+          Vec3vf<K> p0,p1,p2; tri.gather(valid_i,p0,p1,p2,slot,context->scene,ray.time());
+          pre.intersectK(active,ray,p0,p1,p2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(active,ray,context,tri.geomID(),tri.primID(),slot));
+          if (none(active)) break;
+        }
+        return !active;
+      }
+
+      /*! Single ray k of the packet: gathers the vertices at ray.time()[k], intersects and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        Vec3vf<M> p0,p1,p2; tri.gather(p0,p1,p2,context->scene,ray.time()[k]);
+        pre.intersect(ray,k,p0,p1,p2,UVIdentity<Mx>(),Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Single ray k of the packet: tests whether it is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMi<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        Vec3vf<M> p0,p1,p2; tri.gather(p0,p1,p2,context->scene,ray.time()[k]);
+        return pre.intersect(ray,k,p0,p1,p2,UVIdentity<Mx>(),Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev.h
new file mode 100644
index 0000000000..19af389e73
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev.h
@@ -0,0 +1,157 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  /* Block of up to M triangles whose vertex positions are stored in struct-of-arrays layout */
+  template <int M>
+  struct TriangleMv
+  {
+  public:
+    struct Type : public PrimitiveType
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+    static Type type;
+
+  public:
+
+    /* Capacity of one block, i.e. the template parameter M */
+    static __forceinline size_t max_size() { return M; }
+
+    /* Number of blocks needed to hold N primitives (N divided by the capacity, rounded up) */
+    static __forceinline size_t blocks(size_t N) { const size_t m = max_size(); return (N+m-1)/m; }
+
+  public:
+
+    /* Default constructor; leaves all members uninitialized */
+    __forceinline TriangleMv() {}
+
+    /* Builds a block from three vertex vectors plus geometry and primitive IDs */
+    __forceinline TriangleMv(const Vec3vf<M>& v0, const Vec3vf<M>& v1, const Vec3vf<M>& v2, const vuint<M>& geomIDs, const vuint<M>& primIDs)
+      : v0(v0), v1(v1), v2(v2), geomIDs(geomIDs), primIDs(primIDs) {}
+
+    /* Per-slot validity mask: a slot is valid when its geometry ID differs from -1 */
+    __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+    /* Validity of slot i only (same geometry ID != -1 criterion) */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+
+    /* Index of the first invalid slot; equals the triangle count when valid slots come first */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Geometry IDs: whole vector (mutable / const) or the ID of slot i */
+    __forceinline       vuint<M>& geomID()       { return geomIDs; }
+    __forceinline const vuint<M>& geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Primitive IDs: whole vector (mutable / const) or the ID of slot i */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Bounding box over the valid triangles; invalid slots are replaced by +inf/-inf before reducing */
+    __forceinline BBox3fa bounds() const
+    {
+      Vec3vf<M> lo = min(v0,v1,v2);
+      Vec3vf<M> hi = max(v0,v1,v2);
+      vbool<M> live = valid();
+      lo.x = select(live,lo.x,vfloat<M>(pos_inf));
+      lo.y = select(live,lo.y,vfloat<M>(pos_inf));
+      lo.z = select(live,lo.z,vfloat<M>(pos_inf));
+      hi.x = select(live,hi.x,vfloat<M>(neg_inf));
+      hi.y = select(live,hi.y,vfloat<M>(neg_inf));
+      hi.z = select(live,hi.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lo.x),reduce_min(lo.y),reduce_min(lo.z)),
+                     Vec3fa(reduce_max(hi.x),reduce_max(hi.y),reduce_max(hi.z)));
+    }
+
+    /* Copies src to *dst member by member using the store_nt (non temporal) stores */
+    __forceinline static void store_nt(TriangleMv* dst, const TriangleMv& src)
+    {
+      vfloat<M>::store_nt(&dst->v0.x,src.v0.x); // 1st vertex
+      vfloat<M>::store_nt(&dst->v0.y,src.v0.y);
+      vfloat<M>::store_nt(&dst->v0.z,src.v0.z);
+      vfloat<M>::store_nt(&dst->v1.x,src.v1.x); // 2nd vertex
+      vfloat<M>::store_nt(&dst->v1.y,src.v1.y);
+      vfloat<M>::store_nt(&dst->v1.z,src.v1.z);
+      vfloat<M>::store_nt(&dst->v2.x,src.v2.x); // 3rd vertex
+      vfloat<M>::store_nt(&dst->v2.y,src.v2.y);
+      vfloat<M>::store_nt(&dst->v2.z,src.v2.z);
+      vuint<M>::store_nt(&dst->geomIDs,src.geomIDs); // IDs
+      vuint<M>::store_nt(&dst->primIDs,src.primIDs);
+    }
+
+    /* Loads up to M triangles from prims[begin..end) and writes the block to *this with store_nt */
+    __forceinline void fill(const PrimRef* prims, size_t& begin, size_t end, Scene* scene)
+    {
+      vuint<M> gids = -1, pids = -1; // slots that stay unused keep ID -1
+      Vec3vf<M> a = zero, b = zero, c = zero;
+      /* begin is advanced once per consumed primitive reference */
+      for (size_t slot=0; slot<M && begin<end; slot++, begin++)
+      {
+        const PrimRef& ref = prims[begin];
+        const unsigned gid = ref.geomID();
+        const unsigned pid = ref.primID();
+        const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(gid);
+        const TriangleMesh::Triangle& idx = mesh->triangle(pid);
+        const Vec3fa& p0 = mesh->vertex(idx.v[0]);
+        const Vec3fa& p1 = mesh->vertex(idx.v[1]);
+        const Vec3fa& p2 = mesh->vertex(idx.v[2]);
+        gids[slot] = gid;
+        pids[slot] = pid;
+        a.x[slot] = p0.x; a.y[slot] = p0.y; a.z[slot] = p0.z;
+        b.x[slot] = p1.x; b.y[slot] = p1.y; b.z[slot] = p1.z;
+        c.x[slot] = p2.x; c.y[slot] = p2.y; c.z[slot] = p2.z;
+      }
+      TriangleMv::store_nt(this,TriangleMv(a,b,c,gids,pids));
+    }
+
+    /* Re-reads the vertices of the stored triangles from mesh, rebuilds *this and returns the new bounds */
+    __forceinline BBox3fa update(TriangleMesh* mesh)
+    {
+      BBox3fa box = empty;
+      vuint<M> gids = -1, pids = -1;
+      Vec3vf<M> a = zero, b = zero, c = zero;
+
+      for (size_t slot=0; slot<M; slot++)
+      {
+        if (primID(slot) == -1) break; // first slot without a primitive ID ends the block
+        const unsigned gid = geomID(slot);
+        const unsigned pid = primID(slot);
+        const TriangleMesh::Triangle& idx = mesh->triangle(pid);
+        const Vec3fa p0 = mesh->vertex(idx.v[0]);
+        const Vec3fa p1 = mesh->vertex(idx.v[1]);
+        const Vec3fa p2 = mesh->vertex(idx.v[2]);
+        box.extend(merge(BBox3fa(p0),BBox3fa(p1),BBox3fa(p2)));
+        gids[slot] = gid;
+        pids[slot] = pid;
+        a.x[slot] = p0.x; a.y[slot] = p0.y; a.z[slot] = p0.z;
+        b.x[slot] = p1.x; b.y[slot] = p1.y; b.z[slot] = p1.z;
+        c.x[slot] = p2.x; c.y[slot] = p2.y; c.z[slot] = p2.z;
+      }
+      new (this) TriangleMv(a,b,c,gids,pids); // the IDs were read above, before this overwrite
+      return box;
+    }
+
+  public:
+    Vec3vf<M> v0;      // 1st vertex of the triangles
+    Vec3vf<M> v1;      // 2nd vertex of the triangles
+    Vec3vf<M> v2;      // 3rd vertex of the triangles
+  private:
+    vuint<M> geomIDs; // geometry ID per slot (-1 marks an invalid slot)
+    vuint<M> primIDs; // primitive ID per slot
+  };
+
+  template<int M> // out-of-class definition of the static member TriangleMv<M>::type
+  typename TriangleMv<M>::Type TriangleMv<M>::type;
+
+  typedef TriangleMv<4> Triangle4v; // shorthand for the M == 4 instantiation
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h
new file mode 100644
index 0000000000..6af0d5a11c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev_intersector.h
@@ -0,0 +1,206 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "triangle_intersector_pluecker.h"
+#include "triangle_intersector_moeller.h"
+#include "triangle_intersector_woop.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Moeller-Trumbore intersector: M stored triangles against 1 ray (no UVIdentity argument, unlike Pluecker) */
+    template<int M, int Mx, bool filter>
+    struct TriangleMvIntersector1Moeller
+    {
+      using Primitive = TriangleMv<M>;
+      using Precalculations = MoellerTrumboreIntersector1<Mx>;
+
+      /*! Intersects the ray with the vertices stored in tri and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,tri.v0,tri.v1,tri.v2,Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Tests whether the ray is occluded by one of the triangles stored in tri. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersect(ray,tri.v0,tri.v1,tri.v2,Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      /*! Delegates point queries to PrimitivePointQuery1 for this primitive type. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+
+    template<int M, int Mx, bool filter> // M stored triangles against 1 ray via WoopIntersector1
+    struct TriangleMvIntersector1Woop
+    {
+      using Primitive = TriangleMv<M>;
+      using intersec = WoopIntersector1<Mx>;
+      using Precalculations = WoopPrecalculations1<M>;
+      /* note: intersec is called statically and receives the precalculations as an argument */
+      /*! Intersects the ray with the vertices stored in tri and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Tests whether the ray is occluded by one of the triangles stored in tri. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return intersec::intersect(ray,pre,tri.v0,tri.v1,tri.v2,Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      /*! Delegates point queries to PrimitivePointQuery1 for this primitive type. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+
+    /*! Intersects M triangles with K rays (Moeller-Trumbore; vertices are stored in the primitive) */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMvIntersectorKMoeller
+    {
+      typedef TriangleMv<M> Primitive;
+      typedef MoellerTrumboreIntersectorK<Mx,K> Precalculations;
+
+      /*! Intersects the K rays selected by valid_i with every valid triangle of the block, one triangle at a time. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        for (size_t i=0; i<M; i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); // vertices of triangle i as K-wide vectors
+          const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+          const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+          pre.intersectK(valid_i,ray,v0,v1,v2,/*UVIdentity<K>(),*/IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Occlusion test for K rays; returns the complement of the mask of rays that are still active at the end. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        vbool<K> valid0 = valid_i; // working mask; handed by reference to the epilog
+
+        for (size_t i=0; i<M; i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K); // count the mask actually tested below, not the entry mask
+          const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i);
+          const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+          const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+          pre.intersectK(valid0,ray,v0,v1,v2,/*UVIdentity<K>(),*/OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break; // no ray left to test
+        }
+        return !valid0;
+      }
+
+      /*! Intersects ray k of the packet with all M triangles at once and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<Mx>(),*/Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx
+      }
+
+      /*! Tests whether ray k of the packet is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,/*UVIdentity<Mx>(),*/Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx
+      }
+    };
+
+    /*! Pluecker intersector: M stored triangles against 1 ray, using the UVIdentity mapping */
+    template<int M, int Mx, bool filter>
+    struct TriangleMvIntersector1Pluecker
+    {
+      using Primitive = TriangleMv<M>;
+      using Precalculations = PlueckerIntersector1<Mx>;
+
+      /*! Intersects the ray with the vertices stored in tri and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHit& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<Mx>(),Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Tests whether the ray is occluded by one of the triangles stored in tri. */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersect(ray,tri.v0,tri.v1,tri.v2,UVIdentity<Mx>(),Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      /*! Delegates point queries to PrimitivePointQuery1 for this primitive type. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+
+    /*! Intersects M triangles with K rays (Pluecker test; vertices are stored in the primitive) */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMvIntersectorKPluecker
+    {
+      typedef TriangleMv<M> Primitive;
+      typedef PlueckerIntersectorK<Mx,K> Precalculations;
+
+      /*! Intersects the K rays selected by valid_i with every valid triangle of the block, one triangle at a time. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        for (size_t i=0; i<M; i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i); // vertices of triangle i as K-wide vectors
+          const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+          const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+          pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Occlusion test for K rays; returns the complement of the mask of rays that are still active at the end. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const Primitive& tri)
+      {
+        vbool<K> valid0 = valid_i; // working mask; handed by reference to the epilog
+
+        for (size_t i=0; i<M; i++)
+        {
+          if (!tri.valid(i)) break; // stop at the first invalid slot
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K); // count the mask actually tested below, not the entry mask
+          const Vec3vf<K> v0 = broadcast<vfloat<K>>(tri.v0,i);
+          const Vec3vf<K> v1 = broadcast<vfloat<K>>(tri.v1,i);
+          const Vec3vf<K> v2 = broadcast<vfloat<K>>(tri.v2,i);
+          pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break; // no ray left to test
+        }
+        return !valid0;
+      }
+
+      /*! Intersects ray k of the packet with all M triangles at once and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<Mx>(),Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx
+      }
+
+      /*! Tests whether ray k of the packet is occluded by one of the M triangles. */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const Primitive& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        return pre.intersect(ray,k,tri.v0,tri.v1,tri.v2,UVIdentity<Mx>(),Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID())); //FIXME: M,Mx
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h
new file mode 100644
index 0000000000..63137aee16
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb.h
@@ -0,0 +1,201 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "primitive.h"
+
+namespace embree
+{
+  /* Stores M motion blur triangles in struct of array layout: the vertices at t0 plus, per vertex, the difference vector to t1 */
+  template<int M>
+  struct TriangleMvMB
+  {
+  public:
+    struct Type : public PrimitiveType 
+    {
+      const char* name() const;
+      size_t sizeActive(const char* This) const;
+      size_t sizeTotal(const char* This) const;
+      size_t getBytes(const char* This) const;
+    };
+
+    static Type type;
+
+  public:
+
+    /* this primitive always covers a single time segment (t0 to t1) */
+    static const bool singleTimeSegment = true;
+
+    /* Returns maximum number of stored triangles */
+    static __forceinline size_t max_size() { return M; }
+    
+    /* Returns required number of primitive blocks for N primitives (N divided by max_size(), rounded up) */
+    static __forceinline size_t blocks(size_t N) { return (N+max_size()-1)/max_size(); }
+   
+  public:
+
+    /* Default constructor; initializes nothing explicitly */
+    __forceinline TriangleMvMB() {}
+
+    /* Construction from the vertices at t0 (a0,b0,c0) and at t1 (a1,b1,c1) and IDs; stores the t0 vertices and the differences t1-t0 */
+    __forceinline TriangleMvMB(const Vec3vf<M>& a0, const Vec3vf<M>& a1,
+                               const Vec3vf<M>& b0, const Vec3vf<M>& b1,
+                               const Vec3vf<M>& c0, const Vec3vf<M>& c1,
+                               const vuint<M>& geomIDs, const vuint<M>& primIDs)
+      : v0(a0), v1(b0), v2(c0), dv0(a1-a0), dv1(b1-b0), dv2(c1-c0), geomIDs(geomIDs), primIDs(primIDs) {}
+
+    /* Returns a mask that tells which triangles are valid (a slot is valid if its geometry ID is not -1) */
+    __forceinline vbool<M> valid() const { return geomIDs != vuint<M>(-1); }
+
+    /* Returns if the specified triangle is valid (its geometry ID is not -1) */
+    __forceinline bool valid(const size_t i) const { assert(i<M); return geomIDs[i] != -1; }
+
+    /* Returns the number of stored triangles, computed as the index of the first invalid slot (assumes valid slots come first) */
+    __forceinline size_t size() const { return bsf(~movemask(valid())); }
+
+    /* Returns the geometry IDs (all slots, or the one of triangle i) */
+    __forceinline       vuint<M>& geomID()       { return geomIDs; }
+    __forceinline const vuint<M>& geomID() const { return geomIDs; }
+    __forceinline unsigned int geomID(const size_t i) const { assert(i<M); return geomIDs[i]; }
+
+    /* Returns the primitive IDs (all slots, or the one of triangle i) */
+    __forceinline       vuint<M>& primID()       { return primIDs; }
+    __forceinline const vuint<M>& primID() const { return primIDs; }
+    __forceinline unsigned int primID(const size_t i) const { assert(i<M); return primIDs[i]; }
+
+    /* Calculate the bounds of the triangles at t0; invalid slots are replaced by +inf/-inf before the min/max reduction */
+    __forceinline BBox3fa bounds0() const 
+    {
+      Vec3vf<M> lower = min(v0,v1,v2);
+      Vec3vf<M> upper = max(v0,v1,v2);
+      const vbool<M> mask = valid();
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+		     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+
+    /* Calculate the bounds of the triangles at t1 (vertices v+dv); invalid slots are replaced by +inf/-inf before the min/max reduction */
+    __forceinline BBox3fa bounds1() const 
+    {
+      const Vec3vf<M> p0 = v0+dv0;
+      const Vec3vf<M> p1 = v1+dv1;
+      const Vec3vf<M> p2 = v2+dv2;
+      Vec3vf<M> lower = min(p0,p1,p2);
+      Vec3vf<M> upper = max(p0,p1,p2);
+      const vbool<M> mask = valid();
+      lower.x = select(mask,lower.x,vfloat<M>(pos_inf));
+      lower.y = select(mask,lower.y,vfloat<M>(pos_inf));
+      lower.z = select(mask,lower.z,vfloat<M>(pos_inf));
+      upper.x = select(mask,upper.x,vfloat<M>(neg_inf));
+      upper.y = select(mask,upper.y,vfloat<M>(neg_inf));
+      upper.z = select(mask,upper.z,vfloat<M>(neg_inf));
+      return BBox3fa(Vec3fa(reduce_min(lower.x),reduce_min(lower.y),reduce_min(lower.z)),
+		     Vec3fa(reduce_max(upper.x),reduce_max(upper.y),reduce_max(upper.z)));
+    }
+
+    /* Calculate the linear bounds of the primitive from the bounds at t0 and t1 */
+    __forceinline LBBox3fa linearBounds() const {
+      return LBBox3fa(bounds0(),bounds1());
+    }
+
+    /* Fill from up to M entries of prims starting at begin (begin is advanced), using time steps itime and itime+1; returns the bounds at both steps */
+    __forceinline LBBox3fa fillMB(const PrimRef* prims, size_t& begin, size_t end, Scene* scene, size_t itime)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> va0 = zero, vb0 = zero, vc0 = zero;
+      Vec3vf<M> va1 = zero, vb1 = zero, vc1 = zero;
+
+      BBox3fa bounds0 = empty;
+      BBox3fa bounds1 = empty;
+      
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+	const PrimRef& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const TriangleMesh* __restrict__ const mesh = scene->get<TriangleMesh>(geomID);
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+        const Vec3fa& a0 = mesh->vertex(tri.v[0],itime+0); bounds0.extend(a0);
+        const Vec3fa& a1 = mesh->vertex(tri.v[0],itime+1); bounds1.extend(a1);
+        const Vec3fa& b0 = mesh->vertex(tri.v[1],itime+0); bounds0.extend(b0);
+        const Vec3fa& b1 = mesh->vertex(tri.v[1],itime+1); bounds1.extend(b1);
+        const Vec3fa& c0 = mesh->vertex(tri.v[2],itime+0); bounds0.extend(c0);
+        const Vec3fa& c1 = mesh->vertex(tri.v[2],itime+1); bounds1.extend(c1);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        va0.x[i] = a0.x; va0.y[i] = a0.y; va0.z[i] = a0.z;
+	va1.x[i] = a1.x; va1.y[i] = a1.y; va1.z[i] = a1.z;
+	vb0.x[i] = b0.x; vb0.y[i] = b0.y; vb0.z[i] = b0.z;
+	vb1.x[i] = b1.x; vb1.y[i] = b1.y; vb1.z[i] = b1.z;
+	vc0.x[i] = c0.x; vc0.y[i] = c0.y; vc0.z[i] = c0.z;
+	vc1.x[i] = c1.x; vc1.y[i] = c1.y; vc1.z[i] = c1.z;
+      }
+      new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID);
+      return LBBox3fa(bounds0,bounds1);
+    }
+
+    /* Fill from up to M entries of prims starting at begin (begin is advanced); the vertices of the single time segment found for time_range are passed through globalLinear; returns the merged linear bounds */
+    __forceinline LBBox3fa fillMB(const PrimRefMB* prims, size_t& begin, size_t end, Scene* scene, const BBox1f time_range)
+    {
+      vuint<M> vgeomID = -1, vprimID = -1;
+      Vec3vf<M> va0 = zero, vb0 = zero, vc0 = zero;
+      Vec3vf<M> va1 = zero, vb1 = zero, vc1 = zero;
+
+      LBBox3fa allBounds = empty;
+      for (size_t i=0; i<M && begin<end; i++, begin++)
+      {
+        const PrimRefMB& prim = prims[begin];
+        const unsigned geomID = prim.geomID();
+        const unsigned primID = prim.primID();
+        const TriangleMesh* const mesh = scene->get<TriangleMesh>(geomID);
+        const range<int> itime_range = mesh->timeSegmentRange(time_range);
+        assert(itime_range.size() == 1);
+        const int ilower = itime_range.begin();
+        const TriangleMesh::Triangle& tri = mesh->triangle(primID);
+        allBounds.extend(mesh->linearBounds(primID, time_range));
+        const Vec3fa& a0 = mesh->vertex(tri.v[0],ilower+0);
+        const Vec3fa& a1 = mesh->vertex(tri.v[0],ilower+1);
+        const Vec3fa& b0 = mesh->vertex(tri.v[1],ilower+0);
+        const Vec3fa& b1 = mesh->vertex(tri.v[1],ilower+1);
+        const Vec3fa& c0 = mesh->vertex(tri.v[2],ilower+0);
+        const Vec3fa& c1 = mesh->vertex(tri.v[2],ilower+1);
+        const BBox1f time_range_v(mesh->timeStep(ilower+0),mesh->timeStep(ilower+1));
+        auto a01 = globalLinear(std::make_pair(a0,a1),time_range_v);
+        auto b01 = globalLinear(std::make_pair(b0,b1),time_range_v);
+        auto c01 = globalLinear(std::make_pair(c0,c1),time_range_v);
+        vgeomID [i] = geomID;
+        vprimID [i] = primID;
+        va0.x[i] = a01.first .x; va0.y[i] = a01.first .y; va0.z[i] = a01.first .z;
+	va1.x[i] = a01.second.x; va1.y[i] = a01.second.y; va1.z[i] = a01.second.z;
+	vb0.x[i] = b01.first .x; vb0.y[i] = b01.first .y; vb0.z[i] = b01.first .z;
+	vb1.x[i] = b01.second.x; vb1.y[i] = b01.second.y; vb1.z[i] = b01.second.z;
+	vc0.x[i] = c01.first .x; vc0.y[i] = c01.first .y; vc0.z[i] = c01.first .z;
+	vc1.x[i] = c01.second.x; vc1.y[i] = c01.second.y; vc1.z[i] = c01.second.z;
+      }
+      new (this) TriangleMvMB(va0,va1,vb0,vb1,vc0,vc1,vgeomID,vprimID);
+      return allBounds;
+    }
+
+  public:
+    Vec3vf<M> v0;      // 1st vertex of the triangles at t0
+    Vec3vf<M> v1;      // 2nd vertex of the triangles at t0
+    Vec3vf<M> v2;      // 3rd vertex of the triangles at t0
+    Vec3vf<M> dv0;     // difference vector between time steps t0 and t1 for first vertex
+    Vec3vf<M> dv1;     // difference vector between time steps t0 and t1 for second vertex
+    Vec3vf<M> dv2;     // difference vector between time steps t0 and t1 for third vertex
+  private:
+    vuint<M> geomIDs; // geometry ID per triangle, -1 marks an unused slot
+    vuint<M> primIDs; // primitive ID per triangle
+  };
+
+  template<int M>
+  typename TriangleMvMB<M>::Type TriangleMvMB<M>::type;
+
+  typedef TriangleMvMB<4> Triangle4vMB;
+}
diff --git a/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h
new file mode 100644
index 0000000000..35a260d826
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/geometry/trianglev_mb_intersector.h
@@ -0,0 +1,211 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "triangle.h"
+#include "intersector_epilog.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    /*! Intersects M motion blur triangles with 1 ray, using MoellerTrumboreIntersector1 as precalculations. */
+    template<int M, int Mx, bool filter>
+    struct TriangleMvMBIntersector1Moeller
+    {
+      typedef TriangleMvMB<M> Primitive;
+      typedef MoellerTrumboreIntersector1<Mx> Precalculations;
+
+      /*! Intersects a ray with the M triangles, moved to the ray time via madd(time,dv,v), and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const Vec3vf<Mx> time(ray.time());
+        const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        pre.intersect(ray,v0,v1,v2,Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Tests if the ray is occluded by one of the M triangles, moved to the ray time via madd(time,dv,v). */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const Vec3vf<Mx> time(ray.time());
+        const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        return pre.intersect(ray,v0,v1,v2,Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      /*! Forwards the point query to PrimitivePointQuery1 for this primitive type. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+    
+    /*! Intersects M motion blur triangles with K rays, using MoellerTrumboreIntersectorK as precalculations. */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMvMBIntersectorKMoeller
+    {
+      typedef TriangleMvMB<M> Primitive;
+      typedef MoellerTrumboreIntersectorK<Mx,K> Precalculations;
+
+      /*! Intersects K rays with M triangles, one triangle at a time: triangle i is broadcast and moved to the ray times via madd(time,dv,v); stops at the first invalid slot. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> time(ray.time());
+          const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+          const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+          const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+          pre.intersectK(valid_i,ray,v0,v1,v2,IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Tests K rays for occlusion by any of the M triangles; stops early once none(valid0) holds and returns the negation of the remaining mask valid0. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        vbool<K> valid0 = valid_i;
+
+        for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> time(ray.time());
+          const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+          const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+          const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+          pre.intersectK(valid0,ray,v0,v1,v2,OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break;
+        }
+        return !valid0;
+      }
+      
+      /*! Intersects ray k of the packet with the M triangles, moved to that ray's time via madd(time,dv,v), and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const Vec3vf<Mx> time(ray.time()[k]);
+        const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        pre.intersect(ray,k,v0,v1,v2,Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Tests if ray k of the packet is occluded by one of the M triangles, moved to that ray's time via madd(time,dv,v). */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const Vec3vf<Mx> time(ray.time()[k]);
+        const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        return pre.intersect(ray,k,v0,v1,v2,Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+
+    /*! Intersects M motion blur triangles with 1 ray, using PlueckerIntersector1 as precalculations. */
+    template<int M, int Mx, bool filter>
+    struct TriangleMvMBIntersector1Pluecker
+    {
+      typedef TriangleMvMB<M> Primitive;
+      typedef PlueckerIntersector1<Mx> Precalculations;
+
+      /*! Intersects a ray with the M triangles, moved to the ray time via madd(time,dv,v), and updates the hit. */
+      static __forceinline void intersect(const Precalculations& pre, RayHit& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const Vec3vf<Mx> time(ray.time());
+        const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        pre.intersect(ray,v0,v1,v2,UVIdentity<Mx>(),Intersect1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Tests if the ray is occluded by one of the M triangles, moved to the ray time via madd(time,dv,v). */
+      static __forceinline bool occluded(const Precalculations& pre, Ray& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const Vec3vf<Mx> time(ray.time());
+        const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        return pre.intersect(ray,v0,v1,v2,UVIdentity<Mx>(),Occluded1EpilogM<M,Mx,filter>(ray,context,tri.geomID(),tri.primID()));
+      }
+      /*! Forwards the point query to PrimitivePointQuery1 for this primitive type. */
+      static __forceinline bool pointQuery(PointQuery* query, PointQueryContext* context, const Primitive& tri)
+      {
+        return PrimitivePointQuery1<Primitive>::pointQuery(query, context, tri);
+      }
+    };
+    
+    /*! Intersects M motion blur triangles with K rays, using PlueckerIntersectorK as precalculations. */
+    template<int M, int Mx, int K, bool filter>
+    struct TriangleMvMBIntersectorKPluecker
+    {
+      typedef TriangleMvMB<M> Primitive;
+      typedef PlueckerIntersectorK<Mx,K> Precalculations;
+
+      /*! Intersects K rays with M triangles, one triangle at a time: triangle i is broadcast and moved to the ray times via madd(time,dv,v); stops at the first invalid slot. */
+      static __forceinline void intersect(const vbool<K>& valid_i, Precalculations& pre, RayHitK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(normal.trav_prims,1,popcnt(valid_i),K);
+          const Vec3vf<K> time(ray.time());
+          const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+          const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+          const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+          pre.intersectK(valid_i,ray,v0,v1,v2,UVIdentity<K>(),IntersectKEpilogM<M,K,filter>(ray,context,tri.geomID(),tri.primID(),i));
+        }
+      }
+
+      /*! Tests K rays for occlusion by any of the M triangles; stops early once none(valid0) holds and returns the negation of the remaining mask valid0. */
+      static __forceinline vbool<K> occluded(const vbool<K>& valid_i, Precalculations& pre, RayK<K>& ray, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        vbool<K> valid0 = valid_i;
+
+        for (size_t i=0; i<TriangleMvMB<M>::max_size(); i++)
+        {
+          if (!tri.valid(i)) break;
+          STAT3(shadow.trav_prims,1,popcnt(valid0),K);
+          const Vec3vf<K> time(ray.time());
+          const Vec3vf<K> v0 = madd(time,broadcast<vfloat<K>>(tri.dv0,i),broadcast<vfloat<K>>(tri.v0,i));
+          const Vec3vf<K> v1 = madd(time,broadcast<vfloat<K>>(tri.dv1,i),broadcast<vfloat<K>>(tri.v1,i));
+          const Vec3vf<K> v2 = madd(time,broadcast<vfloat<K>>(tri.dv2,i),broadcast<vfloat<K>>(tri.v2,i));
+          pre.intersectK(valid0,ray,v0,v1,v2,UVIdentity<K>(),OccludedKEpilogM<M,K,filter>(valid0,ray,context,tri.geomID(),tri.primID(),i));
+          if (none(valid0)) break;
+        }
+        return !valid0;
+      }
+
+      /*! Intersects ray k of the packet with the M triangles, moved to that ray's time via madd(time,dv,v), and updates the hit. */
+      static __forceinline void intersect(Precalculations& pre, RayHitK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(normal.trav_prims,1,1,1);
+        const Vec3vf<Mx> time(ray.time()[k]);
+        const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Intersect1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+
+      /*! Tests if ray k of the packet is occluded by one of the M triangles, moved to that ray's time via madd(time,dv,v). */
+      static __forceinline bool occluded(Precalculations& pre, RayK<K>& ray, size_t k, IntersectContext* context, const TriangleMvMB<M>& tri)
+      {
+        STAT3(shadow.trav_prims,1,1,1);
+        const Vec3vf<Mx> time(ray.time()[k]);
+        const Vec3vf<Mx> v0 = madd(time,Vec3vf<Mx>(tri.dv0),Vec3vf<Mx>(tri.v0));
+        const Vec3vf<Mx> v1 = madd(time,Vec3vf<Mx>(tri.dv1),Vec3vf<Mx>(tri.v1));
+        const Vec3vf<Mx> v2 = madd(time,Vec3vf<Mx>(tri.dv2),Vec3vf<Mx>(tri.v2));
+        return pre.intersect(ray,k,v0,v1,v2,UVIdentity<Mx>(),Occluded1KEpilogM<M,Mx,K,filter>(ray,k,context,tri.geomID(),tri.primID()));
+      }
+    };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/hash.h b/thirdparty/embree-aarch64/kernels/hash.h
new file mode 100644
index 0000000000..4abbe203d6
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/hash.h
@@ -0,0 +1,5 @@
+
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTC_HASH "6ef362f99af80c9dfe8dd2bfc582d9067897edc6"
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bezier_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/bezier_curve.h
new file mode 100644
index 0000000000..c0e78820f8
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/bezier_curve.h
@@ -0,0 +1,669 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/scene_curves.h"
+
+namespace embree
+{
+  class BezierBasis
+  {
+  public:
+    /* Returns the four weights (B0..B3) that CubicBezierCurve::eval applies to v0..v3 at parameter u, with t0 = 1-u and t1 = u */
+    template<typename T>
+      static __forceinline Vec4<T> eval(const T& u) 
+    {
+      const T t1 = u;
+      const T t0 = 1.0f-t1;
+      const T B0 = t0 * t0 * t0;
+      const T B1 = 3.0f * t1 * (t0 * t0);
+      const T B2 = 3.0f * (t1 * t1) * t0;
+      const T B3 = t1 * t1 * t1;
+      return Vec4<T>(B0,B1,B2,B3);
+    }
+    /* Returns the four weights for the first derivative with respect to u; the common factor 3 is applied in the return statement */
+    template<typename T>
+      static __forceinline Vec4<T>  derivative(const T& u)
+    {
+      const T t1 = u;
+      const T t0 = 1.0f-t1;
+      const T B0 = -(t0*t0);
+      const T B1 = madd(-2.0f,t0*t1,t0*t0);
+      const T B2 = msub(+2.0f,t0*t1,t1*t1);
+      const T B3 = +(t1*t1);
+      return T(3.0f)*Vec4<T>(B0,B1,B2,B3);
+    }
+    /* Returns the four weights for the second derivative with respect to u; the common factor 6 is applied in the return statement */
+    template<typename T>
+      static __forceinline Vec4<T>  derivative2(const T& u)
+    {
+      const T t1 = u;
+      const T t0 = 1.0f-t1;
+      const T B0 = t0;
+      const T B1 = madd(-2.0f,t0,t1);
+      const T B2 = madd(-2.0f,t1,t0);
+      const T B3 = t1;
+      return T(6.0f)*Vec4<T>(B0,B1,B2,B3);
+    }
+  };
+  
+  struct PrecomputedBezierBasis
+  {
+    enum { N = 16 };
+  public:
+    PrecomputedBezierBasis() {}
+    PrecomputedBezierBasis(int shift);
+    /* the default constructor fills no table; the shift constructor is only declared here (presumably it fills them - confirm in its definition) */
+    /* basis for bezier evaluation: one (N+1)x(N+1) float table per weight c0..c3 */
+  public:
+    float c0[N+1][N+1];
+    float c1[N+1][N+1];
+    float c2[N+1][N+1];
+    float c3[N+1][N+1];
+    
+    /* basis for bezier derivative evaluation: one (N+1)x(N+1) float table per weight d0..d3 */
+  public:
+    float d0[N+1][N+1];
+    float d1[N+1][N+1];
+    float d2[N+1][N+1];
+    float d3[N+1][N+1];
+  };
+  extern PrecomputedBezierBasis bezier_basis0;
+  extern PrecomputedBezierBasis bezier_basis1;
+
+  
+  template<typename V>
+    struct LinearBezierCurve
+    {
+      V v0,v1;
+      /* curve given by its two control points v0 and v1; the default constructor does not set them explicitly */
+      __forceinline LinearBezierCurve () {}
+      
+      __forceinline LinearBezierCurve (const LinearBezierCurve& other)
+        : v0(other.v0), v1(other.v1) {}
+      
+      __forceinline LinearBezierCurve& operator= (const LinearBezierCurve& other) {
+        v0 = other.v0; v1 = other.v1; return *this;
+      }
+        
+        __forceinline LinearBezierCurve (const V& v0, const V& v1)
+          : v0(v0), v1(v1) {}
+      /* first and last control point */
+      __forceinline V begin() const { return v0; }
+      __forceinline V end  () const { return v1; }
+      /* only declared here; a specialization for V = Interval1f follows the struct */
+      bool hasRoot() const;
+      /* prints both control points */
+      friend embree_ostream operator<<(embree_ostream cout, const LinearBezierCurve& a) {
+        return cout << "LinearBezierCurve (" << a.v0 << ", " << a.v1 << ")";
+      }
+    };
+  /* Interval1f specialization: true if numRoots(v0,v1) is non-zero (its result is converted to bool) */
+  template<> __forceinline bool LinearBezierCurve<Interval1f>::hasRoot() const {
+    return numRoots(v0,v1);
+  }
+  
+  template<typename V>
+    struct QuadraticBezierCurve
+    {
+      V v0,v1,v2;
+      /* curve given by its three control points v0,v1,v2; the default constructor does not set them explicitly */
+      __forceinline QuadraticBezierCurve () {}
+      
+      __forceinline QuadraticBezierCurve (const QuadraticBezierCurve& other)
+        : v0(other.v0), v1(other.v1), v2(other.v2) {}
+      
+      __forceinline QuadraticBezierCurve& operator= (const QuadraticBezierCurve& other) {
+        v0 = other.v0; v1 = other.v1; v2 = other.v2; return *this;
+      }
+        
+        __forceinline QuadraticBezierCurve (const V& v0, const V& v1, const V& v2)
+          : v0(v0), v1(v1), v2(v2) {}
+      /* first and last control point */
+      __forceinline V begin() const { return v0; }
+      __forceinline V end  () const { return v2; }
+      /* merge of the three control points */
+      __forceinline V interval() const {
+        return merge(v0,v1,v2);
+      }
+      /* merge of the bounding boxes of the three control points */
+      __forceinline BBox<V> bounds() const {
+        return merge(BBox<V>(v0),BBox<V>(v1),BBox<V>(v2));
+      }
+      /* prints the three control points; this struct has no member u, so no parameter range is printed */
+      friend embree_ostream operator<<(embree_ostream cout, const QuadraticBezierCurve& a) {
+        return cout << "QuadraticBezierCurve (" << a.v0 << ", " << a.v1 << ", " << a.v2 << ")";
+      }
+    };
+  
+  
+  typedef QuadraticBezierCurve<float> QuadraticBezierCurve1f;
+  typedef QuadraticBezierCurve<Vec2fa> QuadraticBezierCurve2fa;
+  typedef QuadraticBezierCurve<Vec3fa> QuadraticBezierCurve3fa;
+
+  template<typename Vertex>
+    struct CubicBezierCurve
+    {
+      Vertex v0,v1,v2,v3;
+      
+      __forceinline CubicBezierCurve() {}
+
+      template<typename T1>
+      __forceinline CubicBezierCurve (const CubicBezierCurve<T1>& other)
+      : v0(other.v0), v1(other.v1), v2(other.v2), v3(other.v3) {}
+      
+      __forceinline CubicBezierCurve& operator= (const CubicBezierCurve& other) {
+        v0 = other.v0; v1 = other.v1; v2 = other.v2; v3 = other.v3; return *this;
+      }
+      
+      __forceinline CubicBezierCurve(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3)
+        : v0(v0), v1(v1), v2(v2), v3(v3) {}
+
+      __forceinline Vertex begin() const {
+        return v0;
+      }
+
+      __forceinline Vertex end() const {
+        return v3;
+      }
+
+      __forceinline Vertex center() const {
+        return 0.25f*(v0+v1+v2+v3);
+      }
+
+      __forceinline Vertex begin_direction() const {
+        return v1-v0;
+      }
+
+      __forceinline Vertex end_direction() const {
+        return v3-v2;
+      }
+
+      __forceinline CubicBezierCurve<float> xfm(const Vertex& dx) const {
+        return CubicBezierCurve<float>(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx));
+      }
+      
+      __forceinline CubicBezierCurve<vfloatx> vxfm(const Vertex& dx) const {
+        return CubicBezierCurve<vfloatx>(dot(v0,dx),dot(v1,dx),dot(v2,dx),dot(v3,dx));
+      }
+      
+      __forceinline CubicBezierCurve<float> xfm(const Vertex& dx, const Vertex& p) const {
+        return CubicBezierCurve<float>(dot(v0-p,dx),dot(v1-p,dx),dot(v2-p,dx),dot(v3-p,dx));
+      }
+
+       __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space) const
+      {
+        const Vec3fa q0 = xfmVector(space,v0);
+        const Vec3fa q1 = xfmVector(space,v1);
+        const Vec3fa q2 = xfmVector(space,v2);
+        const Vec3fa q3 = xfmVector(space,v3);
+        return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3);
+      }
+      
+      __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        const Vec3fa q0 = xfmVector(space,v0-p);
+        const Vec3fa q1 = xfmVector(space,v1-p);
+        const Vec3fa q2 = xfmVector(space,v2-p);
+        const Vec3fa q3 = xfmVector(space,v3-p);
+        return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3);
+      }
+
+      __forceinline CubicBezierCurve<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w);
+        const Vec3ff q1(xfmVector(space,(Vec3fa)v1-p), v1.w);
+        const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w);
+        const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w);
+        return CubicBezierCurve<Vec3ff>(q0,q1,q2,q3);
+      }
+
+      __forceinline CubicBezierCurve<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const
+      {
+        const Vec3fa q0 = xfmVector(space,s*(v0-p));
+        const Vec3fa q1 = xfmVector(space,s*(v1-p));
+        const Vec3fa q2 = xfmVector(space,s*(v2-p));
+        const Vec3fa q3 = xfmVector(space,s*(v3-p));
+        return CubicBezierCurve<Vec3fa>(q0,q1,q2,q3);
+      }
+      
+      __forceinline int maxRoots() const;
+      
+      __forceinline BBox<Vertex> bounds() const {
+        return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3));
+      }
+      
+      __forceinline friend CubicBezierCurve operator +( const CubicBezierCurve& a, const CubicBezierCurve& b ) {
+        return CubicBezierCurve(a.v0+b.v0,a.v1+b.v1,a.v2+b.v2,a.v3+b.v3);
+      }
+      
+      __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const CubicBezierCurve& b ) {
+        return CubicBezierCurve(a.v0-b.v0,a.v1-b.v1,a.v2-b.v2,a.v3-b.v3);
+      }
+      
+      __forceinline friend CubicBezierCurve operator -( const CubicBezierCurve& a, const Vertex& b ) {
+        return CubicBezierCurve(a.v0-b,a.v1-b,a.v2-b,a.v3-b);
+      }
+      
+      __forceinline friend CubicBezierCurve operator *( const Vertex& a, const CubicBezierCurve& b ) {
+        return CubicBezierCurve(a*b.v0,a*b.v1,a*b.v2,a*b.v3);
+      }
+
+      __forceinline friend CubicBezierCurve cmadd( const Vertex& a, const CubicBezierCurve& b,  const CubicBezierCurve& c) {
+        return CubicBezierCurve(madd(a,b.v0,c.v0),madd(a,b.v1,c.v1),madd(a,b.v2,c.v2),madd(a,b.v3,c.v3));
+      }
+      
+      __forceinline friend CubicBezierCurve clerp ( const CubicBezierCurve& a, const CubicBezierCurve& b, const Vertex& t ) {
+        return cmadd((Vertex(1.0f)-t),a,t*b);
+      }
+      
+      __forceinline friend CubicBezierCurve merge ( const CubicBezierCurve& a, const CubicBezierCurve& b ) {
+        return CubicBezierCurve(merge(a.v0,b.v0),merge(a.v1,b.v1),merge(a.v2,b.v2),merge(a.v3,b.v3));
+      }
+      
+      __forceinline void split(CubicBezierCurve& left, CubicBezierCurve& right, const float t = 0.5f) const
+      {
+        const Vertex p00 = v0;
+        const Vertex p01 = v1;
+        const Vertex p02 = v2;
+        const Vertex p03 = v3;
+        
+        const Vertex p10 = lerp(p00,p01,t);
+        const Vertex p11 = lerp(p01,p02,t);
+        const Vertex p12 = lerp(p02,p03,t);
+        const Vertex p20 = lerp(p10,p11,t);
+        const Vertex p21 = lerp(p11,p12,t);
+        const Vertex p30 = lerp(p20,p21,t);
+        
+        new (&left ) CubicBezierCurve(p00,p10,p20,p30);
+        new (&right) CubicBezierCurve(p30,p21,p12,p03);
+      }
+      
+      __forceinline CubicBezierCurve<Vec2vfx> split() const
+      {
+        const float u0 = 0.0f, u1 = 1.0f;
+        const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1)));
+        const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1)));
+        Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale);
+        const Vec2vfx P3 = shift_right_1(P0);
+        const Vec2vfx dP3du = shift_right_1(dP0du); 
+        const Vec2vfx P1 = P0 + dP0du; 
+        const Vec2vfx P2 = P3 - dP3du;
+        return CubicBezierCurve<Vec2vfx>(P0,P1,P2,P3);
+      }
+      
+      __forceinline CubicBezierCurve<Vec2vfx> split(const BBox1f& u) const
+      {
+        const float u0 = u.lower, u1 = u.upper;
+        const float dscale = (u1-u0)*(1.0f/(3.0f*(VSIZEX-1)));
+        const vfloatx vu0 = lerp(u0,u1,vfloatx(step)*(1.0f/(VSIZEX-1)));
+        Vec2vfx P0, dP0du; evalN(vu0,P0,dP0du); dP0du = dP0du * Vec2vfx(dscale);
+        const Vec2vfx P3 = shift_right_1(P0);
+        const Vec2vfx dP3du = shift_right_1(dP0du); 
+        const Vec2vfx P1 = P0 + dP0du; 
+        const Vec2vfx P2 = P3 - dP3du;
+        return CubicBezierCurve<Vec2vfx>(P0,P1,P2,P3);
+      }
+      
+      /*! Computes the position p and the first derivative dp of the curve at
+       *  parameter t using three levels of linear interpolation. */
+      __forceinline void eval(float t, Vertex& p, Vertex& dp) const
+      {
+        /* level 1: blend neighbouring control points */
+        const Vertex a0 = lerp(v0,v1,t);
+        const Vertex a1 = lerp(v1,v2,t);
+        const Vertex a2 = lerp(v2,v3,t);
+
+        /* level 2 */
+        const Vertex b0 = lerp(a0,a1,t);
+        const Vertex b1 = lerp(a1,a2,t);
+
+        /* level 3 yields the position; the derivative is 3x the level-2 difference */
+        const Vertex c0 = lerp(b0,b1,t);
+
+        p = c0;
+        dp = Vertex(3.0f)*(b1-b0);
+      }
+
+#if 0
+      /* disabled variant: position via repeated linear interpolation */
+      __forceinline Vertex eval(float t) const
+      {
+        const Vertex a0 = lerp(v0,v1,t);
+        const Vertex a1 = lerp(v1,v2,t);
+        const Vertex a2 = lerp(v2,v3,t);
+
+        const Vertex b0 = lerp(a0,a1,t);
+        const Vertex b1 = lerp(a1,a2,t);
+
+        return lerp(b0,b1,t);
+      }
+#else
+      /*! Returns the curve position at t as the sum of the four control
+       *  points weighted by BezierBasis::eval(t). */
+      __forceinline Vertex eval(const float t) const 
+      {
+        const Vec4<float> weights = BezierBasis::eval(t);
+        return madd(weights.x,v0,
+                    madd(weights.y,v1,
+                         madd(weights.z,v2,
+                              weights.w*v3)));
+      }
+#endif
+      
+      /*! Returns the first derivative at t: the control point differences are
+       *  interpolated twice and the result is multiplied by 3. */
+      __forceinline Vertex eval_dt(float t) const
+      {
+        /* differences of consecutive control points */
+        const Vertex d0 = v1-v0;
+        const Vertex d1 = v2-v1;
+        const Vertex d2 = v3-v2;
+
+        /* two levels of linear interpolation on the differences */
+        const Vertex e0 = lerp(d0,d1,t);
+        const Vertex e1 = lerp(d1,d2,t);
+        const Vertex f0 = lerp(e0,e1,t);
+
+        return Vertex(3.0f)*f0;
+      }
+
+      /*! Returns the sum of the four control points weighted by
+       *  BezierBasis::derivative(t). */
+      __forceinline Vertex eval_du(const float t) const
+      {
+        const Vec4<float> weights = BezierBasis::derivative(t);
+        return madd(weights.x,v0,
+                    madd(weights.y,v1,
+                         madd(weights.z,v2,
+                              weights.w*v3)));
+      }
+
+      /*! Returns the sum of the four control points weighted by
+       *  BezierBasis::derivative2(t). */
+      __forceinline Vertex eval_dudu(const float t) const 
+      {
+        const Vec4<float> weights = BezierBasis::derivative2(t);
+        return madd(weights.x,v0,
+                    madd(weights.y,v1,
+                         madd(weights.z,v2,
+                              weights.w*v3)));
+      }
+      
+      /*! SIMD variant of eval: computes position p and first derivative dp for
+       *  all parameter values in t at once. The control points are first
+       *  converted to Vec2vfx. */
+      __forceinline void evalN(const vfloatx& t, Vec2vfx& p, Vec2vfx& dp) const
+      {
+        /* control points converted to the SIMD vector type */
+        const Vec2vfx c0 = v0;
+        const Vec2vfx c1 = v1;
+        const Vec2vfx c2 = v2;
+        const Vec2vfx c3 = v3;
+
+        /* level 1 */
+        const Vec2vfx a0 = lerp(c0,c1,t);
+        const Vec2vfx a1 = lerp(c1,c2,t);
+        const Vec2vfx a2 = lerp(c2,c3,t);
+
+        /* level 2 */
+        const Vec2vfx b0 = lerp(a0,a1,t);
+        const Vec2vfx b1 = lerp(a1,a2,t);
+
+        /* level 3 yields the position; the derivative is 3x the level-2 difference */
+        p = lerp(b0,b1,t);
+        dp = vfloatx(3.0f)*(b1-b0);
+      }
+
+      /*! Computes position p, first derivative dp and second derivative ddp at
+       *  parameter t. p and dp come from repeated linear interpolation, ddp is
+       *  taken from eval_dudu(t). */
+      __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const
+      {
+        /* level 1 */
+        const Vertex a0 = lerp(v0,v1,t);
+        const Vertex a1 = lerp(v1,v2,t);
+        const Vertex a2 = lerp(v2,v3,t);
+
+        /* level 2 */
+        const Vertex b0 = lerp(a0,a1,t);
+        const Vertex b1 = lerp(a1,a2,t);
+
+        /* level 3 */
+        const Vertex c0 = lerp(b0,b1,t);
+
+        p = c0;
+        dp = 3.0f*(b1-b0);
+        ddp = eval_dudu(t);
+      }
+      
+      /*! Returns the part of the curve over the parameter interval u1 as a new
+       *  cubic curve. End points and derivatives are evaluated at both interval
+       *  borders; the inner control points are offset from the end points by
+       *  one third of the interval-scaled derivative. */
+      __forceinline CubicBezierCurve clip(const Interval1f& u1) const
+      {
+        Vertex pLo,dLo;
+        eval(u1.lower,pLo,dLo);
+
+        Vertex pHi,dHi;
+        eval(u1.upper,pHi,dHi);
+
+        /* one third of the interval length */
+        const float third = (u1.upper-u1.lower)*(1.0f/3.0f);
+
+        return CubicBezierCurve(pLo,pLo+third*dLo,pHi-third*dHi,pHi);
+      }
+      
+      /*! Returns the quadratic curve whose control points are three times the
+       *  differences of consecutive control points of this curve. */
+      __forceinline QuadraticBezierCurve<Vertex> derivative() const
+      {
+        const Vertex d01 = 3.0f*(v1-v0);
+        const Vertex d12 = 3.0f*(v2-v1);
+        const Vertex d23 = 3.0f*(v3-v2);
+        return QuadraticBezierCurve<Vertex>(d01,d12,d23);
+      }
+      
+      /*! Bounds the derivative over the parameter interval u1. The curve is
+       *  first restricted to u1 (same construction as clip), then the three
+       *  control points of the derivative of that restricted curve are merged
+       *  into one box. */
+      __forceinline BBox<Vertex> derivative_bounds(const Interval1f& u1) const
+      {
+        Vertex pLo,dLo;
+        eval(u1.lower,pLo,dLo);
+
+        Vertex pHi,dHi;
+        eval(u1.upper,pHi,dHi);
+
+        const float len   = u1.upper-u1.lower;
+        const float third = len*(1.0f/3.0f);
+
+        /* inner control points of the restricted curve */
+        const Vertex inner0 = pLo+third*dLo;
+        const Vertex inner1 = pHi-third*dHi;
+
+        /* control points of the derivative of the restricted curve */
+        const Vertex q0 = len*dLo;
+        const Vertex q1 = 3.0f*(inner1-inner0);
+        const Vertex q2 = len*dHi;
+
+        return merge(BBox<Vertex>(q0),BBox<Vertex>(q1),BBox<Vertex>(q2));
+      }
+      
+      /*! Evaluates the curve for M parameter values at once: the control
+       *  points, converted to Vec4vf<M>, are summed with the weights from
+       *  BezierBasis::eval(t). */
+      template<int M>
+      __forceinline Vec4vf<M> veval(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> w = BezierBasis::eval(t);
+        const Vec4vf<M> sum23  = madd(w.z, Vec4vf<M>(v2), w.w * Vec4vf<M>(v3));
+        const Vec4vf<M> sum123 = madd(w.y, Vec4vf<M>(v1), sum23);
+        return madd(w.x, Vec4vf<M>(v0), sum123);
+      }
+
+      /*! Same weighted sum as veval, but with the weights from
+       *  BezierBasis::derivative(t), for M parameter values at once. */
+      template<int M>
+      __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> w = BezierBasis::derivative(t);
+        const Vec4vf<M> sum23  = madd(w.z, Vec4vf<M>(v2), w.w * Vec4vf<M>(v3));
+        const Vec4vf<M> sum123 = madd(w.y, Vec4vf<M>(v1), sum23);
+        return madd(w.x, Vec4vf<M>(v0), sum123);
+      }
+
+      /*! Same weighted sum as veval, but with the weights from
+       *  BezierBasis::derivative2(t), for M parameter values at once. */
+      template<int M>
+      __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> w = BezierBasis::derivative2(t);
+        const Vec4vf<M> sum23  = madd(w.z, Vec4vf<M>(v2), w.w * Vec4vf<M>(v3));
+        const Vec4vf<M> sum123 = madd(w.y, Vec4vf<M>(v1), sum23);
+        return madd(w.x, Vec4vf<M>(v0), sum123);
+      }
+      
+      /*! Computes position p and first derivative dp for M parameter values at
+       *  once, using three levels of linear interpolation on the control
+       *  points converted to Vec4vf<M>. */
+      template<int M>
+      __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const
+      {
+        /* control points converted to the SIMD vector type */
+        const Vec4vf<M> c0 = v0;
+        const Vec4vf<M> c1 = v1;
+        const Vec4vf<M> c2 = v2;
+        const Vec4vf<M> c3 = v3;
+
+        /* level 1 */
+        const Vec4vf<M> a0 = lerp(c0,c1,t);
+        const Vec4vf<M> a1 = lerp(c1,c2,t);
+        const Vec4vf<M> a2 = lerp(c2,c3,t);
+
+        /* level 2 */
+        const Vec4vf<M> b0 = lerp(a0,a1,t);
+        const Vec4vf<M> b1 = lerp(a1,a2,t);
+
+        /* level 3 yields the position; the derivative is 3x the level-2 difference */
+        p = lerp(b0,b1,t);
+        dp = vfloat<M>(3.0f)*(b1-b0);
+      }
+      
+      /*! Weighted sum of the control points using M consecutive coefficients
+       *  loaded from the bezier_basis0 tables c0..c3 at [size][ofs].
+       *  Requires size <= PrecomputedBezierBasis::N and ofs <= size. */
+      template<int M, typename Vec = Vec4vf<M>>
+      __forceinline Vec eval0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBezierBasis::N);
+        assert(ofs <= size);
+
+        /* unaligned loads of the per-control-point coefficients */
+        const vfloat<M> w0 = vfloat<M>::loadu(&bezier_basis0.c0[size][ofs]);
+        const vfloat<M> w1 = vfloat<M>::loadu(&bezier_basis0.c1[size][ofs]);
+        const vfloat<M> w2 = vfloat<M>::loadu(&bezier_basis0.c2[size][ofs]);
+        const vfloat<M> w3 = vfloat<M>::loadu(&bezier_basis0.c3[size][ofs]);
+
+        return madd(w0, Vec(v0), madd(w1, Vec(v1), madd(w2, Vec(v2), w3 * Vec(v3))));
+      }
+      
+      /*! Weighted sum of the control points using M consecutive coefficients
+       *  loaded from the bezier_basis1 tables c0..c3 at [size][ofs].
+       *  Requires size <= PrecomputedBezierBasis::N and ofs <= size. */
+      template<int M, typename Vec = Vec4vf<M>>
+      __forceinline Vec eval1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBezierBasis::N);
+        assert(ofs <= size);
+
+        /* unaligned loads of the per-control-point coefficients */
+        const vfloat<M> w0 = vfloat<M>::loadu(&bezier_basis1.c0[size][ofs]);
+        const vfloat<M> w1 = vfloat<M>::loadu(&bezier_basis1.c1[size][ofs]);
+        const vfloat<M> w2 = vfloat<M>::loadu(&bezier_basis1.c2[size][ofs]);
+        const vfloat<M> w3 = vfloat<M>::loadu(&bezier_basis1.c3[size][ofs]);
+
+        return madd(w0, Vec(v0), madd(w1, Vec(v1), madd(w2, Vec(v2), w3 * Vec(v3))));
+      }
+      
+      /*! Weighted sum of the control points using M consecutive coefficients
+       *  loaded from the bezier_basis0 tables d0..d3 at [size][ofs].
+       *  Requires size <= PrecomputedBezierBasis::N and ofs <= size. */
+      template<int M, typename Vec = Vec4vf<M>>
+      __forceinline Vec derivative0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBezierBasis::N);
+        assert(ofs <= size);
+
+        /* unaligned loads of the per-control-point coefficients */
+        const vfloat<M> w0 = vfloat<M>::loadu(&bezier_basis0.d0[size][ofs]);
+        const vfloat<M> w1 = vfloat<M>::loadu(&bezier_basis0.d1[size][ofs]);
+        const vfloat<M> w2 = vfloat<M>::loadu(&bezier_basis0.d2[size][ofs]);
+        const vfloat<M> w3 = vfloat<M>::loadu(&bezier_basis0.d3[size][ofs]);
+
+        return madd(w0, Vec(v0), madd(w1, Vec(v1), madd(w2, Vec(v2), w3 * Vec(v3))));
+      }
+      
+      /*! Weighted sum of the control points using M consecutive coefficients
+       *  loaded from the bezier_basis1 tables d0..d3 at [size][ofs].
+       *  Requires size <= PrecomputedBezierBasis::N and ofs <= size. */
+      template<int M, typename Vec = Vec4vf<M>>
+      __forceinline Vec derivative1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBezierBasis::N);
+        assert(ofs <= size);
+
+        /* unaligned loads of the per-control-point coefficients */
+        const vfloat<M> w0 = vfloat<M>::loadu(&bezier_basis1.d0[size][ofs]);
+        const vfloat<M> w1 = vfloat<M>::loadu(&bezier_basis1.d1[size][ofs]);
+        const vfloat<M> w2 = vfloat<M>::loadu(&bezier_basis1.d2[size][ofs]);
+        const vfloat<M> w3 = vfloat<M>::loadu(&bezier_basis1.d3[size][ofs]);
+
+        return madd(w0, Vec(v0), madd(w1, Vec(v1), madd(w2, Vec(v2), w3 * Vec(v3))));
+      }
+
+      /* calculates bounds of bezier curve geometry: the curve is sampled at
+       * indices 0..7 of the precomputed basis, and each sample is bounded
+       * together with the sample offset by -scale*derivative and
+       * +scale*derivative (the offset is dropped at the first and last index
+       * respectively) */
+      __forceinline BBox3fa accurateBounds() const
+      {
+        const int segments = 7;
+        const float scale = 1.0f/(3.0f*(segments-1));
+        const vintx firstIdx = vintx(0);
+        const vintx lastIdx  = vintx(segments);
+
+        Vec3vfx lo(pos_inf), hi(neg_inf);
+        for (int base=0; base<=segments; base+=VSIZEX)
+        {
+          /* per-lane sample index; lanes beyond the last index are masked out */
+          const vintx idx = vintx(base)+vintx(step);
+          const vboolx active = idx <= lastIdx;
+
+          const Vec3vfx p  = eval0<VSIZEX,Vec3vf<VSIZEX>>(base,segments);
+          const Vec3vfx dp = derivative0<VSIZEX,Vec3vf<VSIZEX>>(base,segments);
+
+          /* derivative is zeroed where there is no neighbour on that side */
+          const Vec3vfx dpBack = select(idx!=firstIdx,dp,Vec3vfx(zero));
+          const Vec3vfx dpFwd  = select(idx!=lastIdx,dp,Vec3vfx(zero));
+          const Vec3vfx pm = p-Vec3vfx(scale)*dpBack;
+          const Vec3vfx pp = p+Vec3vfx(scale)*dpFwd;
+
+          lo = select(active,min(lo,p,pm,pp),lo); // FIXME: use masked min
+          hi = select(active,max(hi,p,pm,pp),hi); // FIXME: use masked max
+        }
+
+        /* horizontal reduction over the SIMD lanes */
+        const Vec3fa lower(reduce_min(lo.x),reduce_min(lo.y),reduce_min(lo.z));
+        const Vec3fa upper(reduce_max(hi.x),reduce_max(hi.y),reduce_max(hi.z));
+        return BBox3fa(lower,upper);
+      }
+      
+      /* calculates bounds of bezier curve geometry including its radius: the
+       * x/y/z bounds are gathered like in accurateBounds, then the box is
+       * enlarged by the largest absolute value found in the w component */
+      __forceinline BBox3fa accurateRoundBounds() const
+      {
+        const int segments = 7;
+        const float scale = 1.0f/(3.0f*(segments-1));
+        const vintx firstIdx = vintx(0);
+        const vintx lastIdx  = vintx(segments);
+
+        Vec4vfx lo(pos_inf), hi(neg_inf);
+        for (int base=0; base<=segments; base+=VSIZEX)
+        {
+          /* per-lane sample index; lanes beyond the last index are masked out */
+          const vintx idx = vintx(base)+vintx(step);
+          const vboolx active = idx <= lastIdx;
+
+          const Vec4vfx p  = eval0<VSIZEX>(base,segments);
+          const Vec4vfx dp = derivative0<VSIZEX>(base,segments);
+
+          /* derivative is zeroed where there is no neighbour on that side */
+          const Vec4vfx dpBack = select(idx!=firstIdx,dp,Vec4vfx(zero));
+          const Vec4vfx dpFwd  = select(idx!=lastIdx,dp,Vec4vfx(zero));
+          const Vec4vfx pm = p-Vec4vfx(scale)*dpBack;
+          const Vec4vfx pp = p+Vec4vfx(scale)*dpFwd;
+
+          lo = select(active,min(lo,p,pm,pp),lo); // FIXME: use masked min
+          hi = select(active,max(hi,p,pm,pp),hi); // FIXME: use masked max
+        }
+
+        /* horizontal reduction over the SIMD lanes */
+        const Vec3fa lower(reduce_min(lo.x),reduce_min(lo.y),reduce_min(lo.z));
+        const Vec3fa upper(reduce_max(hi.x),reduce_max(hi.y),reduce_max(hi.z));
+
+        /* radius bound: largest magnitude of the w range */
+        const float wMin = reduce_min(lo.w);
+        const float wMax = reduce_max(hi.w);
+        const Vec3fa radius = Vec3fa(max(abs(wMin),abs(wMax)));
+        return enlarge(BBox3fa(lower,upper),radius);
+      }
+      
+      /* calculates bounds when tessellated into N line segments: samples
+       * 0..N-1 come from the precomputed basis, the last control point v3 is
+       * merged in separately, and the box is enlarged by the largest absolute
+       * w value seen */
+      __forceinline BBox3fa accurateFlatBounds(int N) const
+      {
+        /* dedicated path for N == 4: a single 4-wide evaluation */
+        if (likely(N == 4))
+        {
+          const Vec4vf4 pts = eval0<4>(0,4);
+          const Vec3fa lower(reduce_min(pts.x),reduce_min(pts.y),reduce_min(pts.z));
+          const Vec3fa upper(reduce_max(pts.x),reduce_max(pts.y),reduce_max(pts.z));
+          const Vec3fa upper_r = Vec3fa(reduce_max(abs(pts.w)));
+          return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w))));
+        }
+
+        /* general path: walk the samples VSIZEX at a time */
+        Vec3vfx lo(pos_inf), hi(neg_inf);
+        vfloatx radius(0.0f);
+        for (int base=0; base<N; base+=VSIZEX)
+        {
+          /* lanes with index >= N are masked out */
+          const vboolx active = vintx(base)+vintx(step) < vintx(N);
+          const Vec4vfx pts = eval0<VSIZEX>(base,N);
+
+          lo.x = select(active,min(lo.x,pts.x),lo.x); // FIXME: use masked min
+          lo.y = select(active,min(lo.y,pts.y),lo.y);
+          lo.z = select(active,min(lo.z,pts.z),lo.z);
+
+          hi.x = select(active,max(hi.x,pts.x),hi.x); // FIXME: use masked max
+          hi.y = select(active,max(hi.y,pts.y),hi.y);
+          hi.z = select(active,max(hi.z,pts.z),hi.z);
+
+          radius = select(active,max(radius,abs(pts.w)),radius);
+        }
+
+        /* horizontal reduction over the SIMD lanes */
+        const Vec3fa lower(reduce_min(lo.x),reduce_min(lo.y),reduce_min(lo.z));
+        const Vec3fa upper(reduce_max(hi.x),reduce_max(hi.y),reduce_max(hi.z));
+        const Vec3fa upper_r(reduce_max(radius));
+        return enlarge(BBox3fa(min(lower,v3),max(upper,v3)),max(upper_r,Vec3fa(abs(v3.w))));
+      }
+      
+      /*! Prints the four control points in the form
+       *  "CubicBezierCurve { v0 = ..., v1 = ..., v2 = ..., v3 = ... }". */
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const CubicBezierCurve& curve)
+      {
+        cout << "CubicBezierCurve { v0 = " << curve.v0;
+        cout << ", v1 = " << curve.v1;
+        cout << ", v2 = " << curve.v2;
+        cout << ", v3 = " << curve.v3;
+        return cout << " }";
+      }
+    };
+
+#if defined(__AVX__)
+  /*! AVX specialization of clip for vfloat4 control points: the lower and
+   *  upper interval borders are evaluated together in one 8-wide vector
+   *  (u1.lower in the first half, u1.upper in the second). */
+  template<>
+    __forceinline CubicBezierCurve<vfloat4> CubicBezierCurve<vfloat4>::clip(const Interval1f& u1) const
+  {
+    /* control points widened to 8 lanes */
+    const vfloat8 c0 = vfloat8(v0);
+    const vfloat8 c1 = vfloat8(v1);
+    const vfloat8 c2 = vfloat8(v2);
+    const vfloat8 c3 = vfloat8(v3);
+
+    /* both parameter values packed into one vector */
+    const vfloat8 t(vfloat4(u1.lower),vfloat4(u1.upper));
+
+    /* three levels of linear interpolation */
+    const vfloat8 a0 = lerp(c0,c1,t);
+    const vfloat8 a1 = lerp(c1,c2,t);
+    const vfloat8 a2 = lerp(c2,c3,t);
+    const vfloat8 b0 = lerp(a0,a1,t);
+    const vfloat8 b1 = lerp(a1,a2,t);
+    const vfloat8 pos   = lerp(b0,b1,t);
+    const vfloat8 deriv = vfloat8(3.0f)*(b1-b0);
+
+    /* split the packed results back into the two borders */
+    const vfloat4 pLo = extract4<0>(pos);
+    const vfloat4 pHi = extract4<1>(pos);
+    const vfloat4 dLo = extract4<0>(deriv);
+    const vfloat4 dHi = extract4<1>(deriv);
+
+    /* one third of the interval length */
+    const float third = (u1.upper-u1.lower)*(1.0f/3.0f);
+    return CubicBezierCurve(pLo,pLo+third*dLo,pHi-third*dHi,pHi);
+  }
+#endif
+  
+  /* BezierCurveT<Vertex> is another name for CubicBezierCurve<Vertex> */
+  template<typename Vertex> using BezierCurveT = CubicBezierCurve<Vertex>;
+  
+  /* short names for the curve over specific vertex types */
+  typedef CubicBezierCurve<float> CubicBezierCurve1f;
+  typedef CubicBezierCurve<Vec2fa> CubicBezierCurve2fa;
+  typedef CubicBezierCurve<Vec3fa> CubicBezierCurve3fa;
+  typedef CubicBezierCurve<Vec3fa> BezierCurve3fa; // same type as CubicBezierCurve3fa
+  
+  /*! Upper estimate of the number of roots of a scalar curve: each pair of
+   *  consecutive control values contributes one when the "<= 0" test differs
+   *  between them or when the first value of the pair is within 1E-4 of zero;
+   *  the last pair also counts when the final value is within 1E-4 of zero. */
+  template<> __forceinline int CubicBezierCurve<float>::maxRoots() const
+  {
+    const float eps = 1E-4f;
+
+    /* "<= 0" classification of every control value */
+    const bool nonPos0 = v0 <= 0.0f;
+    const bool nonPos1 = v1 <= 0.0f;
+    const bool nonPos2 = v2 <= 0.0f;
+    const bool nonPos3 = v3 <= 0.0f;
+
+    /* control values that are close to zero */
+    const bool nearZero0 = fabs(v0) < eps;
+    const bool nearZero1 = fabs(v1) < eps;
+    const bool nearZero2 = fabs(v2) < eps;
+    const bool nearZero3 = fabs(v3) < eps;
+
+    /* one contribution per pair of consecutive control values */
+    const int span01 = (nonPos0 != nonPos1 || nearZero0);
+    const int span12 = (nonPos1 != nonPos2 || nearZero1);
+    const int span23 = (nonPos2 != nonPos3 || nearZero2 || nearZero3);
+    return span01 + span12 + span23;
+  }
+  
+  /*! Interval variant: sums numRoots over the three pairs of consecutive
+   *  control intervals. */
+  template<> __forceinline int CubicBezierCurve<Interval1f>::maxRoots() const
+  {
+    const int roots01 = numRoots(v0,v1);
+    const int roots12 = numRoots(v1,v2);
+    const int roots23 = numRoots(v2,v3);
+    return roots01 + roots12 + roots23;
+  }
+
+  /*! Applies the per-vertex enlargeRadiusToMinWidth overload to each of the
+   *  four control points and returns the curve built from the results. */
+  __forceinline CubicBezierCurve<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CubicBezierCurve<Vec3ff>& curve)
+  {
+    const auto w0 = enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0);
+    const auto w1 = enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1);
+    const auto w2 = enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2);
+    const auto w3 = enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3);
+    return CubicBezierCurve<Vec3ff>(w0,w1,w2,w3);
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bezier_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/bezier_patch.h
new file mode 100644
index 0000000000..d87ed41ccb
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/bezier_patch.h
@@ -0,0 +1,372 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bezier_curve.h"
+
+namespace embree
+{  
+  /*! Evaluates a cubic given by v0..v3 at parameter uu with three levels of
+   *  linear interpolation and returns the resulting point. */
+  template<class T, class S>
+    static __forceinline T deCasteljau(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3)
+  {
+    /* level 1 */
+    const T a0 = lerp(v0,v1,uu);
+    const T a1 = lerp(v1,v2,uu);
+    const T a2 = lerp(v2,v3,uu);
+
+    /* level 2 */
+    const T b0 = lerp(a0,a1,uu);
+    const T b1 = lerp(a1,a2,uu);
+
+    /* level 3 */
+    const T c0 = lerp(b0,b1,uu);
+    return c0;
+  }
+  
+  /*! Returns three times the difference of the two second-level
+   *  interpolation points of the cubic v0..v3 at parameter uu. */
+  template<class T, class S>
+    static __forceinline T deCasteljau_tangent(const S& uu, const T& v0, const T& v1, const T& v2, const T& v3)
+  {
+    /* level 1 */
+    const T a0 = lerp(v0,v1,uu);
+    const T a1 = lerp(v1,v2,uu);
+    const T a2 = lerp(v2,v3,uu);
+
+    /* level 2 */
+    const T b0 = lerp(a0,a1,uu);
+    const T b1 = lerp(a1,a2,uu);
+
+    return S(3.0f)*(b1-b0);
+  }
+
+  /*! Weighted average around v[y][x]: weight 16 for the point itself, 4 for
+   *  each of its four direct neighbours, 1 for each of its four diagonal
+   *  neighbours, divided by 36. Reads rows y-1..y+1 and columns x-1..x+1. */
+  template<typename Vertex>
+    __forceinline Vertex computeInnerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x)
+  {
+    const float weight = 1.0f / 36.0f;
+    const auto directSum   = v[y-1][x] +  v[y+1][x] + v[y][x-1] + v[y][x+1];
+    const auto diagonalSum = v[y-1][x-1] + v[y+1][x+1] + v[y-1][x+1] + v[y+1][x-1];
+    return weight * (16.0f * v[y][x] + 4.0f * directSum + diagonalSum);
+  }
+  
+  /*! Weighted average around v[y][x] using row y and row y-1: weight 8 for
+   *  the point, 4 for v[y-1][x], 2 for each of the left/right neighbours in
+   *  row y, 1 for each of the two diagonal neighbours in row y-1; divided by 18. */
+  template<typename Vertex>
+    __forceinline Vertex computeTopEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x)
+  {
+    const float weight = 1.0f / 18.0f;
+    const auto sidePair     = v[y][x-1] + v[y][x+1];
+    const auto diagonalPair = v[y-1][x-1] + v[y-1][x+1];
+    return weight * (8.0f * v[y][x] + 4.0f * v[y-1][x] + 2.0f * sidePair + diagonalPair);
+  }
+
+  /*! Weighted average around v[y][x] using row y and row y+1: weight 8 for
+   *  the point, 4 for v[y+1][x], 2 for each of the left/right neighbours in
+   *  row y, 1 for each of the two diagonal neighbours in row y+1; divided by 18. */
+  template<typename Vertex>
+    __forceinline Vertex computeBottomEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x)
+  {
+    const float weight = 1.0f / 18.0f;
+    const auto sidePair = v[y][x-1] + v[y][x+1];
+    /* the two diagonal terms are added one after the other, as in the original summation */
+    return weight * (8.0f * v[y][x] + 4.0f * v[y+1][x] + 2.0f * sidePair + v[y+1][x-1] + v[y+1][x+1]);
+  }
+  
+  /*! Weighted average around v[y][x] using column x and column x-1: weight 8
+   *  for the point, 4 for v[y][x-1], 2 for each of the upper/lower neighbours
+   *  in column x, 1 for each of the two diagonal neighbours in column x-1;
+   *  divided by 18. */
+  template<typename Vertex>
+    __forceinline Vertex computeLeftEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x)
+  {
+    const float weight = 1.0f / 18.0f;
+    const auto verticalPair = v[y-1][x] + v[y+1][x];
+    /* the two diagonal terms are added one after the other, as in the original summation */
+    return weight * (8.0f * v[y][x] + 4.0f * v[y][x-1] + 2.0f * verticalPair + v[y-1][x-1] + v[y+1][x-1]);
+  }
+  
+  /*! Weighted average around v[y][x] using column x and column x+1: weight 8
+   *  for the point, 4 for v[y][x+1], 2 for each of the upper/lower neighbours
+   *  in column x, 1 for each of the two diagonal neighbours in column x+1;
+   *  divided by 18. */
+  template<typename Vertex>
+    __forceinline Vertex computeRightEdgeBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x)
+  {
+    const float weight = 1.0f / 18.0f;
+    const auto verticalPair = v[y-1][x] + v[y+1][x];
+    /* the two diagonal terms are added one after the other, as in the original summation */
+    return weight * (8.0f * v[y][x] + 4.0f * v[y][x+1] + 2.0f * verticalPair + v[y-1][x+1] + v[y+1][x+1]);
+  }
+  
+  /*! Weighted average of v[y][x] and the three neighbours reached by
+   *  stepping delta_y rows and/or delta_x columns: weight 4 for the point,
+   *  2 for each single-step neighbour, 1 for the diagonal one; divided by 9. */
+  template<typename Vertex>
+    __forceinline Vertex computeCornerBezierControlPoint(const Vertex v[4][4], const size_t y, const size_t x, const ssize_t delta_y, const ssize_t delta_x)
+  {
+    const float weight = 1.0f / 9.0f;
+    const auto stepPair = v[y+delta_y][x] + v[y][x+delta_x];
+    return weight * (4.0f * v[y][x] + 2.0f * stepPair + v[y+delta_y][x+delta_x]);
+  }
+
+  /*! Bicubic Bezier patch stored as a 4x4 grid of control points, with
+   *  static and member helpers that evaluate position, first and second
+   *  derivatives and normals, both for scalar and for SIMD parameters.
+   *  Throughout this class the u basis weights are applied along the second
+   *  index of matrix and the v basis weights along the first index. */
+  template<typename Vertex, typename Vertex_t>
+    class __aligned(64) BezierPatchT
+  {
+   public:
+      /* control points of the patch */
+      Vertex matrix[4][4];
+    
+  public:
+
+    /*! default constructor; does not assign any control point */
+    __forceinline BezierPatchT() {}
+
+    /* the following three constructors are only declared here; their
+     * definitions are not part of this header section */
+    __forceinline BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride);
+
+    __forceinline BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch);
+
+    __forceinline BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch,
+                               const BezierCurveT<Vertex>* border0,
+                               const BezierCurveT<Vertex>* border1,
+                               const BezierCurveT<Vertex>* border2,
+                               const BezierCurveT<Vertex>* border3);
+                               
+    /*! Builds the 16 control points from the 4x4 control grid source.v of a
+     *  B-spline patch using the compute*BezierControlPoint helpers.
+     *  Note: the results of computeInnerBezierControlPoint are stored in the
+     *  four corner entries of matrix, and the results of
+     *  computeCornerBezierControlPoint in the four interior entries. */
+    __forceinline BezierPatchT(const BSplinePatchT<Vertex,Vertex_t>& source)
+    {
+      /* compute inner bezier control points */
+      matrix[0][0] = computeInnerBezierControlPoint(source.v,1,1);
+      matrix[0][3] = computeInnerBezierControlPoint(source.v,1,2);
+      matrix[3][3] = computeInnerBezierControlPoint(source.v,2,2);
+      matrix[3][0] = computeInnerBezierControlPoint(source.v,2,1);
+      
+      /* compute top edge control points */
+      matrix[0][1] = computeRightEdgeBezierControlPoint(source.v,1,1);
+      matrix[0][2] = computeLeftEdgeBezierControlPoint(source.v,1,2); 
+      
+      /* compute bottom edge control points */
+      matrix[3][1] = computeRightEdgeBezierControlPoint(source.v,2,1);
+      matrix[3][2] = computeLeftEdgeBezierControlPoint(source.v,2,2);
+      
+      /* compute left edge control points */
+      matrix[1][0] = computeBottomEdgeBezierControlPoint(source.v,1,1);
+      matrix[2][0] = computeTopEdgeBezierControlPoint(source.v,2,1);
+      
+      /* compute right edge control points */
+      matrix[1][3] = computeBottomEdgeBezierControlPoint(source.v,1,2);
+      matrix[2][3] = computeTopEdgeBezierControlPoint(source.v,2,2);
+      
+      /* compute corner control points */
+      matrix[1][1] = computeCornerBezierControlPoint(source.v,1,1, 1, 1);
+      matrix[1][2] = computeCornerBezierControlPoint(source.v,1,2, 1,-1);
+      matrix[2][2] = computeCornerBezierControlPoint(source.v,2,2,-1,-1);
+      matrix[2][1] = computeCornerBezierControlPoint(source.v,2,1,-1, 1);      
+    }
+
+    /*! Combines the control points with two weight vectors: Bu weights the
+     *  four entries of each row, Bv then weights the four row results. */
+    static __forceinline Vertex_t bilinear(const Vec4f Bu, const Vertex matrix[4][4], const Vec4f Bv)
+    {
+      const Vertex_t M0 = madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))); 
+      const Vertex_t M1 = madd(Bu.x,matrix[1][0],madd(Bu.y,matrix[1][1],madd(Bu.z,matrix[1][2],Bu.w * matrix[1][3])));
+      const Vertex_t M2 = madd(Bu.x,matrix[2][0],madd(Bu.y,matrix[2][1],madd(Bu.z,matrix[2][2],Bu.w * matrix[2][3])));
+      const Vertex_t M3 = madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])));
+      return madd(Bv.x,M0,madd(Bv.y,M1,madd(Bv.z,M2,Bv.w*M3)));
+    }
+
+    /*! position at (uu,vv): basis values in both directions */
+    static __forceinline Vertex_t eval(const Vertex matrix[4][4], const float uu, const float vv) 
+    {      
+      const Vec4f Bu = BezierBasis::eval(uu);
+      const Vec4f Bv = BezierBasis::eval(vv);
+      return bilinear(Bu,matrix,Bv);
+    }
+
+    /*! first derivative in u: basis derivative in u, basis value in v */
+    static __forceinline Vertex_t eval_du(const Vertex matrix[4][4], const float uu, const float vv) 
+    {
+      const Vec4f Bu = BezierBasis::derivative(uu);
+      const Vec4f Bv = BezierBasis::eval(vv);
+      return bilinear(Bu,matrix,Bv);
+    }
+
+    /*! first derivative in v: basis value in u, basis derivative in v */
+    static __forceinline Vertex_t eval_dv(const Vertex matrix[4][4], const float uu, const float vv) 
+    {
+      const Vec4f Bu = BezierBasis::eval(uu);
+      const Vec4f Bv = BezierBasis::derivative(vv);
+      return bilinear(Bu,matrix,Bv);
+    }
+
+    /*! second derivative in u: basis second derivative in u, basis value in v */
+    static __forceinline Vertex_t eval_dudu(const Vertex matrix[4][4], const float uu, const float vv) 
+    {
+      const Vec4f Bu = BezierBasis::derivative2(uu);
+      const Vec4f Bv = BezierBasis::eval(vv);
+      return bilinear(Bu,matrix,Bv);
+    }
+
+    /*! second derivative in v: basis value in u, basis second derivative in v */
+    static __forceinline Vertex_t eval_dvdv(const Vertex matrix[4][4], const float uu, const float vv) 
+    {
+      const Vec4f Bu = BezierBasis::eval(uu);
+      const Vec4f Bv = BezierBasis::derivative2(vv);
+      return bilinear(Bu,matrix,Bv);
+    }
+
+    /*! mixed derivative: basis derivative in both u and v */
+    static __forceinline Vertex_t eval_dudv(const Vertex matrix[4][4], const float uu, const float vv) 
+    {
+      const Vec4f Bu = BezierBasis::derivative(uu);
+      const Vec4f Bv = BezierBasis::derivative(vv);
+      return bilinear(Bu,matrix,Bv);
+    }
+
+    /*! cross product of the u and v derivatives at (uu,vv); not normalized here */
+    static __forceinline Vertex_t normal(const Vertex matrix[4][4], const float uu, const float vv) 
+    {
+      const Vertex_t dPdu = eval_du(matrix,uu,vv);
+      const Vertex_t dPdv = eval_dv(matrix,uu,vv);
+      return cross(dPdu,dPdv);
+    }
+
+    /*! same as the static normal, applied to this patch's control points
+     *  (note: unlike the other member wrappers this one is not const-qualified) */
+    __forceinline Vertex_t normal(const float uu, const float vv) 
+    {
+      const Vertex_t dPdu = eval_du(matrix,uu,vv);
+      const Vertex_t dPdv = eval_dv(matrix,uu,vv);
+      return cross(dPdu,dPdv);
+    }
+
+    /* member wrappers: forward to the static functions with this patch's control points */
+    __forceinline Vertex_t eval(const float uu, const float vv) const {
+      return eval(matrix,uu,vv);
+    }
+
+    __forceinline Vertex_t eval_du(const float uu, const float vv) const { 
+      return eval_du(matrix,uu,vv);
+    }
+
+    __forceinline Vertex_t eval_dv(const float uu, const float vv) const {
+      return eval_dv(matrix,uu,vv);
+    }
+
+    __forceinline Vertex_t eval_dudu(const float uu, const float vv) const { 
+      return eval_dudu(matrix,uu,vv);
+    }
+    
+    __forceinline Vertex_t eval_dvdv(const float uu, const float vv) const { 
+      return eval_dvdv(matrix,uu,vv);
+    }
+
+    __forceinline Vertex_t eval_dudv(const float uu, const float vv) const { 
+      return eval_dudv(matrix,uu,vv);
+    }
+
+    /*! Writes the requested quantities at (u,v) through the given pointers.
+     *  P is optional on its own. The first derivatives are written as a pair
+     *  whenever dPdu is non-null (dPdv is then only checked by assert), and
+     *  the three second derivatives as a group whenever ddPdudu is non-null.
+     *  First derivatives are multiplied by dscale, second derivatives by
+     *  sqr(dscale). */
+    __forceinline void eval(const float u, const float v, Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv, const float dscale = 1.0f) const
+    {
+      if (P) {
+        *P = eval(u,v); 
+      }
+      if (dPdu) {
+        assert(dPdu); *dPdu = eval_du(u,v)*dscale; 
+        assert(dPdv); *dPdv = eval_dv(u,v)*dscale; 
+      }
+      if (ddPdudu) {
+        assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); 
+        assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); 
+        assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); 
+      }
+    }
+
+    /*! Evaluates component i of the patch for SIMD parameters, given already
+     *  computed weight vectors: v_n weights the first index of matrix, u_n
+     *  the second. The uu and vv arguments are not used in the body. */
+    template<class vfloat>
+      __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n) const
+      {
+        const vfloat curve0_x = v_n[0] * vfloat(matrix[0][0][i]) + v_n[1] * vfloat(matrix[1][0][i]) + v_n[2] * vfloat(matrix[2][0][i]) + v_n[3] * vfloat(matrix[3][0][i]);
+        const vfloat curve1_x = v_n[0] * vfloat(matrix[0][1][i]) + v_n[1] * vfloat(matrix[1][1][i]) + v_n[2] * vfloat(matrix[2][1][i]) + v_n[3] * vfloat(matrix[3][1][i]);
+        const vfloat curve2_x = v_n[0] * vfloat(matrix[0][2][i]) + v_n[1] * vfloat(matrix[1][2][i]) + v_n[2] * vfloat(matrix[2][2][i]) + v_n[3] * vfloat(matrix[3][2][i]);
+        const vfloat curve3_x = v_n[0] * vfloat(matrix[0][3][i]) + v_n[1] * vfloat(matrix[1][3][i]) + v_n[2] * vfloat(matrix[2][3][i]) + v_n[3] * vfloat(matrix[3][3][i]);
+        return u_n[0] * curve0_x + u_n[1] * curve1_x + u_n[2] * curve2_x + u_n[3] * curve3_x;
+      }
+
+    /*! SIMD evaluation into float arrays. For each of the N components i the
+     *  result is stored, masked by valid, at <output>+i*dstride. Output
+     *  grouping matches the scalar pointer variant: P alone; dPdu together
+     *  with dPdv when dPdu is non-null; the three second derivatives when
+     *  ddPdudu is non-null. First derivatives are multiplied by dscale,
+     *  second derivatives by sqr(dscale). */
+    template<typename vbool, typename vfloat>
+      __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                              float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv,
+                              const float dscale, const size_t dstride, const size_t N) const
+      {
+        if (P) {
+          const Vec4<vfloat> u_n = BezierBasis::eval(uu); 
+          const Vec4<vfloat> v_n = BezierBasis::eval(vv); 
+          for (size_t i=0; i<N; i++) vfloat::store(valid,P+i*dstride,eval(i,uu,vv,u_n,v_n));
+        }
+        if (dPdu) 
+        {
+          {
+            assert(dPdu);
+            const Vec4<vfloat> u_n = BezierBasis::derivative(uu);
+            const Vec4<vfloat> v_n = BezierBasis::eval(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,dPdu+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale);
+          }
+          {
+            assert(dPdv);
+            const Vec4<vfloat> u_n = BezierBasis::eval(uu);
+            const Vec4<vfloat> v_n = BezierBasis::derivative(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,dPdv+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale);
+          }
+        }
+        if (ddPdudu) 
+        {
+          {
+            assert(ddPdudu);
+            const Vec4<vfloat> u_n = BezierBasis::derivative2(uu);
+            const Vec4<vfloat> v_n = BezierBasis::eval(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudu+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+          {
+            assert(ddPdvdv);
+            const Vec4<vfloat> u_n = BezierBasis::eval(uu);
+            const Vec4<vfloat> v_n = BezierBasis::derivative2(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdvdv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+          {
+            assert(ddPdudv);
+            const Vec4<vfloat> u_n = BezierBasis::derivative(uu);
+            const Vec4<vfloat> v_n = BezierBasis::derivative(vv); 
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+        }
+      }
+
+    /*! Position for parameters of generic type T, returned as Vec3<T>. The
+     *  cubic basis weights are computed inline from uu and vv, and the x, y
+     *  and z members of the control points are combined separately. */
+    template<typename T>
+      static __forceinline Vec3<T> eval(const Vertex matrix[4][4], const T& uu, const T& vv) 
+    {      
+      const T one_minus_uu = 1.0f - uu;
+      const T one_minus_vv = 1.0f - vv;      
+
+      /* cubic basis weights in u and v */
+      const T B0_u = one_minus_uu * one_minus_uu * one_minus_uu;
+      const T B0_v = one_minus_vv * one_minus_vv * one_minus_vv;
+      const T B1_u = 3.0f * (one_minus_uu * uu * one_minus_uu);
+      const T B1_v = 3.0f * (one_minus_vv * vv * one_minus_vv);
+      const T B2_u = 3.0f * (uu * one_minus_uu * uu);
+      const T B2_v = 3.0f * (vv * one_minus_vv * vv);
+      const T B3_u = uu * uu * uu;
+      const T B3_v = vv * vv * vv;
+      
+      /* per component: u weights inside each row, v weights across the rows */
+      const T x = 
+        madd(B0_v,madd(B0_u,matrix[0][0].x,madd(B1_u,matrix[0][1].x,madd(B2_u,matrix[0][2].x,B3_u*matrix[0][3].x))), 
+        madd(B1_v,madd(B0_u,matrix[1][0].x,madd(B1_u,matrix[1][1].x,madd(B2_u,matrix[1][2].x,B3_u*matrix[1][3].x))),
+        madd(B2_v,madd(B0_u,matrix[2][0].x,madd(B1_u,matrix[2][1].x,madd(B2_u,matrix[2][2].x,B3_u*matrix[2][3].x))),
+             B3_v*madd(B0_u,matrix[3][0].x,madd(B1_u,matrix[3][1].x,madd(B2_u,matrix[3][2].x,B3_u*matrix[3][3].x)))))); 
+
+      const T y = 
+        madd(B0_v,madd(B0_u,matrix[0][0].y,madd(B1_u,matrix[0][1].y,madd(B2_u,matrix[0][2].y,B3_u*matrix[0][3].y))), 
+        madd(B1_v,madd(B0_u,matrix[1][0].y,madd(B1_u,matrix[1][1].y,madd(B2_u,matrix[1][2].y,B3_u*matrix[1][3].y))),
+        madd(B2_v,madd(B0_u,matrix[2][0].y,madd(B1_u,matrix[2][1].y,madd(B2_u,matrix[2][2].y,B3_u*matrix[2][3].y))),
+             B3_v*madd(B0_u,matrix[3][0].y,madd(B1_u,matrix[3][1].y,madd(B2_u,matrix[3][2].y,B3_u*matrix[3][3].y)))))); 
+      
+      const T z = 
+        madd(B0_v,madd(B0_u,matrix[0][0].z,madd(B1_u,matrix[0][1].z,madd(B2_u,matrix[0][2].z,B3_u*matrix[0][3].z))), 
+        madd(B1_v,madd(B0_u,matrix[1][0].z,madd(B1_u,matrix[1][1].z,madd(B2_u,matrix[1][2].z,B3_u*matrix[1][3].z))),
+        madd(B2_v,madd(B0_u,matrix[2][0].z,madd(B1_u,matrix[2][1].z,madd(B2_u,matrix[2][2].z,B3_u*matrix[2][3].z))),
+             B3_v*madd(B0_u,matrix[3][0].z,madd(B1_u,matrix[3][1].z,madd(B2_u,matrix[3][2].z,B3_u*matrix[3][3].z)))))); 
+      
+      return Vec3<T>(x,y,z);
+    }
+
+    /*! member wrapper of the generic position evaluation */
+    template<typename vfloat>
+      __forceinline Vec3<vfloat> eval(const vfloat& uu, const vfloat& vv) const {     
+      return eval(matrix,uu,vv);
+    }
+
+    /*! Normal for parameters of generic type T: the control points are
+     *  converted to Vec3<T>, both tangents are computed with the deCasteljau
+     *  helpers and their cross product is returned (not normalized here). */
+    template<class T>
+      static __forceinline Vec3<T> normal(const Vertex matrix[4][4], const T& uu, const T& vv) 
+    {
+      
+      const Vec3<T> matrix_00 = Vec3<T>(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z);
+      const Vec3<T> matrix_01 = Vec3<T>(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z);
+      const Vec3<T> matrix_02 = Vec3<T>(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z);
+      const Vec3<T> matrix_03 = Vec3<T>(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z);
+
+      const Vec3<T> matrix_10 = Vec3<T>(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z);
+      const Vec3<T> matrix_11 = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z);
+      const Vec3<T> matrix_12 = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z);
+      const Vec3<T> matrix_13 = Vec3<T>(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z);
+
+      const Vec3<T> matrix_20 = Vec3<T>(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z);
+      const Vec3<T> matrix_21 = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z);
+      const Vec3<T> matrix_22 = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z);
+      const Vec3<T> matrix_23 = Vec3<T>(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z);
+
+      const Vec3<T> matrix_30 = Vec3<T>(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z);
+      const Vec3<T> matrix_31 = Vec3<T>(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z);
+      const Vec3<T> matrix_32 = Vec3<T>(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z);
+      const Vec3<T> matrix_33 = Vec3<T>(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z);
+            
+      /* tangentU: collapse each column at vv, then take the tangent of the resulting cubic at uu */
+      const Vec3<T> col0 = deCasteljau(vv, matrix_00, matrix_10, matrix_20, matrix_30);
+      const Vec3<T> col1 = deCasteljau(vv, matrix_01, matrix_11, matrix_21, matrix_31);
+      const Vec3<T> col2 = deCasteljau(vv, matrix_02, matrix_12, matrix_22, matrix_32);
+      const Vec3<T> col3 = deCasteljau(vv, matrix_03, matrix_13, matrix_23, matrix_33);
+      
+      const Vec3<T> tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3);
+      
+      /* tangentV: collapse each row at uu, then take the tangent of the resulting cubic at vv */
+      const Vec3<T> row0 = deCasteljau(uu, matrix_00, matrix_01, matrix_02, matrix_03);
+      const Vec3<T> row1 = deCasteljau(uu, matrix_10, matrix_11, matrix_12, matrix_13);
+      const Vec3<T> row2 = deCasteljau(uu, matrix_20, matrix_21, matrix_22, matrix_23);
+      const Vec3<T> row3 = deCasteljau(uu, matrix_30, matrix_31, matrix_32, matrix_33);
+      
+      const Vec3<T> tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3);
+      
+      /* normal = tangentU x tangentV */
+      const Vec3<T> n = cross(tangentU,tangentV);
+      return n;
+    }
+
+    /*! member wrapper of the generic normal evaluation */
+    template<typename vfloat>
+      __forceinline Vec3<vfloat> normal(const vfloat& uu, const vfloat& vv) const {     
+      return normal(matrix,uu,vv);
+    }
+  };
+
+  /* Bezier patch with Vec3fa control points and Vec3fa_t as evaluation result type */
+  typedef BezierPatchT<Vec3fa,Vec3fa_t> BezierPatch3fa;
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bilinear_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/bilinear_patch.h
new file mode 100644
index 0000000000..35748754bd
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/bilinear_patch.h
@@ -0,0 +1,191 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bezier_curve.h"
+
+namespace embree
+{
+  template<typename Vertex, typename Vertex_t = Vertex>
+    class __aligned(64) BilinearPatchT
+    { /* Patch spanned by four corner vertices and evaluated with nested lerps in (u,v). */
+      typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+      typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+      
+    public:
+      Vertex v[4]; // corners: eval() blends v[0]->v[1] and v[3]->v[2] along u, then those two results along v
+      
+    public:
+      /* Default constructor: does not assign the corners. */
+      __forceinline BilinearPatchT () {}
+
+      __forceinline BilinearPatchT (const HalfEdge* edge, const BufferView<Vertex>& vertices) {
+        init(edge,vertices.getPtr(),vertices.getStride());
+      }
+      
+      __forceinline BilinearPatchT (const HalfEdge* edge, const char* vertices, size_t stride) {
+        init(edge,vertices,stride);
+      }
+      /* Loads the start vertex of four consecutive half-edges; vertex k is read at byte offset k*stride. */
+      __forceinline void init (const HalfEdge* edge, const char* vertices, size_t stride)
+      {
+        v[0] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next();
+        v[1] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next();
+        v[2] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next();
+        v[3] = Vertex::loadu(vertices+edge->getStartVertexIndex()*stride); edge = edge->next(); // this last advance is not used afterwards
+      }
+      /* Takes the limit vertex of each of the four rings of a Catmull-Clark patch as corner. */
+      __forceinline BilinearPatchT (const CatmullClarkPatch& patch)
+      {
+        v[0] = patch.ring[0].getLimitVertex();
+        v[1] = patch.ring[1].getLimitVertex();
+        v[2] = patch.ring[2].getLimitVertex();
+        v[3] = patch.ring[3].getLimitVertex();
+      }
+      /* Bounding box of the four corners. */
+      __forceinline BBox<Vertex> bounds() const
+      {
+        
+        BBox<Vertex> bounds (v[0]);
+        bounds.extend(v[1]);
+        bounds.extend(v[2]);
+        bounds.extend(v[3]);
+        return bounds;
+      }
+      /* Position at (uu,vv). */
+      __forceinline Vertex eval(const float uu, const float vv) const {
+        return lerp(lerp(v[0],v[1],uu),lerp(v[3],v[2],uu),vv);
+      }
+      /* Derivative along u: the two u-edges (v1-v0, v2-v3) blended by vv; uu is not used. */
+      __forceinline Vertex eval_du(const float uu, const float vv) const {
+        return lerp(v[1]-v[0],v[2]-v[3],vv);
+      }
+      /* Derivative along v: the two v-edges (v3-v0, v2-v1) blended by uu; vv is not used. */
+      __forceinline Vertex eval_dv(const float uu, const float vv) const {
+        return lerp(v[3]-v[0],v[2]-v[1],uu);
+      }
+      /* Second derivative along u: always zero. */
+      __forceinline Vertex eval_dudu(const float uu, const float vv) const {
+        return Vertex(zero);
+      }
+      /* Second derivative along v: always zero. */
+      __forceinline Vertex eval_dvdv(const float uu, const float vv) const {
+        return Vertex(zero);
+      }
+      /* Mixed derivative: difference of the two u-edges; independent of (uu,vv). */
+      __forceinline Vertex eval_dudv(const float uu, const float vv) const {
+        return (v[2]-v[3]) - (v[1]-v[0]);
+      }
+      /* Cross product of the two first derivatives; the result is not normalized here. */
+      __forceinline Vertex normal(const float uu, const float vv) const {
+        return cross(eval_du(uu,vv),eval_dv(uu,vv));
+      }
+      /* Writes the requested outputs; each group is gated by its first pointer only (P | dPdu,dPdv | ddPdudu,ddPdvdv,ddPdudv). */
+      __forceinline void eval(const float u, const float v, 
+                              Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv,
+                              const float dscale = 1.0f) const
+      { /* first derivatives are multiplied by dscale, second derivatives by sqr(dscale) */
+        if (P) {
+          *P = eval(u,v); 
+        }
+        if (dPdu) {
+          assert(dPdu); *dPdu = eval_du(u,v)*dscale; 
+          assert(dPdv); *dPdv = eval_dv(u,v)*dscale; // dPdv is only checked by the assert, not by the if
+        }
+        if (ddPdudu) {
+          assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); 
+          assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); 
+          assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); 
+        }
+      }
+      /* Variants templated on the float type: same formulas, applied separately to the x, y and z members. */
+      template<class vfloat>
+      __forceinline Vec3<vfloat> eval(const vfloat& uu, const vfloat& vv) const
+      {
+        const vfloat x = lerp(lerp(v[0].x,v[1].x,uu),lerp(v[3].x,v[2].x,uu),vv);
+        const vfloat y = lerp(lerp(v[0].y,v[1].y,uu),lerp(v[3].y,v[2].y,uu),vv);
+        const vfloat z = lerp(lerp(v[0].z,v[1].z,uu),lerp(v[3].z,v[2].z,uu),vv);
+        return Vec3<vfloat>(x,y,z);
+      }
+
+      template<class vfloat>
+      __forceinline Vec3<vfloat> eval_du(const vfloat& uu, const vfloat& vv) const
+      {
+        const vfloat x = lerp(v[1].x-v[0].x,v[2].x-v[3].x,vv);
+        const vfloat y = lerp(v[1].y-v[0].y,v[2].y-v[3].y,vv);
+        const vfloat z = lerp(v[1].z-v[0].z,v[2].z-v[3].z,vv);
+        return Vec3<vfloat>(x,y,z);
+      }
+
+      template<class vfloat>
+      __forceinline Vec3<vfloat> eval_dv(const vfloat& uu, const vfloat& vv) const
+      {
+        const vfloat x = lerp(v[3].x-v[0].x,v[2].x-v[1].x,uu);
+        const vfloat y = lerp(v[3].y-v[0].y,v[2].y-v[1].y,uu);
+        const vfloat z = lerp(v[3].z-v[0].z,v[2].z-v[1].z,uu);
+        return Vec3<vfloat>(x,y,z);
+      }
+
+      template<typename vfloat>
+      __forceinline Vec3<vfloat> normal(const vfloat& uu, const vfloat& vv) const {
+        return cross(eval_du(uu,vv),eval_dv(uu,vv));
+      }
+      /* Variants that evaluate one component i of the vertices (v[k][i]). */
+       template<class vfloat>
+      __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return lerp(lerp(v[0][i],v[1][i],uu),lerp(v[3][i],v[2][i],uu),vv);
+      }
+
+      template<class vfloat>
+      __forceinline vfloat eval_du(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return lerp(v[1][i]-v[0][i],v[2][i]-v[3][i],vv);
+      }
+
+      template<class vfloat>
+      __forceinline vfloat eval_dv(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return lerp(v[3][i]-v[0][i],v[2][i]-v[1][i],uu);
+      }
+      
+      template<class vfloat>
+      __forceinline vfloat eval_dudu(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return vfloat(zero);
+      }
+
+      template<class vfloat>
+      __forceinline vfloat eval_dvdv(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return vfloat(zero);
+      }
+
+      template<class vfloat>
+      __forceinline vfloat eval_dudv(const size_t i, const vfloat& uu, const vfloat& vv) const {
+        return (v[2][i]-v[3][i]) - (v[1][i]-v[0][i]);
+      }
+      /* Evaluates components 0..N-1 and stores component i at <output>+i*dstride via vfloat::store(valid,...). */
+      template<typename vbool, typename vfloat>
+      __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                              float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv,
+                              const float dscale, const size_t dstride, const size_t N) const
+      { /* output groups are gated by P, dPdu and ddPdudu respectively, as in the Vertex* overload */
+        if (P) {
+          for (size_t i=0; i<N; i++) vfloat::store(valid,P+i*dstride,eval(i,uu,vv));
+        }
+        if (dPdu) {
+          for (size_t i=0; i<N; i++) {
+            assert(dPdu); vfloat::store(valid,dPdu+i*dstride,eval_du(i,uu,vv)*dscale);
+            assert(dPdv); vfloat::store(valid,dPdv+i*dstride,eval_dv(i,uu,vv)*dscale);
+          }
+        }
+        if (ddPdudu) {
+          for (size_t i=0; i<N; i++) {
+            assert(ddPdudu); vfloat::store(valid,ddPdudu+i*dstride,eval_dudu(i,uu,vv)*sqr(dscale));
+            assert(ddPdvdv); vfloat::store(valid,ddPdvdv+i*dstride,eval_dvdv(i,uu,vv)*sqr(dscale));
+            assert(ddPdudv); vfloat::store(valid,ddPdudv+i*dstride,eval_dudv(i,uu,vv)*sqr(dscale));
+          }
+        }
+      }
+    };
+  
+  typedef BilinearPatchT<Vec3fa,Vec3fa_t> BilinearPatch3fa;
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bspline_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/bspline_curve.h
new file mode 100644
index 0000000000..a325667328
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/bspline_curve.h
@@ -0,0 +1,319 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "bezier_curve.h"
+
+namespace embree
+{
+  class BSplineBasis
+  { /* Cubic basis polynomials written in t=u and s=1-u; each function returns four weights as a Vec4. */
+  public:
+
+    template<typename T>
+      static __forceinline Vec4<T> eval(const T& u) 
+    { /* weights: 1/6 * (s^3, 4s^3+t^3+12s^2t+6st^2, 4t^3+s^3+12st^2+6s^2t, t^3) */
+      const T t  = u;
+      const T s  = T(1.0f) - u;
+      const T n0 = s*s*s;
+      const T n1 = (4.0f*(s*s*s)+(t*t*t)) + (12.0f*((s*t)*s) + 6.0f*((t*s)*t));
+      const T n2 = (4.0f*(t*t*t)+(s*s*s)) + (12.0f*((t*s)*t) + 6.0f*((s*t)*s));
+      const T n3 = t*t*t;
+      return T(1.0f/6.0f)*Vec4<T>(n0,n1,n2,n3);
+    }
+    /* First derivative of the eval() weights with respect to u. */
+    template<typename T>
+      static __forceinline Vec4<T>  derivative(const T& u)
+    { /* weights: 1/2 * (-s^2, -t^2-4ts, s^2+4st, t^2) */
+      const T t  =  u;
+      const T s  =  1.0f - u;
+      const T n0 = -s*s;
+      const T n1 = -t*t - 4.0f*(t*s);
+      const T n2 =  s*s + 4.0f*(s*t);
+      const T n3 =  t*t;
+      return T(0.5f)*Vec4<T>(n0,n1,n2,n3);
+    }
+    /* Second derivative of the eval() weights with respect to u. */
+    template<typename T>
+      static __forceinline Vec4<T>  derivative2(const T& u)
+    { /* weights: (s, t-2s, s-2t, t); no extra scale factor is applied */
+      const T t  =  u;
+      const T s  =  1.0f - u;
+      const T n0 = s;
+      const T n1 = t - 2.0f*s;
+      const T n2 = s - 2.0f*t;
+      const T n3 = t;
+      return Vec4<T>(n0,n1,n2,n3);
+    }
+  };
+  
+  struct PrecomputedBSplineBasis
+  { /* Tables of precomputed weights; BSplineCurveT reads them as table[size][ofs] (see eval0/eval1/derivative0/derivative1). */
+    enum { N = 16 }; // every table has (N+1) x (N+1) entries
+  public:
+    PrecomputedBSplineBasis() {} // does not fill the tables
+    PrecomputedBSplineBasis(int shift); // defined outside this header; meaning of shift is not visible here
+
+    /* basis for bspline evaluation */
+  public:
+    float c0[N+1][N+1]; // multiplied with control vertex v0 by the curve code
+    float c1[N+1][N+1]; // multiplied with v1
+    float c2[N+1][N+1]; // multiplied with v2
+    float c3[N+1][N+1]; // multiplied with v3
+    
+    /* basis for bspline derivative evaluation */
+  public:
+    float d0[N+1][N+1]; // multiplied with v0
+    float d1[N+1][N+1]; // multiplied with v1
+    float d2[N+1][N+1]; // multiplied with v2
+    float d3[N+1][N+1]; // multiplied with v3
+  };
+  extern PrecomputedBSplineBasis bspline_basis0;
+  extern PrecomputedBSplineBasis bspline_basis1;
+
+  template<typename Vertex>
+    struct BSplineCurveT
+    { /* Cubic B-spline segment described by the four control vertices v0..v3. */
+      Vertex v0,v1,v2,v3;
+      
+      __forceinline BSplineCurveT() {}
+      
+      __forceinline BSplineCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3)
+        : v0(v0), v1(v1), v2(v2), v3(v3) {}
+      /* Applies the weights (1/6, 2/3, 1/6) to v0,v1,v2; these are the weights BSplineBasis::eval yields for u=0. */
+      __forceinline Vertex begin() const {
+        return madd(1.0f/6.0f,v0,madd(2.0f/3.0f,v1,1.0f/6.0f*v2));
+      }
+      /* Applies the weights (1/6, 2/3, 1/6) to v1,v2,v3; these are the weights BSplineBasis::eval yields for u=1. */
+      __forceinline Vertex end() const {
+        return madd(1.0f/6.0f,v1,madd(2.0f/3.0f,v2,1.0f/6.0f*v3));
+      }
+      /* Average of the four control vertices (not a point evaluated on the curve). */
+      __forceinline Vertex center() const {
+        return 0.25f*(v0+v1+v2+v3);
+      }
+      /* Bounding box of the four control vertices. */
+      __forceinline BBox<Vertex> bounds() const {
+        return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3));
+      }
+      /* Subtracts b from every control vertex. */
+      __forceinline friend BSplineCurveT operator -( const BSplineCurveT& a, const Vertex& b ) {
+        return BSplineCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b);
+      }
+      /* Subtracts p from each vertex (cast to Vec3fa), applies xfmVector(space,.) and carries w over unchanged. */
+      __forceinline BSplineCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        const Vec3ff q0(xfmVector(space,(Vec3fa)v0-p), v0.w);
+        const Vec3ff q1(xfmVector(space,(Vec3fa)v1-p), v1.w);
+        const Vec3ff q2(xfmVector(space,(Vec3fa)v2-p), v2.w);
+        const Vec3ff q3(xfmVector(space,(Vec3fa)v3-p), v3.w);
+        return BSplineCurveT<Vec3ff>(q0,q1,q2,q3);
+      }
+      /* Curve point at parameter t: control vertices weighted by BSplineBasis::eval(t). */
+      __forceinline Vertex eval(const float t) const 
+      {
+        const Vec4<float> b = BSplineBasis::eval(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      /* First derivative at t: control vertices weighted by BSplineBasis::derivative(t). */
+      __forceinline Vertex eval_du(const float t) const
+      {
+        const Vec4<float> b = BSplineBasis::derivative(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      /* Second derivative at t: control vertices weighted by BSplineBasis::derivative2(t). */
+      __forceinline Vertex eval_dudu(const float t) const 
+      {
+        const Vec4<float> b = BSplineBasis::derivative2(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      /* Position, first and second derivative at t in one call. */
+      __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const
+      {
+        p = eval(t);
+        dp = eval_du(t);
+        ddp = eval_dudu(t);
+      }
+      /* vfloat<M> variants of eval/eval_du/eval_dudu: each control vertex is widened to Vec4vf<M> before weighting. */
+      template<int M>
+      __forceinline Vec4vf<M> veval(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = BSplineBasis::eval(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = BSplineBasis::derivative(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = BSplineBasis::derivative2(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const
+      {
+        p = veval(t);
+        dp = veval_du(t);
+      }
+      /* Positions from table bspline_basis0: the weights are loaded with vfloat<M>::loadu starting at c0..c3[size][ofs]. */
+      template<int M>
+      __forceinline Vec4vf<M> eval0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBSplineBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bspline_basis0.c0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&bspline_basis0.c1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&bspline_basis0.c2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&bspline_basis0.c3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      /* Same as eval0 but reading table bspline_basis1. */
+      template<int M>
+      __forceinline Vec4vf<M> eval1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBSplineBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bspline_basis1.c0[size][ofs]), Vec4vf<M>(v0), 
+                    madd(vfloat<M>::loadu(&bspline_basis1.c1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&bspline_basis1.c2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&bspline_basis1.c3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      /* Derivatives from table bspline_basis0 (d0..d3), addressed like eval0. */
+      template<int M>
+      __forceinline Vec4vf<M> derivative0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBSplineBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bspline_basis0.d0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&bspline_basis0.d1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&bspline_basis0.d2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&bspline_basis0.d3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      /* Derivatives from table bspline_basis1 (d0..d3), addressed like eval0. */
+      template<int M>
+      __forceinline Vec4vf<M> derivative1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedBSplineBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&bspline_basis1.d0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&bspline_basis1.d1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&bspline_basis1.d2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&bspline_basis1.d3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      /* calculates bounds of bspline curve geometry */
+      __forceinline BBox3fa accurateRoundBounds() const
+      {
+        const int N = 7;
+        const float scale = 1.0f/(3.0f*(N-1));
+        Vec4vfx pl(pos_inf), pu(neg_inf);
+        for (int i=0; i<=N; i+=VSIZEX)
+        {
+          vintx vi = vintx(i)+vintx(step);
+          vboolx valid = vi <= vintx(N); // lanes whose sample index exceeds N are ignored
+          const Vec4vfx p  = eval0<VSIZEX>(i,N);
+          const Vec4vfx dp = derivative0<VSIZEX>(i,N);
+          const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); // equals p at sample 0
+          const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); // equals p at sample N
+          pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min
+          pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked max
+        }
+        const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+        const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+        const float r_min = reduce_min(pl.w);
+        const float r_max = reduce_max(pu.w);
+        const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max))); // largest |w| seen; used to enlarge the box
+        return enlarge(BBox3fa(lower,upper),upper_r);
+      }
+      
+      /* calculates bounds when tessellated into N line segments */
+      __forceinline BBox3fa accurateFlatBounds(int N) const
+      {
+        if (likely(N == 4))
+        { /* four table samples plus the end() point, without a loop */
+          const Vec4vf4 pi = eval0<4>(0,4);
+          const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z));
+          const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z));
+          const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w)));
+          const Vec3ff pe = end();
+          return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w))));
+        } 
+        else
+        { /* general case: samples 0..N in chunks of VSIZEX, invalid lanes masked out */
+          Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f);
+          for (int i=0; i<=N; i+=VSIZEX)
+          {
+            vboolx valid = vintx(i)+vintx(step) <= vintx(N);
+            const Vec4vfx pi = eval0<VSIZEX>(i,N);
+            
+            pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min
+            pl.y = select(valid,min(pl.y,pi.y),pl.y); 
+            pl.z = select(valid,min(pl.z,pi.z),pl.z); 
+            
+            pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked max
+            pu.y = select(valid,max(pu.y,pi.y),pu.y); 
+            pu.z = select(valid,max(pu.z,pi.z),pu.z); 
+            
+            ru = select(valid,max(ru,abs(pi.w)),ru); 
+          }
+          const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+          const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+          const Vec3fa upper_r(reduce_max(ru));
+          return enlarge(BBox3fa(lower,upper),upper_r);
+        }
+      }
+      
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const BSplineCurveT& curve) {
+        return cout << "BSplineCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }";
+      }
+    };
+  
+  template<typename Vertex>
+    __forceinline void convert(const BezierCurveT<Vertex>& icurve, BezierCurveT<Vertex>& ocurve) {
+    ocurve = icurve; // same basis on both sides: plain copy
+  }
+  
+  template<typename Vertex>
+    __forceinline void convert(const BSplineCurveT<Vertex>& icurve, BSplineCurveT<Vertex>& ocurve) {
+    ocurve = icurve; // same basis on both sides: plain copy
+  }
+  
+  template<typename Vertex>
+    __forceinline void convert(const BezierCurveT<Vertex>& icurve, BSplineCurveT<Vertex>& ocurve)
+  { /* Bezier -> B-spline control points: q1,q2 use only the inner Bezier points, q0,q3 also the end points */
+    const Vertex q0 = madd(6.0f,icurve.v0,madd(-7.0f,icurve.v1,2.0f*icurve.v2));
+    const Vertex q1 = msub(2.0f,icurve.v1,icurve.v2);
+    const Vertex q2 = msub(2.0f,icurve.v2,icurve.v1);
+    const Vertex q3 = madd(2.0f,icurve.v1,madd(-7.0f,icurve.v2,6.0f*icurve.v3));
+    ocurve = BSplineCurveT<Vertex>(q0,q1,q2,q3);
+  }
+  
+  template<typename Vertex>
+    __forceinline void convert(const BSplineCurveT<Vertex>& icurve, BezierCurveT<Vertex>& ocurve)
+  { /* B-spline -> Bezier control points: b0/b3 use weights (1/6,2/3,1/6), b1/b2 use (2/3,1/3) and (1/3,2/3) */
+    const Vertex b0 = madd(1.0f/6.0f,icurve.v0,madd(2.0f/3.0f,icurve.v1,1.0f/6.0f*icurve.v2));
+    const Vertex b1 = madd(2.0f/3.0f,icurve.v1,1.0f/3.0f*icurve.v2);
+    const Vertex b2 = madd(1.0f/3.0f,icurve.v1,2.0f/3.0f*icurve.v2);
+    const Vertex b3 = madd(1.0f/6.0f,icurve.v1,madd(2.0f/3.0f,icurve.v2,1.0f/6.0f*icurve.v3));
+    ocurve = BezierCurveT<Vertex>(b0,b1,b2,b3);
+  }
+
+  __forceinline BSplineCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const BSplineCurveT<Vec3ff>& curve)
+  { /* applies the per-vertex enlargeRadiusToMinWidth overload (declared elsewhere) to each of the four control vertices */
+    return BSplineCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,curve.v0),
+                                 enlargeRadiusToMinWidth(context,geom,ray_org,curve.v1),
+                                 enlargeRadiusToMinWidth(context,geom,ray_org,curve.v2),
+                                 enlargeRadiusToMinWidth(context,geom,ray_org,curve.v3));
+  }
+  
+  typedef BSplineCurveT<Vec3fa> BSplineCurve3fa;
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/bspline_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/bspline_patch.h
new file mode 100644
index 0000000000..9769bc17bd
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/bspline_patch.h
@@ -0,0 +1,449 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bspline_curve.h"
+
+namespace embree
+{
+  template<typename Vertex, typename Vertex_t = Vertex>
+    class __aligned(64) BSplinePatchT
+    {
+      typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+      typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+      
+    public:
+      
+      __forceinline BSplinePatchT () {}
+      /* Builds the control grid from a Catmull-Clark patch. */
+      __forceinline BSplinePatchT (const CatmullClarkPatch& patch) {
+        init(patch);
+      }
+      /* Same as the constructor above: the four border curve pointers are accepted but not used. */
+      __forceinline BSplinePatchT(const CatmullClarkPatch& patch,
+                                  const BezierCurveT<Vertex>* border0,
+                                  const BezierCurveT<Vertex>* border1,
+                                  const BezierCurveT<Vertex>* border2,
+                                  const BezierCurveT<Vertex>* border3)
+      {
+        init(patch);
+      }
+      /* Builds the control grid by walking the half-edge mesh around edge and loading vertices from a strided buffer. */
+      __forceinline BSplinePatchT (const HalfEdge* edge, const char* vertices, size_t stride) {
+        init(edge,vertices,stride);
+      }
+
+      __forceinline Vertex hard_corner(const                    Vertex& v01, const Vertex& v02, 
+                                       const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                                       const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      { /* uses only v11,v12,v21,v22; equals 2*a-b with a=2*v11-v21 and b=2*v12-v22, the values init_border extrapolates */
+        return 4.0f*v11 - 2.0f*(v12+v21) + v22;
+      }
+
+      __forceinline Vertex soft_convex_corner( const                    Vertex& v01, const Vertex& v02, 
+                                               const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                                               const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      { /* uses only v11,v12,v21,v22; the other parameters are ignored */
+        return -8.0f*v11 + 4.0f*(v12+v21) + v22;
+      }
+
+      __forceinline Vertex convex_corner(const float vertex_crease_weight, 
+                                         const                    Vertex& v01, const Vertex& v02, 
+                                         const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                                         const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      { /* infinite crease weight selects the hard corner rule, any other weight the soft one */
+        return std::isinf(vertex_crease_weight) ? hard_corner(v01,v02,v10,v11,v12,v20,v21,v22)
+                                                : soft_convex_corner(v01,v02,v10,v11,v12,v20,v21,v22);
+      }
+
+      __forceinline Vertex load(const HalfEdge* edge, const char* vertices, size_t stride) {
+        return Vertex_t::loadu(vertices+edge->getStartVertexIndex()*stride); // start vertex of edge, at byte offset index*stride
+      }
+
+      __forceinline void init_border(const CatmullClarkRing& edge0,
+                                     Vertex& v01, Vertex& v02,
+                                     const Vertex& v11, const Vertex& v12,
+                                     const Vertex& v21, const Vertex& v22)
+      { /* fills the two outer control points v01,v02 next to the inner points v11,v12 */
+        if (likely(edge0.has_opposite_back(0)))
+        { /* ring reports an opposite at back(0): take the points from the ring */
+          v01 = edge0.back(2);
+          v02 = edge0.back(1);
+        } else { /* otherwise extrapolate linearly across the inner row: 2*inner - next inner */
+          v01 = 2.0f*v11-v21;
+          v02 = 2.0f*v12-v22;
+        }
+      }
+
+      __forceinline void init_corner(const CatmullClarkRing& edge0,
+                                     Vertex& v00,       const Vertex& v01, const Vertex& v02, 
+                                     const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                                     const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      { /* fills corner point v00 from the ring or, if opposites are missing, from already filled neighbours */
+        const bool MAYBE_UNUSED has_back1 = edge0.has_opposite_back(1);
+        const bool has_back0 = edge0.has_opposite_back(0);
+        const bool has_front1 = edge0.has_opposite_front(1);
+        const bool MAYBE_UNUSED has_front2 = edge0.has_opposite_front(2);
+        /* has_back1/has_front2 are only read by the asserts below */
+        if (likely(has_back0)) {
+          if (likely(has_front1)) { assert(has_back1 && has_front2); v00 = edge0.back(3); }
+          else { assert(!has_back1); v00 = 2.0f*v01-v02; }
+        }
+        else {
+          if (likely(has_front1)) { assert(!has_front2); v00 = 2.0f*v10-v20; }
+          else v00 = convex_corner(edge0.vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22);
+        }
+      }
+      
+      void init(const CatmullClarkPatch& patch)
+      { /* order matters: init_corner reads border points that init_border writes */
+        /* fill inner vertices */
+        const Vertex v11 = v[1][1] = patch.ring[0].vtx;
+        const Vertex v12 = v[1][2] = patch.ring[1].vtx;
+        const Vertex v22 = v[2][2] = patch.ring[2].vtx; 
+        const Vertex v21 = v[2][1] = patch.ring[3].vtx; 
+        
+        /* fill border vertices */
+        init_border(patch.ring[0],v[0][1],v[0][2],v11,v12,v21,v22);
+        init_border(patch.ring[1],v[1][3],v[2][3],v12,v22,v11,v21);
+        init_border(patch.ring[2],v[3][2],v[3][1],v22,v21,v12,v11);
+        init_border(patch.ring[3],v[2][0],v[1][0],v21,v11,v22,v12);
+        
+        /* fill corner vertices */
+        init_corner(patch.ring[0],v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22);
+        init_corner(patch.ring[1],v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21);
+        init_corner(patch.ring[2],v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11);
+        init_corner(patch.ring[3],v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12);
+      }
+      
+      void init_border(const HalfEdge* edge0, const char* vertices, size_t stride,
+                                     Vertex& v01, Vertex& v02,
+                                     const Vertex& v11, const Vertex& v12,
+                                     const Vertex& v21, const Vertex& v22)
+      { /* half-edge variant: fills the two outer control points v01,v02 across edge0 */
+        if (likely(edge0->hasOpposite())) 
+        { /* neighbouring face exists: load the start vertices of two consecutive half-edges of that face */
+          const HalfEdge* e = edge0->opposite()->next()->next(); 
+          v01 = load(e,vertices,stride); 
+          v02 = load(e->next(),vertices,stride);
+        } else { /* no neighbouring face: extrapolate linearly, 2*inner - next inner */
+          v01 = 2.0f*v11-v21;
+          v02 = 2.0f*v12-v22;
+        }
+      }
+      
+      void init_corner(const HalfEdge* edge0, const char* vertices, size_t stride,
+                       Vertex& v00, const Vertex& v01, const Vertex& v02, 
+                       const Vertex& v10, const Vertex& v11, const Vertex& v12, 
+                       const Vertex& v20, const Vertex& v21, const Vertex& v22)
+      { /* half-edge variant: fills corner point v00 depending on which neighbouring faces exist */
+        const bool has_back0 = edge0->hasOpposite();
+        const bool has_front1 = edge0->prev()->hasOpposite();
+        /* four cases: both neighbours, only back, only front, none */
+        if (likely(has_back0))
+        { 
+          const HalfEdge* e = edge0->opposite()->next();
+          if (likely(has_front1))
+          { /* both neighbours: the corner vertex is loaded from the mesh */
+            assert(e->hasOpposite());
+            assert(edge0->prev()->opposite()->prev()->hasOpposite());
+            v00 = load(e->opposite()->prev(),vertices,stride);
+          } 
+          else { /* only the back neighbour: extrapolate from v01,v02 */
+            assert(!e->hasOpposite());
+            v00 = 2.0f*v01-v02;
+          }
+        }
+        else
+        {
+          if (likely(has_front1)) { /* only the front neighbour: extrapolate from v10,v20 */
+            assert(!edge0->prev()->opposite()->prev()->hasOpposite());
+            v00 = 2.0f*v10-v20;
+          }
+          else { /* no neighbour on either side: corner rule chosen by the crease weight (asserted to be 0 or infinite) */
+            assert(edge0->vertex_crease_weight == 0.0f || std::isinf(edge0->vertex_crease_weight));
+            v00 = convex_corner(edge0->vertex_crease_weight,v01,v02,v10,v11,v12,v20,v21,v22);
+          }
+        }
+      }
+      
+      void init(const HalfEdge* edge0, const char* vertices, size_t stride)
+      { /* order matters: init_corner reads border points that init_border writes */
+        assert( edge0->isRegularFace() );
+        
+        /* fill inner vertices */
+        const Vertex v11 = v[1][1] = load(edge0,vertices,stride); const HalfEdge* edge1 = edge0->next();
+        const Vertex v12 = v[1][2] = load(edge1,vertices,stride); const HalfEdge* edge2 = edge1->next();
+        const Vertex v22 = v[2][2] = load(edge2,vertices,stride); const HalfEdge* edge3 = edge2->next();
+        const Vertex v21 = v[2][1] = load(edge3,vertices,stride); assert(edge0  == edge3->next()); // the face must close after four edges
+        
+        /* fill border vertices */
+        init_border(edge0,vertices,stride,v[0][1],v[0][2],v11,v12,v21,v22);
+        init_border(edge1,vertices,stride,v[1][3],v[2][3],v12,v22,v11,v21);
+        init_border(edge2,vertices,stride,v[3][2],v[3][1],v22,v21,v12,v11);
+        init_border(edge3,vertices,stride,v[2][0],v[1][0],v21,v11,v22,v12);
+        
+        /* fill corner vertices */
+        init_corner(edge0,vertices,stride,v[0][0],v[0][1],v[0][2],v[1][0],v11,v12,v[2][0],v21,v22);
+        init_corner(edge1,vertices,stride,v[0][3],v[1][3],v[2][3],v[0][2],v12,v22,v[0][1],v11,v21);
+        init_corner(edge2,vertices,stride,v[3][3],v[3][2],v[3][1],v[2][3],v22,v21,v[1][3],v12,v11);
+        init_corner(edge3,vertices,stride,v[3][0],v[2][0],v[1][0],v[3][1],v21,v11,v[3][2],v22,v12);
+      }
+      
+      __forceinline BBox<Vertex> bounds() const
+      { /* box around all 16 control points, visited as one flat array starting at v[0][0] */
+        const Vertex* const first = &v[0][0];
+        BBox<Vertex> box(first[0]);
+        for (const Vertex* p = first+1; p != first+16; ++p)
+          box.extend(*p);
+        return box;
+      }
+      
+      __forceinline Vertex eval(const float uu, const float vv) const
+      { /* surface point: vv weights blend along the first index of v, uu weights along the second */
+        const Vec4f v_n = BSplineBasis::eval(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        /* combine the four column results with the u weights */
+        const Vec4f u_n = BSplineBasis::eval(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+      
+      __forceinline Vertex eval_du(const float uu, const float vv) const
+      { /* first derivative along u: basis values in vv, basis derivative in uu */
+        const Vec4f v_n = BSplineBasis::eval(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        /* combine the four column results with the u derivative weights */
+        const Vec4f u_n = BSplineBasis::derivative(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+      
+      __forceinline Vertex eval_dv(const float uu, const float vv) const
+      { /* first derivative along v: basis derivative in vv, basis values in uu */
+        const Vec4f v_n = BSplineBasis::derivative(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        /* combine the four column results with the u weights */
+        const Vec4f u_n = BSplineBasis::eval(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+
+      __forceinline Vertex eval_dudu(const float uu, const float vv) const
+      { /* second derivative along u: basis values in vv, second basis derivative in uu */
+        const Vec4f v_n = BSplineBasis::eval(vv);
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));
+        /* combine the four column results with the second-derivative u weights */
+        const Vec4f u_n = BSplineBasis::derivative2(uu);
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));
+      }
+
+      __forceinline Vertex eval_dvdv(const float uu, const float vv) const  // scalar evaluation at (uu,vv) using derivative2() weights in v and eval() weights in u
+      {
+        const Vec4f v_n = BSplineBasis::derivative2(vv);  // four weights for the v direction
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));  // blend v[0..3][0] with the v weights
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));  // same for v[0..3][1]
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));  // same for v[0..3][2]
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));  // same for v[0..3][3]
+        
+        const Vec4f u_n = BSplineBasis::eval(uu);  // four weights for the u direction
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));  // blend the four intermediate results with the u weights
+      }
+
+      __forceinline Vertex eval_dudv(const float uu, const float vv) const  // scalar evaluation at (uu,vv) using derivative() weights in both u and v
+      {
+        const Vec4f v_n = BSplineBasis::derivative(vv);  // four weights for the v direction
+        const Vertex_t curve0 = madd(v_n[0],v[0][0],madd(v_n[1],v[1][0],madd(v_n[2],v[2][0],v_n[3] * v[3][0])));  // blend v[0..3][0] with the v weights
+        const Vertex_t curve1 = madd(v_n[0],v[0][1],madd(v_n[1],v[1][1],madd(v_n[2],v[2][1],v_n[3] * v[3][1])));  // same for v[0..3][1]
+        const Vertex_t curve2 = madd(v_n[0],v[0][2],madd(v_n[1],v[1][2],madd(v_n[2],v[2][2],v_n[3] * v[3][2])));  // same for v[0..3][2]
+        const Vertex_t curve3 = madd(v_n[0],v[0][3],madd(v_n[1],v[1][3],madd(v_n[2],v[2][3],v_n[3] * v[3][3])));  // same for v[0..3][3]
+        
+        const Vec4f u_n = BSplineBasis::derivative(uu);  // four weights for the u direction
+        return madd(u_n[0],curve0,madd(u_n[1],curve1,madd(u_n[2],curve2,u_n[3] * curve3)));  // blend the four intermediate results with the u weights
+      }
+      
+      __forceinline Vertex normal(const float uu, const float vv) const  // cross product of the eval_du and eval_dv results at (uu,vv); no normalization is applied here
+      {
+        const Vertex tu = eval_du(uu,vv);  // u-direction tangent
+        const Vertex tv = eval_dv(uu,vv);  // v-direction tangent
+        return cross(tu,tv);
+      }   
+
+      template<typename T>
+      __forceinline Vec3<T> eval(const T& uu, const T& vv, const Vec4<T>& u_n, const Vec4<T>& v_n) const  // weighted sum of the 4x4 control points using caller-supplied weights; uu and vv are not read in this body
+      {
+        const T curve0_x = madd(v_n[0],T(v[0][0].x),madd(v_n[1],T(v[1][0].x),madd(v_n[2],T(v[2][0].x),v_n[3] * T(v[3][0].x))));  // x of v[0..3][0] blended with the v weights; each scalar is converted to T first
+        const T curve1_x = madd(v_n[0],T(v[0][1].x),madd(v_n[1],T(v[1][1].x),madd(v_n[2],T(v[2][1].x),v_n[3] * T(v[3][1].x))));
+        const T curve2_x = madd(v_n[0],T(v[0][2].x),madd(v_n[1],T(v[1][2].x),madd(v_n[2],T(v[2][2].x),v_n[3] * T(v[3][2].x))));
+        const T curve3_x = madd(v_n[0],T(v[0][3].x),madd(v_n[1],T(v[1][3].x),madd(v_n[2],T(v[2][3].x),v_n[3] * T(v[3][3].x))));
+        const T x = madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x)));  // blend the four x intermediates with the u weights
+                  
+        const T curve0_y = madd(v_n[0],T(v[0][0].y),madd(v_n[1],T(v[1][0].y),madd(v_n[2],T(v[2][0].y),v_n[3] * T(v[3][0].y))));  // same scheme for the y component
+        const T curve1_y = madd(v_n[0],T(v[0][1].y),madd(v_n[1],T(v[1][1].y),madd(v_n[2],T(v[2][1].y),v_n[3] * T(v[3][1].y))));
+        const T curve2_y = madd(v_n[0],T(v[0][2].y),madd(v_n[1],T(v[1][2].y),madd(v_n[2],T(v[2][2].y),v_n[3] * T(v[3][2].y))));
+        const T curve3_y = madd(v_n[0],T(v[0][3].y),madd(v_n[1],T(v[1][3].y),madd(v_n[2],T(v[2][3].y),v_n[3] * T(v[3][3].y))));
+        const T y = madd(u_n[0],curve0_y,madd(u_n[1],curve1_y,madd(u_n[2],curve2_y,u_n[3] * curve3_y)));
+          
+        const T curve0_z = madd(v_n[0],T(v[0][0].z),madd(v_n[1],T(v[1][0].z),madd(v_n[2],T(v[2][0].z),v_n[3] * T(v[3][0].z))));  // same scheme for the z component
+        const T curve1_z = madd(v_n[0],T(v[0][1].z),madd(v_n[1],T(v[1][1].z),madd(v_n[2],T(v[2][1].z),v_n[3] * T(v[3][1].z))));
+        const T curve2_z = madd(v_n[0],T(v[0][2].z),madd(v_n[1],T(v[1][2].z),madd(v_n[2],T(v[2][2].z),v_n[3] * T(v[3][2].z))));
+        const T curve3_z = madd(v_n[0],T(v[0][3].z),madd(v_n[1],T(v[1][3].z),madd(v_n[2],T(v[2][3].z),v_n[3] * T(v[3][3].z))));
+        const T z = madd(u_n[0],curve0_z,madd(u_n[1],curve1_z,madd(u_n[2],curve2_z,u_n[3] * curve3_z)));
+        
+        return Vec3<T>(x,y,z);
+      }
+      
+      template<typename T>
+      __forceinline Vec3<T> eval(const T& uu, const T& vv) const  // computes eval() weights for u and v, then forwards to the explicit-weight eval overload
+      {
+        const Vec4<T> u_n = BSplineBasis::eval(uu);
+        const Vec4<T> v_n = BSplineBasis::eval(vv);
+        return eval(uu,vv,u_n,v_n);
+      }
+
+      template<typename T>
+      __forceinline Vec3<T> eval_du(const T& uu, const T& vv) const  // derivative() weights in u, eval() weights in v, forwarded to the explicit-weight eval overload
+      {
+        const Vec4<T> u_n = BSplineBasis::derivative(uu); 
+        const Vec4<T> v_n = BSplineBasis::eval(vv); 
+        return eval(uu,vv,u_n,v_n);      
+      }
+      
+      template<typename T>
+      __forceinline Vec3<T> eval_dv(const T& uu, const T& vv) const  // eval() weights in u, derivative() weights in v, forwarded to the explicit-weight eval overload
+      {
+        const Vec4<T> u_n = BSplineBasis::eval(uu); 
+        const Vec4<T> v_n = BSplineBasis::derivative(vv); 
+        return eval(uu,vv,u_n,v_n);      
+      }
+
+      template<typename T>
+      __forceinline Vec3<T> eval_dudu(const T& uu, const T& vv) const  // derivative2() weights in u, eval() weights in v, forwarded to the explicit-weight eval overload
+      {
+        const Vec4<T> u_n = BSplineBasis::derivative2(uu); 
+        const Vec4<T> v_n = BSplineBasis::eval(vv); 
+        return eval(uu,vv,u_n,v_n);      
+      }
+
+      template<typename T>
+      __forceinline Vec3<T> eval_dvdv(const T& uu, const T& vv) const  // eval() weights in u, derivative2() weights in v, forwarded to the explicit-weight eval overload
+      {
+        const Vec4<T> u_n = BSplineBasis::eval(uu); 
+        const Vec4<T> v_n = BSplineBasis::derivative2(vv); 
+        return eval(uu,vv,u_n,v_n);      
+      }
+
+      template<typename T>
+      __forceinline Vec3<T> eval_dudv(const T& uu, const T& vv) const  // derivative() weights in both u and v, forwarded to the explicit-weight eval overload
+      {
+        const Vec4<T> u_n = BSplineBasis::derivative(uu); 
+        const Vec4<T> v_n = BSplineBasis::derivative(vv); 
+        return eval(uu,vv,u_n,v_n);      
+      }
+      
+      template<typename T>
+      __forceinline Vec3<T> normal(const T& uu, const T& vv) const {  // cross product of eval_du and eval_dv at (uu,vv); no normalization is applied here
+        return cross(eval_du(uu,vv),eval_dv(uu,vv));
+      }
+
+      void eval(const float u, const float v,   // writes position and optional derivatives at (u,v) through the given output pointers
+                Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv,   // P may be null; derivative outputs are gated per group (see below)
+                const float dscale = 1.0f) const  // first derivatives are multiplied by dscale, second derivatives by sqr(dscale)
+      {
+        if (P) {
+          *P = eval(u,v); 
+        }
+        if (dPdu) {  // only dPdu is tested: a non-null dPdu means dPdv is written too
+          assert(dPdu); *dPdu = eval_du(u,v)*dscale; 
+          assert(dPdv); *dPdv = eval_dv(u,v)*dscale; // dPdv is checked by assert only, so it must be non-null whenever dPdu is
+        }
+        if (ddPdudu) {  // only ddPdudu is tested: all three second-derivative outputs are written together
+          assert(ddPdudu); *ddPdudu = eval_dudu(u,v)*sqr(dscale); 
+          assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*sqr(dscale); // assert-only null check
+          assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*sqr(dscale); // assert-only null check
+        }
+      }
+
+      template<class vfloat>
+      __forceinline vfloat eval(const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n) const  // weighted sum for component i of the control points; uu and vv are not read in this body
+      {
+        const vfloat curve0_x = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i]))));  // locals are named *_x but hold component i, not necessarily x
+        const vfloat curve1_x = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(v[1][1][i]),madd(v_n[2],vfloat(v[2][1][i]),v_n[3] * vfloat(v[3][1][i]))));
+        const vfloat curve2_x = madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(v[1][2][i]),madd(v_n[2],vfloat(v[2][2][i]),v_n[3] * vfloat(v[3][2][i]))));
+        const vfloat curve3_x = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i]))));
+        return madd(u_n[0],curve0_x,madd(u_n[1],curve1_x,madd(u_n[2],curve2_x,u_n[3] * curve3_x)));  // blend the four intermediates with the u weights
+      }
+        
+      template<typename vbool, typename vfloat>
+      void eval(const vbool& valid, const vfloat& uu, const vfloat& vv,   // vector evaluation: results for components 0..N-1 are stored via vfloat::store with the `valid` argument
+                float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv,   // P may be null; derivative outputs are gated per group (see below)
+                const float dscale, const size_t dstride, const size_t N) const  // component i is stored at <output>+i*dstride
+      {
+        if (P) {
+          const Vec4<vfloat> u_n = BSplineBasis::eval(uu); 
+          const Vec4<vfloat> v_n = BSplineBasis::eval(vv); 
+          for (size_t i=0; i<N; i++) vfloat::store(valid,P+i*dstride,eval(i,uu,vv,u_n,v_n));
+        }
+        if (dPdu)   // only dPdu is tested: a non-null dPdu means dPdv is written too
+        {
+          {
+            assert(dPdu);
+            const Vec4<vfloat> u_n = BSplineBasis::derivative(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::eval(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,dPdu+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale);
+          }
+          {
+            assert(dPdv);  // assert-only null check: dPdv must be non-null whenever dPdu is
+            const Vec4<vfloat> u_n = BSplineBasis::eval(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::derivative(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,dPdv+i*dstride,eval(i,uu,vv,u_n,v_n)*dscale);
+          }
+        }
+        if (ddPdudu)   // only ddPdudu is tested: all three second-derivative outputs are written together
+        {
+          {
+            assert(ddPdudu);
+            const Vec4<vfloat> u_n = BSplineBasis::derivative2(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::eval(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudu+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));  // second derivatives are scaled by sqr(dscale)
+          }
+          {
+            assert(ddPdvdv);  // assert-only null check
+            const Vec4<vfloat> u_n = BSplineBasis::eval(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::derivative2(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdvdv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+          {
+            assert(ddPdudv);  // assert-only null check
+            const Vec4<vfloat> u_n = BSplineBasis::derivative(uu); 
+            const Vec4<vfloat> v_n = BSplineBasis::derivative(vv);
+            for (size_t i=0; i<N; i++) vfloat::store(valid,ddPdudv+i*dstride,eval(i,uu,vv,u_n,v_n)*sqr(dscale));
+          }
+        }
+      }
+
+      friend __forceinline embree_ostream operator<<(embree_ostream o, const BSplinePatchT& p)  // prints all 16 control points, one per line, as "[y][x] value"
+      {
+        for (size_t y=0; y<4; y++)
+          for (size_t x=0; x<4; x++)
+            o << "[" << y << "][" << x << "] " << p.v[y][x] << embree_endl;
+        return o;
+      } 
+
+    public:
+      Vertex v[4][4];
+    };
+  
+  typedef BSplinePatchT<Vec3fa,Vec3fa_t> BSplinePatch3fa;
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_coefficients.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_coefficients.h
new file mode 100644
index 0000000000..05031cf6b9
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_coefficients.h
@@ -0,0 +1,85 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/geometry.h"
+
+namespace embree
+{
+  static const size_t MAX_PATCH_VALENCE = 16;         //!< maximum number of vertices of a patch
+  static const size_t MAX_RING_FACE_VALENCE = 64;     //!< maximum number of faces per ring
+  static const size_t MAX_RING_EDGE_VALENCE = 2*64;   //!< maximum number of edges per ring
+
+  class CatmullClarkPrecomputedCoefficients   // lookup tables of trigonometric coefficients indexed by valence n (0..MAX_RING_FACE_VALENCE)
+  {
+  private:
+    
+    float table_cos_2PI_div_n[MAX_RING_FACE_VALENCE+1];  // read by cos_2PI_div_n(n) for n <= MAX_RING_FACE_VALENCE
+
+    float* table_limittangent_a[MAX_RING_FACE_VALENCE+1];  // one pointer per n, indexed as [n][i]; allocation is not visible here (presumably in the out-of-line constructor, TODO confirm)
+    float* table_limittangent_b[MAX_RING_FACE_VALENCE+1];  // same layout as table_limittangent_a
+    float table_limittangent_c[MAX_RING_FACE_VALENCE+1];  // read by limittangent_c(n)
+
+    __forceinline float set_cos_2PI_div_n(const size_t n) {   // computes cos(2*pi/n); returns 1 for n == 0 to avoid dividing by zero
+      if (unlikely(n == 0)) return 1.0f;
+      return cosf(2.0f*float(pi)/(float)n); 
+    }
+
+    __forceinline float set_limittangent_a(const size_t i, const size_t n)    // computes cos(2*pi*i/n) * c1; returns 1 for n == 0
+    { 
+      if (unlikely(n == 0)) return 1.0f;
+      const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n));  // (1/n) / sqrt(4 + cos^2(pi/n))
+      const float c1 = (1.0f/(float)n + cosf(float(pi)/(float)n) * c0);   // 1/n + cos(pi/n) * c0
+      return cosf(2.0f*float(pi)*(float)i/(float)n) * c1;
+    }
+
+    __forceinline float set_limittangent_b(const size_t i, const size_t n)    // computes cos((2*pi*i + pi)/n) * c0; returns 1 for n == 0
+    { 
+      if (unlikely(n == 0)) return 1.0f;
+      const float c0 = 1.0f/(float)n * 1.0f / sqrtf(4.0f + cosf(float(pi)/(float)n)*cosf(float(pi)/(float)n));  // same c0 expression as in set_limittangent_a
+      return cosf((2.0f*float(pi)*i+float(pi))/(float)n) * c0;
+    }
+
+    __forceinline float set_limittangent_c(const size_t n)    // computes (2/16) * (5 + cos(2*pi/n) + cos(pi/n) * sqrt(18 + 2*cos(2*pi/n))); returns 1 for n == 0
+    { 
+      if (unlikely(n == 0)) return 1.0f;
+      return 2.0f/16.0f * (5.0f + cosf(2.0f*float(pi)/(float)n) + cosf(float(pi)/(float)n) * sqrtf(18.0f+2.0f*cosf(2.0f*float(pi)/(float)n)));
+    }
+
+  public:
+
+    __forceinline float cos_2PI_div_n(const size_t n)  // table lookup for n <= MAX_RING_FACE_VALENCE, otherwise computed on the fly
+    {
+      if (likely(n <= MAX_RING_FACE_VALENCE))
+        return table_cos_2PI_div_n[n];
+      else
+        return set_cos_2PI_div_n(n);
+    }
+
+    __forceinline float limittangent_a(const size_t i, const size_t n)  // table lookup only; unlike cos_2PI_div_n there is no computed fallback
+    {
+      assert(n <= MAX_RING_FACE_VALENCE);  // bounds are checked by assert only
+      assert(i < n);
+      return table_limittangent_a[n][i];
+    }
+
+    __forceinline float limittangent_b(const size_t i, const size_t n)  // table lookup only; no computed fallback
+    {
+      assert(n <= MAX_RING_FACE_VALENCE);  // bounds are checked by assert only
+      assert(i < n);
+      return table_limittangent_b[n][i];
+    }
+
+    __forceinline float limittangent_c(const size_t n)  // table lookup only; no computed fallback
+    {
+      assert(n <= MAX_RING_FACE_VALENCE);  // bound is checked by assert only
+      return table_limittangent_c[n];
+    }
+
+    static CatmullClarkPrecomputedCoefficients table;  // shared instance, defined elsewhere
+ 
+    CatmullClarkPrecomputedCoefficients();    // defined out of line; presumably fills the tables via the set_* helpers (TODO confirm)
+    ~CatmullClarkPrecomputedCoefficients();    // NOTE(review): copy operations are not deleted although the class holds raw float* tables; confirm instances are never copied
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_patch.h
new file mode 100644
index 0000000000..ab1d63594a
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_patch.h
@@ -0,0 +1,562 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_ring.h"
+#include "bezier_curve.h"
+
+namespace embree
+{
+  template<typename Vertex, typename Vertex_t = Vertex>
+    class __aligned(64) CatmullClarkPatchT  // quad patch stored as four CatmullClark1RingT rings, ring[0..3]
+    {
+    public:
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring;
+    typedef typename CatmullClark1Ring::Type Type;
+    
+    array_t<CatmullClark1RingT<Vertex,Vertex_t>,4> ring;  // one 1-ring per patch corner
+    
+    public:
+    __forceinline CatmullClarkPatchT () {}  // leaves the rings default-constructed
+
+    __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const char* vertices, size_t stride) {  // forwards to init()
+      init(first_half_edge,vertices,stride);
+    }
+    
+    __forceinline CatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView<Vec3fa>& vertices) {  // forwards to init() with the buffer's pointer and stride
+      init(first_half_edge,vertices.getPtr(),vertices.getStride());
+    }
+    
+    __forceinline void init (const HalfEdge* first_half_edge, const char* vertices, size_t stride)  // initializes ring[i] from first_half_edge+i for i = 0..3
+    {
+      for (unsigned i=0; i<4; i++)
+        ring[i].init(first_half_edge+i,vertices,stride);  // the four half edges are addressed as consecutive array elements
+
+      assert(verify());
+    }
+
+    __forceinline size_t bytes() const {  // sum of the four rings' bytes()
+      return ring[0].bytes()+ring[1].bytes()+ring[2].bytes()+ring[3].bytes();
+    }
+
+    __forceinline void serialize(void* ptr, size_t& ofs) const  // serializes the four rings in order, sharing the caller's offset
+    {
+      for (size_t i=0; i<4; i++)
+        ring[i].serialize((char*)ptr,ofs);
+    }
+
+    __forceinline void deserialize(void* ptr)  // reads the four rings in order, starting at offset 0
+    {
+      size_t ofs = 0;
+      for (size_t i=0; i<4; i++)
+        ring[i].deserialize((char*)ptr,ofs);
+    }
+
+    __forceinline BBox3fa bounds() const  // union of the four rings' bounds
+    {
+      BBox3fa bounds (ring[0].bounds());
+      for (size_t i=1; i<4; i++)
+	bounds.extend(ring[i].bounds());
+      return bounds;
+    }
+    
+    __forceinline Type type() const  // combines the four ring types: TYPE_CREASES bits are OR-ed across rings, all other bits are AND-ed
+    {
+      const int ty0 = ring[0].type() ^ CatmullClark1Ring::TYPE_CREASES;  // invert the TYPE_CREASES bits so the AND below acts as an OR on them
+      const int ty1 = ring[1].type() ^ CatmullClark1Ring::TYPE_CREASES;
+      const int ty2 = ring[2].type() ^ CatmullClark1Ring::TYPE_CREASES;
+      const int ty3 = ring[3].type() ^ CatmullClark1Ring::TYPE_CREASES;
+      return (Type) ((ty0 & ty1 & ty2 & ty3) ^ CatmullClark1Ring::TYPE_CREASES);  // invert the TYPE_CREASES bits back
+    }
+    
+    __forceinline bool isFinalResolution(float res) const {  // true only if all four rings report isFinalResolution(res)
+      return ring[0].isFinalResolution(res) && ring[1].isFinalResolution(res) && ring[2].isFinalResolution(res) && ring[3].isFinalResolution(res);
+    }
+    
+    static __forceinline void init_regular(const CatmullClark1RingT<Vertex,Vertex_t>& p0,  // fills dest0 and dest1 as valence-4 rings around p0.ring[0], built from rings p0 and p1
+					   const CatmullClark1RingT<Vertex,Vertex_t>& p1,
+					   CatmullClark1RingT<Vertex,Vertex_t>& dest0,
+					   CatmullClark1RingT<Vertex,Vertex_t>& dest1) 
+    {
+      assert(p1.face_valence > 2);
+      dest1.vertex_level = dest0.vertex_level = p0.edge_level;
+      dest1.face_valence = dest0.face_valence = 4;
+      dest1.edge_valence = dest0.edge_valence = 8;
+      dest1.border_index = dest0.border_index = -1;  // -1 is also what init_regular(center,...) stores; init_border stores 2 and 4 instead
+      dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0];  // both destination rings are centered on the same vertex
+      dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f;
+      
+      dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1];  // same eight vertices in both rings; dest1 index = (dest0 index + 2) mod 8
+      dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0];
+      dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx;
+      dest1.ring[7] = dest0.ring[5] = (Vertex_t)p1.ring[4];
+      dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1];
+      dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2];
+      dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx;
+      dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2];
+      
+      dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f;
+      dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1];
+      dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f;
+      dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0];
+      
+      if (p0.eval_unique_identifier <= p1.eval_unique_identifier)  // both destinations take the smaller identifier; start indices depend on which source supplied it
+      {
+        dest0.eval_start_index = 3;
+        dest1.eval_start_index = 0;
+        dest0.eval_unique_identifier = p0.eval_unique_identifier;
+        dest1.eval_unique_identifier = p0.eval_unique_identifier;
+      }
+      else
+      {
+        dest0.eval_start_index = 1;
+        dest1.eval_start_index = 2;
+        dest0.eval_unique_identifier = p1.eval_unique_identifier;
+        dest1.eval_unique_identifier = p1.eval_unique_identifier;
+      }
+    }    
+    
+    static __forceinline void init_border(const CatmullClark1RingT<Vertex,Vertex_t> &p0,  // border variant: fills dest0 and dest1 as rings with 3 faces and 6 edges around p0.ring[0]
+                                          const CatmullClark1RingT<Vertex,Vertex_t> &p1,
+                                          CatmullClark1RingT<Vertex,Vertex_t> &dest0,
+                                          CatmullClark1RingT<Vertex,Vertex_t> &dest1) 
+    {
+      dest1.vertex_level = dest0.vertex_level = p0.edge_level;
+      dest1.face_valence = dest0.face_valence = 3;
+      dest1.edge_valence = dest0.edge_valence = 6;
+      dest0.border_index = 2;  // the two destinations get different border indices
+      dest1.border_index = 4;
+      dest1.vtx  = dest0.vtx = (Vertex_t)p0.ring[0];
+      dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f;
+      
+      dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1];  // same six vertices in both rings; dest1 index = (dest0 index + 2) mod 6
+      dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0];
+      dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx;
+      dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy
+      dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx;
+      dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2];
+      
+      dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f;
+      dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1];
+      dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0];
+      
+      if (p0.eval_unique_identifier <= p1.eval_unique_identifier)  // both destinations take the smaller identifier; start indices depend on which source supplied it
+      {
+        dest0.eval_start_index = 1;
+        dest1.eval_start_index = 2;
+        dest0.eval_unique_identifier = p0.eval_unique_identifier;
+        dest1.eval_unique_identifier = p0.eval_unique_identifier;
+      }
+      else
+      {
+        dest0.eval_start_index = 2;
+        dest1.eval_start_index = 0;
+        dest0.eval_unique_identifier = p1.eval_unique_identifier;
+        dest1.eval_unique_identifier = p1.eval_unique_identifier;
+      }
+    }
+    
+    static __forceinline void init_regular(const Vertex_t &center, const Vertex_t center_ring[8], const unsigned int offset, CatmullClark1RingT<Vertex,Vertex_t> &dest)  // fills dest as a valence-4 ring around center, taking center_ring rotated by offset
+    {
+      dest.vertex_level = 0.0f;
+      dest.face_valence = 4;
+      dest.edge_valence = 8;
+      dest.border_index = -1;
+      dest.vtx     = (Vertex_t)center;
+      dest.vertex_crease_weight = 0.0f;
+      for (size_t i=0; i<8; i++) 
+	dest.ring[i] = (Vertex_t)center_ring[(offset+i)%8];
+      for (size_t i=0; i<4; i++) 
+        dest.crease_weight[i] = 0.0f;
+      
+      dest.eval_start_index = (8-offset)>>1;  // half of the remaining rotation, wrapped into [0,face_valence) on the next line
+      if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence;
+      assert( dest.eval_start_index < dest.face_valence );
+      dest.eval_unique_identifier = 0;
+    }
+    
+    __noinline void subdivide(array_t<CatmullClarkPatchT,4>& patch) const  // splits this patch into four child patches written to patch[0..3]
+    {
+      ring[0].subdivide(patch[0].ring[0]);  // child i receives the subdivided corner ring i in its slot i
+      ring[1].subdivide(patch[1].ring[1]);
+      ring[2].subdivide(patch[2].ring[2]);
+      ring[3].subdivide(patch[3].ring[3]);
+      
+      patch[0].ring[0].edge_level = 0.5f*ring[0].edge_level;  // child edge levels: half of one parent level, or a quarter of the sum of two opposite parent levels
+      patch[0].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level);
+      patch[0].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level);
+      patch[0].ring[3].edge_level = 0.5f*ring[3].edge_level;
+      
+      patch[1].ring[0].edge_level = 0.5f*ring[0].edge_level;
+      patch[1].ring[1].edge_level = 0.5f*ring[1].edge_level;
+      patch[1].ring[2].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level);
+      patch[1].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level);
+      
+      patch[2].ring[0].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level);
+      patch[2].ring[1].edge_level = 0.5f*ring[1].edge_level;
+      patch[2].ring[2].edge_level = 0.5f*ring[2].edge_level;
+      patch[2].ring[3].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level);
+      
+      patch[3].ring[0].edge_level = 0.25f*(ring[0].edge_level+ring[2].edge_level);
+      patch[3].ring[1].edge_level = 0.25f*(ring[1].edge_level+ring[3].edge_level);
+      patch[3].ring[2].edge_level = 0.5f*ring[2].edge_level;
+      patch[3].ring[3].edge_level = 0.5f*ring[3].edge_level;
+      
+      const bool regular0 = ring[0].has_last_face() && ring[1].face_valence > 2;  // selects init_regular or init_border for the rings between corners 0 and 1
+      if (likely(regular0))
+        init_regular(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]);
+      else
+        init_border(patch[0].ring[0],patch[1].ring[1],patch[0].ring[1],patch[1].ring[0]);
+      
+      const bool regular1 = ring[1].has_last_face() && ring[2].face_valence > 2;  // same choice between corners 1 and 2
+      if (likely(regular1))
+        init_regular(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]);
+      else
+        init_border(patch[1].ring[1],patch[2].ring[2],patch[1].ring[2],patch[2].ring[1]);
+      
+      const bool regular2 = ring[2].has_last_face() && ring[3].face_valence > 2;  // same choice between corners 2 and 3
+      if (likely(regular2))
+        init_regular(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]);
+      else
+        init_border(patch[2].ring[2],patch[3].ring[3],patch[2].ring[3],patch[3].ring[2]);
+      
+      const bool regular3 = ring[3].has_last_face() && ring[0].face_valence > 2;  // same choice between corners 3 and 0
+      if (likely(regular3))
+        init_regular(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]);
+      else
+        init_border(patch[3].ring[3],patch[0].ring[0],patch[3].ring[0],patch[0].ring[3]);
+      
+      Vertex_t center = (ring[0].vtx + ring[1].vtx + ring[2].vtx + ring[3].vtx) * 0.25f;  // average of the four parent corner vertices
+
+      Vertex_t center_ring[8];  // ring around the center, built from the four subdivided corner rings
+      center_ring[0] = (Vertex_t)patch[3].ring[3].ring[0];
+      center_ring[7] = (Vertex_t)patch[3].ring[3].vtx;
+      center_ring[6] = (Vertex_t)patch[2].ring[2].ring[0];
+      center_ring[5] = (Vertex_t)patch[2].ring[2].vtx;
+      center_ring[4] = (Vertex_t)patch[1].ring[1].ring[0];
+      center_ring[3] = (Vertex_t)patch[1].ring[1].vtx;
+      center_ring[2] = (Vertex_t)patch[0].ring[0].ring[0];
+      center_ring[1] = (Vertex_t)patch[0].ring[0].vtx;
+      
+      init_regular(center,center_ring,0,patch[0].ring[2]);  // each child gets the center ring rotated by a different offset
+      init_regular(center,center_ring,2,patch[1].ring[3]);
+      init_regular(center,center_ring,4,patch[2].ring[0]);
+      init_regular(center,center_ring,6,patch[3].ring[1]);
+      
+      assert(patch[0].verify());
+      assert(patch[1].verify());
+      assert(patch[2].verify());
+      assert(patch[3].verify());
+    }
+    
+    bool verify() const {  // true if all four rings report hasValidPositions()
+      return ring[0].hasValidPositions() && ring[1].hasValidPositions() && ring[2].hasValidPositions() && ring[3].hasValidPositions();
+    }
+    
+    __forceinline void init( FinalQuad& quad ) const  // copies the four corner vertices into quad.vtx[0..3]
+    {
+      quad.vtx[0] = (Vertex_t)ring[0].vtx;
+      quad.vtx[1] = (Vertex_t)ring[1].vtx;
+      quad.vtx[2] = (Vertex_t)ring[2].vtx;
+      quad.vtx[3] = (Vertex_t)ring[3].vtx;
+    };
+    
+    friend __forceinline embree_ostream operator<<(embree_ostream o, const CatmullClarkPatchT &p)  // prints the four rings between "CatmullClarkPatch { " and "}"
+    {
+      o << "CatmullClarkPatch { " << embree_endl;
+      for (size_t i=0; i<4; i++)
+	o << "ring" << i << ": " << p.ring[i] << embree_endl;
+      o << "}" << embree_endl;
+      return o;
+    }
+    };
+  
+  typedef CatmullClarkPatchT<Vec3fa,Vec3fa_t> CatmullClarkPatch3fa;
+  
+  template<typename Vertex, typename Vertex_t = Vertex>
+    class __aligned(64) GeneralCatmullClarkPatchT
+    {
+    public:
+    typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring;
+    typedef BezierCurveT<Vertex> BezierCurve;
+
+    static const unsigned SIZE = MAX_PATCH_VALENCE;
+    DynamicStackArray<GeneralCatmullClark1RingT<Vertex,Vertex_t>,8,SIZE> ring;
+    unsigned N;
+    
+    __forceinline GeneralCatmullClarkPatchT () 
+    : N(0) {}  // starts with a ring count of zero
+    
+    GeneralCatmullClarkPatchT (const HalfEdge* h, const char* vertices, size_t stride) {  // forwards to init(), which sets N
+      init(h,vertices,stride);
+    }
+
+    __forceinline GeneralCatmullClarkPatchT (const HalfEdge* first_half_edge, const BufferView<Vec3fa>& vertices) {  // forwards to init() with the buffer's pointer and stride
+      init(first_half_edge,vertices.getPtr(),vertices.getStride());
+    }
+
+    __forceinline void init (const HalfEdge* h, const char* vertices, size_t stride)  // initializes one ring per half edge reached via next(), starting at h
+    {
+      unsigned int i = 0;
+      const HalfEdge* edge = h; 
+      do {
+        ring[i].init(edge,vertices,stride);
+        edge = edge->next();
+        i++;
+      } while ((edge != h) && (i < SIZE));  // stops on returning to h or after SIZE rings, whichever comes first
+      N = i;  // number of rings initialized; always in 1..SIZE
+    }
+
+    __forceinline unsigned size() const {  // returns the ring count N
+      return N; 
+    }
+    
+    __forceinline bool isQuadPatch() const {  // true when there are exactly four rings and each has only_quads set
+      return (N == 4) && ring[0].only_quads && ring[1].only_quads && ring[2].only_quads && ring[3].only_quads;
+    }
+
+    static __forceinline void init_regular(const CatmullClark1RingT<Vertex,Vertex_t>& p0,  // fills dest0 and dest1 as valence-4 rings around p0.ring[0], built from rings p0 and p1
+					   const CatmullClark1RingT<Vertex,Vertex_t>& p1,
+					   CatmullClark1RingT<Vertex,Vertex_t>& dest0,
+					   CatmullClark1RingT<Vertex,Vertex_t>& dest1) 
+    {
+      assert(p1.face_valence > 2);
+      dest1.vertex_level = dest0.vertex_level = p0.edge_level;
+      dest1.face_valence = dest0.face_valence = 4;
+      dest1.edge_valence = dest0.edge_valence = 8;
+      dest1.border_index = dest0.border_index = -1;  // init_border stores 2 and 4 here instead
+      dest1.vtx = dest0.vtx = (Vertex_t)p0.ring[0];  // both destination rings are centered on the same vertex
+      dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f;
+      
+      dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1];  // same eight vertices in both rings; dest1 index = (dest0 index + 2) mod 8
+      dest1.ring[1] = dest0.ring[7] = (Vertex_t)p1.ring[0];
+      dest1.ring[0] = dest0.ring[6] = (Vertex_t)p1.vtx;
+      dest1.ring[7] = dest0.ring[5] = (Vertex_t)p1.ring[4];
+      dest1.ring[6] = dest0.ring[4] = (Vertex_t)p0.ring[p0.edge_valence-1];
+      dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.edge_valence-2];
+      dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx;
+      dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2];
+      
+      dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f;
+      dest1.crease_weight[0] = dest0.crease_weight[3] = p1.crease_weight[1];
+      dest1.crease_weight[3] = dest0.crease_weight[2] = 0.0f;
+      dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0];
+      
+      if (p0.eval_unique_identifier <= p1.eval_unique_identifier)  // both destinations take the smaller identifier; start indices depend on which source supplied it
+      {
+        dest0.eval_start_index = 3;
+        dest1.eval_start_index = 0;
+        dest0.eval_unique_identifier = p0.eval_unique_identifier;
+        dest1.eval_unique_identifier = p0.eval_unique_identifier;
+      }
+      else
+      {
+        dest0.eval_start_index = 1;
+        dest1.eval_start_index = 2;
+        dest0.eval_unique_identifier = p1.eval_unique_identifier;
+        dest1.eval_unique_identifier = p1.eval_unique_identifier;
+      }      
+    }
+    
+    
+    static __forceinline void init_border(const CatmullClark1RingT<Vertex,Vertex_t> &p0,  // border variant: fills dest0 and dest1 as rings with 3 faces and 6 edges around p0.ring[0]
+                                          const CatmullClark1RingT<Vertex,Vertex_t> &p1,
+                                          CatmullClark1RingT<Vertex,Vertex_t> &dest0,
+                                          CatmullClark1RingT<Vertex,Vertex_t> &dest1) 
+    {
+      dest1.vertex_level = dest0.vertex_level = p0.edge_level;
+      dest1.face_valence = dest0.face_valence = 3;
+      dest1.edge_valence = dest0.edge_valence = 6;
+      dest0.border_index = 2;  // the two destinations get different border indices
+      dest1.border_index = 4;
+      dest1.vtx  = dest0.vtx = (Vertex_t)p0.ring[0];
+      dest1.vertex_crease_weight = dest0.vertex_crease_weight = 0.0f;
+      
+      dest1.ring[2] = dest0.ring[0] = (Vertex_t)p0.ring[1];  // same six vertices in both rings; dest1 index = (dest0 index + 2) mod 6
+      dest1.ring[1] = dest0.ring[5] = (Vertex_t)p1.ring[0];
+      dest1.ring[0] = dest0.ring[4] = (Vertex_t)p1.vtx;
+      dest1.ring[5] = dest0.ring[3] = (Vertex_t)p0.ring[p0.border_index+1]; // dummy
+      dest1.ring[4] = dest0.ring[2] = (Vertex_t)p0.vtx;
+      dest1.ring[3] = dest0.ring[1] = (Vertex_t)p0.ring[2];
+      
+      dest1.crease_weight[1] = dest0.crease_weight[0] = 0.0f;
+      dest1.crease_weight[0] = dest0.crease_weight[2] = p1.crease_weight[1];
+      dest1.crease_weight[2] = dest0.crease_weight[1] = p0.crease_weight[0];
+      
+      if (p0.eval_unique_identifier <= p1.eval_unique_identifier)  // both destinations take the smaller identifier; start indices depend on which source supplied it
+      {
+        dest0.eval_start_index = 1;
+        dest1.eval_start_index = 2;
+        dest0.eval_unique_identifier = p0.eval_unique_identifier;
+        dest1.eval_unique_identifier = p0.eval_unique_identifier;
+      }
+      else
+      {
+        dest0.eval_start_index = 2;
+        dest1.eval_start_index = 0;
+        dest0.eval_unique_identifier = p1.eval_unique_identifier;
+        dest1.eval_unique_identifier = p1.eval_unique_identifier;
+      }
+    }
+    
+    static __forceinline void init_regular(const Vertex_t &center, const array_t<Vertex_t,2*SIZE>& center_ring, const float vertex_level, const unsigned int N, const unsigned int offset, CatmullClark1RingT<Vertex,Vertex_t> &dest)  // fills dest as a valence-N ring around center, taking the first 2*N center_ring entries rotated by offset-1
+    {
+      assert(N<(MAX_RING_FACE_VALENCE));
+      assert(2*N<(MAX_RING_EDGE_VALENCE));
+      dest.vertex_level = vertex_level;
+      dest.face_valence = N;
+      dest.edge_valence = 2*N;
+      dest.border_index = -1;
+      dest.vtx     = (Vertex_t)center;
+      dest.vertex_crease_weight = 0.0f;
+      for (unsigned i=0; i<2*N; i++) {
+        dest.ring[i] = (Vertex_t)center_ring[(2*N+offset+i-1)%(2*N)];  // adding 2*N keeps the unsigned index from wrapping below zero when offset+i is 0
+        assert(isvalid(dest.ring[i]));
+      }
+      for (unsigned i=0; i<N; i++) 
+        dest.crease_weight[i] = 0.0f;
+      
+      assert(offset <= 2*N);
+      dest.eval_start_index = (2*N-offset)>>1;  // half of the remaining rotation, wrapped into [0,face_valence) on the next line
+      if (dest.eval_start_index >= dest.face_valence) dest.eval_start_index -= dest.face_valence;
+      
+      assert( dest.eval_start_index < dest.face_valence );
+      dest.eval_unique_identifier = 0;
+    }
+    
+    __noinline void subdivide(array_t<CatmullClarkPatch,SIZE>& patch, unsigned& N_o) const  // splits this N-sided patch into N quad patches patch[0..N-1]; N_o receives N
+    {
+      N_o = N;
+      assert( N );
+      for (unsigned i=0; i<N; i++) {
+        unsigned ip1 = (i+1)%N; // FIXME: %
+        ring[i].subdivide(patch[i].ring[0]);  // child i keeps the subdivided corner ring i in slot 0
+        patch[i]  .ring[0].edge_level = 0.5f*ring[i].edge_level;
+        patch[ip1].ring[3].edge_level = 0.5f*ring[i].edge_level;
+        
+	assert( patch[i].ring[0].hasValidPositions() );
+        
+      }
+      assert(N <= SIZE);  // patch has SIZE entries and center_ring has 2*SIZE, indexed up to 2*N-1, so N must not exceed SIZE
+      Vertex_t center = Vertex_t(0.0f);
+      array_t<Vertex_t,2*SIZE> center_ring;
+      float center_vertex_level = 2.0f; // guarantees that irregular vertices get always isolated also for non-quads
+      
+      for (unsigned i=0; i<N; i++)
+      {
+        unsigned ip1 = (i+1)%N; // FIXME: %
+        unsigned im1 = (i+N-1)%N; // FIXME: %
+        bool regular = ring[i].has_last_face() && ring[ip1].face_valence > 2;  // selects init_regular or init_border for the rings between corners i and ip1
+        if (likely(regular)) init_regular(patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]); 
+        else                 init_border (patch[i].ring[0],patch[ip1].ring[0],patch[i].ring[1],patch[ip1].ring[3]);
+        
+	assert( patch[i].ring[1].hasValidPositions() );
+	assert( patch[ip1].ring[3].hasValidPositions() );
+        
+	float level = 0.25f*(ring[im1].edge_level+ring[ip1].edge_level);  // quarter of the sum of the previous and next corner's edge levels
+        patch[i].ring[1].edge_level = patch[ip1].ring[2].edge_level = level;
+	center_vertex_level = max(center_vertex_level,level);  // ends as max(2, largest level seen)
+        
+        center += ring[i].vtx;  // accumulated here, divided by N after the loop
+        center_ring[2*i+0] = (Vertex_t)patch[i].ring[0].vtx;
+        center_ring[2*i+1] = (Vertex_t)patch[i].ring[0].ring[0];
+      }
+      center /= float(N);  // center is the average of the N corner vertices
+      
+      for (unsigned int i=0; i<N; i++) {
+        init_regular(center,center_ring,center_vertex_level,N,2*i,patch[i].ring[2]);  // each child gets the center ring rotated by its own offset 2*i
+        
+	assert( patch[i].ring[2].hasValidPositions() );
+      }
+    }
+    
+    void init(CatmullClarkPatch& patch) const
+    {
+      /* only valid for four-sided patches: each corner ring is passed through convert() into patch.ring */
+      assert(size() == 4);
+      for (unsigned corner=0; corner<4; corner++) {
+        ring[corner].convert(patch.ring[corner]);
+      }
+    }
+    
+    static void fix_quad_ring_order (array_t<CatmullClarkPatch,GeneralCatmullClarkPatchT::SIZE>& patches)
+    { // permutes the four rings of patches[1], patches[2] and patches[3] in place; patches[0] is not touched
+      CatmullClark1Ring patches1ring1 = patches[1].ring[1]; // patches[1]: new rings (0,1,2,3) = old rings (3,0,1,2)
+      patches[1].ring[1] = patches[1].ring[0]; // FIXME: optimize these assignments
+      patches[1].ring[0] = patches[1].ring[3];
+      patches[1].ring[3] = patches[1].ring[2];
+      patches[1].ring[2] = patches1ring1;
+      /* patches[2]: swap ring 0 with ring 2 and ring 1 with ring 3 */
+      CatmullClark1Ring patches2ring2 = patches[2].ring[2];
+      patches[2].ring[2] = patches[2].ring[0];
+      patches[2].ring[0] = patches2ring2;
+      CatmullClark1Ring patches2ring3 = patches[2].ring[3];
+      patches[2].ring[3] = patches[2].ring[1];
+      patches[2].ring[1] = patches2ring3;
+      /* patches[3]: new rings (0,1,2,3) = old rings (1,2,3,0) */
+      CatmullClark1Ring patches3ring3 = patches[3].ring[3];
+      patches[3].ring[3] = patches[3].ring[0];
+      patches[3].ring[0] = patches[3].ring[1];
+      patches[3].ring[1] = patches[3].ring[2];
+      patches[3].ring[2] = patches3ring3;
+    }
+
+    __forceinline void getLimitBorder(BezierCurve curves[GeneralCatmullClarkPatchT::SIZE]) const
+    {
+      /* one cubic Bezier segment per patch edge, running from corner i to corner i+1 (wrapping to 0) */
+      Vertex start = ring[0].getLimitVertex();
+      for (unsigned i=0; i<N; i++) {
+        const unsigned next = (i+1==N) ? 0 : i+1;
+        const Vertex ctrl1 = madd(1.0f/3.0f,ring[i].getLimitTangent(),start);
+        const Vertex end   = ring[next].getLimitVertex();
+        const Vertex ctrl2 = madd(1.0f/3.0f,ring[next].getSecondLimitTangent(),end);
+        new (&curves[i]) BezierCurve(start,ctrl1,ctrl2,end);
+        start = end;
+      }
+    }
+
+    __forceinline void getLimitBorder(BezierCurve curves[2], const unsigned subPatch) const
+    { // builds the two limit border curves that meet at corner subPatch
+      const unsigned i0 = subPatch;
+      const Vertex t0_p = ring[i0].getLimitTangent();
+      const Vertex t0_m = ring[i0].getSecondLimitTangent();
+          
+      const unsigned i1 = subPatch+1 == N ? 0 : subPatch+1; // next corner, wrapping around
+      const Vertex t1_p = ring[i1].getLimitTangent(); // NOTE(review): only referenced by the commented-out b13 below
+      const Vertex t1_m = ring[i1].getSecondLimitTangent();
+      
+      const unsigned i2 = subPatch == 0 ? N-1 : subPatch-1; // previous corner, wrapping around
+      const Vertex t2_p = ring[i2].getLimitTangent();
+      const Vertex t2_m = ring[i2].getSecondLimitTangent();
+      /* limit positions of this corner (b00), the next corner (b03) and the previous corner (b33) */
+      const Vertex b00 = ring[i0].getLimitVertex();
+      const Vertex b03 = ring[i1].getLimitVertex();
+      const Vertex b33 = ring[i2].getLimitVertex();
+      /* inner Bezier control points: a limit position plus one third of a limit tangent */
+      const Vertex b01 = madd(1.0/3.0f,t0_p,b00);
+      const Vertex b11 = madd(1.0/3.0f,t0_m,b00);
+      
+      //const Vertex b13 = madd(1.0/3.0f,t1_p,b03);
+      const Vertex b02 = madd(1.0/3.0f,t1_m,b03);
+          
+      const Vertex b22 = madd(1.0/3.0f,t2_p,b33);
+      const Vertex b23 = madd(1.0/3.0f,t2_m,b33); // NOTE(review): b23 is not used by either curve below
+      /* curves[0] runs from this corner to the next one, curves[1] from the previous corner to this one */
+      new (&curves[0]) BezierCurve(b00,b01,b02,b03);
+      new (&curves[1]) BezierCurve(b33,b22,b11,b00);
+    }
+    
+    friend __forceinline embree_ostream operator<<(embree_ostream o, const GeneralCatmullClarkPatchT &p)
+    {
+      /* one line per corner ring, framed by an opening and a closing brace line */
+      o << "GeneralCatmullClarkPatch { " << embree_endl;
+      for (unsigned corner=0; corner<p.N; corner++) { o << "ring" << corner << ": " << p.ring[corner] << embree_endl; }
+      o << "}" << embree_endl;
+      return o;
+    }
+    };
+  
+  typedef GeneralCatmullClarkPatchT<Vec3fa,Vec3fa_t> GeneralCatmullClarkPatch3fa;
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_ring.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_ring.h
new file mode 100644
index 0000000000..73b41fd4ff
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/catmullclark_ring.h
@@ -0,0 +1,826 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/geometry.h"
+#include "../common/buffer.h"
+#include "half_edge.h"
+#include "catmullclark_coefficients.h"
+
+namespace embree
+{
+  struct __aligned(64) FinalQuad {
+    Vec3fa vtx[4]; // the four vertices of the quad
+  };
+
+  template<typename Vertex, typename Vertex_t = Vertex>
+    struct __aligned(64) CatmullClark1RingT
+  {
+    ALIGNED_STRUCT_(64);
+    
+    int border_index;                                   //!< edge index where border starts
+    unsigned int face_valence;                          //!< number of adjacent quad faces
+    unsigned int edge_valence;                          //!< number of adjacent edges (2*face_valence)
+    float vertex_crease_weight;                         //!< weight of vertex crease (0 if no vertex crease)
+    DynamicStackArray<float,16,MAX_RING_FACE_VALENCE> crease_weight; //!< edge crease weights for each adjacent edge
+    float vertex_level;                                 //!< maximum level of all adjacent edges
+    float edge_level;                                   //!< level of first edge
+    unsigned int eval_start_index;                      //!< topology dependent index to start evaluation
+    unsigned int eval_unique_identifier;                //!< topology dependent unique identifier for this ring 
+    Vertex vtx;                                         //!< center vertex
+    DynamicStackArray<Vertex,32,MAX_RING_EDGE_VALENCE> ring;  //!< ring of neighboring vertices
+   
+  public:
+    CatmullClark1RingT () 
+    : eval_start_index(0), eval_unique_identifier(0) {} // FIXME: default constructor should be empty (only the two evaluation-order fields are set here)
+
+    /*! returns the number of bytes that serialize() writes for this ring */
+    __forceinline size_t bytes() const
+    {
+      assert(2*face_valence == edge_valence);
+      /* fixed-size fields (edge_valence is not part of the stream, it is recomputed as 2*face_valence) */
+      const size_t header = sizeof(border_index)
+                          + sizeof(face_valence)
+                          + sizeof(vertex_crease_weight)
+                          + sizeof(vertex_level)
+                          + sizeof(edge_level)
+                          + sizeof(eval_start_index)
+                          + sizeof(eval_unique_identifier)
+                          + sizeof(vtx);
+      /* variable-size part: one crease weight per face and one vertex per ring entry */
+      const size_t payload = face_valence*sizeof(float) + edge_valence*sizeof(Vertex);
+      return header + payload;
+    }
+
+    template<typename Ty>
+    static __forceinline void store(char* ptr, size_t& ofs, const Ty& v) {
+      Ty* const dst = (Ty*)&ptr[ofs]; *dst = v; ofs += sizeof(Ty); // raw write at ptr+ofs, then advance the offset
+    }
+
+    template<typename Ty>
+    static __forceinline void load(char* ptr, size_t& ofs, Ty& v) {
+      const Ty* const src = (Ty*)&ptr[ofs]; v = *src; ofs += sizeof(Ty); // raw read at ptr+ofs, then advance the offset
+    }
+
+    /*! writes the ring to ptr starting at offset ofs and advances ofs past the written data */
+    __forceinline void serialize(char* ptr, size_t& ofs) const
+    {
+      store(ptr,ofs,border_index);
+      store(ptr,ofs,face_valence);
+      store(ptr,ofs,vertex_crease_weight);
+      for (size_t f=0; f<face_valence; f++) {
+        store(ptr,ofs,crease_weight[f]);
+      }
+      store(ptr,ofs,vertex_level);
+      store(ptr,ofs,edge_level);
+      store(ptr,ofs,eval_start_index);
+      store(ptr,ofs,eval_unique_identifier);
+      Vertex_t::storeu(&ptr[ofs],vtx); ofs += sizeof(Vertex);
+      for (size_t e=0; e<edge_valence; e++, ofs += sizeof(Vertex))
+        Vertex_t::storeu(&ptr[ofs],ring[e]);
+    }
+
+    /*! reads back a ring in the layout written by serialize(), starting at ptr+ofs and advancing ofs past the consumed bytes */
+    __forceinline void deserialize(char* ptr, size_t& ofs)
+    {
+      load(ptr,ofs,border_index);
+      load(ptr,ofs,face_valence);
+      edge_valence = 2*face_valence; // not stored in the stream
+      load(ptr,ofs,vertex_crease_weight);
+      for (size_t f=0; f<face_valence; f++) {
+        load(ptr,ofs,crease_weight[f]);
+      }
+      load(ptr,ofs,vertex_level);
+      load(ptr,ofs,edge_level);
+      load(ptr,ofs,eval_start_index);
+      load(ptr,ofs,eval_unique_identifier);
+      vtx = Vertex_t::loadu(&ptr[ofs]); ofs += sizeof(Vertex);
+      for (size_t e=0; e<edge_valence; e++, ofs += sizeof(Vertex))
+        ring[e] = Vertex_t::loadu(&ptr[ofs]);
+    }
+
+    __forceinline bool hasBorder() const {
+      return border_index != -1; // -1 is the "no border" marker; init() stores a ring index when it finds a border edge
+    }
+    
+    __forceinline const Vertex& front(size_t i) const {
+      assert(edge_valence>i);
+      return ring[i]; // i-th ring entry counted from the start of the ring
+    }
+    
+    __forceinline const Vertex& back(size_t i) const {
+      assert(edge_valence>=i); // NOTE(review): i==0 passes this assert but indexes ring[edge_valence]; presumably callers only use i>=1 — confirm
+      return ring[edge_valence-i]; // i-th ring entry counted from the end of the ring
+    }
+    
+    __forceinline bool has_last_face() const {
+      return (size_t)border_index != (size_t)edge_valence-2; // false only when the border starts at the last face slot of the ring
+    }
+
+    __forceinline bool has_opposite_front(size_t i) const {
+      return (size_t)border_index != 2*i; // false only when the border starts at face slot i counted from the front
+    }
+
+    __forceinline bool has_opposite_back(size_t i) const {
+      return (size_t)border_index != ((size_t)edge_valence-2-2*i); // false only when the border starts at face slot i counted from the back (i=0 is the last slot)
+    }
+    
+    __forceinline BBox3fa bounds() const
+    {
+      /* box around the center vertex, grown by every ring vertex */
+      BBox3fa box ( vtx );
+      for (size_t k = 0; k<edge_valence ; k++) { box.extend( ring[k] ); }
+      return box;
+    }
+
+    /*! initializes the ring from the half edge structure: walks the faces around the start vertex of h and loads their vertices from the vertices buffer (vertex k is read at vertices+k*stride) */
+    __forceinline void init(const HalfEdge* const h, const char* vertices, size_t stride) 
+    {
+      border_index = -1;
+      vtx = Vertex_t::loadu(vertices+h->getStartVertexIndex()*stride);
+      vertex_crease_weight = h->vertex_crease_weight;
+      
+      HalfEdge* p = (HalfEdge*) h;
+
+      unsigned i=0; // number of ring entries stored so far
+      unsigned min_vertex_index = (unsigned)-1;
+      unsigned min_vertex_index_face = (unsigned)-1;
+      edge_level = p->edge_level;
+      vertex_level = 0.0f;
+
+      do
+      {
+        vertex_level = max(vertex_level,p->edge_level);
+        crease_weight[i/2] = p->edge_crease_weight;
+        assert(p->hasOpposite() || p->edge_crease_weight == float(inf));
+
+        /* store first two vertices of face */
+        p = p->next();
+        const unsigned index0 = p->getStartVertexIndex();
+        ring[i++] = Vertex_t::loadu(vertices+index0*stride);
+        if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; }
+        p = p->next();
+
+        const unsigned index1 = p->getStartVertexIndex();
+        ring[i++] = Vertex_t::loadu(vertices+index1*stride);
+        p = p->next();
+       
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else
+        {
+          /* find minimum start vertex */
+          const unsigned index0 = p->getStartVertexIndex();
+          if (index0 < min_vertex_index) { min_vertex_index = index0; min_vertex_index_face = i>>1; }
+
+          /*! mark first border edge and store dummy vertex for face between the two border edges */
+          border_index = i;
+          crease_weight[i/2] = inf; 
+          ring[i++] = Vertex_t::loadu(vertices+index0*stride);
+          ring[i++] = vtx; // dummy vertex
+          	  
+          /*! goto other side of border */
+          p = (HalfEdge*) h;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+
+      } while (p != h); 
+
+      edge_valence = i;
+      face_valence = i >> 1; // two ring entries are stored per face slot (the dummy border slot included)
+      eval_unique_identifier = min_vertex_index; // smallest vertex index among the edge neighbors examined above
+      eval_start_index = min_vertex_index_face;
+
+      assert( hasValidPositions() );
+    }
+      
+    __forceinline void subdivide(CatmullClark1RingT& dest) const
+    { // performs one subdivision step of this ring and writes the refined ring into dest
+      dest.edge_level             = 0.5f*edge_level;
+      dest.vertex_level           = 0.5f*vertex_level;
+      dest.face_valence           = face_valence;
+      dest.edge_valence           = edge_valence;
+      dest.border_index           = border_index;
+      dest.vertex_crease_weight   = max(0.0f,vertex_crease_weight-1.0f);
+      dest.eval_start_index       = eval_start_index;
+      dest.eval_unique_identifier = eval_unique_identifier;
+
+      /* calculate face points */
+      Vertex_t S = Vertex_t(0.0f);
+      for (size_t i=0; i<face_valence; i++) 
+      {
+        size_t face_index = i + eval_start_index; if (face_index >= face_valence) face_index -= face_valence; assert(face_index < face_valence);
+        size_t index0 = 2*face_index+0; if (index0 >= edge_valence) index0 -= edge_valence; assert(index0 < edge_valence);
+        size_t index1 = 2*face_index+1; if (index1 >= edge_valence) index1 -= edge_valence; assert(index1 < edge_valence);
+        size_t index2 = 2*face_index+2; if (index2 >= edge_valence) index2 -= edge_valence; assert(index2 < edge_valence);
+        S += dest.ring[index1] = ((vtx + ring[index1]) + (ring[index0] + ring[index2])) * 0.25f;
+      }
+      
+      /* calculate new edge points */
+      size_t num_creases = 0;
+      array_t<size_t,MAX_RING_FACE_VALENCE> crease_id;
+
+      for (size_t i=0; i<face_valence; i++)
+      {
+        size_t face_index = i + eval_start_index;
+        if (face_index >= face_valence) face_index -= face_valence;
+        const float edge_crease = crease_weight[face_index];
+        dest.crease_weight[face_index] = max(edge_crease-1.0f,0.0f);
+      
+        size_t index      = 2*face_index;
+        size_t prev_index = face_index == 0 ? edge_valence-1 : 2*face_index-1; // face point before this edge, wrapping to the last ring entry
+        size_t next_index = 2*face_index+1;
+
+        const Vertex_t v = vtx + ring[index];
+        const Vertex_t f = dest.ring[prev_index] + dest.ring[next_index];
+        S += ring[index];
+                
+        /* fast path for regular edge points */
+        if (likely(edge_crease <= 0.0f)) {
+          dest.ring[index] = (v+f) * 0.25f;
+        }
+        
+        /* slower path for hard edge rule */
+        else {
+          crease_id[num_creases++] = face_index;
+          dest.ring[index] = v*0.5f;
+	  
+          /* even slower path for blended edge rule */
+          if (unlikely(edge_crease < 1.0f)) {
+            dest.ring[index] = lerp((v+f)*0.25f,v*0.5f,edge_crease);
+          }
+        }
+      }
+      
+      /* compute new vertex using smooth rule */
+      const float inv_face_valence = 1.0f / (float)face_valence;
+      const Vertex_t v_smooth = (Vertex_t) madd(inv_face_valence,S,(float(face_valence)-2.0f)*vtx)*inv_face_valence;
+      dest.vtx = v_smooth;
+      
+      /* compute new vertex using vertex_crease_weight rule */
+      if (unlikely(vertex_crease_weight > 0.0f)) 
+      {
+        if (vertex_crease_weight >= 1.0f) {
+          dest.vtx = vtx;
+        } else {
+          dest.vtx = lerp(v_smooth,vtx,vertex_crease_weight);
+        }
+        return;
+      }
+      
+      /* no edge crease rule and dart rule */
+      if (likely(num_creases <= 1))
+        return;
+      
+      /* compute new vertex using crease rule */
+      if (likely(num_creases == 2)) 
+      {
+        /* update vertex using crease rule */
+        const size_t crease0 = crease_id[0], crease1 = crease_id[1];
+        const Vertex_t v_sharp = (Vertex_t)(ring[2*crease0] + 6.0f*vtx + ring[2*crease1]) * (1.0f / 8.0f);
+        dest.vtx = v_sharp;
+
+        /* update crease_weights using chaikin rule */
+        const float crease_weight0 = crease_weight[crease0], crease_weight1 = crease_weight[crease1];
+        dest.crease_weight[crease0] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f);
+        dest.crease_weight[crease1] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f);
+
+        /* interpolate between sharp and smooth rule */
+        const float v_blend = 0.5f*(crease_weight0+crease_weight1);
+        if (unlikely(v_blend < 1.0f)) {
+          dest.vtx = lerp(v_smooth,v_sharp,v_blend);
+        }
+      }
+      
+      /* compute new vertex using corner rule */
+      else {
+        dest.vtx = vtx;
+      }
+    }
+    
+    /* regular valence test:
+       an interior vertex (no border) needs exactly 4 faces,
+       a border vertex needs fewer than 4 face slots */
+    __forceinline bool isRegular1() const 
+    {
+      const bool interior = (border_index == -1);
+      if (interior) return face_valence == 4;
+      return face_valence < 4;
+    }
+
+    __forceinline size_t numEdgeCreases() const
+    {
+      /* a face slot counts as creased when its edge crease weight is strictly positive */
+      size_t count = 0;
+      for (size_t f=0; f<face_valence; f++)
+        if (crease_weight[f] > 0.0f) count++;
+      return count;
+    }
+
+    enum Type { // distinct bit values, so several entries can be or-ed into one mask (as done in type())
+      TYPE_NONE            = 0,      //!< invalid type
+      TYPE_REGULAR         = 1,      //!< regular patch when ignoring creases
+      TYPE_REGULAR_CREASES = 2,      //!< regular patch when considering creases
+      TYPE_GREGORY         = 4,      //!< gregory patch when ignoring creases
+      TYPE_GREGORY_CREASES = 8,      //!< gregory patch when considering creases
+      TYPE_CREASES         = 16      //!< patch has crease features
+    };
+    
+    __forceinline Type type() const
+    {
+      /* count the edge creases; on a border ring exactly two creases means there is no inner crease */
+      const size_t numCreases = numEdgeCreases();
+      const bool onBorder = border_index != -1;
+      const bool noInnerCreases = onBorder ? numCreases == 2 : numCreases == 0;
+      /* patch types that remain possible given the edge creases */
+      int crease_mask = TYPE_REGULAR | TYPE_GREGORY;
+      if (noInnerCreases ) crease_mask |= TYPE_REGULAR_CREASES | TYPE_GREGORY_CREASES;
+      if (numCreases != 0) crease_mask |= TYPE_CREASES;
+
+      /* type sets selected by the valence: regular vertices keep every type, irregular ones only the gregory variants */
+      const int all_types     = TYPE_REGULAR | TYPE_REGULAR_CREASES | TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES;
+      const int gregory_types = TYPE_GREGORY | TYPE_GREGORY_CREASES | TYPE_CREASES;
+      if (face_valence == 2 && onBorder) {
+        const bool plain_or_hard = vertex_crease_weight == 0.0f || vertex_crease_weight == float(inf);
+        return plain_or_hard ? (Type) (crease_mask & all_types) : TYPE_CREASES;
+      }
+      if (vertex_crease_weight != 0.0f) return TYPE_CREASES;
+      const bool regular = (face_valence == 3 && onBorder) || (face_valence == 4 && !onBorder);
+      return (Type) (crease_mask & (regular ? all_types : gregory_types));
+    }
+
+    __forceinline bool isFinalResolution(float res) const {
+      return vertex_level <= res; // vertex_level is the maximum level of all adjacent edges
+    }
+
+    /* computes the limit vertex */
+    __forceinline Vertex getLimitVertex() const
+    {
+      /* an infinite vertex crease weight pins the limit position to the vertex itself */
+      if (unlikely(std::isinf(vertex_crease_weight)))
+        return vtx;
+
+      /* border rule: weights 4:1:1 over the vertex and its two border neighbors */
+      if (unlikely(hasBorder()))
+      {
+        const int next = border_index+2;
+        const unsigned int second_border_index = next >= int(edge_valence) ? 0 : next;
+        return (4.0f * vtx + (ring[border_index] + ring[second_border_index])) * 1.0f/6.0f;
+      }
+
+      /* interior rule: sum the even ring entries (E) and the odd ring entries (F), starting at eval_start_index */
+      assert(eval_start_index < face_valence);
+      Vertex_t faceSum( 0.0f );
+      Vertex_t edgeSum( 0.0f );
+      for (size_t k=0; k<face_valence; k++) {
+        size_t face = k+eval_start_index;
+        if (face >= face_valence) face -= face_valence;
+        faceSum += ring[2*face+1];
+        edgeSum += ring[2*face];
+      }
+
+      const float n = (float)face_valence;
+      return (Vertex_t)(n*n*vtx+4.0f*edgeSum+faceSum) / ((n+5.0f)*n);
+    }
+    
+    /* gets limit tangent in the direction of edge vtx -> ring[0] */
+    __forceinline Vertex getLimitTangent() const 
+    {
+      /* infinite vertex crease weight: the tangent is the first edge itself */
+      if (unlikely(std::isinf(vertex_crease_weight)))
+        return ring[0] - vtx;
+      /* border vertex rule */
+      if (unlikely(border_index != -1))
+      {
+        if (border_index != (int)edge_valence-2 ) {
+          return ring[0] - vtx;
+        }
+        else
+        {
+          const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2;
+          return (ring[second_border_index] - ring[border_index]) * 0.5f;
+        }
+      }
+
+      Vertex_t alpha( 0.0f );
+      Vertex_t beta ( 0.0f );
+
+      const size_t n = face_valence;
+
+      assert(eval_start_index < face_valence);
+
+      /* weighted sums over the even ring entries (alpha) and odd ring entries (beta), visited from eval_start_index on */
+      for (size_t i=0; i<face_valence; i++)
+      {
+        size_t index = i+eval_start_index;
+        if (index >= face_valence) index -= face_valence;
+        const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(index,n);
+        const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(index,n);
+        alpha +=  a * ring[2*index];
+        beta  +=  b * ring[2*index+1];
+      }
+
+      const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n);
+      return sigma * (alpha + beta);
+    }
+    
+    /* gets the second limit tangent; in the crease and border cases below this is the direction of edge vtx -> ring[2] */
+    __forceinline Vertex getSecondLimitTangent() const 
+    {
+      if (unlikely(std::isinf(vertex_crease_weight)))
+        return ring[2] - vtx;
+ 
+      /* border vertex rule */
+      if (unlikely(border_index != -1))
+      {
+        if (border_index != 2) {
+          return ring[2] - vtx;
+        }
+        else {
+          const unsigned int second_border_index = border_index+2 >= int(edge_valence) ? 0 : border_index+2;
+          return (ring[border_index] - ring[second_border_index]) * 0.5f;
+        }
+      }
+      
+      Vertex_t alpha( 0.0f );
+      Vertex_t beta ( 0.0f );
+
+      const size_t n = face_valence;
+
+      assert(eval_start_index < face_valence);
+
+      for (size_t i=0; i<face_valence; i++)
+      {
+        size_t index = i+eval_start_index;
+        if (index >= face_valence) index -= face_valence;
+        /* same coefficient tables as getLimitTangent(), but looked up one face slot earlier */
+        size_t prev_index = index == 0 ? face_valence-1 : index-1; // need to be bit-wise exact in cosf eval
+        const float a = CatmullClarkPrecomputedCoefficients::table.limittangent_a(prev_index,n);
+        const float b = CatmullClarkPrecomputedCoefficients::table.limittangent_b(prev_index,n);
+	alpha += a * ring[2*index];
+	beta  += b * ring[2*index+1];
+      }
+
+      const float sigma = CatmullClarkPrecomputedCoefficients::table.limittangent_c(n);
+      return sigma* (alpha + beta);      
+    }
+
+    /* gets surface normal as the cross product of the first and the second limit tangent */
+    const Vertex getNormal() const  {
+      const Vertex first = getLimitTangent(), second = getSecondLimitTangent(); return cross(first,second);
+    }
+    
+    /* returns center of the n-th quad in the 1-ring */
+    __forceinline Vertex getQuadCenter(const size_t index) const
+    {
+      const size_t last = (index == face_valence-1) ? 0 : 2*index+2; // the last quad wraps around to ring[0]
+      const Vertex_t &c0 = vtx;
+      const Vertex_t &c1 = ring[2*index+0];
+      const Vertex_t &c2 = ring[2*index+1];
+      const Vertex_t &c3 = ring[last];
+      const Vertex center = (c0+c1+c2+c3) * 0.25f; return center;
+    }
+    
+    /* returns center of the n-th edge in the 1-ring, i.e. the midpoint between vtx and ring[2*index] */
+    __forceinline Vertex getEdgeCenter(const size_t index) const {
+      return (vtx + ring[index*2]) * 0.5f;
+    }
+
+    bool hasValidPositions() const
+    {
+      /* scans the ring and stops at the first vertex rejected by isvalid();
+         the center vertex itself is not inspected */
+      size_t k = 0;
+      while (k<edge_valence && isvalid(ring[k])) k++;
+      return k == edge_valence;
+    }
+
+    friend __forceinline embree_ostream operator<<(embree_ostream o, const CatmullClark1RingT &c)
+    {
+      o << "vtx " << c.vtx << " size = " << c.edge_valence << ", " << 
+        "hard_edge = " << c.border_index << ", face_valence " << c.face_valence << 
+        ", edge_level = " << c.edge_level << ", vertex_level = " << c.vertex_level << ", eval_start_index: " << c.eval_start_index << ", ring: " << embree_endl;
+      /* print every ring entry; the clamp guards against an out-of-range edge_valence and uses the capacity of the ring array */
+      for (unsigned int i=0; i<min(c.edge_valence,(unsigned int)MAX_RING_EDGE_VALENCE); i++) {
+        o << i << " -> " << c.ring[i];
+        if (i % 2 == 0) o << " crease = " << c.crease_weight[i/2]; // crease weights belong to the even ring entries
+        o << embree_endl;
+      }
+      return o;
+    } 
+  };
+
+  typedef CatmullClark1RingT<Vec3fa,Vec3fa_t> CatmullClark1Ring3fa;
+  
+  template<typename Vertex, typename Vertex_t = Vertex>
+    struct __aligned(64) GeneralCatmullClark1RingT
+  {
+    ALIGNED_STRUCT_(64);
+    
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring;
+    
+    struct Face 
+    {
+      __forceinline Face() {} // leaves both members unset
+      __forceinline Face (int size, float crease_weight)
+        : size(size), crease_weight(crease_weight) {}
+
+      // FIXME: add member that returns total number of vertices
+
+      int size;              // number of vertices-2 of nth face in ring
+      float crease_weight;   // edge crease weight recorded for this face (init() takes it from the half edge that starts the face)
+    };
+
+    Vertex vtx;
+    DynamicStackArray<Vertex,32,MAX_RING_EDGE_VALENCE> ring; 
+    DynamicStackArray<Face,16,MAX_RING_FACE_VALENCE> faces;
+    unsigned int face_valence;
+    unsigned int edge_valence;
+    int border_face;
+    float vertex_crease_weight;
+    float vertex_level;                      //!< maximum level of adjacent edges
+    float edge_level;                        // level of first edge
+    bool only_quads;                         // true if all faces are quads
+    unsigned int eval_start_face_index;
+    unsigned int eval_start_vertex_index;
+    unsigned int eval_unique_identifier;
+
+  public:
+    GeneralCatmullClark1RingT() 
+      : eval_start_face_index(0), eval_start_vertex_index(0), eval_unique_identifier(0) {} // only the three evaluation-order fields are set here
+
+    /* regular means: no border and exactly four faces around the vertex */
+    __forceinline bool isRegular() const 
+    {
+      return (border_face == -1) && (face_valence == 4);
+    }
+    
+    __forceinline bool has_last_face() const {
+      return border_face != (int)face_valence-1; // false only when the border face is the last face of the ring
+    }
+    
+    __forceinline bool has_second_face() const {
+      return (border_face == -1) || (border_face >= 2); // false when the border face has index 0 or 1
+    }
+
+    bool hasValidPositions() const
+    {
+      /* scans the ring and stops at the first vertex rejected by isvalid();
+         the center vertex itself is not inspected */
+      size_t k = 0;
+      while (k<edge_valence && isvalid(ring[k])) k++;
+      return k == edge_valence;
+    }
+
+    __forceinline void init(const HalfEdge* const h, const char* vertices, size_t stride)
+    { // builds the ring by walking the faces around the start vertex of h; vertex k is read at vertices+k*stride
+      only_quads = true;
+      border_face = -1;
+      vtx = Vertex_t::loadu(vertices+h->getStartVertexIndex()*stride);
+      vertex_crease_weight = h->vertex_crease_weight;
+      HalfEdge* p = (HalfEdge*) h;
+      
+      unsigned int e=0, f=0; // e: ring entries stored so far, f: faces stored so far
+      unsigned min_vertex_index = (unsigned)-1;
+      unsigned min_vertex_index_face = (unsigned)-1;
+      unsigned min_vertex_index_vertex = (unsigned)-1;
+      edge_level = p->edge_level;
+      vertex_level = 0.0f;
+      do 
+      {
+        HalfEdge* p_prev = p->prev();
+        HalfEdge* p_next = p->next();
+        const float crease_weight = p->edge_crease_weight;
+         assert(p->hasOpposite() || p->edge_crease_weight == float(inf));
+        vertex_level = max(vertex_level,p->edge_level);
+
+        /* find minimum start vertex */
+        unsigned vertex_index = p_next->getStartVertexIndex();
+        if (vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; }
+
+	/* store first N-2 vertices of face */
+	unsigned int vn = 0;
+        for (p = p_next; p!=p_prev; p=p->next()) {
+          ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride);
+          vn++;
+	}
+	faces[f++] = Face(vn,crease_weight);
+	only_quads &= (vn == 2); // a quad contributes exactly two ring vertices
+	
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else
+        {
+          /* find minimum start vertex */
+          unsigned vertex_index = p->getStartVertexIndex();
+          if (vertex_index < min_vertex_index) { min_vertex_index = vertex_index; min_vertex_index_face = f; min_vertex_index_vertex = e; }
+
+          /*! mark first border edge and store dummy vertex for face between the two border edges */
+          border_face = f;
+	  faces[f++] = Face(2,inf); 
+          ring[e++] = Vertex_t::loadu(vertices+p->getStartVertexIndex()*stride);
+          ring[e++] = vtx; // dummy vertex
+	  
+          /*! goto other side of border */
+          p = (HalfEdge*) h;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+	
+      } while (p != h); 
+      
+      edge_valence = e;
+      face_valence = f;
+      eval_unique_identifier = min_vertex_index; // smallest vertex index among the candidates examined above
+      eval_start_face_index = min_vertex_index_face;
+      eval_start_vertex_index = min_vertex_index_vertex;
+
+      assert( hasValidPositions() );
+    }
+    
+    __forceinline void subdivide(CatmullClark1Ring& dest) const
+    { // performs one subdivision step of this general ring and writes the result as a quad-only ring into dest
+      dest.edge_level = 0.5f*edge_level;
+      dest.vertex_level = 0.5f*vertex_level;
+      dest.face_valence = face_valence;
+      dest.edge_valence = 2*face_valence;
+      dest.border_index = border_face == -1 ? -1 : 2*border_face; // FIXME:
+      dest.vertex_crease_weight    = max(0.0f,vertex_crease_weight-1.0f);
+      dest.eval_start_index        = eval_start_face_index;
+      dest.eval_unique_identifier  = eval_unique_identifier;
+      assert(dest.face_valence <= MAX_RING_FACE_VALENCE);
+
+      /* calculate face points */
+      Vertex_t S = Vertex_t(0.0f);
+      for (size_t face=0, v=eval_start_vertex_index; face<face_valence; face++) {
+        size_t f = (face + eval_start_face_index)%face_valence;
+        /* average of the center vertex and the faces[f].size+1 ring vertices that belong to face f */
+        Vertex_t F = vtx;
+        for (size_t k=v; k<=v+faces[f].size; k++) F += ring[k%edge_valence]; // FIXME: optimize
+        S += dest.ring[2*f+1] = F/float(faces[f].size+2);
+        v+=faces[f].size;
+        v%=edge_valence;
+      }
+
+      /* calculate new edge points */
+      size_t num_creases = 0;
+      array_t<size_t,MAX_RING_FACE_VALENCE> crease_id;
+      Vertex_t C = Vertex_t(0.0f);
+      for (size_t face=0, j=eval_start_vertex_index; face<face_valence; face++)
+      {
+        size_t i = (face + eval_start_face_index)%face_valence;
+
+        const Vertex_t v = vtx + ring[j];
+        Vertex_t f = dest.ring[2*i+1];
+        if (i == 0) f += dest.ring[dest.edge_valence-1];
+        else        f += dest.ring[2*i-1];
+        S += ring[j];
+        dest.crease_weight[i] = max(faces[i].crease_weight-1.0f,0.0f);
+
+        /* fast path for regular edge points */
+        if (likely(faces[i].crease_weight <= 0.0f)) {
+          dest.ring[2*i] = (v+f) * 0.25f;
+        }
+
+        /* slower path for hard edge rule */
+        else {
+          C += ring[j]; crease_id[num_creases++] = i;
+          dest.ring[2*i] = v*0.5f;
+
+          /* even slower path for blended edge rule */
+          if (unlikely(faces[i].crease_weight < 1.0f)) {
+            dest.ring[2*i] = lerp((v+f)*0.25f,v*0.5f,faces[i].crease_weight);
+          }
+        }
+        j+=faces[i].size;
+        j%=edge_valence;
+      }
+
+      /* compute new vertex using smooth rule */
+      const float inv_face_valence = 1.0f / (float)face_valence;
+      const Vertex_t v_smooth = (Vertex_t) madd(inv_face_valence,S,(float(face_valence)-2.0f)*vtx)*inv_face_valence;
+      dest.vtx = v_smooth;
+      /* blends below go from the smooth result (weight 0) to the sharp result (weight 1), like the edge rule above */
+      /* compute new vertex using vertex_crease_weight rule */
+      if (unlikely(vertex_crease_weight > 0.0f))
+      {
+        if (vertex_crease_weight >= 1.0f) {
+          dest.vtx = vtx;
+        } else {
+          dest.vtx = lerp(v_smooth,vtx,vertex_crease_weight);
+        }
+        return;
+      }
+      /* zero or one creased edge: keep the smooth vertex */
+      if (likely(num_creases <= 1))
+        return;
+
+      /* compute new vertex using crease rule */
+      if (likely(num_creases == 2)) {
+        const Vertex_t v_sharp = (Vertex_t)(C + 6.0f * vtx) * (1.0f / 8.0f);
+        const float crease_weight0 = faces[crease_id[0]].crease_weight;
+        const float crease_weight1 = faces[crease_id[1]].crease_weight;
+        dest.vtx = v_sharp;
+        dest.crease_weight[crease_id[0]] = max(0.25f*(3.0f*crease_weight0 + crease_weight1)-1.0f,0.0f);
+        dest.crease_weight[crease_id[1]] = max(0.25f*(3.0f*crease_weight1 + crease_weight0)-1.0f,0.0f);
+        const float v_blend = 0.5f*(crease_weight0+crease_weight1);
+        if (unlikely(v_blend < 1.0f)) {
+          dest.vtx = lerp(v_smooth,v_sharp,v_blend);
+        }
+      }
+
+      /* compute new vertex using corner rule */
+      else {
+        dest.vtx = vtx;
+      }
+    }
+
+    void convert(CatmullClark1Ring& dst) const
+    {
+      /* scalar state is copied as is; the border face index becomes an index into the edge ring */
+      dst.vtx = vtx;
+      dst.vertex_level = vertex_level;
+      dst.edge_level = edge_level;
+      dst.vertex_crease_weight = vertex_crease_weight;
+      dst.face_valence = face_valence;
+      dst.edge_valence = 2*face_valence;
+      dst.border_index = (border_face == -1) ? -1 : 2*border_face;
+      dst.eval_start_index = eval_start_face_index;
+      dst.eval_unique_identifier = eval_unique_identifier;
+
+      /* per-face crease weights, then the ring vertices (bounded by the edge_valence of this ring) */
+      for (size_t f=0; f<face_valence; f++) dst.crease_weight[f] = faces[f].crease_weight;
+      for (size_t e=0; e<edge_valence; e++) dst.ring[e] = ring[e];
+      assert( dst.hasValidPositions() );
+    }
+
+
+    /* gets limit tangent in the direction of edge vtx -> ring[0] */
+    __forceinline Vertex getLimitTangent() const 
+    {
+      CatmullClark1Ring cc_vtx;
+
+      /* rings with non-quad faces are subdivided once first;
+         the tangent of the refined ring is then scaled by 2 */
+      if (!only_quads) {
+        subdivide(cc_vtx);
+        return 2.0f * cc_vtx.getLimitTangent();
+      }
+      /* fast path for quad only rings */
+      convert(cc_vtx);
+      return cc_vtx.getLimitTangent();
+    }
+
+    /* gets the second limit tangent (delegates to CatmullClark1Ring::getSecondLimitTangent) */
+    __forceinline Vertex getSecondLimitTangent() const 
+    {
+      CatmullClark1Ring cc_vtx;
+
+      /* rings with non-quad faces are subdivided once first;
+         the tangent of the refined ring is then scaled by 2 */
+      if (!only_quads) {
+        subdivide(cc_vtx);
+        return 2.0f * cc_vtx.getSecondLimitTangent();
+      }
+      /* fast path for quad only rings */
+      convert(cc_vtx);
+      return cc_vtx.getSecondLimitTangent();
+    }
+
+
+    /* gets limit vertex */
+    __forceinline Vertex getLimitVertex() const 
+    {
+      CatmullClark1Ring cc_vtx;
+
+      /* quad only rings are converted directly (fast path), all other rings are subdivided once */
+      if (!only_quads) {
+        subdivide(cc_vtx);
+      } else
+        convert(cc_vtx);
+      return cc_vtx.getLimitVertex();
+    }
+
+    friend __forceinline embree_ostream operator<<(embree_ostream o, const GeneralCatmullClark1RingT &c)
+    {
+      o << "vtx " << c.vtx << " size = " << c.edge_valence << ", border_face = " << c.border_face << ", " << " face_valence = " << c.face_valence;
+      o << ", edge_level = " << c.edge_level << ", vertex_level = " << c.vertex_level << ", ring: " << embree_endl;
+      for (size_t first=0, f=0; f<c.face_valence; first+=c.faces[f].size, f++) {
+        for (size_t i=first; i<first+c.faces[f].size; i++) {
+          o << i << " -> " << c.ring[i];
+          if (i == first) o << " crease = " << c.faces[f].crease_weight; // shown on the first entry of each face only
+          o << embree_endl;
+        }
+      }
+      return o;
+    } 
+  };  
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/catmullrom_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/catmullrom_curve.h
new file mode 100644
index 0000000000..b244af481c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/catmullrom_curve.h
@@ -0,0 +1,296 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "../common/scene_curves.h"
+
+/*
+
+  Implements Catmull-Rom curves with control points p0, p1, p2, p3. At
+  t=0 the curve goes through p1, with tangent (p2-p0)/2, and for t=1
+  the curve goes through p2 with tangent (p3-p1)/2.
+
+ */
+
+namespace embree
+{
+  class CatmullRomBasis // closed-form Catmull-Rom basis weights and their first/second derivatives
+  {
+  public:
+    /* returns the four basis weights (applied to p0,p1,p2,p3) at parameter u */
+    template<typename T>
+      static __forceinline Vec4<T> eval(const T& u) 
+    {
+      const T t  = u;
+      const T s  = T(1.0f) - u;
+      const T n0 = - t * s * s;
+      const T n1 = 2.0f + t * t * (3.0f * t - 5.0f);
+      const T n2 = 2.0f + s * s * (3.0f * s - 5.0f); // mirror image of n1 with s = 1-t
+      const T n3 = - s * t * t;                      // mirror image of n0
+      return T(0.5f) * Vec4<T>(n0, n1, n2, n3);
+    }
+    /* returns the derivative of the eval() weights with respect to u */
+    template<typename T>
+      static __forceinline Vec4<T>  derivative(const T& u)
+    {
+      const T t  =  u;
+      const T s  =  1.0f - u;
+      const T n0 =  - s * s + 2.0f * s * t;
+      const T n1 =  2.0f * t * (3.0f * t - 5.0f) + 3.0f * t * t;
+      const T n2 =  2.0f * s * (3.0f * t + 2.0f) - 3.0f * s * s; // d/du of 2 + s*s*(3s-5), using 3t+2 == 5-3s
+      const T n3 = -2.0f * s * t + t * t;
+      return T(0.5f) * Vec4<T>(n0, n1, n2, n3);
+    }
+    /* returns the second derivative of the eval() weights with respect to u */
+    template<typename T>
+      static __forceinline Vec4<T>  derivative2(const T& u)
+    {
+      const T t  =  u;
+      const T n0 = -3.0f * t + 2.0f;
+      const T n1 =  9.0f * t - 5.0f;
+      const T n2 = -9.0f * t + 4.0f;
+      const T n3 =  3.0f * t - 1.0f;
+      return Vec4<T>(n0, n1, n2, n3); // the 0.5 factor of eval() is already folded into n0..n3
+    }
+  };
+  
+  struct PrecomputedCatmullRomBasis
+  {
+    enum { N = 16 }; // largest supported table index; every table is (N+1) x (N+1)
+  public:
+    PrecomputedCatmullRomBasis() {}
+    PrecomputedCatmullRomBasis(int shift); // defined elsewhere; presumably fills the tables - meaning of shift is not visible here
+
+    /* basis for catmull-rom evaluation: ck[size][ofs] is the weight applied to control point vk */
+  public:
+    float c0[N+1][N+1];
+    float c1[N+1][N+1];
+    float c2[N+1][N+1];
+    float c3[N+1][N+1];
+    
+    /* basis for catmull-rom derivative evaluation: dk uses the same [size][ofs] indexing as ck */
+  public:
+    float d0[N+1][N+1];
+    float d1[N+1][N+1];
+    float d2[N+1][N+1];
+    float d3[N+1][N+1];
+  };
+  extern PrecomputedCatmullRomBasis catmullrom_basis0;
+  extern PrecomputedCatmullRomBasis catmullrom_basis1;
+
+  template<typename Vertex>
+    struct CatmullRomCurveT
+    {
+      Vertex v0,v1,v2,v3;
+      
+      __forceinline CatmullRomCurveT() {}
+      
+      __forceinline CatmullRomCurveT(const Vertex& v0, const Vertex& v1, const Vertex& v2, const Vertex& v3)
+        : v0(v0), v1(v1), v2(v2), v3(v3) {}
+
+      /* First point of the curve. CatmullRomBasis::eval(0) yields the weights
+         (0,1,0,0), so the curve starts exactly at v1 (no B-spline style blend). */
+      __forceinline Vertex begin() const { return v1; }
+
+      /* Last point of the curve. CatmullRomBasis::eval(1) yields the weights
+         (0,0,1,0), so the curve ends exactly at v2; bounds code samples this point. */
+      __forceinline Vertex end() const { return v2; }
+
+      __forceinline Vertex center() const {
+        return 0.25f*(v0+v1+v2+v3);
+      }
+
+      __forceinline BBox<Vertex> bounds() const {
+        return merge(BBox<Vertex>(v0),BBox<Vertex>(v1),BBox<Vertex>(v2),BBox<Vertex>(v3));
+      }
+
+      __forceinline friend CatmullRomCurveT operator -( const CatmullRomCurveT& a, const Vertex& b ) {
+        return CatmullRomCurveT(a.v0-b,a.v1-b,a.v2-b,a.v3-b);
+      }
+
+      __forceinline CatmullRomCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        /* per control point: subtract p, apply xfmVector with space, carry w over unchanged */
+        auto toLocal = [&](const Vertex& v) { return Vec3ff(xfmVector(space,v-p), v.w); };
+        const Vec3ff q0 = toLocal(v0), q1 = toLocal(v1);
+        const Vec3ff q2 = toLocal(v2), q3 = toLocal(v3);
+        return CatmullRomCurveT<Vec3ff>(q0,q1,q2,q3);
+      }
+      
+      __forceinline Vertex eval(const float t) const 
+      {
+        const Vec4<float> b = CatmullRomBasis::eval(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      
+      __forceinline Vertex eval_du(const float t) const
+      {
+        const Vec4<float> b = CatmullRomBasis::derivative(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      
+      __forceinline Vertex eval_dudu(const float t) const 
+      {
+        const Vec4<float> b = CatmullRomBasis::derivative2(t);
+        return madd(b.x,v0,madd(b.y,v1,madd(b.z,v2,b.w*v3)));
+      }
+      
+      __forceinline void eval(const float t, Vertex& p, Vertex& dp, Vertex& ddp) const
+      {
+        p = eval(t);
+        dp = eval_du(t);
+        ddp = eval_dudu(t);
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = CatmullRomBasis::eval(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_du(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = CatmullRomBasis::derivative(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline Vec4vf<M> veval_dudu(const vfloat<M>& t) const 
+      {
+        const Vec4vf<M> b = CatmullRomBasis::derivative2(t);
+        return madd(b.x, Vec4vf<M>(v0), madd(b.y, Vec4vf<M>(v1), madd(b.z, Vec4vf<M>(v2), b.w * Vec4vf<M>(v3))));
+      }
+
+      template<int M>
+      __forceinline void veval(const vfloat<M>& t, Vec4vf<M>& p, Vec4vf<M>& dp) const
+      {
+        p = veval(t);
+        dp = veval_du(t);
+      }
+      
+      template<int M> // M-wide evaluation using the precomputed weights of catmullrom_basis0
+      __forceinline Vec4vf<M> eval0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedCatmullRomBasis::N); // size selects the table row
+        assert(ofs <= size); // only the first loaded entry is checked; loadu presumably reads M floats from [size][ofs] on - TODO confirm
+        return madd(vfloat<M>::loadu(&catmullrom_basis0.c0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&catmullrom_basis0.c1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&catmullrom_basis0.c2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&catmullrom_basis0.c3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> eval1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedCatmullRomBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&catmullrom_basis1.c0[size][ofs]), Vec4vf<M>(v0), 
+                    madd(vfloat<M>::loadu(&catmullrom_basis1.c1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&catmullrom_basis1.c2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&catmullrom_basis1.c3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> derivative0(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedCatmullRomBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&catmullrom_basis0.d0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&catmullrom_basis0.d1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&catmullrom_basis0.d2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&catmullrom_basis0.d3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      template<int M>
+      __forceinline Vec4vf<M> derivative1(const int ofs, const int size) const
+      {
+        assert(size <= PrecomputedCatmullRomBasis::N);
+        assert(ofs <= size);
+        return madd(vfloat<M>::loadu(&catmullrom_basis1.d0[size][ofs]), Vec4vf<M>(v0),
+                    madd(vfloat<M>::loadu(&catmullrom_basis1.d1[size][ofs]), Vec4vf<M>(v1),
+                         madd(vfloat<M>::loadu(&catmullrom_basis1.d2[size][ofs]), Vec4vf<M>(v2),
+                              vfloat<M>::loadu(&catmullrom_basis1.d3[size][ofs]) * Vec4vf<M>(v3))));
+      }
+      
+      /* calculates bounds of catmull-rom curve geometry */
+      __forceinline BBox3fa accurateRoundBounds() const
+      {
+        const int N = 7; // samples i = 0..N are read from row [N] of the precomputed tables
+        const float scale = 1.0f/(3.0f*(N-1)); // step taken along the derivative on both sides of every sample
+        Vec4vfx pl(pos_inf), pu(neg_inf);
+        for (int i=0; i<=N; i+=VSIZEX)
+        {
+          vintx vi = vintx(i)+vintx(step);
+          vboolx valid = vi <= vintx(N); // lanes beyond sample N are masked out
+          const Vec4vfx p  = eval0<VSIZEX>(i,N);
+          const Vec4vfx dp = derivative0<VSIZEX>(i,N);
+          const Vec4vfx pm = p-Vec4vfx(scale)*select(vi!=vintx(0),dp,Vec4vfx(zero)); // no backward offset at the first sample
+          const Vec4vfx pp = p+Vec4vfx(scale)*select(vi!=vintx(N),dp,Vec4vfx(zero)); // no forward offset at the last sample
+          pl = select(valid,min(pl,p,pm,pp),pl); // FIXME: use masked min
+          pu = select(valid,max(pu,p,pm,pp),pu); // FIXME: use masked max
+        }
+        const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+        const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+        const float r_min = reduce_min(pl.w); // the w component is treated as the radius
+        const float r_max = reduce_max(pu.w);
+        const Vec3fa upper_r = Vec3fa(max(abs(r_min),abs(r_max)));
+        return enlarge(BBox3fa(lower,upper),upper_r); // box of the sampled positions, enlarged by the largest |radius|
+      }
+      
+      /* calculates bounds when tessellated into N line segments */
+      __forceinline BBox3fa accurateFlatBounds(int N) const
+      {
+        if (likely(N == 4)) // fast path: one 4-wide table load starting at sample 0
+        {
+          const Vec4vf4 pi = eval0<4>(0,4);
+          const Vec3fa lower(reduce_min(pi.x),reduce_min(pi.y),reduce_min(pi.z));
+          const Vec3fa upper(reduce_max(pi.x),reduce_max(pi.y),reduce_max(pi.z));
+          const Vec3fa upper_r = Vec3fa(reduce_max(abs(pi.w)));
+          const Vec3ff pe = end(); // presumably stands in for sample 4, which the 4-wide load above does not cover
+          return enlarge(BBox3fa(min(lower,pe),max(upper,pe)),max(upper_r,Vec3fa(abs(pe.w))));
+        } 
+        else // general path: walk samples 0..N in VSIZEX-wide steps
+        {
+          Vec3vfx pl(pos_inf), pu(neg_inf); vfloatx ru(0.0f);
+          for (int i=0; i<=N; i+=VSIZEX)
+          {
+            vboolx valid = vintx(i)+vintx(step) <= vintx(N); // lanes beyond sample N are masked out
+            const Vec4vfx pi = eval0<VSIZEX>(i,N);
+            
+            pl.x = select(valid,min(pl.x,pi.x),pl.x); // FIXME: use masked min
+            pl.y = select(valid,min(pl.y,pi.y),pl.y); 
+            pl.z = select(valid,min(pl.z,pi.z),pl.z); 
+            
+            pu.x = select(valid,max(pu.x,pi.x),pu.x); // FIXME: use masked max
+            pu.y = select(valid,max(pu.y,pi.y),pu.y); 
+            pu.z = select(valid,max(pu.z,pi.z),pu.z); 
+            
+            ru = select(valid,max(ru,abs(pi.w)),ru); // largest |w| seen so far, used to enlarge the box below
+          }
+          const Vec3fa lower(reduce_min(pl.x),reduce_min(pl.y),reduce_min(pl.z));
+          const Vec3fa upper(reduce_max(pu.x),reduce_max(pu.y),reduce_max(pu.z));
+          const Vec3fa upper_r(reduce_max(ru));
+          return enlarge(BBox3fa(lower,upper),upper_r);
+        }
+      }
+      
+      friend __forceinline embree_ostream operator<<(embree_ostream cout, const CatmullRomCurveT& curve) {
+        return cout << "CatmullRomCurve { v0 = " << curve.v0 << ", v1 = " << curve.v1 << ", v2 = " << curve.v2 << ", v3 = " << curve.v3 << " }";
+      }
+    };
+
+  __forceinline CatmullRomCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const CatmullRomCurveT<Vec3ff>& curve)
+  {
+    /* forwards every control point to the per-vertex overload and rebuilds the curve from the results */
+    auto perVertex = [&](const Vec3ff& v) { return enlargeRadiusToMinWidth(context,geom,ray_org,v); };
+    const CatmullRomCurveT<Vec3ff> widened(perVertex(curve.v0),perVertex(curve.v1),perVertex(curve.v2),perVertex(curve.v3));
+    return widened;
+  }
+  
+  typedef CatmullRomCurveT<Vec3fa> CatmullRomCurve3fa;
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval.h b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval.h
new file mode 100644
index 0000000000..23f24c360c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval.h
@@ -0,0 +1,226 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename Vertex, typename Vertex_t = Vertex>
+      struct FeatureAdaptiveEval
+      {
+      public:
+        
+        typedef PatchT<Vertex,Vertex_t> Patch;
+        typedef typename Patch::Ref Ref;
+        typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch;
+        typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+        typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+        typedef BSplinePatchT<Vertex,Vertex_t> BSplinePatch;
+        typedef BezierPatchT<Vertex,Vertex_t> BezierPatch;
+        typedef GregoryPatchT<Vertex,Vertex_t> GregoryPatch;
+        typedef BilinearPatchT<Vertex,Vertex_t> BilinearPatch;
+        typedef BezierCurveT<Vertex> BezierCurve;
+        
+      public:
+        
+        FeatureAdaptiveEval (const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, // evaluates the patch of edge at (u,v)
+                             Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv) // output pointers; eval_general_quad() null-checks dPdu/dPdv, so outputs seem optional
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv)
+        {
+          switch (edge->patch_type) { // patch types with a direct evaluator are handled here with derivative scale 1.0f
+          case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break;
+          case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break;
+#if PATCH_USE_GREGORY == 2
+          case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatch(edge,vertices,stride).eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f); break;
+#endif
+          default: { // every other patch type: build the general patch and evaluate it feature-adaptively
+            GeneralCatmullClarkPatch patch(edge,vertices,stride);
+            eval(patch,Vec2f(u,v),0); // recursion starts at depth 0
+            break;
+          }
+          }
+        }
+
+        FeatureAdaptiveEval (CatmullClarkPatch& patch, const float u, const float v, float dscale, size_t depth, 
+                             Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv)
+        {
+          eval(patch,Vec2f(u,v),dscale,depth);
+        }
+        
+        void eval_general_quad(const GeneralCatmullClarkPatch& patch, array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE>& patches, const Vec2f& uv, size_t depth)
+        { // selects the sub-patch (quadrant) containing uv, evaluates it, then maps dPdu/dPdv back to the parent's axes
+          float u = uv.x, v = uv.y;
+          if (v < 0.5f) {
+            if (u < 0.5f) { // quadrant 0: local uv = (2u,2v)
+#if PATCH_USE_GREGORY == 2
+              BezierCurve borders[2]; patch.getLimitBorder(borders,0);
+              BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+              BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+              eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+              eval(patches[0],Vec2f(2.0f*u,2.0f*v),2.0f,depth+1);
+#endif
+              if (dPdu && dPdv) {
+                const Vertex dpdx = *dPdu, dpdy = *dPdv;
+                *dPdu = dpdx; *dPdv = dpdy; // same orientation as the parent: values are written back unchanged
+              }
+            }
+            else { // quadrant 1: local uv = (2v,2-2u)
+#if PATCH_USE_GREGORY == 2
+              BezierCurve borders[2]; patch.getLimitBorder(borders,1);
+              BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+              BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+              eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+              eval(patches[1],Vec2f(2.0f*v,2.0f-2.0f*u),2.0f,depth+1);
+#endif
+              if (dPdu && dPdv) { // NOTE(review): only first derivatives are remapped; ddPdudu/ddPdvdv/ddPdudv stay as the sub-patch wrote them - confirm
+                const Vertex dpdx = *dPdu, dpdy = *dPdv;
+                *dPdu = -dpdy; *dPdv = dpdx; // local x follows parent v, local y runs against parent u
+              }
+            }
+          } else {
+            if (u > 0.5f) { // quadrant 2: local uv = (2-2u,2-2v)
+#if PATCH_USE_GREGORY == 2
+              BezierCurve borders[2]; patch.getLimitBorder(borders,2);
+              BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+              BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+              eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+              eval(patches[2],Vec2f(2.0f-2.0f*u,2.0f-2.0f*v),2.0f,depth+1);
+#endif
+              if (dPdu && dPdv) {
+                const Vertex dpdx = *dPdu, dpdy = *dPdv;
+                *dPdu = -dpdx; *dPdv = -dpdy; // both local axes run against the parent's axes
+              }
+            }
+            else { // quadrant 3: local uv = (2-2v,2u)
+#if PATCH_USE_GREGORY == 2
+              BezierCurve borders[2]; patch.getLimitBorder(borders,3);
+              BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+              BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+              eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+              eval(patches[3],Vec2f(2.0f-2.0f*v,2.0f*u),2.0f,depth+1);
+#endif
+              if (dPdu && dPdv) {
+                const Vertex dpdx = *dPdu, dpdy = *dPdv;
+                *dPdu = dpdy; *dPdv = -dpdx; // local x runs against parent v, local y follows parent u
+              }
+            }
+          }
+        }
+
+        __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) 
+        {
+          /* rings flagged TYPE_CREASES use the crease depth limit, all others the irregular one;
+             the PATCH_MIN_RESOLUTION early-out (patch.isFinalResolution) is disabled, patch is unused */
+          int depth_limit = PATCH_MAX_EVAL_DEPTH_IRREGULAR;
+          if (type & CatmullClarkRing::TYPE_CREASES)
+            depth_limit = PATCH_MAX_EVAL_DEPTH_CREASE;
+          return !(depth < (size_t)depth_limit);
+        }
+        
+        void eval(CatmullClarkPatch& patch, Vec2f uv, float dscale, size_t depth, // patch, uv, dscale and depth are updated in place while descending
+                  BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr)
+        {
+          while (true) // each round either evaluates a terminal patch type and returns, or steps into one sub-patch
+          {
+            typename CatmullClarkPatch::Type ty = patch.type();
+
+            if (unlikely(final(patch,ty,depth))) // depth limit reached: evaluate with a regular or fill patch
+            {
+              if (ty & CatmullClarkRing::TYPE_REGULAR) { 
+                RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+                PATCH_DEBUG_SUBDIVISION(234423,c,c,-1);
+                return;
+              } else {
+                IrregularFillPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+                PATCH_DEBUG_SUBDIVISION(34534,c,-1,c);
+                return;
+              }
+            }
+            else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { 
+              assert(depth > 0); 
+              RegularPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+              PATCH_DEBUG_SUBDIVISION(43524,c,c,-1);
+              return;
+            }
+#if PATCH_USE_GREGORY == 2
+            else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { 
+              assert(depth > 0); 
+              GregoryPatch(patch,border0,border1,border2,border3).eval(uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+              PATCH_DEBUG_SUBDIVISION(23498,c,-1,c);
+              return;
+            }
+#endif
+            else // not directly evaluable: subdivide and continue with the quadrant that contains uv
+            {
+              array_t<CatmullClarkPatch,4> patches; 
+              patch.subdivide(patches); // FIXME: only have to generate one of the patches
+              
+              const float u = uv.x, v = uv.y; // uv is remapped to the chosen quadrant, dscale doubles per level
+              if (v < 0.5f) {
+                if (u < 0.5f) { patch = patches[0]; uv = Vec2f(2.0f*u,2.0f*v); dscale *= 2.0f; }
+                else          { patch = patches[1]; uv = Vec2f(2.0f*u-1.0f,2.0f*v); dscale *= 2.0f; }
+              } else {
+                if (u > 0.5f) { patch = patches[2]; uv = Vec2f(2.0f*u-1.0f,2.0f*v-1.0f); dscale *= 2.0f; }
+                else          { patch = patches[3]; uv = Vec2f(2.0f*u,2.0f*v-1.0f); dscale *= 2.0f; }
+              }
+              depth++; // border0..border3 are not updated while descending
+            }
+          }
+        }
+        
+        void eval(const GeneralCatmullClarkPatch& patch, const Vec2f& uv, const size_t depth) 
+        {  
+          /* convert into standard quad patch if possible */
+          if (likely(patch.isQuadPatch())) 
+          {
+            CatmullClarkPatch qpatch; patch.init(qpatch);
+            return eval(qpatch,uv,1.0f,depth); 
+          }
+          
+          /* subdivide patch */
+          unsigned N; // number of sub-patches, set by subdivide()
+          array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; 
+          patch.subdivide(patches,N); // FIXME: only have to generate one of the patches
+          
+          /* parametrization for quads */
+          if (N == 4) 
+            eval_general_quad(patch,patches,uv,depth);
+          
+          /* parametrization for arbitrary polygons */
+          else 
+          {
+            const unsigned l = (unsigned) floor(0.5f*uv.x); const float u = 2.0f*frac(0.5f*uv.x)-0.5f; // decodes uv.x = 2*l + 0.5 + u
+            const unsigned h = (unsigned) floor(0.5f*uv.y); const float v = 2.0f*frac(0.5f*uv.y)-0.5f; // decodes uv.y = 2*h + 0.5 + v
+            const unsigned i = 4*h+l; assert(i<N); // index of the sub-patch to evaluate
+            if (i >= N) return; // guard for builds without asserts: outputs are left untouched
+
+#if PATCH_USE_GREGORY == 2
+            BezierCurve borders[2]; patch.getLimitBorder(borders,i);
+            BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+            BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+            eval(patches[i],Vec2f(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+            eval(patches[i],Vec2f(u,v),1.0f,depth+1);
+#endif
+          }
+        }
+        
+      private:
+        Vertex* const P;
+        Vertex* const dPdu;
+        Vertex* const dPdv;
+        Vertex* const ddPdudu;
+        Vertex* const ddPdvdv;
+        Vertex* const ddPdudv;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_grid.h b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_grid.h
new file mode 100644
index 0000000000..76583b2e5d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_grid.h
@@ -0,0 +1,359 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+#include "catmullclark_patch.h"
+#include "bspline_patch.h"
+#include "gregory_patch.h"
+#include "tessellation.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    struct FeatureAdaptiveEvalGrid
+    {
+      typedef CatmullClark1Ring3fa CatmullClarkRing;
+      typedef CatmullClarkPatch3fa CatmullClarkPatch;
+      typedef BilinearPatch3fa BilinearPatch;
+      typedef BSplinePatch3fa BSplinePatch;
+      typedef BezierPatch3fa BezierPatch;
+      typedef GregoryPatch3fa GregoryPatch;
+
+    private:
+      const unsigned x0,x1;
+      const unsigned y0,y1;
+      const unsigned swidth,sheight;
+      const float rcp_swidth, rcp_sheight;
+      float* const Px;
+      float* const Py;
+      float* const Pz;
+      float* const U;
+      float* const V;
+      float* const Nx;
+      float* const Ny;
+      float* const Nz;
+      const unsigned dwidth;
+      //const unsigned dheight;
+      unsigned count;
+      
+
+    public:      
+      FeatureAdaptiveEvalGrid (const GeneralCatmullClarkPatch3fa& patch, unsigned subPatch,
+                               const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, 
+                               float* Px, float* Py, float* Pz, float* U, float* V, 
+                               float* Nx, float* Ny, float* Nz,
+                               const unsigned dwidth, const unsigned dheight)
+      : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), 
+        Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0)
+      {
+        assert(swidth < (2<<20) && sheight < (2<<20));
+        const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1)));
+        const BBox2f erange(Vec2f((float)x0,(float)y0),Vec2f((float)x1,(float)y1));
+        
+        /* convert into standard quad patch if possible */
+        if (likely(patch.isQuadPatch())) 
+        {
+          CatmullClarkPatch3fa qpatch; patch.init(qpatch);
+          eval(qpatch, srange, erange, 0);
+          assert(count == (x1-x0+1)*(y1-y0+1));
+          return;
+        }
+        
+        /* subdivide patch */
+        unsigned N;
+        array_t<CatmullClarkPatch3fa,GeneralCatmullClarkPatch3fa::SIZE> patches; 
+        patch.subdivide(patches,N);
+        
+        if (N == 4)
+        {
+          const Vec2f c = srange.center();
+          const BBox2f srange0(srange.lower,c);
+          const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y));
+          const BBox2f srange2(c,srange.upper);
+          const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y));
+
+#if PATCH_USE_GREGORY == 2
+          BezierCurve3fa borders[GeneralCatmullClarkPatch3fa::SIZE]; patch.getLimitBorder(borders);
+          BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r);
+          BezierCurve3fa border1l,border1r; borders[1].subdivide(border1l,border1r);
+          BezierCurve3fa border2l,border2r; borders[2].subdivide(border2l,border2r);
+          BezierCurve3fa border3l,border3r; borders[3].subdivide(border3l,border3r);
+          GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches);
+          eval(patches[0],srange0,intersect(srange0,erange),1,&border0l,nullptr,nullptr,&border3r);
+          eval(patches[1],srange1,intersect(srange1,erange),1,&border0r,&border1l,nullptr,nullptr);
+          eval(patches[2],srange2,intersect(srange2,erange),1,nullptr,&border1r,&border2l,nullptr);
+          eval(patches[3],srange3,intersect(srange3,erange),1,nullptr,nullptr,&border2r,&border3l);
+#else
+          GeneralCatmullClarkPatch3fa::fix_quad_ring_order(patches);
+          eval(patches[0],srange0,intersect(srange0,erange),1);
+          eval(patches[1],srange1,intersect(srange1,erange),1);
+          eval(patches[2],srange2,intersect(srange2,erange),1);
+          eval(patches[3],srange3,intersect(srange3,erange),1);
+#endif
+        }
+        else
+        {
+          assert(subPatch < N);
+          
+#if PATCH_USE_GREGORY == 2
+          BezierCurve3fa borders[2]; patch.getLimitBorder(borders,subPatch);
+          BezierCurve3fa border0l,border0r; borders[0].subdivide(border0l,border0r);
+          BezierCurve3fa border2l,border2r; borders[1].subdivide(border2l,border2r);
+          eval(patches[subPatch], srange, erange, 1, &border0l, nullptr, nullptr, &border2r);
+#else
+          eval(patches[subPatch], srange, erange, 1);
+#endif
+          
+        }
+        assert(count == (x1-x0+1)*(y1-y0+1));
+      }
+      
+      FeatureAdaptiveEvalGrid (const CatmullClarkPatch3fa& patch,
+                               const BBox2f& srange, const BBox2f& erange, const unsigned depth,
+                               const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, 
+                               float* Px, float* Py, float* Pz, float* U, float* V, 
+                               float* Nx, float* Ny, float* Nz,
+                               const unsigned dwidth, const unsigned dheight)
+      : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), 
+        Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), /*dheight(dheight),*/ count(0)
+      {
+        eval(patch,srange,erange,depth);
+      }
+
+      template<typename Patch>
+      void evalLocalGrid(const Patch& patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1)
+      {
+        const float scale_x = rcp(srange.upper.x-srange.lower.x);
+        const float scale_y = rcp(srange.upper.y-srange.lower.y);
+        count += (lx1-lx0)*(ly1-ly0);
+        
+#if 0
+        for (unsigned iy=ly0; iy<ly1; iy++) {
+          for (unsigned ix=lx0; ix<lx1; ix++) {
+            const float lu = select(ix == swidth -1, float(1.0f), (float(ix)-srange.lower.x)*scale_x);
+            const float lv = select(iy == sheight-1, float(1.0f), (float(iy)-srange.lower.y)*scale_y);
+            const Vec3fa p = patch.eval(lu,lv);
+            const float u = float(ix)*rcp_swidth;
+            const float v = float(iy)*rcp_sheight;
+            const int ofs = (iy-y0)*dwidth+(ix-x0);
+            Px[ofs] = p.x;
+            Py[ofs] = p.y;
+            Pz[ofs] = p.z;
+            U[ofs] = u;
+            V[ofs] = v;
+          }
+        }
+#else
+        foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) {
+            const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x);
+            const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y);
+            const Vec3vfx p = patch.eval(lu,lv);
+            Vec3vfx n = zero;
+            if (unlikely(Nx != nullptr)) n = normalize_safe(patch.normal(lu,lv));
+            const vfloatx u = vfloatx(ix)*rcp_swidth;
+            const vfloatx v = vfloatx(iy)*rcp_sheight;
+            const vintx ofs = (iy-y0)*dwidth+(ix-x0);
+            if (likely(all(valid)) && all(iy==iy[0])) {
+              const unsigned ofs2 = ofs[0];
+              vfloatx::storeu(Px+ofs2,p.x);
+              vfloatx::storeu(Py+ofs2,p.y);
+              vfloatx::storeu(Pz+ofs2,p.z);
+              vfloatx::storeu(U+ofs2,u);
+              vfloatx::storeu(V+ofs2,v);
+              if (unlikely(Nx != nullptr)) {
+                vfloatx::storeu(Nx+ofs2,n.x);
+                vfloatx::storeu(Ny+ofs2,n.y);
+                vfloatx::storeu(Nz+ofs2,n.z);
+              }
+            } else {
+              foreach_unique_index(valid,iy,[&](const vboolx& valid, const int iy0, const int j) {
+                  const unsigned ofs2 = ofs[j]-j;
+                  vfloatx::storeu(valid,Px+ofs2,p.x);
+                  vfloatx::storeu(valid,Py+ofs2,p.y);
+                  vfloatx::storeu(valid,Pz+ofs2,p.z);
+                  vfloatx::storeu(valid,U+ofs2,u);
+                  vfloatx::storeu(valid,V+ofs2,v);
+                  if (unlikely(Nx != nullptr)) {
+                    vfloatx::storeu(valid,Nx+ofs2,n.x);
+                    vfloatx::storeu(valid,Ny+ofs2,n.y);
+                    vfloatx::storeu(valid,Nz+ofs2,n.z);
+                  }
+                });
+            }
+          });
+#endif
+      }
+      
+      __forceinline bool final(const CatmullClarkPatch3fa& patch, const CatmullClarkRing::Type type, unsigned depth) 
+      {
+        /* rings flagged TYPE_CREASES use the crease depth limit, all others the irregular one;
+           the PATCH_MIN_RESOLUTION early-out (patch.isFinalResolution) is disabled, patch is unused */
+        unsigned depth_limit = PATCH_MAX_EVAL_DEPTH_IRREGULAR;
+        if (type & CatmullClarkRing::TYPE_CREASES)
+          depth_limit = PATCH_MAX_EVAL_DEPTH_CREASE;
+        return !(depth < depth_limit);
+      }
+      
+      void eval(const CatmullClarkPatch3fa& patch, const BBox2f& srange, const BBox2f& erange, const unsigned depth, // srange: grid range covered by patch, erange: part of it to evaluate
+                const BezierCurve3fa* border0 = nullptr, const BezierCurve3fa* border1 = nullptr, const BezierCurve3fa* border2 = nullptr, const BezierCurve3fa* border3 = nullptr)
+      {
+        if (erange.empty())
+          return;
+        
+        int lx0 = (int) ceilf(erange.lower.x); // grid points [lx0,lx1) x [ly0,ly1) are evaluated
+        int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0)); // one extra column when erange ends exactly on x1
+        int ly0 = (int) ceilf(erange.lower.y);
+        int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0)); // one extra row when erange ends exactly on y1
+        if (lx0 >= lx1 || ly0 >= ly1) return; // no grid point falls into this patch
+
+        CatmullClarkPatch::Type ty = patch.type();
+
+        if (unlikely(final(patch,ty,depth))) // depth limit reached: evaluate with a regular or fill patch
+        {
+          if (ty & CatmullClarkRing::TYPE_REGULAR) {
+            RegularPatch rpatch(patch,border0,border1,border2,border3);
+            evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1);
+            return;
+          } else {
+            IrregularFillPatch ipatch(patch,border0,border1,border2,border3);
+            evalLocalGrid(ipatch,srange,lx0,lx1,ly0,ly1);
+            return;
+          }
+        }
+        else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { 
+          assert(depth > 0); 
+          RegularPatch rpatch(patch,border0,border1,border2,border3);
+          evalLocalGrid(rpatch,srange,lx0,lx1,ly0,ly1);
+          return;
+        }
+#if PATCH_USE_GREGORY == 2
+        else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { 
+          assert(depth > 0); 
+          GregoryPatch gpatch(patch,border0,border1,border2,border3);
+          evalLocalGrid(gpatch,srange,lx0,lx1,ly0,ly1); // no return needed: nothing follows the else-if chain
+        }
+#endif
+        else // not directly evaluable: subdivide and recurse into all four quadrants
+        {
+          array_t<CatmullClarkPatch3fa,4> patches; 
+          patch.subdivide(patches);
+          
+          const Vec2f c = srange.center(); // srange is split at its center into one sub-range per sub-patch
+          const BBox2f srange0(srange.lower,c);
+          const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y));
+          const BBox2f srange2(c,srange.upper);
+          const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y));
+          
+          eval(patches[0],srange0,intersect(srange0,erange),depth+1); // borders are not forwarded to the sub-patches
+          eval(patches[1],srange1,intersect(srange1,erange),depth+1);
+          eval(patches[2],srange2,intersect(srange2,erange),depth+1);
+          eval(patches[3],srange3,intersect(srange3,erange),depth+1);
+        }
+      }
+    };
+    
+    template<typename Eval, typename Patch>
+      bool stitch_col(const Patch& patch, int subPatch, // re-evaluates one boundary column at the coarse resolution and copies it into column dx0 of the output; returns false when no stitching is needed
+                      const bool right, const unsigned y0, const unsigned y1, const int fine_y, const int coarse_y, 
+                      float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dx0, const unsigned dwidth, const unsigned dheight) // dheight is not used in this function
+    {
+      assert(coarse_y <= fine_y);
+      if (likely(fine_y == coarse_y)) // both resolutions agree: the regular evaluation is already correct
+        return false;
+      
+      const unsigned y0s = stitch(y0,fine_y,coarse_y); // stitch() is defined elsewhere; presumably maps a fine index to the coarse resolution - TODO confirm
+      const unsigned y1s = stitch(y1,fine_y,coarse_y);
+      const unsigned M = y1s-y0s+1 + VSIZEX; // number of coarse samples plus VSIZEX elements of slack (presumably for full-width SIMD stores - confirm)
+      
+      dynamic_large_stack_array(float,px,M,64*sizeof(float));
+      dynamic_large_stack_array(float,py,M,64*sizeof(float));
+      dynamic_large_stack_array(float,pz,M,64*sizeof(float));
+      dynamic_large_stack_array(float,u,M,64*sizeof(float));
+      dynamic_large_stack_array(float,v,M,64*sizeof(float));
+      dynamic_large_stack_array(float,nx,M,64*sizeof(float));
+      dynamic_large_stack_array(float,ny,M,64*sizeof(float));
+      dynamic_large_stack_array(float,nz,M,64*sizeof(float));
+      const bool has_Nxyz = Nx; assert(!Nx || (Ny && Nz)); // normals are optional; Nx decides, and Ny/Nz must then be provided too
+      Eval(patch,subPatch, right,right, y0s,y1s, 2,coarse_y+1, px,py,pz,u,v, // evaluate the single column x=right (0 or 1) of a 2 x (coarse_y+1) grid into the temporaries
+           has_Nxyz ? (float*)nx : nullptr,has_Nxyz ? (float*)ny : nullptr ,has_Nxyz ? (float*)nz : nullptr, 1,4097);
+      
+      for (unsigned y=y0; y<=y1; y++) 
+      {
+        const unsigned ys = stitch(y,fine_y,coarse_y)-y0s; // index of the coarse sample used for fine row y
+        Px[(y-y0)*dwidth+dx0] = px[ys]; // overwrite column dx0 of output row (y-y0)
+        Py[(y-y0)*dwidth+dx0] = py[ys];
+        Pz[(y-y0)*dwidth+dx0] = pz[ys];
+        U [(y-y0)*dwidth+dx0] = u[ys];
+        V [(y-y0)*dwidth+dx0] = v[ys];
+        if (unlikely(has_Nxyz)) {
+          Nx[(y-y0)*dwidth+dx0] = nx[ys];
+          Ny[(y-y0)*dwidth+dx0] = ny[ys];
+          Nz[(y-y0)*dwidth+dx0] = nz[ys];
+        }
+      }
+      return true; // the column was stitched
+    }
+    
+    template<typename Eval, typename Patch>
+      bool stitch_row(const Patch& patch, int subPatch, // re-evaluates one boundary row at the coarse resolution and copies it into row dy0 of the output; returns false when no stitching is needed
+                      const bool bottom, const unsigned x0, const unsigned x1, const int fine_x, const int coarse_x, 
+                      float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dy0, const unsigned dwidth, const unsigned dheight) // dheight is not used in this function
+    {
+      assert(coarse_x <= fine_x);
+      if (likely(fine_x == coarse_x)) // both resolutions agree: the regular evaluation is already correct
+	return false;
+      
+      const unsigned x0s = stitch(x0,fine_x,coarse_x); // stitch() is defined elsewhere; presumably maps a fine index to the coarse resolution - TODO confirm
+      const unsigned x1s = stitch(x1,fine_x,coarse_x);
+      const unsigned M = x1s-x0s+1 + VSIZEX; // number of coarse samples plus VSIZEX elements of slack (presumably for full-width SIMD stores - confirm)
+
+      dynamic_large_stack_array(float,px,M,32*sizeof(float));
+      dynamic_large_stack_array(float,py,M,32*sizeof(float));
+      dynamic_large_stack_array(float,pz,M,32*sizeof(float));
+      dynamic_large_stack_array(float,u,M,32*sizeof(float));
+      dynamic_large_stack_array(float,v,M,32*sizeof(float));
+      dynamic_large_stack_array(float,nx,M,32*sizeof(float));
+      dynamic_large_stack_array(float,ny,M,32*sizeof(float));
+      dynamic_large_stack_array(float,nz,M,32*sizeof(float));
+      const bool has_Nxyz = Nx; assert(!Nx || (Ny && Nz)); // normals are optional; Nx decides, and Ny/Nz must then be provided too
+      Eval(patch,subPatch, x0s,x1s, bottom,bottom, coarse_x+1,2, px,py,pz,u,v, // evaluate the single row y=bottom (0 or 1) of a (coarse_x+1) x 2 grid into the temporaries
+           has_Nxyz ? (float*)nx :nullptr, has_Nxyz ? (float*)ny : nullptr , has_Nxyz ? (float*)nz : nullptr, 4097,1);
+      
+      for (unsigned x=x0; x<=x1; x++) 
+      {
+	const unsigned xs = stitch(x,fine_x,coarse_x)-x0s; // index of the coarse sample used for fine column x
+	Px[dy0*dwidth+x-x0] = px[xs]; // overwrite column (x-x0) of output row dy0
+        Py[dy0*dwidth+x-x0] = py[xs];
+        Pz[dy0*dwidth+x-x0] = pz[xs];
+        U [dy0*dwidth+x-x0] = u[xs];
+        V [dy0*dwidth+x-x0] = v[xs];
+        if (unlikely(has_Nxyz)) {
+          Nx[dy0*dwidth+x-x0] = nx[xs];
+          Ny[dy0*dwidth+x-x0] = ny[xs];
+          Nz[dy0*dwidth+x-x0] = nz[xs];
+        }
+      }
+      return true; // the row was stitched
+    }
+    
+    template<typename Eval, typename Patch>
+    void feature_adaptive_eval_grid (const Patch& patch, unsigned subPatch, const float levels[4], // evaluates the grid window [x0,x1] x [y0,y1]; window borders lying on the patch boundary are stitched to the per-edge levels when 'levels' is given
+                                     const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, 
+                                     float* Px, float* Py, float* Pz, float* U, float* V, float* Nx, float* Ny, float* Nz, const unsigned dwidth, const unsigned dheight)
+    {
+      bool sl = false, sr = false, st = false, sb = false; // set when the left/right/top/bottom border of the window was stitched
+      if (levels) { // no levels: no stitching at all
+        sl = x0 == 0         && stitch_col<Eval,Patch>(patch,subPatch,0,y0,y1,sheight-1,int(levels[3]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0    ,dwidth,dheight); // left column uses levels[3]; only attempted when the window starts at x=0
+        sr = x1 == swidth-1  && stitch_col<Eval,Patch>(patch,subPatch,1,y0,y1,sheight-1,int(levels[1]), Px,Py,Pz,U,V,Nx,Ny,Nz, x1-x0,dwidth,dheight); // right column uses levels[1]
+        st = y0 == 0         && stitch_row<Eval,Patch>(patch,subPatch,0,x0,x1,swidth-1,int(levels[0]), Px,Py,Pz,U,V,Nx,Ny,Nz, 0    ,dwidth,dheight); // top row uses levels[0]
+        sb = y1 == sheight-1 && stitch_row<Eval,Patch>(patch,subPatch,1,x0,x1,swidth-1,int(levels[2]), Px,Py,Pz,U,V,Nx,Ny,Nz, y1-y0,dwidth,dheight); // bottom row uses levels[2]
+      }
+      const unsigned ofs = st*dwidth+sl; // skip the stitched first row and/or first column in the output arrays
+      Eval(patch,subPatch,x0+sl,x1-sr,y0+st,y1-sb, swidth,sheight, Px+ofs,Py+ofs,Pz+ofs,U+ofs,V+ofs,Nx?Nx+ofs:nullptr,Ny?Ny+ofs:nullptr,Nz?Nz+ofs:nullptr, dwidth,dheight); // evaluate the remaining window, shrunk by every stitched border
+    }
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_simd.h b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_simd.h
new file mode 100644
index 0000000000..fa3216730f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/feature_adaptive_eval_simd.h
@@ -0,0 +1,186 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename vbool, typename vint, typename vfloat, typename Vertex, typename Vertex_t = Vertex>
+      struct FeatureAdaptiveEvalSimd // evaluates a subdivision patch for a packet of (u,v) locations; all work happens in the constructors, results are written by the selected patch type's eval() into P/dPdu/...
+      {
+      public:
+        
+        typedef PatchT<Vertex,Vertex_t> Patch;
+        typedef typename Patch::Ref Ref;
+        typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch;
+        typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+        typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+        typedef BSplinePatchT<Vertex,Vertex_t> BSplinePatch;
+        typedef BezierPatchT<Vertex,Vertex_t> BezierPatch;
+        typedef GregoryPatchT<Vertex,Vertex_t> GregoryPatch;
+        typedef BilinearPatchT<Vertex,Vertex_t> BilinearPatch;
+        typedef BezierCurveT<Vertex> BezierCurve;
+
+        FeatureAdaptiveEvalSimd (const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid, const vfloat& u, const vfloat& v, // entry point from a half-edge: dispatches on edge->patch_type
+                                 float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N)
+        {
+          switch (edge->patch_type) { // the first cases evaluate a specific patch type directly with derivative scale 1.0f
+          case HalfEdge::BILINEAR_PATCH: BilinearPatch(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break;
+          case HalfEdge::REGULAR_QUAD_PATCH: RegularPatchT(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break;
+#if PATCH_USE_GREGORY == 2
+          case HalfEdge::IRREGULAR_QUAD_PATCH: GregoryPatchT<Vertex,Vertex_t>(edge,vertices,stride).eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,1.0f,dstride,N); break;
+#endif
+          default: { // every other patch type: feature-adaptive evaluation of the general Catmull-Clark patch, starting at depth 0
+            GeneralCatmullClarkPatch patch(edge,vertices,stride);
+            eval_direct(valid,patch,Vec2<vfloat>(u,v),0);
+            break;
+          }
+          }
+        }
+
+        FeatureAdaptiveEvalSimd (const CatmullClarkPatch& patch, const vbool& valid, const vfloat& u, const vfloat& v, float dscale, size_t depth, // entry point for an already built Catmull-Clark patch with a given derivative scale and start depth
+                                 float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N)
+        {
+          eval_direct(valid,patch,Vec2<vfloat>(u,v),dscale,depth);
+        }
+
+        template<size_t N>
+        __forceinline void eval_quad_direct(const vbool& valid, array_t<CatmullClarkPatch,N>& patches, const Vec2<vfloat>& uv, float dscale, size_t depth) // routes every active lane to the quadrant sub-patch that contains its (u,v)
+        {
+          const vfloat u = uv.x, v = uv.y;
+          const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f; // lanes in the lower / upper half in u
+          const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f; // lanes in the lower / upper half in v
+          const vbool u0v0_mask = valid & u0_mask & v0_mask;
+          const vbool u0v1_mask = valid & u0_mask & v1_mask;
+          const vbool u1v0_mask = valid & u1_mask & v0_mask;
+          const vbool u1v1_mask = valid & u1_mask & v1_mask;
+          if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1); // (u,v) is remapped to the child's unit square; dscale doubles with every level
+          if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1);
+          if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1);
+          if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1);
+        }
+        
+        template<size_t N>
+        __forceinline void eval_general_quad_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, array_t<CatmullClarkPatch,N>& patches, const Vec2<vfloat>& uv, float dscale, size_t depth) // same quadrant routing for the sub-patches of a general quad patch; with PATCH_USE_GREGORY == 2 the halved limit borders are handed to the children
+        {
+#if PATCH_USE_GREGORY == 2
+          BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders);
+          BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); // each border curve is split into the two halves used by the adjacent children
+          BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r);
+          BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r);
+          BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r);
+#endif
+          GeneralCatmullClarkPatch::fix_quad_ring_order(patches);
+          const vfloat u = uv.x, v = uv.y;
+          const vbool u0_mask = u < 0.5f, u1_mask = u >= 0.5f;
+          const vbool v0_mask = v < 0.5f, v1_mask = v >= 0.5f;
+          const vbool u0v0_mask = valid & u0_mask & v0_mask;
+          const vbool u0v1_mask = valid & u0_mask & v1_mask;
+          const vbool u1v0_mask = valid & u1_mask & v0_mask;
+          const vbool u1v1_mask = valid & u1_mask & v1_mask;
+#if PATCH_USE_GREGORY == 2
+          if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1,&border0l,nullptr,nullptr,&border3r); // only the two outer edges of each child receive a border curve
+          if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1,&border0r,&border1l,nullptr,nullptr);
+          if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,&border1r,&border2l,nullptr);
+          if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1,nullptr,nullptr,&border2r,&border3l);
+#else
+          if (any(u0v0_mask)) eval_direct(u0v0_mask,patches[0],Vec2<vfloat>(2.0f*u,2.0f*v),2.0f*dscale,depth+1);
+          if (any(u1v0_mask)) eval_direct(u1v0_mask,patches[1],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v),2.0f*dscale,depth+1);
+          if (any(u1v1_mask)) eval_direct(u1v1_mask,patches[2],Vec2<vfloat>(2.0f*u-1.0f,2.0f*v-1.0f),2.0f*dscale,depth+1);
+          if (any(u0v1_mask)) eval_direct(u0v1_mask,patches[3],Vec2<vfloat>(2.0f*u,2.0f*v-1.0f),2.0f*dscale,depth+1);
+#endif
+        }
+        
+        __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) // returns true once 'depth' has reached the evaluation depth limit for this ring type
+        {
+          const size_t max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; // types with crease bits use the crease limit, everything else the irregular limit
+//#if PATCH_MIN_RESOLUTION
+//          return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth;
+//#else
+          return depth>=max_eval_depth; // 'patch' is unused as long as the resolution test above stays commented out
+//#endif
+        }
+
+        void eval_direct(const vbool& valid, const CatmullClarkPatch& patch, const Vec2<vfloat>& uv, float dscale, size_t depth, // evaluates the patch directly when one of the direct cases applies, otherwise subdivides and recurses
+                         BezierCurve* border0 = nullptr, BezierCurve* border1 = nullptr, BezierCurve* border2 = nullptr, BezierCurve* border3 = nullptr)
+        {
+          typename CatmullClarkPatch::Type ty = patch.type();
+
+          if (unlikely(final(patch,ty,depth))) // depth limit reached: evaluate with a regular or fill patch
+          {
+            if (ty & CatmullClarkRing::TYPE_REGULAR) { 
+              RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+            } else {
+              IrregularFillPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+            }
+          }
+          else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { 
+            assert(depth > 0); RegularPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+          }
+#if PATCH_USE_GREGORY == 2
+          else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { 
+            assert(depth > 0); GregoryPatch(patch,border0,border1,border2,border3).eval(valid,uv.x,uv.y,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+          }
+#endif
+          else // no direct case applies: subdivide once and continue in the matching quadrants
+          {
+            array_t<CatmullClarkPatch,4> patches; 
+            patch.subdivide(patches); // FIXME: only have to generate one of the patches
+            eval_quad_direct(valid,patches,uv,dscale,depth);
+          }
+        }  
+
+        void eval_direct(const vbool& valid, const GeneralCatmullClarkPatch& patch, const Vec2<vfloat>& uv, const size_t depth) // top-level evaluation of a general patch; uses derivative scale 1.0f for everything it calls
+        {
+          /* convert into standard quad patch if possible */
+          if (likely(patch.isQuadPatch())) {
+            CatmullClarkPatch qpatch; patch.init(qpatch);
+            return eval_direct(valid,qpatch,uv,1.0f,depth);
+          }
+          
+          /* subdivide patch */
+          unsigned Nc; // set by subdivide(); compared against 4 and used as the bound on the sub-patch index below
+          array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; 
+          patch.subdivide(patches,Nc); // FIXME: only have to generate one of the patches
+          
+          /* parametrization for quads */
+          if (Nc == 4) 
+            eval_general_quad_direct(valid,patch,patches,uv,1.0f,depth);
+          
+          /* parametrization for arbitrary polygons */
+          else 
+          {
+            const vint l = (vint)floor(0.5f*uv.x); const vfloat u = 2.0f*frac(0.5f*uv.x)-0.5f; // l: integer part of u/2 per lane; u: derived local coordinate (NOTE(review): confirm the intended encoding of the sub-patch index in uv)
+            const vint h = (vint)floor(0.5f*uv.y); const vfloat v = 2.0f*frac(0.5f*uv.y)-0.5f; 
+            const vint i = (h<<2)+l; assert(all(valid,i<Nc)); // per-lane sub-patch index 4*h+l
+            foreach_unique(valid,i,[&](const vbool& valid, const int i) { // foreach_unique is defined elsewhere; presumably calls the lambda once per distinct index with the matching lane mask - confirm
+#if PATCH_USE_GREGORY == 2
+                BezierCurve borders[2]; patch.getLimitBorder(borders,i);
+                BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r);
+                BezierCurve border2l,border2r; borders[1].subdivide(border2l,border2r);
+                eval_direct(valid,patches[i],Vec2<vfloat>(u,v),1.0f,depth+1, &border0l, nullptr, nullptr, &border2r);
+#else
+                eval_direct(valid,patches[i],Vec2<vfloat>(u,v),1.0f,depth+1);
+#endif
+              });
+          }
+        }
+
+      private:
+        float* const P; // output pointers and layout parameters captured by the constructors and forwarded to every patch eval() call
+        float* const dPdu;
+        float* const dPdv;
+        float* const ddPdudu;
+        float* const ddPdvdv;
+        float* const ddPdudv;
+        const size_t dstride;
+        const size_t N;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch.h
new file mode 100644
index 0000000000..2a7c4b1f2c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch.h
@@ -0,0 +1,893 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bezier_patch.h"
+#include "bezier_curve.h"
+#include "catmullclark_coefficients.h"
+
+namespace embree
+{  
+  template<typename Vertex, typename Vertex_t = Vertex>
+  class __aligned(64) GregoryPatchT
+  {
+    typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+    typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch;
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClark1Ring;
+    typedef BezierCurveT<Vertex> BezierCurve;
+
+  public:
+    Vertex v[4][4];
+    Vertex f[2][2];
+
+    __forceinline GregoryPatchT() {} // leaves all control points uninitialized
+
+    __forceinline GregoryPatchT(const CatmullClarkPatch& patch) { // builds the control points from a Catmull-Clark patch
+      init(patch);
+    }
+
+    __forceinline GregoryPatchT(const CatmullClarkPatch& patch, // as above, but edges with a non-null border curve take their boundary control points from that curve
+                                const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) 
+    {
+      init_crackfix(patch,border0,border1,border2,border3);
+    }
+
+    __forceinline GregoryPatchT (const HalfEdge* edge, const char* vertices, size_t stride) { // constructs a temporary Catmull-Clark patch from the half-edge data and initializes from it
+      init(CatmullClarkPatch(edge,vertices,stride));
+    }
+      
+    __forceinline Vertex& p0() { return v[0][0]; } // corner points p0..p3 are the four corners of the 4x4 array v
+    __forceinline Vertex& p1() { return v[0][3]; }
+    __forceinline Vertex& p2() { return v[3][3]; }
+    __forceinline Vertex& p3() { return v[3][0]; }
+    
+    __forceinline Vertex& e0_p() { return v[0][1]; } // edge points eN_p / eN_m occupy the non-corner border entries of v
+    __forceinline Vertex& e0_m() { return v[1][0]; }
+    __forceinline Vertex& e1_p() { return v[1][3]; }
+    __forceinline Vertex& e1_m() { return v[0][2]; }
+    __forceinline Vertex& e2_p() { return v[3][2]; }
+    __forceinline Vertex& e2_m() { return v[2][3]; }
+    __forceinline Vertex& e3_p() { return v[2][0]; }
+    __forceinline Vertex& e3_m() { return v[3][1]; }
+    
+    __forceinline Vertex& f0_p() { return v[1][1]; } // face points fN_p occupy the inner 2x2 entries of v
+    __forceinline Vertex& f1_p() { return v[1][2]; }
+    __forceinline Vertex& f2_p() { return v[2][2]; }
+    __forceinline Vertex& f3_p() { return v[2][1]; }
+    __forceinline Vertex& f0_m() { return f[0][0]; } // face points fN_m are stored separately in the 2x2 array f
+    __forceinline Vertex& f1_m() { return f[0][1]; }
+    __forceinline Vertex& f2_m() { return f[1][1]; }
+    __forceinline Vertex& f3_m() { return f[1][0]; }
+    
+    __forceinline const Vertex& p0() const { return v[0][0]; } // const overloads with the same index mapping as the mutable accessors
+    __forceinline const Vertex& p1() const { return v[0][3]; }
+    __forceinline const Vertex& p2() const { return v[3][3]; }
+    __forceinline const Vertex& p3() const { return v[3][0]; }
+    
+    __forceinline const Vertex& e0_p() const { return v[0][1]; }
+    __forceinline const Vertex& e0_m() const { return v[1][0]; }
+    __forceinline const Vertex& e1_p() const { return v[1][3]; }
+    __forceinline const Vertex& e1_m() const { return v[0][2]; }
+    __forceinline const Vertex& e2_p() const { return v[3][2]; }
+    __forceinline const Vertex& e2_m() const { return v[2][3]; }
+    __forceinline const Vertex& e3_p() const { return v[2][0]; }
+    __forceinline const Vertex& e3_m() const { return v[3][1]; }
+    
+    __forceinline const Vertex& f0_p() const { return v[1][1]; }
+    __forceinline const Vertex& f1_p() const { return v[1][2]; }
+    __forceinline const Vertex& f2_p() const { return v[2][2]; }
+    __forceinline const Vertex& f3_p() const { return v[2][1]; }
+    __forceinline const Vertex& f0_m() const { return f[0][0]; }
+    __forceinline const Vertex& f1_m() const { return f[0][1]; }
+    __forceinline const Vertex& f2_m() const { return f[1][1]; }
+    __forceinline const Vertex& f3_m() const { return f[1][0]; }
+    
+    __forceinline Vertex initCornerVertex(const CatmullClarkPatch& irreg_patch, const size_t index) { // corner control point = limit vertex of ring 'index'
+      return irreg_patch.ring[index].getLimitVertex();
+    }
+    
+    __forceinline Vertex initPositiveEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) { // returns p_vtx + 1/3 * (first limit tangent of ring 'index')
+      return madd(1.0f/3.0f,irreg_patch.ring[index].getLimitTangent(),p_vtx);
+    }
+    
+    __forceinline Vertex initNegativeEdgeVertex(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) { // returns p_vtx + 1/3 * (second limit tangent of ring 'index')
+      return madd(1.0f/3.0f,irreg_patch.ring[index].getSecondLimitTangent(),p_vtx);
+    }
+
+    __forceinline Vertex initPositiveEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) // variant of initPositiveEdgeVertex that takes the first limit tangent after three ring subdivisions, scaled by 8/3
+    {
+      CatmullClark1Ring r0,r1,r2; // class typedef, so the temporaries match the ring type of the patch for any Vertex/Vertex_t
+      irreg_patch.ring[index].subdivide(r0);
+      r0.subdivide(r1);
+      r1.subdivide(r2);
+      return madd(8.0f/3.0f,r2.getLimitTangent(),p_vtx);
+    }
+    
+    __forceinline Vertex initNegativeEdgeVertex2(const CatmullClarkPatch& irreg_patch, const size_t index, const Vertex& p_vtx) // variant of initNegativeEdgeVertex that takes the second limit tangent after three ring subdivisions, scaled by 8/3
+    {
+      CatmullClark1Ring r0,r1,r2; // class typedef, so the temporaries match the ring type of the patch for any Vertex/Vertex_t
+      irreg_patch.ring[index].subdivide(r0);
+      r0.subdivide(r1);
+      r1.subdivide(r2);
+      return madd(8.0f/3.0f,r2.getSecondLimitTangent(),p_vtx);
+    }
+    
+    void initFaceVertex(const CatmullClarkPatch& irreg_patch, // computes the two face control points (f_p_vtx, f_m_vtx) next to corner 'index'
+			const size_t index, 
+			const Vertex& p_vtx, 
+                        const Vertex& e0_p_vtx, 
+			const Vertex& e1_m_vtx, 
+			const unsigned int face_valence_p1, // face valence used for c_e_p, the weight in the f_p_vtx formula
+ 			const Vertex& e0_m_vtx,	
+			const Vertex& e3_p_vtx,	
+			const unsigned int face_valence_p3, // face valence used for c_e_m, the weight in the f_m_vtx formula
+			Vertex& f_p_vtx, 
+			Vertex& f_m_vtx)
+    {
+      const unsigned int face_valence = irreg_patch.ring[index].face_valence;
+      const unsigned int edge_valence = irreg_patch.ring[index].edge_valence;
+      const unsigned int border_index = irreg_patch.ring[index].border_index;
+      
+      const Vertex& vtx     = irreg_patch.ring[index].vtx;
+      const Vertex e_i      = irreg_patch.ring[index].getEdgeCenter(0);
+      const Vertex c_i_m_1  = irreg_patch.ring[index].getQuadCenter(0);
+      const Vertex e_i_m_1  = irreg_patch.ring[index].getEdgeCenter(1);
+      
+      Vertex c_i, e_i_p_1;
+      const bool hasHardEdge0 = // true when both the vertex crease weight and crease weight 0 are infinite
+        std::isinf(irreg_patch.ring[index].vertex_crease_weight) &&
+        std::isinf(irreg_patch.ring[index].crease_weight[0]);
+                
+      if (unlikely((border_index == edge_valence-2) || hasHardEdge0))
+      {
+        /* mirror quad center and edge mid-point */
+        c_i     = madd(2.0f, e_i - c_i_m_1, c_i_m_1);
+        e_i_p_1 = madd(2.0f, vtx - e_i_m_1, e_i_m_1);
+      }
+      else
+      {
+        c_i     = irreg_patch.ring[index].getQuadCenter( face_valence-1 ); // otherwise read the last quad center and edge center of the ring
+        e_i_p_1 = irreg_patch.ring[index].getEdgeCenter( face_valence-1 );
+      }
+      
+      Vertex c_i_m_2, e_i_m_2;
+      const bool hasHardEdge1 = // true when both the vertex crease weight and crease weight 1 are infinite
+        std::isinf(irreg_patch.ring[index].vertex_crease_weight) &&
+        std::isinf(irreg_patch.ring[index].crease_weight[1]);
+      
+      if (unlikely(border_index == 2 || hasHardEdge1))
+      {
+        /* mirror quad center and edge mid-point */
+        c_i_m_2  = madd(2.0f, e_i_m_1 - c_i_m_1, c_i_m_1);
+        e_i_m_2  = madd(2.0f, vtx - e_i, + e_i); // the '+' before e_i is a unary plus and has no effect
+      }
+      else
+      {
+        c_i_m_2  = irreg_patch.ring[index].getQuadCenter( 1 );
+        e_i_m_2  = irreg_patch.ring[index].getEdgeCenter( 2 );
+      }      
+      
+      const float d = 3.0f; // normalization constant of the face point formulas below
+      //const float c     = cosf(2.0f*M_PI/(float)face_valence);
+      //const float c_e_p = cosf(2.0f*M_PI/(float)face_valence_p1);
+      //const float c_e_m = cosf(2.0f*M_PI/(float)face_valence_p3);
+      
+      const float c     = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence); // table lookup replacing the cosf() calls kept in the comments above
+      const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1);
+      const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3);
+
+      const Vertex r_e_p = 1.0f/3.0f * (e_i_m_1 - e_i_p_1) + 2.0f/3.0f * (c_i_m_1 - c_i); // offset terms built from differences of edge centers and quad centers
+      const Vertex r_e_m = 1.0f/3.0f * (e_i     - e_i_m_2) + 2.0f/3.0f * (c_i_m_1 - c_i_m_2);
+
+      f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p);      
+      f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m);     
+    }
+
+    __noinline void init(const CatmullClarkPatch& patch) // builds all Gregory control points (corners, edge points, face points) from a Catmull-Clark patch
+    {
+      assert( patch.ring[0].hasValidPositions() );
+      assert( patch.ring[1].hasValidPositions() );
+      assert( patch.ring[2].hasValidPositions() );
+      assert( patch.ring[3].hasValidPositions() );
+      
+      p0() = initCornerVertex(patch,0); // corners first: the edge points below are offsets from them
+      p1() = initCornerVertex(patch,1);
+      p2() = initCornerVertex(patch,2);
+      p3() = initCornerVertex(patch,3);
+
+      e0_p() = initPositiveEdgeVertex(patch,0, p0());
+      e1_p() = initPositiveEdgeVertex(patch,1, p1());
+      e2_p() = initPositiveEdgeVertex(patch,2, p2());
+      e3_p() = initPositiveEdgeVertex(patch,3, p3());
+
+      e0_m() = initNegativeEdgeVertex(patch,0, p0());
+      e1_m() = initNegativeEdgeVertex(patch,1, p1());
+      e2_m() = initNegativeEdgeVertex(patch,2, p2());
+      e3_m() = initNegativeEdgeVertex(patch,3, p3());
+
+      const unsigned int face_valence_p0 = patch.ring[0].face_valence;
+      const unsigned int face_valence_p1 = patch.ring[1].face_valence;
+      const unsigned int face_valence_p2 = patch.ring[2].face_valence;
+      const unsigned int face_valence_p3 = patch.ring[3].face_valence;
+      
+      initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() ); // each call passes the valence of the next corner (owner of the first eN_m) and of the previous corner (owner of the second eN_p)
+      initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() );
+      initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() );
+      initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p2,f3_p(),f3_m() ); // previous corner of 3 is corner 2 (owner of e2_p), hence face_valence_p2
+
+    }
+
+    __noinline void init_crackfix(const CatmullClarkPatch& patch, // like init(patch), but every edge with a non-null border curve takes its four boundary control points from that curve
+                                  const BezierCurve* border0, 
+                                  const BezierCurve* border1,
+                                  const BezierCurve* border2, 
+                                  const BezierCurve* border3)
+    {
+      assert( patch.ring[0].hasValidPositions() );
+      assert( patch.ring[1].hasValidPositions() );
+      assert( patch.ring[2].hasValidPositions() );
+      assert( patch.ring[3].hasValidPositions() );
+      
+      p0() = initCornerVertex(patch,0);
+      p1() = initCornerVertex(patch,1);
+      p2() = initCornerVertex(patch,2);
+      p3() = initCornerVertex(patch,3);
+
+      e0_p() = initPositiveEdgeVertex(patch,0, p0());
+      e1_p() = initPositiveEdgeVertex(patch,1, p1());
+      e2_p() = initPositiveEdgeVertex(patch,2, p2());
+      e3_p() = initPositiveEdgeVertex(patch,3, p3());
+
+      e0_m() = initNegativeEdgeVertex(patch,0, p0());
+      e1_m() = initNegativeEdgeVertex(patch,1, p1());
+      e2_m() = initNegativeEdgeVertex(patch,2, p2());
+      e3_m() = initNegativeEdgeVertex(patch,3, p3());
+
+      if (unlikely(border0 != nullptr)) // edge p0 -> p1: overwrite with the control points of border0
+      {         
+        p0()   = border0->v0;
+        e0_p() = border0->v1; 
+        e1_m() = border0->v2; 
+        p1()   = border0->v3;
+      }
+      
+      if (unlikely(border1 != nullptr)) // edge p1 -> p2
+      {          
+        p1()   = border1->v0; 
+        e1_p() = border1->v1; 
+        e2_m() = border1->v2; 
+        p2()   = border1->v3; 
+      }
+
+      if (unlikely(border2 != nullptr)) // edge p2 -> p3
+      {          
+        p2()   = border2->v0; 
+        e2_p() = border2->v1; 
+        e3_m() = border2->v2; 
+        p3()   = border2->v3; 
+      }
+
+      if (unlikely(border3 != nullptr)) // edge p3 -> p0
+      {          
+        p3()   = border3->v0; 
+        e3_p() = border3->v1; 
+        e0_m() = border3->v2; 
+        p0()   = border3->v3; 
+      }
+
+      const unsigned int face_valence_p0 = patch.ring[0].face_valence;
+      const unsigned int face_valence_p1 = patch.ring[1].face_valence;
+      const unsigned int face_valence_p2 = patch.ring[2].face_valence;
+      const unsigned int face_valence_p3 = patch.ring[3].face_valence;
+      
+      initFaceVertex(patch,0,p0(),e0_p(),e1_m(),face_valence_p1,e0_m(),e3_p(),face_valence_p3,f0_p(),f0_m() ); // face points are computed after the border override so they use the final corner and edge points
+      initFaceVertex(patch,1,p1(),e1_p(),e2_m(),face_valence_p2,e1_m(),e0_p(),face_valence_p0,f1_p(),f1_m() );
+      initFaceVertex(patch,2,p2(),e2_p(),e3_m(),face_valence_p3,e2_m(),e1_p(),face_valence_p1,f2_p(),f2_m() );
+      initFaceVertex(patch,3,p3(),e3_p(),e0_m(),face_valence_p0,e3_m(),e2_p(),face_valence_p2,f3_p(),f3_m() ); // previous corner of 3 is corner 2 (owner of e2_p), hence face_valence_p2
+    }
+
+    
+    void computeGregoryPatchFacePoints(const unsigned int face_valence, // computes the two face points next to one corner from precomputed offsets r_e_p / r_e_m
+                                       const Vertex& r_e_p,
+                                       const Vertex& r_e_m,
+                                       const Vertex& p_vtx,
+                                       const Vertex& e0_p_vtx,
+                                       const Vertex& e1_m_vtx,
+                                       const unsigned int face_valence_p1, // face valence used for c_e_p (f_p_vtx formula)
+                                       const Vertex& e0_m_vtx,
+                                       const Vertex& e3_p_vtx,
+                                       const unsigned int face_valence_p3, // face valence used for c_e_m (f_m_vtx formula)
+                                       Vertex& f_p_vtx,
+                                       Vertex& f_m_vtx,
+                                       const float d = 3.0f)
+    {
+      //const float c     = cosf(2.0*M_PI/(float)face_valence);
+      //const float c_e_p = cosf(2.0*M_PI/(float)face_valence_p1);
+      //const float c_e_m = cosf(2.0*M_PI/(float)face_valence_p3);
+
+      const float c     = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence);
+      const float c_e_p = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p1);
+      const float c_e_m = CatmullClarkPrecomputedCoefficients::table.cos_2PI_div_n(face_valence_p3);
+
+      /* face points: weighted sum of the corner point, the two adjacent edge
+       * points and the offset r_e_p / r_e_m, divided by d; the weights are the
+       * cos(2*PI/valence) table values looked up above */
+      f_p_vtx = 1.0f / d * (c_e_p * p_vtx + (d - 2.0f*c - c_e_p) * e0_p_vtx + 2.0f*c* e1_m_vtx + r_e_p);
+      f_m_vtx = 1.0f / d * (c_e_m * p_vtx + (d - 2.0f*c - c_e_m) * e0_m_vtx + 2.0f*c* e3_p_vtx + r_e_m);
+    }
+
+    /*! Computes the face points f0..f3 from a general Catmull-Clark patch, which must consist of exactly four rings. */
+    __noinline void init(const GeneralCatmullClarkPatch& patch)
+    {
+      assert(patch.size() == 4);
+#if 0
+      CatmullClarkPatch qpatch; patch.init(qpatch);
+      init(qpatch);
+#else
+      /* kept as unsigned int, the type computeGregoryPatchFacePoints takes for its valence parameters */
+      const unsigned int face_valence_p0 = patch.ring[0].face_valence;
+      const unsigned int face_valence_p1 = patch.ring[1].face_valence;
+      const unsigned int face_valence_p2 = patch.ring[2].face_valence;
+      const unsigned int face_valence_p3 = patch.ring[3].face_valence;
+
+      Vertex p0_r_p, p0_r_m;
+      patch.ring[0].computeGregoryPatchEdgePoints( p0(), e0_p(), e0_m(), p0_r_p, p0_r_m );
+      Vertex p1_r_p, p1_r_m;
+      patch.ring[1].computeGregoryPatchEdgePoints( p1(), e1_p(), e1_m(), p1_r_p, p1_r_m );
+      Vertex p2_r_p, p2_r_m;
+      patch.ring[2].computeGregoryPatchEdgePoints( p2(), e2_p(), e2_m(), p2_r_p, p2_r_m );
+      Vertex p3_r_p, p3_r_m;
+      patch.ring[3].computeGregoryPatchEdgePoints( p3(), e3_p(), e3_m(), p3_r_p, p3_r_m );
+
+      /* corner i pairs (e_i_p, e_{i+1}_m) with the valence of corner i+1 and (e_i_m, e_{i-1}_p) with the
+         valence of corner i-1, indices modulo 4; for corner 3 the preceding corner is 2 */
+      computeGregoryPatchFacePoints(face_valence_p0, p0_r_p, p0_r_m, p0(), e0_p(), e1_m(), face_valence_p1, e0_m(), e3_p(), face_valence_p3, f0_p(), f0_m() );
+      computeGregoryPatchFacePoints(face_valence_p1, p1_r_p, p1_r_m, p1(), e1_p(), e2_m(), face_valence_p2, e1_m(), e0_p(), face_valence_p0, f1_p(), f1_m() );
+      computeGregoryPatchFacePoints(face_valence_p2, p2_r_p, p2_r_m, p2(), e2_p(), e3_m(), face_valence_p3, e2_m(), e1_p(), face_valence_p1, f2_p(), f2_m() );
+      computeGregoryPatchFacePoints(face_valence_p3, p3_r_p, p3_r_m, p3(), e3_p(), e0_m(), face_valence_p0, e3_m(), e2_p(), face_valence_p2, f3_p(), f3_m() );
+#endif
+    }
+   
+    
+    __forceinline void convert_to_bezier()  // per corner: f+ becomes the f+/f- midpoint, then f- is cleared
+    {
+      f0_p() = (f0_p() + f0_m()) * 0.5f;  // corner 0
+      f0_m() = Vertex( zero );
+      f1_p() = (f1_p() + f1_m()) * 0.5f;  // corner 1
+      f1_m() = Vertex( zero );
+      f2_p() = (f2_p() + f2_m()) * 0.5f;  // corner 2
+      f2_m() = Vertex( zero );
+      f3_p() = (f3_p() + f3_m()) * 0.5f;  // corner 3
+      f3_m() = Vertex( zero );
+    }
+    
+    static __forceinline void computeInnerVertices(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv,
+                                                   Vertex_t& matrix_11, Vertex_t& matrix_12, Vertex_t& matrix_22, Vertex_t& matrix_21)
+    {
+      const bool on_border = uu == 0.0f || uu == 1.0f || vv == 0.0f || vv == 1.0f;
+      if (unlikely(on_border))
+      {
+        matrix_11 = matrix[1][1];
+        matrix_12 = matrix[1][2];
+        matrix_22 = matrix[2][2];
+        matrix_21 = matrix[2][1];
+        return;
+      }
+      /* interior: the f+ points are the inner 2x2 entries of the control grid */
+      const Vertex_t plus0 = matrix[1][1];
+      const Vertex_t plus1 = matrix[1][2];
+      const Vertex_t plus2 = matrix[2][2];
+      const Vertex_t plus3 = matrix[2][1];
+      /* the f- points are stored separately in f_m */
+      const Vertex_t minus0 = f_m[0][0];
+      const Vertex_t minus1 = f_m[0][1];
+      const Vertex_t minus2 = f_m[1][1];
+      const Vertex_t minus3 = f_m[1][0];
+      /* blend f+ and f- per corner; each denominator would be zero at that corner, which the border case above excludes */
+      matrix_11 = (      uu  * plus0  +       vv  * minus0)*rcp(uu+vv);
+      matrix_12 = ((1.0f-uu) * minus1 +       vv  * plus1 )*rcp(1.0f-uu+vv);
+      matrix_22 = ((1.0f-uu) * plus2  + (1.0f-vv) * minus2)*rcp(2.0f-uu-vv);
+      matrix_21 = (      uu  * minus3 + (1.0f-vv) * plus3 )*rcp(1.0f+uu-vv);
+    }
+
+    template<typename vfloat>
+    static __forceinline void computeInnerVertices(const Vertex v[4][4], const Vertex f[2][2], 
+                                                   size_t i, const vfloat& uu, const vfloat& vv, vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21) 
+    {
+      /* wide variant for component i of the control points; the border case is resolved per lane with select */
+      const vfloat inv_uu = vfloat(1.0f) - uu;
+      const vfloat inv_vv = vfloat(1.0f) - vv;
+      const auto on_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f);
+      /* f+ points: inner 2x2 entries of the control grid */
+      const vfloat plus0 = v[1][1][i];
+      const vfloat plus1 = v[1][2][i];
+      const vfloat plus2 = v[2][2][i];
+      const vfloat plus3 = v[2][1][i];
+      /* f- points */
+      const vfloat minus0 = f[0][0][i];
+      const vfloat minus1 = f[0][1][i];
+      const vfloat minus2 = f[1][1][i];
+      const vfloat minus3 = f[1][0][i];
+      /* blended candidates, computed for every lane */
+      const vfloat blend0 = (    uu * plus0  +     vv * minus0) * rcp(uu+vv);
+      const vfloat blend1 = (inv_uu * minus1 +     vv * plus1 ) * rcp(inv_uu+vv);
+      const vfloat blend2 = (inv_uu * plus2  + inv_vv * minus2) * rcp(inv_uu+inv_vv);
+      const vfloat blend3 = (    uu * minus3 + inv_vv * plus3 ) * rcp(uu+inv_vv);
+      /* border lanes take f+ unblended, all other lanes take the blend */
+      matrix_11 = select(on_border,plus0,blend0);
+      matrix_12 = select(on_border,plus1,blend1);
+      matrix_22 = select(on_border,plus2,blend2);
+      matrix_21 = select(on_border,plus3,blend3);
+    }
+
+    static __forceinline Vertex eval(const Vertex matrix[4][4], const Vertex f[2][2], const float& uu, const float& vv) 
+    {  // evaluates the patch position at parameter (uu,vv)
+      Vertex_t v_11, v_12, v_22, v_21;
+      computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21);
+      /* basis weights in u and in v */
+      const Vec4<float> Bu = BezierBasis::eval(uu);
+      const Vec4<float> Bv = BezierBasis::eval(vv);
+      /* weighted sum over the 4x4 grid: Bv weights the rows, Bu the columns; v_11, v_12, v_21, v_22 replace the inner 2x2 entries */
+      return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), 
+                  madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11        ,madd(Bu.z,v_12        ,Bu.w * matrix[1][3]))), 
+                       madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21        ,madd(Bu.z,v_22        ,Bu.w * matrix[2][3]))), 
+                            Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); 
+    }
+
+    static __forceinline Vertex eval_du(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {  // u-derivative: same sum as eval but with derivative weights in u; the blended inner points are held fixed
+      Vertex_t v_11, v_12, v_22, v_21;
+      computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21);
+      /* derivative weights in u, plain basis weights in v */
+      const Vec4<float> Bu = BezierBasis::derivative(uu);
+      const Vec4<float> Bv = BezierBasis::eval(vv);
+      /* weighted sum over the 4x4 grid: Bv weights the rows, Bu the columns */
+      return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), 
+                  madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11        ,madd(Bu.z,v_12        ,Bu.w * matrix[1][3]))), 
+                       madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21        ,madd(Bu.z,v_22        ,Bu.w * matrix[2][3]))), 
+                            Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); 
+    }
+
+    static __forceinline Vertex eval_dv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {  // v-derivative: same sum as eval but with derivative weights in v; the blended inner points are held fixed
+      Vertex_t v_11, v_12, v_22, v_21;
+      computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21);
+      /* plain basis weights in u, derivative weights in v */
+      const Vec4<float> Bu = BezierBasis::eval(uu);
+      const Vec4<float> Bv = BezierBasis::derivative(vv);
+      /* weighted sum over the 4x4 grid: Bv weights the rows, Bu the columns */
+      return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), 
+                  madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11        ,madd(Bu.z,v_12        ,Bu.w * matrix[1][3]))), 
+                       madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21        ,madd(Bu.z,v_22        ,Bu.w * matrix[2][3]))), 
+                            Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); 
+    }
+
+    static __forceinline Vertex eval_dudu(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {  // second u-derivative: same sum as eval but with second-derivative weights in u
+      Vertex_t v_11, v_12, v_22, v_21;
+      computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21);
+      /* second-derivative weights in u, plain basis weights in v */
+      const Vec4<float> Bu = BezierBasis::derivative2(uu);
+      const Vec4<float> Bv = BezierBasis::eval(vv);
+      /* weighted sum over the 4x4 grid: Bv weights the rows, Bu the columns */
+      return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), 
+                  madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11        ,madd(Bu.z,v_12        ,Bu.w * matrix[1][3]))), 
+                       madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21        ,madd(Bu.z,v_22        ,Bu.w * matrix[2][3]))), 
+                            Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); 
+     }
+
+    static __forceinline Vertex eval_dvdv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {  // second v-derivative: same sum as eval but with second-derivative weights in v
+      Vertex_t v_11, v_12, v_22, v_21;
+      computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21);
+      /* plain basis weights in u, second-derivative weights in v */
+      const Vec4<float> Bu = BezierBasis::eval(uu);
+      const Vec4<float> Bv = BezierBasis::derivative2(vv);
+      /* weighted sum over the 4x4 grid: Bv weights the rows, Bu the columns */
+      return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), 
+                  madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11        ,madd(Bu.z,v_12        ,Bu.w * matrix[1][3]))), 
+                       madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21        ,madd(Bu.z,v_22        ,Bu.w * matrix[2][3]))), 
+                            Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); 
+    }
+
+    static __forceinline Vertex eval_dudv(const Vertex matrix[4][4], const Vertex f[2][2], const float uu, const float vv) // approximative derivative
+    {  // mixed derivative: same sum as eval but with first-derivative weights in both u and v
+      Vertex_t v_11, v_12, v_22, v_21;
+      computeInnerVertices(matrix,f,uu,vv,v_11, v_12, v_22, v_21);
+      /* derivative weights in both parameters */
+      const Vec4<float> Bu = BezierBasis::derivative(uu);
+      const Vec4<float> Bv = BezierBasis::derivative(vv);
+      /* weighted sum over the 4x4 grid: Bv weights the rows, Bu the columns */
+      return madd(Bv.x,madd(Bu.x,matrix[0][0],madd(Bu.y,matrix[0][1],madd(Bu.z,matrix[0][2],Bu.w * matrix[0][3]))), 
+                  madd(Bv.y,madd(Bu.x,matrix[1][0],madd(Bu.y,v_11        ,madd(Bu.z,v_12        ,Bu.w * matrix[1][3]))), 
+                       madd(Bv.z,madd(Bu.x,matrix[2][0],madd(Bu.y,v_21        ,madd(Bu.z,v_22        ,Bu.w * matrix[2][3]))), 
+                            Bv.w*madd(Bu.x,matrix[3][0],madd(Bu.y,matrix[3][1],madd(Bu.z,matrix[3][2],Bu.w * matrix[3][3])))))); 
+    }
+
+    __forceinline Vertex eval(const float uu, const float vv) const {  // position at (uu,vv); forwards the members v and f to the static eval
+      return eval(v,f,uu,vv);
+    }
+
+    __forceinline Vertex eval_du( const float uu, const float vv) const {  // approximate u-derivative at (uu,vv), using the members v and f
+      return eval_du(v,f,uu,vv);
+    }
+
+    __forceinline Vertex eval_dv( const float uu, const float vv) const {  // approximate v-derivative at (uu,vv), using the members v and f
+      return eval_dv(v,f,uu,vv);
+    }
+
+    __forceinline Vertex eval_dudu( const float uu, const float vv) const {  // approximate second u-derivative at (uu,vv), using the members v and f
+      return eval_dudu(v,f,uu,vv);
+    }
+
+    __forceinline Vertex eval_dvdv( const float uu, const float vv) const {  // approximate second v-derivative at (uu,vv), using the members v and f
+      return eval_dvdv(v,f,uu,vv);
+    }
+
+    __forceinline Vertex eval_dudv( const float uu, const float vv) const {  // approximate mixed derivative at (uu,vv), using the members v and f
+      return eval_dudv(v,f,uu,vv);
+    }
+
+    static __forceinline Vertex normal(const Vertex matrix[4][4], const Vertex f_m[2][2], const float uu, const float vv)  // FIXME: why not using basis functions
+    {  // returns cross(tangentU,tangentV) at (uu,vv); the result is not normalized here
+      /* interpolate inner vertices */
+      Vertex_t matrix_11, matrix_12, matrix_22, matrix_21;
+      computeInnerVertices(matrix,f_m,uu,vv,matrix_11, matrix_12, matrix_22, matrix_21);
+      
+      /* tangentU: reduce each grid column at vv first, then take the tangent of the four results at uu */
+      const Vertex_t col0 = deCasteljau(vv, (Vertex_t)matrix[0][0], (Vertex_t)matrix[1][0], (Vertex_t)matrix[2][0], (Vertex_t)matrix[3][0]);
+      const Vertex_t col1 = deCasteljau(vv, (Vertex_t)matrix[0][1], (Vertex_t)matrix_11   , (Vertex_t)matrix_21   , (Vertex_t)matrix[3][1]);
+      const Vertex_t col2 = deCasteljau(vv, (Vertex_t)matrix[0][2], (Vertex_t)matrix_12   , (Vertex_t)matrix_22   , (Vertex_t)matrix[3][2]);
+      const Vertex_t col3 = deCasteljau(vv, (Vertex_t)matrix[0][3], (Vertex_t)matrix[1][3], (Vertex_t)matrix[2][3], (Vertex_t)matrix[3][3]);
+      
+      const Vertex_t tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3);
+      
+      /* tangentV: reduce each grid row at uu first, then take the tangent of the four results at vv */
+      const Vertex_t row0 = deCasteljau(uu, (Vertex_t)matrix[0][0], (Vertex_t)matrix[0][1], (Vertex_t)matrix[0][2], (Vertex_t)matrix[0][3]);
+      const Vertex_t row1 = deCasteljau(uu, (Vertex_t)matrix[1][0], (Vertex_t)matrix_11   , (Vertex_t)matrix_12   , (Vertex_t)matrix[1][3]);
+      const Vertex_t row2 = deCasteljau(uu, (Vertex_t)matrix[2][0], (Vertex_t)matrix_21   , (Vertex_t)matrix_22   , (Vertex_t)matrix[2][3]);
+      const Vertex_t row3 = deCasteljau(uu, (Vertex_t)matrix[3][0], (Vertex_t)matrix[3][1], (Vertex_t)matrix[3][2], (Vertex_t)matrix[3][3]);
+      
+      const Vertex_t tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3);
+      
+      /* normal = tangentU x tangentV */
+      const Vertex_t n = cross(tangentU,tangentV);
+      
+      return n;     
+    }
+   
+    __forceinline Vertex normal( const float uu, const float vv) const {  // normal at (uu,vv); forwards the members v and f to the static normal
+      return normal(v,f,uu,vv);
+    }    
+    
+    __forceinline void eval(const float u, const float v, 
+                            Vertex* P, Vertex* dPdu, Vertex* dPdv, 
+                            Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv,
+                            const float dscale = 1.0f) const
+    {
+      /* each output group is optional and is selected by its first pointer: P, dPdu, ddPdudu */
+      if (P) *P = eval(u,v);
+      if (dPdu) {  // first derivatives, scaled by dscale; dPdv is only checked by an assert
+        *dPdu = eval_du(u,v)*dscale;
+        assert(dPdv); *dPdv = eval_dv(u,v)*dscale;
+      }
+      if (ddPdudu) {  // second derivatives, scaled by dscale squared; the other two pointers are only asserted
+        const auto dscale_sq = sqr(dscale);
+        *ddPdudu = eval_dudu(u,v)*dscale_sq;
+        assert(ddPdvdv); *ddPdvdv = eval_dvdv(u,v)*dscale_sq;
+        assert(ddPdudv); *ddPdudv = eval_dudv(u,v)*dscale_sq;
+      }
+    }
+
+    template<class vfloat>
+    static __forceinline vfloat eval(const Vertex v[4][4], const Vertex f[2][2], 
+                                     const size_t i, const vfloat& uu, const vfloat& vv, const Vec4<vfloat>& u_n, const Vec4<vfloat>& v_n,
+                                     vfloat& matrix_11, vfloat& matrix_12, vfloat& matrix_22, vfloat& matrix_21)
+    {  // component i of the weighted grid sum; the caller supplies the weights u_n, v_n and the inner points (f, uu, vv are not read here)
+      const vfloat col0 = madd(v_n[0],vfloat(v[0][0][i]),madd(v_n[1],vfloat(v[1][0][i]),madd(v_n[2],vfloat(v[2][0][i]),v_n[3] * vfloat(v[3][0][i]))));
+      const vfloat col1 = madd(v_n[0],vfloat(v[0][1][i]),madd(v_n[1],vfloat(matrix_11 ),madd(v_n[2],vfloat(matrix_21 ),v_n[3] * vfloat(v[3][1][i]))));
+      const vfloat col2 = madd(v_n[0],vfloat(v[0][2][i]),madd(v_n[1],vfloat(matrix_12 ),madd(v_n[2],vfloat(matrix_22 ),v_n[3] * vfloat(v[3][2][i]))));
+      const vfloat col3 = madd(v_n[0],vfloat(v[0][3][i]),madd(v_n[1],vfloat(v[1][3][i]),madd(v_n[2],vfloat(v[2][3][i]),v_n[3] * vfloat(v[3][3][i]))));
+      return madd(u_n[0],col0,madd(u_n[1],col1,madd(u_n[2],col2,u_n[3] * col3)));  // combine the four column sums with the u weights
+    }
+    
+    template<typename vbool, typename vfloat>
+    static __forceinline void eval(const Vertex v[4][4], const Vertex f[2][2], 
+                                   const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                                   float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv,
+                                   const float dscale, const size_t dstride, const size_t N) 
+    {  // wide evaluation: for each of the N components i, the result is stored at <output>+i*dstride via vfloat::store(valid,...)
+      if (P) {  // position, only when requested
+        const Vec4<vfloat> u_n = BezierBasis::eval(uu); 
+        const Vec4<vfloat> v_n = BezierBasis::eval(vv); 
+        for (size_t i=0; i<N; i++) {
+          vfloat matrix_11, matrix_12, matrix_22, matrix_21;
+          computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21); // FIXME: calculated multiple times
+          vfloat::store(valid,P+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21));
+        }
+      }
+      if (dPdu)  // first derivatives: dPdu selects the whole group, dPdv is only checked by an assert
+      {
+        {  // d/du, scaled by dscale
+          assert(dPdu);
+          const Vec4<vfloat> u_n = BezierBasis::derivative(uu); 
+          const Vec4<vfloat> v_n = BezierBasis::eval(vv);
+          for (size_t i=0; i<N; i++) {
+            vfloat matrix_11, matrix_12, matrix_22, matrix_21;
+            computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21);  // FIXME: calculated multiple times
+            vfloat::store(valid,dPdu+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*dscale);
+          }
+        }
+        {  // d/dv, scaled by dscale
+          assert(dPdv);
+          const Vec4<vfloat> u_n = BezierBasis::eval(uu); 
+          const Vec4<vfloat> v_n = BezierBasis::derivative(vv);
+          for (size_t i=0; i<N; i++) {
+            vfloat matrix_11, matrix_12, matrix_22, matrix_21;
+            computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21);  // FIXME: calculated multiple times
+            vfloat::store(valid,dPdv+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*dscale);
+          }
+        }
+      }
+      if (ddPdudu)  // second derivatives: ddPdudu selects the whole group, the other two pointers are only asserted
+      {
+        {  // d2/du2, scaled by dscale squared
+          assert(ddPdudu);
+          const Vec4<vfloat> u_n = BezierBasis::derivative2(uu); 
+          const Vec4<vfloat> v_n = BezierBasis::eval(vv);
+          for (size_t i=0; i<N; i++) {
+            vfloat matrix_11, matrix_12, matrix_22, matrix_21;
+            computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21);  // FIXME: calculated multiple times
+            vfloat::store(valid,ddPdudu+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*sqr(dscale));
+          }
+        }
+        {  // d2/dv2, scaled by dscale squared
+          assert(ddPdvdv);
+          const Vec4<vfloat> u_n = BezierBasis::eval(uu); 
+          const Vec4<vfloat> v_n = BezierBasis::derivative2(vv);
+          for (size_t i=0; i<N; i++) {
+            vfloat matrix_11, matrix_12, matrix_22, matrix_21;
+            computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21);  // FIXME: calculated multiple times
+            vfloat::store(valid,ddPdvdv+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*sqr(dscale));
+          }
+        }
+        {  // d2/dudv, scaled by dscale squared
+          assert(ddPdudv);
+          const Vec4<vfloat> u_n = BezierBasis::derivative(uu); 
+          const Vec4<vfloat> v_n = BezierBasis::derivative(vv);
+          for (size_t i=0; i<N; i++) {
+            vfloat matrix_11, matrix_12, matrix_22, matrix_21;
+            computeInnerVertices(v,f,i,uu,vv,matrix_11,matrix_12,matrix_22,matrix_21);  // FIXME: calculated multiple times
+            vfloat::store(valid,ddPdudv+i*dstride,eval(v,f,i,uu,vv,u_n,v_n,matrix_11,matrix_12,matrix_22,matrix_21)*sqr(dscale));
+          }
+        }
+      }
+    }
+
+    template<typename vbool, typename vfloat>
+    __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, 
+                            float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv,
+                            const float dscale, const size_t dstride, const size_t N) const {
+      eval(v,f,valid,uu,vv,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);  // forwards the members v and f to the static wide eval
+    }
+
+    template<class T>
+      static __forceinline Vec3<T> eval_t(const Vertex matrix[4][4], const Vec3<T> f[2][2], const T& uu, const T& vv) 
+    {
+      typedef typename T::Bool M;
+      const M m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f);
+      /* f+ points: the inner 2x2 entries of the control grid */
+      const Vec3<T> f0_p = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z);
+      const Vec3<T> f1_p = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z);
+      const Vec3<T> f2_p = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z);
+      const Vec3<T> f3_p = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z);
+      /* f- points, passed in separately */
+      const Vec3<T> f0_m = f[0][0];
+      const Vec3<T> f1_m = f[0][1];
+      const Vec3<T> f2_m = f[1][1];
+      const Vec3<T> f3_m = f[1][0];
+      
+      const T one_minus_uu = T(1.0f) - uu;
+      const T one_minus_vv = T(1.0f) - vv;      
+      /* blended inner points, computed for every lane */
+      const Vec3<T> f0_i = (          uu * f0_p +           vv * f0_m) * rcp(uu+vv);
+      const Vec3<T> f1_i = (one_minus_uu * f1_m +           vv * f1_p) * rcp(one_minus_uu+vv);
+      const Vec3<T> f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv);
+      const Vec3<T> f3_i = (          uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv);
+      /* lanes on the patch border take f+ unblended, all other lanes take the blend */
+      const Vec3<T> F0( select(m_border,f0_p.x,f0_i.x), select(m_border,f0_p.y,f0_i.y), select(m_border,f0_p.z,f0_i.z) );
+      const Vec3<T> F1( select(m_border,f1_p.x,f1_i.x), select(m_border,f1_p.y,f1_i.y), select(m_border,f1_p.z,f1_i.z) );
+      const Vec3<T> F2( select(m_border,f2_p.x,f2_i.x), select(m_border,f2_p.y,f2_i.y), select(m_border,f2_p.z,f2_i.z) );
+      const Vec3<T> F3( select(m_border,f3_p.x,f3_i.x), select(m_border,f3_p.y,f3_i.y), select(m_border,f3_p.z,f3_i.z) );
+      /* basis weights written out explicitly: (1-t)^3, 3(1-t)^2 t, 3(1-t) t^2, t^3 for t = uu and t = vv */
+      const T B0_u = one_minus_uu * one_minus_uu * one_minus_uu;
+      const T B0_v = one_minus_vv * one_minus_vv * one_minus_vv;
+      const T B1_u = 3.0f * (one_minus_uu * uu * one_minus_uu);
+      const T B1_v = 3.0f * (one_minus_vv * vv * one_minus_vv);
+      const T B2_u = 3.0f * (uu * one_minus_uu * uu);
+      const T B2_v = 3.0f * (vv * one_minus_vv * vv);
+      const T B3_u = uu * uu * uu;
+      const T B3_v = vv * vv * vv;
+      /* per-component weighted sum over the 4x4 grid; F0, F1, F3, F2 replace entries [1][1], [1][2], [2][1], [2][2] */
+      const T x = madd(B0_v,madd(B0_u,matrix[0][0].x,madd(B1_u,matrix[0][1].x,madd(B2_u,matrix[0][2].x,B3_u * matrix[0][3].x))), 
+                  madd(B1_v,madd(B0_u,matrix[1][0].x,madd(B1_u,F0.x          ,madd(B2_u,F1.x          ,B3_u * matrix[1][3].x))), 
+                  madd(B2_v,madd(B0_u,matrix[2][0].x,madd(B1_u,F3.x          ,madd(B2_u,F2.x          ,B3_u * matrix[2][3].x))), 
+                       B3_v*madd(B0_u,matrix[3][0].x,madd(B1_u,matrix[3][1].x,madd(B2_u,matrix[3][2].x,B3_u * matrix[3][3].x)))))); 
+
+      const T y = madd(B0_v,madd(B0_u,matrix[0][0].y,madd(B1_u,matrix[0][1].y,madd(B2_u,matrix[0][2].y,B3_u * matrix[0][3].y))),
+                  madd(B1_v,madd(B0_u,matrix[1][0].y,madd(B1_u,F0.y          ,madd(B2_u,F1.y          ,B3_u * matrix[1][3].y))),
+                  madd(B2_v,madd(B0_u,matrix[2][0].y,madd(B1_u,F3.y          ,madd(B2_u,F2.y          ,B3_u * matrix[2][3].y))),
+                       B3_v*madd(B0_u,matrix[3][0].y,madd(B1_u,matrix[3][1].y,madd(B2_u,matrix[3][2].y,B3_u * matrix[3][3].y))))));
+      
+      const T z = madd(B0_v,madd(B0_u,matrix[0][0].z,madd(B1_u,matrix[0][1].z,madd(B2_u,matrix[0][2].z,B3_u * matrix[0][3].z))),
+                  madd(B1_v,madd(B0_u,matrix[1][0].z,madd(B1_u,F0.z          ,madd(B2_u,F1.z          ,B3_u * matrix[1][3].z))),
+                  madd(B2_v,madd(B0_u,matrix[2][0].z,madd(B1_u,F3.z          ,madd(B2_u,F2.z          ,B3_u * matrix[2][3].z))),
+                       B3_v*madd(B0_u,matrix[3][0].z,madd(B1_u,matrix[3][1].z,madd(B2_u,matrix[3][2].z,B3_u * matrix[3][3].z))))));
+      
+      return Vec3<T>(x,y,z);
+    }
+
+    template<class T>
+    __forceinline Vec3<T> eval(const T& uu, const T& vv) const 
+    {  // wide position evaluation: converts the four f members to Vec3<T> and forwards to eval_t
+      Vec3<T> ff[2][2];
+      ff[0][0] = Vec3<T>(f[0][0]);
+      ff[0][1] = Vec3<T>(f[0][1]);
+      ff[1][1] = Vec3<T>(f[1][1]);
+      ff[1][0] = Vec3<T>(f[1][0]);
+      return eval_t(v,ff,uu,vv);
+    }
+
+    template<class T>
+      static __forceinline Vec3<T> normal_t(const Vertex matrix[4][4], const Vec3<T> f[2][2], const T& uu, const T& vv) 
+    {
+      typedef typename T::Bool M;
+      /* f+ points: the inner 2x2 entries of the control grid */
+      const Vec3<T> f0_p = Vec3<T>(matrix[1][1].x,matrix[1][1].y,matrix[1][1].z);
+      const Vec3<T> f1_p = Vec3<T>(matrix[1][2].x,matrix[1][2].y,matrix[1][2].z);
+      const Vec3<T> f2_p = Vec3<T>(matrix[2][2].x,matrix[2][2].y,matrix[2][2].z);
+      const Vec3<T> f3_p = Vec3<T>(matrix[2][1].x,matrix[2][1].y,matrix[2][1].z);
+      /* f- points, passed in separately */
+      const Vec3<T> f0_m = f[0][0];
+      const Vec3<T> f1_m = f[0][1];
+      const Vec3<T> f2_m = f[1][1];
+      const Vec3<T> f3_m = f[1][0];
+      
+      const T one_minus_uu = T(1.0f) - uu;
+      const T one_minus_vv = T(1.0f) - vv;      
+      /* blended inner points, computed for every lane */
+      const Vec3<T> f0_i = (          uu * f0_p +           vv * f0_m) * rcp(uu+vv);
+      const Vec3<T> f1_i = (one_minus_uu * f1_m +           vv * f1_p) * rcp(one_minus_uu+vv);
+      const Vec3<T> f2_i = (one_minus_uu * f2_p + one_minus_vv * f2_m) * rcp(one_minus_uu+one_minus_vv);
+      const Vec3<T> f3_i = (          uu * f3_m + one_minus_vv * f3_p) * rcp(uu+one_minus_vv);
+      /* active branch: f+ replaces a blend only at the one corner where that blend's denominator is zero; the disabled branch uses the whole border */
+#if 1
+      const M m_corner0 = (uu == 0.0f) & (vv == 0.0f);
+      const M m_corner1 = (uu == 1.0f) & (vv == 0.0f);
+      const M m_corner2 = (uu == 1.0f) & (vv == 1.0f);
+      const M m_corner3 = (uu == 0.0f) & (vv == 1.0f);      
+      const Vec3<T> matrix_11( select(m_corner0,f0_p.x,f0_i.x), select(m_corner0,f0_p.y,f0_i.y), select(m_corner0,f0_p.z,f0_i.z) );
+      const Vec3<T> matrix_12( select(m_corner1,f1_p.x,f1_i.x), select(m_corner1,f1_p.y,f1_i.y), select(m_corner1,f1_p.z,f1_i.z) );
+      const Vec3<T> matrix_22( select(m_corner2,f2_p.x,f2_i.x), select(m_corner2,f2_p.y,f2_i.y), select(m_corner2,f2_p.z,f2_i.z) );
+      const Vec3<T> matrix_21( select(m_corner3,f3_p.x,f3_i.x), select(m_corner3,f3_p.y,f3_i.y), select(m_corner3,f3_p.z,f3_i.z) );
+#else
+      const M m_border = (uu == 0.0f) | (uu == 1.0f) | (vv == 0.0f) | (vv == 1.0f);
+      const Vec3<T> matrix_11( select(m_border,f0_p.x,f0_i.x), select(m_border,f0_p.y,f0_i.y), select(m_border,f0_p.z,f0_i.z) );
+      const Vec3<T> matrix_12( select(m_border,f1_p.x,f1_i.x), select(m_border,f1_p.y,f1_i.y), select(m_border,f1_p.z,f1_i.z) );
+      const Vec3<T> matrix_22( select(m_border,f2_p.x,f2_i.x), select(m_border,f2_p.y,f2_i.y), select(m_border,f2_p.z,f2_i.z) );
+      const Vec3<T> matrix_21( select(m_border,f3_p.x,f3_i.x), select(m_border,f3_p.y,f3_i.y), select(m_border,f3_p.z,f3_i.z) );
+#endif
+      /* the twelve outer control points, rebuilt as Vec3<T> from the x, y, z of each grid entry */
+      const Vec3<T> matrix_00 = Vec3<T>(matrix[0][0].x,matrix[0][0].y,matrix[0][0].z);
+      const Vec3<T> matrix_10 = Vec3<T>(matrix[1][0].x,matrix[1][0].y,matrix[1][0].z);
+      const Vec3<T> matrix_20 = Vec3<T>(matrix[2][0].x,matrix[2][0].y,matrix[2][0].z);
+      const Vec3<T> matrix_30 = Vec3<T>(matrix[3][0].x,matrix[3][0].y,matrix[3][0].z);
+      
+      const Vec3<T> matrix_01 = Vec3<T>(matrix[0][1].x,matrix[0][1].y,matrix[0][1].z);
+      const Vec3<T> matrix_02 = Vec3<T>(matrix[0][2].x,matrix[0][2].y,matrix[0][2].z);
+      const Vec3<T> matrix_03 = Vec3<T>(matrix[0][3].x,matrix[0][3].y,matrix[0][3].z);
+      
+      const Vec3<T> matrix_31 = Vec3<T>(matrix[3][1].x,matrix[3][1].y,matrix[3][1].z);
+      const Vec3<T> matrix_32 = Vec3<T>(matrix[3][2].x,matrix[3][2].y,matrix[3][2].z);
+      const Vec3<T> matrix_33 = Vec3<T>(matrix[3][3].x,matrix[3][3].y,matrix[3][3].z);
+      
+      const Vec3<T> matrix_13 = Vec3<T>(matrix[1][3].x,matrix[1][3].y,matrix[1][3].z);
+      const Vec3<T> matrix_23 = Vec3<T>(matrix[2][3].x,matrix[2][3].y,matrix[2][3].z);
+      
+      /* tangentU: reduce each grid column at vv first, then take the tangent of the four results at uu */
+      const Vec3<T> col0 = deCasteljau(vv, matrix_00, matrix_10, matrix_20, matrix_30);
+      const Vec3<T> col1 = deCasteljau(vv, matrix_01, matrix_11, matrix_21, matrix_31);
+      const Vec3<T> col2 = deCasteljau(vv, matrix_02, matrix_12, matrix_22, matrix_32);
+      const Vec3<T> col3 = deCasteljau(vv, matrix_03, matrix_13, matrix_23, matrix_33);
+      
+      const Vec3<T> tangentU = deCasteljau_tangent(uu, col0, col1, col2, col3);
+      
+      /* tangentV: reduce each grid row at uu first, then take the tangent of the four results at vv */
+      const Vec3<T> row0 = deCasteljau(uu, matrix_00, matrix_01, matrix_02, matrix_03);
+      const Vec3<T> row1 = deCasteljau(uu, matrix_10, matrix_11, matrix_12, matrix_13);
+      const Vec3<T> row2 = deCasteljau(uu, matrix_20, matrix_21, matrix_22, matrix_23);
+      const Vec3<T> row3 = deCasteljau(uu, matrix_30, matrix_31, matrix_32, matrix_33);
+      
+      const Vec3<T> tangentV = deCasteljau_tangent(vv, row0, row1, row2, row3);
+      
+      /* normal = tangentU x tangentV */
+      const Vec3<T> n = cross(tangentU,tangentV);
+      return n;
+    }
+
+     template<class T>
+    __forceinline Vec3<T> normal(const T& uu, const T& vv) const 
+    {  // wide normal evaluation: converts the four f members to Vec3<T> and forwards to normal_t
+      Vec3<T> ff[2][2];
+      ff[0][0] = Vec3<T>(f[0][0]);
+      ff[0][1] = Vec3<T>(f[0][1]);
+      ff[1][1] = Vec3<T>(f[1][1]);
+      ff[1][0] = Vec3<T>(f[1][0]);
+      return normal_t(v,ff,uu,vv);
+    }
+
+    __forceinline BBox<Vertex> bounds() const  // box around all 16 entries of v and all 4 entries of f
+    {
+      const Vertex *const cv = &v[0][0];  // walk the 4x4 grid as 16 consecutive vertices
+      BBox<Vertex> bounds (cv[0]);
+      for (size_t i=1; i<16; i++) 
+        bounds.extend( cv[i] );
+      bounds.extend(f[0][0]);
+      bounds.extend(f[0][1]);  // read by computeInnerVertices when blending matrix_12, so it must be inside the box
+      bounds.extend(f[1][0]);
+      bounds.extend(f[1][1]);
+      return bounds;
+    }
+    
+    friend embree_ostream operator<<(embree_ostream o, const GregoryPatchT& p)
+    {
+      for (size_t row=0; row<4; row++)  // the 4x4 grid v, one entry per line
+        for (size_t col=0; col<4; col++)
+          o << "v[" << row << "][" << col << "] " << p.v[row][col] << embree_endl;
+      /* followed by the 2x2 array f, in the same layout */
+      for (size_t row=0; row<2; row++)
+        for (size_t col=0; col<2; col++)
+          o << "f[" << row << "][" << col << "] " << p.f[row][col] << embree_endl;
+      return o;
+    } 
+  };
+
+  typedef GregoryPatchT<Vec3fa,Vec3fa_t> GregoryPatch3fa;
+
+  template<typename Vertex, typename Vertex_t>
+    __forceinline  BezierPatchT<Vertex,Vertex_t>::BezierPatchT (const HalfEdge* edge, const char* vertices, size_t stride) 
+  {
+    /* half edge -> Catmull-Clark patch -> Gregory patch -> convert_to_bezier() -> copy of its 4x4 grid */
+    CatmullClarkPatchT<Vertex,Vertex_t> ccpatch(edge,vertices,stride);
+    GregoryPatchT<Vertex,Vertex_t> gregory(ccpatch);
+    gregory.convert_to_bezier();
+    for (size_t k=0; k<16; k++)  // row-major walk over the 4x4 grid
+      matrix[k/4][k%4] = (Vertex_t)gregory.v[k/4][k%4];
+  }
+  
+   template<typename Vertex, typename Vertex_t>
+    __forceinline BezierPatchT<Vertex,Vertex_t>::BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch) 
+    {
+      /* Catmull-Clark patch -> Gregory patch -> convert_to_bezier() -> copy of its 4x4 grid */
+      GregoryPatchT<Vertex,Vertex_t> gregory(patch);
+      gregory.convert_to_bezier();
+      for (size_t k=0; k<16; k++)  // row-major walk over the 4x4 grid
+        matrix[k/4][k%4] = (Vertex_t)gregory.v[k/4][k%4];
+    }
+
+   template<typename Vertex, typename Vertex_t>
+     __forceinline BezierPatchT<Vertex,Vertex_t>::BezierPatchT(const CatmullClarkPatchT<Vertex,Vertex_t>& patch, 
+                                                               const BezierCurveT<Vertex>* border0,
+                                                               const BezierCurveT<Vertex>* border1,
+                                                               const BezierCurveT<Vertex>* border2,
+                                                               const BezierCurveT<Vertex>* border3) 
+    {
+      /* as the single-argument constructor, except that the four border curves are handed to the Gregory patch */
+      GregoryPatchT<Vertex,Vertex_t> gregory(patch,border0,border1,border2,border3);
+      gregory.convert_to_bezier();
+      for (size_t k=0; k<16; k++)  // row-major walk over the 4x4 grid
+        matrix[k/4][k%4] = (Vertex_t)gregory.v[k/4][k%4];
+    }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch_dense.h b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch_dense.h
new file mode 100644
index 0000000000..85effd02cf
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/gregory_patch_dense.h
@@ -0,0 +1,113 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "gregory_patch.h"
+
+namespace embree
+{  
+  class __aligned(64) DenseGregoryPatch3fa
+  { // Gregory patch kept in a single 4x4 Vec3ff matrix; the f points live in the w components
+    typedef Vec3fa Vec3fa_4x4[4][4];
+  public:
+    /*! copies patch.v into matrix and stores the four patch.f points in the w components */
+    __forceinline DenseGregoryPatch3fa (const GregoryPatch3fa& patch)
+    {
+      for (size_t y=0; y<4; y++)
+	for (size_t x=0; x<4; x++)
+	  matrix[y][x] = Vec3ff(patch.v[y][x], 0.0f);
+      /* row r keeps one f point in the w components of columns 0..2; column 3 gets w = 0 */
+      matrix[0][0].w = patch.f[0][0].x;
+      matrix[0][1].w = patch.f[0][0].y;
+      matrix[0][2].w = patch.f[0][0].z;
+      matrix[0][3].w = 0.0f;
+      
+      matrix[1][0].w = patch.f[0][1].x;
+      matrix[1][1].w = patch.f[0][1].y;
+      matrix[1][2].w = patch.f[0][1].z;
+      matrix[1][3].w = 0.0f;
+      
+      matrix[2][0].w = patch.f[1][1].x;
+      matrix[2][1].w = patch.f[1][1].y;
+      matrix[2][2].w = patch.f[1][1].z;
+      matrix[2][3].w = 0.0f;
+      
+      matrix[3][0].w = patch.f[1][0].x;
+      matrix[3][1].w = patch.f[1][0].y;
+      matrix[3][2].w = patch.f[1][0].z;
+      matrix[3][3].w = 0.0f;
+    }
+    /*! reads the four f points back from the w components (rows 0..3 -> f[0][0], f[0][1], f[1][1], f[1][0]) */
+    __forceinline void extract_f_m(Vec3fa f_m[2][2]) const
+    {
+      f_m[0][0] = Vec3fa( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w );
+      f_m[0][1] = Vec3fa( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w );
+      f_m[1][1] = Vec3fa( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w );
+      f_m[1][0] = Vec3fa( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w );      
+    }
+    /*! evaluates the position at (uu,vv) by forwarding to GregoryPatch3fa::eval */
+    __forceinline Vec3fa eval(const float uu, const float vv) const
+    {
+      __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m);
+      return GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,uu,vv); // NOTE(review): the cast treats the Vec3ff storage as Vec3fa[4][4]; presumably both share a layout - confirm
+    }
+    /*! evaluates the normal at (uu,vv) by forwarding to GregoryPatch3fa::normal */
+    __forceinline Vec3fa normal(const float uu, const float vv) const
+    {
+      __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m);
+      return GregoryPatch3fa::normal(*(Vec3fa_4x4*)&matrix,f_m,uu,vv);
+    }
+    /*! variant of eval for parameter type T; rebuilds the f points as Vec3<T> and forwards to GregoryPatch3fa::eval_t */
+    template<class T>
+      __forceinline Vec3<T> eval(const T &uu, const T &vv) const 
+    {
+      Vec3<T> f_m[2][2];
+      f_m[0][0] = Vec3<T>( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w );
+      f_m[0][1] = Vec3<T>( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w );
+      f_m[1][1] = Vec3<T>( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w );
+      f_m[1][0] = Vec3<T>( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w );
+      return GregoryPatch3fa::eval_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv);
+    }
+    /*! variant of normal for parameter type T; rebuilds the f points as Vec3<T> and forwards to GregoryPatch3fa::normal_t */
+    template<class T>
+      __forceinline Vec3<T> normal(const T &uu, const T &vv) const 
+    {
+      Vec3<T> f_m[2][2];
+      f_m[0][0] = Vec3<T>( matrix[0][0].w, matrix[0][1].w, matrix[0][2].w );
+      f_m[0][1] = Vec3<T>( matrix[1][0].w, matrix[1][1].w, matrix[1][2].w );
+      f_m[1][1] = Vec3<T>( matrix[2][0].w, matrix[2][1].w, matrix[2][2].w );
+      f_m[1][0] = Vec3<T>( matrix[3][0].w, matrix[3][1].w, matrix[3][2].w );
+      return GregoryPatch3fa::normal_t(*(Vec3fa_4x4*)&matrix,f_m,uu,vv);
+    }
+    /*! writes position and derivatives at (u,v); P, dPdu and ddPdudu each gate their group of outputs */
+    __forceinline void eval(const float u, const float v, 
+                            Vec3fa* P, Vec3fa* dPdu, Vec3fa* dPdv, Vec3fa* ddPdudu, Vec3fa* ddPdvdv, Vec3fa* ddPdudv,
+                            const float dscale = 1.0f) const // first derivatives are scaled by dscale, second derivatives by sqr(dscale)
+    { // NOTE(review): dPdv is only assert-checked when dPdu is set, likewise ddPdvdv/ddPdudv when ddPdudu is set
+      __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m);
+      if (P) {
+        *P    = GregoryPatch3fa::eval(*(Vec3fa_4x4*)&matrix,f_m,u,v); 
+      }
+      if (dPdu) {
+        assert(dPdu); *dPdu = GregoryPatch3fa::eval_du(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; 
+        assert(dPdv); *dPdv = GregoryPatch3fa::eval_dv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*dscale; 
+      }
+      if (ddPdudu) {
+        assert(ddPdudu); *ddPdudu = GregoryPatch3fa::eval_dudu(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); 
+        assert(ddPdvdv); *ddPdvdv = GregoryPatch3fa::eval_dvdv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); 
+        assert(ddPdudv); *ddPdudv = GregoryPatch3fa::eval_dudv(*(Vec3fa_4x4*)&matrix,f_m,u,v)*sqr(dscale); 
+      }
+    }
+    /*! variant templated on vbool/vfloat; forwards all arguments plus the unpacked f points to GregoryPatch3fa::eval */
+    template<typename vbool, typename vfloat>
+    __forceinline void eval(const vbool& valid, const vfloat& uu, const vfloat& vv, float* P, float* dPdu, float* dPdv, const float dscale, const size_t dstride, const size_t N) const 
+    {
+      __aligned(64) Vec3fa f_m[2][2]; extract_f_m(f_m);
+      GregoryPatch3fa::eval(matrix,f_m,valid,uu,vv,P,dPdu,dPdv,dscale,dstride,N);
+    }
+
+  private:
+    Vec3ff matrix[4][4]; // f_p/m points are stored in 4th component
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/gridrange.h b/thirdparty/embree-aarch64/kernels/subdiv/gridrange.h
new file mode 100644
index 0000000000..4fd741c879
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/gridrange.h
@@ -0,0 +1,96 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+
+namespace embree
+{
+  struct __aligned(16) GridRange
+  { // rectangular range in (u,v); width()/height() show that both bounds are inclusive
+    unsigned int u_start; //!< lower u bound (inclusive)
+    unsigned int u_end;   //!< upper u bound (inclusive)
+    unsigned int v_start; //!< lower v bound (inclusive)
+    unsigned int v_end;   //!< upper v bound (inclusive)
+    /*! leaves all four bounds uninitialized */
+    __forceinline GridRange() {}
+
+    __forceinline GridRange(unsigned int u_start, unsigned int u_end, unsigned int v_start, unsigned int v_end) 
+      : u_start(u_start), u_end(u_end), v_start(v_start), v_end(v_end) {}
+    /*! number of positions covered in u (u_end-u_start+1) */
+    __forceinline unsigned int width() const {
+      return u_end-u_start+1;
+    }
+    /*! number of positions covered in v (v_end-v_start+1) */
+    __forceinline unsigned int height() const {
+      return v_end-v_start+1;
+    }
+    /*! true if the range covers at most 3 positions in u and at most 3 in v */
+    __forceinline bool hasLeafSize() const
+    {
+      const unsigned int u_size = u_end-u_start+1;
+      const unsigned int v_size = v_end-v_start+1;
+      assert(u_size >= 1);
+      assert(v_size >= 1);
+      return u_size <= 3 && v_size <= 3;
+    }
+    /*! returns (start+end)/2 and asserts that it lies strictly between start and end */
+    static __forceinline unsigned int split(unsigned int start,unsigned int end)
+    {
+      const unsigned int center = (start+end)/2;
+      assert (center > start);
+      assert (center < end);
+      return center;
+    }
+    /*! halves the range along its larger extent (u on ties); r0 and r1 share the split position */
+    __forceinline void split(GridRange& r0, GridRange& r1) const
+    {
+      assert( hasLeafSize() == false );
+      const unsigned int u_size = u_end-u_start+1;
+      const unsigned int v_size = v_end-v_start+1;
+      r0 = *this;
+      r1 = *this;
+
+      if (u_size >= v_size)
+      {
+        const unsigned int u_mid = split(u_start,u_end);
+        r0.u_end   = u_mid;
+        r1.u_start = u_mid;
+      }
+      else
+      {
+        const unsigned int v_mid = split(v_start,v_end);
+        r0.v_end   = v_mid;
+        r1.v_start = v_mid;
+      }
+    }
+    /*! splits once, then splits each half again unless it has leaf size; writes 2 to 4 ranges to r and returns their count */
+    __forceinline unsigned int splitIntoSubRanges(GridRange r[4]) const
+    {
+      assert( !hasLeafSize() );
+      unsigned int children = 0;
+      GridRange first,second;
+      split(first,second);
+
+      if (first.hasLeafSize()) {
+        r[0] = first;
+        children++;
+      } 
+      else {
+        first.split(r[0],r[1]);
+        children += 2;
+      }
+
+      if (second.hasLeafSize())	{
+        r[children] = second;
+        children++;
+      }
+      else {
+        second.split(r[children+0],r[children+1]);
+        children += 2;
+      }
+      return children;      
+    }
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/half_edge.h b/thirdparty/embree-aarch64/kernels/subdiv/half_edge.h
new file mode 100644
index 0000000000..fb350ca71f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/half_edge.h
@@ -0,0 +1,371 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_coefficients.h"
+
+namespace embree
+{
+  class __aligned(32) HalfEdge
+  {
+    friend class SubdivMesh;
+    public:
+
+    enum PatchType : char { 
+      BILINEAR_PATCH        = 0, //!< a bilinear patch
+      REGULAR_QUAD_PATCH    = 1, //!< a regular quad patch can be represented as a B-Spline
+      IRREGULAR_QUAD_PATCH  = 2, //!< an irregular quad patch can be represented as a Gregory patch
+      COMPLEX_PATCH         = 3  //!< these patches need subdivision and cannot be processed by the above fast code paths
+    };
+    
+    enum VertexType : char { 
+      REGULAR_VERTEX           = 0, //!< regular vertex
+      NON_MANIFOLD_EDGE_VERTEX = 1, //!< vertex of a non-manifold edge
+    };
+    /*! returns the patch type with the larger enum value */
+    __forceinline friend PatchType max( const PatchType& ty0, const PatchType& ty1) {
+      return (PatchType) max((int)ty0,(int)ty1);
+    }
+    
+    struct Edge 
+    {
+      /*! edge constructor */
+      __forceinline Edge(const uint32_t v0, const uint32_t v1)
+	: v0(v0), v1(v1) {}
+
+      /*! creates a 64-bit identifier that is the same for both orientations of the edge (larger index in the upper 32 bits) */
+      __forceinline operator uint64_t() const       
+      {
+	uint32_t p0 = v0, p1 = v1;
+	if (p0<p1) std::swap(p0,p1);
+	return (((uint64_t)p0) << 32) | (uint64_t)p1;
+      }
+
+    public:
+      uint32_t v0,v1;    //!< start and end vertex of the edge
+    };
+    /*! creates an unconnected half edge: all offsets and weights 0, vtx_index = unsigned(-1), patch type COMPLEX_PATCH */
+    HalfEdge () 
+      : vtx_index(-1), next_half_edge_ofs(0), prev_half_edge_ofs(0), opposite_half_edge_ofs(0), edge_crease_weight(0), 
+      vertex_crease_weight(0), edge_level(0), patch_type(COMPLEX_PATCH), vertex_type(REGULAR_VERTEX)
+    {
+      static_assert(sizeof(HalfEdge) == 32, "invalid half edge size");
+    }
+    /*! links are element offsets relative to this (resolved as &this[ofs]); an offset of 0 means "not set" */
+    __forceinline bool hasOpposite() const { return opposite_half_edge_ofs != 0; }
+    __forceinline void setOpposite(HalfEdge* opposite) { opposite_half_edge_ofs = int(opposite-this); }
+    
+    __forceinline       HalfEdge* next()       { assert( next_half_edge_ofs != 0 ); return &this[next_half_edge_ofs]; }
+    __forceinline const HalfEdge* next() const { assert( next_half_edge_ofs != 0 ); return &this[next_half_edge_ofs]; }
+    
+    __forceinline       HalfEdge* prev()       { assert( prev_half_edge_ofs != 0 ); return &this[prev_half_edge_ofs]; }
+    __forceinline const HalfEdge* prev() const { assert( prev_half_edge_ofs != 0 ); return &this[prev_half_edge_ofs]; }
+    
+    __forceinline       HalfEdge* opposite()       { assert( opposite_half_edge_ofs != 0 ); return &this[opposite_half_edge_ofs]; }
+    __forceinline const HalfEdge* opposite() const { assert( opposite_half_edge_ofs != 0 ); return &this[opposite_half_edge_ofs]; }
+    
+    __forceinline       HalfEdge* rotate()       { return opposite()->next(); }
+    __forceinline const HalfEdge* rotate() const { return opposite()->next(); }
+    
+    __forceinline unsigned int getStartVertexIndex() const { return vtx_index; }
+    __forceinline unsigned int getEndVertexIndex  () const { return next()->vtx_index; }
+    __forceinline Edge         getEdge            () const { return Edge(getStartVertexIndex(),getEndVertexIndex()); }
+   
+    
+    /*! classifies the start vertex of the edge as a PatchType by walking the faces around it */
+    __forceinline PatchType vertexType() const
+    {
+      const HalfEdge* p = this;
+      size_t face_valence = 0;
+      bool hasBorder = false;
+      
+      do
+      {
+        /* we need subdivision to handle edge creases */
+        if (p->hasOpposite() && p->edge_crease_weight > 0.0f) 
+          return COMPLEX_PATCH;
+        
+        face_valence++;
+        
+        /* test for quad */
+        const HalfEdge* pp = p;
+        pp = pp->next(); if (pp == p) return COMPLEX_PATCH;
+        pp = pp->next(); if (pp == p) return COMPLEX_PATCH;
+        pp = pp->next(); if (pp == p) return COMPLEX_PATCH;
+        pp = pp->next(); if (pp != p) return COMPLEX_PATCH;
+        
+        /* continue with next face */
+        p = p->prev();
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else
+        {
+          face_valence++;
+          hasBorder = true;
+          p = this;
+          while (p->hasOpposite()) 
+            p = p->rotate();
+        }
+      } while (p != this); 
+      
+      /* calculate vertex type */
+      if (face_valence == 2 && hasBorder) {
+        if      (vertex_crease_weight == 0.0f      ) return REGULAR_QUAD_PATCH;
+        else if (vertex_crease_weight == float(inf)) return REGULAR_QUAD_PATCH;
+        else                                         return COMPLEX_PATCH;
+      }
+      else if (vertex_crease_weight != 0.0f)         return COMPLEX_PATCH;
+      else if (face_valence == 3 &&  hasBorder)      return REGULAR_QUAD_PATCH;
+      else if (face_valence == 4 && !hasBorder)      return REGULAR_QUAD_PATCH;
+      else                                           return IRREGULAR_QUAD_PATCH;
+    }
+
+    /*! tests if both the vertex crease weight and the edge crease weight of this half edge are infinite */
+    __forceinline bool bilinearVertex() const {
+      return vertex_crease_weight == float(inf) && edge_crease_weight == float(inf);
+    }
+    
+    /*! calculates the type of the patch; any face that is not a quad yields COMPLEX_PATCH */
+    __forceinline PatchType patchType() const 
+    {
+      const HalfEdge* p = this;
+      PatchType ret = REGULAR_QUAD_PATCH;
+      bool bilinear = true;
+      
+      ret = max(ret,p->vertexType());
+      bilinear &= p->bilinearVertex();
+      if ((p = p->next()) == this) return COMPLEX_PATCH;
+      
+      ret = max(ret,p->vertexType());
+      bilinear &= p->bilinearVertex();
+      if ((p = p->next()) == this) return COMPLEX_PATCH;
+      
+      ret = max(ret,p->vertexType());
+      bilinear &= p->bilinearVertex();
+      if ((p = p->next()) == this) return COMPLEX_PATCH;
+      
+      ret = max(ret,p->vertexType());
+      bilinear &= p->bilinearVertex();
+      if ((p = p->next()) != this) return COMPLEX_PATCH;
+      
+      if (bilinear) return BILINEAR_PATCH;
+      return ret;
+    }
+    
+    /*! tests if the face is a regular b-spline face */
+    __forceinline bool isRegularFace() const {
+      return patch_type == REGULAR_QUAD_PATCH;
+    }
+    
+    /*! tests if the face can be diced (using bspline or gregory patch) */
+    __forceinline bool isGregoryFace() const {
+      return patch_type == IRREGULAR_QUAD_PATCH || patch_type == REGULAR_QUAD_PATCH;
+    }
+    
+    /*! tests if the base vertex of this half edge is a corner vertex */
+    __forceinline bool isCorner() const {
+      return !hasOpposite() && !prev()->hasOpposite();
+    }
+
+    /*! tests if the vertex is attached to any border */
+    __forceinline bool vertexHasBorder() const 
+    {
+      const HalfEdge* p = this;
+      do {
+        if (!p->hasOpposite()) return true;
+        p = p->rotate();
+      } while (p != this);
+      return false;
+    }
+    
+    /*! tests if the face this half edge belongs to has some border */
+    __forceinline bool faceHasBorder() const 
+    {
+      const HalfEdge* p = this;
+      do {
+        if (p->vertexHasBorder()) return true;
+        p = p->next();
+      } while (p != this);
+      return false;
+    }
+    
+    /*! calculates conservative bounds of a catmull clark subdivision face */
+    __forceinline BBox3fa bounds(const BufferView<Vec3fa>& vertices) const
+    {
+      BBox3fa bounds = this->get1RingBounds(vertices);
+      for (const HalfEdge* p=this->next(); p!=this; p=p->next())
+        bounds.extend(p->get1RingBounds(vertices));
+      return bounds;
+    }
+    
+    /*! tests if this is a valid patch: every 1-ring passes validRing and the face has 3 to MAX_PATCH_VALENCE edges */
+    __forceinline bool valid(const BufferView<Vec3fa>& vertices) const
+    {
+      size_t N = 1;
+      if (!this->validRing(vertices)) return false;
+      for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++) {
+        if (!p->validRing(vertices)) return false;
+      }
+      return N >= 3 && N <= MAX_PATCH_VALENCE;
+    }
+    
+    /*! counts number of polygon edges  */
+    __forceinline unsigned int numEdges() const
+    {
+      unsigned int N = 1;
+      for (const HalfEdge* p=this->next(); p!=this; p=p->next(), N++);
+      return N;
+    }
+
+    /*! calculates face and edge valence */
+    __forceinline void calculateFaceValenceAndEdgeValence(size_t& faceValence, size_t& edgeValence) const 
+    {
+      faceValence = 0;
+      edgeValence = 0;
+      
+      const HalfEdge* p = this;
+      do 
+      {
+        /* count the edges of the current face */
+        unsigned int numEdges = p->numEdges();
+        assert(numEdges >= 3);
+        edgeValence += numEdges-2;
+        
+        faceValence++;
+        p = p->prev();
+        
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else {
+          faceValence++;
+          edgeValence++;
+          p = this;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+        
+      } while (p != this); 
+    }
+
+    /*! stream output */
+    friend __forceinline std::ostream &operator<<(std::ostream &o, const HalfEdge &h)
+    {
+      return o << "{ " << 
+        "vertex = " << h.vtx_index << ", " << //" -> " << h.next()->vtx_index << ", " << 
+        "prev = " << h.prev_half_edge_ofs << ", " << 
+        "next = " << h.next_half_edge_ofs << ", " << 
+        "opposite = " << h.opposite_half_edge_ofs << ", " << 
+        "edge_crease = " << h.edge_crease_weight << ", " << 
+        "vertex_crease = " << h.vertex_crease_weight << ", " << 
+        //"edge_level = " << h.edge_level << 
+        " }";
+    } 
+    
+  private:
+    
+    /*! calculates the bounds of the face associated with the half-edge */
+    __forceinline BBox3fa getFaceBounds(const BufferView<Vec3fa>& vertices) const 
+    {
+      BBox3fa b = vertices[getStartVertexIndex()];
+      for (const HalfEdge* p = next(); p!=this; p=p->next()) {
+        b.extend(vertices[p->getStartVertexIndex()]);
+      }
+      return b;
+    }
+    
+    /*! calculates the bounds of the 1-ring associated with the vertex of the half-edge */
+    __forceinline BBox3fa get1RingBounds(const BufferView<Vec3fa>& vertices) const 
+    {
+      BBox3fa bounds = empty;
+      const HalfEdge* p = this;
+      do 
+      {
+        /* calculate bounds of current face */
+        bounds.extend(p->getFaceBounds(vertices));
+        p = p->prev();
+        
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else {
+          p = this;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+        
+      } while (p != this); 
+      
+      return bounds;
+    }
+    
+    /*! tests if all vertices of the face pass isvalid and the face has 3 to MAX_PATCH_VALENCE edges; adds #edges-2 to N unless a vertex is invalid */
+    __forceinline bool validFace(const BufferView<Vec3fa>& vertices, size_t& N) const 
+    {
+      const Vec3fa v = vertices[getStartVertexIndex()];
+      if (!isvalid(v)) return false;
+      size_t n = 1;
+      for (const HalfEdge* p = next(); p!=this; p=p->next(), n++) {
+        const Vec3fa v = vertices[p->getStartVertexIndex()];
+        if (!isvalid(v)) return false;
+      }
+      N += n-2;
+      return n >= 3 && n <= MAX_PATCH_VALENCE;
+    }
+    
+    /*! tests if all faces around the start vertex are valid and the ring stays within MAX_RING_FACE_VALENCE and MAX_RING_EDGE_VALENCE */
+    __forceinline bool validRing(const BufferView<Vec3fa>& vertices) const 
+    {
+      size_t faceValence = 0;
+      size_t edgeValence = 0;
+      
+      const HalfEdge* p = this;
+      do 
+      {
+        /* validate the current face and accumulate its edge valence */
+        if (!p->validFace(vertices,edgeValence)) 
+          return false;
+        
+        faceValence++;
+        p = p->prev();
+        
+        /* continue with next face */
+        if (likely(p->hasOpposite())) 
+          p = p->opposite();
+        
+        /* if there is no opposite go the long way to the other side of the border */
+        else {
+          faceValence++;
+          edgeValence++;
+          p = this;
+          while (p->hasOpposite()) 
+            p = p->opposite()->next();
+        }
+        
+      } while (p != this); 
+      
+      return faceValence <= MAX_RING_FACE_VALENCE && edgeValence <= MAX_RING_EDGE_VALENCE;
+    }
+    
+  private:
+    unsigned int vtx_index;         //!< index of edge start vertex
+    int next_half_edge_ofs;         //!< relative offset to next half edge of face
+    int prev_half_edge_ofs;         //!< relative offset to previous half edge of face
+    int opposite_half_edge_ofs;     //!< relative offset to opposite half edge
+    
+  public:
+    float edge_crease_weight;       //!< crease weight attached to edge
+    float vertex_crease_weight;     //!< crease weight attached to start vertex
+    float edge_level;               //!< subdivision factor for edge
+    PatchType patch_type;           //!< stores type of subdiv patch
+    VertexType vertex_type;         //!< stores type of the start vertex
+    char align[2];                  //!< presumably padding up to the 32 bytes that the constructor static_asserts; not initialized by the constructor
+  };
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/hermite_curve.h b/thirdparty/embree-aarch64/kernels/subdiv/hermite_curve.h
new file mode 100644
index 0000000000..9fab79cf0c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/hermite_curve.h
@@ -0,0 +1,38 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+#include "bezier_curve.h"
+
+namespace embree
+{
+  template<typename Vertex>
+    struct HermiteCurveT : BezierCurveT<Vertex>
+    { // Hermite curve that is stored through its BezierCurveT base
+      __forceinline HermiteCurveT() {}
+      /*! wraps an existing Bezier curve by copying it into the base class */
+      __forceinline HermiteCurveT(const BezierCurveT<Vertex>& curve)
+        : BezierCurveT<Vertex>(curve) {}
+      /*! builds the base curve from end points v0,v1 and tangents t0,t1; inner points are presumably v0+t0/3 and v1-t1/3 (assuming the usual madd/nmadd semantics - confirm) */
+      __forceinline HermiteCurveT(const Vertex& v0, const Vertex& t0, const Vertex& v1, const Vertex& t1)
+        : BezierCurveT<Vertex>(v0,madd(1.0f/3.0f,t0,v0),nmadd(1.0f/3.0f,t1,v1),v1) {}
+      /*! subtracts p from each control point, applies xfmVector(space,...) to the result and carries each w over unchanged */
+      __forceinline HermiteCurveT<Vec3ff> xfm_pr(const LinearSpace3fa& space, const Vec3fa& p) const
+      {
+        const Vec3ff q0(xfmVector(space,this->v0-p), this->v0.w);
+        const Vec3ff q1(xfmVector(space,this->v1-p), this->v1.w);
+        const Vec3ff q2(xfmVector(space,this->v2-p), this->v2.w);
+        const Vec3ff q3(xfmVector(space,this->v3-p), this->v3.w);
+        return BezierCurveT<Vec3ff>(q0,q1,q2,q3); // converted to HermiteCurveT<Vec3ff> through the wrapping constructor
+      }
+    };
+
+  __forceinline HermiteCurveT<Vec3ff> enlargeRadiusToMinWidth(const IntersectContext* context, const CurveGeometry* geom, const Vec3fa& ray_org, const HermiteCurveT<Vec3ff>& curve) {
+    return HermiteCurveT<Vec3ff>(enlargeRadiusToMinWidth(context,geom,ray_org,BezierCurveT<Vec3ff>(curve))); // delegates to the BezierCurveT overload and re-wraps the result
+  }
+  
+  typedef HermiteCurveT<Vec3fa> HermiteCurve3fa;
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/linear_bezier_patch.h b/thirdparty/embree-aarch64/kernels/subdiv/linear_bezier_patch.h
new file mode 100644
index 0000000000..f4a854af7f
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/linear_bezier_patch.h
@@ -0,0 +1,403 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "bezier_curve.h"
+
+namespace embree
+{
+  namespace isa
+  {   
+    template<typename V>
+      struct TensorLinearQuadraticBezierSurface
+      { // surface described by two quadratic Bezier curves L and R
+        QuadraticBezierCurve<V> L;
+        QuadraticBezierCurve<V> R;
+        
+        __forceinline TensorLinearQuadraticBezierSurface() {}
+        
+        __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface<V>& curve)
+          : L(curve.L), R(curve.R) {}
+        
+        __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) {
+          L = other.L; R = other.R; return *this;
+        }
+          
+          __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve<V>& L, const QuadraticBezierCurve<V>& R)
+            : L(L), R(R) {}
+        /*! merge of the bounds of L and R */
+        __forceinline BBox<V> bounds() const {
+          return merge(L.bounds(),R.bounds());
+        }
+      };
+    
+    template<>
+      struct TensorLinearQuadraticBezierSurface<Vec2fa>
+    { // Vec2fa specialization that keeps both curves in a single vfloat4-valued curve
+      QuadraticBezierCurve<vfloat4> LR; // presumably L in components 0-1 and R in components 2-3, matching how bounds() reads them
+      
+      __forceinline TensorLinearQuadraticBezierSurface() {}
+      
+      __forceinline TensorLinearQuadraticBezierSurface(const TensorLinearQuadraticBezierSurface<Vec2fa>& curve)
+        : LR(curve.LR) {}
+      
+      __forceinline TensorLinearQuadraticBezierSurface& operator= (const TensorLinearQuadraticBezierSurface& other) {
+        LR = other.LR; return *this;
+      }
+      
+      __forceinline TensorLinearQuadraticBezierSurface(const QuadraticBezierCurve<vfloat4>& LR)
+        : LR(LR) {}
+      /*! merges the 2D bounds taken from the leading components with those taken from components 2 and 3 */
+      __forceinline BBox<Vec2fa> bounds() const
+      {
+        const BBox<vfloat4> b = LR.bounds();
+        const BBox<Vec2fa> bl(Vec2fa(b.lower),Vec2fa(b.upper));
+        const BBox<Vec2fa> br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper)));
+        return merge(bl,br);
+      }
+    };
+    
+    template<typename V>
+      struct TensorLinearCubicBezierSurface
+      {
+        CubicBezierCurve<V> L;
+        CubicBezierCurve<V> R;
+        
+        __forceinline TensorLinearCubicBezierSurface() {}
+        
+        __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve)
+          : L(curve.L), R(curve.R) {}
+        
+        __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) {
+          L = other.L; R = other.R; return *this;
+        }
+          
+        __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<V>& L, const CubicBezierCurve<V>& R)
+          : L(L), R(R) {}
+
+        template<template<typename T> class SourceCurve>
+        __forceinline static TensorLinearCubicBezierSurface fromCenterAndNormalCurve(const SourceCurve<Vec3ff>& center, const SourceCurve<Vec3fa>& normal)
+        {
+          SourceCurve<Vec3ff> vcurve = center;
+          SourceCurve<Vec3fa> ncurve = normal;
+          
+          /* here we construct a patch which follows the curve l(t) =
+           * p(t) +/- r(t)*normalize(cross(n(t),dp(t))) */
+          
+          const Vec3ff p0   = vcurve.eval(0.0f);
+          const Vec3ff dp0  = vcurve.eval_du(0.0f);
+          const Vec3ff ddp0 = vcurve.eval_dudu(0.0f);
+
+          const Vec3fa n0   = ncurve.eval(0.0f);
+          const Vec3fa dn0  = ncurve.eval_du(0.0f);
+
+          const Vec3ff p1   = vcurve.eval(1.0f);
+          const Vec3ff dp1  = vcurve.eval_du(1.0f);
+          const Vec3ff ddp1 = vcurve.eval_dudu(1.0f);
+
+          const Vec3fa n1   = ncurve.eval(1.0f);
+          const Vec3fa dn1  = ncurve.eval_du(1.0f);
+
+          const Vec3fa bt0  = cross(n0,dp0);
+          const Vec3fa dbt0 = cross(dn0,dp0) + cross(n0,ddp0);
+
+          const Vec3fa bt1  = cross(n1,dp1);
+          const Vec3fa dbt1 = cross(dn1,dp1) + cross(n1,ddp1);
+            
+          const Vec3fa k0  = normalize(bt0);
+          const Vec3fa dk0 = dnormalize(bt0,dbt0);
+          
+          const Vec3fa k1 = normalize(bt1);
+          const Vec3fa dk1 = dnormalize(bt1,dbt1);
+                    
+          const Vec3fa l0 = p0 - p0.w*k0;
+          const Vec3fa dl0 = dp0 - (dp0.w*k0 + p0.w*dk0);
+
+          const Vec3fa r0 = p0 + p0.w*k0;
+          const Vec3fa dr0 = dp0 + (dp0.w*k0 + p0.w*dk0);
+
+          const Vec3fa l1 = p1 - p1.w*k1;
+          const Vec3fa dl1 = dp1 - (dp1.w*k1 + p1.w*dk1);
+
+          const Vec3fa r1 = p1 + p1.w*k1;
+          const Vec3fa dr1 = dp1 + (dp1.w*k1 + p1.w*dk1);
+
+          const float scale = 1.0f/3.0f;
+          CubicBezierCurve<V> L(l0,l0+scale*dl0,l1-scale*dl1,l1);
+          CubicBezierCurve<V> R(r0,r0+scale*dr0,r1-scale*dr1,r1);
+          return TensorLinearCubicBezierSurface(L,R);
+        }
+
+        __forceinline BBox<V> bounds() const {
+          return merge(L.bounds(),R.bounds());
+        }
+
+        __forceinline BBox3fa accurateBounds() const {
+          return merge(L.accurateBounds(),R.accurateBounds());
+        }
+        
+        __forceinline CubicBezierCurve<Interval1f> reduce_v() const {
+          return merge(CubicBezierCurve<Interval<V>>(L),CubicBezierCurve<Interval<V>>(R));
+        }
+        
+        __forceinline LinearBezierCurve<Interval1f> reduce_u() const {
+          return LinearBezierCurve<Interval1f>(L.bounds(),R.bounds());
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<float> xfm(const V& dx) const {
+          return TensorLinearCubicBezierSurface<float>(L.xfm(dx),R.xfm(dx));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<vfloatx> vxfm(const V& dx) const {
+          return TensorLinearCubicBezierSurface<vfloatx>(L.vxfm(dx),R.vxfm(dx));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<float> xfm(const V& dx, const V& p) const {
+          return TensorLinearCubicBezierSurface<float>(L.xfm(dx,p),R.xfm(dx,p));
+        }
+
+        __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space) const {
+          return TensorLinearCubicBezierSurface(L.xfm(space),R.xfm(space));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p) const {
+          return TensorLinearCubicBezierSurface(L.xfm(space,p),R.xfm(space,p));
+        }
+
+        __forceinline TensorLinearCubicBezierSurface<Vec3fa> xfm(const LinearSpace3fa& space, const Vec3fa& p, const float s) const {
+          return TensorLinearCubicBezierSurface(L.xfm(space,p,s),R.xfm(space,p,s));
+        }
+
+        __forceinline TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const {
+          return TensorLinearCubicBezierSurface(L.clip(u),R.clip(u));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const {
+          return TensorLinearCubicBezierSurface(clerp(L,R,V(v.lower)),clerp(L,R,V(v.upper)));
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const {
+          return clip_v(v).clip_u(u);
+        }
+        
+        __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const
+        {
+          CubicBezierCurve<V> L0,L1; L.split(L0,L1,u);
+          CubicBezierCurve<V> R0,R1; R.split(R0,R1,u);
+          new (&left ) TensorLinearCubicBezierSurface(L0,R0);
+          new (&right) TensorLinearCubicBezierSurface(L1,R1);
+        }
+        
+        __forceinline TensorLinearCubicBezierSurface<Vec2vfx> vsplit_u(vboolx& valid, const BBox1f& u) const {
+          valid = true; clear(valid,VSIZEX-1);
+          return TensorLinearCubicBezierSurface<Vec2vfx>(L.split(u),R.split(u));
+        }
+        
+        __forceinline V eval(const float u, const float v) const {
+          return clerp(L,R,V(v)).eval(u);
+        }
+        
+        __forceinline V eval_du(const float u, const float v) const {
+          return clerp(L,R,V(v)).eval_dt(u);
+        }
+        
+        __forceinline V eval_dv(const float u, const float v) const {
+          return (R-L).eval(u);
+        }
+        
+        __forceinline void eval(const float u, const float v, V& p, V& dpdu, V& dpdv) const
+        {
+          V p0, dp0du; L.eval(u,p0,dp0du);
+          V p1, dp1du; R.eval(u,p1,dp1du);
+          p = lerp(p0,p1,v);
+          dpdu = lerp(dp0du,dp1du,v);
+          dpdv = p1-p0;
+        }
+        
+        __forceinline TensorLinearQuadraticBezierSurface<V> derivative_u() const {
+          return TensorLinearQuadraticBezierSurface<V>(L.derivative(),R.derivative());
+        }
+        
+        __forceinline CubicBezierCurve<V> derivative_v() const {
+          return R-L;
+        }
+        
+        __forceinline V axis_u() const {
+          return (L.end()-L.begin())+(R.end()-R.begin());
+        }
+        
+        __forceinline V axis_v() const {
+          return (R.begin()-L.begin())+(R.end()-L.end());
+        }
+        
+        friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a) // debug printer: writes both boundary curves to the stream
+        {
+          return cout << "TensorLinearCubicBezierSurface" << embree_endl
+                      << "{" << embree_endl
+                      << "  L = " << a.L << ", " << embree_endl
+                      << "  R = " << a.R << embree_endl
+                      << "}";
+        }
+
+        friend __forceinline TensorLinearCubicBezierSurface clerp(const TensorLinearCubicBezierSurface& a, const TensorLinearCubicBezierSurface& b, const float t) { // blends two surfaces by applying the curve-level clerp to L and R separately
+          return TensorLinearCubicBezierSurface(clerp(a.L,b.L,V(t)), clerp(a.R,b.R,V(t))); // the scalar t is converted to V for the curve-level clerp
+        }
+      };
+    
+    template<>
+      struct TensorLinearCubicBezierSurface<Vec2fa>
+    {
+      CubicBezierCurve<vfloat4> LR;
+      
+      __forceinline TensorLinearCubicBezierSurface() {} // leaves LR unassigned by this constructor
+      
+      __forceinline TensorLinearCubicBezierSurface(const TensorLinearCubicBezierSurface& curve)
+        : LR(curve.LR) {}
+      
+      __forceinline TensorLinearCubicBezierSurface& operator= (const TensorLinearCubicBezierSurface& other) {
+        LR = other.LR; return *this;
+      }
+      
+      __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<vfloat4>& LR) // takes an already packed curve
+        : LR(LR) {}
+      
+      __forceinline TensorLinearCubicBezierSurface(const CubicBezierCurve<Vec2fa>& L, const CubicBezierCurve<Vec2fa>& R) // packs each control point pair as (L.x, L.y, R.x, R.y); getL()/getR() read the halves back
+        : LR(shuffle<0,1,0,1>(vfloat4(L.v0),vfloat4(R.v0)),shuffle<0,1,0,1>(vfloat4(L.v1),vfloat4(R.v1)),shuffle<0,1,0,1>(vfloat4(L.v2),vfloat4(R.v2)),shuffle<0,1,0,1>(vfloat4(L.v3),vfloat4(R.v3))) {}
+      
+      __forceinline CubicBezierCurve<Vec2fa> getL() const { // unpacks the L curve by converting each packed control point directly to Vec2fa
+        return CubicBezierCurve<Vec2fa>(Vec2fa(LR.v0),Vec2fa(LR.v1),Vec2fa(LR.v2),Vec2fa(LR.v3));
+      }
+      
+      __forceinline CubicBezierCurve<Vec2fa> getR() const { // unpacks the R curve: lanes 2,3 are moved to the front before converting to Vec2fa
+        return CubicBezierCurve<Vec2fa>(Vec2fa(shuffle<2,3,2,3>(LR.v0)),Vec2fa(shuffle<2,3,2,3>(LR.v1)),Vec2fa(shuffle<2,3,2,3>(LR.v2)),Vec2fa(shuffle<2,3,2,3>(LR.v3)));
+      }
+      
+      __forceinline BBox<Vec2fa> bounds() const // 2D bounding box covering both boundary curves
+      {
+        const BBox<vfloat4> b = LR.bounds(); // bounds of the packed curve, all four lanes at once
+        const BBox<Vec2fa> bl(Vec2fa(b.lower),Vec2fa(b.upper)); // box taken from the packed bounds as-is (L half)
+        const BBox<Vec2fa> br(Vec2fa(shuffle<2,3,2,3>(b.lower)),Vec2fa(shuffle<2,3,2,3>(b.upper))); // box taken from lanes 2,3 (R half)
+        return merge(bl,br);
+      }
+      
+      __forceinline BBox1f bounds(const Vec2fa& axis) const // 1D bounds of the surface projected onto 'axis' (dot product of the control points with axis)
+      {
+        const CubicBezierCurve<vfloat4> LRx = LR; // lanes: (L.x, L.y, R.x, R.y)
+        const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); // lanes: (L.y, L.x, R.y, R.x)
+        const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(axis)),LRx,shuffle<1>(vfloat4(axis))*LRy); // assuming cmadd(a,b,c)=a*b+c: lane 0 = dot(L,axis), lane 2 = dot(R,axis); lanes 1 and 3 hold swapped products
+        const BBox<vfloat4> Lb = LRa.bounds();
+        const BBox<vfloat4> Rb(shuffle<2>(Lb.lower),shuffle<2>(Lb.upper)); // fix: take the R projection from lane 2 (the lane xfm() reads), not lane 3 which is not a valid projection
+        const BBox<vfloat4> b = merge(Lb,Rb); // lane 0 now covers both the L and the R projection
+        return BBox1f(b.lower[0],b.upper[0]);
+      }
+
+      __forceinline TensorLinearCubicBezierSurface<float> xfm(const Vec2fa& dx) const // scalar surface obtained by combining the x/y components of every control point with dx
+      {
+        const CubicBezierCurve<vfloat4> LRx = LR;
+        const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LR.v0),shuffle<1,0,3,2>(LR.v1),shuffle<1,0,3,2>(LR.v2),shuffle<1,0,3,2>(LR.v3)); // x and y swapped within each half
+        const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy); // presumably dx.x*LRx + dx.y*LRy - confirm cmadd operand order
+        return TensorLinearCubicBezierSurface<float>(CubicBezierCurve<float>(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]), // lane 0 -> scalar L curve
+                                                     CubicBezierCurve<float>(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2])); // lane 2 -> scalar R curve
+      }
+      
+      __forceinline TensorLinearCubicBezierSurface<float> xfm(const Vec2fa& dx, const Vec2fa& p) const // scalar surface of the control points translated by -p and then combined with dx
+      {
+        const vfloat4 pxyxy = shuffle<0,1,0,1>(vfloat4(p)); // (p.x, p.y, p.x, p.y) to match the (L.x, L.y, R.x, R.y) packing
+        const CubicBezierCurve<vfloat4> LRx = LR-pxyxy; // translate both halves by -p
+        const CubicBezierCurve<vfloat4> LRy(shuffle<1,0,3,2>(LRx.v0),shuffle<1,0,3,2>(LRx.v1),shuffle<1,0,3,2>(LRx.v2),shuffle<1,0,3,2>(LRx.v3)); // fix: swap x/y of the translated points so p.y is subtracted from the y term as well
+        const CubicBezierCurve<vfloat4> LRa = cmadd(shuffle<0>(vfloat4(dx)),LRx,shuffle<1>(vfloat4(dx))*LRy); // assuming cmadd(a,b,c)=a*b+c: lane 0 = dot(L-p,dx), lane 2 = dot(R-p,dx)
+        return TensorLinearCubicBezierSurface<float>(CubicBezierCurve<float>(LRa.v0[0],LRa.v1[0],LRa.v2[0],LRa.v3[0]), // lane 0 -> scalar L curve
+                                                     CubicBezierCurve<float>(LRa.v0[2],LRa.v1[2],LRa.v2[2],LRa.v3[2])); // lane 2 -> scalar R curve
+      }
+
+      __forceinline TensorLinearCubicBezierSurface clip_u(const Interval1f& u) const { // clips in u by clipping the packed curve, i.e. both halves at once
+        return TensorLinearCubicBezierSurface(LR.clip(u));
+      }
+      
+      __forceinline TensorLinearCubicBezierSurface clip_v(const Interval1f& v) const // clips in v by blending between the two boundary curves
+      {
+        const CubicBezierCurve<vfloat4> LL(shuffle<0,1,0,1>(LR.v0),shuffle<0,1,0,1>(LR.v1),shuffle<0,1,0,1>(LR.v2),shuffle<0,1,0,1>(LR.v3)); // lanes 0,1 duplicated into both halves
+        const CubicBezierCurve<vfloat4> RR(shuffle<2,3,2,3>(LR.v0),shuffle<2,3,2,3>(LR.v1),shuffle<2,3,2,3>(LR.v2),shuffle<2,3,2,3>(LR.v3)); // lanes 2,3 duplicated into both halves
+        return TensorLinearCubicBezierSurface(clerp(LL,RR,vfloat4(v.lower,v.lower,v.upper,v.upper))); // first half blended with v.lower, second half with v.upper
+      }
+      
+      __forceinline TensorLinearCubicBezierSurface clip(const Interval1f& u, const Interval1f& v) const { // restricts the surface to the (u,v) sub-rectangle
+        const TensorLinearCubicBezierSurface inV = clip_v(v); return inV.clip_u(u); // v is clipped first, then u
+      }
+      
+      __forceinline void split_u(TensorLinearCubicBezierSurface& left, TensorLinearCubicBezierSurface& right, const float u = 0.5f) const // splits the surface in u (default: midpoint) into 'left' and 'right'
+      {
+        CubicBezierCurve<vfloat4> LR0,LR1; LR.split(LR0,LR1,u); // one split of the packed curve handles both halves
+        new (&left ) TensorLinearCubicBezierSurface(LR0); // placement-construct over 'left'
+        new (&right) TensorLinearCubicBezierSurface(LR1); // placement-construct over 'right'
+      }
+      
+      __forceinline TensorLinearCubicBezierSurface<Vec2vfx> vsplit_u(vboolx& valid, const BBox1f& u) const { // vectorized split of both unpacked boundary curves over the u range; 'valid' is an output mask
+        valid = true; clear(valid,VSIZEX-1); // all lanes set, then clear() applied at index VSIZEX-1 (presumably disables the last lane - confirm against clear())
+        return TensorLinearCubicBezierSurface<Vec2vfx>(getL().split(u),getR().split(u));
+      }
+      
+      __forceinline Vec2fa eval(const float u, const float v) const // surface point at (u,v)
+      {
+        const vfloat4 lr = LR.eval(u); // packed curve evaluated at u: both halves in one call
+        const vfloat4 onL = shuffle<0,1,0,1>(lr), onR = shuffle<2,3,2,3>(lr); return Vec2fa(lerp(onL,onR,v)); // blend the two halves by v
+      }
+      
+      __forceinline Vec2fa eval_du(const float u, const float v) const // u-derivative at (u,v)
+      {
+        const vfloat4 lr = LR.eval_dt(u); // packed curve derivative at u: both halves in one call
+        const vfloat4 onL = shuffle<0,1,0,1>(lr), onR = shuffle<2,3,2,3>(lr); return Vec2fa(lerp(onL,onR,v)); // blend the two halves by v
+      }
+      
+      __forceinline Vec2fa eval_dv(const float u, const float v) const // v-derivative at u; v is not used by the computation
+      {
+        const vfloat4 p = LR.eval(u); // packed curve evaluated at u
+        return Vec2fa(shuffle<2,3,2,3>(p)-shuffle<0,1,0,1>(p)); // lanes 2,3 minus lanes 0,1, i.e. R half minus L half
+      }
+      
+      __forceinline void eval(const float u, const float v, Vec2fa& p, Vec2fa& dpdu, Vec2fa& dpdv) const // position and both first derivatives at (u,v)
+      {
+        vfloat4 p0, dp0du; LR.eval(u,p0,dp0du); // packed position and u-derivative at u
+        p = Vec2fa(lerp(shuffle<0,1,0,1>(p0),shuffle<2,3,2,3>(p0),v)); // blend the two position halves by v
+        dpdu = Vec2fa(lerp(shuffle<0,1,0,1>(dp0du),shuffle<2,3,2,3>(dp0du),v)); // blend the two derivative halves by v
+        dpdv = Vec2fa(shuffle<2,3,2,3>(p0)-shuffle<0,1,0,1>(p0)); // lanes 2,3 minus lanes 0,1 of the position
+      }
+      
+      __forceinline TensorLinearQuadraticBezierSurface<Vec2fa> derivative_u() const { // u-derivative surface built from the derivative of the packed curve
+        return TensorLinearQuadraticBezierSurface<Vec2fa>(LR.derivative());
+      }
+      
+      __forceinline CubicBezierCurve<Vec2fa> derivative_v() const { // v-derivative as a curve: unpacked R curve minus unpacked L curve
+        return getR()-getL();
+      }
+      
+      __forceinline Vec2fa axis_u() const // sum of the begin-to-end chords of both boundary curves
+      {
+        const CubicBezierCurve<Vec2fa> left = getL(), right = getR();
+        const auto chordL = left.end()-left.begin(), chordR = right.end()-right.begin();
+        return chordL+chordR;
+      }
+      
+      __forceinline Vec2fa axis_v() const // sum of the L-to-R offsets at the curve begins and at the curve ends
+      {
+        const CubicBezierCurve<Vec2fa> left = getL(), right = getR();
+        const auto atBegin = right.begin()-left.begin(), atEnd = right.end()-left.end();
+        return atBegin+atEnd;
+      }
+      
+      friend embree_ostream operator<<(embree_ostream cout, const TensorLinearCubicBezierSurface& a) // debug printer: unpacks and writes both boundary curves
+      {
+        return cout << "TensorLinearCubicBezierSurface" << embree_endl
+                    << "{" << embree_endl
+                    << "  L = " << a.getL() << ", " << embree_endl
+                    << "  R = " << a.getR() << embree_endl
+                    << "}";
+      }
+    };
+
+    typedef TensorLinearCubicBezierSurface<float> TensorLinearCubicBezierSurface1f;
+    typedef TensorLinearCubicBezierSurface<Vec2fa> TensorLinearCubicBezierSurface2fa;
+    typedef TensorLinearCubicBezierSurface<Vec3fa> TensorLinearCubicBezierSurface3fa;
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch.h b/thirdparty/embree-aarch64/kernels/subdiv/patch.h
new file mode 100644
index 0000000000..d58241b96d
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/patch.h
@@ -0,0 +1,371 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "catmullclark_patch.h"
+#include "bilinear_patch.h"
+#include "bspline_patch.h"
+#include "bezier_patch.h"
+#include "gregory_patch.h"
+#include "tessellation_cache.h"
+
+#if 1 // debug colouring disabled: the macro expands to nothing
+#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z)
+#else // debug variant: overwrites *P with a value derived from x,y,z (c is a hash of ptr usable as an argument)
+#define PATCH_DEBUG_SUBDIVISION(ptr,x,y,z)            \
+  {                                                   \
+    size_t hex = (size_t)ptr;                          \
+    for (size_t i=0; i<4; i++) hex = hex ^ (hex >> 8);  \
+    const float c = (float)(((hex >> 0) ^ (hex >> 4) ^ (hex >> 8) ^ (hex >> 12) ^ (hex >> 16))&0xf)/15.0f; \
+    if (P) *P = Vertex(0.5f+0.5f*x,0.5f+0.5f*y,0.5f+0.5f*z,0.0f);         \
+    }               
+#endif
+
+#define PATCH_MAX_CACHE_DEPTH 2 // subdivision depth up to which patches are cached; 0 disables caching in PatchT::create
+//#define PATCH_MIN_RESOLUTION 1     // FIXME: not yet completely implemented
+#define PATCH_MAX_EVAL_DEPTH_IRREGULAR 10     // maximum evaluation depth at irregular vertices (must be greater than or equal to PATCH_MAX_CACHE_DEPTH)
+#define PATCH_MAX_EVAL_DEPTH_CREASE 10       // maximum evaluation depth at crease features (must be greater than or equal to PATCH_MAX_CACHE_DEPTH)
+#define PATCH_USE_GREGORY 1        // 0 = no gregory, 1 = fill, 2 = as early as possible
+
+#if PATCH_USE_GREGORY==2
+#define PATCH_USE_BEZIER_PATCH 1   // enable use of bezier instead of b-spline patches
+#else
+#define PATCH_USE_BEZIER_PATCH 0   // disable use of bezier patches: regular patches are b-spline patches
+#endif
+
+#if PATCH_USE_BEZIER_PATCH
+#  define RegularPatch  BezierPatch
+#  define RegularPatchT BezierPatchT<Vertex,Vertex_t>
+#else
+#  define RegularPatch  BSplinePatch
+#  define RegularPatchT BSplinePatchT<Vertex,Vertex_t>
+#endif
+
+#if PATCH_USE_GREGORY // any non-zero setting fills irregular patches with Gregory patches
+#define IrregularFillPatch GregoryPatch
+#define IrregularFillPatchT GregoryPatchT<Vertex,Vertex_t>
+#else // otherwise irregular patches are filled with bilinear patches
+#define IrregularFillPatch BilinearPatch
+#define IrregularFillPatchT BilinearPatchT<Vertex,Vertex_t>
+#endif
+
+namespace embree
+{
+  template<typename Vertex, typename Vertex_t = Vertex>
+    struct __aligned(64) PatchT
+    {
+    public:
+    
+    typedef GeneralCatmullClarkPatchT<Vertex,Vertex_t> GeneralCatmullClarkPatch;
+    typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+    typedef CatmullClark1RingT<Vertex,Vertex_t> CatmullClarkRing;
+    typedef BezierCurveT<Vertex> BezierCurve;
+    
+    enum Type { // patch kind tag; Ref stores it in the low 4 bits of a pointer, so values must stay below 16
+      INVALID_PATCH = 0,
+      BILINEAR_PATCH = 1,
+      BSPLINE_PATCH = 2,  
+      BEZIER_PATCH = 3,  
+      GREGORY_PATCH = 4,
+      SUBDIVIDED_GENERAL_PATCH = 7, // node with N children (SubdividedGeneralPatch)
+      SUBDIVIDED_QUAD_PATCH = 8, // node with 4 children (SubdividedQuadPatch)
+      EVAL_PATCH = 9, // serialized CatmullClarkPatch (EvalPatch)
+    };
+    
+    struct Ref // tagged reference: object address plus the patch Type in the low 4 bits
+    {
+      __forceinline Ref(void* p = nullptr) 
+        : ptr((size_t)p) {} // stores the raw address without adding a type tag
+
+      __forceinline operator bool() const { return ptr != 0; } // true unless the stored value is 0
+      __forceinline operator size_t() const { return ptr; } // raw value including the tag bits
+
+      __forceinline Ref (Type ty, void* in) 
+        : ptr(((size_t)in)+ty) { assert((((size_t)in) & 0xF) == 0); } // 'in' must have its low 4 bits clear so the tag can be added
+
+      __forceinline Type  type  () const { return (Type)(ptr & 0xF); } // tag: low 4 bits
+      __forceinline void* object() const { return (void*) (ptr & ~0xF); } // address: value with the low 4 bits cleared
+
+      size_t ptr;
+    };
+
+    struct EvalPatch // factory only: stores a serialized CatmullClarkPatch in allocator memory
+    {
+      /* creates EvalPatch from a CatmullClarkPatch */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch) 
+      {
+        size_t ofs = 0, bytes = patch.bytes(); // size requested from the allocator for the serialized patch
+        void* ptr = alloc(bytes);
+        patch.serialize(ptr,ofs); // writes the patch into ptr; ofs is passed by name and checked below
+        assert(ofs == bytes); // serialization must have consumed exactly the allocated size
+        return Ref(EVAL_PATCH, ptr);
+      }
+    };
+
+    struct BilinearPatch // cache node wrapping a BilinearPatchT, referenced with the BILINEAR_PATCH tag
+    {
+      /* creates BilinearPatch from a CatmullClarkPatch */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch,
+                                   const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) { // border arguments are accepted for signature compatibility but not used here
+        return Ref(BILINEAR_PATCH, new (alloc(sizeof(BilinearPatch))) BilinearPatch(patch)); // placement-new into allocator memory
+      }
+
+      __forceinline BilinearPatch (const CatmullClarkPatch& patch) 
+        : patch(patch) {}
+
+      /* creates BilinearPatch from 4 vertices */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) {
+        return Ref(BILINEAR_PATCH, new (alloc(sizeof(BilinearPatch))) BilinearPatch(edge,vertices,stride)); // placement-new into allocator memory
+      }
+      
+      __forceinline BilinearPatch (const HalfEdge* edge, const char* vertices, size_t stride) 
+        : patch(edge,vertices,stride) {}
+      
+    public:
+      BilinearPatchT<Vertex,Vertex_t> patch;
+    };
+    
+    struct BSplinePatch // cache node wrapping a BSplinePatchT, referenced with the BSPLINE_PATCH tag
+    {
+      /* creates BSplinePatch from a half edge */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) {
+        return Ref(BSPLINE_PATCH, new (alloc(sizeof(BSplinePatch))) BSplinePatch(edge,vertices,stride)); // placement-new into allocator memory
+      }
+      
+      __forceinline BSplinePatch (const HalfEdge* edge, const char* vertices, size_t stride) 
+        : patch(edge,vertices,stride) {}
+      
+      /* creates BSplinePatch from a CatmullClarkPatch */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch,
+                                   const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) {
+        return Ref(BSPLINE_PATCH, new (alloc(sizeof(BSplinePatch))) BSplinePatch(patch,border0,border1,border2,border3)); // borders are forwarded to the wrapped patch
+      }
+      
+      __forceinline BSplinePatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) 
+        : patch(patch,border0,border1,border2,border3) {}
+      
+    public:
+      BSplinePatchT<Vertex,Vertex_t> patch;
+    };
+
+    struct BezierPatch // cache node wrapping a BezierPatchT, referenced with the BEZIER_PATCH tag
+    {
+      /* creates BezierPatch from a half edge */
+      template<typename Allocator>
+        __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) {
+        return Ref(BEZIER_PATCH, new (alloc(sizeof(BezierPatch))) BezierPatch(edge,vertices,stride)); // placement-new into allocator memory
+      }
+      
+      __forceinline BezierPatch (const HalfEdge* edge, const char* vertices, size_t stride) 
+        : patch(edge,vertices,stride) {}
+      
+      /* creates BezierPatch from a CatmullClarkPatch */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch,
+                                   const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) {
+        return Ref(BEZIER_PATCH, new (alloc(sizeof(BezierPatch))) BezierPatch(patch,border0,border1,border2,border3)); // borders are forwarded to the wrapped patch
+      }
+      
+      __forceinline BezierPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) 
+        : patch(patch,border0,border1,border2,border3) {}
+      
+    public:
+      BezierPatchT<Vertex,Vertex_t> patch;
+    };
+    
+    struct GregoryPatch // cache node wrapping a GregoryPatchT, referenced with the GREGORY_PATCH tag
+    {
+      /* creates GregoryPatch from half edge */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) {
+        return Ref(GREGORY_PATCH, new (alloc(sizeof(GregoryPatch))) GregoryPatch(edge,vertices,stride)); // placement-new into allocator memory
+      }
+      
+      __forceinline GregoryPatch (const HalfEdge* edge, const char* vertices, size_t stride) 
+        : patch(CatmullClarkPatch(edge,vertices,stride)) {} // builds a temporary CatmullClarkPatch first and initializes the wrapped patch from it
+       
+      /* creates GregoryPatch from CatmullClarkPatch */
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const CatmullClarkPatch& patch,
+                                   const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) {
+        return Ref(GREGORY_PATCH, new (alloc(sizeof(GregoryPatch))) GregoryPatch(patch,border0,border1,border2,border3)); // borders are forwarded to the wrapped patch
+      }
+      
+      __forceinline GregoryPatch (const CatmullClarkPatch& patch, const BezierCurve* border0, const BezierCurve* border1, const BezierCurve* border2, const BezierCurve* border3) 
+        : patch(patch,border0,border1,border2,border3) {}
+      
+    public:
+      GregoryPatchT<Vertex,Vertex_t> patch;
+    };
+    
+    struct SubdividedQuadPatch // interior cache node holding references to exactly four children
+    {
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, Ref children[4]) { // allocates the node and tags the reference as SUBDIVIDED_QUAD_PATCH
+        void* const storage = alloc(sizeof(SubdividedQuadPatch)); return Ref(SUBDIVIDED_QUAD_PATCH, new (storage) SubdividedQuadPatch(children));
+      }
+      
+      __forceinline SubdividedQuadPatch(Ref children[4]) { // copies the four child references in index order
+        child[0] = children[0]; child[1] = children[1]; child[2] = children[2]; child[3] = children[3];
+      }
+      
+    public:
+      Ref child[4];
+    };
+    
+    struct SubdividedGeneralPatch // interior cache node holding references to N children
+    {
+      template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, Ref* children, const unsigned N) {
+        return Ref(SUBDIVIDED_GENERAL_PATCH, new (alloc(sizeof(SubdividedGeneralPatch))) SubdividedGeneralPatch(children,N)); // placement-new into allocator memory
+      }
+      
+      __forceinline SubdividedGeneralPatch(Ref* children, const unsigned N) : N(N) { // no bounds check here: the caller must ensure N does not exceed MAX_PATCH_VALENCE
+        for (unsigned i=0; i<N; i++) child[i] = children[i];
+      }
+      
+      unsigned N; // number of entries of child[] that were copied in
+      Ref child[MAX_PATCH_VALENCE];
+    };
+    
+    /*! Default constructor. */
+    __forceinline PatchT () {}
+    
+    template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, const HalfEdge* edge, const char* vertices, size_t stride) // entry point: builds the cached patch tree for the face of 'edge'
+    {
+      if (PATCH_MAX_CACHE_DEPTH == 0) // caching disabled at compile time: return an empty reference
+        return nullptr;
+
+      Ref child(0);
+      switch (edge->patch_type) { // patch types with a direct representation are created without subdivision
+      case HalfEdge::BILINEAR_PATCH:       child = BilinearPatch::create(alloc,edge,vertices,stride); break; 
+      case HalfEdge::REGULAR_QUAD_PATCH:   child = RegularPatch::create(alloc,edge,vertices,stride); break;
+#if PATCH_USE_GREGORY == 2
+      case HalfEdge::IRREGULAR_QUAD_PATCH: child = GregoryPatch::create(alloc,edge,vertices,stride); break;
+#endif
+      default: { // every other patch type goes through the general Catmull-Clark path, starting at depth 0
+        GeneralCatmullClarkPatch patch(edge,vertices,stride);
+        child = PatchT::create(alloc,patch,edge,vertices,stride,0);
+      }
+      }
+      return child;
+    }
+
+    template<typename Allocator>
+    __noinline static Ref create(const Allocator& alloc, GeneralCatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth) // builds the cache subtree for a general (possibly non-quad) patch
+    {  
+      /* convert into standard quad patch if possible */
+      if (likely(patch.isQuadPatch())) 
+      {
+        CatmullClarkPatch qpatch; patch.init(qpatch);
+        return PatchT::create(alloc,qpatch,edge,vertices,stride,depth); // continue with the quad overload at the same depth
+      }
+   
+      /* do only cache up to some depth */
+      if (depth >= PATCH_MAX_CACHE_DEPTH)
+        return nullptr; // empty reference: nothing is cached for this subtree
+         
+      /* subdivide patch */
+      unsigned N; // set by subdivide(): number of child patches produced
+      array_t<CatmullClarkPatch,GeneralCatmullClarkPatch::SIZE> patches; 
+      patch.subdivide(patches,N);
+      
+      if (N == 4) // four children are stored in a SubdividedQuadPatch node
+      {
+        Ref child[4];
+#if PATCH_USE_GREGORY == 2
+        BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; patch.getLimitBorder(borders);
+        BezierCurve border0l,border0r; borders[0].subdivide(border0l,border0r); // each border curve is split into two halves
+        BezierCurve border1l,border1r; borders[1].subdivide(border1l,border1r);
+        BezierCurve border2l,border2r; borders[2].subdivide(border2l,border2r);
+        BezierCurve border3l,border3r; borders[3].subdivide(border3l,border3r);
+        GeneralCatmullClarkPatch::fix_quad_ring_order(patches);
+        child[0] = PatchT::create(alloc,patches[0],edge,vertices,stride,depth+1,&border0l,nullptr,nullptr,&border3r); // each child gets the two border halves passed here; the other two slots are nullptr
+        child[1] = PatchT::create(alloc,patches[1],edge,vertices,stride,depth+1,&border0r,&border1l,nullptr,nullptr);
+        child[2] = PatchT::create(alloc,patches[2],edge,vertices,stride,depth+1,nullptr,&border1r,&border2l,nullptr);
+        child[3] = PatchT::create(alloc,patches[3],edge,vertices,stride,depth+1,nullptr,nullptr,&border2r,&border3l);
+#else
+        GeneralCatmullClarkPatch::fix_quad_ring_order(patches);
+        for (size_t i=0; i<4; i++)
+          child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1); // children are created without border curves
+#endif
+        return SubdividedQuadPatch::create(alloc,child);
+      }
+      else // any other child count is stored in a SubdividedGeneralPatch node
+      {
+        assert(N<MAX_PATCH_VALENCE);
+        Ref child[MAX_PATCH_VALENCE];
+        
+#if PATCH_USE_GREGORY == 2
+        BezierCurve borders[GeneralCatmullClarkPatch::SIZE]; 
+        patch.getLimitBorder(borders);
+
+        for (size_t i0=0; i0<N; i0++) {
+          const size_t i2 = i0==0 ? N-1 : i0-1; // index of the previous border, wrapping around at 0
+          BezierCurve border0l,border0r; borders[i0].subdivide(border0l,border0r);
+          BezierCurve border2l,border2r; borders[i2].subdivide(border2l,border2r);
+          child[i0] = PatchT::create(alloc,patches[i0],edge,vertices,stride,depth+1, &border0l, nullptr, nullptr, &border2r);
+        }
+#else
+        for (size_t i=0; i<N; i++)
+          child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1); // children are created without border curves
+#endif
+        return SubdividedGeneralPatch::create(alloc,child,N);
+      }
+      
+      return nullptr; // not reached: both branches above return
+    }
+
+    static __forceinline bool final(const CatmullClarkPatch& patch, const typename CatmullClarkRing::Type type, size_t depth) // true once 'depth' reached the maximum evaluation depth for this patch type; 'patch' is only used by the disabled code below
+    {
+      const size_t max_eval_depth = (type & CatmullClarkRing::TYPE_CREASES) ? PATCH_MAX_EVAL_DEPTH_CREASE : PATCH_MAX_EVAL_DEPTH_IRREGULAR; // the crease limit applies when the TYPE_CREASES bit is set, the irregular limit otherwise
+//#if PATCH_MIN_RESOLUTION
+//      return patch.isFinalResolution(PATCH_MIN_RESOLUTION) || depth>=max_eval_depth;
+//#else
+      return depth>=max_eval_depth;
+//#endif
+    }
+
+    template<typename Allocator>
+      __noinline static Ref create(const Allocator& alloc, CatmullClarkPatch& patch, const HalfEdge* edge, const char* vertices, size_t stride, size_t depth, // builds the cache subtree for a quad Catmull-Clark patch
+                                   const BezierCurve* border0 = nullptr, const BezierCurve* border1 = nullptr, const BezierCurve* border2 = nullptr, const BezierCurve* border3 = nullptr)
+    {
+      const typename CatmullClarkPatch::Type ty = patch.type();
+      if (unlikely(final(patch,ty,depth))) { // maximum evaluation depth reached: emit a leaf patch
+        if (ty & CatmullClarkRing::TYPE_REGULAR) return RegularPatch::create(alloc,patch,border0,border1,border2,border3); 
+        else                                     return IrregularFillPatch::create(alloc,patch,border0,border1,border2,border3); 
+      }
+      else if (ty & CatmullClarkRing::TYPE_REGULAR_CREASES) { // emitted directly as a regular patch
+        assert(depth > 0); return RegularPatch::create(alloc,patch,border0,border1,border2,border3); 
+      }
+#if PATCH_USE_GREGORY == 2
+      else if (ty & CatmullClarkRing::TYPE_GREGORY_CREASES) { // emitted directly as a Gregory patch
+        assert(depth > 0); return GregoryPatch::create(alloc,patch,border0,border1,border2,border3); 
+      }
+#endif
+      else if (depth >= PATCH_MAX_CACHE_DEPTH) { // cache depth limit reached: store the serialized patch instead of subdividing further
+        return EvalPatch::create(alloc,patch); 
+      }
+      
+      else // otherwise subdivide into four children and recurse
+      {
+        Ref child[4];
+        array_t<CatmullClarkPatch,4> patches; 
+        patch.subdivide(patches);
+        
+        for (size_t i=0; i<4; i++)
+          child[i] = PatchT::create(alloc,patches[i],edge,vertices,stride,depth+1); // border curves are not passed on to the children
+        return SubdividedQuadPatch::create(alloc,child);
+      }
+    }
+  };
+
+  typedef PatchT<Vec3fa,Vec3fa_t> Patch3fa;
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval.h b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval.h
new file mode 100644
index 0000000000..482d015fa3
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval.h
@@ -0,0 +1,129 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+#include "feature_adaptive_eval.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename Vertex, typename Vertex_t = Vertex>
+      struct PatchEval
+      {
+      public:
+        
+        typedef PatchT<Vertex,Vertex_t> Patch;
+        typedef typename Patch::Ref Ref;
+        typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+        
+        PatchEval (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, // evaluates the patch at (u,v) into the given output pointers, using the cached patch tree when possible
+                   const HalfEdge* edge, const char* vertices, size_t stride, const float u, const float v, 
+                   Vertex* P, Vertex* dPdu, Vertex* dPdv, Vertex* ddPdudu, Vertex* ddPdvdv, Vertex* ddPdudv)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv)
+        {
+          /* conservative time for the very first allocation */
+          auto time = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter);
+
+          Ref patch = SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () { // the lambda builds the patch tree; presumably only invoked when the entry is not cached - confirm against lookup()
+              auto alloc = [&](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); };
+              return Patch::create(alloc,edge,vertices,stride);
+            },true);
+
+          auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter);
+          const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime); // compares the time taken before the lookup with the time taken after it
+
+          if (patch && allAllocationsValid &&  eval(patch,u,v,1.0f,0)) { // cached evaluation succeeded: unlock and finish
+            SharedLazyTessellationCache::unlock();
+            return;
+          }
+          SharedLazyTessellationCache::unlock(); // unlock on the fallback path as well
+          FeatureAdaptiveEval<Vertex,Vertex_t>(edge,vertices,stride,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv); // fallback: evaluate directly from the half-edge data
+          PATCH_DEBUG_SUBDIVISION(edge,c,-1,-1);
+        }
+        
+        __forceinline bool eval_quad(const typename Patch::SubdividedQuadPatch* This, const float u, const float v, const float dscale, const size_t depth)
+        {
+          // pick the child quadrant containing (u,v), remap (u,v) into that child's unit square and double the derivative scale
+          const bool lowerRow = v < 0.5f;
+          const bool leftCol  = lowerRow ? (u < 0.5f) : !(u > 0.5f); // the upper row tests u > 0.5f, so u == 0.5f goes to child 3 there
+          const float cu = leftCol  ? 2.0f*u : 2.0f*u-1.0f;
+          const float cv = lowerRow ? 2.0f*v : 2.0f*v-1.0f;
+          const unsigned idx = lowerRow ? (leftCol ? 0 : 1) : (leftCol ? 3 : 2);
+          return eval(This->child[idx],cu,cv,2.0f*dscale,depth+1);
+        }
+        
+        bool eval_general(const typename Patch::SubdividedGeneralPatch* This, const float U, const float V, const size_t depth) // decodes the child index and the child-local (u,v) from (U,V), then evaluates that child
+        {
+          const unsigned l = (unsigned) floor(0.5f*U); const float u = 2.0f*frac(0.5f*U)-0.5f; // integer part of U/2 selects the low index part; the fractional part gives the local u
+          const unsigned h = (unsigned) floor(0.5f*V); const float v = 2.0f*frac(0.5f*V)-0.5f; // same decoding for V: h is the high index part
+          const unsigned i = 4*h+l; assert(i<This->N); // child index; must be one of the N stored children
+          return eval(This->child[i],u,v,1.0f,depth+1); // derivative scale is passed as 1.0f for the child
+        }
+        
+        bool eval(Ref This, const float& u, const float& v, const float dscale, const size_t depth) // dispatches on the reference tag; returns false when nothing could be evaluated
+        {
+          if (!This) return false; // empty reference: the caller has to fall back
+          //PRINT(depth);
+          //PRINT2(u,v);
+          
+          switch (This.type()) 
+          {
+          case Patch::BILINEAR_PATCH: { // leaf: evaluate the wrapped patch directly
+            //PRINT("bilinear");
+            ((typename Patch::BilinearPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+            PATCH_DEBUG_SUBDIVISION(This,-1,c,c);
+            return true;
+          }
+          case Patch::BSPLINE_PATCH: { // leaf: evaluate the wrapped patch directly
+            //PRINT("bspline");
+            ((typename Patch::BSplinePatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale);
+            PATCH_DEBUG_SUBDIVISION(This,-1,c,-1);
+            return true;
+          }
+          case Patch::BEZIER_PATCH: { // leaf: evaluate the wrapped patch directly
+            //PRINT("bezier");
+            ((typename Patch::BezierPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale);
+            PATCH_DEBUG_SUBDIVISION(This,-1,c,-1);
+            return true;
+          }
+          case Patch::GREGORY_PATCH: { // leaf: evaluate the wrapped patch directly
+            //PRINT("gregory");
+            ((typename Patch::GregoryPatch*)This.object())->patch.eval(u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale); 
+            PATCH_DEBUG_SUBDIVISION(This,-1,-1,c);
+            return true;
+          }
+          case Patch::SUBDIVIDED_QUAD_PATCH: { // interior node: descend into one of the four children
+            //PRINT("subdivided quad");
+            return eval_quad(((typename Patch::SubdividedQuadPatch*)This.object()),u,v,dscale,depth);
+          }
+          case Patch::SUBDIVIDED_GENERAL_PATCH: { // interior node with N children; dscale is not forwarded
+            //PRINT("general_patch");
+            assert(dscale == 1.0f); 
+            return eval_general(((typename Patch::SubdividedGeneralPatch*)This.object()),u,v,depth); 
+          }
+          case Patch::EVAL_PATCH: { // serialized patch: deserialize and evaluate feature-adaptively from the current depth
+            //PRINT("eval_patch");
+            CatmullClarkPatch patch; patch.deserialize(This.object());
+            FeatureAdaptiveEval<Vertex,Vertex_t>(patch,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv);
+            return true;
+          }
+          default: // unknown tag: asserts in debug builds, otherwise reports failure
+            assert(false); 
+            return false;
+          }
+        }
+        
+      private:
+        Vertex* const P;
+        Vertex* const dPdu;
+        Vertex* const dPdv;
+        Vertex* const ddPdudu;
+        Vertex* const ddPdvdv;
+        Vertex* const ddPdudv;
+      };
+  }
+}
+  
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_grid.h b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_grid.h
new file mode 100644
index 0000000000..c05db55f4c
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_grid.h
@@ -0,0 +1,245 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+#include "feature_adaptive_eval_grid.h"
+
+namespace embree
+{
+  namespace isa 
+  {
+    struct PatchEvalGrid
+    {
+      typedef Patch3fa Patch;
+      typedef Patch::Ref Ref;
+      typedef GeneralCatmullClarkPatch3fa GeneralCatmullClarkPatch;
+      typedef CatmullClarkPatch3fa CatmullClarkPatch;
+      typedef BSplinePatch3fa BSplinePatch;
+      typedef BezierPatch3fa BezierPatch;
+      typedef GregoryPatch3fa GregoryPatch;
+      typedef BilinearPatch3fa BilinearPatch;
+
+    private:
+      const unsigned x0,x1;
+      const unsigned y0,y1;
+      const unsigned swidth,sheight;
+      const float rcp_swidth, rcp_sheight;
+      float* const Px;
+      float* const Py;
+      float* const Pz;
+      float* const U;
+      float* const V;
+      float* const Nx;
+      float* const Ny;
+      float* const Nz;
+      const unsigned dwidth,dheight;
+      unsigned count;
+
+    public:      
+
+      PatchEvalGrid (Ref patch, unsigned subPatch, /* evaluates samples [x0,x1] x [y0,y1] of a swidth x sheight grid over the patch */
+                     const unsigned x0, const unsigned x1, const unsigned y0, const unsigned y1, const unsigned swidth, const unsigned sheight, 
+                     float* Px, float* Py, float* Pz, float* U, float* V, 
+                     float* Nx, float* Ny, float* Nz, /* normals are only written when Nx is non-null */
+                     const unsigned dwidth, const unsigned dheight) /* dwidth is the row pitch used to index the output arrays */
+      : x0(x0), x1(x1), y0(y0), y1(y1), swidth(swidth), sheight(sheight), rcp_swidth(1.0f/(swidth-1.0f)), rcp_sheight(1.0f/(sheight-1.0f)), /* NOTE(review): rcp_* are infinite when swidth/sheight == 1 -- confirm callers always pass >= 2 */
+        Px(Px), Py(Py), Pz(Pz), U(U), V(V), Nx(Nx), Ny(Ny), Nz(Nz), dwidth(dwidth), dheight(dheight), count(0)
+      {
+        assert(swidth < (2<<20) && sheight < (2<<20));
+        const BBox2f srange(Vec2f(0.0f,0.0f),Vec2f(float(swidth-1),float(sheight-1))); /* the whole grid, in sample coordinates */
+        const BBox2f erange(Vec2f(float(x0),float(y0)),Vec2f((float)x1,(float)y1)); /* the rectangle of samples requested by the caller */
+        bool done MAYBE_UNUSED = eval(patch,subPatch,srange,erange); /* all evaluation happens here; the result is only checked by the assert below */
+        assert(done);
+        assert(count == (x1-x0+1)*(y1-y0+1)); /* the number of evaluated samples must match the requested rectangle */
+      }
+
+      template<typename Patch> /* NOTE: this template parameter shadows the struct-level Patch typedef inside the function */
+      __forceinline void evalLocalGrid(const Patch* patch, const BBox2f& srange, const int lx0, const int lx1, const int ly0, const int ly1) /* evaluates samples [lx0,lx1) x [ly0,ly1) on one leaf patch covering srange */
+      {
+        const float scale_x = rcp(srange.upper.x-srange.lower.x); /* maps a sample index inside srange to the patch-local [0,1] range */
+        const float scale_y = rcp(srange.upper.y-srange.lower.y);
+        count += (lx1-lx0)*(ly1-ly0); /* bookkeeping for the sample-count assert in the constructor */
+        /* the scalar loop below is disabled; unlike the SIMD path it never writes normals */
+#if 0
+        for (unsigned iy=ly0; iy<ly1; iy++) {
+          for (unsigned ix=lx0; ix<lx1; ix++) {
+            const float lu = select(ix == swidth -1, float(1.0f), (float(ix)-srange.lower.x)*scale_x);
+            const float lv = select(iy == sheight-1, float(1.0f), (float(iy)-srange.lower.y)*scale_y);
+            const Vec3fa p = patch->patch.eval(lu,lv);
+            const float u = float(ix)*rcp_swidth;
+            const float v = float(iy)*rcp_sheight;
+            const int ofs = (iy-y0)*dwidth+(ix-x0);
+            Px[ofs] = p.x;
+            Py[ofs] = p.y;
+            Pz[ofs] = p.z;
+            U[ofs] = u;
+            V[ofs] = v;
+          }
+        }
+#else
+        foreach2(lx0,lx1,ly0,ly1,[&](const vboolx& valid, const vintx& ix, const vintx& iy) {
+            const vfloatx lu = select(ix == swidth -1, vfloatx(1.0f), (vfloatx(ix)-srange.lower.x)*scale_x); /* patch-local u; forced to exactly 1 on the last grid column */
+            const vfloatx lv = select(iy == sheight-1, vfloatx(1.0f), (vfloatx(iy)-srange.lower.y)*scale_y); /* patch-local v; forced to exactly 1 on the last grid row */
+            const Vec3vfx p = patch->patch.eval(lu,lv);
+            Vec3vfx n = zero;
+            if (unlikely(Nx != nullptr)) n = normalize_safe(patch->patch.normal(lu,lv)); /* normals are computed only on request */
+            const vfloatx u = vfloatx(ix)*rcp_swidth; /* u,v written out are relative to the whole grid, not to this leaf */
+            const vfloatx v = vfloatx(iy)*rcp_sheight;
+            const vintx ofs = (iy-y0)*dwidth+(ix-x0); /* row-major offset relative to the requested rectangle's origin */
+            if (likely(all(valid)) && all(iy==iy[0])) { /* fast path: full vector within a single row -> one unmasked store per array */
+              const unsigned ofs2 = ofs[0];
+              vfloatx::storeu(Px+ofs2,p.x);
+              vfloatx::storeu(Py+ofs2,p.y);
+              vfloatx::storeu(Pz+ofs2,p.z);
+              vfloatx::storeu(U+ofs2,u);
+              vfloatx::storeu(V+ofs2,v);
+              if (unlikely(Nx != nullptr)) {
+                vfloatx::storeu(Nx+ofs2,n.x);
+                vfloatx::storeu(Ny+ofs2,n.y);
+                vfloatx::storeu(Nz+ofs2,n.z);
+              }
+            } else { /* slow path: masked stores, one pass per callback invocation (presumably one per distinct row -- confirm against foreach_unique_index) */
+              foreach_unique_index(valid,iy,[&](const vboolx& valid, const int iy0, const int j) {
+                  const unsigned ofs2 = ofs[j]-j; /* base address chosen so that lane j of the masked store lands on ofs[j] */
+                  vfloatx::storeu(valid,Px+ofs2,p.x);
+                  vfloatx::storeu(valid,Py+ofs2,p.y);
+                  vfloatx::storeu(valid,Pz+ofs2,p.z);
+                  vfloatx::storeu(valid,U+ofs2,u);
+                  vfloatx::storeu(valid,V+ofs2,v);
+                  if (unlikely(Nx != nullptr)) {
+                    vfloatx::storeu(valid,Nx+ofs2,n.x);
+                    vfloatx::storeu(valid,Ny+ofs2,n.y);
+                    vfloatx::storeu(valid,Nz+ofs2,n.z);
+                  }
+                });
+            }
+          });
+#endif
+      }
+
+      bool eval(Ref This, const BBox2f& srange, const BBox2f& erange, const unsigned depth) /* recursive worker: srange is the grid rectangle covered by This, erange the part of it still to evaluate */
+      {
+        if (erange.empty())
+          return true;
+        /* turn erange into the half-open sample range [lx0,lx1) x [ly0,ly1); the upper end gains one sample only at the last requested column/row (x1/y1) */
+        const int lx0 = (int) ceilf(erange.lower.x);
+        const int lx1 = (int) ceilf(erange.upper.x) + (erange.upper.x == x1 && (srange.lower.x < erange.upper.x || erange.upper.x == 0));
+        const int ly0 = (int) ceilf(erange.lower.y);
+        const int ly1 = (int) ceilf(erange.upper.y) + (erange.upper.y == y1 && (srange.lower.y < erange.upper.y || erange.upper.y == 0));
+        if (lx0 >= lx1 || ly0 >= ly1) 
+          return true;
+        /* samples are needed from this subtree but no patch is referenced: report failure */
+        if (!This) 
+          return false;
+        /* dispatch on the representation referenced by This; SUBDIVIDED_GENERAL_PATCH is not handled here and ends up in the default case */
+        switch (This.type()) 
+        {
+        case Patch::BILINEAR_PATCH: {
+          evalLocalGrid((Patch::BilinearPatch*)This.object(),srange,lx0,lx1,ly0,ly1);
+          return true;
+        }
+        case Patch::BSPLINE_PATCH: {
+          evalLocalGrid((Patch::BSplinePatch*)This.object(),srange,lx0,lx1,ly0,ly1);
+          return true;
+        }
+        case Patch::BEZIER_PATCH: {
+          evalLocalGrid((Patch::BezierPatch*)This.object(),srange,lx0,lx1,ly0,ly1);
+          return true;
+        }
+        case Patch::GREGORY_PATCH: {
+          evalLocalGrid((Patch::GregoryPatch*)This.object(),srange,lx0,lx1,ly0,ly1);
+          return true;
+        }
+        case Patch::SUBDIVIDED_QUAD_PATCH: 
+        {
+          const Vec2f c = srange.center(); /* split srange at its center into four quadrants */
+          const BBox2f srange0(srange.lower,c); /* lower-left */
+          const BBox2f srange1(Vec2f(c.x,srange.lower.y),Vec2f(srange.upper.x,c.y)); /* lower-right */
+          const BBox2f srange2(c,srange.upper); /* upper-right */
+          const BBox2f srange3(Vec2f(srange.lower.x,c.y),Vec2f(c.x,srange.upper.y)); /* upper-left */
+          /* each child only sees the part of erange inside its quadrant; child return values are ignored and this case always reports success */
+          Patch::SubdividedQuadPatch* patch = (Patch::SubdividedQuadPatch*)This.object();
+          eval(patch->child[0],srange0,intersect(srange0,erange),depth+1);
+          eval(patch->child[1],srange1,intersect(srange1,erange),depth+1);
+          eval(patch->child[2],srange2,intersect(srange2,erange),depth+1);
+          eval(patch->child[3],srange3,intersect(srange3,erange),depth+1);
+          return true;
+        }
+        case Patch::EVAL_PATCH: { 
+          CatmullClarkPatch patch; patch.deserialize(This.object()); /* rebuild the Catmull-Clark patch from its serialized form */
+          FeatureAdaptiveEvalGrid(patch,srange,erange,depth,x0,x1,y0,y1,swidth,sheight,Px,Py,Pz,U,V,Nx,Ny,Nz,dwidth,dheight);
+          count += (lx1-lx0)*(ly1-ly0); /* the helper receives the output arrays directly, so the samples are accounted for here */
+          return true;
+        }
+        default: 
+          assert(false); 
+          return false;
+        }
+      }
+
+      /*! Entry point: selects the requested sub-patch of a general patch (if any) and starts the recursive grid evaluation. */
+      bool eval(Ref This, unsigned subPatch, const BBox2f& srange, const BBox2f& erange) 
+      {
+        /* without a patch reference nothing can be evaluated */
+        if (!This) return false;
+
+        /* every representation except the general one is its own root and only has sub-patch 0 */
+        if (This.type() != Patch::SUBDIVIDED_GENERAL_PATCH) {
+          assert(subPatch == 0);
+          return eval(This,srange,erange,0);
+        }
+
+        /* general patch: continue in the requested child, which sits one level deeper */
+        Patch::SubdividedGeneralPatch* general = (Patch::SubdividedGeneralPatch*)This.object();
+        assert(subPatch < general->N);
+        return eval(general->child[subPatch],srange,erange,1);
+      }
+    };
+
+    /*! Number of sub-patches for the face of half edge h: 1 for a 4-edge face, otherwise one per edge. */
+    __forceinline unsigned patch_eval_subdivision_count (const HalfEdge* h)
+    {
+      const unsigned numEdges = h->numEdges();
+      return (numEdges == 4) ? 1 : numEdges;
+    }
+    
+    /*! Calls the tessellator once per sub-patch of the face that starts at half edge h. */
+    template<typename Tessellator>
+      inline void patch_eval_subdivision (const HalfEdge* h, Tessellator tessellator)
+    {
+      const unsigned N = h->numEdges();
+      int neighborSubdiv[GeneralCatmullClarkPatch3fa::SIZE]; // FIXME: use array_t
+      float levels[GeneralCatmullClarkPatch3fa::SIZE];
+      /* walk the face once, recording per edge its level and whether the opposite face has other than 4 edges */
+      const HalfEdge* edge = h;
+      for (unsigned e=0; e<N; e++, edge = edge->next()) {
+        assert(e<GeneralCatmullClarkPatch3fa::SIZE);
+        neighborSubdiv[e] = edge->hasOpposite() ? edge->opposite()->numEdges() != 4 : 0;
+        levels[e] = edge->edge_level;
+      }
+      /* a 4-edge face is tessellated as a single patch over the unit square */
+      if (N == 4) {
+        const Vec2f uv[4] = { Vec2f(0.0f,0.0f), Vec2f(1.0f,0.0f), Vec2f(1.0f,1.0f), Vec2f(0.0f,1.0f) };
+        tessellator(uv,neighborSubdiv,levels,0);
+        return;
+      }
+
+      /* any other face yields one sub-patch per edge; sub-patch i is placed in cell (i&3, (i>>2)&3) of a 4x4 layout */
+      static_assert(MAX_PATCH_VALENCE <= 16, "MAX_PATCH_VALENCE > 16");
+      for (unsigned i=0; i<N; i++) {
+        assert(i<MAX_PATCH_VALENCE);
+        const int row = (i >> 2) & 3, col = i & 3;
+        const Vec2f origin = 2.0f*Vec2f((float)col,(float)row);
+        const Vec2f uv[4] = { origin + (0.5f+Vec2f(0.0f,0.0f)), origin + (0.5f+Vec2f(1.0f,0.0f)),
+                              origin + (0.5f+Vec2f(1.0f,1.0f)), origin + (0.5f+Vec2f(0.0f,1.0f)) };
+        const int neighborSubdiv1[4] = { 0,0,0,0 };
+        /* sub-patch edges take half the level of the face edge they lie on: edge i twice, then edge i-1 twice */
+        const float cur = 0.5f*levels[(i+0)%N], prev = 0.5f*levels[(i+N-1)%N];
+        const float levels1[4] = { cur, cur, prev, prev };
+        tessellator(uv,neighborSubdiv1,levels1,i);
+      }
+    }
+  }
+}
+
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_simd.h b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_simd.h
new file mode 100644
index 0000000000..28016d9e20
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/patch_eval_simd.h
@@ -0,0 +1,127 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "patch.h"
+#include "feature_adaptive_eval_simd.h"
+
+namespace embree
+{
+  namespace isa
+  {
+    template<typename vbool, typename vint, typename vfloat, typename Vertex, typename Vertex_t = Vertex>
+      struct PatchEvalSimd
+      {
+      public:
+        
+        typedef PatchT<Vertex,Vertex_t> Patch;
+        typedef typename Patch::Ref Ref;
+        typedef CatmullClarkPatchT<Vertex,Vertex_t> CatmullClarkPatch;
+
+        PatchEvalSimd (SharedLazyTessellationCache::CacheEntry& entry, size_t commitCounter, /* evaluates position and derivatives at (u,v) for all lanes active in valid0 */
+                       const HalfEdge* edge, const char* vertices, size_t stride, const vbool& valid0, const vfloat& u, const vfloat& v, 
+                       float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, const size_t dstride, const size_t N)
+        : P(P), dPdu(dPdu), dPdv(dPdv), ddPdudu(ddPdudu), ddPdvdv(ddPdvdv), ddPdudv(ddPdudv), dstride(dstride), N(N)
+        {
+          /* conservative time for the very first allocation */
+          auto time = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter);
+          /* fetch the patch for this cache entry; the callback builds it with cache-owned memory (presumably only invoked on a miss -- lookup() is not visible here) */
+          Ref patch = SharedLazyTessellationCache::lookup(entry,commitCounter,[&] () {
+              auto alloc = [](size_t bytes) { return SharedLazyTessellationCache::malloc(bytes); };
+              return Patch::create(alloc,edge,vertices,stride);
+            }, true);
+          /* sample the cache time a second time so the interval spanning the lookup can be validated */
+          auto curTime = SharedLazyTessellationCache::sharedLazyTessellationCache.getTime(commitCounter);
+          const bool allAllocationsValid = SharedLazyTessellationCache::validTime(time,curTime);
+          /* drop the cached patch when validTime() rejects the interval; all lanes then take the fallback below */
+          patch = allAllocationsValid ? patch : nullptr;
+
+          /* use cached data structure for calculations */
+          const vbool valid1 = patch ? eval(valid0,patch,u,v,1.0f,0) : vbool(false); /* lanes that were evaluated from the cache */
+          SharedLazyTessellationCache::unlock();
+          const vbool valid2 = valid0 & !valid1; /* lanes the cached path did not handle */
+          if (any(valid2)) { /* evaluate the remaining lanes directly from the half-edge structure */
+            FeatureAdaptiveEvalSimd<vbool,vint,vfloat,Vertex,Vertex_t>(edge,vertices,stride,valid2,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N);
+          }
+        }
+        
+        vbool eval_quad(const vbool& valid, const typename Patch::SubdividedQuadPatch* This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth)
+        {
+          /* sort the active lanes into the four quadrants around (0.5,0.5) and recurse into each non-empty one with remapped coordinates */
+          const vbool uLow = u < 0.5f, uHigh = u >= 0.5f;
+          const vbool vLow = v < 0.5f, vHigh = v >= 0.5f;
+          const vfloat uLo = 2.0f*u, uHi = 2.0f*u-1.0f;
+          const vfloat vLo = 2.0f*v, vHi = 2.0f*v-1.0f;
+          const float childScale = 2.0f*dscale;
+          vbool done = false;
+          { const vbool m = valid & uLow  & vLow;  if (any(m)) done |= eval(m,This->child[0],uLo,vLo,childScale,depth+1); }
+          { const vbool m = valid & uHigh & vLow;  if (any(m)) done |= eval(m,This->child[1],uHi,vLo,childScale,depth+1); }
+          { const vbool m = valid & uHigh & vHigh; if (any(m)) done |= eval(m,This->child[2],uHi,vHi,childScale,depth+1); }
+          { const vbool m = valid & uLow  & vHigh; if (any(m)) done |= eval(m,This->child[3],uLo,vHi,childScale,depth+1); }
+          return done;
+        }
+        
+        vbool eval_general(const vbool& valid, const typename Patch::SubdividedGeneralPatch* patch, const vfloat& U, const vfloat& V, const size_t depth)
+        {
+          /* (U,V) addresses 2x2-sized cells: the cell indices select the child, the remainder (shifted by 0.5) is the coordinate inside it */
+          const vint col = (vint)floor(0.5f*U), row = (vint)floor(0.5f*V);
+          const vfloat u = 2.0f*frac(0.5f*U)-0.5f, v = 2.0f*frac(0.5f*V)-0.5f;
+          const vint idx = (row<<2)+col; assert(all(valid,idx<patch->N));
+          /* foreach_unique hands the callback a lane mask together with one scalar child index at a time */
+          vbool done = false;
+          foreach_unique(valid,idx,[&](const vbool& lanes, const int c) { done |= eval(lanes,patch->child[c],u,v,1.0f,depth+1); });
+          return done;
+        }
+        
+        vbool eval(const vbool& valid, Ref This, const vfloat& u, const vfloat& v, const float dscale, const size_t depth) /* evaluates the lanes in valid on the patch referenced by This; returns the mask of lanes that were evaluated */
+        {
+          if (!This) return false; /* no patch referenced: no lane is handled */
+          switch (This.type()) 
+          {
+          case Patch::BILINEAR_PATCH: { /* the four leaf representations below evaluate straight into the output arrays and report every input lane as handled */
+            ((typename Patch::BilinearPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); 
+            return valid;
+          }
+          case Patch::BSPLINE_PATCH: {
+            ((typename Patch::BSplinePatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+            return valid;
+          }
+          case Patch::BEZIER_PATCH: {
+            ((typename Patch::BezierPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N);
+            return valid;
+          }
+          case Patch::GREGORY_PATCH: {
+            ((typename Patch::GregoryPatch*)This.object())->patch.eval(valid,u,v,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dscale,dstride,N); 
+            return valid;
+          }
+          case Patch::SUBDIVIDED_QUAD_PATCH: { /* interior node: descend into the quadrant children */
+            return eval_quad(valid,((typename Patch::SubdividedQuadPatch*)This.object()),u,v,dscale,depth);
+          }
+          case Patch::SUBDIVIDED_GENERAL_PATCH: { /* eval_general takes no dscale and passes 1.0f to its children, hence the assert */
+            assert(dscale == 1.0f); 
+            return eval_general(valid,((typename Patch::SubdividedGeneralPatch*)This.object()),u,v,depth); 
+          }
+          case Patch::EVAL_PATCH: { /* serialized Catmull-Clark patch: rebuild it and evaluate feature-adaptively */
+            CatmullClarkPatch patch; patch.deserialize(This.object());
+            FeatureAdaptiveEvalSimd<vbool,vint,vfloat,Vertex,Vertex_t>(patch,valid,u,v,dscale,depth,P,dPdu,dPdv,ddPdudu,ddPdvdv,ddPdudv,dstride,N);
+            return valid;
+          }
+          default: 
+            assert(false); 
+            return false;
+          }
+        }
+
+      private:
+        float* const P;
+        float* const dPdu;
+        float* const dPdv;
+        float* const ddPdudu;
+        float* const ddPdvdv;
+        float* const ddPdudv;
+        const size_t dstride;
+        const size_t N;
+      };
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/subdivpatch1base.h b/thirdparty/embree-aarch64/kernels/subdiv/subdivpatch1base.h
new file mode 100644
index 0000000000..d5bc403cca
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/subdivpatch1base.h
@@ -0,0 +1,156 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../geometry/primitive.h"
+#include "bspline_patch.h"
+#include "bezier_patch.h"
+#include "gregory_patch.h"
+#include "gregory_patch_dense.h"
+#include "tessellation.h"
+#include "tessellation_cache.h"
+#include "gridrange.h"
+#include "patch_eval_grid.h"
+#include "feature_adaptive_eval_grid.h"
+#include "../common/scene_subdiv_mesh.h"
+
+namespace embree
+{
+  struct __aligned(64) SubdivPatch1Base // per-patch record: identifiers, tessellation levels, grid resolution, cache tag and patch data
+  {
+  public:
+    /*! Patch representation identifiers (presumably the values stored in the 'type' byte -- the assignment is not visible in this header). */
+    enum Type {
+      INVALID_PATCH          = 0,
+      BSPLINE_PATCH          = 1,  
+      BEZIER_PATCH           = 2,  
+      GREGORY_PATCH          = 3,
+      EVAL_PATCH             = 5,
+      BILINEAR_PATCH         = 6,
+    };
+    /*! Bits tested against the 'flags' byte. */
+    enum Flags {
+      TRANSITION_PATCH       = 16, 
+    };
+
+    /*! Default constructor. */
+    __forceinline SubdivPatch1Base () {} // leaves every member uninitialized
+
+    SubdivPatch1Base (const unsigned int gID,
+                      const unsigned int pID,
+                      const unsigned int subPatch,
+                      const SubdivMesh *const mesh,
+                      const size_t time,
+                      const Vec2f uv[4],
+                      const float edge_level[4],
+                      const int subdiv[4],
+                      const int simd_width);
+    /*! True when the TRANSITION_PATCH bit is set in flags. */
+    __forceinline bool needsStitching() const {
+      return flags & TRANSITION_PATCH;      
+    }
+    /*! Decodes corner i from its 16-bit fixed-point form; the scale 8/65536 maps the stored values to [0,8). */
+    __forceinline Vec2f getUV(const size_t i) const {
+      return Vec2f((float)u[i],(float)v[i]) * (8.0f/0x10000);
+    }
+
+    static void computeEdgeLevels(const float edge_level[4], const int subdiv[4], float level[4]);
+    static Vec2i computeGridSize(const float level[4]);
+    bool updateEdgeLevels(const float edge_level[4], const int subdiv[4], const SubdivMesh *const mesh, const int simd_width);
+
+  public:
+    /*! Grid storage size in bytes: 4 arrays of (grid_size_simd_blocks*VSIZEX) 4-byte elements, rounded up to a multiple of 64. */
+    __forceinline size_t getGridBytes() const {
+      const size_t grid_size_xyzuv = (grid_size_simd_blocks * VSIZEX) * 4; // NOTE(review): factor is 4 although the name lists x,y,z,u,v -- confirm the intended layout
+      return 64*((grid_size_xyzuv+15) / 16);
+    }
+
+    __forceinline void write_lock()     { mtx.lock();   }
+    __forceinline void write_unlock()   { mtx.unlock(); }
+    __forceinline bool try_write_lock() { return mtx.try_lock(); }
+    //__forceinline bool try_read_lock()  { return mtx.try_read_lock(); }
+    /*! Replaces root_ref with a default-constructed tag. */
+    __forceinline void resetRootRef() {
+      //assert( mtx.hasInitialState() );
+      root_ref = SharedLazyTessellationCache::Tag();
+    }
+    /*! Views root_ref as a cache entry through a reference cast (no copy is made). */
+    __forceinline SharedLazyTessellationCache::CacheEntry& entry() {
+      return (SharedLazyTessellationCache::CacheEntry&) root_ref;
+    }
+
+  public:    
+    __forceinline unsigned int geomID() const  {
+      return geom;
+    } 
+
+    __forceinline unsigned int primID() const  {
+      return prim;
+    } 
+
+  public:
+    SharedLazyTessellationCache::Tag root_ref;  //!< tessellation cache tag, reset by resetRootRef()
+    SpinLock mtx;                               //!< lock behind write_lock()/write_unlock()/try_write_lock()
+
+    unsigned short u[4];                        //!< 16bit discretized u,v coordinates
+    unsigned short v[4];
+    float level[4];
+
+    unsigned char flags;                        //!< bit set, see Flags
+    unsigned char type;
+    unsigned short grid_u_res;
+    unsigned int geom;                          //!< geometry ID of the subdivision mesh this patch belongs to
+    unsigned int prim;                          //!< primitive ID of this subdivision patch
+    unsigned short grid_v_res;
+
+    unsigned short grid_size_simd_blocks;       //!< grid size in SIMD blocks of VSIZEX elements, see getGridBytes()
+    unsigned int time_;
+    /*! Alternative view of the first bytes of patch_v, used by the edge()/subPatch() accessors below. */
+    struct PatchHalfEdge {
+      const HalfEdge* edge;
+      unsigned subPatch;
+    };
+
+    Vec3fa patch_v[4][4];
+
+    const HalfEdge *edge() const { return ((PatchHalfEdge*)patch_v)->edge; }
+    unsigned time() const { return time_; }
+    unsigned subPatch() const { return ((PatchHalfEdge*)patch_v)->subPatch; }
+    /* NOTE(review): both setters are const-qualified yet write into patch_v through a cast that drops const; they must not be used on a truly const object */
+    void set_edge(const HalfEdge *h) const { ((PatchHalfEdge*)patch_v)->edge = h; }
+    void set_subPatch(const unsigned s) const { ((PatchHalfEdge*)patch_v)->subPatch = s; }
+  };
+
+  namespace isa
+  {
+    Vec3fa patchEval(const SubdivPatch1Base& patch, const float uu, const float vv);
+    Vec3fa patchNormal(const SubdivPatch1Base& patch, const float uu, const float vv);
+    
+    template<typename simdf>
+      Vec3<simdf> patchEval(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); 
+
+    template<typename simdf>
+      Vec3<simdf> patchNormal(const SubdivPatch1Base& patch, const simdf& uu, const simdf& vv); 
+   
+
+    /* eval grid over patch and stitch edges when required */      
+    void evalGrid(const SubdivPatch1Base& patch,
+                  const unsigned x0, const unsigned x1,
+                  const unsigned y0, const unsigned y1,
+                  const unsigned swidth, const unsigned sheight,
+                  float *__restrict__ const grid_x,
+                  float *__restrict__ const grid_y,
+                  float *__restrict__ const grid_z,
+                  float *__restrict__ const grid_u,
+                  float *__restrict__ const grid_v,
+                  const SubdivMesh* const geom);
+
+    /* eval grid over patch (stitching edges when required) and return its bounds */      
+    BBox3fa evalGridBounds(const SubdivPatch1Base& patch,
+                           const unsigned x0, const unsigned x1,
+                           const unsigned y0, const unsigned y1,
+                           const unsigned swidth, const unsigned sheight,
+                           const SubdivMesh* const geom);
+  }
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/tessellation.h b/thirdparty/embree-aarch64/kernels/subdiv/tessellation.h
new file mode 100644
index 0000000000..bda1e2d559
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/tessellation.h
@@ -0,0 +1,161 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace embree
+{
+  /* adjust discrete tessellation level for feature-adaptive pre-subdivision: rounds l up to a multiple of 2^sublevel */
+  __forceinline float adjustTessellationLevel(float l, const size_t sublevel)
+  {
+    for (size_t k=sublevel; k>0; k--) l *= 0.5f;  // express the level in units of 2^sublevel
+    l = ceilf(l);                                  // round up to a whole number of those units
+    for (size_t k=sublevel; k>0; k--) l *= 2.0f;  // convert back to the original scale
+    return l;
+  }
+  
+  __forceinline int stitch(const int x, const int fine, const int coarse) {
+    const int num = (2*x+1)*coarse, den = 2*fine; return num/den; // integer quotient, i.e. floor((x+0.5)*coarse/fine) for non-negative inputs
+  }
+
+  __forceinline void stitchGridEdges(const unsigned int low_rate, /* rewrites one edge's coordinates so that high_rate samples snap onto the positions of low_rate samples */
+                                     const unsigned int high_rate,
+                                     const unsigned int x0,
+                                     const unsigned int x1,
+				    float * __restrict__ const uv_array,
+				    const unsigned int uv_array_step) /* distance between consecutive edge samples inside uv_array */
+  {
+#if 1
+    const float inv_low_rate = rcp((float)(low_rate-1)); /* spacing between two coarse samples */
+    for (unsigned x=x0; x<=x1; x++) { /* x0 and x1 are both inclusive */
+      uv_array[(x-x0)*uv_array_step] = float(stitch(x,high_rate-1,low_rate-1))*inv_low_rate; /* coarse sample index chosen by stitch(), converted to a coordinate */
+    }
+    if (unlikely(x1 == high_rate-1)) /* when the range reaches the last fine sample, that sample is set to exactly 1 */
+      uv_array[(x1-x0)*uv_array_step] = 1.0f;
+#else /* disabled alternative that advances an integer error term per sample; it ignores x0/x1 and always starts at sample 0 */
+    assert(low_rate < high_rate);
+    assert(high_rate >= 2);
+    
+    const float inv_low_rate = rcp((float)(low_rate-1));
+    const unsigned int dy = low_rate  - 1; 
+    const unsigned int dx = high_rate - 1;
+    
+    int p = 2*dy-dx;  
+    
+    unsigned int offset = 0;
+    unsigned int y = 0;
+    float value = 0.0f;
+    for(unsigned int x=0;x<high_rate-1; x++) // '<=' would be correct but we will leave the 1.0f at the end
+    {
+      uv_array[offset] = value;
+      
+      offset += uv_array_step;      
+      if (unlikely(p > 0))
+      {
+	y++;
+	value = (float)y * inv_low_rate;
+	p -= 2*dx;
+      }
+      p += 2*dy;
+    }
+#endif
+  }
+  
+  __forceinline void stitchUVGrid(const float edge_levels[4], /* re-snaps the border rows/columns of a u/v grid whose edge level yields fewer points than the grid resolution */
+                                  const unsigned int swidth,
+                                  const unsigned int sheight,
+                                  const unsigned int x0,
+                                  const unsigned int y0,
+				  const unsigned int grid_u_res,
+				  const unsigned int grid_v_res,
+				  float * __restrict__ const u_array,
+				  float * __restrict__ const v_array)
+  {
+    const unsigned int x1 = x0+grid_u_res-1; /* inclusive last column/row covered by this sub-grid */
+    const unsigned int y1 = y0+grid_v_res-1;
+    const unsigned int int_edge_points0 = (unsigned int)edge_levels[0] + 1; /* points along an edge = truncated level + 1 */
+    const unsigned int int_edge_points1 = (unsigned int)edge_levels[1] + 1;
+    const unsigned int int_edge_points2 = (unsigned int)edge_levels[2] + 1;
+    const unsigned int int_edge_points3 = (unsigned int)edge_levels[3] + 1;
+    /* first row of the sub-grid (u values), only when it is row 0 of the full grid */
+    if (unlikely(y0 == 0 && int_edge_points0 < swidth))
+      stitchGridEdges(int_edge_points0,swidth,x0,x1,u_array,1);
+    /* last row of the sub-grid (u values), only when it is the last row of the full grid */
+    if (unlikely(y1 == sheight-1 && int_edge_points2 < swidth))
+      stitchGridEdges(int_edge_points2,swidth,x0,x1,&u_array[(grid_v_res-1)*grid_u_res],1);
+    /* NOTE(review): guarded by x0 == 0 but writes the LAST column (v values, edge 1) -- confirm the condition/column pairing is intended */
+    if (unlikely(x0 == 0 && int_edge_points1 < sheight))
+      stitchGridEdges(int_edge_points1,sheight,y0,y1,&v_array[grid_u_res-1],grid_u_res);
+    /* NOTE(review): guarded by x1 == swidth-1 but writes the FIRST column (v values, edge 3) -- same question as above */
+    if (unlikely(x1 == swidth-1 && int_edge_points3 < sheight))
+      stitchGridEdges(int_edge_points3,sheight,y0,y1,v_array,grid_u_res);  
+  }
+  
+  __forceinline void gridUVTessellator(const float edge_levels[4],  /* fills u_array/v_array with uniform coordinates for a grid_u_res x grid_v_res sub-grid; edge_levels is only read by the asserts */
+                                       const unsigned int swidth,
+                                       const unsigned int sheight,
+                                       const unsigned int x0,
+                                       const unsigned int y0,
+				       const unsigned int grid_u_res,
+				       const unsigned int grid_v_res,
+				       float * __restrict__ const u_array,
+				       float * __restrict__ const v_array)
+  {
+    assert( grid_u_res >= 1);
+    assert( grid_v_res >= 1);
+    assert( edge_levels[0] >= 1.0f );
+    assert( edge_levels[1] >= 1.0f );
+    assert( edge_levels[2] >= 1.0f );
+    assert( edge_levels[3] >= 1.0f );
+    /* the two branches below are the same algorithm at vector width 8 and 4 */
+#if defined(__AVX__)
+    const vint8 grid_u_segments = vint8(swidth)-1;
+    const vint8 grid_v_segments = vint8(sheight)-1;
+    /* coordinate = sample index / number of segments */
+    const vfloat8 inv_grid_u_segments = rcp(vfloat8(grid_u_segments));
+    const vfloat8 inv_grid_v_segments = rcp(vfloat8(grid_v_segments));
+    /* index is the row-major offset of the current row; rows are grid_u_res floats apart */
+    unsigned int index = 0;
+    vint8 v_i( zero );
+    for (unsigned int y=0;y<grid_v_res;y++,index+=grid_u_res,v_i += 1)
+    {
+      vint8 u_i ( step );
+      /* lanes outside the mask get exactly 1.0; the mask compares the sub-grid-local index, while the value itself uses the y0/x0 offset */
+      const vbool8 m_v = v_i < grid_v_segments;
+      /* every iteration stores a full vector, so up to 7 floats past the end of a row are written; the arrays must provide that slack */
+      for (unsigned int x=0;x<grid_u_res;x+=8, u_i += 8)
+      {
+        const vbool8 m_u = u_i < grid_u_segments;
+	const vfloat8 u = select(m_u, vfloat8(x0+u_i) * inv_grid_u_segments, 1.0f);
+	const vfloat8 v = select(m_v, vfloat8(y0+v_i) * inv_grid_v_segments, 1.0f);
+	vfloat8::storeu(&u_array[index + x],u);
+	vfloat8::storeu(&v_array[index + x],v);	   
+      }
+    }       
+ #else   
+    const vint4 grid_u_segments = vint4(swidth)-1;
+    const vint4 grid_v_segments = vint4(sheight)-1;
+    
+    const vfloat4 inv_grid_u_segments = rcp(vfloat4(grid_u_segments));
+    const vfloat4 inv_grid_v_segments = rcp(vfloat4(grid_v_segments));
+    
+    unsigned int index = 0;
+    vint4 v_i( zero );
+    for (unsigned int y=0;y<grid_v_res;y++,index+=grid_u_res,v_i += 1)
+    {
+      vint4 u_i ( step );
+      
+      const vbool4 m_v = v_i < grid_v_segments;
+      /* as above with 4 lanes: up to 3 floats past the end of a row are written */
+      for (unsigned int x=0;x<grid_u_res;x+=4, u_i += 4)
+      {
+        const vbool4 m_u = u_i < grid_u_segments;
+	const vfloat4 u = select(m_u, vfloat4(x0+u_i) * inv_grid_u_segments, 1.0f);
+	const vfloat4 v = select(m_v, vfloat4(y0+v_i) * inv_grid_v_segments, 1.0f);
+        vfloat4::storeu(&u_array[index + x],u);
+	vfloat4::storeu(&v_array[index + x],v);	   
+      }
+    }       
+#endif
+  } 
+}
diff --git a/thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h b/thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h
new file mode 100644
index 0000000000..5c215288b6
--- /dev/null
+++ b/thirdparty/embree-aarch64/kernels/subdiv/tessellation_cache.h
@@ -0,0 +1,325 @@
+// Copyright 2009-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../common/default.h"
+
+/* force a complete cache invalidation when running out of allocation space */
+#define FORCE_SIMPLE_FLUSH 0
+
+#define THREAD_BLOCK_ATOMIC_ADD 4
+
+#if defined(DEBUG)
+#define CACHE_STATS(x) 
+#else
+#define CACHE_STATS(x) 
+#endif
+
+namespace embree
+{
+  class SharedTessellationCacheStats
+  {
+  public:
+    /* stats: static counters for the shared tessellation cache; in this header they are only named inside CACHE_STATS(), which expands to nothing */
+    static std::atomic<size_t> cache_accesses;  // named in CACHE_STATS() at the top of lookup()
+    static std::atomic<size_t> cache_hits;      // named in CACHE_STATS() where lookup() finds a valid tag
+    static std::atomic<size_t> cache_misses;    // named in CACHE_STATS() where lookup() returns nullptr
+    static std::atomic<size_t> cache_flushes;   // not referenced in this header; presumably counts cache flushes -- confirm in tessellation_cache.cpp
+    static size_t        cache_num_patches;     // not referenced in this header -- TODO confirm meaning against the .cpp
+    __aligned(64) static SpinLock mtx;          // NOTE(review): looks like it guards the non-atomic stats above -- confirm
+    
+    /* print stats for debugging */                 
+    static void printStats();
+    static void clearStats();  // presumably resets the counters above -- verify in the .cpp
+  };
+  
+  void resizeTessellationCache(size_t new_size);
+  void resetTessellationCache();
+  
+ ////////////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////////////
+ /* Per-thread lock/usage counter record, declared with 64-byte alignment. */
+ struct __aligned(64) ThreadWorkState 
+ {
+   ALIGNED_STRUCT_(64);
+
+   std::atomic<size_t> counter;   // changed via fetch_add in lockThread()/unlockThread(); non-zero means "locked" (see isLocked())
+   ThreadWorkState* next;         // presumably links the states into a list -- confirm in tessellation_cache.cpp
+   bool allocated;                // NOTE(review): looks like it marks separately allocated (vs. preallocated) states -- confirm
+
+   __forceinline ThreadWorkState(bool allocated = false) 
+     : counter(0), next(nullptr), allocated(allocated) 
+   {
+     assert( ((size_t)this % 64) == 0 );  // the object itself must sit on a 64-byte boundary
+   }   
+ };
+
+ class __aligned(64) SharedLazyTessellationCache 
+ {
+ public:
+   /* Tag layout: the data offset occupies the low bits (at most REF_TAG_MASK), the commit time is stored from bit COMMIT_INDEX_SHIFT upwards */
+   static const size_t NUM_CACHE_SEGMENTS              = 8;       // multiplier applied to globalTime in getTime() and width of the validity window in validCacheIndex()
+   static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;     // not used in this header; presumably the size of a preallocated ThreadWorkState pool -- confirm
+   static const size_t COMMIT_INDEX_SHIFT              = 32+8;    // bit position where Tag::init() stores the commit time
+#if defined(__X86_64__) || defined(__aarch64__)
+   static const size_t REF_TAG_MASK                    = 0xffffffffff;  // 40 bits, i.e. exactly COMMIT_INDEX_SHIFT bits
+#else
+   static const size_t REF_TAG_MASK                    = 0x7FFFFFFF;    // 31 bits on all other targets
+#endif
+   static const size_t MAX_TESSELLATION_CACHE_SIZE     = REF_TAG_MASK+1;  // number of distinct offsets a Tag can encode
+   static const size_t BLOCK_SIZE                      = 64;              // allocation granularity in bytes (see malloc())
+   
+
+    /*! Per thread tessellation ref cache */
+   static __thread ThreadWorkState* init_t_state;
+   static ThreadWorkState* current_t_state;
+   /*! Returns this thread's work state; calls getNextRenderThreadWorkState() first while the thread-local init_t_state is still null. */
+   static __forceinline ThreadWorkState *threadState() 
+   {
+     /* the call below sets init_t_state itself; it can't return the pointer due to a macosx icc bug */
+     if (unlikely(init_t_state == nullptr))
+       sharedLazyTessellationCache.getNextRenderThreadWorkState();
+     return init_t_state;
+   }
+   /*! 64-bit cache tag: data offset (relative to getDataPtr()) in the low bits, commit time shifted up by COMMIT_INDEX_SHIFT; the value 0 means "empty" */
+   struct Tag
+   {
+     __forceinline Tag() : data(0) {}
+
+     __forceinline Tag(void* ptr, size_t combinedTime) { 
+       init(ptr,combinedTime);
+     }
+
+     __forceinline Tag(size_t ptr, size_t combinedTime) {
+       init((void*)ptr,combinedTime); 
+     }
+
+     __forceinline void init(void* ptr, size_t combinedTime)
+     {
+       if (ptr == nullptr) {
+         data = 0;
+         return;
+       }
+       int64_t new_root_ref = (int64_t) ptr;
+       new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();                                
+       assert( new_root_ref >= 0 && new_root_ref <= (int64_t)REF_TAG_MASK ); // offset must be non-negative and fit below the time bits
+       new_root_ref |= (int64_t)((uint64_t)combinedTime << COMMIT_INDEX_SHIFT); // shift as unsigned: no signed overflow for large times
+       data = new_root_ref;
+     }
+
+     __forceinline int64_t get() const { return data.load(); }
+     __forceinline void set( int64_t v ) { data.store(v); }
+     __forceinline void reset() { data.store(0); }
+
+   private:
+     atomic<int64_t> data;
+   };
+   /*! Returns the commit time stored from bit COMMIT_INDEX_SHIFT upwards; the shift is done unsigned so a tag with the top bit set is not sign-extended into a huge index */
+   static __forceinline size_t extractCommitIndex(const int64_t v) { return (size_t)((uint64_t)v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT); }
+   /*! One cache slot: the tag of the cached data plus the lock that the template lookup() takes (try_lock) before rebuilding it. */
+   struct CacheEntry
+   {
+     Tag tag;
+     SpinLock mutex;
+   };
+
+ private:
+   /* cache storage and bookkeeping; anything this header does not show is hedged below */
+   float *data;                // base pointer of the cache memory; getBlockPtr() indexes it in steps of 16 floats
+   bool hugepages;             // presumably whether 'data' was allocated with huge pages -- confirm in the .cpp
+   size_t size;                // compared against block_index*16 in getBlockPtr(); unit looks like floats -- TODO confirm
+   size_t maxBlocks;           // exclusive upper bound for block indices (asserted in getBlockPtr())
+   ThreadWorkState *threadWorkState;  // NOTE(review): looks like the preallocated ThreadWorkState array -- confirm
+      
+   __aligned(64) std::atomic<size_t> localTime;   // local time step, combined with the global time in getTime()
+   __aligned(64) std::atomic<size_t> next_block;  // next free block index, advanced by alloc()
+   __aligned(64) SpinLock   reset_state;          // not used in this header -- see the .cpp
+   __aligned(64) SpinLock   linkedlist_mtx;       // not used in this header -- see the .cpp
+   __aligned(64) std::atomic<size_t> switch_block_threshold;  // block limit checked by alloc(); also returned by maxAllocSize()
+   __aligned(64) std::atomic<size_t> numRenderThreads;        // not used in this header -- see the .cpp
+
+
+ public:
+
+      
+   SharedLazyTessellationCache();
+   ~SharedLazyTessellationCache();
+
+   void getNextRenderThreadWorkState();
+   /*! Limit, in blocks, that a single alloc() request has to stay below. */
+   __forceinline size_t maxAllocSize() const {
+     return switch_block_threshold.load();
+   }
+   /* accessors for the local time step (localTime) */
+   __forceinline size_t getCurrentIndex() { return localTime.load(); }  // current value
+   __forceinline void   addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }  // atomically advances it by i
+   /*! Combined cache time: NUM_CACHE_SEGMENTS steps per unit of globalTime plus the current local time step. */
+   __forceinline size_t getTime(const size_t globalTime) {
+     return NUM_CACHE_SEGMENTS*globalTime + localTime.load();
+   }
+
+   /* per-thread usage counter; lockThread and unlockThread both return the counter value from before the update (fetch_add) */
+   __forceinline size_t lockThread  (ThreadWorkState *const t_state, const ssize_t plus=1) { return t_state->counter.fetch_add(plus);  }
+   __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); } // plus=-1 converts to size_t and wraps, i.e. decrements
+   /* a state counts as locked while its counter is non-zero */
+   __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }
+   /* convenience wrappers that operate on the calling thread's own work state (threadState()) */
+   static __forceinline void lock  () { sharedLazyTessellationCache.lockThread(threadState()); }
+   static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
+   static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
+   static __forceinline size_t getState() { return threadState()->counter.load(); }  // raw counter value
+   static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }
+   /*! Static shortcut for sharedLazyTessellationCache.getTime(globalTime). */
+   static __forceinline size_t getTCacheTime(const size_t globalTime) {
+     return sharedLazyTessellationCache.getTime(globalTime);
+   }
+
+   /* per thread lock: retries until the counter was below THREAD_BLOCK_ATOMIC_ADD before this thread's own increment */
+   __forceinline void lockThreadLoop (ThreadWorkState *const t_state) 
+   { 
+     while(1)
+     {
+       size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1); // value before our increment
+       if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD)) // presumably raised by a thread that is switching/flushing segments -- confirm in the .cpp
+       {
+         /* lock failed wait until sync phase is over */
+         sharedLazyTessellationCache.unlockThread(t_state,-1); // undo our increment before waiting
+         sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
+       }
+       else
+         break;
+     }
+   }
+   /*! Decodes the tag of 'entry': returns the cached data pointer if the tag is set and its commit index is still valid for globalTime, nullptr otherwise. */
+   static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
+   {
+     const int64_t tagBits = entry.tag.get();
+     CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);
+
+     /* an all-zero tag marks an entry that holds no data */
+     const bool hit = likely(tagBits != 0) &&
+                      likely(sharedLazyTessellationCache.validCacheIndex(extractCommitIndex(tagBits),globalTime));
+     if (!hit)
+     {
+       CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
+       return nullptr;
+     }
+
+     /* the low bits hold the offset of the data relative to the cache base pointer */
+     CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
+     const size_t offset = tagBits & REF_TAG_MASK;
+     return (void*) (offset + (size_t)sharedLazyTessellationCache.getDataPtr());
+   }
+   /*! Lookup-or-build: returns the cached data of 'entry', calling constructor() to build it if no valid data is cached. Both return paths leave t_state locked; releasing it is presumably the caller's job (unlock()) -- confirm at the call sites. */
+   template<typename Constructor>
+     static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
+   {
+     ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();
+
+     while (true)
+     {
+       sharedLazyTessellationCache.lockThreadLoop(t_state);
+       void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
+       if (patch) return (decltype(constructor())) patch; // hit: return while the thread state is still locked
+       
+       if (entry.mutex.try_lock()) // threads that fail to get the entry lock fall through and retry
+       {
+         if (!validTag(entry.tag,globalTime)) // re-check under the entry lock
+         {
+           auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
+           auto ret = constructor(); // thread is locked here!
+           assert(ret);
+           /* this should never return nullptr */
+           auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
+           auto time = before ? timeBefore : timeAfter; // 'before' selects which time stamp goes into the tag
+           __memory_barrier();
+           entry.tag = SharedLazyTessellationCache::Tag(ret,time);
+           __memory_barrier();
+           entry.mutex.unlock();
+           return ret;
+         }
+         entry.mutex.unlock();
+       }
+       SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state); // drop the thread lock before retrying
+     }
+   }
+   /*! True if commit index i is still usable at globalTime: exact match with FORCE_SIMPLE_FLUSH, otherwise within the validTime() window. */
+   __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
+   {
+#if FORCE_SIMPLE_FLUSH == 1
+     return getTime(globalTime) == i;
+#else
+     return validTime(i,getTime(globalTime));
+#endif
+   }
+   /*! True while newTime does not exceed oldtime+(NUM_CACHE_SEGMENTS-1), evaluated in unsigned arithmetic. */
+   static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
+   {
+     return newTime <= oldtime+(NUM_CACHE_SEGMENTS-1);
+   }
+
+    /*! True if 'tag' is non-empty and its commit index is still valid for globalTime. */
+    static __forceinline bool validTag(const Tag& tag, size_t globalTime)
+    {
+      const int64_t tagBits = tag.get();
+      /* a zero tag is empty and therefore never valid */
+      return (tagBits != 0) &&
+             sharedLazyTessellationCache.validCacheIndex(extractCommitIndex(tagBits),globalTime);
+    }
+
+   void waitForUsersLessEqual(ThreadWorkState *const t_state,
+			      const unsigned int users);
+   /*! Reserves 'blocks' consecutive blocks; returns the first block index, or (size_t)-1 when the request does not fit below switch_block_threshold. */
+   __forceinline size_t alloc(const size_t blocks)
+   {
+     if (unlikely(blocks >= switch_block_threshold)) // a request this large can never succeed
+       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");
+
+     assert(blocks < switch_block_threshold);
+     size_t index = next_block.fetch_add(blocks); // next_block is advanced even if the check below fails
+     if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
+     return index;
+   }
+   /*! Allocates room for 'bytes' bytes (rounded up to whole blocks); calls allocNextSegment() and retries for as long as alloc() fails. */
+   static __forceinline void* malloc(const size_t bytes)
+   {
+     ThreadWorkState *const t_state = threadState();
+     const size_t num_blocks = (bytes+BLOCK_SIZE-1)/BLOCK_SIZE;
+
+     /* (size_t)-1 is alloc()'s "does not fit" marker */
+     size_t block_index = sharedLazyTessellationCache.alloc(num_blocks);
+     while (block_index == (size_t)-1)
+     {
+       /* the thread lock is released around allocNextSegment() and re-taken afterwards */
+       sharedLazyTessellationCache.unlockThread(t_state);
+       sharedLazyTessellationCache.allocNextSegment();
+       sharedLazyTessellationCache.lockThread(t_state);
+       block_index = sharedLazyTessellationCache.alloc(num_blocks);
+     }
+
+     return sharedLazyTessellationCache.getBlockPtr(block_index);
+   }
+   /*! Address of block 'block_index' inside the cache memory; consecutive blocks are 16 floats of 'data' apart. */
+   __forceinline void *getBlockPtr(const size_t block_index)
+   {
+     assert(block_index < maxBlocks);
+     assert(data);
+     assert(block_index*16 <= size);
+     return (void*)(data + block_index*16);
+   }
+   /* simple accessors */
+   __forceinline void*  getDataPtr()      { return data; }  // base pointer that Tag offsets are relative to
+   __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }  // derived from next_block, which alloc() also advances for failed requests
+   __forceinline size_t getMaxBlocks()    { return maxBlocks; }
+   __forceinline size_t getSize()         { return size; }
+
+   void allocNextSegment();
+   void realloc(const size_t newSize);
+
+   void reset();
+
+   static SharedLazyTessellationCache sharedLazyTessellationCache;
+ };
+}
diff --git a/thirdparty/embree-aarch64/patches/godot-changes.patch b/thirdparty/embree-aarch64/patches/godot-changes.patch
new file mode 100644
index 0000000000..86fbf226d2
--- /dev/null
+++ b/thirdparty/embree-aarch64/patches/godot-changes.patch
@@ -0,0 +1,630 @@
+diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
+index 76c6b740aa..51d296fb16 100644
+--- a/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
++++ b/thirdparty/embree-aarch64/common/algorithms/parallel_for.h
+@@ -27,7 +27,10 @@ namespace embree
+           func(r.begin());
+         });
+       if (!TaskScheduler::wait())
+-        throw std::runtime_error("task cancelled");
++        // -- GODOT start --
++        // throw std::runtime_error("task cancelled");
++        abort(); 
++        // -- GODOT end --
+     }
+ #elif defined(TASKING_GCD) && defined(BUILD_IOS)
+       
+@@ -55,13 +58,19 @@ namespace embree
+         func(i);
+       },context);
+     if (context.is_group_execution_cancelled())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort(); 
++      // -- GODOT end --
+   #else
+     tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+         func(i);
+       });
+     if (tbb::task::self().is_cancelled())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort(); 
++      // -- GODOT end --
+   #endif
+ 
+ #elif defined(TASKING_PPL)
+@@ -81,7 +90,10 @@ namespace embree
+ #if defined(TASKING_INTERNAL)
+     TaskScheduler::spawn(first,last,minStepSize,func);
+     if (!TaskScheduler::wait())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort(); 
++      // -- GODOT end --
+ 
+ #elif defined(TASKING_GCD) && defined(BUILD_IOS)
+       
+@@ -109,13 +121,19 @@ namespace embree
+         func(range<Index>(r.begin(),r.end()));
+       },context);
+     if (context.is_group_execution_cancelled())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort(); 
++      // -- GODOT end --
+   #else
+     tbb::parallel_for(tbb::blocked_range<Index>(first,last,minStepSize),[&](const tbb::blocked_range<Index>& r) {
+         func(range<Index>(r.begin(),r.end()));
+       });
+     if (tbb::task::self().is_cancelled())
+-      throw std::runtime_error("task cancelled");
++      // -- GODOT start --
++      // throw std::runtime_error("task cancelled");
++      abort(); 
++      // -- GODOT end --
+   #endif
+ 
+ #elif defined(TASKING_PPL)
+@@ -147,13 +165,19 @@ namespace embree
+           func(i);
+         },tbb::simple_partitioner(),context);
+       if (context.is_group_execution_cancelled())
+-        throw std::runtime_error("task cancelled");
++        // -- GODOT start --
++        // throw std::runtime_error("task cancelled");
++        abort(); 
++        // -- GODOT end --
+     #else
+       tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+           func(i);
+         },tbb::simple_partitioner());
+       if (tbb::task::self().is_cancelled())
+-        throw std::runtime_error("task cancelled");
++        // -- GODOT start --
++        // throw std::runtime_error("task cancelled");
++        abort(); 
++        // -- GODOT end --
+     #endif
+   }
+ 
+@@ -168,13 +192,19 @@ namespace embree
+           func(i);
+         },ap,context);
+       if (context.is_group_execution_cancelled())
+-        throw std::runtime_error("task cancelled");
++       // -- GODOT start --
++       // throw std::runtime_error("task cancelled");
++       abort(); 
++       // -- GODOT end --
+     #else
+       tbb::parallel_for(Index(0),N,Index(1),[&](Index i) {
+           func(i);
+         },ap);
+       if (tbb::task::self().is_cancelled())
+-        throw std::runtime_error("task cancelled");
++        // -- GODOT start --
++        // throw std::runtime_error("task cancelled");
++        abort(); 
++        // -- GODOT end --
+     #endif
+   }
+ 
+diff --git a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
+index d444b6a2e4..0daf94e50e 100644
+--- a/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
++++ b/thirdparty/embree-aarch64/common/algorithms/parallel_reduce.h
+@@ -58,15 +58,19 @@ namespace embree
+     const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+       [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+       reduction,context);
+-    if (context.is_group_execution_cancelled())
+-      throw std::runtime_error("task cancelled");
++    // -- GODOT start --
++    // if (context.is_group_execution_cancelled())
++    //   throw std::runtime_error("task cancelled");
++    // -- GODOT end --
+     return v;
+   #else
+     const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
+       [&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
+       reduction);
+-    if (tbb::task::self().is_cancelled())
+-      throw std::runtime_error("task cancelled");
++    // -- GODOT start --
++    // if (tbb::task::self().is_cancelled())
++    //   throw std::runtime_error("task cancelled");
++    // -- GODOT end --
+     return v;
+   #endif
+ #else // TASKING_PPL
+diff --git a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp
+index 7e7b9faef8..98dc80ad59 100644
+--- a/thirdparty/embree-aarch64/common/lexers/stringstream.cpp
++++ b/thirdparty/embree-aarch64/common/lexers/stringstream.cpp
+@@ -39,7 +39,10 @@ namespace embree
+     std::vector<char> str; str.reserve(64);
+     while (cin->peek() != EOF && !isSeparator(cin->peek())) {
+       int c = cin->get();
+-      if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
++      // -- GODOT start --
++      // if (!isValidChar(c)) throw std::runtime_error("invalid character "+std::string(1,c)+" in input");
++      if (!isValidChar(c)) abort();
++      // -- GODOT end --
+       str.push_back((char)c);
+     }
+     str.push_back(0);
+diff --git a/thirdparty/embree-aarch64/common/sys/alloc.cpp b/thirdparty/embree-aarch64/common/sys/alloc.cpp
+index 4e8928242e..12f143f131 100644
+--- a/thirdparty/embree-aarch64/common/sys/alloc.cpp
++++ b/thirdparty/embree-aarch64/common/sys/alloc.cpp
+@@ -21,7 +21,10 @@ namespace embree
+     void* ptr = _mm_malloc(size,align);
+ 
+     if (size != 0 && ptr == nullptr)
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort(); 
++      // -- GODOT end --
+     
+     return ptr;
+   }
+@@ -128,7 +131,10 @@ namespace embree
+     /* fall back to 4k pages */
+     int flags = MEM_COMMIT | MEM_RESERVE;
+     char* ptr = (char*) VirtualAlloc(nullptr,bytes,flags,PAGE_READWRITE);
+-    if (ptr == nullptr) throw std::bad_alloc();
++    // -- GODOT start --
++    // if (ptr == nullptr) throw std::bad_alloc();
++    if (ptr == nullptr) abort();
++    // -- GODOT end --
+     hugepages = false;
+     return ptr;
+   }
+@@ -145,7 +151,10 @@ namespace embree
+       return bytesOld;
+ 
+     if (!VirtualFree((char*)ptr+bytesNew,bytesOld-bytesNew,MEM_DECOMMIT))
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort();
++      // -- GODOT end --
+ 
+     return bytesNew;
+   }
+@@ -156,7 +165,10 @@ namespace embree
+       return;
+ 
+     if (!VirtualFree(ptr,0,MEM_RELEASE))
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort();
++      // -- GODOT end --
+   }
+ 
+   void os_advise(void *ptr, size_t bytes)
+@@ -260,7 +272,10 @@ namespace embree
+ 
+     /* fallback to 4k pages */
+     void* ptr = (char*) mmap(0, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+-    if (ptr == MAP_FAILED) throw std::bad_alloc();
++    // -- GODOT start --
++    // if (ptr == MAP_FAILED) throw std::bad_alloc();
++    if (ptr == MAP_FAILED) abort();
++    // -- GODOT end --
+     hugepages = false;
+ 
+     /* advise huge page hint for THP */
+@@ -277,7 +292,10 @@ namespace embree
+       return bytesOld;
+ 
+     if (munmap((char*)ptr+bytesNew,bytesOld-bytesNew) == -1)
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort();
++      // -- GODOT end --
+ 
+     return bytesNew;
+   }
+@@ -291,7 +309,10 @@ namespace embree
+     const size_t pageSize = hugepages ? PAGE_SIZE_2M : PAGE_SIZE_4K;
+     bytes = (bytes+pageSize-1) & ~(pageSize-1);
+     if (munmap(ptr,bytes) == -1)
+-      throw std::bad_alloc();
++      // -- GODOT start --
++      // throw std::bad_alloc();
++      abort();
++      // -- GODOT end --
+   }
+ 
+   /* hint for transparent huge pages (THP) */
+diff --git a/thirdparty/embree-aarch64/common/sys/platform.h b/thirdparty/embree-aarch64/common/sys/platform.h
+index 7914eb7a52..737f14aa6e 100644
+--- a/thirdparty/embree-aarch64/common/sys/platform.h
++++ b/thirdparty/embree-aarch64/common/sys/platform.h
+@@ -174,11 +174,19 @@
+ #define PRINT4(x,y,z,w) embree_cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << embree_endl
+ 
+ #if defined(DEBUG) // only report file and line in debug mode
++  // -- GODOT start --
++  // #define THROW_RUNTIME_ERROR(str)
++  //   throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+   #define THROW_RUNTIME_ERROR(str) \
+-    throw std::runtime_error(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
++    printf(std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str)), abort();
++  // -- GODOT end --
+ #else
++  // -- GODOT start --
++  // #define THROW_RUNTIME_ERROR(str)
++  //   throw std::runtime_error(str);
+   #define THROW_RUNTIME_ERROR(str) \
+-    throw std::runtime_error(str);
++    abort();
++  // -- GODOT end --
+ #endif
+ 
+ #define FATAL(x)   THROW_RUNTIME_ERROR(x)
+diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp
+index 98d7fb9249..ebf656d1a0 100644
+--- a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp
++++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.cpp
+@@ -48,13 +48,15 @@ namespace embree
+     {
+       Task* prevTask = thread.task;
+       thread.task = this;
+-      try {
+-        if (thread.scheduler->cancellingException == nullptr)
++      // -- GODOT start --
++      // try {
++      // if (thread.scheduler->cancellingException == nullptr)
+           closure->execute();
+-      } catch (...) {
+-        if (thread.scheduler->cancellingException == nullptr)
+-          thread.scheduler->cancellingException = std::current_exception();
+-      }
++      // } catch (...) {
++      //   if (thread.scheduler->cancellingException == nullptr)
++      //     thread.scheduler->cancellingException = std::current_exception();
++      // }
++      // -- GODOT end --
+       thread.task = prevTask;
+       add_dependencies(-1);
+     }
+@@ -297,8 +299,11 @@ namespace embree
+     size_t threadIndex = allocThreadIndex();
+     condition.wait(mutex, [&] () { return hasRootTask.load(); });
+     mutex.unlock();
+-    std::exception_ptr except = thread_loop(threadIndex);
+-    if (except != nullptr) std::rethrow_exception(except);
++    // -- GODOT start --
++    // std::exception_ptr except = thread_loop(threadIndex);
++    // if (except != nullptr) std::rethrow_exception(except);
++    thread_loop(threadIndex);
++    // -- GODOT end --
+   }
+ 
+   void TaskScheduler::reset() {
+@@ -330,7 +335,10 @@ namespace embree
+     return thread->scheduler->cancellingException == nullptr;
+   }
+ 
+-  std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
++// -- GODOT start --
++//   std::exception_ptr TaskScheduler::thread_loop(size_t threadIndex)
++  void TaskScheduler::thread_loop(size_t threadIndex)
++// -- GODOT end --
+   {
+     /* allocate thread structure */
+     std::unique_ptr<Thread> mthread(new Thread(threadIndex,this)); // too large for stack allocation
+@@ -353,9 +361,10 @@ namespace embree
+     swapThread(oldThread);
+ 
+     /* remember exception to throw */
+-    std::exception_ptr except = nullptr;
+-    if (cancellingException != nullptr) except = cancellingException;
+-
++    // -- GODOT start --
++    // std::exception_ptr except = nullptr;
++    // if (cancellingException != nullptr) except = cancellingException;
++    // -- GODOT end --
+     /* wait for all threads to terminate */
+     threadCounter--;
+ #if defined(__WIN32__)
+@@ -373,7 +382,10 @@ namespace embree
+           yield();
+ #endif
+ 	}
+-    return except;
++    // -- GODOT start --
++    // return except;
++    return;
++    // -- GODOT end --
+   }
+ 
+   bool TaskScheduler::steal_from_other_threads(Thread& thread)
+diff --git a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h
+index c2a9391aea..8bd70b2b8c 100644
+--- a/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h
++++ b/thirdparty/embree-aarch64/common/tasking/taskschedulerinternal.h
+@@ -123,7 +123,10 @@ namespace embree
+       {
+         size_t ofs = bytes + ((align - stackPtr) & (align-1));
+         if (stackPtr + ofs > CLOSURE_STACK_SIZE)
+-          throw std::runtime_error("closure stack overflow");
++          // -- GODOT start --
++          // throw std::runtime_error("closure stack overflow");
++          abort();
++          // -- GODOT end --
+         stackPtr += ofs;
+         return &stack[stackPtr-bytes];
+       }
+@@ -132,7 +135,10 @@ namespace embree
+       __forceinline void push_right(Thread& thread, const size_t size, const Closure& closure)
+       {
+         if (right >= TASK_STACK_SIZE)
+-          throw std::runtime_error("task stack overflow");
++          // -- GODOT start --
++          // throw std::runtime_error("task stack overflow");
++          abort();
++          // -- GODOT end --
+ 
+ 	/* allocate new task on right side of stack */
+         size_t oldStackPtr = stackPtr;
+@@ -239,7 +245,10 @@ namespace embree
+     void wait_for_threads(size_t threadCount);
+ 
+     /*! thread loop for all worker threads */
+-    std::exception_ptr thread_loop(size_t threadIndex);
++    // -- GODOT start --
++    // std::exception_ptr thread_loop(size_t threadIndex);
++    void thread_loop(size_t threadIndex);
++    // -- GODOT end --
+ 
+     /*! steals a task from a different thread */
+     bool steal_from_other_threads(Thread& thread);
+diff --git a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp
+index 20cdd2d320..aa56035026 100644
+--- a/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp
++++ b/thirdparty/embree-aarch64/kernels/bvh/bvh_statistics.cpp
+@@ -150,7 +150,10 @@ namespace embree
+       }
+     }
+     else {
+-      throw std::runtime_error("not supported node type in bvh_statistics");
++      // -- GODOT start --
++      // throw std::runtime_error("not supported node type in bvh_statistics");
++      abort();
++      // -- GODOT end --
+     }
+     return s;
+   } 
+diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
+index ee5c37b238..625fbf6d4f 100644
+--- a/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
++++ b/thirdparty/embree-aarch64/kernels/common/rtcore.cpp
+@@ -230,7 +230,10 @@ RTC_NAMESPACE_BEGIN;
+     if (quality != RTC_BUILD_QUALITY_LOW &&
+         quality != RTC_BUILD_QUALITY_MEDIUM &&
+         quality != RTC_BUILD_QUALITY_HIGH)
+-      throw std::runtime_error("invalid build quality");
++      // -- GODOT start --
++      // throw std::runtime_error("invalid build quality");
++      abort();
++      // -- GODOT end --
+     scene->setBuildQuality(quality);
+     RTC_CATCH_END2(scene);
+   }
+@@ -1383,7 +1386,10 @@ RTC_NAMESPACE_BEGIN;
+         quality != RTC_BUILD_QUALITY_MEDIUM &&
+         quality != RTC_BUILD_QUALITY_HIGH &&
+         quality != RTC_BUILD_QUALITY_REFIT)
+-      throw std::runtime_error("invalid build quality");
++      // -- GODOT start --
++      // throw std::runtime_error("invalid build quality");
++      abort();
++      // -- GODOT end --
+     geometry->setBuildQuality(quality);
+     RTC_CATCH_END2(geometry);
+   }
+diff --git a/thirdparty/embree-aarch64/kernels/common/rtcore.h b/thirdparty/embree-aarch64/kernels/common/rtcore.h
+index 6583d12d57..4b070e122b 100644
+--- a/thirdparty/embree-aarch64/kernels/common/rtcore.h
++++ b/thirdparty/embree-aarch64/kernels/common/rtcore.h
+@@ -25,52 +25,58 @@ namespace embree
+ #endif
+ 
+ /*! Macros used in the rtcore API implementation */
+-#define RTC_CATCH_BEGIN try {
++// -- GODOT start --
++// #define RTC_CATCH_BEGIN try {
++#define RTC_CATCH_BEGIN
+   
+-#define RTC_CATCH_END(device)                                                \
+-  } catch (std::bad_alloc&) {                                                   \
+-    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+-  } catch (rtcore_error& e) {                                                   \
+-    Device::process_error(device,e.error,e.what());                             \
+-  } catch (std::exception& e) {                                                 \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+-  } catch (...) {                                                               \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+-  }
++// #define RTC_CATCH_END(device)                                                \
++//   } catch (std::bad_alloc&) {                                                   \
++//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
++//   } catch (rtcore_error& e) {                                                   \
++//     Device::process_error(device,e.error,e.what());                             \
++//   } catch (std::exception& e) {                                                 \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
++//   } catch (...) {                                                               \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
++//   }
++#define RTC_CATCH_END(device)
+   
+-#define RTC_CATCH_END2(scene)                                                \
+-  } catch (std::bad_alloc&) {                                                   \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+-  } catch (rtcore_error& e) {                                                   \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,e.error,e.what());                             \
+-  } catch (std::exception& e) {                                                 \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+-  } catch (...) {                                                               \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+-  }
++// #define RTC_CATCH_END2(scene)                                                \
++//   } catch (std::bad_alloc&) {                                                   \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
++//   } catch (rtcore_error& e) {                                                   \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,e.error,e.what());                             \
++//   } catch (std::exception& e) {                                                 \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
++//   } catch (...) {                                                               \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
++//   }
++#define RTC_CATCH_END2(scene)
+ 
+-#define RTC_CATCH_END2_FALSE(scene)                                             \
+-  } catch (std::bad_alloc&) {                                                   \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
+-    return false;                                                               \
+-  } catch (rtcore_error& e) {                                                   \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,e.error,e.what());                             \
+-    return false;                                                               \
+-  } catch (std::exception& e) {                                                 \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
+-    return false;                                                               \
+-  } catch (...) {                                                               \
+-    Device* device = scene ? scene->device : nullptr;                           \
+-    Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
+-    return false;                                                               \
+-  }
++// #define RTC_CATCH_END2_FALSE(scene)                                             \
++//   } catch (std::bad_alloc&) {                                                   \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_OUT_OF_MEMORY,"out of memory");      \
++//     return false;                                                               \
++//   } catch (rtcore_error& e) {                                                   \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,e.error,e.what());                             \
++//     return false;                                                               \
++//   } catch (std::exception& e) {                                                 \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,e.what());                   \
++//     return false;                                                               \
++//   } catch (...) {                                                               \
++//     Device* device = scene ? scene->device : nullptr;                           \
++//     Device::process_error(device,RTC_ERROR_UNKNOWN,"unknown exception caught"); \
++//     return false;                                                               \
++//   }
++#define RTC_CATCH_END2_FALSE(scene) return false;
++// -- GODOT end --
+ 
+ #define RTC_VERIFY_HANDLE(handle)                               \
+   if (handle == nullptr) {                                         \
+@@ -97,28 +103,38 @@ namespace embree
+ #define RTC_TRACE(x) 
+ #endif
+ 
+-  /*! used to throw embree API errors */
+-  struct rtcore_error : public std::exception
+-  {
+-    __forceinline rtcore_error(RTCError error, const std::string& str)
+-      : error(error), str(str) {}
+-    
+-    ~rtcore_error() throw() {}
+-    
+-    const char* what () const throw () {
+-      return str.c_str();
+-    }
+-    
+-    RTCError error;
+-    std::string str;
+-  };
++// -- GODOT begin --
++//   /*! used to throw embree API errors */
++//   struct rtcore_error : public std::exception
++//   {
++//     __forceinline rtcore_error(RTCError error, const std::string& str)
++//       : error(error), str(str) {}
++//     
++//     ~rtcore_error() throw() {}
++//     
++//     const char* what () const throw () {
++//       return str.c_str();
++//     }
++//     
++//     RTCError error;
++//     std::string str;
++//   };
++// -- GODOT end --
+ 
+ #if defined(DEBUG) // only report file and line in debug mode
++  // -- GODOT begin --
++  // #define throw_RTCError(error,str) \
++  //   throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
+   #define throw_RTCError(error,str) \
+-    throw rtcore_error(error,std::string(__FILE__) + " (" + toString(__LINE__) + "): " + std::string(str));
++    printf("%s (%d): %s", __FILE__, __LINE__, std::string(str).c_str()), abort();
++  // -- GODOT end --
+ #else
++  // -- GODOT begin --
++  // #define throw_RTCError(error,str) \
++  //   throw rtcore_error(error,str);
+   #define throw_RTCError(error,str) \
+-    throw rtcore_error(error,str);
++    abort();
++  // -- GODOT end --
+ #endif
+ 
+ #define RTC_BUILD_ARGUMENTS_HAS(settings,member) \
+diff --git a/thirdparty/embree-aarch64/kernels/common/scene.cpp b/thirdparty/embree-aarch64/kernels/common/scene.cpp
+index e75aa968f9..1e23aeb415 100644
+--- a/thirdparty/embree-aarch64/kernels/common/scene.cpp
++++ b/thirdparty/embree-aarch64/kernels/common/scene.cpp
+@@ -800,16 +800,18 @@ namespace embree
+     }
+ 
+     /* initiate build */
+-    try {
++    // -- GODOT start --
++    // try {
+       scheduler->spawn_root([&]() { commit_task(); Lock<MutexSys> lock(schedulerMutex); this->scheduler = nullptr; }, 1, !join);
+-    }
+-    catch (...) {
+-      accels_clear();
+-      updateInterface();
+-      Lock<MutexSys> lock(schedulerMutex);
+-      this->scheduler = nullptr;
+-      throw;
+-    }
++    // }
++    // catch (...) {
++    //   accels_clear();
++    //   updateInterface();
++    //   Lock<MutexSys> lock(schedulerMutex);
++    //   this->scheduler = nullptr;
++    //   throw;
++    // }
++    // -- GODOT end --
+   }
+ 
+ #endif
diff --git a/thirdparty/enet/godot.cpp b/thirdparty/enet/godot.cpp
index 73fa3c62a2..189de6cc1f 100644
--- a/thirdparty/enet/godot.cpp
+++ b/thirdparty/enet/godot.cpp
@@ -46,6 +46,7 @@
 class ENetGodotSocket {
 public:
 	virtual Error bind(IP_Address p_ip, uint16_t p_port) = 0;
+	virtual Error get_socket_address(IP_Address *r_ip, uint16_t *r_port) = 0;
 	virtual Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IP_Address p_ip, uint16_t p_port) = 0;
 	virtual Error recvfrom(uint8_t *p_buffer, int p_len, int &r_read, IP_Address &r_ip, uint16_t &r_port) = 0;
 	virtual int set_option(ENetSocketOption p_option, int p_value) = 0;
@@ -64,8 +65,7 @@ class ENetUDP : public ENetGodotSocket {
 
 private:
 	Ref<NetSocket> sock;
-	IP_Address address;
-	uint16_t port = 0;
+	IP_Address local_address;
 	bool bound = false;
 
 public:
@@ -80,10 +80,13 @@ public:
 	}
 
 	Error bind(IP_Address p_ip, uint16_t p_port) {
-		address = p_ip;
-		port = p_port;
+		local_address = p_ip;
 		bound = true;
-		return sock->bind(address, port);
+		return sock->bind(p_ip, p_port);
+	}
+
+	Error get_socket_address(IP_Address *r_ip, uint16_t *r_port) {
+		return sock->get_socket_address(r_ip, r_port);
 	}
 
 	Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IP_Address p_ip, uint16_t p_port) {
@@ -142,6 +145,7 @@ public:
 
 	void close() {
 		sock->close();
+		local_address.clear();
 	}
 };
 
@@ -153,6 +157,7 @@ class ENetDTLSClient : public ENetGodotSocket {
 	bool verify = false;
 	String for_hostname;
 	Ref<X509Certificate> cert;
+	IP_Address local_address;
 
 public:
 	ENetDTLSClient(ENetUDP *p_base, Ref<X509Certificate> p_cert, bool p_verify, String p_for_hostname) {
@@ -161,9 +166,11 @@ public:
 		cert = p_cert;
 		udp.instance();
 		dtls = Ref<PacketPeerDTLS>(PacketPeerDTLS::create());
-		p_base->close();
 		if (p_base->bound) {
-			bind(p_base->address, p_base->port);
+			uint16_t port = 0; // Defined value for bind() even if the address query below fails.
+			p_base->get_socket_address(&local_address, &port);
+			p_base->close();
+			bind(local_address, port);
 		}
 	}
 
@@ -172,7 +179,17 @@ public:
 	}
 
 	Error bind(IP_Address p_ip, uint16_t p_port) {
-		return udp->listen(p_port, p_ip);
+		local_address = p_ip;
+		return udp->bind(p_port, p_ip);
+	}
+
+	Error get_socket_address(IP_Address *r_ip, uint16_t *r_port) {
+		if (udp->is_bound()) {
+			*r_ip = local_address; // Address recorded by bind().
+			*r_port = udp->get_local_port();
+			return OK;
+		}
+		return ERR_UNCONFIGURED; // Nothing to report while the UDP peer is unbound.
 	}
 
 	Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IP_Address p_ip, uint16_t p_port) {
@@ -211,7 +228,7 @@ public:
 		ERR_FAIL_COND_V(err != OK, err);
 		ERR_FAIL_COND_V(p_len < r_read, ERR_OUT_OF_MEMORY);
 
-		copymem(p_buffer, buffer, r_read);
+		memcpy(p_buffer, buffer, r_read);
 		r_ip = udp->get_packet_address();
 		r_port = udp->get_packet_port();
 		return err;
@@ -233,13 +250,16 @@ class ENetDTLSServer : public ENetGodotSocket {
 	Ref<UDPServer> udp_server;
 	Map<String, Ref<PacketPeerDTLS>> peers;
 	int last_service = 0;
+	IP_Address local_address;
 
 public:
 	ENetDTLSServer(ENetUDP *p_base, Ref<CryptoKey> p_key, Ref<X509Certificate> p_cert) {
 		udp_server.instance();
-		p_base->close();
 		if (p_base->bound) {
-			bind(p_base->address, p_base->port);
+			uint16_t port = 0; // Defined value for bind() even if the address query below fails.
+			p_base->get_socket_address(&local_address, &port);
+			p_base->close();
+			bind(local_address, port);
 		}
 		server = Ref<DTLSServer>(DTLSServer::create());
 		server->setup(p_key, p_cert);
@@ -254,9 +274,19 @@ public:
 	}
 
 	Error bind(IP_Address p_ip, uint16_t p_port) {
+		local_address = p_ip;
 		return udp_server->listen(p_port, p_ip);
 	}
 
+	Error get_socket_address(IP_Address *r_ip, uint16_t *r_port) {
+		if (udp_server->is_listening()) {
+			*r_ip = local_address; // Address recorded by bind().
+			*r_port = udp_server->get_local_port();
+			return OK;
+		}
+		return ERR_UNCONFIGURED; // Nothing to report while the server is not listening.
+	}
+
 	Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IP_Address p_ip, uint16_t p_port) {
 		String key = String(p_ip) + ":" + itos(p_port);
 		ERR_FAIL_COND_V(!peers.has(key), ERR_UNAVAILABLE);
@@ -315,7 +345,7 @@ public:
 				Vector<String> s = E->key().rsplit(":", false, 1);
 				ERR_CONTINUE(s.size() != 2); // BUG!
 
-				copymem(p_buffer, buffer, r_read);
+				memcpy(p_buffer, buffer, r_read);
 				r_ip = s[0];
 				r_port = s[1].to_int();
 				break; // err = OK
@@ -341,6 +371,7 @@ public:
 		peers.clear();
 		udp_server->stop();
 		server->stop();
+		local_address.clear();
 	}
 };
 
@@ -493,15 +524,26 @@ int enet_socket_receive(ENetSocket socket, ENetAddress *address, ENetBuffer *buf
 	return read;
 }
 
+// Queries the socket via get_socket_address() and stores the result in `address`; returns 0 on success, -1 on error.
+int enet_socket_get_address(ENetSocket socket, ENetAddress *address) {
+	ENetGodotSocket *godot_sock = (ENetGodotSocket *)socket;
+	IP_Address local_ip;
+	uint16_t local_port;
+
+	const Error err = godot_sock->get_socket_address(&local_ip, &local_port);
+	if (err != OK) {
+		return -1;
+	}
+	enet_address_set_ip(address, local_ip.get_ipv6(), 16);
+	address->port = local_port;
+	return 0;
+}
+
 // Not implemented
 int enet_socket_wait(ENetSocket socket, enet_uint32 *condition, enet_uint32 timeout) {
 	return 0; // do we need this function?
 }
 
-int enet_socket_get_address(ENetSocket socket, ENetAddress *address) {
-	return -1; // do we need this function?
-}
-
 int enet_socketset_select(ENetSocket maxSocket, ENetSocketSet *readSet, ENetSocketSet *writeSet, enet_uint32 timeout) {
 	return -1;
 }
diff --git a/thirdparty/etc2comp/AUTHORS b/thirdparty/etc2comp/AUTHORS
deleted file mode 100644
index e78a7f4d21..0000000000
--- a/thirdparty/etc2comp/AUTHORS
+++ /dev/null
@@ -1,7 +0,0 @@
-# This is the list of Etc2Comp authors for copyright purposes.
-#
-# This does not necessarily list everyone who has contributed code, since in
-# some cases, their employer may be the copyright holder.  To see the full list
-# of contributors, see the revision history in source control.
-Google Inc.
-Blue Shift Inc.
diff --git a/thirdparty/etc2comp/Etc.cpp b/thirdparty/etc2comp/Etc.cpp
deleted file mode 100644
index a5ee706048..0000000000
--- a/thirdparty/etc2comp/Etc.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "EtcConfig.h"
-#include "Etc.h"
-#include "EtcFilter.h"
-
-#include <string.h>
-
-namespace Etc
-{
-	// ----------------------------------------------------------------------------------------------------
-	// C-style inteface to the encoder
-	//
-	void Encode(float *a_pafSourceRGBA,
-				unsigned int a_uiSourceWidth, 
-				unsigned int a_uiSourceHeight,
-				Image::Format a_format,
-				ErrorMetric a_eErrMetric,
-				float a_fEffort,
-				unsigned int a_uiJobs,
-				unsigned int a_uiMaxJobs,
-				unsigned char **a_ppaucEncodingBits,
-				unsigned int *a_puiEncodingBitsBytes,
-				unsigned int *a_puiExtendedWidth,
-				unsigned int *a_puiExtendedHeight, 
-				int *a_piEncodingTime_ms, bool a_bVerboseOutput)
-	{
-
-		Image image(a_pafSourceRGBA, a_uiSourceWidth,
-					a_uiSourceHeight,
-					a_eErrMetric);
-		image.m_bVerboseOutput = a_bVerboseOutput;
-		image.Encode(a_format, a_eErrMetric, a_fEffort, a_uiJobs, a_uiMaxJobs);
-
-		*a_ppaucEncodingBits = image.GetEncodingBits();
-		*a_puiEncodingBitsBytes = image.GetEncodingBitsBytes();
-		*a_puiExtendedWidth = image.GetExtendedWidth();
-		*a_puiExtendedHeight = image.GetExtendedHeight();
-		*a_piEncodingTime_ms = image.GetEncodingTimeMs();
-	}
-
-	void EncodeMipmaps(float *a_pafSourceRGBA,
-		unsigned int a_uiSourceWidth,
-		unsigned int a_uiSourceHeight,
-		Image::Format a_format,
-		ErrorMetric a_eErrMetric,
-		float a_fEffort,
-		unsigned int a_uiJobs,
-		unsigned int a_uiMaxJobs,
-		unsigned int a_uiMaxMipmaps,
-		unsigned int a_uiMipFilterFlags,
-		RawImage* a_pMipmapImages,
-		int *a_piEncodingTime_ms, 
-		bool a_bVerboseOutput)
-	{
-		auto mipWidth = a_uiSourceWidth;
-		auto mipHeight = a_uiSourceHeight;
-		int totalEncodingTime = 0;
-		for(unsigned int mip = 0; mip < a_uiMaxMipmaps && mipWidth >= 1 && mipHeight >= 1; mip++)
-		{
-			float* pImageData = nullptr;
-			float* pMipImage = nullptr;
-
-			if(mip == 0)
-			{
-				pImageData = a_pafSourceRGBA;
-			}
-			else
-			{
-				pMipImage = new float[mipWidth*mipHeight*4];
-				if(FilterTwoPass(a_pafSourceRGBA, a_uiSourceWidth, a_uiSourceHeight, pMipImage, mipWidth, mipHeight, a_uiMipFilterFlags, Etc::FilterLanczos3) )
-				{
-					pImageData = pMipImage;
-				}
-			}
-
-			if ( pImageData )
-			{
-			
-				Image image(pImageData, mipWidth, mipHeight,	a_eErrMetric);
-
-			image.m_bVerboseOutput = a_bVerboseOutput;
-			image.Encode(a_format, a_eErrMetric, a_fEffort, a_uiJobs, a_uiMaxJobs);
-
-			a_pMipmapImages[mip].paucEncodingBits = std::shared_ptr<unsigned char>(image.GetEncodingBits(), [](unsigned char *p) { delete[] p; });
-			a_pMipmapImages[mip].uiEncodingBitsBytes = image.GetEncodingBitsBytes();
-			a_pMipmapImages[mip].uiExtendedWidth = image.GetExtendedWidth();
-			a_pMipmapImages[mip].uiExtendedHeight = image.GetExtendedHeight();
-
-			totalEncodingTime += image.GetEncodingTimeMs();
-			}
-
-			if(pMipImage)
-			{
-				delete[] pMipImage;
-			}
-
-			if (!pImageData)
-			{
-				break;
-			}
-
-			mipWidth >>= 1;
-			mipHeight >>= 1;
-		}
-
-		*a_piEncodingTime_ms = totalEncodingTime;
-	}
-
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-
-}
diff --git a/thirdparty/etc2comp/Etc.h b/thirdparty/etc2comp/Etc.h
deleted file mode 100644
index 439388d649..0000000000
--- a/thirdparty/etc2comp/Etc.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcConfig.h"
-#include "EtcImage.h"
-#include "EtcColor.h"
-#include "EtcErrorMetric.h"
-#include <memory>
-
-#define ETCCOMP_MIN_EFFORT_LEVEL (0.0f)
-#define ETCCOMP_DEFAULT_EFFORT_LEVEL (40.0f)
-#define ETCCOMP_MAX_EFFORT_LEVEL (100.0f)
-
-namespace Etc
-{
-	class Block4x4EncodingBits;
-
-	struct RawImage
-	{
-		int uiExtendedWidth;
-		int uiExtendedHeight;
-		unsigned int uiEncodingBitsBytes;
-		std::shared_ptr<unsigned char> paucEncodingBits;
-	};
-
-
-
-	// C-style inteface to the encoder
-	void Encode(float *a_pafSourceRGBA,
-				unsigned int a_uiSourceWidth,
-				unsigned int a_uiSourceHeight,
-				Image::Format a_format,
-				ErrorMetric a_eErrMetric,
-				float a_fEffort,
-				unsigned int a_uiJobs,
-				unsigned int a_uimaxJobs,
-				unsigned char **a_ppaucEncodingBits,
-				unsigned int *a_puiEncodingBitsBytes,
-				unsigned int *a_puiExtendedWidth,
-				unsigned int *a_puiExtendedHeight,
-				int *a_piEncodingTime_ms, bool a_bVerboseOutput = false);
-
-	void EncodeMipmaps(float *a_pafSourceRGBA,
-		unsigned int a_uiSourceWidth,
-		unsigned int a_uiSourceHeight,
-		Image::Format a_format,
-		ErrorMetric a_eErrMetric,
-		float a_fEffort,
-		unsigned int a_uiJobs,
-		unsigned int a_uiMaxJobs,
-		unsigned int a_uiMaxMipmaps,
-		unsigned int a_uiMipFilterFlags,
-		RawImage* a_pMipmaps,
-		int *a_piEncodingTime_ms, bool a_bVerboseOutput = false);
-
-}
diff --git a/thirdparty/etc2comp/EtcBlock4x4.cpp b/thirdparty/etc2comp/EtcBlock4x4.cpp
deleted file mode 100644
index 3082fe60db..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4.cpp
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* 
-EtcBlock4x4.cpp
-
-Implements the state associated with each 4x4 block of pixels in an image
-
-Source images that are not a multiple of 4x4 are extended to fill the Block4x4 using pixels with an 
-alpha of NAN
-
-*/
-
-#include "EtcConfig.h"
-#include "EtcBlock4x4.h"
-
-#include "EtcBlock4x4EncodingBits.h"
-#include "EtcColor.h"
-#include "EtcImage.h"
-#include "EtcColorFloatRGBA.h"
-#include "EtcBlock4x4Encoding_RGB8.h"
-#include "EtcBlock4x4Encoding_RGBA8.h"
-#include "EtcBlock4x4Encoding_RGB8A1.h"
-#include "EtcBlock4x4Encoding_R11.h"
-#include "EtcBlock4x4Encoding_RG11.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-
-namespace Etc
-{
-	// ETC pixels are scanned vertically.  
-	// this mapping is for when someone wants to scan the ETC pixels horizontally
-	const unsigned int Block4x4::s_auiPixelOrderHScan[PIXELS] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	Block4x4::Block4x4(void)
-	{
-		m_pimageSource = nullptr;
-		m_uiSourceH = 0;
-		m_uiSourceV = 0;
-
-		m_sourcealphamix = SourceAlphaMix::UNKNOWN;
-		m_boolBorderPixels = false;
-		m_boolPunchThroughPixels = false;
-
-		m_pencoding = nullptr;
-
-		m_errormetric = ErrorMetric::NUMERIC;
-
-	}
-	Block4x4::~Block4x4()
-	{
-		m_pimageSource = nullptr;
-		if (m_pencoding)
-		{
-			delete m_pencoding;
-			m_pencoding = nullptr;
-		}
-	}
-	// ----------------------------------------------------------------------------------------------------
-	// initialization prior to encoding from a source image
-	// [a_uiSourceH,a_uiSourceV] is the location of the block in a_pimageSource
-	// a_paucEncodingBits is the place to store the final encoding
-	// a_errormetric is used for finding the best encoding
-	//
-	void Block4x4::InitFromSource(Image *a_pimageSource, 
-									unsigned int a_uiSourceH, unsigned int a_uiSourceV,
-									unsigned char *a_paucEncodingBits,
-									ErrorMetric a_errormetric)
-	{
-
-		Block4x4();
-
-		m_pimageSource = a_pimageSource;
-		m_uiSourceH = a_uiSourceH;
-		m_uiSourceV = a_uiSourceV;
-		m_errormetric = a_errormetric;
-
-		SetSourcePixels();
-
-		// set block encoder function
-		switch (m_pimageSource->GetFormat())
-		{
-		case Image::Format::ETC1:
-			m_pencoding = new Block4x4Encoding_ETC1;
-			break;
-
-		case Image::Format::RGB8:
-		case Image::Format::SRGB8:
-			m_pencoding = new Block4x4Encoding_RGB8;
-			break;
-
-		case Image::Format::RGBA8:
-		case Image::Format::SRGBA8:
-			if (a_errormetric == RGBX)
-			{
-				m_pencoding = new Block4x4Encoding_RGBA8;
-			}
-			else
-			{
-				switch (m_sourcealphamix)
-				{
-				case SourceAlphaMix::OPAQUE:
-					m_pencoding = new Block4x4Encoding_RGBA8_Opaque;
-					break;
-
-				case SourceAlphaMix::TRANSPARENT:
-					m_pencoding = new Block4x4Encoding_RGBA8_Transparent;
-					break;
-
-				case SourceAlphaMix::TRANSLUCENT:
-					m_pencoding = new Block4x4Encoding_RGBA8;
-					break;
-
-				default:
-					assert(0);
-					break;
-				}
-				break;
-			}
-			break;
-
-		case Image::Format::RGB8A1:
-		case Image::Format::SRGB8A1:
-			switch (m_sourcealphamix)
-			{
-			case SourceAlphaMix::OPAQUE:
-				m_pencoding = new Block4x4Encoding_RGB8A1_Opaque;
-				break;
-
-			case SourceAlphaMix::TRANSPARENT:
-				m_pencoding = new Block4x4Encoding_RGB8A1_Transparent;
-				break;
-
-			case SourceAlphaMix::TRANSLUCENT:
-				if (m_boolPunchThroughPixels)
-				{
-					m_pencoding = new Block4x4Encoding_RGB8A1;
-				}
-				else
-				{
-					m_pencoding = new Block4x4Encoding_RGB8A1_Opaque;
-				}
-				break;
-
-			default:
-				assert(0);
-				break;
-			}
-			break;
-
-		case Image::Format::R11:
-		case Image::Format::SIGNED_R11:
-			m_pencoding = new Block4x4Encoding_R11;
-			break;
-		case Image::Format::RG11:
-		case Image::Format::SIGNED_RG11:
-			m_pencoding = new Block4x4Encoding_RG11;
-			break;
-		default:
-			assert(0);
-			break;
-		}
-
-		m_pencoding->InitFromSource(this, m_afrgbaSource,
-									a_paucEncodingBits, a_errormetric);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialization of encoding state from a prior encoding using encoding bits
-	// [a_uiSourceH,a_uiSourceV] is the location of the block in a_pimageSource
-	// a_paucEncodingBits is the place to read the prior encoding
-	// a_imageformat is used to determine how to interpret a_paucEncodingBits
-	// a_errormetric was used for the prior encoding
-	//
-	void Block4x4::InitFromEtcEncodingBits(Image::Format a_imageformat,
-											unsigned int a_uiSourceH, unsigned int a_uiSourceV,
-											unsigned char *a_paucEncodingBits,
-											Image *a_pimageSource,
-											ErrorMetric a_errormetric)
-	{
-		Block4x4();
-
-		m_pimageSource = a_pimageSource;
-		m_uiSourceH = a_uiSourceH;
-		m_uiSourceV = a_uiSourceV;
-		m_errormetric = a_errormetric;
-
-		SetSourcePixels();
-
-		// set block encoder function
-		switch (a_imageformat)
-		{
-		case Image::Format::ETC1:
-			m_pencoding = new Block4x4Encoding_ETC1;
-			break;
-
-		case Image::Format::RGB8:
-		case Image::Format::SRGB8:
-			m_pencoding = new Block4x4Encoding_RGB8;
-			break;
-
-		case Image::Format::RGBA8:
-		case Image::Format::SRGBA8:
-			m_pencoding = new Block4x4Encoding_RGBA8;
-			break;
-
-		case Image::Format::RGB8A1:
-		case Image::Format::SRGB8A1:
-			m_pencoding = new Block4x4Encoding_RGB8A1;
-			break;
-
-		case Image::Format::R11:
-		case Image::Format::SIGNED_R11:
-			m_pencoding = new Block4x4Encoding_R11;
-			break;
-		case Image::Format::RG11:
-		case Image::Format::SIGNED_RG11:
-			m_pencoding = new Block4x4Encoding_RG11;
-			break;
-		default:
-			assert(0);
-			break;
-		}
-
-		m_pencoding->InitFromEncodingBits(this, a_paucEncodingBits, m_afrgbaSource,
-										m_pimageSource->GetErrorMetric());
-
-	}
-	
-	// ----------------------------------------------------------------------------------------------------
-	// set source pixels from m_pimageSource
-	// set m_alphamix
-	//
-	void Block4x4::SetSourcePixels(void)
-	{
-
-		Image::Format imageformat = m_pimageSource->GetFormat();
-
-		// alpha census
-		unsigned int uiTransparentSourcePixels = 0;
-		unsigned int uiOpaqueSourcePixels = 0;
-
-		// copy source to consecutive memory locations
-		// convert from image horizontal scan to block vertical scan
-		unsigned int uiPixel = 0;
-		for (unsigned int uiBlockPixelH = 0; uiBlockPixelH < Block4x4::COLUMNS; uiBlockPixelH++)
-		{
-			unsigned int uiSourcePixelH = m_uiSourceH + uiBlockPixelH;
-
-			for (unsigned int uiBlockPixelV = 0; uiBlockPixelV < Block4x4::ROWS; uiBlockPixelV++)
-			{
-				unsigned int uiSourcePixelV = m_uiSourceV + uiBlockPixelV;
-
-				ColorFloatRGBA *pfrgbaSource = m_pimageSource->GetSourcePixel(uiSourcePixelH, uiSourcePixelV);
-
-				// if pixel extends beyond source image because of block padding
-				if (pfrgbaSource == nullptr)
-				{
-					m_afrgbaSource[uiPixel] = ColorFloatRGBA(0.0f, 0.0f, 0.0f, NAN);	// denotes border pixel
-					m_boolBorderPixels = true;
-					uiTransparentSourcePixels++;
-				}
-				else
-				{
-					//get teh current pixel data, and store some of the attributes
-					//before capping values to fit the encoder type
-					
-					m_afrgbaSource[uiPixel] = (*pfrgbaSource).ClampRGBA();
-
-					if (m_afrgbaSource[uiPixel].fA == 1.0f || m_errormetric == RGBX)
-					{
-						m_pimageSource->m_iNumOpaquePixels++;
-					}
-					else if (m_afrgbaSource[uiPixel].fA == 0.0f)
-					{
-						m_pimageSource->m_iNumTransparentPixels++;
-					}
-					else if(m_afrgbaSource[uiPixel].fA > 0.0f && m_afrgbaSource[uiPixel].fA < 1.0f)
-					{
-						m_pimageSource->m_iNumTranslucentPixels++;
-					}
-					else
-					{
-						m_pimageSource->m_numOutOfRangeValues.fA++;
-					}
-
-					if (m_afrgbaSource[uiPixel].fR != 0.0f)
-					{
-						m_pimageSource->m_numColorValues.fR++;
-						//make sure we are getting a float between 0-1
-						if (m_afrgbaSource[uiPixel].fR - 1.0f > 0.0f)
-						{
-							m_pimageSource->m_numOutOfRangeValues.fR++;
-						}
-					}
-
-					if (m_afrgbaSource[uiPixel].fG != 0.0f)
-					{
-						m_pimageSource->m_numColorValues.fG++;
-						if (m_afrgbaSource[uiPixel].fG - 1.0f > 0.0f)
-						{
-							m_pimageSource->m_numOutOfRangeValues.fG++;
-						}
-					}
-					if (m_afrgbaSource[uiPixel].fB != 0.0f)
-					{
-						m_pimageSource->m_numColorValues.fB++;
-						if (m_afrgbaSource[uiPixel].fB - 1.0f > 0.0f)
-						{
-							m_pimageSource->m_numOutOfRangeValues.fB++;
-						}
-					}
-					// for formats with no alpha, set source alpha to 1
-					if (imageformat == Image::Format::ETC1 ||
-						imageformat == Image::Format::RGB8 ||
-						imageformat == Image::Format::SRGB8)
-					{
-						m_afrgbaSource[uiPixel].fA = 1.0f;
-					}
-
-					if (imageformat == Image::Format::R11 ||
-						imageformat == Image::Format::SIGNED_R11)
-					{
-						m_afrgbaSource[uiPixel].fA = 1.0f;
-						m_afrgbaSource[uiPixel].fG = 0.0f;
-						m_afrgbaSource[uiPixel].fB = 0.0f;
-					}
-
-					if (imageformat == Image::Format::RG11 ||
-						imageformat == Image::Format::SIGNED_RG11)
-					{
-						m_afrgbaSource[uiPixel].fA = 1.0f;
-						m_afrgbaSource[uiPixel].fB = 0.0f;
-					}
-
-				
-					// for RGB8A1, set source alpha to 0.0 or 1.0
-					// set punch through flag
-					if (imageformat == Image::Format::RGB8A1 ||
-						imageformat == Image::Format::SRGB8A1)
-					{
-						if (m_afrgbaSource[uiPixel].fA >= 0.5f)
-						{
-							m_afrgbaSource[uiPixel].fA = 1.0f;
-						}
-						else
-						{
-							m_afrgbaSource[uiPixel].fA = 0.0f;
-							m_boolPunchThroughPixels = true;
-						}
-					}
-
-					if (m_afrgbaSource[uiPixel].fA == 1.0f || m_errormetric == RGBX)
-					{
-						uiOpaqueSourcePixels++;
-					}
-					else if (m_afrgbaSource[uiPixel].fA == 0.0f)
-					{
-						uiTransparentSourcePixels++;
-					}
-
-				}
-
-				uiPixel += 1;
-			}
-		}
-
-		if (uiOpaqueSourcePixels == PIXELS)
-		{
-			m_sourcealphamix = SourceAlphaMix::OPAQUE;
-		}
-		else if (uiTransparentSourcePixels == PIXELS)
-		{
-			m_sourcealphamix = SourceAlphaMix::TRANSPARENT;
-		}
-		else
-		{
-			m_sourcealphamix = SourceAlphaMix::TRANSLUCENT;
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// return a name for the encoding mode
-	//
-	const char * Block4x4::GetEncodingModeName(void)
-	{
-
-		switch (m_pencoding->GetMode())
-		{
-		case Block4x4Encoding::MODE_ETC1:
-			return "ETC1";
-		case Block4x4Encoding::MODE_T:
-			return "T";
-		case Block4x4Encoding::MODE_H:
-			return "H";
-		case Block4x4Encoding::MODE_PLANAR:
-			return "PLANAR";
-		default:
-			return "???";
-		}
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-
-}
diff --git a/thirdparty/etc2comp/EtcBlock4x4.h b/thirdparty/etc2comp/EtcBlock4x4.h
deleted file mode 100644
index 0fd30c598d..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4.h
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcColor.h"
-#include "EtcColorFloatRGBA.h"
-#include "EtcErrorMetric.h"
-#include "EtcImage.h"
-#include "EtcBlock4x4Encoding.h"
-
-namespace Etc
-{
-	class Block4x4EncodingBits;
-
-	class Block4x4
-	{
-	public:
-
-		static const unsigned int ROWS = 4;
-		static const unsigned int COLUMNS = 4;
-		static const unsigned int PIXELS = ROWS * COLUMNS;
-
-		// the alpha mix for a 4x4 block of pixels
-		enum class SourceAlphaMix
-		{
-			UNKNOWN,
-			//
-			OPAQUE,			// all 1.0
-			TRANSPARENT,	// all 0.0 or NAN
-			TRANSLUCENT		// not all opaque or transparent
-		};
-
-		typedef void (Block4x4::*EncoderFunctionPtr)(void);
-
-		Block4x4(void);
-		~Block4x4();
-		void InitFromSource(Image *a_pimageSource,
-							unsigned int a_uiSourceH,
-							unsigned int a_uiSourceV,
-							unsigned char *a_paucEncodingBits,
-							ErrorMetric a_errormetric);
-
-		void InitFromEtcEncodingBits(Image::Format a_imageformat,
-										unsigned int a_uiSourceH,
-										unsigned int a_uiSourceV,
-										unsigned char *a_paucEncodingBits,
-										Image *a_pimageSource,
-										ErrorMetric a_errormetric);
-
-		// return true if final iteration was performed
-		inline void PerformEncodingIteration(float a_fEffort)
-		{
-			m_pencoding->PerformIteration(a_fEffort);
-		}
-
-		inline void SetEncodingBitsFromEncoding(void)
-		{
-			m_pencoding->SetEncodingBits();
-		}
-
-		inline unsigned int GetSourceH(void)
-		{
-			return m_uiSourceH;
-		}
-
-		inline unsigned int GetSourceV(void)
-		{
-			return m_uiSourceV;
-		}
-
-		inline float GetError(void)
-		{
-			return m_pencoding->GetError();
-		}
-
-		static const unsigned int s_auiPixelOrderHScan[PIXELS];
-
-		inline ColorFloatRGBA * GetDecodedColors(void)
-		{
-			return m_pencoding->GetDecodedColors();
-		}
-
-		inline float * GetDecodedAlphas(void)
-		{
-			return m_pencoding->GetDecodedAlphas();
-		}
-
-		inline Block4x4Encoding::Mode GetEncodingMode(void)
-		{
-			return m_pencoding->GetMode();
-		}
-
-		inline bool GetFlip(void)
-		{
-			return m_pencoding->GetFlip();
-		}
-
-		inline bool IsDifferential(void)
-		{
-			return m_pencoding->IsDifferential();
-		}
-
-		inline ColorFloatRGBA * GetSource()
-		{
-			return m_afrgbaSource;
-		}
-
-		inline ErrorMetric GetErrorMetric()
-		{
-			return m_errormetric;
-		}
-
-		const char * GetEncodingModeName(void);
-
-		inline Block4x4Encoding * GetEncoding(void)
-		{
-			return m_pencoding;
-		}
-
-		inline SourceAlphaMix GetSourceAlphaMix(void)
-		{
-			return m_sourcealphamix;
-		}
-
-		inline Image * GetImageSource(void)
-		{
-			return m_pimageSource;
-		}
-
-		inline bool HasBorderPixels(void)
-		{
-			return m_boolBorderPixels;
-		}
-
-		inline bool HasPunchThroughPixels(void)
-		{
-			return m_boolPunchThroughPixels;
-		}
-
-	private:
-
-		void SetSourcePixels(void);
-
-		Image				*m_pimageSource;
-		unsigned int		m_uiSourceH;
-		unsigned int		m_uiSourceV;
-		ErrorMetric			m_errormetric;
-		ColorFloatRGBA		m_afrgbaSource[PIXELS];		// vertical scan
-
-		SourceAlphaMix		m_sourcealphamix;
-		bool				m_boolBorderPixels;			// marked as rgba(NAN, NAN, NAN, NAN)
-		bool				m_boolPunchThroughPixels;	// RGB8A1 or SRGB8A1 with any pixels with alpha < 0.5
-
-		Block4x4Encoding	*m_pencoding;
-
-	};
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding.cpp
deleted file mode 100644
index 7a9e68c4cf..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-EtcBlock4x4Encoding.cpp
-
-Block4x4Encoding is the abstract base class for the different encoders.  Each encoder targets a 
-particular file format (e.g. ETC1, RGB8, RGBA8, R11)
-
-*/
-
-#include "EtcConfig.h"
-#include "EtcBlock4x4Encoding.h"
-
-#include "EtcBlock4x4EncodingBits.h"
-#include "EtcBlock4x4.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-
-namespace Etc
-{
-	// ----------------------------------------------------------------------------------------------------
-	//
-	const float Block4x4Encoding::LUMA_WEIGHT = 3.0f;
-	const float Block4x4Encoding::CHROMA_BLUE_WEIGHT = 0.5f;
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	Block4x4Encoding::Block4x4Encoding(void)
-	{
-
-		m_pblockParent = nullptr;
-
-		m_pafrgbaSource = nullptr;
-
-		m_boolBorderPixels = false;
-
-		m_fError = -1.0f;
-
-		m_mode = MODE_UNKNOWN;
-
-		m_uiEncodingIterations = 0;
-		m_boolDone = false;
-
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(-1.0f, -1.0f, -1.0f, -1.0f);
-			m_afDecodedAlphas[uiPixel] = -1.0f;
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialize the generic encoding for a 4x4 block
-	// a_pblockParent points to the block associated with this encoding
-	// a_errormetric is used to choose the best encoding
-	// init the decoded pixels to -1 to mark them as undefined
-	// init the error to -1 to mark it as undefined
-	//
-	void Block4x4Encoding::Init(Block4x4 *a_pblockParent,
-								ColorFloatRGBA *a_pafrgbaSource,
-								ErrorMetric a_errormetric)
-	{
-
-		m_pblockParent = a_pblockParent;
-
-		m_pafrgbaSource = a_pafrgbaSource;
-
-		m_boolBorderPixels = m_pblockParent->HasBorderPixels();
-
-		m_fError = -1.0f;
-
-		m_uiEncodingIterations = 0;
-
-		m_errormetric = a_errormetric;
-
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(-1.0f, -1.0f, -1.0f, -1.0f);
-			m_afDecodedAlphas[uiPixel] = -1.0f;
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// calculate the error for the block by summing the pixel errors
-	//
-	void Block4x4Encoding::CalcBlockError(void)
-	{
-		m_fError = 0.0f;
-
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			m_fError += CalcPixelError(m_afrgbaDecodedColors[uiPixel], m_afDecodedAlphas[uiPixel],
-										m_pafrgbaSource[uiPixel]);
-		}
-		
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// calculate the error between the source pixel and the decoded pixel
-	// the error amount is base on the error metric
-	//
-	float Block4x4Encoding::CalcPixelError(ColorFloatRGBA a_frgbaDecodedColor, float a_fDecodedAlpha,
-											ColorFloatRGBA a_frgbaSourcePixel)
-	{
-
-		// if a border pixel
-		if (isnan(a_frgbaSourcePixel.fA))
-		{
-			return 0.0f;
-		}
-
-		if (m_errormetric == ErrorMetric::RGBA)
-		{
-			assert(a_fDecodedAlpha >= 0.0f);
-
-			float fDRed = (a_fDecodedAlpha * a_frgbaDecodedColor.fR) -
-							(a_frgbaSourcePixel.fA * a_frgbaSourcePixel.fR);
-			float fDGreen = (a_fDecodedAlpha * a_frgbaDecodedColor.fG) -
-							(a_frgbaSourcePixel.fA * a_frgbaSourcePixel.fG);
-			float fDBlue = (a_fDecodedAlpha * a_frgbaDecodedColor.fB) -
-							(a_frgbaSourcePixel.fA * a_frgbaSourcePixel.fB);
-
-			float fDAlpha = a_fDecodedAlpha - a_frgbaSourcePixel.fA;
-
-			return fDRed*fDRed + fDGreen*fDGreen + fDBlue*fDBlue + fDAlpha*fDAlpha;
-		}
-		else if (m_errormetric == ErrorMetric::RGBX)
-		{
-			assert(a_fDecodedAlpha >= 0.0f);
-
-			float fDRed = a_frgbaDecodedColor.fR - a_frgbaSourcePixel.fR;
-			float fDGreen = a_frgbaDecodedColor.fG - a_frgbaSourcePixel.fG;
-			float fDBlue = a_frgbaDecodedColor.fB - a_frgbaSourcePixel.fB;
-			float fDAlpha = a_fDecodedAlpha - a_frgbaSourcePixel.fA;
-
-			return fDRed*fDRed + fDGreen*fDGreen + fDBlue*fDBlue + fDAlpha*fDAlpha;
-		}
-		else if (m_errormetric == ErrorMetric::REC709)
-		{
-			assert(a_fDecodedAlpha >= 0.0f);
-
-			float fLuma1 = a_frgbaSourcePixel.fR*0.2126f + a_frgbaSourcePixel.fG*0.7152f + a_frgbaSourcePixel.fB*0.0722f;
-			float fChromaR1 = 0.5f * ((a_frgbaSourcePixel.fR - fLuma1) * (1.0f / (1.0f - 0.2126f)));
-			float fChromaB1 = 0.5f * ((a_frgbaSourcePixel.fB - fLuma1) * (1.0f / (1.0f - 0.0722f)));
-
-			float fLuma2 = a_frgbaDecodedColor.fR*0.2126f +
-							a_frgbaDecodedColor.fG*0.7152f +
-							a_frgbaDecodedColor.fB*0.0722f;
-			float fChromaR2 = 0.5f * ((a_frgbaDecodedColor.fR - fLuma2) * (1.0f / (1.0f - 0.2126f)));
-			float fChromaB2 = 0.5f * ((a_frgbaDecodedColor.fB - fLuma2) * (1.0f / (1.0f - 0.0722f)));
-
-			float fDeltaL = a_frgbaSourcePixel.fA * fLuma1 - a_fDecodedAlpha * fLuma2;
-			float fDeltaCr = a_frgbaSourcePixel.fA * fChromaR1 - a_fDecodedAlpha * fChromaR2;
-			float fDeltaCb = a_frgbaSourcePixel.fA * fChromaB1 - a_fDecodedAlpha * fChromaB2;
-
-			float fDAlpha = a_fDecodedAlpha - a_frgbaSourcePixel.fA;
-
-			// Favor Luma accuracy over Chroma, and Red over Blue 
-			return LUMA_WEIGHT*fDeltaL*fDeltaL +
-					fDeltaCr*fDeltaCr +
-					CHROMA_BLUE_WEIGHT*fDeltaCb*fDeltaCb +
-					fDAlpha*fDAlpha;
-	#if 0
-			float fDRed = a_frgbaDecodedPixel.fR - a_frgbaSourcePixel.fR;
-			float fDGreen = a_frgbaDecodedPixel.fG - a_frgbaSourcePixel.fG;
-			float fDBlue = a_frgbaDecodedPixel.fB - a_frgbaSourcePixel.fB;
-			return 2.0f * 3.0f * fDeltaL * fDeltaL + fDRed*fDRed + fDGreen*fDGreen + fDBlue*fDBlue;
-#endif
-		}
-		else if (m_errormetric == ErrorMetric::NORMALXYZ)
-		{
-			float fDecodedX = 2.0f * a_frgbaDecodedColor.fR - 1.0f;
-			float fDecodedY = 2.0f * a_frgbaDecodedColor.fG - 1.0f;
-			float fDecodedZ = 2.0f * a_frgbaDecodedColor.fB - 1.0f;
-
-			float fDecodedLength = sqrtf(fDecodedX*fDecodedX + fDecodedY*fDecodedY + fDecodedZ*fDecodedZ);
-
-			if (fDecodedLength < 0.5f)
-			{
-				return 1.0f;
-			}
-			else if (fDecodedLength == 0.0f)
-			{
-				fDecodedX = 1.0f;
-				fDecodedY = 0.0f;
-				fDecodedZ = 0.0f;
-			}
-			else
-			{
-				fDecodedX /= fDecodedLength;
-				fDecodedY /= fDecodedLength;
-				fDecodedZ /= fDecodedLength;
-			}
-
-			float fSourceX = 2.0f * a_frgbaSourcePixel.fR - 1.0f;
-			float fSourceY = 2.0f * a_frgbaSourcePixel.fG - 1.0f;
-			float fSourceZ = 2.0f * a_frgbaSourcePixel.fB - 1.0f;
-
-			float fSourceLength = sqrtf(fSourceX*fSourceX + fSourceY*fSourceY + fSourceZ*fSourceZ);
-
-			if (fSourceLength == 0.0f)
-			{
-				fSourceX = 1.0f;
-				fSourceY = 0.0f;
-				fSourceZ = 0.0f;
-			}
-			else
-			{
-				fSourceX /= fSourceLength;
-				fSourceY /= fSourceLength;
-				fSourceZ /= fSourceLength;
-			}
-
-			float fDotProduct = fSourceX*fDecodedX + fSourceY*fDecodedY + fSourceZ*fDecodedZ;
-			float fNormalizedDotProduct = 1.0f - 0.5f * (fDotProduct + 1.0f);
-			float fDotProductError = fNormalizedDotProduct * fNormalizedDotProduct;
-			
-			float fLength2 = fDecodedX*fDecodedX + fDecodedY*fDecodedY + fDecodedZ*fDecodedZ;
-			float fLength2Error = fabsf(1.0f - fLength2);
-
-			float fDeltaW = a_frgbaDecodedColor.fA - a_frgbaSourcePixel.fA;
-			float fErrorW = fDeltaW * fDeltaW;
-
-			return fDotProductError + fLength2Error + fErrorW;
-		}
-		else // ErrorMetric::NUMERIC
-		{
-			assert(a_fDecodedAlpha >= 0.0f);
-
-			float fDX = a_frgbaDecodedColor.fR - a_frgbaSourcePixel.fR;
-			float fDY = a_frgbaDecodedColor.fG - a_frgbaSourcePixel.fG;
-			float fDZ = a_frgbaDecodedColor.fB - a_frgbaSourcePixel.fB;
-			float fDW = a_frgbaDecodedColor.fA - a_frgbaSourcePixel.fA;
-
-			return fDX*fDX + fDY*fDY + fDZ*fDZ + fDW*fDW;
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-
-} // namespace Etc
-
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding.h b/thirdparty/etc2comp/EtcBlock4x4Encoding.h
deleted file mode 100644
index c14c3b8616..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcColorFloatRGBA.h"
-
-#include "EtcErrorMetric.h"
-
-#include <assert.h>
-#include <float.h>
-
-namespace Etc
-{
-	class Block4x4;
-
-	// abstract base class for specific encodings
-	class Block4x4Encoding
-	{
-	public:
-
-		static const unsigned int ROWS = 4;
-		static const unsigned int COLUMNS = 4;
-		static const unsigned int PIXELS = ROWS * COLUMNS;
-		static const float LUMA_WEIGHT;
-		static const float CHROMA_BLUE_WEIGHT;
-
-		typedef enum
-		{
-			MODE_UNKNOWN,
-			//
-			MODE_ETC1,
-			MODE_T,
-			MODE_H,
-			MODE_PLANAR,
-			MODE_R11,
-			MODE_RG11,
-			//
-			MODES
-		} Mode;
-
-		Block4x4Encoding(void);
-		//virtual ~Block4x4Encoding(void) =0;
-		virtual ~Block4x4Encoding(void) {}
-		virtual void InitFromSource(Block4x4 *a_pblockParent,
-									ColorFloatRGBA *a_pafrgbaSource,
-
-									unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric) = 0;
-
-		virtual void InitFromEncodingBits(Block4x4 *a_pblockParent,
-											unsigned char *a_paucEncodingBits,
-											ColorFloatRGBA *a_pafrgbaSource,
-
-											ErrorMetric a_errormetric) = 0;
-
-		// perform an iteration of the encoding
-		// the first iteration must generate a complete, valid (if poor) encoding
-		virtual void PerformIteration(float a_fEffort) = 0;
-
-		void CalcBlockError(void);
-
-		inline float GetError(void)
-		{
-			assert(m_fError >= 0.0f);
-
-			return m_fError;
-		}
-
-		inline ColorFloatRGBA * GetDecodedColors(void)
-		{
-			return m_afrgbaDecodedColors;
-		}
-
-		inline float * GetDecodedAlphas(void)
-		{
-			return m_afDecodedAlphas;
-		}
-
-		virtual void SetEncodingBits(void) = 0;
-
-		virtual bool GetFlip(void) = 0;
-
-		virtual bool IsDifferential(void) = 0;
-
-		virtual bool HasSeverelyBentDifferentialColors(void) const = 0;
-
-		inline Mode GetMode(void)
-		{
-			return m_mode;
-		}
-
-		inline bool IsDone(void)
-		{
-			return m_boolDone;
-		}
-
-		inline void SetDoneIfPerfect()
-		{
-			if (GetError() == 0.0f)
-			{
-				m_boolDone = true;
-			}
-		}
-
-		float CalcPixelError(ColorFloatRGBA a_frgbaDecodedColor, float a_fDecodedAlpha,
-								ColorFloatRGBA a_frgbaSourcePixel);
-
-	protected:
-
-		void Init(Block4x4 *a_pblockParent,
-					ColorFloatRGBA *a_pafrgbaSource,
-
-					ErrorMetric a_errormetric);
-
-		Block4x4		*m_pblockParent;
-		ColorFloatRGBA	*m_pafrgbaSource;
-
-		bool			m_boolBorderPixels;				// if block has any border pixels
-
-		ColorFloatRGBA	m_afrgbaDecodedColors[PIXELS];	// decoded RGB components, ignore Alpha
-		float			m_afDecodedAlphas[PIXELS];		// decoded alpha component
-		float			m_fError;						// error for RGBA relative to m_pafrgbaSource
-
-		// intermediate encoding
-		Mode			m_mode;
-
-		unsigned int	m_uiEncodingIterations;
-		bool			m_boolDone;						// all iterations have been done
-		ErrorMetric		m_errormetric;
-
-	private:
-
-	};
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcBlock4x4EncodingBits.h b/thirdparty/etc2comp/EtcBlock4x4EncodingBits.h
deleted file mode 100644
index 4065700379..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4EncodingBits.h
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <assert.h>
-
-namespace Etc
-{
-
-	// ################################################################################
-	// Block4x4EncodingBits
-	// Base class for Block4x4EncodingBits_XXXX
-	// ################################################################################
-
-	class Block4x4EncodingBits
-	{
-	public:
-
-		enum class Format
-		{
-			UNKNOWN,
-			//
-			RGB8,
-			RGBA8,
-			R11,
-			RG11,
-			RGB8A1,
-			//
-			FORMATS
-		};
-
-		static unsigned int GetBytesPerBlock(Format a_format)
-		{
-			switch (a_format)
-			{
-			case Format::RGB8:
-			case Format::R11:
-			case Format::RGB8A1:
-				return 8;
-				break;
-
-			case Format::RGBA8:
-			case Format::RG11:
-				return 16;
-				break;
-
-			default:
-				return 0;
-				break;
-			}
-
-		}
-
-	};
-
-	// ################################################################################
-	// Block4x4EncodingBits_RGB8
-	// Encoding bits for the RGB portion of ETC1, RGB8, RGB8A1 and RGBA8
-	// ################################################################################
-
-	class Block4x4EncodingBits_RGB8
-	{
-	public:
-
-		static const unsigned int BYTES_PER_BLOCK = 8;
-
-		inline Block4x4EncodingBits_RGB8(void)
-		{
-			assert(sizeof(Block4x4EncodingBits_RGB8) == BYTES_PER_BLOCK);
-
-			for (unsigned int uiByte = 0; uiByte < BYTES_PER_BLOCK; uiByte++)
-			{
-				auc[uiByte] = 0;
-			}
-
-		}
-
-		typedef struct
-		{
-			unsigned red2 : 4;
-			unsigned red1 : 4;
-			//
-			unsigned green2 : 4;
-			unsigned green1 : 4;
-			//
-			unsigned blue2 : 4;
-			unsigned blue1 : 4;
-			//
-			unsigned flip : 1;
-			unsigned diff : 1;
-			unsigned cw2 : 3;
-			unsigned cw1 : 3;
-			//
-			unsigned int selectors;
-		} Individual;
-
-		typedef struct
-		{
-			signed dred2 : 3;
-			unsigned red1 : 5;
-			//
-			signed dgreen2 : 3;
-			unsigned green1 : 5;
-			//
-			signed dblue2 : 3;
-			unsigned blue1 : 5;
-			//
-			unsigned flip : 1;
-			unsigned diff : 1;
-			unsigned cw2 : 3;
-			unsigned cw1 : 3;
-			//
-			unsigned int selectors;
-		} Differential;
-
-		typedef struct
-		{
-			unsigned red1b : 2;
-			unsigned detect2 : 1;
-			unsigned red1a : 2;
-			unsigned detect1 : 3;
-			//
-			unsigned blue1 : 4;
-			unsigned green1 : 4;
-			//
-			unsigned green2 : 4;
-			unsigned red2 : 4;
-			//
-			unsigned db : 1;
-			unsigned diff : 1;
-			unsigned da : 2;
-			unsigned blue2 : 4;
-			//
-			unsigned int selectors;
-		} T;
-
-		typedef struct
-		{
-			unsigned green1a : 3;
-			unsigned red1 : 4;
-			unsigned detect1 : 1;
-			//
-			unsigned blue1b : 2;
-			unsigned detect3 : 1;
-			unsigned blue1a : 1;
-			unsigned green1b : 1;
-			unsigned detect2 : 3;
-			//
-			unsigned green2a : 3;
-			unsigned red2 : 4;
-			unsigned blue1c : 1;
-			//
-			unsigned db : 1;
-			unsigned diff : 1;
-			unsigned da : 1;
-			unsigned blue2 : 4;
-			unsigned green2b : 1;
-			//
-			unsigned int selectors;
-		} H;
-
-		typedef struct
-		{
-			unsigned originGreen1 : 1;
-			unsigned originRed : 6;
-			unsigned detect1 : 1;
-			//
-			unsigned originBlue1 : 1;
-			unsigned originGreen2 : 6;
-			unsigned detect2 : 1;
-			//
-			unsigned originBlue3 : 2;
-			unsigned detect4 : 1;
-			unsigned originBlue2 : 2;
-			unsigned detect3 : 3;
-			//
-			unsigned horizRed2 : 1;
-			unsigned diff : 1;
-			unsigned horizRed1 : 5;
-			unsigned originBlue4 : 1;
-			//
-			unsigned horizBlue1: 1;
-			unsigned horizGreen : 7;
-			//
-			unsigned vertRed1 : 3;
-			unsigned horizBlue2 : 5;
-			//
-			unsigned vertGreen1 : 5;
-			unsigned vertRed2 : 3;
-			//
-			unsigned vertBlue : 6;
-			unsigned vertGreen2 : 2;
-		} Planar;
-
-		union
-		{
-			unsigned char auc[BYTES_PER_BLOCK];
-			unsigned long int ul;
-			Individual individual;
-			Differential differential;
-			T t;
-			H h;
-			Planar planar;
-		};
-
-	};
-
-	// ################################################################################
-	// Block4x4EncodingBits_A8
-	// Encoding bits for the A portion of RGBA8
-	// ################################################################################
-
-	class Block4x4EncodingBits_A8
-	{
-	public:
-
-		static const unsigned int BYTES_PER_BLOCK = 8;
-		static const unsigned int SELECTOR_BYTES = 6;
-
-		typedef struct
-		{
-			unsigned base : 8;
-			unsigned table : 4;
-			unsigned multiplier : 4;
-			unsigned selectors0 : 8;
-			unsigned selectors1 : 8;
-			unsigned selectors2 : 8;
-			unsigned selectors3 : 8;
-			unsigned selectors4 : 8;
-			unsigned selectors5 : 8;
-		} Data;
-
-		Data data;
-
-	};
-
-	// ################################################################################
-	// Block4x4EncodingBits_R11
-	// Encoding bits for the R portion of R11
-	// ################################################################################
-
-	class Block4x4EncodingBits_R11
-	{
-	public:
-
-		static const unsigned int BYTES_PER_BLOCK = 8;
-		static const unsigned int SELECTOR_BYTES = 6;
-
-		typedef struct
-		{
-			unsigned base : 8;
-			unsigned table : 4;
-			unsigned multiplier : 4;
-			unsigned selectors0 : 8;
-			unsigned selectors1 : 8;
-			unsigned selectors2 : 8;
-			unsigned selectors3 : 8;
-			unsigned selectors4 : 8;
-			unsigned selectors5 : 8;
-		} Data;
-
-		Data data;
-
-	};
-
-	class Block4x4EncodingBits_RG11
-	{
-	public:
-
-		static const unsigned int BYTES_PER_BLOCK = 16;
-		static const unsigned int SELECTOR_BYTES = 12;
-
-		typedef struct
-		{
-			//Red portion
-			unsigned baseR : 8;
-			unsigned tableIndexR : 4;
-			unsigned multiplierR : 4;
-			unsigned selectorsR0 : 8;
-			unsigned selectorsR1 : 8;
-			unsigned selectorsR2 : 8;
-			unsigned selectorsR3 : 8;
-			unsigned selectorsR4 : 8;
-			unsigned selectorsR5 : 8;
-			//Green portion
-			unsigned baseG : 8;
-			unsigned tableIndexG : 4;
-			unsigned multiplierG : 4;
-			unsigned selectorsG0 : 8;
-			unsigned selectorsG1 : 8;
-			unsigned selectorsG2 : 8;
-			unsigned selectorsG3 : 8;
-			unsigned selectorsG4 : 8;
-			unsigned selectorsG5 : 8;
-		} Data;
-
-		Data data;
-
-	};
-
-}
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_ETC1.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_ETC1.cpp
deleted file mode 100644
index a27f74c0d5..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding_ETC1.cpp
+++ /dev/null
@@ -1,1281 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-EtcBlock4x4Encoding_ETC1.cpp
-
-Block4x4Encoding_ETC1 is the encoder to use when targetting file format ETC1.  This encoder is also
-used for the ETC1 subset of file format RGB8, RGBA8 and RGB8A1
-
-*/
-
-#include "EtcConfig.h"
-#include "EtcBlock4x4Encoding_ETC1.h"
-
-#include "EtcBlock4x4.h"
-#include "EtcBlock4x4EncodingBits.h"
-#include "EtcDifferentialTrys.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <float.h>
-#include <limits>
-
-namespace Etc
-{
-
-	// pixel processing order if the flip bit = 0 (horizontal split)
-	const unsigned int Block4x4Encoding_ETC1::s_auiPixelOrderFlip0[PIXELS] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-
-	// pixel processing order if the flip bit = 1 (vertical split)
-	const unsigned int Block4x4Encoding_ETC1::s_auiPixelOrderFlip1[PIXELS] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
-
-	// pixel processing order for horizontal scan (ETC normally does a vertical scan)
-	const unsigned int Block4x4Encoding_ETC1::s_auiPixelOrderHScan[PIXELS] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
-
-	// pixel indices for different block halves
-	const unsigned int Block4x4Encoding_ETC1::s_auiLeftPixelMapping[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
-	const unsigned int Block4x4Encoding_ETC1::s_auiRightPixelMapping[8] = { 8, 9, 10, 11, 12, 13, 14, 15 };
-	const unsigned int Block4x4Encoding_ETC1::s_auiTopPixelMapping[8] = { 0, 1, 4, 5, 8, 9, 12, 13 };
-	const unsigned int Block4x4Encoding_ETC1::s_auiBottomPixelMapping[8] = { 2, 3, 6, 7, 10, 11, 14, 15 };
-
-	// CW ranges that the ETC1 decoders use
-	// CW is basically a contrast for the different selector bits, since these values are offsets to the base color
-	// the first axis in the array is indexed by the CW in the encoding bits
-	// the second axis in the array is indexed by the selector bits
-	float Block4x4Encoding_ETC1::s_aafCwTable[CW_RANGES][SELECTORS] =
-	{
-		{ 2.0f / 255.0f, 8.0f / 255.0f, -2.0f / 255.0f, -8.0f / 255.0f },
-		{ 5.0f / 255.0f, 17.0f / 255.0f, -5.0f / 255.0f, -17.0f / 255.0f },
-		{ 9.0f / 255.0f, 29.0f / 255.0f, -9.0f / 255.0f, -29.0f / 255.0f },
-		{ 13.0f / 255.0f, 42.0f / 255.0f, -13.0f / 255.0f, -42.0f / 255.0f },
-		{ 18.0f / 255.0f, 60.0f / 255.0f, -18.0f / 255.0f, -60.0f / 255.0f },
-		{ 24.0f / 255.0f, 80.0f / 255.0f, -24.0f / 255.0f, -80.0f / 255.0f },
-		{ 33.0f / 255.0f, 106.0f / 255.0f, -33.0f / 255.0f, -106.0f / 255.0f },
-		{ 47.0f / 255.0f, 183.0f / 255.0f, -47.0f / 255.0f, -183.0f / 255.0f }
-	};
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	Block4x4Encoding_ETC1::Block4x4Encoding_ETC1(void)
-	{
-		m_mode = MODE_ETC1;
-		m_boolDiff = false;
-		m_boolFlip = false;
-		m_frgbaColor1 = ColorFloatRGBA();
-		m_frgbaColor2 = ColorFloatRGBA();
-		m_uiCW1 = 0;
-		m_uiCW2 = 0;
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			m_auiSelectors[uiPixel] = 0;
-			m_afDecodedAlphas[uiPixel] = 1.0f;
-		}
-
-		m_boolMostLikelyFlip = false;
-
-		m_fError = -1.0f;
-
-		m_fError1 = -1.0f;
-		m_fError2 = -1.0f;
-		m_boolSeverelyBentDifferentialColors = false;
-
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			m_afDecodedAlphas[uiPixel] = 1.0f;
-		}
-
-	}
-
-	 Block4x4Encoding_ETC1::~Block4x4Encoding_ETC1(void) {}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialization prior to encoding
-	// a_pblockParent points to the block associated with this encoding
-	// a_errormetric is used to choose the best encoding
-	// a_pafrgbaSource points to a 4x4 block subset of the source image
-	// a_paucEncodingBits points to the final encoding bits
-	//
-	void Block4x4Encoding_ETC1::InitFromSource(Block4x4 *a_pblockParent,
-												ColorFloatRGBA *a_pafrgbaSource,
-												unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric)
-	{
-
-		Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource,a_errormetric);
-
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			m_afDecodedAlphas[uiPixel] = 1.0f;
-		}
-
-		m_fError = -1.0f;
-
-		m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)(a_paucEncodingBits);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialization from the encoding bits of a previous encoding
-	// a_pblockParent points to the block associated with this encoding
-	// a_errormetric is used to choose the best encoding
-	// a_pafrgbaSource points to a 4x4 block subset of the source image
-	// a_paucEncodingBits points to the final encoding bits of a previous encoding
-	//
-	void Block4x4Encoding_ETC1::InitFromEncodingBits(Block4x4 *a_pblockParent,
-														unsigned char *a_paucEncodingBits,
-														ColorFloatRGBA *a_pafrgbaSource, 
-														ErrorMetric a_errormetric)
-	{
-
-		Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource,a_errormetric);
-		m_fError = -1.0f;
-
-		m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits;
-
-		m_mode = MODE_ETC1;
-		m_boolDiff = m_pencodingbitsRGB8->individual.diff;
-		m_boolFlip = m_pencodingbitsRGB8->individual.flip;
-		if (m_boolDiff)
-		{
-			int iR2 = (int)(m_pencodingbitsRGB8->differential.red1 + m_pencodingbitsRGB8->differential.dred2);
-			if (iR2 < 0)
-			{
-				iR2 = 0;
-			}
-			else if (iR2 > 31)
-			{
-				iR2 = 31;
-			}
-
-			int iG2 = (int)(m_pencodingbitsRGB8->differential.green1 + m_pencodingbitsRGB8->differential.dgreen2);
-			if (iG2 < 0)
-			{
-				iG2 = 0;
-			}
-			else if (iG2 > 31)
-			{
-				iG2 = 31;
-			}
-
-			int iB2 = (int)(m_pencodingbitsRGB8->differential.blue1 + m_pencodingbitsRGB8->differential.dblue2);
-			if (iB2 < 0)
-			{
-				iB2 = 0;
-			}
-			else if (iB2 > 31)
-			{
-				iB2 = 31;
-			}
-
-			m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB5(m_pencodingbitsRGB8->differential.red1, m_pencodingbitsRGB8->differential.green1, m_pencodingbitsRGB8->differential.blue1);
-			m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)iR2, (unsigned char)iG2, (unsigned char)iB2);
-
-		}
-		else
-		{
-			m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(m_pencodingbitsRGB8->individual.red1, m_pencodingbitsRGB8->individual.green1, m_pencodingbitsRGB8->individual.blue1);
-			m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(m_pencodingbitsRGB8->individual.red2, m_pencodingbitsRGB8->individual.green2, m_pencodingbitsRGB8->individual.blue2);
-		}
-
-		m_uiCW1 = m_pencodingbitsRGB8->individual.cw1;
-		m_uiCW2 = m_pencodingbitsRGB8->individual.cw2;
-
-		InitFromEncodingBits_Selectors();
-
-		Decode();
-
-		CalcBlockError();
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// init the selectors from a prior encoding
-	//
-	void Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors(void)
-	{
-
-		unsigned char *paucSelectors = (unsigned char *)&m_pencodingbitsRGB8->individual.selectors;
-
-		for (unsigned int iPixel = 0; iPixel < PIXELS; iPixel++)
-		{
-			unsigned int uiByteMSB = (unsigned int)(1 - (iPixel / 8));
-			unsigned int uiByteLSB = (unsigned int)(3 - (iPixel / 8));
-			unsigned int uiShift = (unsigned int)(iPixel & 7);
-
-			unsigned int uiSelectorMSB = (unsigned int)((paucSelectors[uiByteMSB] >> uiShift) & 1);
-			unsigned int uiSelectorLSB = (unsigned int)((paucSelectors[uiByteLSB] >> uiShift) & 1);
-
-			m_auiSelectors[iPixel] = (uiSelectorMSB << 1) + uiSelectorLSB;
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// perform a single encoding iteration
-	// replace the encoding if a better encoding was found
-	// subsequent iterations generally take longer for each iteration
-	// set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort
-	//
-	void Block4x4Encoding_ETC1::PerformIteration(float a_fEffort)
-	{
-		assert(!m_boolDone);
-
-		switch (m_uiEncodingIterations)
-		{
-		case 0:
-			PerformFirstIteration();
-			break;
-
-		case 1:
-			TryDifferential(m_boolMostLikelyFlip, 1, 0, 0);
-			break;
-
-		case 2:
-			TryIndividual(m_boolMostLikelyFlip, 1);
-			if (a_fEffort <= 49.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 3:
-			TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0);
-			if (a_fEffort <= 59.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 4:
-			TryIndividual(!m_boolMostLikelyFlip, 1);
-			if (a_fEffort <= 69.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 5:
-			TryDegenerates1();
-			if (a_fEffort <= 79.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 6:
-			TryDegenerates2();
-			if (a_fEffort <= 89.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 7:
-			TryDegenerates3();
-			if (a_fEffort <= 99.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 8:
-			TryDegenerates4();
-			m_boolDone = true;
-			break;
-
-		default:
-			assert(0);
-			break;
-		}
-
-		m_uiEncodingIterations++;
-		SetDoneIfPerfect();
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// find best initial encoding to ensure block has a valid encoding
-	//
-	void Block4x4Encoding_ETC1::PerformFirstIteration(void)
-	{
-		CalculateMostLikelyFlip();
-
-		m_fError = FLT_MAX;
-
-		TryDifferential(m_boolMostLikelyFlip, 0, 0, 0);
-		SetDoneIfPerfect();
-		if (m_boolDone)
-		{
-			return;
-		}
-
-		TryIndividual(m_boolMostLikelyFlip, 0);
-		SetDoneIfPerfect();
-		if (m_boolDone)
-		{
-			return;
-		}
-		TryDifferential(!m_boolMostLikelyFlip, 0, 0, 0);
-		SetDoneIfPerfect();
-		if (m_boolDone)
-		{
-			return;
-		}
-		TryIndividual(!m_boolMostLikelyFlip, 0);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// algorithm:
-	// create a source average color for the Left, Right, Top and Bottom halves using the 8 pixels in each half
-	// note: the "gray line" is the line of equal delta RGB that goes thru the average color
-	// for each half:
-	//		see how close each of the 8 pixels are to the "gray line" that goes thru the source average color
-	//		create an error value that is the sum of the distances from the gray line
-	// h_error is the sum of Left and Right errors
-	// v_error is the sum of Top and Bottom errors
-	//
-	void Block4x4Encoding_ETC1::CalculateMostLikelyFlip(void)
-	{
-		static const bool DEBUG_PRINT = false;
-
-		CalculateSourceAverages();
-
-		float fLeftGrayErrorSum = 0.0f;
-		float fRightGrayErrorSum = 0.0f;
-		float fTopGrayErrorSum = 0.0f;
-		float fBottomGrayErrorSum = 0.0f;
-
-		for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++)
-		{
-			ColorFloatRGBA *pfrgbaLeft = &m_pafrgbaSource[uiPixel];
-			ColorFloatRGBA *pfrgbaRight = &m_pafrgbaSource[uiPixel + 8];
-			ColorFloatRGBA *pfrgbaTop = &m_pafrgbaSource[s_auiTopPixelMapping[uiPixel]];
-			ColorFloatRGBA *pfrgbaBottom = &m_pafrgbaSource[s_auiBottomPixelMapping[uiPixel]];
-
-			float fLeftGrayError = CalcGrayDistance2(*pfrgbaLeft, m_frgbaSourceAverageLeft);
-			float fRightGrayError = CalcGrayDistance2(*pfrgbaRight, m_frgbaSourceAverageRight);
-			float fTopGrayError = CalcGrayDistance2(*pfrgbaTop, m_frgbaSourceAverageTop);
-			float fBottomGrayError = CalcGrayDistance2(*pfrgbaBottom, m_frgbaSourceAverageBottom);
-
-			fLeftGrayErrorSum += fLeftGrayError;
-			fRightGrayErrorSum += fRightGrayError;
-			fTopGrayErrorSum += fTopGrayError;
-			fBottomGrayErrorSum += fBottomGrayError;
-		}
-
-		if (DEBUG_PRINT)
-		{
-			printf("\n%.2f %.2f\n", fLeftGrayErrorSum + fRightGrayErrorSum, fTopGrayErrorSum + fBottomGrayErrorSum);
-		}
-
-		m_boolMostLikelyFlip = (fTopGrayErrorSum + fBottomGrayErrorSum) < (fLeftGrayErrorSum + fRightGrayErrorSum);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// calculate source pixel averages for each 2x2 quadrant in a 4x4 block
-	// these are used to determine the averages for each of the 4 different halves (left, right, top, bottom)
-	// ignore pixels that have alpha == NAN (these are border pixels outside of the source image)
-	// weight the averages based on a pixel's alpha
-	//
-	void Block4x4Encoding_ETC1::CalculateSourceAverages(void)
-	{
-		static const bool DEBUG_PRINT = false;
-
-		bool boolRGBX = m_pblockParent->GetImageSource()->GetErrorMetric() == ErrorMetric::RGBX;
-
-		if (m_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::OPAQUE || boolRGBX)
-		{
-			ColorFloatRGBA frgbaSumUL = m_pafrgbaSource[0] + m_pafrgbaSource[1] + m_pafrgbaSource[4] + m_pafrgbaSource[5];
-			ColorFloatRGBA frgbaSumLL = m_pafrgbaSource[2] + m_pafrgbaSource[3] + m_pafrgbaSource[6] + m_pafrgbaSource[7];
-			ColorFloatRGBA frgbaSumUR = m_pafrgbaSource[8] + m_pafrgbaSource[9] + m_pafrgbaSource[12] + m_pafrgbaSource[13];
-			ColorFloatRGBA frgbaSumLR = m_pafrgbaSource[10] + m_pafrgbaSource[11] + m_pafrgbaSource[14] + m_pafrgbaSource[15];
-
-			m_frgbaSourceAverageLeft = (frgbaSumUL + frgbaSumLL) * 0.125f;
-			m_frgbaSourceAverageRight = (frgbaSumUR + frgbaSumLR) * 0.125f;
-			m_frgbaSourceAverageTop = (frgbaSumUL + frgbaSumUR) * 0.125f;
-			m_frgbaSourceAverageBottom = (frgbaSumLL + frgbaSumLR) * 0.125f;
-		}
-		else
-		{
-			float afSourceAlpha[PIXELS];
-
-			// treat alpha NAN as 0.0f
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				afSourceAlpha[uiPixel] = isnan(m_pafrgbaSource[uiPixel].fA) ? 
-																		0.0f : 
-																		m_pafrgbaSource[uiPixel].fA;
-			}
-
-			ColorFloatRGBA afrgbaAlphaWeightedSource[PIXELS];
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				afrgbaAlphaWeightedSource[uiPixel] = m_pafrgbaSource[uiPixel] * afSourceAlpha[uiPixel];
-			}
-
-			ColorFloatRGBA frgbaSumUL = afrgbaAlphaWeightedSource[0] +
-										afrgbaAlphaWeightedSource[1] +
-										afrgbaAlphaWeightedSource[4] +
-										afrgbaAlphaWeightedSource[5];
-
-			ColorFloatRGBA frgbaSumLL = afrgbaAlphaWeightedSource[2] +
-										afrgbaAlphaWeightedSource[3] +
-										afrgbaAlphaWeightedSource[6] +
-										afrgbaAlphaWeightedSource[7];
-
-			ColorFloatRGBA frgbaSumUR = afrgbaAlphaWeightedSource[8] +
-										afrgbaAlphaWeightedSource[9] +
-										afrgbaAlphaWeightedSource[12] +
-										afrgbaAlphaWeightedSource[13];
-
-			ColorFloatRGBA frgbaSumLR = afrgbaAlphaWeightedSource[10] +
-										afrgbaAlphaWeightedSource[11] +
-										afrgbaAlphaWeightedSource[14] +
-										afrgbaAlphaWeightedSource[15];
-
-			float fWeightSumUL = afSourceAlpha[0] +
-									afSourceAlpha[1] +
-									afSourceAlpha[4] +
-									afSourceAlpha[5];
-
-			float fWeightSumLL = afSourceAlpha[2] +
-									afSourceAlpha[3] +
-									afSourceAlpha[6] +
-									afSourceAlpha[7];
-
-			float fWeightSumUR = afSourceAlpha[8] +
-									afSourceAlpha[9] +
-									afSourceAlpha[12] +
-									afSourceAlpha[13];
-
-			float fWeightSumLR = afSourceAlpha[10] +
-									afSourceAlpha[11] +
-									afSourceAlpha[14] +
-									afSourceAlpha[15];
-
-			ColorFloatRGBA frgbaSumLeft = frgbaSumUL + frgbaSumLL;
-			ColorFloatRGBA frgbaSumRight = frgbaSumUR + frgbaSumLR;
-			ColorFloatRGBA frgbaSumTop = frgbaSumUL + frgbaSumUR;
-			ColorFloatRGBA frgbaSumBottom = frgbaSumLL + frgbaSumLR;
-
-			float fWeightSumLeft = fWeightSumUL + fWeightSumLL;
-			float fWeightSumRight = fWeightSumUR + fWeightSumLR;
-			float fWeightSumTop = fWeightSumUL + fWeightSumUR;
-			float fWeightSumBottom = fWeightSumLL + fWeightSumLR;
-
-			// check to see if there is at least 1 pixel with  non-zero alpha
-			// completely transparent block should not make it to this code
-			assert((fWeightSumLeft + fWeightSumRight) > 0.0f);
-			assert((fWeightSumTop + fWeightSumBottom) > 0.0f);
-
-			if (fWeightSumLeft > 0.0f)
-			{
-				m_frgbaSourceAverageLeft = frgbaSumLeft * (1.0f/fWeightSumLeft);
-			}
-			if (fWeightSumRight > 0.0f)
-			{
-				m_frgbaSourceAverageRight = frgbaSumRight * (1.0f/fWeightSumRight);
-			}
-			if (fWeightSumTop > 0.0f)
-			{
-				m_frgbaSourceAverageTop = frgbaSumTop * (1.0f/fWeightSumTop);
-			}
-			if (fWeightSumBottom > 0.0f)
-			{
-				m_frgbaSourceAverageBottom = frgbaSumBottom * (1.0f/fWeightSumBottom);
-			}
-
-			if (fWeightSumLeft == 0.0f)
-			{
-				assert(fWeightSumRight > 0.0f);
-				m_frgbaSourceAverageLeft = m_frgbaSourceAverageRight;
-			}
-			if (fWeightSumRight == 0.0f)
-			{
-				assert(fWeightSumLeft > 0.0f);
-				m_frgbaSourceAverageRight = m_frgbaSourceAverageLeft;
-			}
-			if (fWeightSumTop == 0.0f)
-			{
-				assert(fWeightSumBottom > 0.0f);
-				m_frgbaSourceAverageTop = m_frgbaSourceAverageBottom;
-			}
-			if (fWeightSumBottom == 0.0f)
-			{
-				assert(fWeightSumTop > 0.0f);
-				m_frgbaSourceAverageBottom = m_frgbaSourceAverageTop;
-			}
-		}
-
-		
-
-		if (DEBUG_PRINT)
-		{
-			printf("\ntarget: [%.2f,%.2f,%.2f] [%.2f,%.2f,%.2f] [%.2f,%.2f,%.2f] [%.2f,%.2f,%.2f]\n",
-				m_frgbaSourceAverageLeft.fR, m_frgbaSourceAverageLeft.fG, m_frgbaSourceAverageLeft.fB,
-				m_frgbaSourceAverageRight.fR, m_frgbaSourceAverageRight.fG, m_frgbaSourceAverageRight.fB,
-				m_frgbaSourceAverageTop.fR, m_frgbaSourceAverageTop.fG, m_frgbaSourceAverageTop.fB,
-				m_frgbaSourceAverageBottom.fR, m_frgbaSourceAverageBottom.fG, m_frgbaSourceAverageBottom.fB);
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try an ETC1 differential mode encoding
-	// use a_boolFlip to set the encoding F bit
-	// use a_uiRadius to alter basecolor components in the range[-a_uiRadius:a_uiRadius]
-	// use a_iGrayOffset1 and a_iGrayOffset2 to offset the basecolor to search for degenerate encodings
-	// replace the encoding if the encoding error is less than previous encoding
-	//
-	void Block4x4Encoding_ETC1::TryDifferential(bool a_boolFlip, unsigned int a_uiRadius,
-												int a_iGrayOffset1, int a_iGrayOffset2)
-	{
-
-		ColorFloatRGBA frgbaColor1;
-		ColorFloatRGBA frgbaColor2;
-
-		const unsigned int *pauiPixelMapping1;
-		const unsigned int *pauiPixelMapping2;
-
-		if (a_boolFlip)
-		{
-			frgbaColor1 = m_frgbaSourceAverageTop;
-			frgbaColor2 = m_frgbaSourceAverageBottom;
-
-			pauiPixelMapping1 = s_auiTopPixelMapping;
-			pauiPixelMapping2 = s_auiBottomPixelMapping;
-		}
-		else
-		{
-			frgbaColor1 = m_frgbaSourceAverageLeft;
-			frgbaColor2 = m_frgbaSourceAverageRight;
-
-			pauiPixelMapping1 = s_auiLeftPixelMapping;
-			pauiPixelMapping2 = s_auiRightPixelMapping;
-		}
-
-		DifferentialTrys trys(frgbaColor1, frgbaColor2, pauiPixelMapping1, pauiPixelMapping2, 
-								a_uiRadius, a_iGrayOffset1, a_iGrayOffset2);
-
-		Block4x4Encoding_ETC1 encodingTry = *this;
-		encodingTry.m_boolFlip = a_boolFlip;
-
-		encodingTry.TryDifferentialHalf(&trys.m_half1);
-		encodingTry.TryDifferentialHalf(&trys.m_half2);
-
-		// find best halves that are within differential range
-		DifferentialTrys::Try *ptryBest1 = nullptr;
-		DifferentialTrys::Try *ptryBest2 = nullptr;
-		encodingTry.m_fError = FLT_MAX;
-
-		// see if the best of each half are in differential range
-		int iDRed = trys.m_half2.m_ptryBest->m_iRed - trys.m_half1.m_ptryBest->m_iRed;
-		int iDGreen = trys.m_half2.m_ptryBest->m_iGreen - trys.m_half1.m_ptryBest->m_iGreen;
-		int iDBlue = trys.m_half2.m_ptryBest->m_iBlue - trys.m_half1.m_ptryBest->m_iBlue;
-		if (iDRed >= -4 && iDRed <= 3 && iDGreen >= -4 && iDGreen <= 3 && iDBlue >= -4 && iDBlue <= 3)
-		{
-			ptryBest1 = trys.m_half1.m_ptryBest;
-			ptryBest2 = trys.m_half2.m_ptryBest;
-			encodingTry.m_fError = trys.m_half1.m_ptryBest->m_fError + trys.m_half2.m_ptryBest->m_fError;
-		}
-		else
-		{
-			// else, find the next best halves that are in differential range
-			for (DifferentialTrys::Try *ptry1 = &trys.m_half1.m_atry[0];
-			ptry1 < &trys.m_half1.m_atry[trys.m_half1.m_uiTrys];
-				ptry1++)
-			{
-				for (DifferentialTrys::Try *ptry2 = &trys.m_half2.m_atry[0];
-				ptry2 < &trys.m_half2.m_atry[trys.m_half2.m_uiTrys];
-					ptry2++)
-				{
-					iDRed = ptry2->m_iRed - ptry1->m_iRed;
-					bool boolValidRedDelta = iDRed <= 3 && iDRed >= -4;
-					iDGreen = ptry2->m_iGreen - ptry1->m_iGreen;
-					bool boolValidGreenDelta = iDGreen <= 3 && iDGreen >= -4;
-					iDBlue = ptry2->m_iBlue - ptry1->m_iBlue;
-					bool boolValidBlueDelta = iDBlue <= 3 && iDBlue >= -4;
-
-					if (boolValidRedDelta && boolValidGreenDelta && boolValidBlueDelta)
-					{
-						float fError = ptry1->m_fError + ptry2->m_fError;
-
-						if (fError < encodingTry.m_fError)
-						{
-							encodingTry.m_fError = fError;
-
-							ptryBest1 = ptry1;
-							ptryBest2 = ptry2;
-						}
-					}
-
-				}
-			}
-			assert(encodingTry.m_fError < FLT_MAX);
-			assert(ptryBest1 != nullptr);
-			assert(ptryBest2 != nullptr);
-		}
-
-		if (encodingTry.m_fError < m_fError)
-		{
-			m_mode = MODE_ETC1;
-			m_boolDiff = true;
-			m_boolFlip = encodingTry.m_boolFlip;
-			m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)ptryBest1->m_iRed, (unsigned char)ptryBest1->m_iGreen, (unsigned char)ptryBest1->m_iBlue);
-			m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)ptryBest2->m_iRed, (unsigned char)ptryBest2->m_iGreen, (unsigned char)ptryBest2->m_iBlue);
-			m_uiCW1 = ptryBest1->m_uiCW;
-			m_uiCW2 = ptryBest2->m_uiCW;
-
-			for (unsigned int uiPixelOrder = 0; uiPixelOrder < PIXELS / 2; uiPixelOrder++)
-			{
-				unsigned int uiPixel1 = pauiPixelMapping1[uiPixelOrder];
-				unsigned int uiPixel2 = pauiPixelMapping2[uiPixelOrder];
-
-				unsigned int uiSelector1 = ptryBest1->m_auiSelectors[uiPixelOrder];
-				unsigned int uiSelector2 = ptryBest2->m_auiSelectors[uiPixelOrder];
-
-				m_auiSelectors[uiPixel1] = uiSelector1;
-				m_auiSelectors[uiPixel2] = ptryBest2->m_auiSelectors[uiPixelOrder];
-
-				float fDeltaRGB1 = s_aafCwTable[m_uiCW1][uiSelector1];
-				float fDeltaRGB2 = s_aafCwTable[m_uiCW2][uiSelector2];
-
-				m_afrgbaDecodedColors[uiPixel1] = (m_frgbaColor1 + fDeltaRGB1).ClampRGB();
-				m_afrgbaDecodedColors[uiPixel2] = (m_frgbaColor2 + fDeltaRGB2).ClampRGB();
-			}
-
-			m_fError1 = ptryBest1->m_fError;
-			m_fError2 = ptryBest2->m_fError;
-			m_boolSeverelyBentDifferentialColors = trys.m_boolSeverelyBentColors;
-			m_fError = m_fError1 + m_fError2;
-
-			// sanity check
-			{
-				int iRed1 = m_frgbaColor1.IntRed(31.0f);
-				int iGreen1 = m_frgbaColor1.IntGreen(31.0f);
-				int iBlue1 = m_frgbaColor1.IntBlue(31.0f);
-
-				int iRed2 = m_frgbaColor2.IntRed(31.0f);
-				int iGreen2 = m_frgbaColor2.IntGreen(31.0f);
-				int iBlue2 = m_frgbaColor2.IntBlue(31.0f);
-
-				iDRed = iRed2 - iRed1;
-				iDGreen = iGreen2 - iGreen1;
-				iDBlue = iBlue2 - iBlue1;
-
-				assert(iDRed >= -4 && iDRed < 4);
-				assert(iDGreen >= -4 && iDGreen < 4);
-				assert(iDBlue >= -4 && iDBlue < 4);
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try an ETC1 differential mode encoding for a half of a 4x4 block
-	// vary the basecolor components using a radius
-	//
-	void Block4x4Encoding_ETC1::TryDifferentialHalf(DifferentialTrys::Half *a_phalf)
-	{
-
-		a_phalf->m_ptryBest = nullptr;
-		float fBestTryError = FLT_MAX;
-
-		a_phalf->m_uiTrys = 0;
-		for (int iRed = a_phalf->m_iRed - (int)a_phalf->m_uiRadius; 
-				iRed <= a_phalf->m_iRed + (int)a_phalf->m_uiRadius;
-				iRed++)
-		{
-			assert(iRed >= 0 && iRed <= 31);
-
-			for (int iGreen = a_phalf->m_iGreen - (int)a_phalf->m_uiRadius;
-					iGreen <= a_phalf->m_iGreen + (int)a_phalf->m_uiRadius;
-					iGreen++)
-			{
-				assert(iGreen >= 0 && iGreen <= 31);
-
-				for (int iBlue = a_phalf->m_iBlue - (int)a_phalf->m_uiRadius;
-						iBlue <= a_phalf->m_iBlue + (int)a_phalf->m_uiRadius;
-						iBlue++)
-				{
-					assert(iBlue >= 0 && iBlue <= 31);
-
-					DifferentialTrys::Try *ptry = &a_phalf->m_atry[a_phalf->m_uiTrys];
-					assert(ptry < &a_phalf->m_atry[DifferentialTrys::Half::MAX_TRYS]);
-
-					ptry->m_iRed = iRed;
-					ptry->m_iGreen = iGreen;
-					ptry->m_iBlue = iBlue;
-					ptry->m_fError = FLT_MAX;
-					ColorFloatRGBA frgbaColor = ColorFloatRGBA::ConvertFromRGB5((unsigned char)iRed, (unsigned char)iGreen, (unsigned char)iBlue);
-
-					// try each CW
-					for (unsigned int uiCW = 0; uiCW < CW_RANGES; uiCW++)
-					{
-						unsigned int auiPixelSelectors[PIXELS / 2];
-						ColorFloatRGBA	afrgbaDecodedPixels[PIXELS / 2];
-						float afPixelErrors[PIXELS / 2] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, 
-															FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX };
-
-						// pre-compute decoded pixels for each selector
-						ColorFloatRGBA afrgbaSelectors[SELECTORS];
-						assert(SELECTORS == 4);
-						afrgbaSelectors[0] = (frgbaColor + s_aafCwTable[uiCW][0]).ClampRGB();
-						afrgbaSelectors[1] = (frgbaColor + s_aafCwTable[uiCW][1]).ClampRGB();
-						afrgbaSelectors[2] = (frgbaColor + s_aafCwTable[uiCW][2]).ClampRGB();
-						afrgbaSelectors[3] = (frgbaColor + s_aafCwTable[uiCW][3]).ClampRGB();
-
-						for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++)
-						{
-							ColorFloatRGBA *pfrgbaSourcePixel = &m_pafrgbaSource[a_phalf->m_pauiPixelMapping[uiPixel]];
-							ColorFloatRGBA frgbaDecodedPixel;
-
-							for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++)
-							{
-								frgbaDecodedPixel = afrgbaSelectors[uiSelector];
-
-								float fPixelError;
-
-								fPixelError = CalcPixelError(frgbaDecodedPixel, m_afDecodedAlphas[a_phalf->m_pauiPixelMapping[uiPixel]],
-																	*pfrgbaSourcePixel);
-
-								if (fPixelError < afPixelErrors[uiPixel])
-								{
-									auiPixelSelectors[uiPixel] = uiSelector;
-									afrgbaDecodedPixels[uiPixel] = frgbaDecodedPixel;
-									afPixelErrors[uiPixel] = fPixelError;
-								}
-
-							}
-						}
-
-						// add up all pixel errors
-						float fCWError = 0.0f;
-						for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++)
-						{	
-							fCWError += afPixelErrors[uiPixel];
-						}
-
-						// if best CW so far
-						if (fCWError < ptry->m_fError)
-						{
-							ptry->m_uiCW = uiCW;
-							for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++)
-							{
-								ptry->m_auiSelectors[uiPixel] = auiPixelSelectors[uiPixel];
-							}
-							ptry->m_fError = fCWError;
-						}
-
-					}
-
-					if (ptry->m_fError < fBestTryError)
-					{
-						a_phalf->m_ptryBest = ptry;
-						fBestTryError = ptry->m_fError;
-					}
-
-					assert(ptry->m_fError < FLT_MAX);
-
-					a_phalf->m_uiTrys++;
-				}
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try an ETC1 individual mode encoding
-	// use a_boolFlip to set the encoding F bit
-	// use a_uiRadius to alter basecolor components in the range[-a_uiRadius:a_uiRadius]
-	// replace the encoding if the encoding error is less than previous encoding
-	//
-	void Block4x4Encoding_ETC1::TryIndividual(bool a_boolFlip, unsigned int a_uiRadius)
-	{
-
-		ColorFloatRGBA frgbaColor1;
-		ColorFloatRGBA frgbaColor2;
-
-		const unsigned int *pauiPixelMapping1;
-		const unsigned int *pauiPixelMapping2;
-
-		if (a_boolFlip)
-		{
-			frgbaColor1 = m_frgbaSourceAverageTop;
-			frgbaColor2 = m_frgbaSourceAverageBottom;
-
-			pauiPixelMapping1 = s_auiTopPixelMapping;
-			pauiPixelMapping2 = s_auiBottomPixelMapping;
-		}
-		else
-		{
-			frgbaColor1 = m_frgbaSourceAverageLeft;
-			frgbaColor2 = m_frgbaSourceAverageRight;
-
-			pauiPixelMapping1 = s_auiLeftPixelMapping;
-			pauiPixelMapping2 = s_auiRightPixelMapping;
-		}
-
-		IndividualTrys trys(frgbaColor1, frgbaColor2, pauiPixelMapping1, pauiPixelMapping2, a_uiRadius);
-
-		Block4x4Encoding_ETC1 encodingTry = *this;
-		encodingTry.m_boolFlip = a_boolFlip;
-
-		encodingTry.TryIndividualHalf(&trys.m_half1);
-		encodingTry.TryIndividualHalf(&trys.m_half2);
-
-		// use the best of each half
-		IndividualTrys::Try *ptryBest1 = trys.m_half1.m_ptryBest;
-		IndividualTrys::Try *ptryBest2 = trys.m_half2.m_ptryBest;
-		encodingTry.m_fError = trys.m_half1.m_ptryBest->m_fError + trys.m_half2.m_ptryBest->m_fError;
-
-		if (encodingTry.m_fError < m_fError)
-		{
-			m_mode = MODE_ETC1;
-			m_boolDiff = false;
-			m_boolFlip = encodingTry.m_boolFlip;
-			m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)ptryBest1->m_iRed, (unsigned char)ptryBest1->m_iGreen, (unsigned char)ptryBest1->m_iBlue);
-			m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)ptryBest2->m_iRed, (unsigned char)ptryBest2->m_iGreen, (unsigned char)ptryBest2->m_iBlue);
-			m_uiCW1 = ptryBest1->m_uiCW;
-			m_uiCW2 = ptryBest2->m_uiCW;
-
-			for (unsigned int uiPixelOrder = 0; uiPixelOrder < PIXELS / 2; uiPixelOrder++)
-			{
-				unsigned int uiPixel1 = pauiPixelMapping1[uiPixelOrder];
-				unsigned int uiPixel2 = pauiPixelMapping2[uiPixelOrder];
-
-				unsigned int uiSelector1 = ptryBest1->m_auiSelectors[uiPixelOrder];
-				unsigned int uiSelector2 = ptryBest2->m_auiSelectors[uiPixelOrder];
-
-				m_auiSelectors[uiPixel1] = uiSelector1;
-				m_auiSelectors[uiPixel2] = ptryBest2->m_auiSelectors[uiPixelOrder];
-
-				float fDeltaRGB1 = s_aafCwTable[m_uiCW1][uiSelector1];
-				float fDeltaRGB2 = s_aafCwTable[m_uiCW2][uiSelector2];
-
-				m_afrgbaDecodedColors[uiPixel1] = (m_frgbaColor1 + fDeltaRGB1).ClampRGB();
-				m_afrgbaDecodedColors[uiPixel2] = (m_frgbaColor2 + fDeltaRGB2).ClampRGB();
-			}
-
-			m_fError1 = ptryBest1->m_fError;
-			m_fError2 = ptryBest2->m_fError;
-			m_fError = m_fError1 + m_fError2;
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try an ETC1 differential mode encoding for a half of a 4x4 block
-	// vary the basecolor components using a radius
-	//
-	void Block4x4Encoding_ETC1::TryIndividualHalf(IndividualTrys::Half *a_phalf)
-	{
-
-		a_phalf->m_ptryBest = nullptr;
-		float fBestTryError = FLT_MAX;
-
-		a_phalf->m_uiTrys = 0;
-		for (int iRed = a_phalf->m_iRed - (int)a_phalf->m_uiRadius;
-			iRed <= a_phalf->m_iRed + (int)a_phalf->m_uiRadius;
-			iRed++)
-		{
-			assert(iRed >= 0 && iRed <= 15);
-
-			for (int iGreen = a_phalf->m_iGreen - (int)a_phalf->m_uiRadius;
-				iGreen <= a_phalf->m_iGreen + (int)a_phalf->m_uiRadius;
-				iGreen++)
-			{
-				assert(iGreen >= 0 && iGreen <= 15);
-
-				for (int iBlue = a_phalf->m_iBlue - (int)a_phalf->m_uiRadius;
-					iBlue <= a_phalf->m_iBlue + (int)a_phalf->m_uiRadius;
-					iBlue++)
-				{
-					assert(iBlue >= 0 && iBlue <= 15);
-
-					IndividualTrys::Try *ptry = &a_phalf->m_atry[a_phalf->m_uiTrys];
-					assert(ptry < &a_phalf->m_atry[IndividualTrys::Half::MAX_TRYS]);
-
-					ptry->m_iRed = iRed;
-					ptry->m_iGreen = iGreen;
-					ptry->m_iBlue = iBlue;
-					ptry->m_fError = FLT_MAX;
-					ColorFloatRGBA frgbaColor = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed, (unsigned char)iGreen, (unsigned char)iBlue);
-
-					// try each CW
-					for (unsigned int uiCW = 0; uiCW < CW_RANGES; uiCW++)
-					{
-						unsigned int auiPixelSelectors[PIXELS / 2];
-						ColorFloatRGBA	afrgbaDecodedPixels[PIXELS / 2];
-						float afPixelErrors[PIXELS / 2] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
-															FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX };
-
-						// pre-compute decoded pixels for each selector
-						ColorFloatRGBA afrgbaSelectors[SELECTORS];
-						assert(SELECTORS == 4);
-						afrgbaSelectors[0] = (frgbaColor + s_aafCwTable[uiCW][0]).ClampRGB();
-						afrgbaSelectors[1] = (frgbaColor + s_aafCwTable[uiCW][1]).ClampRGB();
-						afrgbaSelectors[2] = (frgbaColor + s_aafCwTable[uiCW][2]).ClampRGB();
-						afrgbaSelectors[3] = (frgbaColor + s_aafCwTable[uiCW][3]).ClampRGB();
-
-						for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++)
-						{
-							ColorFloatRGBA *pfrgbaSourcePixel = &m_pafrgbaSource[a_phalf->m_pauiPixelMapping[uiPixel]];
-							ColorFloatRGBA frgbaDecodedPixel;
-
-							for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++)
-							{
-								frgbaDecodedPixel = afrgbaSelectors[uiSelector];
-
-								float fPixelError;
-
-								fPixelError = CalcPixelError(frgbaDecodedPixel, m_afDecodedAlphas[a_phalf->m_pauiPixelMapping[uiPixel]],
-										*pfrgbaSourcePixel);
-
-								if (fPixelError < afPixelErrors[uiPixel])
-								{
-									auiPixelSelectors[uiPixel] = uiSelector;
-									afrgbaDecodedPixels[uiPixel] = frgbaDecodedPixel;
-									afPixelErrors[uiPixel] = fPixelError;
-								}
-
-							}
-						}
-
-						// add up all pixel errors
-						float fCWError = 0.0f;
-						for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++)
-						{
-							fCWError += afPixelErrors[uiPixel];
-						}
-
-						// if best CW so far
-						if (fCWError < ptry->m_fError)
-						{
-							ptry->m_uiCW = uiCW;
-							for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++)
-							{
-								ptry->m_auiSelectors[uiPixel] = auiPixelSelectors[uiPixel];
-							}
-							ptry->m_fError = fCWError;
-						}
-
-					}
-
-					if (ptry->m_fError < fBestTryError)
-					{
-						a_phalf->m_ptryBest = ptry;
-						fBestTryError = ptry->m_fError;
-					}
-
-					assert(ptry->m_fError < FLT_MAX);
-
-					a_phalf->m_uiTrys++;
-				}
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try version 1 of the degenerate search
-	// degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings
-	// each subsequent version of the degenerate search uses more basecolor movement and is less likely to
-	//		be successfull
-	//
-	void Block4x4Encoding_ETC1::TryDegenerates1(void)
-	{
-
-		TryDifferential(m_boolMostLikelyFlip, 1, -2, 0);
-		TryDifferential(m_boolMostLikelyFlip, 1, 2, 0);
-		TryDifferential(m_boolMostLikelyFlip, 1, 0, 2);
-		TryDifferential(m_boolMostLikelyFlip, 1, 0, -2);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try version 2 of the degenerate search
-	// degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings
-	// each subsequent version of the degenerate search uses more basecolor movement and is less likely to
-	//		be successfull
-	//
-	void Block4x4Encoding_ETC1::TryDegenerates2(void)
-	{
-
-		TryDifferential(!m_boolMostLikelyFlip, 1, -2, 0);
-		TryDifferential(!m_boolMostLikelyFlip, 1, 2, 0);
-		TryDifferential(!m_boolMostLikelyFlip, 1, 0, 2);
-		TryDifferential(!m_boolMostLikelyFlip, 1, 0, -2);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try version 3 of the degenerate search
-	// degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings
-	// each subsequent version of the degenerate search uses more basecolor movement and is less likely to
-	//		be successfull
-	//
-	void Block4x4Encoding_ETC1::TryDegenerates3(void)
-	{
-
-		TryDifferential(m_boolMostLikelyFlip, 1, -2, -2);
-		TryDifferential(m_boolMostLikelyFlip, 1, -2, 2);
-		TryDifferential(m_boolMostLikelyFlip, 1, 2, -2);
-		TryDifferential(m_boolMostLikelyFlip, 1, 2, 2);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try version 4 of the degenerate search
-	// degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings
-	// each subsequent version of the degenerate search uses more basecolor movement and is less likely to
-	//		be successfull
-	//
-	void Block4x4Encoding_ETC1::TryDegenerates4(void)
-	{
-
-		TryDifferential(m_boolMostLikelyFlip, 1, -4, 0);
-		TryDifferential(m_boolMostLikelyFlip, 1, 4, 0);
-		TryDifferential(m_boolMostLikelyFlip, 1, 0, 4);
-		TryDifferential(m_boolMostLikelyFlip, 1, 0, -4);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// find the best selector for each pixel based on a particular basecolor and CW that have been previously set
-	// calculate the selectors for each half of the block separately
-	// set the block error as the sum of each half's error
-	//
-	void Block4x4Encoding_ETC1::CalculateSelectors()
-	{
-		if (m_boolFlip)
-		{
-			CalculateHalfOfTheSelectors(0, s_auiTopPixelMapping);
-			CalculateHalfOfTheSelectors(1, s_auiBottomPixelMapping);
-		}
-		else
-		{
-			CalculateHalfOfTheSelectors(0, s_auiLeftPixelMapping);
-			CalculateHalfOfTheSelectors(1, s_auiRightPixelMapping);
-		}
-
-		m_fError = m_fError1 + m_fError2;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// choose best selectors for half of the block
-	// calculate the error for half of the block
-	//
-	void Block4x4Encoding_ETC1::CalculateHalfOfTheSelectors(unsigned int a_uiHalf,
-		const unsigned int *pauiPixelMapping)
-	{
-		static const bool DEBUG_PRINT = false;
-
-		ColorFloatRGBA *pfrgbaColor = a_uiHalf ? &m_frgbaColor2 : &m_frgbaColor1;
-		unsigned int *puiCW = a_uiHalf ? &m_uiCW2 : &m_uiCW1;
-
-		float *pfHalfError = a_uiHalf ? &m_fError2 : &m_fError1;
-		*pfHalfError = FLT_MAX;
-
-		// try each CW
-		for (unsigned int uiCW = 0; uiCW < CW_RANGES; uiCW++)
-		{
-			if (DEBUG_PRINT)
-			{
-				printf("\ncw=%u\n", uiCW);
-			}
-
-			unsigned int auiPixelSelectors[PIXELS / 2];
-			ColorFloatRGBA	afrgbaDecodedPixels[PIXELS / 2];
-			float afPixelErrors[PIXELS / 2] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX };
-
-			for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++)
-			{
-				if (DEBUG_PRINT)
-				{
-					printf("\tsource [%.2f,%.2f,%.2f]\n", m_pafrgbaSource[pauiPixelMapping[uiPixel]].fR,
-						m_pafrgbaSource[pauiPixelMapping[uiPixel]].fG, m_pafrgbaSource[pauiPixelMapping[uiPixel]].fB);
-				}
-
-				ColorFloatRGBA *pfrgbaSourcePixel = &m_pafrgbaSource[pauiPixelMapping[uiPixel]];
-				ColorFloatRGBA frgbaDecodedPixel;
-
-				for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++)
-				{
-					float fDeltaRGB = s_aafCwTable[uiCW][uiSelector];
-
-					frgbaDecodedPixel = (*pfrgbaColor + fDeltaRGB).ClampRGB();
-
-					float fPixelError;
-					
-					fPixelError = CalcPixelError(frgbaDecodedPixel, m_afDecodedAlphas[pauiPixelMapping[uiPixel]],
-														*pfrgbaSourcePixel);
-					
-					if (DEBUG_PRINT)
-					{
-						printf("\tpixel %u, index %u [%.2f,%.2f,%.2f], error %.2f", uiPixel, uiSelector,
-							frgbaDecodedPixel.fR,
-							frgbaDecodedPixel.fG,
-							frgbaDecodedPixel.fB,
-							fPixelError);
-					}
-
-					if (fPixelError < afPixelErrors[uiPixel])
-					{
-						if (DEBUG_PRINT)
-						{
-							printf(" *");
-						}
-
-						auiPixelSelectors[uiPixel] = uiSelector;
-						afrgbaDecodedPixels[uiPixel] = frgbaDecodedPixel;
-						afPixelErrors[uiPixel] = fPixelError;
-					}
-
-					if (DEBUG_PRINT)
-					{
-						printf("\n");
-					}
-				}
-			}
-
-			// add up all pixel errors
-			float fCWError = 0.0f;
-			for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++)
-			{
-				fCWError += afPixelErrors[uiPixel];
-			}
-			if (DEBUG_PRINT)
-			{
-				printf("\terror %.2f\n", fCWError);
-			}
-
-			// if best CW so far
-			if (fCWError < *pfHalfError)
-			{
-				*pfHalfError = fCWError;
-				*puiCW = uiCW;
-				for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++)
-				{
-					m_auiSelectors[pauiPixelMapping[uiPixel]] = auiPixelSelectors[uiPixel];
-					m_afrgbaDecodedColors[pauiPixelMapping[uiPixel]] = afrgbaDecodedPixels[uiPixel];
-				}
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state
-	//
-	void Block4x4Encoding_ETC1::SetEncodingBits(void)
-	{
-		assert(m_mode == MODE_ETC1);
-
-		if (m_boolDiff)
-		{
-			int iRed1 = m_frgbaColor1.IntRed(31.0f);
-			int iGreen1 = m_frgbaColor1.IntGreen(31.0f);
-			int iBlue1 = m_frgbaColor1.IntBlue(31.0f);
-
-			int iRed2 = m_frgbaColor2.IntRed(31.0f);
-			int iGreen2 = m_frgbaColor2.IntGreen(31.0f);
-			int iBlue2 = m_frgbaColor2.IntBlue(31.0f);
-
-			int iDRed2 = iRed2 - iRed1;
-			int iDGreen2 = iGreen2 - iGreen1;
-			int iDBlue2 = iBlue2 - iBlue1;
-
-			assert(iDRed2 >= -4 && iDRed2 < 4);
-			assert(iDGreen2 >= -4 && iDGreen2 < 4);
-			assert(iDBlue2 >= -4 && iDBlue2 < 4);
-
-			m_pencodingbitsRGB8->differential.red1 = (unsigned int)iRed1;
-			m_pencodingbitsRGB8->differential.green1 = (unsigned int)iGreen1;
-			m_pencodingbitsRGB8->differential.blue1 = (unsigned int)iBlue1;
-
-			m_pencodingbitsRGB8->differential.dred2 = iDRed2;
-			m_pencodingbitsRGB8->differential.dgreen2 = iDGreen2;
-			m_pencodingbitsRGB8->differential.dblue2 = iDBlue2;
-		}
-		else
-		{
-			m_pencodingbitsRGB8->individual.red1 = (unsigned int)m_frgbaColor1.IntRed(15.0f);
-			m_pencodingbitsRGB8->individual.green1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f);
-			m_pencodingbitsRGB8->individual.blue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f);
-
-			m_pencodingbitsRGB8->individual.red2 = (unsigned int)m_frgbaColor2.IntRed(15.0f);
-			m_pencodingbitsRGB8->individual.green2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f);
-			m_pencodingbitsRGB8->individual.blue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f);
-		}
-
-		m_pencodingbitsRGB8->individual.cw1 = m_uiCW1;
-		m_pencodingbitsRGB8->individual.cw2 = m_uiCW2;
-
-		SetEncodingBits_Selectors();
-
-		m_pencodingbitsRGB8->individual.diff = (unsigned int)m_boolDiff;
-		m_pencodingbitsRGB8->individual.flip = (unsigned int)m_boolFlip;
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the selectors in the encoding bits
-	//
-	void Block4x4Encoding_ETC1::SetEncodingBits_Selectors(void)
-	{
-
-		m_pencodingbitsRGB8->individual.selectors = 0;
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			unsigned int uiSelector = m_auiSelectors[uiPixel];
-
-			// set index msb
-			m_pencodingbitsRGB8->individual.selectors |= (uiSelector >> 1) << (uiPixel ^ 8);
-
-			// set index lsb
-			m_pencodingbitsRGB8->individual.selectors |= (uiSelector & 1) << ((16 + uiPixel) ^ 8);
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the decoded colors and decoded alpha based on the encoding state
-	//
-	void Block4x4Encoding_ETC1::Decode(void)
-	{
-
-		const unsigned int *pauiPixelOrder = m_boolFlip ? s_auiPixelOrderFlip1 : s_auiPixelOrderFlip0;
-
-		for (unsigned int uiPixelOrder = 0; uiPixelOrder < PIXELS; uiPixelOrder++)
-		{
-			ColorFloatRGBA *pfrgbaCenter = uiPixelOrder < 8 ? &m_frgbaColor1 : &m_frgbaColor2;
-			unsigned int uiCW = uiPixelOrder < 8 ? m_uiCW1 : m_uiCW2;
-
-			unsigned int uiPixel = pauiPixelOrder[uiPixelOrder];
-
-			float fDelta = s_aafCwTable[uiCW][m_auiSelectors[uiPixel]];
-			m_afrgbaDecodedColors[uiPixel] = (*pfrgbaCenter + fDelta).ClampRGB();
-			m_afDecodedAlphas[uiPixel] = 1.0f;
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_ETC1.h b/thirdparty/etc2comp/EtcBlock4x4Encoding_ETC1.h
deleted file mode 100644
index c0dc84d5d5..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding_ETC1.h
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcBlock4x4Encoding.h"
-#include "EtcBlock4x4EncodingBits.h"
-#include "EtcDifferentialTrys.h"
-#include "EtcIndividualTrys.h"
-
-namespace Etc
-{
-
-	// base class for Block4x4Encoding_RGB8
-	class Block4x4Encoding_ETC1 : public Block4x4Encoding
-	{
-	public:
-
-		Block4x4Encoding_ETC1(void);
-		virtual ~Block4x4Encoding_ETC1(void);
-
-		virtual void InitFromSource(Block4x4 *a_pblockParent,
-									ColorFloatRGBA *a_pafrgbaSource,
-
-									unsigned char *a_paucEncodingBits,
-									ErrorMetric a_errormetric);
-
-		virtual void InitFromEncodingBits(Block4x4 *a_pblockParent,
-											unsigned char *a_paucEncodingBits,
-											ColorFloatRGBA *a_pafrgbaSource, 
-
-											ErrorMetric a_errormetric);
-
-		virtual void PerformIteration(float a_fEffort);
-
-		inline virtual bool GetFlip(void)
-		{
-			return m_boolFlip;
-		}
-
-		inline virtual bool IsDifferential(void)
-		{
-			return m_boolDiff;
-		}
-
-		virtual void SetEncodingBits(void);
-
-		void Decode(void);
-
-		inline ColorFloatRGBA GetColor1(void) const
-		{
-			return m_frgbaColor1;
-		}
-
-		inline ColorFloatRGBA GetColor2(void) const
-		{
-			return m_frgbaColor2;
-		}
-
-		inline const unsigned int * GetSelectors(void) const
-		{
-			return m_auiSelectors;
-		}
-
-		inline unsigned int GetCW1(void) const
-		{
-			return m_uiCW1;
-		}
-
-		inline unsigned int GetCW2(void) const
-		{
-			return m_uiCW2;
-		}
-
-		inline bool HasSeverelyBentDifferentialColors(void) const
-		{
-			return m_boolSeverelyBentDifferentialColors;
-		}
-
-	protected:
-
-		static const unsigned int s_auiPixelOrderFlip0[PIXELS];
-		static const unsigned int s_auiPixelOrderFlip1[PIXELS];
-		static const unsigned int s_auiPixelOrderHScan[PIXELS];
-
-		static const unsigned int s_auiLeftPixelMapping[8];
-		static const unsigned int s_auiRightPixelMapping[8];
-		static const unsigned int s_auiTopPixelMapping[8];
-		static const unsigned int s_auiBottomPixelMapping[8];
-
-		static const unsigned int SELECTOR_BITS = 2;
-		static const unsigned int SELECTORS = 1 << SELECTOR_BITS;
-
-		static const unsigned int CW_BITS = 3;
-		static const unsigned int CW_RANGES = 1 << CW_BITS;
-
-		static float s_aafCwTable[CW_RANGES][SELECTORS];
-		static unsigned char s_aucDifferentialCwRange[256];
-
-		static const int MAX_DIFFERENTIAL = 3;
-		static const int MIN_DIFFERENTIAL = -4;
-
-		void InitFromEncodingBits_Selectors(void);
-
-		void PerformFirstIteration(void);
-		void CalculateMostLikelyFlip(void);
-
-		void TryDifferential(bool a_boolFlip, unsigned int a_uiRadius,
-								int a_iGrayOffset1, int a_iGrayOffset2);
-		void TryDifferentialHalf(DifferentialTrys::Half *a_phalf);
-
-		void TryIndividual(bool a_boolFlip, unsigned int a_uiRadius);
-		void TryIndividualHalf(IndividualTrys::Half *a_phalf);
-
-		void TryDegenerates1(void);
-		void TryDegenerates2(void);
-		void TryDegenerates3(void);
-		void TryDegenerates4(void);
-
-		void CalculateSelectors();
-		void CalculateHalfOfTheSelectors(unsigned int a_uiHalf,
-											const unsigned int *pauiPixelMapping);
-
-		// calculate the distance2 of r_frgbaPixel from r_frgbaTarget's gray line
-		inline float CalcGrayDistance2(ColorFloatRGBA &r_frgbaPixel, 
-										ColorFloatRGBA &r_frgbaTarget)
-		{
-			float fDeltaGray = ((r_frgbaPixel.fR - r_frgbaTarget.fR) +
-								(r_frgbaPixel.fG - r_frgbaTarget.fG) +
-								(r_frgbaPixel.fB - r_frgbaTarget.fB)) / 3.0f;
-
-			ColorFloatRGBA frgbaPointOnGrayLine = (r_frgbaTarget + fDeltaGray).ClampRGB();
-
-			float fDR = r_frgbaPixel.fR - frgbaPointOnGrayLine.fR;
-			float fDG = r_frgbaPixel.fG - frgbaPointOnGrayLine.fG;
-			float fDB = r_frgbaPixel.fB - frgbaPointOnGrayLine.fB;
-
-			return (fDR*fDR) + (fDG*fDG) + (fDB*fDB);
-		}
-
-		void SetEncodingBits_Selectors(void);
-
-		// intermediate encoding
-		bool			m_boolDiff;
-		bool			m_boolFlip;
-		ColorFloatRGBA	m_frgbaColor1;
-		ColorFloatRGBA	m_frgbaColor2;
-		unsigned int	m_uiCW1;
-		unsigned int	m_uiCW2;
-		unsigned int	m_auiSelectors[PIXELS];
-
-		// state shared between iterations
-		ColorFloatRGBA	m_frgbaSourceAverageLeft;
-		ColorFloatRGBA	m_frgbaSourceAverageRight;
-		ColorFloatRGBA	m_frgbaSourceAverageTop;
-		ColorFloatRGBA	m_frgbaSourceAverageBottom;
-		bool			m_boolMostLikelyFlip;
-
-		// stats
-		float			m_fError1;	// error for Etc1 half 1
-		float			m_fError2;	// error for Etc1 half 2
-		bool			m_boolSeverelyBentDifferentialColors;	// only valid if m_boolDiff;
-
-		// final encoding
-		Block4x4EncodingBits_RGB8 *m_pencodingbitsRGB8;		// or RGB8 portion of Block4x4EncodingBits_RGB8A8
-
-		private:
-
-		void CalculateSourceAverages(void);
-
-	};
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_R11.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_R11.cpp
deleted file mode 100644
index 4c012fbbf1..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding_R11.cpp
+++ /dev/null
@@ -1,429 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-EtcBlock4x4Encoding_R11.cpp
-
-Block4x4Encoding_R11 is the encoder to use when targetting file format R11 and SR11 (signed R11).  
-
-*/
-
-#include "EtcConfig.h"
-#include "EtcBlock4x4Encoding_R11.h"
-
-#include "EtcBlock4x4EncodingBits.h"
-#include "EtcBlock4x4.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <float.h>
-#include <limits>
-
-namespace Etc
-{
-
-	// modifier values to use for R11, SR11, RG11 and SRG11
-	float Block4x4Encoding_R11::s_aafModifierTable[MODIFIER_TABLE_ENTRYS][SELECTORS]
-	{
-		{ -3.0f / 255.0f, -6.0f / 255.0f,  -9.0f / 255.0f, -15.0f / 255.0f, 2.0f / 255.0f, 5.0f / 255.0f, 8.0f / 255.0f, 14.0f / 255.0f },
-		{ -3.0f / 255.0f, -7.0f / 255.0f, -10.0f / 255.0f, -13.0f / 255.0f, 2.0f / 255.0f, 6.0f / 255.0f, 9.0f / 255.0f, 12.0f / 255.0f },
-		{ -2.0f / 255.0f, -5.0f / 255.0f,  -8.0f / 255.0f, -13.0f / 255.0f, 1.0f / 255.0f, 4.0f / 255.0f, 7.0f / 255.0f, 12.0f / 255.0f },
-		{ -2.0f / 255.0f, -4.0f / 255.0f,  -6.0f / 255.0f, -13.0f / 255.0f, 1.0f / 255.0f, 3.0f / 255.0f, 5.0f / 255.0f, 12.0f / 255.0f },
-
-		{ -3.0f / 255.0f, -6.0f / 255.0f,  -8.0f / 255.0f, -12.0f / 255.0f, 2.0f / 255.0f, 5.0f / 255.0f, 7.0f / 255.0f, 11.0f / 255.0f },
-		{ -3.0f / 255.0f, -7.0f / 255.0f,  -9.0f / 255.0f, -11.0f / 255.0f, 2.0f / 255.0f, 6.0f / 255.0f, 8.0f / 255.0f, 10.0f / 255.0f },
-		{ -4.0f / 255.0f, -7.0f / 255.0f,  -8.0f / 255.0f, -11.0f / 255.0f, 3.0f / 255.0f, 6.0f / 255.0f, 7.0f / 255.0f, 10.0f / 255.0f },
-		{ -3.0f / 255.0f, -5.0f / 255.0f,  -8.0f / 255.0f, -11.0f / 255.0f, 2.0f / 255.0f, 4.0f / 255.0f, 7.0f / 255.0f, 10.0f / 255.0f },
-
-		{ -2.0f / 255.0f, -6.0f / 255.0f,  -8.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 5.0f / 255.0f, 7.0f / 255.0f,  9.0f / 255.0f },
-		{ -2.0f / 255.0f, -5.0f / 255.0f,  -8.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 4.0f / 255.0f, 7.0f / 255.0f,  9.0f / 255.0f },
-		{ -2.0f / 255.0f, -4.0f / 255.0f,  -8.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 3.0f / 255.0f, 7.0f / 255.0f,  9.0f / 255.0f },
-		{ -2.0f / 255.0f, -5.0f / 255.0f,  -7.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 4.0f / 255.0f, 6.0f / 255.0f,  9.0f / 255.0f },
-
-		{ -3.0f / 255.0f, -4.0f / 255.0f,  -7.0f / 255.0f, -10.0f / 255.0f, 2.0f / 255.0f, 3.0f / 255.0f, 6.0f / 255.0f,  9.0f / 255.0f },
-		{ -1.0f / 255.0f, -2.0f / 255.0f,  -3.0f / 255.0f, -10.0f / 255.0f, 0.0f / 255.0f, 1.0f / 255.0f, 2.0f / 255.0f,  9.0f / 255.0f },
-		{ -4.0f / 255.0f, -6.0f / 255.0f,  -8.0f / 255.0f,  -9.0f / 255.0f, 3.0f / 255.0f, 5.0f / 255.0f, 7.0f / 255.0f,  8.0f / 255.0f },
-		{ -3.0f / 255.0f, -5.0f / 255.0f,  -7.0f / 255.0f,  -9.0f / 255.0f, 2.0f / 255.0f, 4.0f / 255.0f, 6.0f / 255.0f,  8.0f / 255.0f }
-	};
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	Block4x4Encoding_R11::Block4x4Encoding_R11(void)
-	{
-
-		m_pencodingbitsR11 = nullptr;
-
-	}
-
-	Block4x4Encoding_R11::~Block4x4Encoding_R11(void) {}
-	// ----------------------------------------------------------------------------------------------------
-	// initialization prior to encoding
-	// a_pblockParent points to the block associated with this encoding
-	// a_errormetric is used to choose the best encoding
-	// a_pafrgbaSource points to a 4x4 block subset of the source image
-	// a_paucEncodingBits points to the final encoding bits
-	//
-	void Block4x4Encoding_R11::InitFromSource(Block4x4 *a_pblockParent,
-		ColorFloatRGBA *a_pafrgbaSource,
-		unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric)
-	{
-		Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource,a_errormetric);
-
-		m_pencodingbitsR11 = (Block4x4EncodingBits_R11 *)a_paucEncodingBits;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialization from the encoding bits of a previous encoding
-	// a_pblockParent points to the block associated with this encoding
-	// a_errormetric is used to choose the best encoding
-	// a_pafrgbaSource points to a 4x4 block subset of the source image
-	// a_paucEncodingBits points to the final encoding bits of a previous encoding
-	//
-	void Block4x4Encoding_R11::InitFromEncodingBits(Block4x4 *a_pblockParent,
-		unsigned char *a_paucEncodingBits,
-		ColorFloatRGBA *a_pafrgbaSource,
-		ErrorMetric a_errormetric)
-	{
-		m_pencodingbitsR11 = (Block4x4EncodingBits_R11 *)a_paucEncodingBits;
-
-		// init RGB portion
-		Block4x4Encoding_RGB8::InitFromEncodingBits(a_pblockParent,
-			(unsigned char *)m_pencodingbitsR11,
-			a_pafrgbaSource,
-			a_errormetric);
-
-		// init R11 portion
-		{
-			m_mode = MODE_R11;
-			if (a_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_R11 || a_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11)
-			{
-				m_fRedBase = (float)(signed char)m_pencodingbitsR11->data.base;
-			}
-			else
-			{
-				m_fRedBase = (float)(unsigned char)m_pencodingbitsR11->data.base;
-			}
-			m_fRedMultiplier = (float)m_pencodingbitsR11->data.multiplier;
-			m_uiRedModifierTableIndex = m_pencodingbitsR11->data.table;
-
-			unsigned long long int ulliSelectorBits = 0;
-			ulliSelectorBits |= (unsigned long long int)m_pencodingbitsR11->data.selectors0 << 40;
-			ulliSelectorBits |= (unsigned long long int)m_pencodingbitsR11->data.selectors1 << 32;
-			ulliSelectorBits |= (unsigned long long int)m_pencodingbitsR11->data.selectors2 << 24;
-			ulliSelectorBits |= (unsigned long long int)m_pencodingbitsR11->data.selectors3 << 16;
-			ulliSelectorBits |= (unsigned long long int)m_pencodingbitsR11->data.selectors4 << 8;
-			ulliSelectorBits |= (unsigned long long int)m_pencodingbitsR11->data.selectors5;
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				unsigned int uiShift = 45 - (3 * uiPixel);
-				m_auiRedSelectors[uiPixel] = (ulliSelectorBits >> uiShift) & (SELECTORS - 1);
-			}
-
-			// decode the red channel
-			// calc red error
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				float fDecodedPixelData = 0.0f;
-				if (a_pblockParent->GetImageSource()->GetFormat() == Image::Format::R11 || a_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11)
-				{
-					fDecodedPixelData = DecodePixelRed(m_fRedBase, m_fRedMultiplier,
-						m_uiRedModifierTableIndex,
-						m_auiRedSelectors[uiPixel]);
-				}
-				else if (a_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_R11 || a_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11)
-				{
-					fDecodedPixelData = DecodePixelRed(m_fRedBase + 128, m_fRedMultiplier,
-						m_uiRedModifierTableIndex,
-						m_auiRedSelectors[uiPixel]);
-				}
-				else
-				{
-					assert(0);
-				}
-				m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(fDecodedPixelData, 0.0f, 0.0f, 1.0f);
-			}
-			CalcBlockError();
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// perform a single encoding iteration
-	// replace the encoding if a better encoding was found
-	// subsequent iterations generally take longer for each iteration
-	// set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort
-	//
-	void Block4x4Encoding_R11::PerformIteration(float a_fEffort)
-	{
-		assert(!m_boolDone);
-		m_mode = MODE_R11;
-
-		switch (m_uiEncodingIterations)
-		{
-		case 0:
-			m_fError = FLT_MAX;
-			m_fRedBlockError = FLT_MAX;		// artificially high value
-			CalculateR11(8, 0.0f, 0.0f);
-			m_fError = m_fRedBlockError;
-			break;
-
-		case 1:
-			CalculateR11(8, 2.0f, 1.0f);
-			m_fError = m_fRedBlockError;
-			if (a_fEffort <= 24.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 2:
-			CalculateR11(8, 12.0f, 1.0f);
-			m_fError = m_fRedBlockError;
-			if (a_fEffort <= 49.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 3:
-			CalculateR11(7, 6.0f, 1.0f);
-			m_fError = m_fRedBlockError;
-			break;
-
-		case 4:
-			CalculateR11(6, 3.0f, 1.0f);
-			m_fError = m_fRedBlockError;
-			break;
-
-		case 5:
-			CalculateR11(5, 1.0f, 0.0f);
-			m_fError = m_fRedBlockError;
-			m_boolDone = true;
-			break;
-
-		default:
-			assert(0);
-			break;
-		}
-
-		m_uiEncodingIterations++;
-		SetDoneIfPerfect();
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// find the best combination of base color, multiplier and selectors
-	//
-	// a_uiSelectorsUsed limits the number of selector combinations to try
-	// a_fBaseRadius limits the range of base colors to try
-	// a_fMultiplierRadius limits the range of multipliers to try
-	//
-	void Block4x4Encoding_R11::CalculateR11(unsigned int a_uiSelectorsUsed, 
-												float a_fBaseRadius, float a_fMultiplierRadius)
-	{
-		// maps from virtual (monotonic) selector to ETC selector
-		static const unsigned int auiVirtualSelectorMap[8] = {3, 2, 1, 0, 4, 5, 6, 7};
-
-		// find min/max red
-		float fMinRed = 1.0f;
-		float fMaxRed = 0.0f;
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			// ignore border pixels
-			float fAlpha = m_pafrgbaSource[uiPixel].fA;
-			if (isnan(fAlpha))
-			{
-				continue;
-			}
-
-			float fRed = m_pafrgbaSource[uiPixel].fR;
-
-			if (fRed < fMinRed)
-			{
-				fMinRed = fRed;
-			}
-			if (fRed > fMaxRed)
-			{
-				fMaxRed = fRed;
-			}
-		}
-		assert(fMinRed <= fMaxRed);
-
-		float fRedRange = (fMaxRed - fMinRed);
-
-		// try each modifier table entry							  
-		for (unsigned int uiTableEntry = 0; uiTableEntry < MODIFIER_TABLE_ENTRYS; uiTableEntry++)
-		{
-			for (unsigned int uiMinVirtualSelector = 0; 
-					uiMinVirtualSelector <= (8- a_uiSelectorsUsed); 
-					uiMinVirtualSelector++)
-			{
-				unsigned int uiMaxVirtualSelector = uiMinVirtualSelector + a_uiSelectorsUsed - 1;
-
-				unsigned int uiMinSelector = auiVirtualSelectorMap[uiMinVirtualSelector];
-				unsigned int uiMaxSelector = auiVirtualSelectorMap[uiMaxVirtualSelector];
-
-				float fTableEntryCenter = -s_aafModifierTable[uiTableEntry][uiMinSelector];
-
-				float fTableEntryRange = s_aafModifierTable[uiTableEntry][uiMaxSelector] -
-											s_aafModifierTable[uiTableEntry][uiMinSelector];
-
-				float fCenterRatio = fTableEntryCenter / fTableEntryRange;
-
-				float fCenter = fMinRed + fCenterRatio*fRedRange;
-				fCenter = roundf(255.0f * fCenter) / 255.0f;
-
-				float fMinBase = fCenter - (a_fBaseRadius / 255.0f);
-				if (fMinBase < 0.0f)
-				{
-					fMinBase = 0.0f;
-				}
-
-				float fMaxBase = fCenter + (a_fBaseRadius / 255.0f);
-				if (fMaxBase > 1.0f)
-				{
-					fMaxBase = 1.0f;
-				}
-
-				for (float fBase = fMinBase; fBase <= fMaxBase; fBase += (0.999999f / 255.0f))
-				{
-					float fRangeMultiplier = roundf(fRedRange / fTableEntryRange);
-
-					float fMinMultiplier = fRangeMultiplier - a_fMultiplierRadius;
-					if (fMinMultiplier < 1.0f)
-					{
-						fMinMultiplier = 0.0f;
-					}
-					else if (fMinMultiplier > 15.0f)
-					{
-						fMinMultiplier = 15.0f;
-					}
-
-					float fMaxMultiplier = fRangeMultiplier + a_fMultiplierRadius;
-					if (fMaxMultiplier < 1.0f)
-					{
-						fMaxMultiplier = 1.0f;
-					}
-					else if (fMaxMultiplier > 15.0f)
-					{
-						fMaxMultiplier = 15.0f;
-					}
-
-					for (float fMultiplier = fMinMultiplier; fMultiplier <= fMaxMultiplier; fMultiplier += 1.0f)
-					{
-						// find best selector for each pixel
-						unsigned int auiBestSelectors[PIXELS];
-						float afBestRedError[PIXELS];
-						float afBestPixelRed[PIXELS];
-
-						for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-						{
-							float fBestPixelRedError = FLT_MAX;
-
-							for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++)
-							{
-								float fPixelRed = DecodePixelRed(fBase * 255.0f, fMultiplier, uiTableEntry, uiSelector);
-
-								ColorFloatRGBA frgba(fPixelRed, m_pafrgbaSource[uiPixel].fG,0.0f,1.0f);
-
-								float fPixelRedError = CalcPixelError(frgba, 1.0f, m_pafrgbaSource[uiPixel]);
-
-								if (fPixelRedError < fBestPixelRedError)
-								{
-									fBestPixelRedError = fPixelRedError;
-									auiBestSelectors[uiPixel] = uiSelector;
-									afBestRedError[uiPixel] = fBestPixelRedError;
-									afBestPixelRed[uiPixel] = fPixelRed;
-								}
-							}
-						}
-						float fBlockError = 0.0f;  
-						for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-						{
-							fBlockError += afBestRedError[uiPixel];
-						}
-						if (fBlockError < m_fRedBlockError)
-						{
-							m_fRedBlockError = fBlockError;
-
-							if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::R11 || m_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11)
-							{
-								m_fRedBase = 255.0f * fBase;
-							}
-							else if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_R11 || m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11)
-							{
-								m_fRedBase = (fBase * 255) - 128;
-							}
-							else
-							{
-								assert(0);
-							}
-							m_fRedMultiplier = fMultiplier;
-							m_uiRedModifierTableIndex = uiTableEntry;
-
-							for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-							{
-								m_auiRedSelectors[uiPixel] = auiBestSelectors[uiPixel];
-								float fBestPixelRed = afBestPixelRed[uiPixel];
-								m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(fBestPixelRed, 0.0f, 0.0f, 1.0f);
-								m_afDecodedAlphas[uiPixel] = 1.0f;
-							}
-						}
-					}
-				}
-
-			}
-		}
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state
-	//
-	void Block4x4Encoding_R11::SetEncodingBits(void)
-	{
-		if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::R11 || m_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11)
-		{
-			m_pencodingbitsR11->data.base = (unsigned char)roundf(m_fRedBase);
-		}
-		else if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_R11 || m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11)
-		{
-			m_pencodingbitsR11->data.base = (signed char)roundf(m_fRedBase);
-		}
-		else
-		{
-			assert(0);
-		}
-		m_pencodingbitsR11->data.table = m_uiRedModifierTableIndex;
-		m_pencodingbitsR11->data.multiplier = (unsigned char)roundf(m_fRedMultiplier);
-
-		unsigned long long int ulliSelectorBits = 0;
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			unsigned int uiShift = 45 - (3 * uiPixel);
-			ulliSelectorBits |= ((unsigned long long int)m_auiRedSelectors[uiPixel]) << uiShift;
-		}
-
-		m_pencodingbitsR11->data.selectors0 = ulliSelectorBits >> 40;
-		m_pencodingbitsR11->data.selectors1 = ulliSelectorBits >> 32;
-		m_pencodingbitsR11->data.selectors2 = ulliSelectorBits >> 24;
-		m_pencodingbitsR11->data.selectors3 = ulliSelectorBits >> 16;
-		m_pencodingbitsR11->data.selectors4 = ulliSelectorBits >> 8;
-		m_pencodingbitsR11->data.selectors5 = ulliSelectorBits;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-}
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_R11.h b/thirdparty/etc2comp/EtcBlock4x4Encoding_R11.h
deleted file mode 100644
index b40c1e0036..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding_R11.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcBlock4x4Encoding_RGB8.h"
-
-namespace Etc
-{
-	class Block4x4EncodingBits_R11;
-
-	// ################################################################################
-	// Block4x4Encoding_R11
-	// ################################################################################
-
-	class Block4x4Encoding_R11 : public Block4x4Encoding_RGB8
-	{
-	public:
-
-		Block4x4Encoding_R11(void);
-		virtual ~Block4x4Encoding_R11(void);
-
-		virtual void InitFromSource(Block4x4 *a_pblockParent,
-			ColorFloatRGBA *a_pafrgbaSource,
-			unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric);
-
-		virtual void InitFromEncodingBits(Block4x4 *a_pblockParent,
-			unsigned char *a_paucEncodingBits,
-			ColorFloatRGBA *a_pafrgbaSource,
-			ErrorMetric a_errormetric);
-
-		virtual void PerformIteration(float a_fEffort);
-
-		virtual void SetEncodingBits(void);
-
-		inline float GetRedBase(void) const
-		{
-			return m_fRedBase;
-		}
-
-		inline float GetRedMultiplier(void) const
-		{
-			return m_fRedMultiplier;
-		}
-
-		inline int GetRedTableIndex(void) const
-		{
-			return m_uiRedModifierTableIndex;
-		}
-
-		inline const unsigned int * GetRedSelectors(void) const
-		{
-			return m_auiRedSelectors;
-		}
-
-	protected:
-
-		static const unsigned int MODIFIER_TABLE_ENTRYS = 16;
-		static const unsigned int SELECTOR_BITS = 3;
-		static const unsigned int SELECTORS = 1 << SELECTOR_BITS;
-
-		static float s_aafModifierTable[MODIFIER_TABLE_ENTRYS][SELECTORS];
-
-		void CalculateR11(unsigned int a_uiSelectorsUsed, 
-							float a_fBaseRadius, float a_fMultiplierRadius);
-
-		
-
-	
-		inline float DecodePixelRed(float a_fBase, float a_fMultiplier,
-			unsigned int a_uiTableIndex, unsigned int a_uiSelector)
-		{
-			float fMultiplier = a_fMultiplier;
-			if (fMultiplier <= 0.0f)
-			{
-				fMultiplier = 1.0f / 8.0f;
-			}
-
-			float fPixelRed = a_fBase * 8 + 4 +
-				8 * fMultiplier*s_aafModifierTable[a_uiTableIndex][a_uiSelector]*255;
-			fPixelRed /= 2047.0f;
-
-			if (fPixelRed < 0.0f)
-			{
-				fPixelRed = 0.0f;
-			}
-			else if (fPixelRed > 1.0f)
-			{
-				fPixelRed = 1.0f;
-			}
-
-			return fPixelRed;
-		}
-
-		Block4x4EncodingBits_R11 *m_pencodingbitsR11;
-
-		float m_fRedBase;
-		float m_fRedMultiplier;
-		float m_fRedBlockError;
-		unsigned int m_uiRedModifierTableIndex;
-		unsigned int m_auiRedSelectors[PIXELS];
-
-		
-	};
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RG11.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_RG11.cpp
deleted file mode 100644
index 417835db51..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RG11.cpp
+++ /dev/null
@@ -1,447 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-EtcBlock4x4Encoding_RG11.cpp
-
-Block4x4Encoding_RG11 is the encoder to use when targetting file format RG11 and SRG11 (signed RG11).
-
-*/
-
-#include "EtcConfig.h"
-#include "EtcBlock4x4Encoding_RG11.h"
-
-#include "EtcBlock4x4EncodingBits.h"
-#include "EtcBlock4x4.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <float.h>
-#include <limits>
-
-namespace Etc
-{
-	// ----------------------------------------------------------------------------------------------------
-	//
-	Block4x4Encoding_RG11::Block4x4Encoding_RG11(void)
-	{
-		m_pencodingbitsRG11 = nullptr;
-	}
-
-	Block4x4Encoding_RG11::~Block4x4Encoding_RG11(void) {}
-	// ----------------------------------------------------------------------------------------------------
-	// initialization prior to encoding
-	// a_pblockParent points to the block associated with this encoding
-	// a_errormetric is used to choose the best encoding
-	// a_pafrgbaSource points to a 4x4 block subset of the source image
-	// a_paucEncodingBits points to the final encoding bits
-	//
-	void Block4x4Encoding_RG11::InitFromSource(Block4x4 *a_pblockParent,
-		ColorFloatRGBA *a_pafrgbaSource,
-		unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric)
-	{
-		Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource,a_errormetric);
-
-		m_pencodingbitsRG11 = (Block4x4EncodingBits_RG11 *)a_paucEncodingBits;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialization from the encoding bits of a previous encoding
-	// a_pblockParent points to the block associated with this encoding
-	// a_errormetric is used to choose the best encoding
-	// a_pafrgbaSource points to a 4x4 block subset of the source image
-	// a_paucEncodingBits points to the final encoding bits of a previous encoding
-	//
-	void Block4x4Encoding_RG11::InitFromEncodingBits(Block4x4 *a_pblockParent,
-		unsigned char *a_paucEncodingBits,
-		ColorFloatRGBA *a_pafrgbaSource,
-		ErrorMetric a_errormetric)
-	{
-
-		m_pencodingbitsRG11 = (Block4x4EncodingBits_RG11 *)a_paucEncodingBits;
-
-		// init RGB portion
-		Block4x4Encoding_RGB8::InitFromEncodingBits(a_pblockParent,
-			(unsigned char *)m_pencodingbitsRG11,
-			a_pafrgbaSource,
-			a_errormetric);
-		m_fError = 0.0f;
-
-		{
-			m_mode = MODE_RG11;
-			if (a_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11)
-			{
-				m_fRedBase = (float)(signed char)m_pencodingbitsRG11->data.baseR;
-				m_fGrnBase = (float)(signed char)m_pencodingbitsRG11->data.baseG;
-			}
-			else
-			{
-				m_fRedBase = (float)(unsigned char)m_pencodingbitsRG11->data.baseR;
-				m_fGrnBase = (float)(unsigned char)m_pencodingbitsRG11->data.baseG;
-			}
-			m_fRedMultiplier = (float)m_pencodingbitsRG11->data.multiplierR;
-			m_fGrnMultiplier = (float)m_pencodingbitsRG11->data.multiplierG;
-			m_uiRedModifierTableIndex = m_pencodingbitsRG11->data.tableIndexR;
-			m_uiGrnModifierTableIndex = m_pencodingbitsRG11->data.tableIndexG;
-
-			unsigned long long int ulliSelectorBitsR = 0;
-			ulliSelectorBitsR |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsR0 << 40;
-			ulliSelectorBitsR |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsR1 << 32;
-			ulliSelectorBitsR |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsR2 << 24;
-			ulliSelectorBitsR |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsR3 << 16;
-			ulliSelectorBitsR |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsR4 << 8;
-			ulliSelectorBitsR |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsR5;
-
-			unsigned long long int ulliSelectorBitsG = 0;
-			ulliSelectorBitsG |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsG0 << 40;
-			ulliSelectorBitsG |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsG1 << 32;
-			ulliSelectorBitsG |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsG2 << 24;
-			ulliSelectorBitsG |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsG3 << 16;
-			ulliSelectorBitsG |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsG4 << 8;
-			ulliSelectorBitsG |= (unsigned long long int)m_pencodingbitsRG11->data.selectorsG5;
-
-			
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				unsigned int uiShift = 45 - (3 * uiPixel);
-				m_auiRedSelectors[uiPixel] = (ulliSelectorBitsR >> uiShift) & (SELECTORS - 1);
-				m_auiGrnSelectors[uiPixel] = (ulliSelectorBitsG >> uiShift) & (SELECTORS - 1);
-			}
-
-			
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				float fRedDecodedData = 0.0f;
-				float fGrnDecodedData = 0.0f;
-				if (a_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11)
-				{
-					fRedDecodedData = DecodePixelRed(m_fRedBase, m_fRedMultiplier, m_uiRedModifierTableIndex, m_auiRedSelectors[uiPixel]);
-					fGrnDecodedData = DecodePixelRed(m_fGrnBase, m_fGrnMultiplier, m_uiGrnModifierTableIndex, m_auiGrnSelectors[uiPixel]);
-				}
-				else if (a_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11)
-				{
-					fRedDecodedData = DecodePixelRed(m_fRedBase + 128, m_fRedMultiplier, m_uiRedModifierTableIndex, m_auiRedSelectors[uiPixel]);
-					fGrnDecodedData = DecodePixelRed(m_fGrnBase + 128, m_fGrnMultiplier, m_uiGrnModifierTableIndex, m_auiGrnSelectors[uiPixel]);
-				}
-				else
-				{
-					assert(0);
-				}
-				m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA(fRedDecodedData, fGrnDecodedData, 0.0f, 1.0f);
-			}
-
-		}
-
-		CalcBlockError();
- 	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// perform a single encoding iteration
-	// replace the encoding if a better encoding was found
-	// subsequent iterations generally take longer for each iteration
-	// set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort
-	//
-	void Block4x4Encoding_RG11::PerformIteration(float a_fEffort)
-	{
-		assert(!m_boolDone);
-
-		switch (m_uiEncodingIterations)
-		{
-		case 0:
-			m_fError = FLT_MAX;
-			m_fGrnBlockError = FLT_MAX;		// artificially high value
-			m_fRedBlockError = FLT_MAX;
-			CalculateR11(8, 0.0f, 0.0f);
-			CalculateG11(8, 0.0f, 0.0f);
-			m_fError = (m_fGrnBlockError + m_fRedBlockError);
-			break;
-
-		case 1:
-			CalculateR11(8, 2.0f, 1.0f);
-			CalculateG11(8, 2.0f, 1.0f);
-			m_fError = (m_fGrnBlockError + m_fRedBlockError);
-			if (a_fEffort <= 24.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 2:
-			CalculateR11(8, 12.0f, 1.0f);
-			CalculateG11(8, 12.0f, 1.0f);
-			m_fError = (m_fGrnBlockError + m_fRedBlockError);
-			if (a_fEffort <= 49.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 3:
-			CalculateR11(7, 6.0f, 1.0f);
-			CalculateG11(7, 6.0f, 1.0f);
-			m_fError = (m_fGrnBlockError + m_fRedBlockError);
-			break;
-
-		case 4:
-			CalculateR11(6, 3.0f, 1.0f);
-			CalculateG11(6, 3.0f, 1.0f);
-			m_fError = (m_fGrnBlockError + m_fRedBlockError);
-			break;
-
-		case 5:
-			CalculateR11(5, 1.0f, 0.0f);
-			CalculateG11(5, 1.0f, 0.0f);
-			m_fError = (m_fGrnBlockError + m_fRedBlockError);
-			m_boolDone = true;
-			break;
-
-		default:
-			assert(0);
-			break;
-		}
-
-		m_uiEncodingIterations++;
-		SetDoneIfPerfect();
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// find the best combination of base color, multiplier and selectors
-	//
-	// a_uiSelectorsUsed limits the number of selector combinations to try
-	// a_fBaseRadius limits the range of base colors to try
-	// a_fMultiplierRadius limits the range of multipliers to try
-	//
-	void Block4x4Encoding_RG11::CalculateG11(unsigned int a_uiSelectorsUsed,
-		float a_fBaseRadius, float a_fMultiplierRadius)
-	{
-		// maps from virtual (monotonic) selector to etc selector
-		static const unsigned int auiVirtualSelectorMap[8] = { 3, 2, 1, 0, 4, 5, 6, 7 };
-
-		// find min/max Grn
-		float fMinGrn = 1.0f;
-		float fMaxGrn = 0.0f;
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			// ignore border pixels
-			float fAlpha = m_pafrgbaSource[uiPixel].fA;
-			if (isnan(fAlpha))
-			{
-				continue;
-			}
-
-			float fGrn = m_pafrgbaSource[uiPixel].fG;
-
-			if (fGrn < fMinGrn)
-			{
-				fMinGrn = fGrn;
-			}
-			if (fGrn > fMaxGrn)
-			{
-				fMaxGrn = fGrn;
-			}
-		}
-		assert(fMinGrn <= fMaxGrn);
-
-		float fGrnRange = (fMaxGrn - fMinGrn);
-
-		// try each modifier table entry							  
-		for (unsigned int uiTableEntry = 0; uiTableEntry < MODIFIER_TABLE_ENTRYS; uiTableEntry++)
-		{
-			for (unsigned int uiMinVirtualSelector = 0;
-			uiMinVirtualSelector <= (8 - a_uiSelectorsUsed);
-				uiMinVirtualSelector++)
-			{
-				unsigned int uiMaxVirtualSelector = uiMinVirtualSelector + a_uiSelectorsUsed - 1;
-
-				unsigned int uiMinSelector = auiVirtualSelectorMap[uiMinVirtualSelector];
-				unsigned int uiMaxSelector = auiVirtualSelectorMap[uiMaxVirtualSelector];
-
-				float fTableEntryCenter = -s_aafModifierTable[uiTableEntry][uiMinSelector];
-
-				float fTableEntryRange = s_aafModifierTable[uiTableEntry][uiMaxSelector] -
-					s_aafModifierTable[uiTableEntry][uiMinSelector];
-
-				float fCenterRatio = fTableEntryCenter / fTableEntryRange;
-
-				float fCenter = fMinGrn + fCenterRatio*fGrnRange;
-				fCenter = roundf(255.0f * fCenter) / 255.0f;
-
-				float fMinBase = fCenter - (a_fBaseRadius / 255.0f);
-				if (fMinBase < 0.0f)
-				{
-					fMinBase = 0.0f;
-				}
-
-				float fMaxBase = fCenter + (a_fBaseRadius / 255.0f);
-				if (fMaxBase > 1.0f)
-				{
-					fMaxBase = 1.0f;
-				}
-
-				for (float fBase = fMinBase; fBase <= fMaxBase; fBase += (0.999999f / 255.0f))
-				{
-					float fRangeMultiplier = roundf(fGrnRange / fTableEntryRange);
-
-					float fMinMultiplier = fRangeMultiplier - a_fMultiplierRadius;
-					if (fMinMultiplier < 1.0f)
-					{
-						fMinMultiplier = 0.0f;
-					}
-					else if (fMinMultiplier > 15.0f)
-					{
-						fMinMultiplier = 15.0f;
-					}
-
-					float fMaxMultiplier = fRangeMultiplier + a_fMultiplierRadius;
-					if (fMaxMultiplier < 1.0f)
-					{
-						fMaxMultiplier = 1.0f;
-					}
-					else if (fMaxMultiplier > 15.0f)
-					{
-						fMaxMultiplier = 15.0f;
-					}
-
-					for (float fMultiplier = fMinMultiplier; fMultiplier <= fMaxMultiplier; fMultiplier += 1.0f)
-					{
-						// find best selector for each pixel
-						unsigned int auiBestSelectors[PIXELS];
-						float afBestGrnError[PIXELS];
-						float afBestPixelGrn[PIXELS];
-
-						for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-						{
-							float fBestPixelGrnError = FLT_MAX;
-
-							for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++)
-							{
-								//DecodePixelRed is not red channel specific
-								float fPixelGrn = DecodePixelRed(fBase * 255.0f, fMultiplier, uiTableEntry, uiSelector);
-								
-								ColorFloatRGBA frgba(m_pafrgbaSource[uiPixel].fR, fPixelGrn, 0.0f, 1.0f);
-									
-								float fPixelGrnError = CalcPixelError(frgba, 1.0f, m_pafrgbaSource[uiPixel]);
-
-								if (fPixelGrnError < fBestPixelGrnError)
-								{
-									fBestPixelGrnError = fPixelGrnError;
-									auiBestSelectors[uiPixel] = uiSelector;
-									afBestGrnError[uiPixel] = fBestPixelGrnError;
-									afBestPixelGrn[uiPixel] = fPixelGrn;
-								}
-							}
-						}
-						float fBlockError = 0.0f;
-						for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-						{
-							fBlockError += afBestGrnError[uiPixel];
-						}
-
-						if (fBlockError < m_fGrnBlockError)
-						{
-							m_fGrnBlockError = fBlockError;
-
-							if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11)
-							{
-								m_fGrnBase = 255.0f * fBase;
-							}
-							else if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11)
-							{
-								m_fGrnBase = (fBase * 255) - 128;
-							}
-							else
-							{
-								assert(0);
-							}
-							m_fGrnMultiplier = fMultiplier;
-							m_uiGrnModifierTableIndex = uiTableEntry;
-							for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-							{
-								m_auiGrnSelectors[uiPixel] = auiBestSelectors[uiPixel];
-								m_afrgbaDecodedColors[uiPixel].fG = afBestPixelGrn[uiPixel];
-								m_afDecodedAlphas[uiPixel] = 1.0f;
-							}
-						}
-					}
-				}
-
-			}
-		}
-	}
-	
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state
-	//
-	void Block4x4Encoding_RG11::SetEncodingBits(void)
-	{
-		unsigned long long int ulliSelectorBitsR = 0;
-		unsigned long long int ulliSelectorBitsG = 0;
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			unsigned int uiShift = 45 - (3 * uiPixel);
-			ulliSelectorBitsR |= ((unsigned long long int)m_auiRedSelectors[uiPixel]) << uiShift;
-			ulliSelectorBitsG |= ((unsigned long long int)m_auiGrnSelectors[uiPixel]) << uiShift;
-		}
-		if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11)
-		{
-			m_pencodingbitsRG11->data.baseR = (unsigned char)roundf(m_fRedBase);
-		}
-		else if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11)
-		{
-			m_pencodingbitsRG11->data.baseR = (signed char)roundf(m_fRedBase);
-		}
-		else
-		{
-			assert(0);
-		}
-		m_pencodingbitsRG11->data.tableIndexR = m_uiRedModifierTableIndex;
-		m_pencodingbitsRG11->data.multiplierR = (unsigned char)roundf(m_fRedMultiplier);
-
-		m_pencodingbitsRG11->data.selectorsR0 = ulliSelectorBitsR >> 40;
-		m_pencodingbitsRG11->data.selectorsR1 = ulliSelectorBitsR >> 32;
-		m_pencodingbitsRG11->data.selectorsR2 = ulliSelectorBitsR >> 24;
-		m_pencodingbitsRG11->data.selectorsR3 = ulliSelectorBitsR >> 16;
-		m_pencodingbitsRG11->data.selectorsR4 = ulliSelectorBitsR >> 8;
-		m_pencodingbitsRG11->data.selectorsR5 = ulliSelectorBitsR;
-
-		if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::RG11)
-		{
-			m_pencodingbitsRG11->data.baseG = (unsigned char)roundf(m_fGrnBase);
-		}
-		else if (m_pblockParent->GetImageSource()->GetFormat() == Image::Format::SIGNED_RG11)
-		{
-			m_pencodingbitsRG11->data.baseG = (signed char)roundf(m_fGrnBase);
-		}
-		else
-		{
-			assert(0);
-		}
-		m_pencodingbitsRG11->data.tableIndexG = m_uiGrnModifierTableIndex;
-		m_pencodingbitsRG11->data.multiplierG = (unsigned char)roundf(m_fGrnMultiplier);
-
-		m_pencodingbitsRG11->data.selectorsG0 = ulliSelectorBitsG >> 40;
-		m_pencodingbitsRG11->data.selectorsG1 = ulliSelectorBitsG >> 32;
-		m_pencodingbitsRG11->data.selectorsG2 = ulliSelectorBitsG >> 24;
-		m_pencodingbitsRG11->data.selectorsG3 = ulliSelectorBitsG >> 16;
-		m_pencodingbitsRG11->data.selectorsG4 = ulliSelectorBitsG >> 8;
-		m_pencodingbitsRG11->data.selectorsG5 = ulliSelectorBitsG;
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-}
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RG11.h b/thirdparty/etc2comp/EtcBlock4x4Encoding_RG11.h
deleted file mode 100644
index d4993b8c5f..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RG11.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcBlock4x4Encoding_RGB8.h"
-#include "EtcBlock4x4Encoding_R11.h"
-
-namespace Etc
-{
-	class Block4x4EncodingBits_RG11;
-
-	// ################################################################################
-	// Block4x4Encoding_RG11
-	// ################################################################################
-
-	class Block4x4Encoding_RG11 : public Block4x4Encoding_R11
-	{
-		float m_fGrnBase;
-		float m_fGrnMultiplier;
-		float m_fGrnBlockError;
-		unsigned int m_auiGrnSelectors[PIXELS];
-		unsigned int m_uiGrnModifierTableIndex;
-	public:
-
-		Block4x4Encoding_RG11(void);
-		virtual ~Block4x4Encoding_RG11(void);
-
-		virtual void InitFromSource(Block4x4 *a_pblockParent,
-			ColorFloatRGBA *a_pafrgbaSource,
-
-			unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric);
-
-		virtual void InitFromEncodingBits(Block4x4 *a_pblockParent,
-			unsigned char *a_paucEncodingBits,
-			ColorFloatRGBA *a_pafrgbaSource,
-
-			ErrorMetric a_errormetric);
-
-		virtual void PerformIteration(float a_fEffort);
-
-		virtual void SetEncodingBits(void);
-
-		Block4x4EncodingBits_RG11 *m_pencodingbitsRG11;
-
-		void CalculateG11(unsigned int a_uiSelectorsUsed, float a_fBaseRadius, float a_fMultiplierRadius);
-
-		inline float GetGrnBase(void) const
-		{
-			return m_fGrnBase;
-		}
-
-		inline float GetGrnMultiplier(void) const
-		{
-			return m_fGrnMultiplier;
-		}
-
-		inline int GetGrnTableIndex(void) const
-		{
-			return m_uiGrnModifierTableIndex;
-		}
-
-		inline const unsigned int * GetGrnSelectors(void) const
-		{
-			return m_auiGrnSelectors;
-		}
-
-	};
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp
deleted file mode 100644
index 5c7ebed788..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp
+++ /dev/null
@@ -1,1730 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-EtcBlock4x4Encoding_RGB8.cpp
-
-Block4x4Encoding_RGB8 is the encoder to use for the ETC2 extensions when targetting file format RGB8.  
-This encoder is also used for the ETC2 subset of file format RGBA8.
-
-Block4x4Encoding_ETC1 encodes the ETC1 subset of RGB8.
-
-*/
-
-#include "EtcConfig.h"
-#include "EtcBlock4x4Encoding_RGB8.h"
-
-#include "EtcBlock4x4EncodingBits.h"
-#include "EtcBlock4x4.h"
-#include "EtcMath.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <float.h>
-#include <limits>
-
-namespace Etc
-{
-	float Block4x4Encoding_RGB8::s_afTHDistanceTable[TH_DISTANCES] =
-	{
-		3.0f / 255.0f,
-		6.0f / 255.0f,
-		11.0f / 255.0f,
-		16.0f / 255.0f,
-		23.0f / 255.0f,
-		32.0f / 255.0f,
-		41.0f / 255.0f,
-		64.0f / 255.0f
-	};
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	Block4x4Encoding_RGB8::Block4x4Encoding_RGB8(void)
-	{
-
-		m_pencodingbitsRGB8 = nullptr;
-
-	}
-
-	Block4x4Encoding_RGB8::~Block4x4Encoding_RGB8(void) {}
-	// ----------------------------------------------------------------------------------------------------
-	// initialization from the encoding bits of a previous encoding
-	// a_pblockParent points to the block associated with this encoding
-	// a_errormetric is used to choose the best encoding
-	// a_pafrgbaSource points to a 4x4 block subset of the source image
-	// a_paucEncodingBits points to the final encoding bits of a previous encoding
-	//
-	void Block4x4Encoding_RGB8::InitFromEncodingBits(Block4x4 *a_pblockParent,
-														unsigned char *a_paucEncodingBits,
-														ColorFloatRGBA *a_pafrgbaSource,
-														ErrorMetric a_errormetric)
-	{
-		
-		// handle ETC1 modes
-		Block4x4Encoding_ETC1::InitFromEncodingBits(a_pblockParent,
-													a_paucEncodingBits, a_pafrgbaSource,a_errormetric);
-
-		m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits;
-
-		// detect if there is a T, H or Planar mode present
-		if (m_pencodingbitsRGB8->differential.diff)
-		{
-			int iRed1 = (int)m_pencodingbitsRGB8->differential.red1;
-			int iDRed2 = m_pencodingbitsRGB8->differential.dred2;
-			int iRed2 = iRed1 + iDRed2;
-
-			int iGreen1 = (int)m_pencodingbitsRGB8->differential.green1;
-			int iDGreen2 = m_pencodingbitsRGB8->differential.dgreen2;
-			int iGreen2 = iGreen1 + iDGreen2;
-
-			int iBlue1 = (int)m_pencodingbitsRGB8->differential.blue1;
-			int iDBlue2 = m_pencodingbitsRGB8->differential.dblue2;
-			int iBlue2 = iBlue1 + iDBlue2;
-
-			if (iRed2 < 0 || iRed2 > 31)
-			{
-				InitFromEncodingBits_T();
-			}
-			else if (iGreen2 < 0 || iGreen2 > 31)
-			{
-				InitFromEncodingBits_H();
-			}
-			else if (iBlue2 < 0 || iBlue2 > 31)
-			{
-				InitFromEncodingBits_Planar();
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialization from the encoding bits of a previous encoding if T mode is detected
-	//
-	void Block4x4Encoding_RGB8::InitFromEncodingBits_T(void)
-	{
-
-		m_mode = MODE_T;
-
-		unsigned char ucRed1 = (unsigned char)((m_pencodingbitsRGB8->t.red1a << 2) +
-								m_pencodingbitsRGB8->t.red1b);
-		unsigned char ucGreen1 = m_pencodingbitsRGB8->t.green1;
-		unsigned char ucBlue1 = m_pencodingbitsRGB8->t.blue1;
-
-		unsigned char ucRed2 = m_pencodingbitsRGB8->t.red2;
-		unsigned char ucGreen2 = m_pencodingbitsRGB8->t.green2;
-		unsigned char ucBlue2 = m_pencodingbitsRGB8->t.blue2;
-
-		m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1);
-		m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2);
-
-		m_uiCW1 = (m_pencodingbitsRGB8->t.da << 1) + m_pencodingbitsRGB8->t.db;
-
-		Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors();
-
-		DecodePixels_T();
-
-		CalcBlockError();
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialization from the encoding bits of a previous encoding if H mode is detected
-	//
-	void Block4x4Encoding_RGB8::InitFromEncodingBits_H(void)
-	{
-
-		m_mode = MODE_H;
-		
-		unsigned char ucRed1 = m_pencodingbitsRGB8->h.red1;
-		unsigned char ucGreen1 = (unsigned char)((m_pencodingbitsRGB8->h.green1a << 1) +
-									m_pencodingbitsRGB8->h.green1b);
-		unsigned char ucBlue1 = (unsigned char)((m_pencodingbitsRGB8->h.blue1a << 3) +
-								(m_pencodingbitsRGB8->h.blue1b << 1) + 
-								m_pencodingbitsRGB8->h.blue1c);
-
-		unsigned char ucRed2 = m_pencodingbitsRGB8->h.red2;
-		unsigned char ucGreen2 = (unsigned char)((m_pencodingbitsRGB8->h.green2a << 1) +
-									m_pencodingbitsRGB8->h.green2b);
-		unsigned char ucBlue2 = m_pencodingbitsRGB8->h.blue2;
-
-		m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1);
-		m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2);
-
-		// used to determine the LSB of the CW
-		unsigned int uiRGB1 = (unsigned int)(((int)ucRed1 << 16) + ((int)ucGreen1 << 8) + (int)ucBlue1);
-		unsigned int uiRGB2 = (unsigned int)(((int)ucRed2 << 16) + ((int)ucGreen2 << 8) + (int)ucBlue2);
-
-		m_uiCW1 = (m_pencodingbitsRGB8->h.da << 2) + (m_pencodingbitsRGB8->h.db << 1);
-		if (uiRGB1 >= uiRGB2)
-		{
-			m_uiCW1++;
-		}
-
-		Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors();
-
-		DecodePixels_H();
-
-		CalcBlockError();
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialization from the encoding bits of a previous encoding if Planar mode is detected
-	//
-	void Block4x4Encoding_RGB8::InitFromEncodingBits_Planar(void)
-	{
-
-		m_mode = MODE_PLANAR;
-
-		unsigned char ucOriginRed = m_pencodingbitsRGB8->planar.originRed;
-		unsigned char ucOriginGreen = (unsigned char)((m_pencodingbitsRGB8->planar.originGreen1 << 6) +
-										m_pencodingbitsRGB8->planar.originGreen2);
-		unsigned char ucOriginBlue = (unsigned char)((m_pencodingbitsRGB8->planar.originBlue1 << 5) +
-										(m_pencodingbitsRGB8->planar.originBlue2 << 3) +
-										(m_pencodingbitsRGB8->planar.originBlue3 << 1) +
-										m_pencodingbitsRGB8->planar.originBlue4);
-
-		unsigned char ucHorizRed = (unsigned char)((m_pencodingbitsRGB8->planar.horizRed1 << 1) +
-									m_pencodingbitsRGB8->planar.horizRed2);
-		unsigned char ucHorizGreen = m_pencodingbitsRGB8->planar.horizGreen;
-		unsigned char ucHorizBlue = (unsigned char)((m_pencodingbitsRGB8->planar.horizBlue1 << 5) +
-									m_pencodingbitsRGB8->planar.horizBlue2);
-
-		unsigned char ucVertRed = (unsigned char)((m_pencodingbitsRGB8->planar.vertRed1 << 3) +
-									m_pencodingbitsRGB8->planar.vertRed2);
-		unsigned char ucVertGreen = (unsigned char)((m_pencodingbitsRGB8->planar.vertGreen1 << 2) +
-									m_pencodingbitsRGB8->planar.vertGreen2);
-		unsigned char ucVertBlue = m_pencodingbitsRGB8->planar.vertBlue;
-
-		m_frgbaColor1 = ColorFloatRGBA::ConvertFromR6G7B6(ucOriginRed, ucOriginGreen, ucOriginBlue);
-		m_frgbaColor2 = ColorFloatRGBA::ConvertFromR6G7B6(ucHorizRed, ucHorizGreen, ucHorizBlue);
-		m_frgbaColor3 = ColorFloatRGBA::ConvertFromR6G7B6(ucVertRed, ucVertGreen, ucVertBlue);
-
-		DecodePixels_Planar();
-
-		CalcBlockError();
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// perform a single encoding iteration
-	// replace the encoding if a better encoding was found
-	// subsequent iterations generally take longer for each iteration
-	// set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort
-	//
-	void Block4x4Encoding_RGB8::PerformIteration(float a_fEffort)
-	{
-		assert(!m_boolDone);
-
-		switch (m_uiEncodingIterations)
-		{
-		case 0:
-			Block4x4Encoding_ETC1::PerformFirstIteration();
-			if (m_boolDone)
-			{
-				break;
-			}
-			TryPlanar(0);
-			SetDoneIfPerfect();
-			if (m_boolDone)
-			{
-				break;
-			}
-			TryTAndH(0);
-			break;
-
-		case 1:
-			Block4x4Encoding_ETC1::TryDifferential(m_boolMostLikelyFlip, 1, 0, 0);
-			break;
-
-		case 2:
-			Block4x4Encoding_ETC1::TryIndividual(m_boolMostLikelyFlip, 1);
-			break;
-
-		case 3:
-			Block4x4Encoding_ETC1::TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0);
-			break;
-
-		case 4:
-			Block4x4Encoding_ETC1::TryIndividual(!m_boolMostLikelyFlip, 1);
-			break;
-
-		case 5:
-			TryPlanar(1);
-			if (a_fEffort <= 49.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 6:
-			TryTAndH(1);
-			if (a_fEffort <= 59.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 7:
-			Block4x4Encoding_ETC1::TryDegenerates1();
-			if (a_fEffort <= 69.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 8:
-			Block4x4Encoding_ETC1::TryDegenerates2();
-			if (a_fEffort <= 79.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 9:
-			Block4x4Encoding_ETC1::TryDegenerates3();
-			if (a_fEffort <= 89.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 10:
-			Block4x4Encoding_ETC1::TryDegenerates4();
-			m_boolDone = true;
-			break;
-
-		default:
-			assert(0);
-			break;
-		}
-
-		m_uiEncodingIterations++;
-
-		SetDoneIfPerfect();
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try encoding in Planar mode
-	// save this encoding if it improves the error
-	//
-	void Block4x4Encoding_RGB8::TryPlanar(unsigned int a_uiRadius)
-	{
-		Block4x4Encoding_RGB8 encodingTry = *this;
-
-		// init "try"
-		{
-			encodingTry.m_mode = MODE_PLANAR;
-			encodingTry.m_boolDiff = true;
-			encodingTry.m_boolFlip = false;
-		}
-
-		encodingTry.CalculatePlanarCornerColors();
-
-		encodingTry.DecodePixels_Planar();
-
-		encodingTry.CalcBlockError();
-
-		if (a_uiRadius > 0)
-		{
-			encodingTry.TwiddlePlanar();
-		}
-
-		if (encodingTry.m_fError < m_fError)
-		{
-			m_mode = MODE_PLANAR;
-			m_boolDiff = true;
-			m_boolFlip = false;
-			m_frgbaColor1 = encodingTry.m_frgbaColor1;
-			m_frgbaColor2 = encodingTry.m_frgbaColor2;
-			m_frgbaColor3 = encodingTry.m_frgbaColor3;
-
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel];
-			}
-
-			m_fError = encodingTry.m_fError;
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try encoding in T mode or H mode
-	// save this encoding if it improves the error
-	//
-	void Block4x4Encoding_RGB8::TryTAndH(unsigned int a_uiRadius)
-	{
-
-		CalculateBaseColorsForTAndH();
-
-		TryT(a_uiRadius);
-
-		TryH(a_uiRadius);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// calculate original values for base colors
-	// store them in m_frgbaOriginalColor1 and m_frgbaOriginalColor2
-	//
-	void Block4x4Encoding_RGB8::CalculateBaseColorsForTAndH(void)
-	{
-
-		bool boolRGBX = m_pblockParent->GetImageSource()->GetErrorMetric() == ErrorMetric::RGBX;
-
-		ColorFloatRGBA frgbaBlockAverage = (m_frgbaSourceAverageLeft + m_frgbaSourceAverageRight) * 0.5f;
-
-		// find pixel farthest from average gray line
-		unsigned int uiFarthestPixel = 0;
-		float fFarthestGrayDistance2 = 0.0f;
-		unsigned int uiTransparentPixels = 0;
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			// don't count transparent
-			if (m_pafrgbaSource[uiPixel].fA == 0.0f && !boolRGBX)
-			{
-				uiTransparentPixels++;
-			}
-			else
-			{
-				float fGrayDistance2 = CalcGrayDistance2(m_pafrgbaSource[uiPixel], frgbaBlockAverage);
-
-				if (fGrayDistance2 > fFarthestGrayDistance2)
-				{
-					uiFarthestPixel = uiPixel;
-					fFarthestGrayDistance2 = fGrayDistance2;
-				}
-			}
-		}
-		// a transparent block should not reach this method
-		assert(uiTransparentPixels < PIXELS);
-
-		// set the original base colors to:
-		//		half way to the farthest pixel and
-		//		the mirror color on the other side of the average
-		ColorFloatRGBA frgbaOffset = (m_pafrgbaSource[uiFarthestPixel] - frgbaBlockAverage) * 0.5f;
-		m_frgbaOriginalColor1_TAndH = (frgbaBlockAverage + frgbaOffset).QuantizeR4G4B4();
-		m_frgbaOriginalColor2_TAndH = (frgbaBlockAverage - frgbaOffset).ClampRGB().QuantizeR4G4B4();	// the "other side" might be out of range
-
-		// move base colors to find best fit
-		for (unsigned int uiIteration = 0; uiIteration < 10; uiIteration++)
-		{
-			// find the center of pixels closest to each color
-			float fPixelsCloserToColor1 = 0.0f;
-			ColorFloatRGBA frgbSumPixelsCloserToColor1;
-			float fPixelsCloserToColor2 = 0.0f;
-			ColorFloatRGBA frgbSumPixelsCloserToColor2;
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				// don't count transparent pixels
-				if (m_pafrgbaSource[uiPixel].fA == 0.0f)
-				{
-					continue;
-				}
-
-				float fGrayDistance2ToColor1 = CalcGrayDistance2(m_pafrgbaSource[uiPixel], m_frgbaOriginalColor1_TAndH);
-				float fGrayDistance2ToColor2 = CalcGrayDistance2(m_pafrgbaSource[uiPixel], m_frgbaOriginalColor2_TAndH);
-
-				ColorFloatRGBA frgbaAlphaWeightedSource = m_pafrgbaSource[uiPixel] * m_pafrgbaSource[uiPixel].fA;
-					
-				if (fGrayDistance2ToColor1 <= fGrayDistance2ToColor2)
-				{
-					fPixelsCloserToColor1 += m_pafrgbaSource[uiPixel].fA;
-					frgbSumPixelsCloserToColor1 = frgbSumPixelsCloserToColor1 + frgbaAlphaWeightedSource;
-				}
-				else
-				{
-					fPixelsCloserToColor2 += m_pafrgbaSource[uiPixel].fA;
-					frgbSumPixelsCloserToColor2 = frgbSumPixelsCloserToColor2 + frgbaAlphaWeightedSource;
-				}
-			}
-			if (fPixelsCloserToColor1 == 0.0f || fPixelsCloserToColor2 == 0.0f)
-			{
-				break;
-			}
-
-			ColorFloatRGBA frgbAvgColor1Pixels = (frgbSumPixelsCloserToColor1 * (1.0f / fPixelsCloserToColor1)).QuantizeR4G4B4();
-			ColorFloatRGBA frgbAvgColor2Pixels = (frgbSumPixelsCloserToColor2 * (1.0f / fPixelsCloserToColor2)).QuantizeR4G4B4();
-
-			if (frgbAvgColor1Pixels.fR == m_frgbaOriginalColor1_TAndH.fR &&
-				frgbAvgColor1Pixels.fG == m_frgbaOriginalColor1_TAndH.fG &&
-				frgbAvgColor1Pixels.fB == m_frgbaOriginalColor1_TAndH.fB &&
-				frgbAvgColor2Pixels.fR == m_frgbaOriginalColor2_TAndH.fR &&
-				frgbAvgColor2Pixels.fG == m_frgbaOriginalColor2_TAndH.fG &&
-				frgbAvgColor2Pixels.fB == m_frgbaOriginalColor2_TAndH.fB)
-			{
-				break;
-			}
-
-			m_frgbaOriginalColor1_TAndH = frgbAvgColor1Pixels;
-			m_frgbaOriginalColor2_TAndH = frgbAvgColor2Pixels;
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try encoding in T mode
-	// save this encoding if it improves the error
-	//
-	// since pixels that use base color1 don't use the distance table, color1 and color2 can be twiddled independently
-	// better encoding can be found if TWIDDLE_RADIUS is set to 2, but it will be much slower
-	//
-	void Block4x4Encoding_RGB8::TryT(unsigned int a_uiRadius)
-	{
-		Block4x4Encoding_RGB8 encodingTry = *this;
-
-		// init "try"
-		{
-			encodingTry.m_mode = MODE_T;
-			encodingTry.m_boolDiff = true;
-			encodingTry.m_boolFlip = false;
-			encodingTry.m_fError = FLT_MAX;
-		}
-
-		int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f);
-		int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f);
-		int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f);
-
-		int iMinRed1 = iColor1Red - (int)a_uiRadius;
-		if (iMinRed1 < 0)
-		{
-			iMinRed1 = 0;
-		}
-		int iMaxRed1 = iColor1Red + (int)a_uiRadius;
-		if (iMaxRed1 > 15)
-		{
-			iMaxRed1 = 15;
-		}
-
-		int iMinGreen1 = iColor1Green - (int)a_uiRadius;
-		if (iMinGreen1 < 0)
-		{
-			iMinGreen1 = 0;
-		}
-		int iMaxGreen1 = iColor1Green + (int)a_uiRadius;
-		if (iMaxGreen1 > 15)
-		{
-			iMaxGreen1 = 15;
-		}
-
-		int iMinBlue1 = iColor1Blue - (int)a_uiRadius;
-		if (iMinBlue1 < 0)
-		{
-			iMinBlue1 = 0;
-		}
-		int iMaxBlue1 = iColor1Blue + (int)a_uiRadius;
-		if (iMaxBlue1 > 15)
-		{
-			iMaxBlue1 = 15;
-		}
-
-		int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f);
-		int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f);
-		int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f);
-
-		int iMinRed2 = iColor2Red - (int)a_uiRadius;
-		if (iMinRed2 < 0)
-		{
-			iMinRed2 = 0;
-		}
-		int iMaxRed2 = iColor2Red + (int)a_uiRadius;
-		if (iMaxRed2 > 15)
-		{
-			iMaxRed2 = 15;
-		}
-
-		int iMinGreen2 = iColor2Green - (int)a_uiRadius;
-		if (iMinGreen2 < 0)
-		{
-			iMinGreen2 = 0;
-		}
-		int iMaxGreen2 = iColor2Green + (int)a_uiRadius;
-		if (iMaxGreen2 > 15)
-		{
-			iMaxGreen2 = 15;
-		}
-
-		int iMinBlue2 = iColor2Blue - (int)a_uiRadius;
-		if (iMinBlue2 < 0)
-		{
-			iMinBlue2 = 0;
-		}
-		int iMaxBlue2 = iColor2Blue + (int)a_uiRadius;
-		if (iMaxBlue2 > 15)
-		{
-			iMaxBlue2 = 15;
-		}
-
-		for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++)
-		{
-			encodingTry.m_uiCW1 = uiDistance;
-
-			// twiddle m_frgbaOriginalColor2_TAndH
-			// twiddle color2 first, since it affects 3 selectors, while color1 only affects one selector
-			//
-			for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++)
-			{
-				for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++)
-				{
-					for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++)
-					{
-						for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++)
-						{
-							if (uiBaseColorSwaps == 0)
-							{
-								encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH;
-								encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2);
-							}
-							else
-							{
-								encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2);
-								encodingTry.m_frgbaColor2 = m_frgbaOriginalColor1_TAndH;
-							}
-
-							encodingTry.TryT_BestSelectorCombination();
-
-							if (encodingTry.m_fError < m_fError)
-							{
-								m_mode = encodingTry.m_mode;
-								m_boolDiff = encodingTry.m_boolDiff;
-								m_boolFlip = encodingTry.m_boolFlip;
-
-								m_frgbaColor1 = encodingTry.m_frgbaColor1;
-								m_frgbaColor2 = encodingTry.m_frgbaColor2;
-								m_uiCW1 = encodingTry.m_uiCW1;
-
-								for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-								{
-									m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel];
-									m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel];
-								}
-
-								m_fError = encodingTry.m_fError;
-							}
-						}
-					}
-				}
-			}
-
-			// twiddle m_frgbaOriginalColor1_TAndH
-			for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++)
-			{
-				for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++)
-				{
-					for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++)
-					{
-						for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++)
-						{
-							if (uiBaseColorSwaps == 0)
-							{
-								encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1);
-								encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH;
-							}
-							else
-							{
-								encodingTry.m_frgbaColor1 = m_frgbaOriginalColor2_TAndH;
-								encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1);
-							}
-
-							encodingTry.TryT_BestSelectorCombination();
-
-							if (encodingTry.m_fError < m_fError)
-							{
-								m_mode = encodingTry.m_mode;
-								m_boolDiff = encodingTry.m_boolDiff;
-								m_boolFlip = encodingTry.m_boolFlip;
-
-								m_frgbaColor1 = encodingTry.m_frgbaColor1;
-								m_frgbaColor2 = encodingTry.m_frgbaColor2;
-								m_uiCW1 = encodingTry.m_uiCW1;
-
-								for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-								{
-									m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel];
-									m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel];
-								}
-
-								m_fError = encodingTry.m_fError;
-							}
-						}
-					}
-				}
-			}
-
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// find best selector combination for TryT
-	// called on an encodingTry
-	//
-	void Block4x4Encoding_RGB8::TryT_BestSelectorCombination(void)
-	{
-
-		float fDistance = s_afTHDistanceTable[m_uiCW1];
-
-		unsigned int auiBestPixelSelectors[PIXELS];
-		float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
-			FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX };
-		ColorFloatRGBA	afrgbaBestDecodedPixels[PIXELS];
-		ColorFloatRGBA afrgbaDecodedPixel[SELECTORS];
-		
-		assert(SELECTORS == 4);
-		afrgbaDecodedPixel[0] = m_frgbaColor1;
-		afrgbaDecodedPixel[1] = (m_frgbaColor2 + fDistance).ClampRGB();
-		afrgbaDecodedPixel[2] = m_frgbaColor2;
-		afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB();
-		
-		// try each selector
-		for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++)
-		{
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-
-				float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], m_afDecodedAlphas[uiPixel],
-														m_pafrgbaSource[uiPixel]);
-
-				if (fPixelError < afBestPixelErrors[uiPixel])
-				{
-					afBestPixelErrors[uiPixel] = fPixelError;
-					auiBestPixelSelectors[uiPixel] = uiSelector;
-					afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector];
-				}
-			}
-		}
-		
-
-		// add up all of the pixel errors
-		float fBlockError = 0.0f;
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			fBlockError += afBestPixelErrors[uiPixel];
-		}
-
-		if (fBlockError < m_fError)
-		{
-			m_fError = fBlockError;
-
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel];
-				m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel];
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try encoding in T mode
-	// save this encoding if it improves the error
-	//
-	// since all pixels use the distance table, color1 and color2 can NOT be twiddled independently
-	// TWIDDLE_RADIUS of 2 is WAY too slow
-	//
-	void Block4x4Encoding_RGB8::TryH(unsigned int a_uiRadius)
-	{
-		Block4x4Encoding_RGB8 encodingTry = *this;
-
-		// init "try"
-		{
-			encodingTry.m_mode = MODE_H;
-			encodingTry.m_boolDiff = true;
-			encodingTry.m_boolFlip = false;
-			encodingTry.m_fError = FLT_MAX;
-		}
-
-		int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f);
-		int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f);
-		int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f);
-
-		int iMinRed1 = iColor1Red - (int)a_uiRadius;
-		if (iMinRed1 < 0)
-		{
-			iMinRed1 = 0;
-		}
-		int iMaxRed1 = iColor1Red + (int)a_uiRadius;
-		if (iMaxRed1 > 15)
-		{
-			iMaxRed1 = 15;
-		}
-
-		int iMinGreen1 = iColor1Green - (int)a_uiRadius;
-		if (iMinGreen1 < 0)
-		{
-			iMinGreen1 = 0;
-		}
-		int iMaxGreen1 = iColor1Green + (int)a_uiRadius;
-		if (iMaxGreen1 > 15)
-		{
-			iMaxGreen1 = 15;
-		}
-
-		int iMinBlue1 = iColor1Blue - (int)a_uiRadius;
-		if (iMinBlue1 < 0)
-		{
-			iMinBlue1 = 0;
-		}
-		int iMaxBlue1 = iColor1Blue + (int)a_uiRadius;
-		if (iMaxBlue1 > 15)
-		{
-			iMaxBlue1 = 15;
-		}
-
-		int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f);
-		int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f);
-		int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f);
-
-		int iMinRed2 = iColor2Red - (int)a_uiRadius;
-		if (iMinRed2 < 0)
-		{
-			iMinRed2 = 0;
-		}
-		int iMaxRed2 = iColor2Red + (int)a_uiRadius;
-		if (iMaxRed2 > 15)
-		{
-			iMaxRed2 = 15;
-		}
-
-		int iMinGreen2 = iColor2Green - (int)a_uiRadius;
-		if (iMinGreen2 < 0)
-		{
-			iMinGreen2 = 0;
-		}
-		int iMaxGreen2 = iColor2Green + (int)a_uiRadius;
-		if (iMaxGreen2 > 15)
-		{
-			iMaxGreen2 = 15;
-		}
-
-		int iMinBlue2 = iColor2Blue - (int)a_uiRadius;
-		if (iMinBlue2 < 0)
-		{
-			iMinBlue2 = 0;
-		}
-		int iMaxBlue2 = iColor2Blue + (int)a_uiRadius;
-		if (iMaxBlue2 > 15)
-		{
-			iMaxBlue2 = 15;
-		}
-
-		for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++)
-		{
-			encodingTry.m_uiCW1 = uiDistance;
-
-			// twiddle m_frgbaOriginalColor1_TAndH
-			for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++)
-			{
-				for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++)
-				{
-					for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++)
-					{
-						encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1);
-						encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH;
-
-						// if color1 == color2, H encoding issues can pop up, so abort
-						if (iRed1 == iColor2Red && iGreen1 == iColor2Green && iBlue1 == iColor2Blue)
-						{
-							continue;
-						}
-
-						encodingTry.TryH_BestSelectorCombination();
-
-						if (encodingTry.m_fError < m_fError)
-						{
-							m_mode = encodingTry.m_mode;
-							m_boolDiff = encodingTry.m_boolDiff;
-							m_boolFlip = encodingTry.m_boolFlip;
-
-							m_frgbaColor1 = encodingTry.m_frgbaColor1;
-							m_frgbaColor2 = encodingTry.m_frgbaColor2;
-							m_uiCW1 = encodingTry.m_uiCW1;
-
-							for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-							{
-								m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel];
-								m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel];
-							}
-
-							m_fError = encodingTry.m_fError;
-						}
-					}
-				}
-			}
-
-			// twiddle m_frgbaOriginalColor2_TAndH
-			for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++)
-			{
-				for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++)
-				{
-					for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++)
-					{
-						encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH;
-						encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2);
-
-						// if color1 == color2, H encoding issues can pop up, so abort
-						if (iRed2 == iColor1Red && iGreen2 == iColor1Green && iBlue2 == iColor1Blue)
-						{
-							continue;
-						}
-
-						encodingTry.TryH_BestSelectorCombination();
-
-						if (encodingTry.m_fError < m_fError)
-						{
-							m_mode = encodingTry.m_mode;
-							m_boolDiff = encodingTry.m_boolDiff;
-							m_boolFlip = encodingTry.m_boolFlip;
-
-							m_frgbaColor1 = encodingTry.m_frgbaColor1;
-							m_frgbaColor2 = encodingTry.m_frgbaColor2;
-							m_uiCW1 = encodingTry.m_uiCW1;
-
-							for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-							{
-								m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel];
-								m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel];
-							}
-
-							m_fError = encodingTry.m_fError;
-						}
-					}
-				}
-			}
-
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// find best selector combination for TryH
-	// called on an encodingTry
-	//
-	void Block4x4Encoding_RGB8::TryH_BestSelectorCombination(void)
-	{
-
-		float fDistance = s_afTHDistanceTable[m_uiCW1];
-
-		unsigned int auiBestPixelSelectors[PIXELS];
-		float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
-			FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX };
-		ColorFloatRGBA	afrgbaBestDecodedPixels[PIXELS];
-		ColorFloatRGBA afrgbaDecodedPixel[SELECTORS];
-		
-		assert(SELECTORS == 4);
-		afrgbaDecodedPixel[0] = (m_frgbaColor1 + fDistance).ClampRGB();
-		afrgbaDecodedPixel[1] = (m_frgbaColor1 - fDistance).ClampRGB();
-		afrgbaDecodedPixel[2] = (m_frgbaColor2 + fDistance).ClampRGB();
-		afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB();
-		
-		// try each selector
-		for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++)
-		{
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-
-				float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], m_afDecodedAlphas[uiPixel],
-														m_pafrgbaSource[uiPixel]);
-
-				if (fPixelError < afBestPixelErrors[uiPixel])
-				{
-					afBestPixelErrors[uiPixel] = fPixelError;
-					auiBestPixelSelectors[uiPixel] = uiSelector;
-					afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector];
-				}
-			}
-		}
-		
-
-		// add up all of the pixel errors
-		float fBlockError = 0.0f;
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			fBlockError += afBestPixelErrors[uiPixel];
-		}
-
-		if (fBlockError < m_fError)
-		{
-			m_fError = fBlockError;
-
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel];
-				m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel];
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// use linear regression to find the best fit for colors along the edges of the 4x4 block
-	//
-	void Block4x4Encoding_RGB8::CalculatePlanarCornerColors(void)
-	{
-		ColorFloatRGBA afrgbaRegression[MAX_PLANAR_REGRESSION_SIZE];
-		ColorFloatRGBA frgbaSlope;
-		ColorFloatRGBA frgbaOffset;
-
-		// top edge
-		afrgbaRegression[0] = m_pafrgbaSource[0];
-		afrgbaRegression[1] = m_pafrgbaSource[4];
-		afrgbaRegression[2] = m_pafrgbaSource[8];
-		afrgbaRegression[3] = m_pafrgbaSource[12];
-		ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset);
-		m_frgbaColor1 = frgbaOffset;
-		m_frgbaColor2 = (frgbaSlope * 4.0f) + frgbaOffset;
-
-		// left edge
-		afrgbaRegression[0] = m_pafrgbaSource[0];
-		afrgbaRegression[1] = m_pafrgbaSource[1];
-		afrgbaRegression[2] = m_pafrgbaSource[2];
-		afrgbaRegression[3] = m_pafrgbaSource[3];
-		ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset);
-		m_frgbaColor1 = (m_frgbaColor1 + frgbaOffset) * 0.5f;		// average with top edge
-		m_frgbaColor3 = (frgbaSlope * 4.0f) + frgbaOffset;
-
-		// right edge
-		afrgbaRegression[0] = m_pafrgbaSource[12];
-		afrgbaRegression[1] = m_pafrgbaSource[13];
-		afrgbaRegression[2] = m_pafrgbaSource[14];
-		afrgbaRegression[3] = m_pafrgbaSource[15];
-		ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset);
-		m_frgbaColor2 = (m_frgbaColor2 + frgbaOffset) * 0.5f;		// average with top edge
-
-		// bottom edge
-		afrgbaRegression[0] = m_pafrgbaSource[3];
-		afrgbaRegression[1] = m_pafrgbaSource[7];
-		afrgbaRegression[2] = m_pafrgbaSource[11];
-		afrgbaRegression[3] = m_pafrgbaSource[15];
-		ColorRegression(afrgbaRegression, 4, &frgbaSlope, &frgbaOffset);
-		m_frgbaColor3 = (m_frgbaColor3 + frgbaOffset) * 0.5f;		// average with left edge
-
-		// quantize corner colors to 6/7/6
-		m_frgbaColor1 = m_frgbaColor1.QuantizeR6G7B6();
-		m_frgbaColor2 = m_frgbaColor2.QuantizeR6G7B6();
-		m_frgbaColor3 = m_frgbaColor3.QuantizeR6G7B6();
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try different corner colors by slightly changing R, G and B independently
-	//
-	// R, G and B decoding and errors are independent, so R, G and B twiddles can be independent
-	//
-	// return true if improvement
-	//
-	bool Block4x4Encoding_RGB8::TwiddlePlanar(void)
-	{
-		bool boolImprovement = false;
-
-		while (TwiddlePlanarR())
-		{
-			boolImprovement = true;
-		}
-
-		while (TwiddlePlanarG())
-		{
-			boolImprovement = true;
-		}
-
-		while (TwiddlePlanarB())
-		{
-			boolImprovement = true;
-		}
-
-		return boolImprovement;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try different corner colors by slightly changing R
-	//
-	bool Block4x4Encoding_RGB8::TwiddlePlanarR()
-	{
-		bool boolImprovement = false;
-
-		Block4x4Encoding_RGB8 encodingTry = *this;
-
-		// init "try"
-		{
-			encodingTry.m_mode = MODE_PLANAR;
-			encodingTry.m_boolDiff = true;
-			encodingTry.m_boolFlip = false;
-		}
-
-		int iOriginRed = encodingTry.m_frgbaColor1.IntRed(63.0f);
-		int iHorizRed = encodingTry.m_frgbaColor2.IntRed(63.0f);
-		int iVertRed = encodingTry.m_frgbaColor3.IntRed(63.0f);
-
-		for (int iTryOriginRed = iOriginRed - 1; iTryOriginRed <= iOriginRed + 1; iTryOriginRed++)
-		{
-			// check for out of range
-			if (iTryOriginRed < 0 || iTryOriginRed > 63)
-			{
-				continue;
-			}
-
-			encodingTry.m_frgbaColor1.fR = ((iTryOriginRed << 2) + (iTryOriginRed >> 4)) / 255.0f;
-
-			for (int iTryHorizRed = iHorizRed - 1; iTryHorizRed <= iHorizRed + 1; iTryHorizRed++)
-			{
-				// check for out of range
-				if (iTryHorizRed < 0 || iTryHorizRed > 63)
-				{
-					continue;
-				}
-
-				encodingTry.m_frgbaColor2.fR = ((iTryHorizRed << 2) + (iTryHorizRed >> 4)) / 255.0f;
-
-				for (int iTryVertRed = iVertRed - 1; iTryVertRed <= iVertRed + 1; iTryVertRed++)
-				{
-					// check for out of range
-					if (iTryVertRed < 0 || iTryVertRed > 63)
-					{
-						continue;
-					}
-
-					// don't bother with null twiddle
-					if (iTryOriginRed == iOriginRed && iTryHorizRed == iHorizRed && iTryVertRed == iVertRed)
-					{
-						continue;
-					}
-
-					encodingTry.m_frgbaColor3.fR = ((iTryVertRed << 2) + (iTryVertRed >> 4)) / 255.0f;
-
-					encodingTry.DecodePixels_Planar();
-
-					encodingTry.CalcBlockError();
-
-					if (encodingTry.m_fError < m_fError)
-					{
-						m_mode = MODE_PLANAR;
-						m_boolDiff = true;
-						m_boolFlip = false;
-						m_frgbaColor1 = encodingTry.m_frgbaColor1;
-						m_frgbaColor2 = encodingTry.m_frgbaColor2;
-						m_frgbaColor3 = encodingTry.m_frgbaColor3;
-
-						for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-						{
-							m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel];
-						}
-
-						m_fError = encodingTry.m_fError;
-
-						boolImprovement = true;
-					}
-				}
-			}
-		}
-
-		return boolImprovement;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try different corner colors by slightly changing G
-	//
-	bool Block4x4Encoding_RGB8::TwiddlePlanarG()
-	{
-		bool boolImprovement = false;
-
-		Block4x4Encoding_RGB8 encodingTry = *this;
-
-		// init "try"
-		{
-			encodingTry.m_mode = MODE_PLANAR;
-			encodingTry.m_boolDiff = true;
-			encodingTry.m_boolFlip = false;
-		}
-
-		int iOriginGreen = encodingTry.m_frgbaColor1.IntGreen(127.0f);
-		int iHorizGreen = encodingTry.m_frgbaColor2.IntGreen(127.0f);
-		int iVertGreen = encodingTry.m_frgbaColor3.IntGreen(127.0f);
-
-		for (int iTryOriginGreen = iOriginGreen - 1; iTryOriginGreen <= iOriginGreen + 1; iTryOriginGreen++)
-		{
-			// check for out of range
-			if (iTryOriginGreen < 0 || iTryOriginGreen > 127)
-			{
-				continue;
-			}
-
-			encodingTry.m_frgbaColor1.fG = ((iTryOriginGreen << 1) + (iTryOriginGreen >> 6)) / 255.0f;
-
-			for (int iTryHorizGreen = iHorizGreen - 1; iTryHorizGreen <= iHorizGreen + 1; iTryHorizGreen++)
-			{
-				// check for out of range
-				if (iTryHorizGreen < 0 || iTryHorizGreen > 127)
-				{
-					continue;
-				}
-
-				encodingTry.m_frgbaColor2.fG = ((iTryHorizGreen << 1) + (iTryHorizGreen >> 6)) / 255.0f;
-
-				for (int iTryVertGreen = iVertGreen - 1; iTryVertGreen <= iVertGreen + 1; iTryVertGreen++)
-				{
-					// check for out of range
-					if (iTryVertGreen < 0 || iTryVertGreen > 127)
-					{
-						continue;
-					}
-
-					// don't bother with null twiddle
-					if (iTryOriginGreen == iOriginGreen && 
-						iTryHorizGreen == iHorizGreen && 
-						iTryVertGreen == iVertGreen)
-					{
-						continue;
-					}
-
-					encodingTry.m_frgbaColor3.fG = ((iTryVertGreen << 1) + (iTryVertGreen >> 6)) / 255.0f;
-
-					encodingTry.DecodePixels_Planar();
-
-					encodingTry.CalcBlockError();
-
-					if (encodingTry.m_fError < m_fError)
-					{
-						m_mode = MODE_PLANAR;
-						m_boolDiff = true;
-						m_boolFlip = false;
-						m_frgbaColor1 = encodingTry.m_frgbaColor1;
-						m_frgbaColor2 = encodingTry.m_frgbaColor2;
-						m_frgbaColor3 = encodingTry.m_frgbaColor3;
-
-						for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-						{
-							m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel];
-						}
-
-						m_fError = encodingTry.m_fError;
-
-						boolImprovement = true;
-					}
-				}
-			}
-		}
-
-		return boolImprovement;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try different corner colors by slightly changing B
-	//
-	bool Block4x4Encoding_RGB8::TwiddlePlanarB()
-	{
-		bool boolImprovement = false;
-
-		Block4x4Encoding_RGB8 encodingTry = *this;
-
-		// init "try"
-		{
-			encodingTry.m_mode = MODE_PLANAR;
-			encodingTry.m_boolDiff = true;
-			encodingTry.m_boolFlip = false;
-		}
-
-		int iOriginBlue = encodingTry.m_frgbaColor1.IntBlue(63.0f);
-		int iHorizBlue = encodingTry.m_frgbaColor2.IntBlue(63.0f);
-		int iVertBlue = encodingTry.m_frgbaColor3.IntBlue(63.0f);
-
-		for (int iTryOriginBlue = iOriginBlue - 1; iTryOriginBlue <= iOriginBlue + 1; iTryOriginBlue++)
-		{
-			// check for out of range
-			if (iTryOriginBlue < 0 || iTryOriginBlue > 63)
-			{
-				continue;
-			}
-
-			encodingTry.m_frgbaColor1.fB = ((iTryOriginBlue << 2) + (iTryOriginBlue >> 4)) / 255.0f;
-
-			for (int iTryHorizBlue = iHorizBlue - 1; iTryHorizBlue <= iHorizBlue + 1; iTryHorizBlue++)
-			{
-				// check for out of range
-				if (iTryHorizBlue < 0 || iTryHorizBlue > 63)
-				{
-					continue;
-				}
-
-				encodingTry.m_frgbaColor2.fB = ((iTryHorizBlue << 2) + (iTryHorizBlue >> 4)) / 255.0f;
-
-				for (int iTryVertBlue = iVertBlue - 1; iTryVertBlue <= iVertBlue + 1; iTryVertBlue++)
-				{
-					// check for out of range
-					if (iTryVertBlue < 0 || iTryVertBlue > 63)
-					{
-						continue;
-					}
-
-					// don't bother with null twiddle
-					if (iTryOriginBlue == iOriginBlue && iTryHorizBlue == iHorizBlue && iTryVertBlue == iVertBlue)
-					{
-						continue;
-					}
-
-					encodingTry.m_frgbaColor3.fB = ((iTryVertBlue << 2) + (iTryVertBlue >> 4)) / 255.0f;
-
-					encodingTry.DecodePixels_Planar();
-
-					encodingTry.CalcBlockError();
-
-					if (encodingTry.m_fError < m_fError)
-					{
-						m_mode = MODE_PLANAR;
-						m_boolDiff = true;
-						m_boolFlip = false;
-						m_frgbaColor1 = encodingTry.m_frgbaColor1;
-						m_frgbaColor2 = encodingTry.m_frgbaColor2;
-						m_frgbaColor3 = encodingTry.m_frgbaColor3;
-
-						for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-						{
-							m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel];
-						}
-
-						m_fError = encodingTry.m_fError;
-
-						boolImprovement = true;
-					}
-				}
-			}
-		}
-
-		return boolImprovement;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state
-	//
-	void Block4x4Encoding_RGB8::SetEncodingBits(void)
-	{
-
-		switch (m_mode)
-		{
-		case MODE_ETC1:
-			Block4x4Encoding_ETC1::SetEncodingBits();
-			break;
-
-		case MODE_T:
-			SetEncodingBits_T();
-			break;
-
-		case MODE_H:
-			SetEncodingBits_H();
-			break;
-
-		case MODE_PLANAR:
-			SetEncodingBits_Planar();
-			break;
-
-		default:
-			assert(false);
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state for T mode
-	//
-	void Block4x4Encoding_RGB8::SetEncodingBits_T(void)
-	{
-		static const bool SANITY_CHECK = true;
-
-		assert(m_mode == MODE_T);
-		assert(m_boolDiff == true);
-
-		unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f);
-		unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f);
-		unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f);
-
-		unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f);
-		unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f);
-		unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f);
-
-		m_pencodingbitsRGB8->t.red1a = uiRed1 >> 2;
-		m_pencodingbitsRGB8->t.red1b = uiRed1;
-		m_pencodingbitsRGB8->t.green1 = uiGreen1;
-		m_pencodingbitsRGB8->t.blue1 = uiBlue1;
-
-		m_pencodingbitsRGB8->t.red2 = uiRed2;
-		m_pencodingbitsRGB8->t.green2 = uiGreen2;
-		m_pencodingbitsRGB8->t.blue2 = uiBlue2;
-
-		m_pencodingbitsRGB8->t.da = m_uiCW1 >> 1;
-		m_pencodingbitsRGB8->t.db = m_uiCW1;
-
-		m_pencodingbitsRGB8->t.diff = 1;
-
-		Block4x4Encoding_ETC1::SetEncodingBits_Selectors();
-
-		// create an invalid R differential to trigger T mode
-		m_pencodingbitsRGB8->t.detect1 = 0;
-		m_pencodingbitsRGB8->t.detect2 = 0;
-		int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2;
-		if (iRed2 >= 4)
-		{
-			m_pencodingbitsRGB8->t.detect1 = 7;
-			m_pencodingbitsRGB8->t.detect2 = 0;
-		}
-		else
-		{
-			m_pencodingbitsRGB8->t.detect1 = 0;
-			m_pencodingbitsRGB8->t.detect2 = 1;
-		}
-
-		if (SANITY_CHECK)
-		{
-			iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2;
-
-			// make sure red overflows
-			assert(iRed2 < 0 || iRed2 > 31);
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state for H mode
-	//
-	// colors and selectors may need to swap in order to generate lsb of distance index
-	//
-	void Block4x4Encoding_RGB8::SetEncodingBits_H(void)
-	{
-		static const bool SANITY_CHECK = true;
-
-		assert(m_mode == MODE_H);
-		assert(m_boolDiff == true);
-
-		unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f);
-		unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f);
-		unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f);
-
-		unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f);
-		unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f);
-		unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f);
-
-		unsigned int uiColor1 = (uiRed1 << 16) + (uiGreen1 << 8) + uiBlue1;
-		unsigned int uiColor2 = (uiRed2 << 16) + (uiGreen2 << 8) + uiBlue2;
-
-		bool boolOddDistance = m_uiCW1 & 1;
-		bool boolSwapColors = (uiColor1 < uiColor2) ^ !boolOddDistance;
-
-		if (boolSwapColors)
-		{
-			m_pencodingbitsRGB8->h.red1 = uiRed2;
-			m_pencodingbitsRGB8->h.green1a = uiGreen2 >> 1;
-			m_pencodingbitsRGB8->h.green1b = uiGreen2;
-			m_pencodingbitsRGB8->h.blue1a = uiBlue2 >> 3;
-			m_pencodingbitsRGB8->h.blue1b = uiBlue2 >> 1;
-			m_pencodingbitsRGB8->h.blue1c = uiBlue2;
-
-			m_pencodingbitsRGB8->h.red2 = uiRed1;
-			m_pencodingbitsRGB8->h.green2a = uiGreen1 >> 1;
-			m_pencodingbitsRGB8->h.green2b = uiGreen1;
-			m_pencodingbitsRGB8->h.blue2 = uiBlue1;
-
-			m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2;
-			m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1;
-		}
-		else
-		{
-			m_pencodingbitsRGB8->h.red1 = uiRed1;
-			m_pencodingbitsRGB8->h.green1a = uiGreen1 >> 1;
-			m_pencodingbitsRGB8->h.green1b = uiGreen1;
-			m_pencodingbitsRGB8->h.blue1a = uiBlue1 >> 3;
-			m_pencodingbitsRGB8->h.blue1b = uiBlue1 >> 1;
-			m_pencodingbitsRGB8->h.blue1c = uiBlue1;
-
-			m_pencodingbitsRGB8->h.red2 = uiRed2;
-			m_pencodingbitsRGB8->h.green2a = uiGreen2 >> 1;
-			m_pencodingbitsRGB8->h.green2b = uiGreen2;
-			m_pencodingbitsRGB8->h.blue2 = uiBlue2;
-
-			m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2;
-			m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1;
-		}
-
-		m_pencodingbitsRGB8->h.diff = 1;
-
-		Block4x4Encoding_ETC1::SetEncodingBits_Selectors();
-
-		if (boolSwapColors)
-		{
-			m_pencodingbitsRGB8->h.selectors ^= 0x0000FFFF;
-		}
-
-		// create an invalid R differential to trigger T mode
-		m_pencodingbitsRGB8->h.detect1 = 0;
-		m_pencodingbitsRGB8->h.detect2 = 0;
-		m_pencodingbitsRGB8->h.detect3 = 0;
-		int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2;
-		int iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2;
-		if (iRed2 < 0 || iRed2 > 31)
-		{
-			m_pencodingbitsRGB8->h.detect1 = 1;
-		}
-		if (iGreen2 >= 4)
-		{
-			m_pencodingbitsRGB8->h.detect2 = 7;
-			m_pencodingbitsRGB8->h.detect3 = 0;
-		}
-		else
-		{
-			m_pencodingbitsRGB8->h.detect2 = 0;
-			m_pencodingbitsRGB8->h.detect3 = 1;
-		}
-
-		if (SANITY_CHECK)
-		{
-			iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2;
-			iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2;
-
-			// make sure red doesn't overflow and green does
-			assert(iRed2 >= 0 && iRed2 <= 31);
-			assert(iGreen2 < 0 || iGreen2 > 31);
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state for Planar mode
-	//
-	void Block4x4Encoding_RGB8::SetEncodingBits_Planar(void)
-	{
-		static const bool SANITY_CHECK = true;
-
-		assert(m_mode == MODE_PLANAR);
-		assert(m_boolDiff == true);
-
-		unsigned int uiOriginRed = (unsigned int)m_frgbaColor1.IntRed(63.0f);
-		unsigned int uiOriginGreen = (unsigned int)m_frgbaColor1.IntGreen(127.0f);
-		unsigned int uiOriginBlue = (unsigned int)m_frgbaColor1.IntBlue(63.0f);
-
-		unsigned int uiHorizRed = (unsigned int)m_frgbaColor2.IntRed(63.0f);
-		unsigned int uiHorizGreen = (unsigned int)m_frgbaColor2.IntGreen(127.0f);
-		unsigned int uiHorizBlue = (unsigned int)m_frgbaColor2.IntBlue(63.0f);
-
-		unsigned int uiVertRed = (unsigned int)m_frgbaColor3.IntRed(63.0f);
-		unsigned int uiVertGreen = (unsigned int)m_frgbaColor3.IntGreen(127.0f);
-		unsigned int uiVertBlue = (unsigned int)m_frgbaColor3.IntBlue(63.0f);
-
-		m_pencodingbitsRGB8->planar.originRed = uiOriginRed;
-		m_pencodingbitsRGB8->planar.originGreen1 = uiOriginGreen >> 6;
-		m_pencodingbitsRGB8->planar.originGreen2 = uiOriginGreen;
-		m_pencodingbitsRGB8->planar.originBlue1 = uiOriginBlue >> 5;
-		m_pencodingbitsRGB8->planar.originBlue2 = uiOriginBlue >> 3;
-		m_pencodingbitsRGB8->planar.originBlue3 = uiOriginBlue >> 1;
-		m_pencodingbitsRGB8->planar.originBlue4 = uiOriginBlue;
-
-		m_pencodingbitsRGB8->planar.horizRed1 = uiHorizRed >> 1;
-		m_pencodingbitsRGB8->planar.horizRed2 = uiHorizRed;
-		m_pencodingbitsRGB8->planar.horizGreen = uiHorizGreen;
-		m_pencodingbitsRGB8->planar.horizBlue1 = uiHorizBlue >> 5;
-		m_pencodingbitsRGB8->planar.horizBlue2 = uiHorizBlue;
-
-		m_pencodingbitsRGB8->planar.vertRed1 = uiVertRed >> 3;
-		m_pencodingbitsRGB8->planar.vertRed2 = uiVertRed;
-		m_pencodingbitsRGB8->planar.vertGreen1 = uiVertGreen >> 2;
-		m_pencodingbitsRGB8->planar.vertGreen2 = uiVertGreen;
-		m_pencodingbitsRGB8->planar.vertBlue = uiVertBlue;
-
-		m_pencodingbitsRGB8->planar.diff = 1;
-
-		// create valid RG differentials and an invalid B differential to trigger planar mode
-		m_pencodingbitsRGB8->planar.detect1 = 0;
-		m_pencodingbitsRGB8->planar.detect2 = 0;
-		m_pencodingbitsRGB8->planar.detect3 = 0;
-		m_pencodingbitsRGB8->planar.detect4 = 0;
-		int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2;
-		int iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2;
-		int iBlue2 = (int)m_pencodingbitsRGB8->differential.blue1 + (int)m_pencodingbitsRGB8->differential.dblue2;
-		if (iRed2 < 0 || iRed2 > 31)
-		{
-			m_pencodingbitsRGB8->planar.detect1 = 1;
-		}
-		if (iGreen2 < 0 || iGreen2 > 31)
-		{
-			m_pencodingbitsRGB8->planar.detect2 = 1;
-		}
-		if (iBlue2 >= 4)
-		{
-			m_pencodingbitsRGB8->planar.detect3 = 7;
-			m_pencodingbitsRGB8->planar.detect4 = 0;
-		}
-		else
-		{
-			m_pencodingbitsRGB8->planar.detect3 = 0;
-			m_pencodingbitsRGB8->planar.detect4 = 1;
-		}
-
-		if (SANITY_CHECK)
-		{
-			iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2;
-			iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2;
-			iBlue2 = (int)m_pencodingbitsRGB8->differential.blue1 + (int)m_pencodingbitsRGB8->differential.dblue2;
-
-			// make sure red and green don't overflow and blue does
-			assert(iRed2 >= 0 && iRed2 <= 31);
-			assert(iGreen2 >= 0 && iGreen2 <= 31);
-			assert(iBlue2 < 0 || iBlue2 > 31);
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the decoded colors and decoded alpha based on the encoding state for T mode
-	//
-	void Block4x4Encoding_RGB8::DecodePixels_T(void)
-	{
-
-		float fDistance = s_afTHDistanceTable[m_uiCW1];
-		ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f);
-
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			switch (m_auiSelectors[uiPixel])
-			{
-			case 0:
-				m_afrgbaDecodedColors[uiPixel] = m_frgbaColor1;
-				break;
-
-			case 1:
-				m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB();
-				break;
-
-			case 2:
-				m_afrgbaDecodedColors[uiPixel] = m_frgbaColor2;
-				break;
-
-			case 3:
-				m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB();
-				break;
-			}
-
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the decoded colors and decoded alpha based on the encoding state for H mode
-	//
-	void Block4x4Encoding_RGB8::DecodePixels_H(void)
-	{
-
-		float fDistance = s_afTHDistanceTable[m_uiCW1];
-		ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f);
-
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			switch (m_auiSelectors[uiPixel])
-			{
-			case 0:
-				m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 + frgbaDistance).ClampRGB();
-				break;
-
-			case 1:
-				m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 - frgbaDistance).ClampRGB();
-				break;
-
-			case 2:
-				m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB();
-				break;
-
-			case 3:
-				m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB();
-				break;
-			}
-
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the decoded colors and decoded alpha based on the encoding state for Planar mode
-	//
-	void Block4x4Encoding_RGB8::DecodePixels_Planar(void)
-	{
-
-		int iRO = (int)roundf(m_frgbaColor1.fR * 255.0f);
-		int iGO = (int)roundf(m_frgbaColor1.fG * 255.0f);
-		int iBO = (int)roundf(m_frgbaColor1.fB * 255.0f);
-
-		int iRH = (int)roundf(m_frgbaColor2.fR * 255.0f);
-		int iGH = (int)roundf(m_frgbaColor2.fG * 255.0f);
-		int iBH = (int)roundf(m_frgbaColor2.fB * 255.0f);
-
-		int iRV = (int)roundf(m_frgbaColor3.fR * 255.0f);
-		int iGV = (int)roundf(m_frgbaColor3.fG * 255.0f);
-		int iBV = (int)roundf(m_frgbaColor3.fB * 255.0f);
-
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			int iX = (int)(uiPixel >> 2);
-			int iY = (int)(uiPixel & 3);
-
-			int iR = (iX*(iRH - iRO) + iY*(iRV - iRO) + 4*iRO + 2) >> 2;
-			int iG = (iX*(iGH - iGO) + iY*(iGV - iGO) + 4*iGO + 2) >> 2;
-			int iB = (iX*(iBH - iBO) + iY*(iBV - iBO) + 4*iBO + 2) >> 2;
-
-			ColorFloatRGBA frgba;
-			frgba.fR = (float)iR / 255.0f;
-			frgba.fG = (float)iG / 255.0f;
-			frgba.fB = (float)iB / 255.0f;
-			frgba.fA = 1.0f;
-
-			m_afrgbaDecodedColors[uiPixel] = frgba.ClampRGB();
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// perform a linear regression for the a_uiPixels in a_pafrgbaPixels[]
-	//
-	// output the closest color line using a_pfrgbaSlope and a_pfrgbaOffset
-	//
-	void Block4x4Encoding_RGB8::ColorRegression(ColorFloatRGBA *a_pafrgbaPixels, unsigned int a_uiPixels,
-												ColorFloatRGBA *a_pfrgbaSlope, ColorFloatRGBA *a_pfrgbaOffset)
-	{
-		typedef struct
-		{
-			float f[4];
-		} Float4;
-
-		Float4 *paf4Pixels = (Float4 *)(a_pafrgbaPixels);
-		Float4 *pf4Slope = (Float4 *)(a_pfrgbaSlope);
-		Float4 *pf4Offset = (Float4 *)(a_pfrgbaOffset);
-
-		float afX[MAX_PLANAR_REGRESSION_SIZE];
-		float afY[MAX_PLANAR_REGRESSION_SIZE];
-
-		// handle r, g and b separately.  don't bother with a
-		for (unsigned int uiComponent = 0; uiComponent < 3; uiComponent++)
-		{
-			for (unsigned int uiPixel = 0; uiPixel < a_uiPixels; uiPixel++)
-			{
-				afX[uiPixel] = (float)uiPixel;
-				afY[uiPixel] = paf4Pixels[uiPixel].f[uiComponent];
-				
-			}
-			Etc::Regression(afX, afY, a_uiPixels,
-				&(pf4Slope->f[uiComponent]), &(pf4Offset->f[uiComponent]));
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-}
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.h b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.h
deleted file mode 100644
index 03754d5e3b..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcBlock4x4Encoding_ETC1.h"
-
-namespace Etc
-{
-
-	class Block4x4Encoding_RGB8 : public Block4x4Encoding_ETC1
-	{
-	public:
-
-		Block4x4Encoding_RGB8(void);
-		virtual ~Block4x4Encoding_RGB8(void);
-
-		virtual void InitFromEncodingBits(Block4x4 *a_pblockParent,
-											unsigned char *a_paucEncodingBits,
-											ColorFloatRGBA *a_pafrgbaSource,
-
-											ErrorMetric a_errormetric);
-
-		virtual void PerformIteration(float a_fEffort);
-		
-		virtual void SetEncodingBits(void);
-
-		inline ColorFloatRGBA GetColor3(void) const
-		{
-			return m_frgbaColor3;
-		}
-
-	protected:
-
-		static const unsigned int PLANAR_CORNER_COLORS = 3;
-		static const unsigned int MAX_PLANAR_REGRESSION_SIZE = 4;
-		static const unsigned int TH_DISTANCES = 8;
-
-		static float s_afTHDistanceTable[TH_DISTANCES];
-
-		void TryPlanar(unsigned int a_uiRadius);
-		void TryTAndH(unsigned int a_uiRadius);
-
-		void InitFromEncodingBits_Planar(void);
-
-		ColorFloatRGBA	m_frgbaColor3;		// used for planar
-
-		void SetEncodingBits_T(void);
-		void SetEncodingBits_H(void);
-		void SetEncodingBits_Planar(void);
-
-		// state shared between iterations
-		ColorFloatRGBA	m_frgbaOriginalColor1_TAndH;
-		ColorFloatRGBA	m_frgbaOriginalColor2_TAndH;
-
-		void CalculateBaseColorsForTAndH(void);
-		void TryT(unsigned int a_uiRadius);
-		void TryT_BestSelectorCombination(void);
-		void TryH(unsigned int a_uiRadius);
-		void TryH_BestSelectorCombination(void);
-
-	private:
-
-		void InitFromEncodingBits_T(void);
-		void InitFromEncodingBits_H(void);
-
-		void CalculatePlanarCornerColors(void);
-
-		void ColorRegression(ColorFloatRGBA *a_pafrgbaPixels, unsigned int a_uiPixels,
-			ColorFloatRGBA *a_pfrgbaSlope, ColorFloatRGBA *a_pfrgbaOffset);
-
-		bool TwiddlePlanar(void);
-		bool TwiddlePlanarR();
-		bool TwiddlePlanarG();
-		bool TwiddlePlanarB();
-
-		void DecodePixels_T(void);
-		void DecodePixels_H(void);
-		void DecodePixels_Planar(void);
-
-	};
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp
deleted file mode 100644
index b94b64e68c..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp
+++ /dev/null
@@ -1,1819 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-EtcBlock4x4Encoding_RGB8A1.cpp contains:
-	Block4x4Encoding_RGB8A1
-	Block4x4Encoding_RGB8A1_Opaque
-	Block4x4Encoding_RGB8A1_Transparent
-
-These encoders are used when targetting file format RGB8A1.
-
-Block4x4Encoding_RGB8A1_Opaque is used when all pixels in the 4x4 block are opaque
-Block4x4Encoding_RGB8A1_Transparent is used when all pixels in the 4x4 block are transparent
-Block4x4Encoding_RGB8A1 is used when there is a mixture of alphas in the 4x4 block
-
-*/
-
-#include "EtcConfig.h"
-#include "EtcBlock4x4Encoding_RGB8A1.h"
-
-#include "EtcBlock4x4.h"
-#include "EtcBlock4x4EncodingBits.h"
-#include "EtcBlock4x4Encoding_RGB8.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-
-namespace Etc
-{
-	
-	// ####################################################################################################
-	// Block4x4Encoding_RGB8A1
-	// ####################################################################################################
-
-	float Block4x4Encoding_RGB8A1::s_aafCwOpaqueUnsetTable[CW_RANGES][SELECTORS] =
-	{
-		{ 0.0f / 255.0f, 8.0f / 255.0f, 0.0f / 255.0f, -8.0f / 255.0f },
-		{ 0.0f / 255.0f, 17.0f / 255.0f, 0.0f / 255.0f, -17.0f / 255.0f },
-		{ 0.0f / 255.0f, 29.0f / 255.0f, 0.0f / 255.0f, -29.0f / 255.0f },
-		{ 0.0f / 255.0f, 42.0f / 255.0f, 0.0f / 255.0f, -42.0f / 255.0f },
-		{ 0.0f / 255.0f, 60.0f / 255.0f, 0.0f / 255.0f, -60.0f / 255.0f },
-		{ 0.0f / 255.0f, 80.0f / 255.0f, 0.0f / 255.0f, -80.0f / 255.0f },
-		{ 0.0f / 255.0f, 106.0f / 255.0f, 0.0f / 255.0f, -106.0f / 255.0f },
-		{ 0.0f / 255.0f, 183.0f / 255.0f, 0.0f / 255.0f, -183.0f / 255.0f }
-	};
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	Block4x4Encoding_RGB8A1::Block4x4Encoding_RGB8A1(void)
-	{
-		m_pencodingbitsRGB8 = nullptr;
-		m_boolOpaque = false;
-		m_boolTransparent = false;
-		m_boolPunchThroughPixels = true;
-
-	}
-	Block4x4Encoding_RGB8A1::~Block4x4Encoding_RGB8A1(void) {}
-	// ----------------------------------------------------------------------------------------------------
-	// initialization prior to encoding
-	// a_pblockParent points to the block associated with this encoding
-	// a_errormetric is used to choose the best encoding
-	// a_pafrgbaSource points to a 4x4 block subset of the source image
-	// a_paucEncodingBits points to the final encoding bits
-	//
-	void Block4x4Encoding_RGB8A1::InitFromSource(Block4x4 *a_pblockParent,
-													ColorFloatRGBA *a_pafrgbaSource,
-													unsigned char *a_paucEncodingBits,
-													ErrorMetric a_errormetric)
-	{
-
-		Block4x4Encoding_RGB8::InitFromSource(a_pblockParent,
-			a_pafrgbaSource,
-			a_paucEncodingBits,
-			a_errormetric);
-
-		m_boolOpaque = a_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::OPAQUE;
-		m_boolTransparent = a_pblockParent->GetSourceAlphaMix() == Block4x4::SourceAlphaMix::TRANSPARENT;
-		m_boolPunchThroughPixels = a_pblockParent->HasPunchThroughPixels();
-
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			if (m_pafrgbaSource[uiPixel].fA >= 0.5f)
-			{
-				m_afDecodedAlphas[uiPixel] = 1.0f;
-			}
-			else
-			{
-				m_afDecodedAlphas[uiPixel] = 0.0f;
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialization from the encoding bits of a previous encoding
-	// a_pblockParent points to the block associated with this encoding
-	// a_errormetric is used to choose the best encoding
-	// a_pafrgbaSource points to a 4x4 block subset of the source image
-	// a_paucEncodingBits points to the final encoding bits of a previous encoding
-	//
-	void Block4x4Encoding_RGB8A1::InitFromEncodingBits(Block4x4 *a_pblockParent,
-														unsigned char *a_paucEncodingBits,
-														ColorFloatRGBA *a_pafrgbaSource,
-														ErrorMetric a_errormetric)
-	{
-
-
-		InitFromEncodingBits_ETC1(a_pblockParent,
-			a_paucEncodingBits,
-			a_pafrgbaSource,
-			a_errormetric);
-
-		m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits;
-
-		// detect if there is a T, H or Planar mode present
-		int iRed1 = m_pencodingbitsRGB8->differential.red1;
-		int iDRed2 = m_pencodingbitsRGB8->differential.dred2;
-		int iRed2 = iRed1 + iDRed2;
-
-		int iGreen1 = m_pencodingbitsRGB8->differential.green1;
-		int iDGreen2 = m_pencodingbitsRGB8->differential.dgreen2;
-		int iGreen2 = iGreen1 + iDGreen2;
-
-		int iBlue1 = m_pencodingbitsRGB8->differential.blue1;
-		int iDBlue2 = m_pencodingbitsRGB8->differential.dblue2;
-		int iBlue2 = iBlue1 + iDBlue2;
-
-		if (iRed2 < 0 || iRed2 > 31)
-		{
-			InitFromEncodingBits_T();
-		}
-		else if (iGreen2 < 0 || iGreen2 > 31)
-		{
-			InitFromEncodingBits_H();
-		}
-		else if (iBlue2 < 0 || iBlue2 > 31)
-		{
-			Block4x4Encoding_RGB8::InitFromEncodingBits_Planar();
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialization from the encoding bits of a previous encoding assuming the encoding is an ETC1 mode.
-	// if it isn't an ETC1 mode, this will be overwritten later
-	//
-	void Block4x4Encoding_RGB8A1::InitFromEncodingBits_ETC1(Block4x4 *a_pblockParent,
-		unsigned char *a_paucEncodingBits,
-		ColorFloatRGBA *a_pafrgbaSource,
-		ErrorMetric a_errormetric)
-	{
-		Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource,
-			a_errormetric);
-
-		m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)a_paucEncodingBits;
-
-		m_mode = MODE_ETC1;
-		m_boolDiff = true;
-		m_boolFlip = m_pencodingbitsRGB8->differential.flip;
-		m_boolOpaque = m_pencodingbitsRGB8->differential.diff;
-
-		int iR2 = m_pencodingbitsRGB8->differential.red1 + m_pencodingbitsRGB8->differential.dred2;
-		if (iR2 < 0)
-		{
-			iR2 = 0;
-		}
-		else if (iR2 > 31)
-		{
-			iR2 = 31;
-		}
-
-		int iG2 = m_pencodingbitsRGB8->differential.green1 + m_pencodingbitsRGB8->differential.dgreen2;
-		if (iG2 < 0)
-		{
-			iG2 = 0;
-		}
-		else if (iG2 > 31)
-		{
-			iG2 = 31;
-		}
-
-		int iB2 = m_pencodingbitsRGB8->differential.blue1 + m_pencodingbitsRGB8->differential.dblue2;
-		if (iB2 < 0)
-		{
-			iB2 = 0;
-		}
-		else if (iB2 > 31)
-		{
-			iB2 = 31;
-		}
-
-		m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB5(m_pencodingbitsRGB8->differential.red1, m_pencodingbitsRGB8->differential.green1, m_pencodingbitsRGB8->differential.blue1);
-		m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)iR2, (unsigned char)iG2, (unsigned char)iB2);
-
-		m_uiCW1 = m_pencodingbitsRGB8->differential.cw1;
-		m_uiCW2 = m_pencodingbitsRGB8->differential.cw2;
-
-		Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors();
-
-		Decode_ETC1();
-
-		CalcBlockError();
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialization from the encoding bits of a previous encoding if T mode is detected
-	//
-	void Block4x4Encoding_RGB8A1::InitFromEncodingBits_T(void)
-	{
-		m_mode = MODE_T;
-
-		unsigned char ucRed1 = (unsigned char)((m_pencodingbitsRGB8->t.red1a << 2) +
-								m_pencodingbitsRGB8->t.red1b);
-		unsigned char ucGreen1 = m_pencodingbitsRGB8->t.green1;
-		unsigned char ucBlue1 = m_pencodingbitsRGB8->t.blue1;
-
-		unsigned char ucRed2 = m_pencodingbitsRGB8->t.red2;
-		unsigned char ucGreen2 = m_pencodingbitsRGB8->t.green2;
-		unsigned char ucBlue2 = m_pencodingbitsRGB8->t.blue2;
-
-		m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1);
-		m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2);
-
-		m_uiCW1 = (m_pencodingbitsRGB8->t.da << 1) + m_pencodingbitsRGB8->t.db;
-
-		Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors();
-
-		DecodePixels_T();
-
-		CalcBlockError();
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialization from the encoding bits of a previous encoding if H mode is detected
-	//
-	void Block4x4Encoding_RGB8A1::InitFromEncodingBits_H(void)
-	{
-		m_mode = MODE_H;
-
-		unsigned char ucRed1 = m_pencodingbitsRGB8->h.red1;
-		unsigned char ucGreen1 = (unsigned char)((m_pencodingbitsRGB8->h.green1a << 1) +
-									m_pencodingbitsRGB8->h.green1b);
-		unsigned char ucBlue1 = (unsigned char)((m_pencodingbitsRGB8->h.blue1a << 3) +
-								(m_pencodingbitsRGB8->h.blue1b << 1) +
-								m_pencodingbitsRGB8->h.blue1c);
-
-		unsigned char ucRed2 = m_pencodingbitsRGB8->h.red2;
-		unsigned char ucGreen2 = (unsigned char)((m_pencodingbitsRGB8->h.green2a << 1) +
-									m_pencodingbitsRGB8->h.green2b);
-		unsigned char ucBlue2 = m_pencodingbitsRGB8->h.blue2;
-
-		m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4(ucRed1, ucGreen1, ucBlue1);
-		m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4(ucRed2, ucGreen2, ucBlue2);
-
-		// used to determine the LSB of the CW
-		unsigned int uiRGB1 = (unsigned int)(((int)ucRed1 << 16) + ((int)ucGreen1 << 8) + (int)ucBlue1);
-		unsigned int uiRGB2 = (unsigned int)(((int)ucRed2 << 16) + ((int)ucGreen2 << 8) + (int)ucBlue2);
-
-		m_uiCW1 = (m_pencodingbitsRGB8->h.da << 2) + (m_pencodingbitsRGB8->h.db << 1);
-		if (uiRGB1 >= uiRGB2)
-		{
-			m_uiCW1++;
-		}
-
-		Block4x4Encoding_ETC1::InitFromEncodingBits_Selectors();
-
-		DecodePixels_H();
-
-		CalcBlockError();
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// for ETC1 modes, set the decoded colors and decoded alpha based on the encoding state
-	//
-	void Block4x4Encoding_RGB8A1::Decode_ETC1(void)
-	{
-
-		const unsigned int *pauiPixelOrder = m_boolFlip ? s_auiPixelOrderFlip1 : s_auiPixelOrderFlip0;
-
-		for (unsigned int uiPixelOrder = 0; uiPixelOrder < PIXELS; uiPixelOrder++)
-		{
-			ColorFloatRGBA *pfrgbaCenter = uiPixelOrder < 8 ? &m_frgbaColor1 : &m_frgbaColor2;
-			unsigned int uiCW = uiPixelOrder < 8 ? m_uiCW1 : m_uiCW2;
-
-			unsigned int uiPixel = pauiPixelOrder[uiPixelOrder];
-
-			float fDelta;
-			if (m_boolOpaque)
-				fDelta = Block4x4Encoding_ETC1::s_aafCwTable[uiCW][m_auiSelectors[uiPixel]];
-			else 
-				fDelta = s_aafCwOpaqueUnsetTable[uiCW][m_auiSelectors[uiPixel]];
-
-			if (m_boolOpaque == false && m_auiSelectors[uiPixel] == TRANSPARENT_SELECTOR)
-			{
-				m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA();
-				m_afDecodedAlphas[uiPixel] = 0.0f;
-			}
-			else
-			{
-				m_afrgbaDecodedColors[uiPixel] = (*pfrgbaCenter + fDelta).ClampRGB();
-				m_afDecodedAlphas[uiPixel] = 1.0f;
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// for T mode, set the decoded colors and decoded alpha based on the encoding state
-	//
-	void Block4x4Encoding_RGB8A1::DecodePixels_T(void)
-	{
-
-		float fDistance = s_afTHDistanceTable[m_uiCW1];
-		ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f);
-
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			switch (m_auiSelectors[uiPixel])
-			{
-			case 0:
-				m_afrgbaDecodedColors[uiPixel] = m_frgbaColor1;
-				m_afDecodedAlphas[uiPixel] = 1.0f;
-				break;
-
-			case 1:
-				m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB();
-				m_afDecodedAlphas[uiPixel] = 1.0f;
-				break;
-
-			case 2:
-				if (m_boolOpaque == false)
-				{
-					m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA();
-					m_afDecodedAlphas[uiPixel] = 0.0f;
-				}
-				else
-				{
-					m_afrgbaDecodedColors[uiPixel] = m_frgbaColor2;
-					m_afDecodedAlphas[uiPixel] = 1.0f;
-				}
-				break;
-
-			case 3:
-				m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB();
-				m_afDecodedAlphas[uiPixel] = 1.0f;
-				break;
-			}
-
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// for H mode, set the decoded colors and decoded alpha based on the encoding state
-	//
-	void Block4x4Encoding_RGB8A1::DecodePixels_H(void)
-	{
-
-		float fDistance = s_afTHDistanceTable[m_uiCW1];
-		ColorFloatRGBA frgbaDistance(fDistance, fDistance, fDistance, 0.0f);
-
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			switch (m_auiSelectors[uiPixel])
-			{
-			case 0:
-				m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 + frgbaDistance).ClampRGB();
-				m_afDecodedAlphas[uiPixel] = 1.0f;
-				break;
-
-			case 1:
-				m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor1 - frgbaDistance).ClampRGB();
-				m_afDecodedAlphas[uiPixel] = 1.0f;
-				break;
-
-			case 2:
-				if (m_boolOpaque == false)
-				{
-					m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA();
-					m_afDecodedAlphas[uiPixel] = 0.0f;
-				}
-				else
-				{
-					m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 + frgbaDistance).ClampRGB();
-					m_afDecodedAlphas[uiPixel] = 1.0f;
-				}
-				break;
-
-			case 3:
-				m_afrgbaDecodedColors[uiPixel] = (m_frgbaColor2 - frgbaDistance).ClampRGB();
-				m_afDecodedAlphas[uiPixel] = 1.0f;
-				break;
-			}
-
-		}
-
-	}
-
-
-	// ----------------------------------------------------------------------------------------------------
-	// perform a single encoding iteration
-	// replace the encoding if a better encoding was found
-	// subsequent iterations generally take longer for each iteration
-	// set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort
-	//
-	// RGB8A1 can't use individual mode
-	// RGB8A1 with transparent pixels can't use planar mode
-	//
-	void Block4x4Encoding_RGB8A1::PerformIteration(float a_fEffort)
-	{
-		assert(!m_boolOpaque);
-		assert(!m_boolTransparent);
-		assert(!m_boolDone);
-
-		switch (m_uiEncodingIterations)
-		{
-		case 0:
-			PerformFirstIteration();
-			break;
-
-		case 1:
-			TryDifferential(m_boolMostLikelyFlip, 1, 0, 0);
-			break;
-
-		case 2:
-			TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0);
-			if (a_fEffort <= 39.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 3:
-			Block4x4Encoding_RGB8::CalculateBaseColorsForTAndH();
-			TryT(1);
-			TryH(1);
-			if (a_fEffort <= 49.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 4:
-			TryDegenerates1();
-			if (a_fEffort <= 59.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 5:
-			TryDegenerates2();
-			if (a_fEffort <= 69.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 6:
-			TryDegenerates3();
-			if (a_fEffort <= 79.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 7:
-			TryDegenerates4();
-			m_boolDone = true;
-			break;
-
-		default:
-			assert(0);
-			break;
-		}
-
-		m_uiEncodingIterations++;
-
-		SetDoneIfPerfect();
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// find best initial encoding to ensure block has a valid encoding
-	//
-	void Block4x4Encoding_RGB8A1::PerformFirstIteration(void)
-	{
-		Block4x4Encoding_ETC1::CalculateMostLikelyFlip();
-
-		m_fError = FLT_MAX;
-
-		TryDifferential(m_boolMostLikelyFlip, 0, 0, 0);
-		SetDoneIfPerfect();
-		if (m_boolDone)
-		{
-			return;
-		}
-		TryDifferential(!m_boolMostLikelyFlip, 0, 0, 0);
-		SetDoneIfPerfect();
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// mostly copied from ETC1
-	// differences:
-	//		Block4x4Encoding_RGB8A1 encodingTry = *this;
-	//
-	void Block4x4Encoding_RGB8A1::TryDifferential(bool a_boolFlip, unsigned int a_uiRadius, 
-													int a_iGrayOffset1, int a_iGrayOffset2)
-	{
-
-		ColorFloatRGBA frgbaColor1;
-		ColorFloatRGBA frgbaColor2;
-
-		const unsigned int *pauiPixelMapping1;
-		const unsigned int *pauiPixelMapping2;
-
-		if (a_boolFlip)
-		{
-			frgbaColor1 = m_frgbaSourceAverageTop;
-			frgbaColor2 = m_frgbaSourceAverageBottom;
-
-			pauiPixelMapping1 = s_auiTopPixelMapping;
-			pauiPixelMapping2 = s_auiBottomPixelMapping;
-		}
-		else
-		{
-			frgbaColor1 = m_frgbaSourceAverageLeft;
-			frgbaColor2 = m_frgbaSourceAverageRight;
-
-			pauiPixelMapping1 = s_auiLeftPixelMapping;
-			pauiPixelMapping2 = s_auiRightPixelMapping;
-		}
-
-		DifferentialTrys trys(frgbaColor1, frgbaColor2, pauiPixelMapping1, pauiPixelMapping2, 
-								a_uiRadius, a_iGrayOffset1, a_iGrayOffset2);
-
-		Block4x4Encoding_RGB8A1 encodingTry = *this;
-		encodingTry.m_boolFlip = a_boolFlip;
-
-		encodingTry.TryDifferentialHalf(&trys.m_half1);
-		encodingTry.TryDifferentialHalf(&trys.m_half2);
-
-		// find best halves that are within differential range
-		DifferentialTrys::Try *ptryBest1 = nullptr;
-		DifferentialTrys::Try *ptryBest2 = nullptr;
-		encodingTry.m_fError = FLT_MAX;
-
-		// see if the best of each half are in differential range
-		int iDRed = trys.m_half2.m_ptryBest->m_iRed - trys.m_half1.m_ptryBest->m_iRed;
-		int iDGreen = trys.m_half2.m_ptryBest->m_iGreen - trys.m_half1.m_ptryBest->m_iGreen;
-		int iDBlue = trys.m_half2.m_ptryBest->m_iBlue - trys.m_half1.m_ptryBest->m_iBlue;
-		if (iDRed >= -4 && iDRed <= 3 && iDGreen >= -4 && iDGreen <= 3 && iDBlue >= -4 && iDBlue <= 3)
-		{
-			ptryBest1 = trys.m_half1.m_ptryBest;
-			ptryBest2 = trys.m_half2.m_ptryBest;
-			encodingTry.m_fError = trys.m_half1.m_ptryBest->m_fError + trys.m_half2.m_ptryBest->m_fError;
-		}
-		else
-		{
-			// else, find the next best halves that are in differential range
-			for (DifferentialTrys::Try *ptry1 = &trys.m_half1.m_atry[0];
-			ptry1 < &trys.m_half1.m_atry[trys.m_half1.m_uiTrys];
-				ptry1++)
-			{
-				for (DifferentialTrys::Try *ptry2 = &trys.m_half2.m_atry[0];
-				ptry2 < &trys.m_half2.m_atry[trys.m_half2.m_uiTrys];
-					ptry2++)
-				{
-					iDRed = ptry2->m_iRed - ptry1->m_iRed;
-					bool boolValidRedDelta = iDRed <= 3 && iDRed >= -4;
-					iDGreen = ptry2->m_iGreen - ptry1->m_iGreen;
-					bool boolValidGreenDelta = iDGreen <= 3 && iDGreen >= -4;
-					iDBlue = ptry2->m_iBlue - ptry1->m_iBlue;
-					bool boolValidBlueDelta = iDBlue <= 3 && iDBlue >= -4;
-
-					if (boolValidRedDelta && boolValidGreenDelta && boolValidBlueDelta)
-					{
-						float fError = ptry1->m_fError + ptry2->m_fError;
-
-						if (fError < encodingTry.m_fError)
-						{
-							encodingTry.m_fError = fError;
-
-							ptryBest1 = ptry1;
-							ptryBest2 = ptry2;
-						}
-					}
-
-				}
-			}
-			assert(encodingTry.m_fError < FLT_MAX);
-			assert(ptryBest1 != nullptr);
-			assert(ptryBest2 != nullptr);
-		}
-
-		if (encodingTry.m_fError < m_fError)
-		{
-			m_mode = MODE_ETC1;
-			m_boolDiff = true;
-			m_boolFlip = encodingTry.m_boolFlip;
-			m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)ptryBest1->m_iRed, (unsigned char)ptryBest1->m_iGreen, (unsigned char)ptryBest1->m_iBlue);
-			m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB5((unsigned char)ptryBest2->m_iRed, (unsigned char)ptryBest2->m_iGreen, (unsigned char)ptryBest2->m_iBlue);
-			m_uiCW1 = ptryBest1->m_uiCW;
-			m_uiCW2 = ptryBest2->m_uiCW;
-
-			m_fError = 0.0f;
-			for (unsigned int uiPixelOrder = 0; uiPixelOrder < PIXELS / 2; uiPixelOrder++)
-			{
-				unsigned int uiPixel1 = pauiPixelMapping1[uiPixelOrder];
-				unsigned int uiPixel2 = pauiPixelMapping2[uiPixelOrder];
-
-				unsigned int uiSelector1 = ptryBest1->m_auiSelectors[uiPixelOrder];
-				unsigned int uiSelector2 = ptryBest2->m_auiSelectors[uiPixelOrder];
-
-				m_auiSelectors[uiPixel1] = uiSelector1;
-				m_auiSelectors[uiPixel2] = ptryBest2->m_auiSelectors[uiPixelOrder];
-
-				if (uiSelector1 == TRANSPARENT_SELECTOR)
-				{
-					m_afrgbaDecodedColors[uiPixel1] = ColorFloatRGBA();
-					m_afDecodedAlphas[uiPixel1] = 0.0f;
-				}
-				else
-				{
-					float fDeltaRGB1 = s_aafCwOpaqueUnsetTable[m_uiCW1][uiSelector1];
-					m_afrgbaDecodedColors[uiPixel1] = (m_frgbaColor1 + fDeltaRGB1).ClampRGB();
-					m_afDecodedAlphas[uiPixel1] = 1.0f;
-				}
-
-				if (uiSelector2 == TRANSPARENT_SELECTOR)
-				{
-					m_afrgbaDecodedColors[uiPixel2] = ColorFloatRGBA();
-					m_afDecodedAlphas[uiPixel2] = 0.0f;
-				}
-				else
-				{
-					float fDeltaRGB2 = s_aafCwOpaqueUnsetTable[m_uiCW2][uiSelector2];
-					m_afrgbaDecodedColors[uiPixel2] = (m_frgbaColor2 + fDeltaRGB2).ClampRGB();
-					m_afDecodedAlphas[uiPixel2] = 1.0f;
-				}
-
-				float fDeltaA1 = m_afDecodedAlphas[uiPixel1] - m_pafrgbaSource[uiPixel1].fA;
-				m_fError += fDeltaA1 * fDeltaA1;
-				float fDeltaA2 = m_afDecodedAlphas[uiPixel2] - m_pafrgbaSource[uiPixel2].fA;
-				m_fError += fDeltaA2 * fDeltaA2;
-			}
-
-			m_fError1 = ptryBest1->m_fError;
-			m_fError2 = ptryBest2->m_fError;
-			m_boolSeverelyBentDifferentialColors = trys.m_boolSeverelyBentColors;
-			m_fError = m_fError1 + m_fError2;
-
-			// sanity check
-			{
-				int iRed1 = m_frgbaColor1.IntRed(31.0f);
-				int iGreen1 = m_frgbaColor1.IntGreen(31.0f);
-				int iBlue1 = m_frgbaColor1.IntBlue(31.0f);
-
-				int iRed2 = m_frgbaColor2.IntRed(31.0f);
-				int iGreen2 = m_frgbaColor2.IntGreen(31.0f);
-				int iBlue2 = m_frgbaColor2.IntBlue(31.0f);
-
-				iDRed = iRed2 - iRed1;
-				iDGreen = iGreen2 - iGreen1;
-				iDBlue = iBlue2 - iBlue1;
-
-				assert(iDRed >= -4 && iDRed < 4);
-				assert(iDGreen >= -4 && iDGreen < 4);
-				assert(iDBlue >= -4 && iDBlue < 4);
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// mostly copied from ETC1
-	// differences:
-	//		uses s_aafCwOpaqueUnsetTable
-	//		color for selector set to 0,0,0,0
-	//
-	void Block4x4Encoding_RGB8A1::TryDifferentialHalf(DifferentialTrys::Half *a_phalf)
-	{
-
-		a_phalf->m_ptryBest = nullptr;
-		float fBestTryError = FLT_MAX;
-
-		a_phalf->m_uiTrys = 0;
-		for (int iRed = a_phalf->m_iRed - (int)a_phalf->m_uiRadius;
-		iRed <= a_phalf->m_iRed + (int)a_phalf->m_uiRadius;
-			iRed++)
-		{
-			assert(iRed >= 0 && iRed <= 31);
-
-			for (int iGreen = a_phalf->m_iGreen - (int)a_phalf->m_uiRadius;
-			iGreen <= a_phalf->m_iGreen + (int)a_phalf->m_uiRadius;
-				iGreen++)
-			{
-				assert(iGreen >= 0 && iGreen <= 31);
-
-				for (int iBlue = a_phalf->m_iBlue - (int)a_phalf->m_uiRadius;
-				iBlue <= a_phalf->m_iBlue + (int)a_phalf->m_uiRadius;
-					iBlue++)
-				{
-					assert(iBlue >= 0 && iBlue <= 31);
-
-					DifferentialTrys::Try *ptry = &a_phalf->m_atry[a_phalf->m_uiTrys];
-					assert(ptry < &a_phalf->m_atry[DifferentialTrys::Half::MAX_TRYS]);
-
-					ptry->m_iRed = iRed;
-					ptry->m_iGreen = iGreen;
-					ptry->m_iBlue = iBlue;
-					ptry->m_fError = FLT_MAX;
-					ColorFloatRGBA frgbaColor = ColorFloatRGBA::ConvertFromRGB5((unsigned char)iRed, (unsigned char)iGreen, (unsigned char)iBlue);
-
-					// try each CW
-					for (unsigned int uiCW = 0; uiCW < CW_RANGES; uiCW++)
-					{
-						unsigned int auiPixelSelectors[PIXELS / 2];
-						ColorFloatRGBA	afrgbaDecodedColors[PIXELS / 2];
-						float afPixelErrors[PIXELS / 2] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
-							FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX };
-
-						// pre-compute decoded pixels for each selector
-						ColorFloatRGBA afrgbaSelectors[SELECTORS];
-						assert(SELECTORS == 4);
-						afrgbaSelectors[0] = (frgbaColor + s_aafCwOpaqueUnsetTable[uiCW][0]).ClampRGB();
-						afrgbaSelectors[1] = (frgbaColor + s_aafCwOpaqueUnsetTable[uiCW][1]).ClampRGB();
-						afrgbaSelectors[2] = ColorFloatRGBA();
-						afrgbaSelectors[3] = (frgbaColor + s_aafCwOpaqueUnsetTable[uiCW][3]).ClampRGB();
-
-						for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++)
-						{
-							ColorFloatRGBA *pfrgbaSourcePixel = &m_pafrgbaSource[a_phalf->m_pauiPixelMapping[uiPixel]];
-							ColorFloatRGBA frgbaDecodedPixel;
-
-							for (unsigned int uiSelector = 0; uiSelector < SELECTORS; uiSelector++)
-							{
-								if (pfrgbaSourcePixel->fA < 0.5f)
-								{
-									uiSelector = TRANSPARENT_SELECTOR;
-								}
-								else if (uiSelector == TRANSPARENT_SELECTOR)
-								{
-									continue;
-								}
-
-								frgbaDecodedPixel = afrgbaSelectors[uiSelector];
-
-								float fPixelError;
-								
-								fPixelError = CalcPixelError(frgbaDecodedPixel, m_afDecodedAlphas[a_phalf->m_pauiPixelMapping[uiPixel]],
-																	*pfrgbaSourcePixel);
-
-								if (fPixelError < afPixelErrors[uiPixel])
-								{
-									auiPixelSelectors[uiPixel] = uiSelector;
-									afrgbaDecodedColors[uiPixel] = frgbaDecodedPixel;
-									afPixelErrors[uiPixel] = fPixelError;
-								}
-
-								if (uiSelector == TRANSPARENT_SELECTOR)
-								{
-									break;
-								}
-							}
-						}
-
-						// add up all pixel errors
-						float fCWError = 0.0f;
-						for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++)
-						{
-							fCWError += afPixelErrors[uiPixel];
-						}
-
-						// if best CW so far
-						if (fCWError < ptry->m_fError)
-						{
-							ptry->m_uiCW = uiCW;
-							for (unsigned int uiPixel = 0; uiPixel < 8; uiPixel++)
-							{
-								ptry->m_auiSelectors[uiPixel] = auiPixelSelectors[uiPixel];
-							}
-							ptry->m_fError = fCWError;
-						}
-
-					}
-
-					if (ptry->m_fError < fBestTryError)
-					{
-						a_phalf->m_ptryBest = ptry;
-						fBestTryError = ptry->m_fError;
-					}
-
-					assert(ptry->m_fError < FLT_MAX);
-
-					a_phalf->m_uiTrys++;
-				}
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try encoding in T mode
-	// save this encoding if it improves the error
-	//
-	// since pixels that use base color1 don't use the distance table, color1 and color2 can be twiddled independently
-	// better encoding can be found if TWIDDLE_RADIUS is set to 2, but it will be much slower
-	//
-	void Block4x4Encoding_RGB8A1::TryT(unsigned int a_uiRadius)
-	{
-		Block4x4Encoding_RGB8A1 encodingTry = *this;
-
-		// init "try"
-		{
-			encodingTry.m_mode = MODE_T;
-			encodingTry.m_boolDiff = true;
-			encodingTry.m_boolFlip = false;
-			encodingTry.m_fError = FLT_MAX;
-		}
-
-		int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f);
-		int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f);
-		int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f);
-
-		int iMinRed1 = iColor1Red - (int)a_uiRadius;
-		if (iMinRed1 < 0)
-		{
-			iMinRed1 = 0;
-		}
-		int iMaxRed1 = iColor1Red + (int)a_uiRadius;
-		if (iMaxRed1 > 15)
-		{
-			iMaxRed1 = 15;
-		}
-
-		int iMinGreen1 = iColor1Green - (int)a_uiRadius;
-		if (iMinGreen1 < 0)
-		{
-			iMinGreen1 = 0;
-		}
-		int iMaxGreen1 = iColor1Green + (int)a_uiRadius;
-		if (iMaxGreen1 > 15)
-		{
-			iMaxGreen1 = 15;
-		}
-
-		int iMinBlue1 = iColor1Blue - (int)a_uiRadius;
-		if (iMinBlue1 < 0)
-		{
-			iMinBlue1 = 0;
-		}
-		int iMaxBlue1 = iColor1Blue + (int)a_uiRadius;
-		if (iMaxBlue1 > 15)
-		{
-			iMaxBlue1 = 15;
-		}
-
-		int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f);
-		int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f);
-		int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f);
-
-		int iMinRed2 = iColor2Red - (int)a_uiRadius;
-		if (iMinRed2 < 0)
-		{
-			iMinRed2 = 0;
-		}
-		int iMaxRed2 = iColor2Red + (int)a_uiRadius;
-		if (iMaxRed2 > 15)
-		{
-			iMaxRed2 = 15;
-		}
-
-		int iMinGreen2 = iColor2Green - (int)a_uiRadius;
-		if (iMinGreen2 < 0)
-		{
-			iMinGreen2 = 0;
-		}
-		int iMaxGreen2 = iColor2Green + (int)a_uiRadius;
-		if (iMaxGreen2 > 15)
-		{
-			iMaxGreen2 = 15;
-		}
-
-		int iMinBlue2 = iColor2Blue - (int)a_uiRadius;
-		if (iMinBlue2 < 0)
-		{
-			iMinBlue2 = 0;
-		}
-		int iMaxBlue2 = iColor2Blue + (int)a_uiRadius;
-		if (iMaxBlue2 > 15)
-		{
-			iMaxBlue2 = 15;
-		}
-
-		for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++)
-		{
-			encodingTry.m_uiCW1 = uiDistance;
-
-			// twiddle m_frgbaOriginalColor2_TAndH
-			// twiddle color2 first, since it affects 3 selectors, while color1 only affects one selector
-			//
-			for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++)
-			{
-				for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++)
-				{
-					for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++)
-					{
-						for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++)
-						{
-							if (uiBaseColorSwaps == 0)
-							{
-								encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH;
-								encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2);
-							}
-							else
-							{
-								encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2);
-								encodingTry.m_frgbaColor2 = m_frgbaOriginalColor1_TAndH;
-							}
-
-							encodingTry.TryT_BestSelectorCombination();
-
-							if (encodingTry.m_fError < m_fError)
-							{
-								m_mode = encodingTry.m_mode;
-								m_boolDiff = encodingTry.m_boolDiff;
-								m_boolFlip = encodingTry.m_boolFlip;
-
-								m_frgbaColor1 = encodingTry.m_frgbaColor1;
-								m_frgbaColor2 = encodingTry.m_frgbaColor2;
-								m_uiCW1 = encodingTry.m_uiCW1;
-
-								for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-								{
-									m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel];
-									m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel];
-								}
-
-								m_fError = encodingTry.m_fError;
-							}
-						}
-					}
-				}
-			}
-
-			// twiddle m_frgbaOriginalColor1_TAndH
-			for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++)
-			{
-				for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++)
-				{
-					for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++)
-					{
-						for (unsigned int uiBaseColorSwaps = 0; uiBaseColorSwaps < 2; uiBaseColorSwaps++)
-						{
-							if (uiBaseColorSwaps == 0)
-							{
-								encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1);
-								encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH;
-							}
-							else
-							{
-								encodingTry.m_frgbaColor1 = m_frgbaOriginalColor2_TAndH;
-								encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1);
-							}
-
-							encodingTry.TryT_BestSelectorCombination();
-
-							if (encodingTry.m_fError < m_fError)
-							{
-								m_mode = encodingTry.m_mode;
-								m_boolDiff = encodingTry.m_boolDiff;
-								m_boolFlip = encodingTry.m_boolFlip;
-
-								m_frgbaColor1 = encodingTry.m_frgbaColor1;
-								m_frgbaColor2 = encodingTry.m_frgbaColor2;
-								m_uiCW1 = encodingTry.m_uiCW1;
-
-								for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-								{
-									m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel];
-									m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel];
-								}
-
-								m_fError = encodingTry.m_fError;
-							}
-						}
-					}
-				}
-			}
-
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// find best selector combination for TryT
-	// called on an encodingTry
-	//
-	void Block4x4Encoding_RGB8A1::TryT_BestSelectorCombination(void)
-	{
-
-		float fDistance = s_afTHDistanceTable[m_uiCW1];
-
-		unsigned int auiBestPixelSelectors[PIXELS];
-		float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
-			FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX };
-		ColorFloatRGBA	afrgbaBestDecodedPixels[PIXELS];
-		ColorFloatRGBA afrgbaDecodedPixel[SELECTORS];
-
-		assert(SELECTORS == 4);
-		afrgbaDecodedPixel[0] = m_frgbaColor1;
-		afrgbaDecodedPixel[1] = (m_frgbaColor2 + fDistance).ClampRGB();
-		afrgbaDecodedPixel[2] = ColorFloatRGBA();
-		afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB();
-
-		// try each selector
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			unsigned int uiMinSelector = 0;
-			unsigned int uiMaxSelector = SELECTORS - 1;
-
-			if (m_pafrgbaSource[uiPixel].fA < 0.5f)
-			{
-				uiMinSelector = 2;
-				uiMaxSelector = 2;
-			}
-
-			for (unsigned int uiSelector = uiMinSelector; uiSelector <= uiMaxSelector; uiSelector++)
-			{
-				float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], m_afDecodedAlphas[uiPixel],
-													m_pafrgbaSource[uiPixel]);
-
-				if (fPixelError < afBestPixelErrors[uiPixel])
-				{
-					afBestPixelErrors[uiPixel] = fPixelError;
-					auiBestPixelSelectors[uiPixel] = uiSelector;
-					afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector];
-				}
-			}
-		}
-		
-
-		// add up all of the pixel errors
-		float fBlockError = 0.0f;
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			fBlockError += afBestPixelErrors[uiPixel];
-		}
-
-		if (fBlockError < m_fError)
-		{
-			m_fError = fBlockError;
-
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel];
-				m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel];
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try encoding in H mode
-	// save this encoding if it improves the error
-	//
-	// since all pixels use the distance table, color1 and color2 can NOT be twiddled independently
-	// TWIDDLE_RADIUS of 2 is WAY too slow
-	//
-	void Block4x4Encoding_RGB8A1::TryH(unsigned int a_uiRadius)
-	{
-		Block4x4Encoding_RGB8A1 encodingTry = *this;
-
-		// init "try"
-		{
-			encodingTry.m_mode = MODE_H;
-			encodingTry.m_boolDiff = true;
-			encodingTry.m_boolFlip = false;
-			encodingTry.m_fError = FLT_MAX;
-		}
-
-		int iColor1Red = m_frgbaOriginalColor1_TAndH.IntRed(15.0f);
-		int iColor1Green = m_frgbaOriginalColor1_TAndH.IntGreen(15.0f);
-		int iColor1Blue = m_frgbaOriginalColor1_TAndH.IntBlue(15.0f);
-
-		int iMinRed1 = iColor1Red - (int)a_uiRadius;
-		if (iMinRed1 < 0)
-		{
-			iMinRed1 = 0;
-		}
-		int iMaxRed1 = iColor1Red + (int)a_uiRadius;
-		if (iMaxRed1 > 15)
-		{
-			iMaxRed1 = 15;
-		}
-
-		int iMinGreen1 = iColor1Green - (int)a_uiRadius;
-		if (iMinGreen1 < 0)
-		{
-			iMinGreen1 = 0;
-		}
-		int iMaxGreen1 = iColor1Green + (int)a_uiRadius;
-		if (iMaxGreen1 > 15)
-		{
-			iMaxGreen1 = 15;
-		}
-
-		int iMinBlue1 = iColor1Blue - (int)a_uiRadius;
-		if (iMinBlue1 < 0)
-		{
-			iMinBlue1 = 0;
-		}
-		int iMaxBlue1 = iColor1Blue + (int)a_uiRadius;
-		if (iMaxBlue1 > 15)
-		{
-			iMaxBlue1 = 15;
-		}
-
-		int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f);
-		int iColor2Green = m_frgbaOriginalColor2_TAndH.IntGreen(15.0f);
-		int iColor2Blue = m_frgbaOriginalColor2_TAndH.IntBlue(15.0f);
-
-		int iMinRed2 = iColor2Red - (int)a_uiRadius;
-		if (iMinRed2 < 0)
-		{
-			iMinRed2 = 0;
-		}
-		int iMaxRed2 = iColor2Red + (int)a_uiRadius;
-		if (iMaxRed2 > 15)
-		{
-			iMaxRed2 = 15;
-		}
-
-		int iMinGreen2 = iColor2Green - (int)a_uiRadius;
-		if (iMinGreen2 < 0)
-		{
-			iMinGreen2 = 0;
-		}
-		int iMaxGreen2 = iColor2Green + (int)a_uiRadius;
-		if (iMaxGreen2 > 15)
-		{
-			iMaxGreen2 = 15;
-		}
-
-		int iMinBlue2 = iColor2Blue - (int)a_uiRadius;
-		if (iMinBlue2 < 0)
-		{
-			iMinBlue2 = 0;
-		}
-		int iMaxBlue2 = iColor2Blue + (int)a_uiRadius;
-		if (iMaxBlue2 > 15)
-		{
-			iMaxBlue2 = 15;
-		}
-
-		for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++)
-		{
-			encodingTry.m_uiCW1 = uiDistance;
-
-			// twiddle m_frgbaOriginalColor1_TAndH
-			for (int iRed1 = iMinRed1; iRed1 <= iMaxRed1; iRed1++)
-			{
-				for (int iGreen1 = iMinGreen1; iGreen1 <= iMaxGreen1; iGreen1++)
-				{
-					for (int iBlue1 = iMinBlue1; iBlue1 <= iMaxBlue1; iBlue1++)
-					{
-						encodingTry.m_frgbaColor1 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed1, (unsigned char)iGreen1, (unsigned char)iBlue1);
-						encodingTry.m_frgbaColor2 = m_frgbaOriginalColor2_TAndH;
-
-						// if color1 == color2, H encoding issues can pop up, so abort
-						if (iRed1 == iColor2Red && iGreen1 == iColor2Green && iBlue1 == iColor2Blue)
-						{
-							continue;
-						}
-
-						encodingTry.TryH_BestSelectorCombination();
-
-						if (encodingTry.m_fError < m_fError)
-						{
-							m_mode = encodingTry.m_mode;
-							m_boolDiff = encodingTry.m_boolDiff;
-							m_boolFlip = encodingTry.m_boolFlip;
-
-							m_frgbaColor1 = encodingTry.m_frgbaColor1;
-							m_frgbaColor2 = encodingTry.m_frgbaColor2;
-							m_uiCW1 = encodingTry.m_uiCW1;
-
-							for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-							{
-								m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel];
-								m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel];
-							}
-
-							m_fError = encodingTry.m_fError;
-						}
-					}
-				}
-			}
-
-			// twiddle m_frgbaOriginalColor2_TAndH
-			for (int iRed2 = iMinRed2; iRed2 <= iMaxRed2; iRed2++)
-			{
-				for (int iGreen2 = iMinGreen2; iGreen2 <= iMaxGreen2; iGreen2++)
-				{
-					for (int iBlue2 = iMinBlue2; iBlue2 <= iMaxBlue2; iBlue2++)
-					{
-						encodingTry.m_frgbaColor1 = m_frgbaOriginalColor1_TAndH;
-						encodingTry.m_frgbaColor2 = ColorFloatRGBA::ConvertFromRGB4((unsigned char)iRed2, (unsigned char)iGreen2, (unsigned char)iBlue2);
-
-						// if color1 == color2, H encoding issues can pop up, so abort
-						if (iRed2 == iColor1Red && iGreen2 == iColor1Green && iBlue2 == iColor1Blue)
-						{
-							continue;
-						}
-
-						encodingTry.TryH_BestSelectorCombination();
-
-						if (encodingTry.m_fError < m_fError)
-						{
-							m_mode = encodingTry.m_mode;
-							m_boolDiff = encodingTry.m_boolDiff;
-							m_boolFlip = encodingTry.m_boolFlip;
-
-							m_frgbaColor1 = encodingTry.m_frgbaColor1;
-							m_frgbaColor2 = encodingTry.m_frgbaColor2;
-							m_uiCW1 = encodingTry.m_uiCW1;
-
-							for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-							{
-								m_auiSelectors[uiPixel] = encodingTry.m_auiSelectors[uiPixel];
-								m_afrgbaDecodedColors[uiPixel] = encodingTry.m_afrgbaDecodedColors[uiPixel];
-							}
-
-							m_fError = encodingTry.m_fError;
-						}
-					}
-				}
-			}
-
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// find best selector combination for TryH
-	// called on an encodingTry
-	//
-	void Block4x4Encoding_RGB8A1::TryH_BestSelectorCombination(void)
-	{
-
-		// abort if colors and CW will pose an encoding problem
-		{
-			unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(255.0f);
-			unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(255.0f);
-			unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(255.0f);
-			unsigned int uiColorValue1 = (uiRed1 << 16) + (uiGreen1 << 8) + uiBlue1;
-
-			unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(255.0f);
-			unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(255.0f);
-			unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(255.0f);
-			unsigned int uiColorValue2 = (uiRed2 << 16) + (uiGreen2 << 8) + uiBlue2;
-
-			unsigned int uiCWLsb = m_uiCW1 & 1;
-
-			if ((uiColorValue1 >= (uiColorValue2 & uiCWLsb)) == 0 ||
-				(uiColorValue1 < (uiColorValue2 & uiCWLsb)) == 1)
-			{
-				return;
-			}
-		}
-
-		float fDistance = s_afTHDistanceTable[m_uiCW1];
-
-		unsigned int auiBestPixelSelectors[PIXELS];
-		float afBestPixelErrors[PIXELS] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
-											FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX };
-		ColorFloatRGBA	afrgbaBestDecodedPixels[PIXELS];
-		ColorFloatRGBA afrgbaDecodedPixel[SELECTORS];
-
-		assert(SELECTORS == 4);
-		afrgbaDecodedPixel[0] = (m_frgbaColor1 + fDistance).ClampRGB();
-		afrgbaDecodedPixel[1] = (m_frgbaColor1 - fDistance).ClampRGB();
-		afrgbaDecodedPixel[2] = ColorFloatRGBA();;
-		afrgbaDecodedPixel[3] = (m_frgbaColor2 - fDistance).ClampRGB();
-
-
-		// try each selector
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			unsigned int uiMinSelector = 0;
-			unsigned int uiMaxSelector = SELECTORS - 1;
-
-			if (m_pafrgbaSource[uiPixel].fA < 0.5f)
-			{
-				uiMinSelector = 2;
-				uiMaxSelector = 2;
-			}
-
-			for (unsigned int uiSelector = uiMinSelector; uiSelector <= uiMaxSelector; uiSelector++)
-			{
-				float fPixelError = CalcPixelError(afrgbaDecodedPixel[uiSelector], m_afDecodedAlphas[uiPixel],
-													m_pafrgbaSource[uiPixel]);
-
-				if (fPixelError < afBestPixelErrors[uiPixel])
-				{
-					afBestPixelErrors[uiPixel] = fPixelError;
-					auiBestPixelSelectors[uiPixel] = uiSelector;
-					afrgbaBestDecodedPixels[uiPixel] = afrgbaDecodedPixel[uiSelector];
-				}
-			}
-		}
-		
-
-		// add up all of the pixel errors
-		float fBlockError = 0.0f;
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			fBlockError += afBestPixelErrors[uiPixel];
-		}
-
-		if (fBlockError < m_fError)
-		{
-			m_fError = fBlockError;
-
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				m_auiSelectors[uiPixel] = auiBestPixelSelectors[uiPixel];
-				m_afrgbaDecodedColors[uiPixel] = afrgbaBestDecodedPixels[uiPixel];
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try version 1 of the degenerate search
-	// degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings
-	// each subsequent version of the degenerate search uses more basecolor movement and is less likely to
-	//		be successfull
-	//
-	void Block4x4Encoding_RGB8A1::TryDegenerates1(void)
-	{
-
-		TryDifferential(m_boolMostLikelyFlip, 1, -2, 0);
-		TryDifferential(m_boolMostLikelyFlip, 1, 2, 0);
-		TryDifferential(m_boolMostLikelyFlip, 1, 0, 2);
-		TryDifferential(m_boolMostLikelyFlip, 1, 0, -2);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try version 2 of the degenerate search
-	// degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings
-	// each subsequent version of the degenerate search uses more basecolor movement and is less likely to
-	//		be successfull
-	//
-	void Block4x4Encoding_RGB8A1::TryDegenerates2(void)
-	{
-
-		TryDifferential(!m_boolMostLikelyFlip, 1, -2, 0);
-		TryDifferential(!m_boolMostLikelyFlip, 1, 2, 0);
-		TryDifferential(!m_boolMostLikelyFlip, 1, 0, 2);
-		TryDifferential(!m_boolMostLikelyFlip, 1, 0, -2);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try version 3 of the degenerate search
-	// degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings
-	// each subsequent version of the degenerate search uses more basecolor movement and is less likely to
-	//		be successfull
-	//
-	void Block4x4Encoding_RGB8A1::TryDegenerates3(void)
-	{
-
-		TryDifferential(m_boolMostLikelyFlip, 1, -2, -2);
-		TryDifferential(m_boolMostLikelyFlip, 1, -2, 2);
-		TryDifferential(m_boolMostLikelyFlip, 1, 2, -2);
-		TryDifferential(m_boolMostLikelyFlip, 1, 2, 2);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// try version 4 of the degenerate search
-	// degenerate encodings use basecolor movement and a subset of the selectors to find useful encodings
-	// each subsequent version of the degenerate search uses more basecolor movement and is less likely to
-	//		be successfull
-	//
-	void Block4x4Encoding_RGB8A1::TryDegenerates4(void)
-	{
-
-		TryDifferential(m_boolMostLikelyFlip, 1, -4, 0);
-		TryDifferential(m_boolMostLikelyFlip, 1, 4, 0);
-		TryDifferential(m_boolMostLikelyFlip, 1, 0, 4);
-		TryDifferential(m_boolMostLikelyFlip, 1, 0, -4);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state
-	//
-	void Block4x4Encoding_RGB8A1::SetEncodingBits(void)
-	{
-		switch (m_mode)
-		{
-		case MODE_ETC1:
-			SetEncodingBits_ETC1();
-			break;
-
-		case MODE_T:
-			SetEncodingBits_T();
-			break;
-
-		case MODE_H:
-			SetEncodingBits_H();
-			break;
-
-		case MODE_PLANAR:
-			Block4x4Encoding_RGB8::SetEncodingBits_Planar();
-			break;
-
-		default:
-			assert(false);
-		}
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state if ETC1 mode
-	//
-	void Block4x4Encoding_RGB8A1::SetEncodingBits_ETC1(void)
-	{
-
-		// there is no individual mode in RGB8A1
-		assert(m_boolDiff);
-
-		int iRed1 = m_frgbaColor1.IntRed(31.0f);
-		int iGreen1 = m_frgbaColor1.IntGreen(31.0f);
-		int iBlue1 = m_frgbaColor1.IntBlue(31.0f);
-
-		int iRed2 = m_frgbaColor2.IntRed(31.0f);
-		int iGreen2 = m_frgbaColor2.IntGreen(31.0f);
-		int iBlue2 = m_frgbaColor2.IntBlue(31.0f);
-
-		int iDRed2 = iRed2 - iRed1;
-		int iDGreen2 = iGreen2 - iGreen1;
-		int iDBlue2 = iBlue2 - iBlue1;
-
-		assert(iDRed2 >= -4 && iDRed2 < 4);
-		assert(iDGreen2 >= -4 && iDGreen2 < 4);
-		assert(iDBlue2 >= -4 && iDBlue2 < 4);
-
-		m_pencodingbitsRGB8->differential.red1 = iRed1;
-		m_pencodingbitsRGB8->differential.green1 = iGreen1;
-		m_pencodingbitsRGB8->differential.blue1 = iBlue1;
-
-		m_pencodingbitsRGB8->differential.dred2 = iDRed2;
-		m_pencodingbitsRGB8->differential.dgreen2 = iDGreen2;
-		m_pencodingbitsRGB8->differential.dblue2 = iDBlue2;
-
-		m_pencodingbitsRGB8->individual.cw1 = m_uiCW1;
-		m_pencodingbitsRGB8->individual.cw2 = m_uiCW2;
-
-		SetEncodingBits_Selectors();
-
-		// in RGB8A1 encoding bits, opaque replaces differential
-		m_pencodingbitsRGB8->differential.diff = !m_boolPunchThroughPixels;
-
-		m_pencodingbitsRGB8->individual.flip = m_boolFlip;
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state if T mode
-	//
-	void Block4x4Encoding_RGB8A1::SetEncodingBits_T(void)
-	{
-		static const bool SANITY_CHECK = true;
-
-		assert(m_mode == MODE_T);
-		assert(m_boolDiff == true);
-
-		unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f);
-		unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f);
-		unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f);
-
-		unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f);
-		unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f);
-		unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f);
-
-		m_pencodingbitsRGB8->t.red1a = uiRed1 >> 2;
-		m_pencodingbitsRGB8->t.red1b = uiRed1;
-		m_pencodingbitsRGB8->t.green1 = uiGreen1;
-		m_pencodingbitsRGB8->t.blue1 = uiBlue1;
-
-		m_pencodingbitsRGB8->t.red2 = uiRed2;
-		m_pencodingbitsRGB8->t.green2 = uiGreen2;
-		m_pencodingbitsRGB8->t.blue2 = uiBlue2;
-
-		m_pencodingbitsRGB8->t.da = m_uiCW1 >> 1;
-		m_pencodingbitsRGB8->t.db = m_uiCW1;
-
-		// in RGB8A1 encoding bits, opaque replaces differential
-		m_pencodingbitsRGB8->differential.diff = !m_boolPunchThroughPixels;
-
-		Block4x4Encoding_ETC1::SetEncodingBits_Selectors();
-
-		// create an invalid R differential to trigger T mode
-		m_pencodingbitsRGB8->t.detect1 = 0;
-		m_pencodingbitsRGB8->t.detect2 = 0;
-		int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2;
-		if (iRed2 >= 4)
-		{
-			m_pencodingbitsRGB8->t.detect1 = 7;
-			m_pencodingbitsRGB8->t.detect2 = 0;
-		}
-		else
-		{
-			m_pencodingbitsRGB8->t.detect1 = 0;
-			m_pencodingbitsRGB8->t.detect2 = 1;
-		}
-
-		if (SANITY_CHECK)
-		{
-			iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2;
-
-			// make sure red overflows
-			assert(iRed2 < 0 || iRed2 > 31);
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state if H mode
-	//
-	// colors and selectors may need to swap in order to generate lsb of distance index
-	//
-	void Block4x4Encoding_RGB8A1::SetEncodingBits_H(void)
-	{
-		static const bool SANITY_CHECK = true;
-
-		assert(m_mode == MODE_H);
-		assert(m_boolDiff == true);
-
-		unsigned int uiRed1 = (unsigned int)m_frgbaColor1.IntRed(15.0f);
-		unsigned int uiGreen1 = (unsigned int)m_frgbaColor1.IntGreen(15.0f);
-		unsigned int uiBlue1 = (unsigned int)m_frgbaColor1.IntBlue(15.0f);
-
-		unsigned int uiRed2 = (unsigned int)m_frgbaColor2.IntRed(15.0f);
-		unsigned int uiGreen2 = (unsigned int)m_frgbaColor2.IntGreen(15.0f);
-		unsigned int uiBlue2 = (unsigned int)m_frgbaColor2.IntBlue(15.0f);
-
-		unsigned int uiColor1 = (uiRed1 << 16) + (uiGreen1 << 8) + uiBlue1;
-		unsigned int uiColor2 = (uiRed2 << 16) + (uiGreen2 << 8) + uiBlue2;
-
-		bool boolOddDistance = m_uiCW1 & 1;
-		bool boolSwapColors = (uiColor1 < uiColor2) ^ !boolOddDistance;
-
-		if (boolSwapColors)
-		{
-			m_pencodingbitsRGB8->h.red1 = uiRed2;
-			m_pencodingbitsRGB8->h.green1a = uiGreen2 >> 1;
-			m_pencodingbitsRGB8->h.green1b = uiGreen2;
-			m_pencodingbitsRGB8->h.blue1a = uiBlue2 >> 3;
-			m_pencodingbitsRGB8->h.blue1b = uiBlue2 >> 1;
-			m_pencodingbitsRGB8->h.blue1c = uiBlue2;
-
-			m_pencodingbitsRGB8->h.red2 = uiRed1;
-			m_pencodingbitsRGB8->h.green2a = uiGreen1 >> 1;
-			m_pencodingbitsRGB8->h.green2b = uiGreen1;
-			m_pencodingbitsRGB8->h.blue2 = uiBlue1;
-
-			m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2;
-			m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1;
-		}
-		else
-		{
-			m_pencodingbitsRGB8->h.red1 = uiRed1;
-			m_pencodingbitsRGB8->h.green1a = uiGreen1 >> 1;
-			m_pencodingbitsRGB8->h.green1b = uiGreen1;
-			m_pencodingbitsRGB8->h.blue1a = uiBlue1 >> 3;
-			m_pencodingbitsRGB8->h.blue1b = uiBlue1 >> 1;
-			m_pencodingbitsRGB8->h.blue1c = uiBlue1;
-
-			m_pencodingbitsRGB8->h.red2 = uiRed2;
-			m_pencodingbitsRGB8->h.green2a = uiGreen2 >> 1;
-			m_pencodingbitsRGB8->h.green2b = uiGreen2;
-			m_pencodingbitsRGB8->h.blue2 = uiBlue2;
-
-			m_pencodingbitsRGB8->h.da = m_uiCW1 >> 2;
-			m_pencodingbitsRGB8->h.db = m_uiCW1 >> 1;
-		}
-
-		// in RGB8A1 encoding bits, opaque replaces differential
-		m_pencodingbitsRGB8->differential.diff = !m_boolPunchThroughPixels;
-
-		Block4x4Encoding_ETC1::SetEncodingBits_Selectors();
-
-		if (boolSwapColors)
-		{
-			m_pencodingbitsRGB8->h.selectors ^= 0x0000FFFF;
-		}
-
-		// create an invalid R differential to trigger T mode
-		m_pencodingbitsRGB8->h.detect1 = 0;
-		m_pencodingbitsRGB8->h.detect2 = 0;
-		m_pencodingbitsRGB8->h.detect3 = 0;
-		int iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2;
-		int iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2;
-		if (iRed2 < 0 || iRed2 > 31)
-		{
-			m_pencodingbitsRGB8->h.detect1 = 1;
-		}
-		if (iGreen2 >= 4)
-		{
-			m_pencodingbitsRGB8->h.detect2 = 7;
-			m_pencodingbitsRGB8->h.detect3 = 0;
-		}
-		else
-		{
-			m_pencodingbitsRGB8->h.detect2 = 0;
-			m_pencodingbitsRGB8->h.detect3 = 1;
-		}
-
-		if (SANITY_CHECK)
-		{
-			iRed2 = (int)m_pencodingbitsRGB8->differential.red1 + (int)m_pencodingbitsRGB8->differential.dred2;
-			iGreen2 = (int)m_pencodingbitsRGB8->differential.green1 + (int)m_pencodingbitsRGB8->differential.dgreen2;
-
-			// make sure red doesn't overflow and green does
-			assert(iRed2 >= 0 && iRed2 <= 31);
-			assert(iGreen2 < 0 || iGreen2 > 31);
-		}
-
-	}
-
-	// ####################################################################################################
-	// Block4x4Encoding_RGB8A1_Opaque
-	// ####################################################################################################
-
-	// ----------------------------------------------------------------------------------------------------
-	// perform a single encoding iteration
-	// replace the encoding if a better encoding was found
-	// subsequent iterations generally take longer for each iteration
-	// set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort
-	//
-	void Block4x4Encoding_RGB8A1_Opaque::PerformIteration(float a_fEffort)
-	{
-		assert(!m_boolPunchThroughPixels);
-		assert(!m_boolTransparent);
-		assert(!m_boolDone);
-
-		switch (m_uiEncodingIterations)
-		{
-		case 0:
-			PerformFirstIteration();
-			break;
-
-		case 1:
-			Block4x4Encoding_ETC1::TryDifferential(m_boolMostLikelyFlip, 1, 0, 0);
-			break;
-
-		case 2:
-			Block4x4Encoding_ETC1::TryDifferential(!m_boolMostLikelyFlip, 1, 0, 0);
-			break;
-
-		case 3:
-			Block4x4Encoding_RGB8::TryPlanar(1);
-			break;
-
-		case 4:
-			Block4x4Encoding_RGB8::TryTAndH(1);
-			if (a_fEffort <= 49.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 5:
-			Block4x4Encoding_ETC1::TryDegenerates1();
-			if (a_fEffort <= 59.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 6:
-			Block4x4Encoding_ETC1::TryDegenerates2();
-			if (a_fEffort <= 69.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 7:
-			Block4x4Encoding_ETC1::TryDegenerates3();
-			if (a_fEffort <= 79.5f)
-			{
-				m_boolDone = true;
-			}
-			break;
-
-		case 8:
-			Block4x4Encoding_ETC1::TryDegenerates4();
-			m_boolDone = true;
-			break;
-
-		default:
-			assert(0);
-			break;
-		}
-
-		m_uiEncodingIterations++;
-		SetDoneIfPerfect();
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// find best initial encoding to ensure block has a valid encoding
-	//
-	void Block4x4Encoding_RGB8A1_Opaque::PerformFirstIteration(void)
-	{
-		
-		// set decoded alphas
-		// calculate alpha error
-		m_fError = 0.0f;
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			m_afDecodedAlphas[uiPixel] = 1.0f;
-
-			float fDeltaA = 1.0f - m_pafrgbaSource[uiPixel].fA;
-			m_fError += fDeltaA * fDeltaA;
-		}
-
-		CalculateMostLikelyFlip();
-
-		m_fError = FLT_MAX;
-
-		Block4x4Encoding_ETC1::TryDifferential(m_boolMostLikelyFlip, 0, 0, 0);
-		SetDoneIfPerfect();
-		if (m_boolDone)
-		{
-			return;
-		}
-		Block4x4Encoding_ETC1::TryDifferential(!m_boolMostLikelyFlip, 0, 0, 0);
-		SetDoneIfPerfect();
-		if (m_boolDone)
-		{
-			return;
-		}
-		Block4x4Encoding_RGB8::TryPlanar(0);
-		SetDoneIfPerfect();
-		if (m_boolDone)
-		{
-			return;
-		}
-		Block4x4Encoding_RGB8::TryTAndH(0);
-		SetDoneIfPerfect();
-	}
-
-	// ####################################################################################################
-	// Block4x4Encoding_RGB8A1_Transparent
-	// ####################################################################################################
-
-	// ----------------------------------------------------------------------------------------------------
-	// perform a single encoding iteration
-	// replace the encoding if a better encoding was found
-	// subsequent iterations generally take longer for each iteration
-	// set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort
-	//
-	void Block4x4Encoding_RGB8A1_Transparent::PerformIteration(float )
-	{
-		assert(!m_boolOpaque);
-		assert(m_boolTransparent);
-		assert(!m_boolDone);
-		assert(m_uiEncodingIterations == 0);
-
-		m_mode = MODE_ETC1;
-		m_boolDiff = true;
-		m_boolFlip = false;
-
-		m_uiCW1 = 0;
-		m_uiCW2 = 0;
-
-		m_frgbaColor1 = ColorFloatRGBA();
-		m_frgbaColor2 = ColorFloatRGBA();
-
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			m_auiSelectors[uiPixel] = TRANSPARENT_SELECTOR;
-
-			m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA();
-			m_afDecodedAlphas[uiPixel] = 0.0f;
-		}
-
-		CalcBlockError();
-
-		m_boolDone = true;
-		m_uiEncodingIterations++;
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-}
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.h b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.h
deleted file mode 100644
index ff26e462f8..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcBlock4x4Encoding_RGB8.h"
-#include "EtcErrorMetric.h"
-#include "EtcBlock4x4EncodingBits.h"
-
-namespace Etc
-{
-
-	// ################################################################################
-	// Block4x4Encoding_RGB8A1
-	// RGB8A1 if not completely opaque or transparent
-	// ################################################################################
-
-	class Block4x4Encoding_RGB8A1 : public Block4x4Encoding_RGB8
-	{
-	public:
-
-		static const unsigned int TRANSPARENT_SELECTOR = 2;
-
-		Block4x4Encoding_RGB8A1(void);
-		virtual ~Block4x4Encoding_RGB8A1(void);
-
-		virtual void InitFromSource(Block4x4 *a_pblockParent,
-									ColorFloatRGBA *a_pafrgbaSource,
-									unsigned char *a_paucEncodingBits,
-									ErrorMetric a_errormetric);
-
-		virtual void InitFromEncodingBits(Block4x4 *a_pblockParent,
-											unsigned char *a_paucEncodingBits,
-											ColorFloatRGBA *a_pafrgbaSource,
-											ErrorMetric a_errormetric);
-
-		virtual void PerformIteration(float a_fEffort);
-
-		virtual void SetEncodingBits(void);
-
-		void InitFromEncodingBits_ETC1(Block4x4 *a_pblockParent,
-										unsigned char *a_paucEncodingBits,
-										ColorFloatRGBA *a_pafrgbaSource,
-										ErrorMetric a_errormetric);
-
-		void InitFromEncodingBits_T(void);
-		void InitFromEncodingBits_H(void);
-
-		void PerformFirstIteration(void);
-
-		void Decode_ETC1(void);
-		void DecodePixels_T(void);
-		void DecodePixels_H(void);
-		void SetEncodingBits_ETC1(void);
-		void SetEncodingBits_T(void);
-		void SetEncodingBits_H(void);
-
-	protected:
-
-		bool m_boolOpaque;				// all source pixels have alpha >= 0.5
-		bool m_boolTransparent;			// all source pixels have alpha < 0.5
-		bool m_boolPunchThroughPixels;	// some source pixels have alpha < 0.5
-
-		static float s_aafCwOpaqueUnsetTable[CW_RANGES][SELECTORS];
-
-	private:
-
-		void TryDifferential(bool a_boolFlip, unsigned int a_uiRadius,
-								int a_iGrayOffset1, int a_iGrayOffset2);
-		void TryDifferentialHalf(DifferentialTrys::Half *a_phalf);
-
-		void TryT(unsigned int a_uiRadius);
-		void TryT_BestSelectorCombination(void);
-		void TryH(unsigned int a_uiRadius);
-		void TryH_BestSelectorCombination(void);
-
-		void TryDegenerates1(void);
-		void TryDegenerates2(void);
-		void TryDegenerates3(void);
-		void TryDegenerates4(void);
-
-	};
-
-	// ################################################################################
-	// Block4x4Encoding_RGB8A1_Opaque
-	// RGB8A1 if all pixels have alpha==1
-	// ################################################################################
-
-	class Block4x4Encoding_RGB8A1_Opaque : public Block4x4Encoding_RGB8A1
-	{
-	public:
-
-		virtual void PerformIteration(float a_fEffort);
-
-		void PerformFirstIteration(void);
-
-	private:
-
-	};
-
-	// ################################################################################
-	// Block4x4Encoding_RGB8A1_Transparent
-	// RGB8A1 if all pixels have alpha==0
-	// ################################################################################
-
-	class Block4x4Encoding_RGB8A1_Transparent : public Block4x4Encoding_RGB8A1
-	{
-	public:
-
-		virtual void PerformIteration(float a_fEffort);
-
-	private:
-
-	};
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp
deleted file mode 100644
index 600c7ab405..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGBA8.cpp
+++ /dev/null
@@ -1,474 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-EtcBlock4x4Encoding_RGBA8.cpp contains:
-	Block4x4Encoding_RGBA8
-	Block4x4Encoding_RGBA8_Opaque
-	Block4x4Encoding_RGBA8_Transparent
-
-These encoders are used when targetting file format RGBA8.
-
-Block4x4Encoding_RGBA8_Opaque is used when all pixels in the 4x4 block are opaque
-Block4x4Encoding_RGBA8_Transparent is used when all pixels in the 4x4 block are transparent
-Block4x4Encoding_RGBA8 is used when there is a mixture of alphas in the 4x4 block
-
-*/
-
-#include "EtcConfig.h"
-#include "EtcBlock4x4Encoding_RGBA8.h"
-
-#include "EtcBlock4x4EncodingBits.h"
-#include "EtcBlock4x4.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <float.h>
-#include <limits>
-
-namespace Etc
-{
-
-	// ####################################################################################################
-	// Block4x4Encoding_RGBA8
-	// ####################################################################################################
-
-	float Block4x4Encoding_RGBA8::s_aafModifierTable[MODIFIER_TABLE_ENTRYS][ALPHA_SELECTORS]
-	{
-		{ -3.0f / 255.0f, -6.0f / 255.0f,  -9.0f / 255.0f, -15.0f / 255.0f, 2.0f / 255.0f, 5.0f / 255.0f, 8.0f / 255.0f, 14.0f / 255.0f },
-		{ -3.0f / 255.0f, -7.0f / 255.0f, -10.0f / 255.0f, -13.0f / 255.0f, 2.0f / 255.0f, 6.0f / 255.0f, 9.0f / 255.0f, 12.0f / 255.0f },
-		{ -2.0f / 255.0f, -5.0f / 255.0f,  -8.0f / 255.0f, -13.0f / 255.0f, 1.0f / 255.0f, 4.0f / 255.0f, 7.0f / 255.0f, 12.0f / 255.0f },
-		{ -2.0f / 255.0f, -4.0f / 255.0f,  -6.0f / 255.0f, -13.0f / 255.0f, 1.0f / 255.0f, 3.0f / 255.0f, 5.0f / 255.0f, 12.0f / 255.0f },
-
-		{ -3.0f / 255.0f, -6.0f / 255.0f,  -8.0f / 255.0f, -12.0f / 255.0f, 2.0f / 255.0f, 5.0f / 255.0f, 7.0f / 255.0f, 11.0f / 255.0f },
-		{ -3.0f / 255.0f, -7.0f / 255.0f,  -9.0f / 255.0f, -11.0f / 255.0f, 2.0f / 255.0f, 6.0f / 255.0f, 8.0f / 255.0f, 10.0f / 255.0f },
-		{ -4.0f / 255.0f, -7.0f / 255.0f,  -8.0f / 255.0f, -11.0f / 255.0f, 3.0f / 255.0f, 6.0f / 255.0f, 7.0f / 255.0f, 10.0f / 255.0f },
-		{ -3.0f / 255.0f, -5.0f / 255.0f,  -8.0f / 255.0f, -11.0f / 255.0f, 2.0f / 255.0f, 4.0f / 255.0f, 7.0f / 255.0f, 10.0f / 255.0f },
-
-		{ -2.0f / 255.0f, -6.0f / 255.0f,  -8.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 5.0f / 255.0f, 7.0f / 255.0f,  9.0f / 255.0f },
-		{ -2.0f / 255.0f, -5.0f / 255.0f,  -8.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 4.0f / 255.0f, 7.0f / 255.0f,  9.0f / 255.0f },
-		{ -2.0f / 255.0f, -4.0f / 255.0f,  -8.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 3.0f / 255.0f, 7.0f / 255.0f,  9.0f / 255.0f },
-		{ -2.0f / 255.0f, -5.0f / 255.0f,  -7.0f / 255.0f, -10.0f / 255.0f, 1.0f / 255.0f, 4.0f / 255.0f, 6.0f / 255.0f,  9.0f / 255.0f },
-
-		{ -3.0f / 255.0f, -4.0f / 255.0f,  -7.0f / 255.0f, -10.0f / 255.0f, 2.0f / 255.0f, 3.0f / 255.0f, 6.0f / 255.0f,  9.0f / 255.0f },
-		{ -1.0f / 255.0f, -2.0f / 255.0f,  -3.0f / 255.0f, -10.0f / 255.0f, 0.0f / 255.0f, 1.0f / 255.0f, 2.0f / 255.0f,  9.0f / 255.0f },
-		{ -4.0f / 255.0f, -6.0f / 255.0f,  -8.0f / 255.0f,  -9.0f / 255.0f, 3.0f / 255.0f, 5.0f / 255.0f, 7.0f / 255.0f,  8.0f / 255.0f },
-		{ -3.0f / 255.0f, -5.0f / 255.0f,  -7.0f / 255.0f,  -9.0f / 255.0f, 2.0f / 255.0f, 4.0f / 255.0f, 6.0f / 255.0f,  8.0f / 255.0f }
-	};
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	Block4x4Encoding_RGBA8::Block4x4Encoding_RGBA8(void)
-	{
-
-		m_pencodingbitsA8 = nullptr;
-
-	}
-	Block4x4Encoding_RGBA8::~Block4x4Encoding_RGBA8(void) {}
-	// ----------------------------------------------------------------------------------------------------
-	// initialization prior to encoding
-	// a_pblockParent points to the block associated with this encoding
-	// a_errormetric is used to choose the best encoding
-	// a_pafrgbaSource points to a 4x4 block subset of the source image
-	// a_paucEncodingBits points to the final encoding bits
-	//
-	void Block4x4Encoding_RGBA8::InitFromSource(Block4x4 *a_pblockParent,
-												ColorFloatRGBA *a_pafrgbaSource,
-												unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric)
-	{
-		Block4x4Encoding::Init(a_pblockParent, a_pafrgbaSource,a_errormetric);
-
-		m_pencodingbitsA8 = (Block4x4EncodingBits_A8 *)a_paucEncodingBits;
-		m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)(a_paucEncodingBits + sizeof(Block4x4EncodingBits_A8));
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// initialization from the encoding bits of a previous encoding
-	// a_pblockParent points to the block associated with this encoding
-	// a_errormetric is used to choose the best encoding
-	// a_pafrgbaSource points to a 4x4 block subset of the source image
-	// a_paucEncodingBits points to the final encoding bits of a previous encoding
-	//
-	void Block4x4Encoding_RGBA8::InitFromEncodingBits(Block4x4 *a_pblockParent,
-														unsigned char *a_paucEncodingBits,
-														ColorFloatRGBA *a_pafrgbaSource,
-														ErrorMetric a_errormetric)
-	{
-
-		m_pencodingbitsA8 = (Block4x4EncodingBits_A8 *)a_paucEncodingBits;
-		m_pencodingbitsRGB8 = (Block4x4EncodingBits_RGB8 *)(a_paucEncodingBits + sizeof(Block4x4EncodingBits_A8));
-
-		// init RGB portion
-		Block4x4Encoding_RGB8::InitFromEncodingBits(a_pblockParent,
-													(unsigned char *) m_pencodingbitsRGB8,
-													a_pafrgbaSource,
-													a_errormetric);
-
-		// init A8 portion
-		// has to be done after InitFromEncodingBits()
-		{
-			m_fBase = m_pencodingbitsA8->data.base / 255.0f;
-			m_fMultiplier = (float)m_pencodingbitsA8->data.multiplier;
-			m_uiModifierTableIndex = m_pencodingbitsA8->data.table;
-
-			unsigned long long int ulliSelectorBits = 0;
-			ulliSelectorBits |= (unsigned long long int)m_pencodingbitsA8->data.selectors0 << 40;
-			ulliSelectorBits |= (unsigned long long int)m_pencodingbitsA8->data.selectors1 << 32;
-			ulliSelectorBits |= (unsigned long long int)m_pencodingbitsA8->data.selectors2 << 24;
-			ulliSelectorBits |= (unsigned long long int)m_pencodingbitsA8->data.selectors3 << 16;
-			ulliSelectorBits |= (unsigned long long int)m_pencodingbitsA8->data.selectors4 << 8;
-			ulliSelectorBits |= (unsigned long long int)m_pencodingbitsA8->data.selectors5;
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				unsigned int uiShift = 45 - (3 * uiPixel);
-				m_auiAlphaSelectors[uiPixel] = (ulliSelectorBits >> uiShift) & (ALPHA_SELECTORS - 1);
-			}
-
-			// decode the alphas
-			// calc alpha error
-			m_fError = 0.0f;
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				m_afDecodedAlphas[uiPixel] = DecodePixelAlpha(m_fBase, m_fMultiplier,
-					m_uiModifierTableIndex,
-					m_auiAlphaSelectors[uiPixel]);
-
-				float fDeltaAlpha = m_afDecodedAlphas[uiPixel] - m_pafrgbaSource[uiPixel].fA;
-				m_fError += fDeltaAlpha * fDeltaAlpha;
-			}
-		}
-
-		// redo error calc to include alpha
-		CalcBlockError();
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// perform a single encoding iteration
-	// replace the encoding if a better encoding was found
-	// subsequent iterations generally take longer for each iteration
-	// set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort
-	//
-	// similar to Block4x4Encoding_RGB8_Base::Encode_RGB8(), but with alpha added
-	//
-	void Block4x4Encoding_RGBA8::PerformIteration(float a_fEffort)
-	{
-		assert(!m_boolDone);
-
-		if (m_uiEncodingIterations == 0)
-		{
-			if (a_fEffort < 24.9f)
-			{
-				CalculateA8(0.0f);
-			}
-			else if (a_fEffort < 49.9f)
-			{
-				CalculateA8(1.0f);
-			}
-			else
-			{
-				CalculateA8(2.0f);
-			}
-		}
-
-		Block4x4Encoding_RGB8::PerformIteration(a_fEffort);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// find the best combination of base alpga, multiplier and selectors
-	//
-	// a_fRadius limits the range of base alpha to try
-	//
-	void Block4x4Encoding_RGBA8::CalculateA8(float a_fRadius)
-	{
-
-		// find min/max alpha
-		float fMinAlpha = 1.0f;
-		float fMaxAlpha = 0.0f;
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			float fAlpha = m_pafrgbaSource[uiPixel].fA;
-
-			// ignore border pixels
-			if (isnan(fAlpha))
-			{
-				continue;
-			}
-
-			if (fAlpha < fMinAlpha)
-			{
-				fMinAlpha = fAlpha;
-			}
-			if (fAlpha > fMaxAlpha)
-			{
-				fMaxAlpha = fAlpha;
-			}
-		}
-		assert(fMinAlpha <= fMaxAlpha);
-
-		float fAlphaRange = fMaxAlpha - fMinAlpha;
-
-		// try each modifier table entry
-		m_fError = FLT_MAX;		// artificially high value
-		for (unsigned int uiTableEntry = 0; uiTableEntry < MODIFIER_TABLE_ENTRYS; uiTableEntry++)
-		{
-			static const unsigned int MIN_VALUE_SELECTOR = 3;
-			static const unsigned int MAX_VALUE_SELECTOR = 7;
-
-			float fTableEntryCenter = -s_aafModifierTable[uiTableEntry][MIN_VALUE_SELECTOR];
-
-			float fTableEntryRange = s_aafModifierTable[uiTableEntry][MAX_VALUE_SELECTOR] -
-				s_aafModifierTable[uiTableEntry][MIN_VALUE_SELECTOR];
-
-			float fCenterRatio = fTableEntryCenter / fTableEntryRange;
-
-			float fCenter = fMinAlpha + fCenterRatio*fAlphaRange;
-			fCenter = roundf(255.0f * fCenter) / 255.0f;
-
-			float fMinBase = fCenter - (a_fRadius / 255.0f);
-			if (fMinBase < 0.0f)
-			{
-				fMinBase = 0.0f;
-			}
-
-			float fMaxBase = fCenter + (a_fRadius / 255.0f);
-			if (fMaxBase > 1.0f)
-			{
-				fMaxBase = 1.0f;
-			}
-
-			for (float fBase = fMinBase; fBase <= fMaxBase; fBase += (0.999999f / 255.0f))
-			{
-
-				float fRangeMultiplier = roundf(fAlphaRange / fTableEntryRange);
-
-				float fMinMultiplier = fRangeMultiplier - a_fRadius;
-				if (fMinMultiplier < 1.0f)
-				{
-					fMinMultiplier = 1.0f;
-				}
-				else if (fMinMultiplier > 15.0f)
-				{
-					fMinMultiplier = 15.0f;
-				}
-
-				float fMaxMultiplier = fRangeMultiplier + a_fRadius;
-				if (fMaxMultiplier < 1.0f)
-				{
-					fMaxMultiplier = 1.0f;
-				}
-				else if (fMaxMultiplier > 15.0f)
-				{
-					fMaxMultiplier = 15.0f;
-				}
-
-				for (float fMultiplier = fMinMultiplier; fMultiplier <= fMaxMultiplier; fMultiplier += 1.0f)
-				{
-					// find best selector for each pixel
-					unsigned int auiBestSelectors[PIXELS];
-					float afBestAlphaError[PIXELS];
-					float afBestDecodedAlphas[PIXELS];
-					for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-					{
-						float fBestPixelAlphaError = FLT_MAX;
-						for (unsigned int uiSelector = 0; uiSelector < ALPHA_SELECTORS; uiSelector++)
-						{
-							float fDecodedAlpha = DecodePixelAlpha(fBase, fMultiplier, uiTableEntry, uiSelector);
-
-							// border pixels (NAN) should have zero error
-							float fPixelDeltaAlpha = isnan(m_pafrgbaSource[uiPixel].fA) ?
-															0.0f :
-															fDecodedAlpha - m_pafrgbaSource[uiPixel].fA;
-
-							float fPixelAlphaError = fPixelDeltaAlpha * fPixelDeltaAlpha;
-
-							if (fPixelAlphaError < fBestPixelAlphaError)
-							{
-								fBestPixelAlphaError = fPixelAlphaError;
-								auiBestSelectors[uiPixel] = uiSelector;
-								afBestAlphaError[uiPixel] = fBestPixelAlphaError;
-								afBestDecodedAlphas[uiPixel] = fDecodedAlpha;
-							}
-						}
-					}
-
-					float fBlockError = 0.0f;
-					for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-					{
-						fBlockError += afBestAlphaError[uiPixel];
-					}
-
-					if (fBlockError < m_fError)
-					{
-						m_fError = fBlockError;
-
-						m_fBase = fBase;
-						m_fMultiplier = fMultiplier;
-						m_uiModifierTableIndex = uiTableEntry;
-						for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-						{
-							m_auiAlphaSelectors[uiPixel] = auiBestSelectors[uiPixel];
-							m_afDecodedAlphas[uiPixel] = afBestDecodedAlphas[uiPixel];
-						}
-					}
-				}
-			}
-
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state
-	//
-	void Block4x4Encoding_RGBA8::SetEncodingBits(void)
-	{
-
-		// set the RGB8 portion
-		Block4x4Encoding_RGB8::SetEncodingBits();
-
-		// set the A8 portion
-		{
-			m_pencodingbitsA8->data.base = (unsigned char)roundf(255.0f * m_fBase);
-			m_pencodingbitsA8->data.table = m_uiModifierTableIndex;
-			m_pencodingbitsA8->data.multiplier = (unsigned char)roundf(m_fMultiplier);
-
-			unsigned long long int ulliSelectorBits = 0;
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				unsigned int uiShift = 45 - (3 * uiPixel);
-				ulliSelectorBits |= ((unsigned long long int)m_auiAlphaSelectors[uiPixel]) << uiShift;
-			}
-
-			m_pencodingbitsA8->data.selectors0 = ulliSelectorBits >> 40;
-			m_pencodingbitsA8->data.selectors1 = ulliSelectorBits >> 32;
-			m_pencodingbitsA8->data.selectors2 = ulliSelectorBits >> 24;
-			m_pencodingbitsA8->data.selectors3 = ulliSelectorBits >> 16;
-			m_pencodingbitsA8->data.selectors4 = ulliSelectorBits >> 8;
-			m_pencodingbitsA8->data.selectors5 = ulliSelectorBits;
-		}
-
-	}
-
-	// ####################################################################################################
-	// Block4x4Encoding_RGBA8_Opaque
-	// ####################################################################################################
-
-	// ----------------------------------------------------------------------------------------------------
-	// perform a single encoding iteration
-	// replace the encoding if a better encoding was found
-	// subsequent iterations generally take longer for each iteration
-	// set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort
-	//
-	void Block4x4Encoding_RGBA8_Opaque::PerformIteration(float a_fEffort)
-	{
-		assert(!m_boolDone);
-
-		if (m_uiEncodingIterations == 0)
-		{
-			m_fError = 0.0f;
-
-			for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-			{
-				m_afDecodedAlphas[uiPixel] = 1.0f;
-			}
-		}
-
-		Block4x4Encoding_RGB8::PerformIteration(a_fEffort);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state
-	//
-	void Block4x4Encoding_RGBA8_Opaque::SetEncodingBits(void)
-	{
-
-		// set the RGB8 portion
-		Block4x4Encoding_RGB8::SetEncodingBits();
-
-		// set the A8 portion
-		m_pencodingbitsA8->data.base = 255;
-		m_pencodingbitsA8->data.table = 15;
-		m_pencodingbitsA8->data.multiplier = 15;
-		m_pencodingbitsA8->data.selectors0 = 0xFF;
-		m_pencodingbitsA8->data.selectors1 = 0xFF;
-		m_pencodingbitsA8->data.selectors2 = 0xFF;
-		m_pencodingbitsA8->data.selectors3 = 0xFF;
-		m_pencodingbitsA8->data.selectors4 = 0xFF;
-		m_pencodingbitsA8->data.selectors5 = 0xFF;
-
-	}
-
-	// ####################################################################################################
-	// Block4x4Encoding_RGBA8_Transparent
-	// ####################################################################################################
-
-	// ----------------------------------------------------------------------------------------------------
-	// perform a single encoding iteration
-	// replace the encoding if a better encoding was found
-	// subsequent iterations generally take longer for each iteration
-	// set m_boolDone if encoding is perfect or encoding is finished based on a_fEffort
-	//
-	void Block4x4Encoding_RGBA8_Transparent::PerformIteration(float )
-	{
-		assert(!m_boolDone);
-		assert(m_uiEncodingIterations == 0);
-
-		m_mode = MODE_ETC1;
-		m_boolDiff = true;
-		m_boolFlip = false;
-
-		for (unsigned int uiPixel = 0; uiPixel < PIXELS; uiPixel++)
-		{
-			m_afrgbaDecodedColors[uiPixel] = ColorFloatRGBA();
-			m_afDecodedAlphas[uiPixel] = 0.0f;
-		}
-
-		m_fError = 0.0f;
-
-		m_boolDone = true;
-		m_uiEncodingIterations++;
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// set the encoding bits based on encoding state
-	//
-	void Block4x4Encoding_RGBA8_Transparent::SetEncodingBits(void)
-	{
-
-		Block4x4Encoding_RGB8::SetEncodingBits();
-
-		// set the A8 portion
-		m_pencodingbitsA8->data.base = 0;
-		m_pencodingbitsA8->data.table = 0;
-		m_pencodingbitsA8->data.multiplier = 1;
-		m_pencodingbitsA8->data.selectors0 = 0;
-		m_pencodingbitsA8->data.selectors1 = 0;
-		m_pencodingbitsA8->data.selectors2 = 0;
-		m_pencodingbitsA8->data.selectors3 = 0;
-		m_pencodingbitsA8->data.selectors4 = 0;
-		m_pencodingbitsA8->data.selectors5 = 0;
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-}
diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGBA8.h b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGBA8.h
deleted file mode 100644
index 5765d36b90..0000000000
--- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGBA8.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcBlock4x4Encoding_RGB8.h"
-
-namespace Etc
-{
-	class Block4x4EncodingBits_A8;
-
-	// ################################################################################
-	// Block4x4Encoding_RGBA8
-	// RGBA8 if not completely opaque or transparent
-	// ################################################################################
-
-	class Block4x4Encoding_RGBA8 : public Block4x4Encoding_RGB8
-	{
-	public:
-
-		Block4x4Encoding_RGBA8(void);
-		virtual ~Block4x4Encoding_RGBA8(void);
-
-		virtual void InitFromSource(Block4x4 *a_pblockParent,
-									ColorFloatRGBA *a_pafrgbaSource,
-									unsigned char *a_paucEncodingBits, ErrorMetric a_errormetric);
-
-		virtual void InitFromEncodingBits(Block4x4 *a_pblockParent,
-											unsigned char *a_paucEncodingBits,
-											ColorFloatRGBA *a_pafrgbaSource,
-											ErrorMetric a_errormetric);
-
-		virtual void PerformIteration(float a_fEffort);
-
-		virtual void SetEncodingBits(void);
-
-	protected:
-
-		static const unsigned int MODIFIER_TABLE_ENTRYS = 16;
-		static const unsigned int ALPHA_SELECTOR_BITS = 3;
-		static const unsigned int ALPHA_SELECTORS = 1 << ALPHA_SELECTOR_BITS;
-
-		static float s_aafModifierTable[MODIFIER_TABLE_ENTRYS][ALPHA_SELECTORS];
-
-		void CalculateA8(float a_fRadius);
-
-		Block4x4EncodingBits_A8 *m_pencodingbitsA8;	// A8 portion of Block4x4EncodingBits_RGBA8
-
-		float m_fBase;
-		float m_fMultiplier;
-		unsigned int m_uiModifierTableIndex;
-		unsigned int m_auiAlphaSelectors[PIXELS];
-
-	private:
-
-		inline float DecodePixelAlpha(float a_fBase, float a_fMultiplier,
-										unsigned int a_uiTableIndex, unsigned int a_uiSelector)
-		{
-			float fPixelAlpha = a_fBase + 
-								a_fMultiplier*s_aafModifierTable[a_uiTableIndex][a_uiSelector];
-			if (fPixelAlpha < 0.0f)
-			{
-				fPixelAlpha = 0.0f;
-			}
-			else if (fPixelAlpha > 1.0f)
-			{
-				fPixelAlpha = 1.0f;
-			}
-
-			return fPixelAlpha;
-		}
-
-	};
-
-	// ################################################################################
-	// Block4x4Encoding_RGBA8_Opaque
-	// RGBA8 if all pixels have alpha==1
-	// ################################################################################
-
-	class Block4x4Encoding_RGBA8_Opaque : public Block4x4Encoding_RGBA8
-	{
-	public:
-
-		virtual void PerformIteration(float a_fEffort);
-
-		virtual void SetEncodingBits(void);
-
-	};
-
-	// ################################################################################
-	// Block4x4Encoding_RGBA8_Transparent
-	// RGBA8 if all pixels have alpha==0
-	// ################################################################################
-
-	class Block4x4Encoding_RGBA8_Transparent : public Block4x4Encoding_RGBA8
-	{
-	public:
-
-		virtual void PerformIteration(float a_fEffort);
-
-		virtual void SetEncodingBits(void);
-
-	};
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcColor.h b/thirdparty/etc2comp/EtcColor.h
deleted file mode 100644
index 7ceae05b65..0000000000
--- a/thirdparty/etc2comp/EtcColor.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <math.h>
-
-namespace Etc
-{
-
-	inline float LogToLinear(float a_fLog)
-	{
-		static const float ALPHA = 0.055f;
-		static const float ONE_PLUS_ALPHA = 1.0f + ALPHA;
-
-		if (a_fLog <= 0.04045f)
-		{
-			return a_fLog / 12.92f;
-		}
-		else
-		{
-			return powf((a_fLog + ALPHA) / ONE_PLUS_ALPHA, 2.4f);
-		}
-	}
-
-	inline float LinearToLog(float &a_fLinear)
-	{
-		static const float ALPHA = 0.055f;
-		static const float ONE_PLUS_ALPHA = 1.0f + ALPHA;
-
-		if (a_fLinear <= 0.0031308f)
-		{
-			return 12.92f * a_fLinear;
-		}
-		else
-		{
-			return ONE_PLUS_ALPHA * powf(a_fLinear, (1.0f/2.4f)) - ALPHA;
-		}
-	}
-
-	class ColorR8G8B8A8
-	{
-	public:
-
-		unsigned char ucR;
-		unsigned char ucG;
-		unsigned char ucB;
-		unsigned char ucA;
-
-	};
-}
diff --git a/thirdparty/etc2comp/EtcColorFloatRGBA.h b/thirdparty/etc2comp/EtcColorFloatRGBA.h
deleted file mode 100644
index f2ca2c1f71..0000000000
--- a/thirdparty/etc2comp/EtcColorFloatRGBA.h
+++ /dev/null
@@ -1,321 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcConfig.h"
-#include "EtcColor.h"
-
-#include <math.h>
-
-namespace Etc
-{
-
-	class ColorFloatRGBA
-    {
-    public:
-
-		ColorFloatRGBA(void)
-        {
-            fR = fG = fB = fA = 0.0f;
-        }
-
-		ColorFloatRGBA(float a_fR, float a_fG, float a_fB, float a_fA)
-        {
-            fR = a_fR;
-            fG = a_fG;
-            fB = a_fB;
-            fA = a_fA;
-        }
-
-		inline ColorFloatRGBA operator+(ColorFloatRGBA& a_rfrgba)
-		{
-			ColorFloatRGBA frgba;
-			frgba.fR = fR + a_rfrgba.fR;
-			frgba.fG = fG + a_rfrgba.fG;
-			frgba.fB = fB + a_rfrgba.fB;
-			frgba.fA = fA + a_rfrgba.fA;
-			return frgba;
-		}
-
-		inline ColorFloatRGBA operator+(float a_f)
-		{
-			ColorFloatRGBA frgba;
-			frgba.fR = fR + a_f;
-			frgba.fG = fG + a_f;
-			frgba.fB = fB + a_f;
-			frgba.fA = fA;
-			return frgba;
-		}
-
-		inline ColorFloatRGBA operator-(float a_f)
-		{
-			ColorFloatRGBA frgba;
-			frgba.fR = fR - a_f;
-			frgba.fG = fG - a_f;
-			frgba.fB = fB - a_f;
-			frgba.fA = fA;
-			return frgba;
-		}
-
-		inline ColorFloatRGBA operator-(ColorFloatRGBA& a_rfrgba)
-		{
-			ColorFloatRGBA frgba;
-			frgba.fR = fR - a_rfrgba.fR;
-			frgba.fG = fG - a_rfrgba.fG;
-			frgba.fB = fB - a_rfrgba.fB;
-			frgba.fA = fA - a_rfrgba.fA;
-			return frgba;
-		}
-
-		inline ColorFloatRGBA operator*(float a_f)
-		{
-			ColorFloatRGBA frgba;
-			frgba.fR = fR * a_f;
-			frgba.fG = fG * a_f;
-			frgba.fB = fB * a_f;
-			frgba.fA = fA;
-
-			return frgba;
-		}
-
-		inline ColorFloatRGBA ScaleRGB(float a_f)
-		{
-			ColorFloatRGBA frgba;
-			frgba.fR = a_f * fR;
-			frgba.fG = a_f * fG;
-			frgba.fB = a_f * fB;
-			frgba.fA = fA;
-
-			return frgba;
-		}
-
-		inline ColorFloatRGBA RoundRGB(void)
-		{
-			ColorFloatRGBA frgba;
-			frgba.fR = roundf(fR);
-			frgba.fG = roundf(fG);
-			frgba.fB = roundf(fB);
-
-			return frgba;
-		}
-
-		inline ColorFloatRGBA ToLinear()
-		{
-			ColorFloatRGBA frgbaLinear;
-			frgbaLinear.fR = LogToLinear(fR);
-			frgbaLinear.fG = LogToLinear(fG);
-			frgbaLinear.fB = LogToLinear(fB);
-			frgbaLinear.fA = fA;
-
-			return frgbaLinear;
-		}
-
-		inline ColorFloatRGBA ToLog(void)
-		{
-			ColorFloatRGBA frgbaLog;
-			frgbaLog.fR = LinearToLog(fR);
-			frgbaLog.fG = LinearToLog(fG);
-			frgbaLog.fB = LinearToLog(fB);
-			frgbaLog.fA = fA;
-
-			return frgbaLog;
-		}
-
-		inline static ColorFloatRGBA ConvertFromRGBA8(unsigned char a_ucR, 
-			unsigned char a_ucG, unsigned char a_ucB, unsigned char a_ucA)
-		{
-			ColorFloatRGBA frgba;
-
-			frgba.fR = (float)a_ucR / 255.0f;
-			frgba.fG = (float)a_ucG / 255.0f;
-			frgba.fB = (float)a_ucB / 255.0f;
-			frgba.fA = (float)a_ucA / 255.0f;
-
-			return frgba;
-		}
-
-		inline static ColorFloatRGBA ConvertFromRGB4(unsigned char a_ucR4,
-														unsigned char a_ucG4,
-														unsigned char a_ucB4)
-		{
-			ColorFloatRGBA frgba;
-
-			unsigned char ucR8 = (unsigned char)((a_ucR4 << 4) + a_ucR4);
-			unsigned char ucG8 = (unsigned char)((a_ucG4 << 4) + a_ucG4);
-			unsigned char ucB8 = (unsigned char)((a_ucB4 << 4) + a_ucB4);
-
-			frgba.fR = (float)ucR8 / 255.0f;
-			frgba.fG = (float)ucG8 / 255.0f;
-			frgba.fB = (float)ucB8 / 255.0f;
-			frgba.fA = 1.0f;
-
-			return frgba;
-		}
-
-		inline static ColorFloatRGBA ConvertFromRGB5(unsigned char a_ucR5,
-			unsigned char a_ucG5,
-			unsigned char a_ucB5)
-		{
-			ColorFloatRGBA frgba;
-
-			unsigned char ucR8 = (unsigned char)((a_ucR5 << 3) + (a_ucR5 >> 2));
-			unsigned char ucG8 = (unsigned char)((a_ucG5 << 3) + (a_ucG5 >> 2));
-			unsigned char ucB8 = (unsigned char)((a_ucB5 << 3) + (a_ucB5 >> 2));
-
-			frgba.fR = (float)ucR8 / 255.0f;
-			frgba.fG = (float)ucG8 / 255.0f;
-			frgba.fB = (float)ucB8 / 255.0f;
-			frgba.fA = 1.0f;
-
-			return frgba;
-		}
-
-		inline static ColorFloatRGBA ConvertFromR6G7B6(unsigned char a_ucR6,
-			unsigned char a_ucG7,
-			unsigned char a_ucB6)
-		{
-			ColorFloatRGBA frgba;
-
-			unsigned char ucR8 = (unsigned char)((a_ucR6 << 2) + (a_ucR6 >> 4));
-			unsigned char ucG8 = (unsigned char)((a_ucG7 << 1) + (a_ucG7 >> 6));
-			unsigned char ucB8 = (unsigned char)((a_ucB6 << 2) + (a_ucB6 >> 4));
-
-			frgba.fR = (float)ucR8 / 255.0f;
-			frgba.fG = (float)ucG8 / 255.0f;
-			frgba.fB = (float)ucB8 / 255.0f;
-			frgba.fA = 1.0f;
-
-			return frgba;
-		}
-
-		// quantize to 4 bits, expand to 8 bits
-		inline ColorFloatRGBA QuantizeR4G4B4(void) const
-		{
-			ColorFloatRGBA frgba = *this;
-
-			// quantize to 4 bits
-			frgba = frgba.ClampRGB().ScaleRGB(15.0f).RoundRGB();
-			unsigned int uiR4 = (unsigned int)frgba.fR;
-			unsigned int uiG4 = (unsigned int)frgba.fG;
-			unsigned int uiB4 = (unsigned int)frgba.fB;
-
-			// expand to 8 bits
-			frgba.fR = (float) ((uiR4 << 4) + uiR4);
-			frgba.fG = (float) ((uiG4 << 4) + uiG4);
-			frgba.fB = (float) ((uiB4 << 4) + uiB4);
-
-			frgba = frgba.ScaleRGB(1.0f/255.0f);
-
-			return frgba;
-		}
-
-		// quantize to 5 bits, expand to 8 bits
-		inline ColorFloatRGBA QuantizeR5G5B5(void) const
-		{
-			ColorFloatRGBA frgba = *this;
-
-			// quantize to 5 bits
-			frgba = frgba.ClampRGB().ScaleRGB(31.0f).RoundRGB();
-			unsigned int uiR5 = (unsigned int)frgba.fR;
-			unsigned int uiG5 = (unsigned int)frgba.fG;
-			unsigned int uiB5 = (unsigned int)frgba.fB;
-
-			// expand to 8 bits
-			frgba.fR = (float)((uiR5 << 3) + (uiR5 >> 2));
-			frgba.fG = (float)((uiG5 << 3) + (uiG5 >> 2));
-			frgba.fB = (float)((uiB5 << 3) + (uiB5 >> 2));
-
-			frgba = frgba.ScaleRGB(1.0f / 255.0f);
-
-			return frgba;
-		}
-
-		// quantize to 6/7/6 bits, expand to 8 bits
-		inline ColorFloatRGBA QuantizeR6G7B6(void) const
-		{
-			ColorFloatRGBA frgba = *this;
-
-			// quantize to 6/7/6 bits
-			ColorFloatRGBA frgba6 = frgba.ClampRGB().ScaleRGB(63.0f).RoundRGB();
-			ColorFloatRGBA frgba7 = frgba.ClampRGB().ScaleRGB(127.0f).RoundRGB();
-			unsigned int uiR6 = (unsigned int)frgba6.fR;
-			unsigned int uiG7 = (unsigned int)frgba7.fG;
-			unsigned int uiB6 = (unsigned int)frgba6.fB;
-
-			// expand to 8 bits
-			frgba.fR = (float)((uiR6 << 2) + (uiR6 >> 4));
-			frgba.fG = (float)((uiG7 << 1) + (uiG7 >> 6));
-			frgba.fB = (float)((uiB6 << 2) + (uiB6 >> 4));
-
-			frgba = frgba.ScaleRGB(1.0f / 255.0f);
-
-			return frgba;
-		}
-
-		inline ColorFloatRGBA ClampRGB(void)
-		{
-			ColorFloatRGBA frgba = *this;
-			if (frgba.fR < 0.0f) { frgba.fR = 0.0f; }
-			if (frgba.fR > 1.0f) { frgba.fR = 1.0f; }
-			if (frgba.fG < 0.0f) { frgba.fG = 0.0f; }
-			if (frgba.fG > 1.0f) { frgba.fG = 1.0f; }
-			if (frgba.fB < 0.0f) { frgba.fB = 0.0f; }
-			if (frgba.fB > 1.0f) { frgba.fB = 1.0f; }
-
-			return frgba;
-		}
-
-		inline ColorFloatRGBA ClampRGBA(void)
-		{
-			ColorFloatRGBA frgba = *this;
-			if (frgba.fR < 0.0f) { frgba.fR = 0.0f; }
-			if (frgba.fR > 1.0f) { frgba.fR = 1.0f; }
-			if (frgba.fG < 0.0f) { frgba.fG = 0.0f; }
-			if (frgba.fG > 1.0f) { frgba.fG = 1.0f; }
-			if (frgba.fB < 0.0f) { frgba.fB = 0.0f; }
-			if (frgba.fB > 1.0f) { frgba.fB = 1.0f; }
-			if (frgba.fA < 0.0f) { frgba.fA = 0.0f; }
-			if (frgba.fA > 1.0f) { frgba.fA = 1.0f; }
-
-			return frgba;
-		}
-
-		inline int IntRed(float a_fScale)
-		{
-			return (int)roundf(fR * a_fScale);
-		}
-
-		inline int IntGreen(float a_fScale)
-		{
-			return (int)roundf(fG * a_fScale);
-		}
-
-		inline int IntBlue(float a_fScale)
-		{
-			return (int)roundf(fB * a_fScale);
-		}
-
-		inline int IntAlpha(float a_fScale)
-		{
-			return (int)roundf(fA * a_fScale);
-		}
-
-		float	fR, fG, fB, fA;
-    };
-
-}
-
diff --git a/thirdparty/etc2comp/EtcConfig.h b/thirdparty/etc2comp/EtcConfig.h
deleted file mode 100644
index 3bfe1d99a8..0000000000
--- a/thirdparty/etc2comp/EtcConfig.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#ifdef _WIN32
-#define ETC_WINDOWS (1)
-#else
-#define ETC_WINDOWS (0)
-#endif
-
-#if __APPLE__
-#define ETC_OSX (1)
-#else
-#define ETC_OSX (0)
-#endif
-
-#if __unix__
-#define ETC_UNIX (1)
-#else
-#define ETC_UNIX (0)
-#endif
-
-
-// short names for common types
-#include <stdint.h>
-typedef int8_t i8;
-typedef int16_t i16;
-typedef int32_t i32;
-typedef int64_t i64;
-
-typedef uint8_t u8;
-typedef uint16_t u16;
-typedef uint32_t u32;
-typedef uint64_t u64;
-
-typedef float	f32;
-typedef double	f64;
-
-// Keep asserts enabled in release builds during development
-#undef NDEBUG
-
-// 0=disable. stb_image can be used if you need to compress
-//other image formats like jpg
-#define USE_STB_IMAGE_LOAD 0
-
-#if ETC_WINDOWS
-#include <sdkddkver.h>
-#define _CRT_SECURE_NO_WARNINGS (1)
-#include <tchar.h>
-#endif
-
-#include <stdio.h>
-
diff --git a/thirdparty/etc2comp/EtcDifferentialTrys.cpp b/thirdparty/etc2comp/EtcDifferentialTrys.cpp
deleted file mode 100644
index ef4cd103d9..0000000000
--- a/thirdparty/etc2comp/EtcDifferentialTrys.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-EtcDifferentialTrys.cpp
-
-Gathers the results of the various encoding trys for both halves of a 4x4 block for Differential mode
-
-*/
-
-#include "EtcConfig.h"
-#include "EtcDifferentialTrys.h"
-
-#include <assert.h>
-
-namespace Etc
-{
-
-	// ----------------------------------------------------------------------------------------------------
-	// construct a list of trys (encoding attempts)
-	//
-	// a_frgbaColor1 is the basecolor for the first half
-	// a_frgbaColor2 is the basecolor for the second half
-	// a_pauiPixelMapping1 is the pixel order for the first half
-	// a_pauiPixelMapping2 is the pixel order for the second half
-	// a_uiRadius is the amount to vary the base colors
-	//
-	DifferentialTrys::DifferentialTrys(ColorFloatRGBA a_frgbaColor1, ColorFloatRGBA a_frgbaColor2,
-										const unsigned int *a_pauiPixelMapping1,
-										const unsigned int *a_pauiPixelMapping2,
-										unsigned int a_uiRadius,
-										int a_iGrayOffset1, int a_iGrayOffset2)
-	{
-		assert(a_uiRadius <= MAX_RADIUS);
-
-		m_boolSeverelyBentColors = false;
-
-		ColorFloatRGBA frgbaQuantizedColor1 = a_frgbaColor1.QuantizeR5G5B5();
-		ColorFloatRGBA frgbaQuantizedColor2 = a_frgbaColor2.QuantizeR5G5B5();
-
-		// quantize base colors
-		// ensure that trys with a_uiRadius don't overflow
-		int iRed1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntRed(31.0f)+a_iGrayOffset1, a_uiRadius);
-		int iGreen1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntGreen(31.0f) + a_iGrayOffset1, a_uiRadius);
-		int iBlue1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntBlue(31.0f) + a_iGrayOffset1, a_uiRadius);
-		int iRed2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntRed(31.0f) + a_iGrayOffset2, a_uiRadius);
-		int iGreen2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntGreen(31.0f) + a_iGrayOffset2, a_uiRadius);
-		int iBlue2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntBlue(31.0f) + a_iGrayOffset2, a_uiRadius);
-
-		int iDeltaRed = iRed2 - iRed1;
-		int iDeltaGreen = iGreen2 - iGreen1;
-		int iDeltaBlue = iBlue2 - iBlue1;
-
-		// make sure components are within range
-		{
-			if (iDeltaRed > 3)
-			{
-				if (iDeltaRed > 7)
-				{
-					m_boolSeverelyBentColors = true;
-				}
-
-				iRed1 += (iDeltaRed - 3) / 2;
-				iRed2 = iRed1 + 3;
-				iDeltaRed = 3;
-			}
-			else if (iDeltaRed < -4)
-			{
-				if (iDeltaRed < -8)
-				{
-					m_boolSeverelyBentColors = true;
-				}
-
-				iRed1 += (iDeltaRed + 4) / 2;
-				iRed2 = iRed1 - 4;
-				iDeltaRed = -4;
-			}
-			assert(iRed1 >= (signed)(0 + a_uiRadius) && iRed1 <= (signed)(31 - a_uiRadius));
-			assert(iRed2 >= (signed)(0 + a_uiRadius) && iRed2 <= (signed)(31 - a_uiRadius));
-			assert(iDeltaRed >= -4 && iDeltaRed <= 3);
-
-			if (iDeltaGreen > 3)
-			{
-				if (iDeltaGreen > 7)
-				{
-					m_boolSeverelyBentColors = true;
-				}
-
-				iGreen1 += (iDeltaGreen - 3) / 2;
-				iGreen2 = iGreen1 + 3;
-				iDeltaGreen = 3;
-			}
-			else if (iDeltaGreen < -4)
-			{
-				if (iDeltaGreen < -8)
-				{
-					m_boolSeverelyBentColors = true;
-				}
-
-				iGreen1 += (iDeltaGreen + 4) / 2;
-				iGreen2 = iGreen1 - 4;
-				iDeltaGreen = -4;
-			}
-			assert(iGreen1 >= (signed)(0 + a_uiRadius) && iGreen1 <= (signed)(31 - a_uiRadius));
-			assert(iGreen2 >= (signed)(0 + a_uiRadius) && iGreen2 <= (signed)(31 - a_uiRadius));
-			assert(iDeltaGreen >= -4 && iDeltaGreen <= 3);
-
-			if (iDeltaBlue > 3)
-			{
-				if (iDeltaBlue > 7)
-				{
-					m_boolSeverelyBentColors = true;
-				}
-
-				iBlue1 += (iDeltaBlue - 3) / 2;
-				iBlue2 = iBlue1 + 3;
-				iDeltaBlue = 3;
-			}
-			else if (iDeltaBlue < -4)
-			{
-				if (iDeltaBlue < -8)
-				{
-					m_boolSeverelyBentColors = true;
-				}
-
-				iBlue1 += (iDeltaBlue + 4) / 2;
-				iBlue2 = iBlue1 - 4;
-				iDeltaBlue = -4;
-			}
-			assert(iBlue1 >= (signed)(0+a_uiRadius) && iBlue1 <= (signed)(31 - a_uiRadius));
-			assert(iBlue2 >= (signed)(0 + a_uiRadius) && iBlue2 <= (signed)(31 - a_uiRadius));
-			assert(iDeltaBlue >= -4 && iDeltaBlue <= 3);
-		}
-
-		m_half1.Init(iRed1, iGreen1, iBlue1, a_pauiPixelMapping1, a_uiRadius);
-		m_half2.Init(iRed2, iGreen2, iBlue2, a_pauiPixelMapping2, a_uiRadius);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	void DifferentialTrys::Half::Init(int a_iRed, int a_iGreen, int a_iBlue, 
-										const unsigned int *a_pauiPixelMapping, unsigned int a_uiRadius)
-	{
-
-		m_iRed = a_iRed;
-		m_iGreen = a_iGreen;
-		m_iBlue = a_iBlue;
-
-		m_pauiPixelMapping = a_pauiPixelMapping;
-		m_uiRadius = a_uiRadius;
-
-		m_uiTrys = 0;
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcDifferentialTrys.h b/thirdparty/etc2comp/EtcDifferentialTrys.h
deleted file mode 100644
index 71860908ff..0000000000
--- a/thirdparty/etc2comp/EtcDifferentialTrys.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcColorFloatRGBA.h"
-
-namespace Etc
-{
-
-	class DifferentialTrys
-	{
-	public:
-
-		static const unsigned int MAX_RADIUS = 2;
-
-		DifferentialTrys(ColorFloatRGBA a_frgbaColor1,
-							ColorFloatRGBA a_frgbaColor2,
-							const unsigned int *a_pauiPixelMapping1,
-							const unsigned int *a_pauiPixelMapping2,
-							unsigned int a_uiRadius,
-							int a_iGrayOffset1, int a_iGrayOffset2);
-
-		inline static int MoveAwayFromEdge(int a_i, int a_iDistance)
-		{
-			if (a_i < (0+ a_iDistance))
-			{
-				return (0 + a_iDistance);
-			}
-			else if (a_i > (31- a_iDistance))
-			{
-				return (31 - a_iDistance);
-			}
-
-			return a_i;
-		}
-
-		class Try
-		{
-        public :
-			static const unsigned int SELECTORS = 8;	// per half
-
-			int m_iRed;
-			int m_iGreen;
-			int m_iBlue;
-			unsigned int m_uiCW;
-			unsigned int m_auiSelectors[SELECTORS];
-			float m_fError;
-        };
-
-		class Half
-		{
-		public:
-
-			static const unsigned int MAX_TRYS = 125;
-
-			void Init(int a_iRed, int a_iGreen, int a_iBlue, 
-						const unsigned int *a_pauiPixelMapping,
-						unsigned int a_uiRadius);
-
-			// center of trys
-			int m_iRed;
-			int m_iGreen;
-			int m_iBlue;
-
-			const unsigned int *m_pauiPixelMapping;
-			unsigned int m_uiRadius;
-
-			unsigned int m_uiTrys;
-			Try m_atry[MAX_TRYS];
-
-			Try *m_ptryBest;
-		};
-
-		Half m_half1;
-		Half m_half2;
-
-		bool m_boolSeverelyBentColors;
-	};
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcErrorMetric.h b/thirdparty/etc2comp/EtcErrorMetric.h
deleted file mode 100644
index df4dcab4fb..0000000000
--- a/thirdparty/etc2comp/EtcErrorMetric.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-namespace Etc
-{
-
-	enum ErrorMetric
-	{
-		RGBA,
-		RGBX,
-		REC709,
-		NUMERIC,
-		NORMALXYZ,
-		//
-		ERROR_METRICS,
-		//
-		BT709 = REC709
-	};
-
-	inline const char *ErrorMetricToString(ErrorMetric errorMetric)
-	{
-		switch (errorMetric)
-		{
-		case RGBA:
-			return "RGBA";
-		case RGBX:
-			return "RGBX";
-		case REC709:
-			return "REC709";
-		case NUMERIC:
-			return "NUMERIC";
-		case NORMALXYZ:
-			return "NORMALXYZ";
-		case ERROR_METRICS:
-		default:
-			return "UNKNOWN";
-		}
-	}
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcFile.cpp b/thirdparty/etc2comp/EtcFile.cpp
deleted file mode 100644
index 831a3aac45..0000000000
--- a/thirdparty/etc2comp/EtcFile.cpp
+++ /dev/null
@@ -1,390 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef _WIN32
-#define _CRT_SECURE_NO_WARNINGS (1)
-#endif
-
-#include "EtcConfig.h"
-
-
-#include "EtcFile.h"
-
-#include "EtcFileHeader.h"
-#include "EtcColor.h"
-#include "Etc.h"
-#include "EtcBlock4x4EncodingBits.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <stdlib.h>
-
-using namespace Etc;
-
-// ----------------------------------------------------------------------------------------------------
-//
-File::File(const char *a_pstrFilename, Format a_fileformat, Image::Format a_imageformat,
-			unsigned char *a_paucEncodingBits, unsigned int a_uiEncodingBitsBytes,
-			unsigned int a_uiSourceWidth, unsigned int a_uiSourceHeight,
-			unsigned int a_uiExtendedWidth, unsigned int a_uiExtendedHeight)
-{
-	if (a_pstrFilename == nullptr)
-	{
-		m_pstrFilename = const_cast<char *>("");
-	}
-	else
-	{
-		m_pstrFilename = new char[strlen(a_pstrFilename) + 1];
-		strcpy(m_pstrFilename, a_pstrFilename);
-	}
-
-	m_fileformat = a_fileformat;
-	if (m_fileformat == Format::INFER_FROM_FILE_EXTENSION)
-	{
-		// ***** TODO: add this later *****
-		m_fileformat = Format::KTX;
-	}
-
-	m_imageformat = a_imageformat;
-
-	m_uiNumMipmaps = 1;
-	m_pMipmapImages = new RawImage[m_uiNumMipmaps];
-	m_pMipmapImages[0].paucEncodingBits = std::shared_ptr<unsigned char>(a_paucEncodingBits, [](unsigned char *p) { delete[] p; } );
-	m_pMipmapImages[0].uiEncodingBitsBytes = a_uiEncodingBitsBytes;
-	m_pMipmapImages[0].uiExtendedWidth = a_uiExtendedWidth;
-	m_pMipmapImages[0].uiExtendedHeight = a_uiExtendedHeight;
-
-	m_uiSourceWidth = a_uiSourceWidth;
-	m_uiSourceHeight = a_uiSourceHeight;
-
-	switch (m_fileformat)
-	{
-	case Format::PKM:
-		m_pheader = new FileHeader_Pkm(this);
-		break;
-
-	case Format::KTX:
-		m_pheader = new FileHeader_Ktx(this);
-		break;
-
-	default:
-		assert(0);
-		break;
-	}
-
-}
-
-// ----------------------------------------------------------------------------------------------------
-//
-File::File(const char *a_pstrFilename, Format a_fileformat, Image::Format a_imageformat,
-	unsigned int a_uiNumMipmaps, RawImage *a_pMipmapImages,
-	unsigned int a_uiSourceWidth, unsigned int a_uiSourceHeight)
-{
-	if (a_pstrFilename == nullptr)
-	{
-		m_pstrFilename = const_cast<char *>("");
-	}
-	else
-	{
-		m_pstrFilename = new char[strlen(a_pstrFilename) + 1];
-		strcpy(m_pstrFilename, a_pstrFilename);
-	}
-
-	m_fileformat = a_fileformat;
-	if (m_fileformat == Format::INFER_FROM_FILE_EXTENSION)
-	{
-		// ***** TODO: add this later *****
-		m_fileformat = Format::KTX;
-	}
-
-	m_imageformat = a_imageformat;
-
-	m_uiNumMipmaps = a_uiNumMipmaps;
-	m_pMipmapImages = new RawImage[m_uiNumMipmaps];
-
-	for(unsigned int mip = 0; mip < m_uiNumMipmaps; mip++)
-	{
-		m_pMipmapImages[mip] = a_pMipmapImages[mip];
-	}
-
-	m_uiSourceWidth = a_uiSourceWidth;
-	m_uiSourceHeight = a_uiSourceHeight;
-
-	switch (m_fileformat)
-	{
-	case Format::PKM:
-		m_pheader = new FileHeader_Pkm(this);
-		break;
-
-	case Format::KTX:
-		m_pheader = new FileHeader_Ktx(this);
-		break;
-
-	default:
-		assert(0);
-		break;
-	}
-
-}
-
-// ----------------------------------------------------------------------------------------------------
-//
-File::File(const char *a_pstrFilename, Format a_fileformat)
-{
-	if (a_pstrFilename == nullptr)
-	{
-		return;
-	}
-	else
-	{
-		m_pstrFilename = new char[strlen(a_pstrFilename) + 1];
-		strcpy(m_pstrFilename, a_pstrFilename);
-	}
-
-	m_fileformat = a_fileformat;
-	if (m_fileformat == Format::INFER_FROM_FILE_EXTENSION)
-	{
-		// ***** TODO: add this later *****
-		m_fileformat = Format::KTX;
-	}
-
-	FILE *pfile = fopen(m_pstrFilename, "rb");
-	if (pfile == nullptr)
-	{
-		printf("ERROR: Couldn't open %s", m_pstrFilename);
-		exit(1);
-	}
-	fseek(pfile, 0, SEEK_END);
-	unsigned int fileSize = ftell(pfile);
-	fseek(pfile, 0, SEEK_SET);
-	size_t szResult;
-
-	m_pheader = new FileHeader_Ktx(this);
-	szResult = fread( ((FileHeader_Ktx*)m_pheader)->GetData(), 1, sizeof(FileHeader_Ktx::Data), pfile);
-	assert(szResult > 0);
-
-	m_uiNumMipmaps = 1;
-	m_pMipmapImages = new RawImage[m_uiNumMipmaps];
-
-	if (((FileHeader_Ktx*)m_pheader)->GetData()->m_u32BytesOfKeyValueData > 0)
-		fseek(pfile, ((FileHeader_Ktx*)m_pheader)->GetData()->m_u32BytesOfKeyValueData, SEEK_CUR);
-	szResult = fread(&m_pMipmapImages->uiEncodingBitsBytes, 1, sizeof(unsigned int), pfile);
-	assert(szResult > 0);
-
-	m_pMipmapImages->paucEncodingBits = std::shared_ptr<unsigned char>(new unsigned char[m_pMipmapImages->uiEncodingBitsBytes], [](unsigned char *p) { delete[] p; } );
-	assert(ftell(pfile) + m_pMipmapImages->uiEncodingBitsBytes <= fileSize);
-	szResult = fread(m_pMipmapImages->paucEncodingBits.get(), 1, m_pMipmapImages->uiEncodingBitsBytes, pfile);
-	assert(szResult == m_pMipmapImages->uiEncodingBitsBytes);
-
-	uint32_t uiInternalFormat = ((FileHeader_Ktx*)m_pheader)->GetData()->m_u32GlInternalFormat;
-	uint32_t uiBaseInternalFormat = ((FileHeader_Ktx*)m_pheader)->GetData()->m_u32GlBaseInternalFormat;
-	
-	if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC1_RGB8 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC1_RGB8)
-	{
-		m_imageformat = Image::Format::ETC1;
-	}
-	else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_RGB8 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_RGB8)
-	{
-		m_imageformat = Image::Format::RGB8;
-	}
-	else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_RGB8A1 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_RGB8A1)
-	{
-		m_imageformat = Image::Format::RGB8A1;
-	}
-	else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_RGBA8 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_RGBA8)
-	{
-		m_imageformat = Image::Format::RGBA8;
-	}
-	else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_R11 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_R11)
-	{
-		m_imageformat = Image::Format::R11;
-	}
-	else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_SIGNED_R11 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_R11)
-	{
-		m_imageformat = Image::Format::SIGNED_R11;
-	}
-	else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_RG11 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_RG11)
-	{
-		m_imageformat = Image::Format::RG11;
-	}
-	else if (uiInternalFormat == (uint32_t)FileHeader_Ktx::InternalFormat::ETC2_SIGNED_RG11 && uiBaseInternalFormat == (uint32_t)FileHeader_Ktx::BaseInternalFormat::ETC2_RG11)
-	{
-		m_imageformat = Image::Format::SIGNED_RG11;
-	}
-	else
-	{
-		m_imageformat = Image::Format::UNKNOWN;
-	}
-
-	m_uiSourceWidth = ((FileHeader_Ktx*)m_pheader)->GetData()->m_u32PixelWidth;
-	m_uiSourceHeight = ((FileHeader_Ktx*)m_pheader)->GetData()->m_u32PixelHeight;
-	m_pMipmapImages->uiExtendedWidth = Image::CalcExtendedDimension((unsigned short)m_uiSourceWidth);
-	m_pMipmapImages->uiExtendedHeight = Image::CalcExtendedDimension((unsigned short)m_uiSourceHeight);
-
-	unsigned int uiBlocks = m_pMipmapImages->uiExtendedWidth * m_pMipmapImages->uiExtendedHeight / 16;
-	Block4x4EncodingBits::Format encodingbitsformat = Image::DetermineEncodingBitsFormat(m_imageformat);
-	unsigned int expectedbytes = uiBlocks * Block4x4EncodingBits::GetBytesPerBlock(encodingbitsformat);
-	assert(expectedbytes == m_pMipmapImages->uiEncodingBitsBytes);
-
-	fclose(pfile);
-}
-
-File::~File()
-{
-	if (m_pMipmapImages != nullptr)
-	{
-		delete [] m_pMipmapImages;
-	}
-
-	if(m_pstrFilename != nullptr)
-	{
-		delete[] m_pstrFilename;
-		m_pstrFilename = nullptr;
-	}
-	if (m_pheader != nullptr)
-	{
-		delete m_pheader;
-		m_pheader = nullptr;
-	}
-}
-
-void File::UseSingleBlock(int a_iPixelX, int a_iPixelY)
-{
-	if (a_iPixelX <= -1 || a_iPixelY <= -1)
-		return;
-	if (a_iPixelX >(int) m_uiSourceWidth)
-	{
-		//if we are using a ktx thats the size of a single block or less
-		//then make sure we use the 4x4 image as the single block
-		if (m_uiSourceWidth <= 4)
-		{
-			a_iPixelX = 0;
-		}
-		else
-		{
-			printf("blockAtHV: H coordinate out of range, capped to image width\n");
-			a_iPixelX = m_uiSourceWidth - 1;
-		}
-	}
-	if (a_iPixelY >(int) m_uiSourceHeight)
-	{
-		//if we are using a ktx thats the size of a single block or less
-		//then make sure we use the 4x4 image as the single block
-		if (m_uiSourceHeight <= 4)
-		{
-			a_iPixelY= 0;
-		}
-		else
-		{
-			printf("blockAtHV: V coordinate out of range, capped to image height\n");
-			a_iPixelY = m_uiSourceHeight - 1;
-		}
-	}
-
-	unsigned int origWidth = m_uiSourceWidth;
-	unsigned int origHeight = m_uiSourceHeight;
-
-	m_uiSourceWidth = 4;
-	m_uiSourceHeight = 4;
-
-	Block4x4EncodingBits::Format encodingbitsformat = Image::DetermineEncodingBitsFormat(m_imageformat);
-	unsigned int uiEncodingBitsBytesPerBlock = Block4x4EncodingBits::GetBytesPerBlock(encodingbitsformat);
-
-	int numMipmaps = 1;
-	RawImage* pMipmapImages = new RawImage[numMipmaps];
-	pMipmapImages[0].uiExtendedWidth = Image::CalcExtendedDimension((unsigned short)m_uiSourceWidth);
-	pMipmapImages[0].uiExtendedHeight = Image::CalcExtendedDimension((unsigned short)m_uiSourceHeight);
-	pMipmapImages[0].uiEncodingBitsBytes = 0;
-	pMipmapImages[0].paucEncodingBits = std::shared_ptr<unsigned char>(new unsigned char[uiEncodingBitsBytesPerBlock], [](unsigned char *p) { delete[] p; });
-
-	//block position in pixels
-	// remove the bottom 2 bits to get the block coordinates 
-	unsigned int iBlockPosX = (a_iPixelX & 0xFFFFFFFC);
-	unsigned int iBlockPosY = (a_iPixelY & 0xFFFFFFFC);
-
-	int numXBlocks = (origWidth / 4);
-	int numYBlocks = (origHeight / 4);
-	
-
-	// block location 
-	//int iBlockX = (a_iPixelX % 4) == 0 ? a_iPixelX / 4.0f : (a_iPixelX / 4) + 1;
-	//int iBlockY = (a_iPixelY % 4) == 0 ? a_iPixelY / 4.0f : (a_iPixelY / 4) + 1;
-	//m_paucEncodingBits += ((iBlockY * numXBlocks) + iBlockX) * uiEncodingBitsBytesPerBlock;
-
-	
-	unsigned int num = numXBlocks*numYBlocks;
-	unsigned int uiH = 0, uiV = 0;
-	unsigned char* pEncodingBits = m_pMipmapImages[0].paucEncodingBits.get();
-	for (unsigned int uiBlock = 0; uiBlock < num; uiBlock++)
-	{
-		if (uiH == iBlockPosX && uiV == iBlockPosY)
-		{
-			memcpy(pMipmapImages[0].paucEncodingBits.get(),pEncodingBits, uiEncodingBitsBytesPerBlock);
-			break;
-		}
-		pEncodingBits += uiEncodingBitsBytesPerBlock;
-		uiH += 4;
-
-		if (uiH >= origWidth)
-		{
-			uiH = 0;
-			uiV += 4;
-		}
-	}
-
-	delete [] m_pMipmapImages;
-	m_pMipmapImages = pMipmapImages;
-}
-// ----------------------------------------------------------------------------------------------------
-//
-void File::Write()
-{
-
-	FILE *pfile = fopen(m_pstrFilename, "wb");
-	if (pfile == nullptr)
-	{
-		printf("Error: couldn't open Etc file (%s)\n", m_pstrFilename);
-		exit(1);
-	}
-
-	m_pheader->Write(pfile);
-
-	for(unsigned int mip = 0; mip < m_uiNumMipmaps; mip++)
-	{
-		if(m_fileformat == Format::KTX)
-		{
-			// Write u32 image size
-			uint32_t u32ImageSize = m_pMipmapImages[mip].uiEncodingBitsBytes;
-			uint32_t szBytesWritten = fwrite(&u32ImageSize, 1, sizeof(u32ImageSize), pfile);
-			assert(szBytesWritten == sizeof(u32ImageSize));
-		}
-
-		unsigned int iResult = (int)fwrite(m_pMipmapImages[mip].paucEncodingBits.get(), 1, m_pMipmapImages[mip].uiEncodingBitsBytes, pfile);
-		if (iResult != m_pMipmapImages[mip].uiEncodingBitsBytes)
-	{
-		printf("Error: couldn't write Etc file (%s)\n", m_pstrFilename);
-		exit(1);
-		}
-	}
-
-	fclose(pfile);
-
-}
-
-// ----------------------------------------------------------------------------------------------------
-//
-
diff --git a/thirdparty/etc2comp/EtcFile.h b/thirdparty/etc2comp/EtcFile.h
deleted file mode 100644
index 69bf3b2d3a..0000000000
--- a/thirdparty/etc2comp/EtcFile.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcColorFloatRGBA.h"
-#include "EtcImage.h"
-#include "Etc.h"
-
-namespace Etc
-{
-	class FileHeader;
-	class SourceImage;
-
-	class File
-	{
-	public:
-
-		enum class Format
-		{
-			INFER_FROM_FILE_EXTENSION,
-			PKM,
-			KTX,
-		};
-
-		File(const char *a_pstrFilename, Format a_fileformat, Image::Format a_imageformat,
-				unsigned char *a_paucEncodingBits, unsigned int a_uiEncodingBitsBytes,
-				unsigned int a_uiSourceWidth, unsigned int a_uiSourceHeight,
-				unsigned int a_uiExtendedWidth, unsigned int a_uiExtendedHeight);
-
-		File(const char *a_pstrFilename, Format a_fileformat, Image::Format a_imageformat,
-			unsigned int a_uiNumMipmaps, RawImage *pMipmapImages,
-			unsigned int a_uiSourceWidth, unsigned int a_uiSourceHeight );
-
-		File(const char *a_pstrFilename, Format a_fileformat);
-		~File();
-		const char *GetFilename(void) { return m_pstrFilename; }
-
-		void Read(const char *a_pstrFilename);
-		void Write(void);
-
-		inline unsigned int GetSourceWidth(void)
-		{
-			return m_uiSourceWidth;
-		}
-
-		inline unsigned int GetSourceHeight(void)
-		{
-			return m_uiSourceHeight;
-		}
-
-		inline unsigned int GetExtendedWidth(unsigned int mipmapIndex = 0)
-		{
-			if (mipmapIndex < m_uiNumMipmaps)
-			{
-				return m_pMipmapImages[mipmapIndex].uiExtendedWidth;
-			}
-			else
-			{
-				return 0;
-			}
-		}
-
-		inline unsigned int GetExtendedHeight(unsigned int mipmapIndex = 0)
-		{
-			if (mipmapIndex < m_uiNumMipmaps)
-			{
-				return m_pMipmapImages[mipmapIndex].uiExtendedHeight;
-			}
-			else
-			{
-				return 0;
-			}
-		}
-
-		inline Image::Format GetImageFormat()
-		{
-			return m_imageformat;
-		}
-
-		inline unsigned int GetEncodingBitsBytes(unsigned int mipmapIndex = 0)
-		{
-			if (mipmapIndex < m_uiNumMipmaps)
-			{
-				return m_pMipmapImages[mipmapIndex].uiEncodingBitsBytes;
-			}
-			else
-			{
-				return 0;
-			}
-		}
-
-		inline unsigned char*  GetEncodingBits(unsigned int mipmapIndex = 0)
-		{
-			if( mipmapIndex < m_uiNumMipmaps)
-			{
-				return m_pMipmapImages[mipmapIndex].paucEncodingBits.get();
-			}
-			else
-			{
-				return nullptr;
-			}
-		}
-
-		inline unsigned int GetNumMipmaps() 
-		{
-			return m_uiNumMipmaps; 
-		}
-
-		void UseSingleBlock(int a_iPixelX = -1, int a_iPixelY = -1);
-	private:
-
-		char *m_pstrFilename;               // includes directory path and file extension
-		Format m_fileformat;
-		Image::Format m_imageformat;
-		FileHeader *m_pheader;
-		unsigned int m_uiNumMipmaps;
-		RawImage*	 m_pMipmapImages;
-		unsigned int m_uiSourceWidth;
-		unsigned int m_uiSourceHeight;
-	};
-
-}
diff --git a/thirdparty/etc2comp/EtcFileHeader.cpp b/thirdparty/etc2comp/EtcFileHeader.cpp
deleted file mode 100644
index f02fcab011..0000000000
--- a/thirdparty/etc2comp/EtcFileHeader.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "EtcFileHeader.h"
-
-#include "EtcBlock4x4EncodingBits.h"
-
-#include <assert.h>
-
-namespace Etc
-{
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	FileHeader_Pkm::FileHeader_Pkm(File *a_pfile)
-	{
-		m_pfile = a_pfile;
-
-		static const char s_acMagicNumberData[4] = { 'P', 'K', 'M', ' ' };
-		static const char s_acVersionData[2] = { '1', '0' };
-
-		for (unsigned int ui = 0; ui < sizeof(s_acMagicNumberData); ui++)
-		{
-			m_data.m_acMagicNumber[ui] = s_acMagicNumberData[ui];
-		}
-
-		for (unsigned int ui = 0; ui < sizeof(s_acVersionData); ui++)
-		{
-			m_data.m_acVersion[ui] = s_acVersionData[ui];
-		}
-
-		m_data.m_ucDataType_msb = 0;        // ETC1_RGB_NO_MIPMAPS
-		m_data.m_ucDataType_lsb = 0;
-
-		m_data.m_ucOriginalWidth_msb = (unsigned char)(m_pfile->GetSourceWidth() >> 8);
-		m_data.m_ucOriginalWidth_lsb = m_pfile->GetSourceWidth() & 0xFF;
-		m_data.m_ucOriginalHeight_msb = (unsigned char)(m_pfile->GetSourceHeight() >> 8);
-		m_data.m_ucOriginalHeight_lsb = m_pfile->GetSourceHeight() & 0xFF;
-
-		m_data.m_ucExtendedWidth_msb = (unsigned char)(m_pfile->GetExtendedWidth() >> 8);
-		m_data.m_ucExtendedWidth_lsb = m_pfile->GetExtendedWidth() & 0xFF;
-		m_data.m_ucExtendedHeight_msb = (unsigned char)(m_pfile->GetExtendedHeight() >> 8);
-		m_data.m_ucExtendedHeight_lsb = m_pfile->GetExtendedHeight() & 0xFF;
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	void FileHeader_Pkm::Write(FILE *a_pfile)
-	{
-
-		fwrite(&m_data, sizeof(Data), 1, a_pfile);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	FileHeader_Ktx::FileHeader_Ktx(File *a_pfile)
-	{
-		m_pfile = a_pfile;
-
-		static const uint8_t s_au8Itentfier[12] =
-		{ 
-			0xAB, 0x4B, 0x54, 0x58, // first four bytes of Byte[12] identifier
-			0x20, 0x31, 0x31, 0xBB, // next four bytes of Byte[12] identifier
-			0x0D, 0x0A, 0x1A, 0x0A  // final four bytes of Byte[12] identifier
-		};
-
-		for (unsigned int ui = 0; ui < sizeof(s_au8Itentfier); ui++)
-		{
-			m_data.m_au8Identifier[ui] = s_au8Itentfier[ui];
-		}
-
-		m_data.m_u32Endianness				= 0x04030201;
-		m_data.m_u32GlType					= 0;
-		m_data.m_u32GlTypeSize				= 1;
-		m_data.m_u32GlFormat				= 0;
-
-		switch (m_pfile->GetImageFormat())
-		{
-		case Image::Format::RGB8:
-		case Image::Format::SRGB8:
-			m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_RGB8;
-			m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_RGB8;
-			break;
-
-		case Image::Format::RGBA8:
-		case Image::Format::SRGBA8:
-			m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_RGBA8;
-			m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_RGBA8;
-			break;
-
-		case Image::Format::RGB8A1:
-		case Image::Format::SRGB8A1:
-			m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_RGB8A1;
-			m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_RGB8A1;
-			break;
-		
-		case Image::Format::R11:
-			m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_R11;
-			m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_R11;
-			break;
-
-		case Image::Format::SIGNED_R11:
-			m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_SIGNED_R11;
-			m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_R11;
-			break;
-		
-		case Image::Format::RG11:
-			m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_RG11;
-			m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_RG11;
-			break;
-
-		case Image::Format::SIGNED_RG11:
-			m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC2_SIGNED_RG11;
-			m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC2_RG11;
-			break;
-
-		default:
-			m_data.m_u32GlInternalFormat = (unsigned int)InternalFormat::ETC1_RGB8;
-			m_data.m_u32GlBaseInternalFormat = (unsigned int)BaseInternalFormat::ETC1_RGB8;
-			break;
-		}
-
-		m_data.m_u32PixelWidth				= 0;
-		m_data.m_u32PixelHeight				= 0;
-		m_data.m_u32PixelDepth				= 0;
-		m_data.m_u32NumberOfArrayElements	= 0;
-		m_data.m_u32NumberOfFaces			= 0;
-		m_data.m_u32BytesOfKeyValueData		= 0;
-
-		m_pkeyvaluepair = nullptr;
-
-		m_u32Images = 0;
-		m_u32KeyValuePairs = 0;
-
-		m_data.m_u32PixelWidth = m_pfile->GetSourceWidth();
-		m_data.m_u32PixelHeight = m_pfile->GetSourceHeight();
-		m_data.m_u32PixelDepth = 0;
-		m_data.m_u32NumberOfArrayElements = 0;
-		m_data.m_u32NumberOfFaces = 1;
-		m_data.m_u32NumberOfMipmapLevels = m_pfile->GetNumMipmaps();
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	void FileHeader_Ktx::Write(FILE *a_pfile)
-	{
-		size_t szBytesWritten;
-
-		// Write header
-		szBytesWritten = fwrite(&m_data, 1, sizeof(Data), a_pfile);
-		assert(szBytesWritten == sizeof(Data));
-
-		// Write KeyAndValuePairs
-		if (m_u32KeyValuePairs)
-		{
-			fwrite(m_pkeyvaluepair, m_pkeyvaluepair->u32KeyAndValueByteSize, 1, a_pfile);
-		}
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	FileHeader_Ktx::Data *FileHeader_Ktx::GetData()
-	{
-		return &m_data;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcFileHeader.h b/thirdparty/etc2comp/EtcFileHeader.h
deleted file mode 100644
index 55a9cb5d9d..0000000000
--- a/thirdparty/etc2comp/EtcFileHeader.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcFile.h"
-#include <stdio.h>
-#include <inttypes.h>
-
-namespace Etc
-{
-
-	class Image;
-
-	class FileHeader
-	{
-	public:
-
-		virtual void Write(FILE *a_pfile) = 0;
-		File GetFile();
-		virtual ~FileHeader(void) {}
-	protected:
-
-		File *m_pfile;
-	};
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-    class FileHeader_Pkm : public FileHeader
-    {
-    public:
-
-		FileHeader_Pkm(File *a_pfile);
-
-		virtual void Write(FILE *a_pfile);
-		virtual ~FileHeader_Pkm(void) {}
-	private:
-
-		typedef struct
-		{
-			char m_acMagicNumber[4];
-			char m_acVersion[2];
-			unsigned char m_ucDataType_msb;             // e.g. ETC1_RGB_NO_MIPMAPS
-			unsigned char m_ucDataType_lsb;
-			unsigned char m_ucExtendedWidth_msb;     //  padded to 4x4 blocks
-			unsigned char m_ucExtendedWidth_lsb;
-			unsigned char m_ucExtendedHeight_msb;    //  padded to 4x4 blocks
-			unsigned char m_ucExtendedHeight_lsb;
-			unsigned char m_ucOriginalWidth_msb;
-			unsigned char m_ucOriginalWidth_lsb;
-			unsigned char m_ucOriginalHeight_msb;
-			unsigned char m_ucOriginalHeight_lsb;
-		} Data;
-
-		Data m_data;
-	};
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-    class FileHeader_Ktx : public FileHeader
-    {
-    public:
-
-		typedef struct
-		{
-			uint32_t	u32KeyAndValueByteSize;
-		} KeyValuePair;
-
-		typedef struct
-		{
-			uint8_t m_au8Identifier[12];
-			uint32_t m_u32Endianness;
-			uint32_t m_u32GlType;
-			uint32_t m_u32GlTypeSize;
-			uint32_t m_u32GlFormat;
-			uint32_t m_u32GlInternalFormat;
-			uint32_t m_u32GlBaseInternalFormat;
-			uint32_t m_u32PixelWidth;
-			uint32_t m_u32PixelHeight;
-			uint32_t m_u32PixelDepth;
-			uint32_t m_u32NumberOfArrayElements;
-			uint32_t m_u32NumberOfFaces;
-			uint32_t m_u32NumberOfMipmapLevels;
-			uint32_t m_u32BytesOfKeyValueData;
-		} Data;
-
-		enum class InternalFormat
-		{
-			ETC1_RGB8 = 0x8D64,
-			ETC1_ALPHA8 = ETC1_RGB8,
-			//
-			ETC2_R11 = 0x9270,
-			ETC2_SIGNED_R11 = 0x9271,
-			ETC2_RG11 = 0x9272,
-			ETC2_SIGNED_RG11 = 0x9273,
-			ETC2_RGB8 = 0x9274,
-			ETC2_SRGB8 = 0x9275,
-			ETC2_RGB8A1 = 0x9276,
-			ETC2_SRGB8_PUNCHTHROUGH_ALPHA1 = 0x9277,
-			ETC2_RGBA8 = 0x9278
-		};
-
-		enum class BaseInternalFormat
-		{
-			ETC2_R11 = 0x1903,
-			ETC2_RG11 = 0x8227,
-			ETC1_RGB8 = 0x1907,
-			ETC1_ALPHA8 = ETC1_RGB8,
-			//
-			ETC2_RGB8 = 0x1907,
-			ETC2_RGB8A1 = 0x1908,
-			ETC2_RGBA8 = 0x1908,
-		};
-
-		FileHeader_Ktx(File *a_pfile);
-
-		virtual void Write(FILE *a_pfile);
-		virtual ~FileHeader_Ktx(void) {}
-
-		void AddKeyAndValue(KeyValuePair *a_pkeyvaluepair);
-
-		Data* GetData();
-
-	private:
-
-		Data m_data;
-		KeyValuePair *m_pkeyvaluepair;
-		
-		uint32_t m_u32Images;
-		uint32_t m_u32KeyValuePairs;
-	};
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcFilter.cpp b/thirdparty/etc2comp/EtcFilter.cpp
deleted file mode 100644
index 1ec8acdf3f..0000000000
--- a/thirdparty/etc2comp/EtcFilter.cpp
+++ /dev/null
@@ -1,404 +0,0 @@
-#include <stdlib.h>
-#include <math.h>
-#include "EtcFilter.h"
-
-
-namespace Etc
-{
-
-static const double PiConst = 3.14159265358979323846;
-
-inline double sinc(double x) 
-{
-    if ( x == 0.0 ) 
-    {
-        return 1.0;
-    }
-
-    return sin(PiConst * x) / (PiConst * x);
-}
-
-//inline float sincf( float x )
-//{
-//    x *= F_PI;
-//    if (x < 0.01f && x > -0.01f)
-//    {
-//        return 1.0f + x*x*(-1.0f/6.0f + x*x*1.0f/120.0f);
-//    }
-//
-//    return sinf(x)/x;
-//}
-//
-//double bessel0(double x) 
-//{
-//    const double EPSILON_RATIO = 1E-16;
-//    double xh, sum, pow, ds;
-//    int k;
-//
-//    xh = 0.5 * x;
-//    sum = 1.0;
-//    pow = 1.0;
-//    k = 0;
-//    ds = 1.0;
-//    while (ds > sum * EPSILON_RATIO) 
-//    {
-//        ++k;
-//        pow = pow * (xh / k);
-//        ds = pow * pow;
-//        sum = sum + ds;
-//    }
-//
-//    return sum;
-//}
-
-//**--------------------------------------------------------------------------
-//** Name: kaiser(double alpha, double half_width, double x) 
-//** Returns:
-//** Description: Alpha controls shape of filter.  We are using 4.
-//**--------------------------------------------------------------------------
-//inline double kaiser(double alpha, double half_width, double x) 
-//{
-//    double ratio = (x / half_width);
-//    return bessel0(alpha * sqrt(1 - ratio * ratio)) / bessel0(alpha);
-//}
-//
-//float Filter_Lanczos4Sinc(float x)
-//{
-//    if (x <= -4.0f || x >= 4.0f)    // half-width of 4
-//    {
-//        return 0.0;
-//    }
-//
-//    return sinc(0.875f * x) * sinc(0.25f * x);
-//}
-//
-//double Filter_Kaiser4( double t )
-//{
-//    return kaiser( 4.0, 3.0, t);
-//}
-//
-//double Filter_KaiserOptimal( double t )
-//{
-//    return kaiser( 8.93, 3.0f, t);
-//}                  
-
-double FilterLanczos3( double t )
-{
-	if ( t <= -3.0 || t >= 3.0 ) 
-    {
-        return 0.0;
-    }
-
-    return sinc( t ) * sinc( t / 3.0 );
-}
-
-double FilterBox( double t )
-{
-    return ( t > -0.5 && t < 0.5) ? 1.0 : 0.0;
-}
-
-double FilterLinear( double t )
-{
-	if (t < 0.0) t = -t;
-
-    return (t < 1.0) ? (1.0 - t) : 0.0;
-}
-
-
-//**--------------------------------------------------------------------------
-//** Name: CalcContributions( int srcSize, 
-//**                          int destSize, 
-//**                          double filterSize, 
-//**						  bool wrap,
-//**                          double (*FilterProc)(double), 
-//**                          FilterWeights contrib[] )
-//** Returns: void
-//** Description:
-//**--------------------------------------------------------------------------
-void CalcContributions( int srcSize, int destSize, double filterSize, bool wrap, double (*FilterProc)(double), FilterWeights contrib[] )
-{
-    double scale;
-    double filterScale;
-    double center;
-    double totalWeight;
-    double weight;
-    int   iRight;
-    int   iLeft;
-    int   iDest;
-
-    scale = (double)destSize / srcSize;
-    if ( scale < 1.0 )
-    {
-        filterSize = filterSize / scale;
-        filterScale = scale;
-    }
-    else
-    {
-        filterScale = 1.0;
-    }
-
-    if ( filterSize > (double)MaxFilterSize )
-    {
-        filterSize = (double)MaxFilterSize;
-    }
-
-    for ( iDest = 0; iDest < destSize; ++iDest )
-    {
-        center = (double)iDest / scale;
-
-        iLeft = (int)ceil(center - filterSize);
-		iRight = (int)floor(center + filterSize);
-
-		if ( !wrap )
-		{
-        if ( iLeft < 0 )
-        {
-            iLeft = 0;
-        }
-
-        if ( iRight >= srcSize )
-        {
-            iRight = srcSize - 1;
-        }
-		}
-
-		int numWeights = iRight - iLeft + 1;
-
-        contrib[iDest].first = iLeft;
-        contrib[iDest].numWeights = numWeights;
-
-        totalWeight = 0;
-		double t = ((double)iLeft - center) * filterScale;
-		for (int i = 0; i < numWeights; i++)
-        {
-			weight = (*FilterProc)(t) * filterScale;
-            totalWeight += weight;
-			contrib[iDest].weight[i] = weight;
-			t += filterScale;
-        }
-
-        //**--------------------------------------------------------
-        //** Normalize weights by dividing by the sum of the weights
-        //**--------------------------------------------------------
-        if ( totalWeight > 0.0 )
-        {   
-            for ( int i = 0; i < numWeights; i++)
-            {
-                contrib[iDest].weight[i] /= totalWeight;
-            }
-        }
-    }
-}
-
-//**-------------------------------------------------------------------------
-//** Name: Filter_TwoPass( RGBCOLOR *pSrcImage, 
-//**                       int srcWidth, int srcHeight, 
-//**                       RGBCOLOR *pDestImage, 
-//**                       int destWidth, int destHeight, 
-//**                       double (*FilterProc)(double) )
-//** Returns: 0 on failure and 1 on success
-//** Description: Filters a 2d image with a two pass filter by averaging the
-//**    weighted contributions of the pixels within the filter region.  The
-//**    contributions are determined by a weighting function parameter.
-//**-------------------------------------------------------------------------
-int FilterTwoPass( RGBCOLOR *pSrcImage, int srcWidth, int srcHeight, 
-                    RGBCOLOR *pDestImage, int destWidth, int destHeight, unsigned int wrapFlags, double (*FilterProc)(double) )
-{
-    FilterWeights *contrib;
-    RGBCOLOR *pPixel;
-    RGBCOLOR *pSrcPixel;
-    RGBCOLOR *pTempImage;
-    int iRow;
-    int iCol;
-    int iSrcCol;
-    int iSrcRow;
-    int iWeight;
-    double dRed;
-    double dGreen;
-    double dBlue;
-    double dAlpha;
-    double filterSize = 3.0;
-
-	int maxDim = (srcWidth>srcHeight)?srcWidth:srcHeight;
-	contrib = (FilterWeights*)malloc(maxDim * sizeof(FilterWeights));
-
-	//**------------------------------------------------------------------------
-    //** Need to create a temporary image to stuff the horizontally scaled image
-    //**------------------------------------------------------------------------
-    pTempImage = (RGBCOLOR *)malloc( destWidth * srcHeight * sizeof(RGBCOLOR) );
-    if ( pTempImage == NULL )
-    {
-        // -- GODOT start --
-        free( contrib );
-        // -- GODOT end --
-        return 0;
-    }
-
-    //**-------------------------------------------------------
-    //** Horizontally filter the image into the temporary image
-    //**-------------------------------------------------------
-	bool bWrapHorizontal = !!(wrapFlags&FILTER_WRAP_X);
-	CalcContributions( srcWidth, destWidth, filterSize, bWrapHorizontal, FilterProc, contrib );
-    for ( iRow = 0; iRow < srcHeight; iRow++ )
-    {
-        for ( iCol = 0; iCol < destWidth; iCol++ )
-        {
-            dRed   = 0;
-            dGreen = 0;
-            dBlue  = 0;
-            dAlpha = 0;
-
-            for ( iWeight = 0; iWeight < contrib[iCol].numWeights; iWeight++ )
-            {
-                iSrcCol = iWeight + contrib[iCol].first;
-				if (bWrapHorizontal)
-				{
-					iSrcCol = (iSrcCol < 0) ? (srcWidth + iSrcCol) : (iSrcCol >= srcWidth) ? (iSrcCol - srcWidth) : iSrcCol;
-				}
-                pSrcPixel = pSrcImage + (iRow * srcWidth) + iSrcCol;
-                dRed   += contrib[iCol].weight[iWeight] * pSrcPixel->rgba[0];
-                dGreen += contrib[iCol].weight[iWeight] * pSrcPixel->rgba[1];
-                dBlue  += contrib[iCol].weight[iWeight] * pSrcPixel->rgba[2];
-                dAlpha += contrib[iCol].weight[iWeight] * pSrcPixel->rgba[3];
-            }
-
-            pPixel = pTempImage + (iRow * destWidth) + iCol;
-			pPixel->rgba[0] = static_cast<unsigned char>(std::max(0.0, std::min(255.0, dRed)));
-			pPixel->rgba[1] = static_cast<unsigned char>(std::max(0.0, std::min(255.0, dGreen)));
-			pPixel->rgba[2] = static_cast<unsigned char>(std::max(0.0, std::min(255.0, dBlue)));
-			pPixel->rgba[3] = static_cast<unsigned char>(std::max(0.0, std::min(255.0, dAlpha)));
-        }
-    }
-
-    //**-------------------------------------------------------
-    //** Vertically filter the image into the destination image
-    //**-------------------------------------------------------
-	bool bWrapVertical = !!(wrapFlags&FILTER_WRAP_Y);
-	CalcContributions(srcHeight, destHeight, filterSize, bWrapVertical, FilterProc, contrib);
-    for ( iCol = 0; iCol < destWidth; iCol++ )
-    {
-        for ( iRow = 0; iRow < destHeight; iRow++ )
-        {
-            dRed   = 0;
-            dGreen = 0;
-            dBlue  = 0;
-            dAlpha = 0;
-
-            for ( iWeight = 0; iWeight < contrib[iRow].numWeights; iWeight++ )
-            {
-                iSrcRow = iWeight + contrib[iRow].first;
-				if (bWrapVertical)
-				{
-					iSrcRow = (iSrcRow < 0) ? (srcHeight + iSrcRow) : (iSrcRow >= srcHeight) ? (iSrcRow - srcHeight) : iSrcRow;
-				}
-                pSrcPixel = pTempImage + (iSrcRow * destWidth) + iCol;
-                dRed   += contrib[iRow].weight[iWeight] * pSrcPixel->rgba[0];
-                dGreen += contrib[iRow].weight[iWeight] * pSrcPixel->rgba[1];
-                dBlue  += contrib[iRow].weight[iWeight] * pSrcPixel->rgba[2];
-                dAlpha += contrib[iRow].weight[iWeight] * pSrcPixel->rgba[3];
-            }
-
-            pPixel = pDestImage + (iRow * destWidth) + iCol;
-            pPixel->rgba[0]   = (unsigned char)(std::max( 0.0, std::min( 255.0, dRed)));
-            pPixel->rgba[1] = (unsigned char)(std::max( 0.0, std::min( 255.0, dGreen)));
-            pPixel->rgba[2]  = (unsigned char)(std::max( 0.0, std::min( 255.0, dBlue)));
-            pPixel->rgba[3] = (unsigned char)(std::max( 0.0, std::min( 255.0, dAlpha)));
-        }
-    }
-
-    free( pTempImage );
-	free( contrib );
-
-    return 1;
-}
-
-//**-------------------------------------------------------------------------
-//** Name: FilterResample(RGBCOLOR *pSrcImage, int srcWidth, int srcHeight, 
-//**                       RGBCOLOR *pDstImage, int dstWidth, int dstHeight)
-//** Returns: 1
-//** Description: This function runs a 2d box filter over the srouce image
-//** to produce the destination image.
-//**-------------------------------------------------------------------------
-void FilterResample( RGBCOLOR *pSrcImage, int srcWidth, int srcHeight, 
-                     RGBCOLOR *pDstImage, int dstWidth, int dstHeight )
-{
-    int iRow;
-    int iCol;
-    int iSampleRow;
-    int iSampleCol;
-    int iFirstSampleRow;
-    int iFirstSampleCol;
-    int iLastSampleRow;
-    int iLastSampleCol;
-    int red;
-    int green;
-    int blue;
-    int alpha;
-    int samples;
-    float xScale;
-    float yScale;
-
-    RGBCOLOR *pSrcPixel;
-    RGBCOLOR *pDstPixel;
-
-    xScale = (float)srcWidth / dstWidth;
-    yScale = (float)srcHeight / dstHeight;
-
-    for ( iRow = 0; iRow < dstHeight; iRow++ )
-    {
-        for ( iCol = 0; iCol < dstWidth; iCol++ )
-        {
-            iFirstSampleRow = (int)(iRow * yScale);
-            iLastSampleRow = (int)ceil(iFirstSampleRow + yScale - 1);
-            if ( iLastSampleRow >= srcHeight )
-            {
-                iLastSampleRow = srcHeight - 1;
-            }
-
-            iFirstSampleCol = (int)(iCol * xScale);
-            iLastSampleCol = (int)ceil(iFirstSampleCol + xScale - 1);
-            if ( iLastSampleCol >= srcWidth )
-            {
-                iLastSampleCol = srcWidth - 1;
-            }
-
-            samples = 0;
-            red     = 0;
-            green   = 0;
-            blue    = 0;
-            alpha   = 0;
-            for ( iSampleRow = iFirstSampleRow; iSampleRow <= iLastSampleRow; iSampleRow++ )
-            {
-                for ( iSampleCol = iFirstSampleCol; iSampleCol <= iLastSampleCol; iSampleCol++ )
-                {
-                    pSrcPixel = pSrcImage + iSampleRow * srcWidth + iSampleCol;
-                    red   += pSrcPixel->rgba[0];
-                    green += pSrcPixel->rgba[1];
-                    blue  += pSrcPixel->rgba[2];
-                    alpha += pSrcPixel->rgba[3];
-
-                    samples++;
-                }
-            }
-
-            pDstPixel = pDstImage + iRow * dstWidth + iCol;
-            if ( samples > 0 )
-            {
-                pDstPixel->rgba[0] = static_cast<uint8_t>(red / samples);
-                pDstPixel->rgba[1] = static_cast<uint8_t>(green / samples);
-                pDstPixel->rgba[2] = static_cast<uint8_t>(blue / samples);
-                pDstPixel->rgba[3] = static_cast<uint8_t>(alpha / samples);
-            }
-            else
-            {
-                pDstPixel->rgba[0] = static_cast<uint8_t>(red);
-                pDstPixel->rgba[1] = static_cast<uint8_t>(green);
-                pDstPixel->rgba[2] = static_cast<uint8_t>(blue);
-                pDstPixel->rgba[3] = static_cast<uint8_t>(alpha);
-            }
-        }
-    }
-}
-
-
-}
-\ No newline at end of file
diff --git a/thirdparty/etc2comp/EtcFilter.h b/thirdparty/etc2comp/EtcFilter.h
deleted file mode 100644
index fcf125c6df..0000000000
--- a/thirdparty/etc2comp/EtcFilter.h
+++ /dev/null
@@ -1,244 +0,0 @@
-#pragma once
-#include <stdint.h>
-#include <algorithm>
-
-namespace Etc
-{
-
-enum FilterEnums
-{
-	MaxFilterSize = 32
-};
-
-enum WrapFlags
-{
-	FILTER_WRAP_NONE = 0,
-	FILTER_WRAP_X = 0x1,
-	FILTER_WRAP_Y = 0x2
-};
-
-typedef struct tagFilterWeights
-{
-	int   first;
-	int   numWeights;
-	double weight[MaxFilterSize * 2 + 1];
-} FilterWeights;
-
-typedef struct tagRGBCOLOR
-{
-	union
-	{
-		uint32_t ulColor;
-		uint8_t rgba[4];
-	};
-} RGBCOLOR;
-
-
-double FilterBox( double t );
-double FilterLinear( double t );
-double FilterLanczos3( double t );
-
-int FilterTwoPass( RGBCOLOR *pSrcImage, int srcWidth, int srcHeight, 
-                    RGBCOLOR *pDestImage, int destWidth, int destHeight, unsigned int wrapFlags, double (*FilterProc)(double) );
-void FilterResample( RGBCOLOR *pSrcImage, int srcWidth, int srcHeight, 
-                     RGBCOLOR *pDstImage, int dstWidth, int dstHeight );
-
-
-void CalcContributions(int srcSize, int destSize, double filterSize, bool wrap, double(*FilterProc)(double), FilterWeights contrib[]);
-
-template <typename T>
-void FilterResample(T *pSrcImage, int srcWidth, int srcHeight, T *pDstImage, int dstWidth, int dstHeight)
-{
-	float xScale;
-	float yScale;
-
-	T *pSrcPixel;
-	T *pDstPixel;
-
-	xScale = (float)srcWidth / dstWidth;
-	yScale = (float)srcHeight / dstHeight;
-
-	for (int iRow = 0; iRow < dstHeight; iRow++)
-	{
-		for (int iCol = 0; iCol < dstWidth; iCol++)
-		{
-			int samples;
-			int iFirstSampleRow;
-			int iFirstSampleCol;
-			int iLastSampleRow;
-			int iLastSampleCol;
-			float red;
-			float green;
-			float blue;
-			float alpha;
-
-			iFirstSampleRow = (int)(iRow * yScale);
-			iLastSampleRow = (int)ceil(iFirstSampleRow + yScale - 1);
-			if (iLastSampleRow >= srcHeight)
-			{
-				iLastSampleRow = srcHeight - 1;
-			}
-
-			iFirstSampleCol = (int)(iCol * xScale);
-			iLastSampleCol = (int)ceil(iFirstSampleCol + xScale - 1);
-			if (iLastSampleCol >= srcWidth)
-			{
-				iLastSampleCol = srcWidth - 1;
-			}
-
-			samples = 0;
-			red = 0.f;
-			green = 0.f;
-			blue = 0.f;
-			alpha = 0.f;
-			for (int iSampleRow = iFirstSampleRow; iSampleRow <= iLastSampleRow; iSampleRow++)
-			{
-				for (int iSampleCol = iFirstSampleCol; iSampleCol <= iLastSampleCol; iSampleCol++)
-				{
-					pSrcPixel = pSrcImage + (iSampleRow * srcWidth + iSampleCol) * 4;
-					red += static_cast<float>(pSrcPixel[0]);
-					green += static_cast<float>(pSrcPixel[1]);
-					blue += static_cast<float>(pSrcPixel[2]);
-					alpha += static_cast<float>(pSrcPixel[3]);
-
-					samples++;
-				}
-			}
-
-			pDstPixel = pDstImage + (iRow * dstWidth + iCol) * 4;
-			if (samples > 0)
-			{
-				pDstPixel[0] = static_cast<T>(red / samples);
-				pDstPixel[1] = static_cast<T>(green / samples);
-				pDstPixel[2] = static_cast<T>(blue / samples);
-				pDstPixel[3] = static_cast<T>(alpha / samples);
-			}
-			else
-			{
-				pDstPixel[0] = static_cast<T>(red);
-				pDstPixel[1] = static_cast<T>(green);
-				pDstPixel[2] = static_cast<T>(blue);
-				pDstPixel[3] = static_cast<T>(alpha);
-			}
-		}
-	}
-
-}
-
-//**-------------------------------------------------------------------------
-//** Name: Filter_TwoPass( RGBCOLOR *pSrcImage, 
-//**                       int srcWidth, int srcHeight, 
-//**                       RGBCOLOR *pDestImage, 
-//**                       int destWidth, int destHeight, 
-//**                       double (*FilterProc)(double) )
-//** Returns: 0 on failure and 1 on success
-//** Description: Filters a 2d image with a two pass filter by averaging the
-//**    weighted contributions of the pixels within the filter region.  The
-//**    contributions are determined by a weighting function parameter.
-//**-------------------------------------------------------------------------
-template <typename T>
-int FilterTwoPass(T *pSrcImage, int srcWidth, int srcHeight,
-	T *pDestImage, int destWidth, int destHeight, unsigned int wrapFlags, double(*FilterProc)(double))
-{
-	const int numComponents = 4;
-	FilterWeights *contrib;
-	T *pPixel;
-	T *pTempImage;
-	double dRed;
-	double dGreen;
-	double dBlue;
-	double dAlpha;
-	double filterSize = 3.0;
-
-	int maxDim = (srcWidth>srcHeight) ? srcWidth : srcHeight;
-	contrib = new FilterWeights[maxDim];
-
-	//**------------------------------------------------------------------------
-	//** Need to create a temporary image to stuff the horizontally scaled image
-	//**------------------------------------------------------------------------
-	pTempImage = new T[destWidth * srcHeight * numComponents];
-	if (pTempImage == NULL)
-	{
-		return 0;
-	}
-
-	//**-------------------------------------------------------
-	//** Horizontally filter the image into the temporary image
-	//**-------------------------------------------------------
-	bool bWrapHorizontal = !!(wrapFlags&FILTER_WRAP_X);
-	CalcContributions(srcWidth, destWidth, filterSize, bWrapHorizontal, FilterProc, contrib);
-	for (int iRow = 0; iRow < srcHeight; iRow++)
-	{
-		for (int iCol = 0; iCol < destWidth; iCol++)
-		{
-			dRed = 0;
-			dGreen = 0;
-			dBlue = 0;
-			dAlpha = 0;
-
-			for (int iWeight = 0; iWeight < contrib[iCol].numWeights; iWeight++)
-			{
-				int iSrcCol = iWeight + contrib[iCol].first;
-				if(bWrapHorizontal)
-				{
-					iSrcCol = (iSrcCol < 0)?(srcWidth+iSrcCol):(iSrcCol >= srcWidth)?(iSrcCol-srcWidth):iSrcCol;
-				}
-				T* pSrcPixel = pSrcImage + ((iRow * srcWidth) + iSrcCol)*numComponents;
-				dRed += contrib[iCol].weight[iWeight] * pSrcPixel[0];
-				dGreen += contrib[iCol].weight[iWeight] * pSrcPixel[1];
-				dBlue += contrib[iCol].weight[iWeight] * pSrcPixel[2];
-				dAlpha += contrib[iCol].weight[iWeight] * pSrcPixel[3];
-			}
-
-			pPixel = pTempImage + ((iRow * destWidth) + iCol)*numComponents;
-			pPixel[0] = static_cast<T>(std::max(0.0, std::min(255.0, dRed)));
-			pPixel[1] = static_cast<T>(std::max(0.0, std::min(255.0, dGreen)));
-			pPixel[2] = static_cast<T>(std::max(0.0, std::min(255.0, dBlue)));
-			pPixel[3] = static_cast<T>(std::max(0.0, std::min(255.0, dAlpha)));
-		}
-	}
-
-	//**-------------------------------------------------------
-	//** Vertically filter the image into the destination image
-	//**-------------------------------------------------------
-	bool bWrapVertical = !!(wrapFlags&FILTER_WRAP_Y);
-	CalcContributions(srcHeight, destHeight, filterSize, bWrapVertical, FilterProc, contrib);
-	for (int iCol = 0; iCol < destWidth; iCol++)
-	{
-		for (int iRow = 0; iRow < destHeight; iRow++)
-		{
-			dRed = 0;
-			dGreen = 0;
-			dBlue = 0;
-			dAlpha = 0;
-
-			for (int iWeight = 0; iWeight < contrib[iRow].numWeights; iWeight++)
-			{
-				int iSrcRow = iWeight + contrib[iRow].first;
-				if (bWrapVertical)
-				{
-					iSrcRow = (iSrcRow < 0) ? (srcHeight + iSrcRow) : (iSrcRow >= srcHeight) ? (iSrcRow - srcHeight) : iSrcRow;
-				}
-				T* pSrcPixel = pTempImage + ((iSrcRow * destWidth) + iCol)*numComponents;
-				dRed += contrib[iRow].weight[iWeight] * pSrcPixel[0];
-				dGreen += contrib[iRow].weight[iWeight] * pSrcPixel[1];
-				dBlue += contrib[iRow].weight[iWeight] * pSrcPixel[2];
-				dAlpha += contrib[iRow].weight[iWeight] * pSrcPixel[3];
-			}
-
-			pPixel = pDestImage + ((iRow * destWidth) + iCol)*numComponents;
-			pPixel[0] = static_cast<T>(std::max(0.0, std::min(255.0, dRed)));
-			pPixel[1] = static_cast<T>(std::max(0.0, std::min(255.0, dGreen)));
-			pPixel[2] = static_cast<T>(std::max(0.0, std::min(255.0, dBlue)));
-			pPixel[3] = static_cast<T>(std::max(0.0, std::min(255.0, dAlpha)));
-		}
-	}
-
-	delete[] pTempImage;
-	delete[] contrib;
-
-	return 1;
-}
-
-
-}
-\ No newline at end of file
diff --git a/thirdparty/etc2comp/EtcImage.cpp b/thirdparty/etc2comp/EtcImage.cpp
deleted file mode 100644
index 7a1058844d..0000000000
--- a/thirdparty/etc2comp/EtcImage.cpp
+++ /dev/null
@@ -1,685 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-EtcImage.cpp
-
-Image is an array of 4x4 blocks that represent the encoding of the source image
-
-*/
-
-#include "EtcConfig.h"
-
-#include <stdlib.h>
-
-#include "EtcImage.h"
-
-#include "Etc.h"
-#include "EtcBlock4x4.h"
-#include "EtcBlock4x4EncodingBits.h"
-#include "EtcSortedBlockList.h"
-
-#if ETC_WINDOWS
-#include <windows.h>
-#endif
-#include <ctime>
-#include <chrono>
-#include <future>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-
-// fix conflict with Block4x4::AlphaMix
-#ifdef OPAQUE
-#undef OPAQUE
-#endif
-#ifdef TRANSPARENT
-#undef TRANSPARENT
-#endif
-
-namespace Etc
-{
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	Image::Image(void)
-	{
-		m_encodingStatus = EncodingStatus::SUCCESS;
-		m_warningsToCapture = EncodingStatus::SUCCESS;
-		m_pafrgbaSource = nullptr;
-
-		m_pablock = nullptr;
-
-		m_encodingbitsformat = Block4x4EncodingBits::Format::UNKNOWN;
-		m_uiEncodingBitsBytes = 0;
-		m_paucEncodingBits = nullptr;
-
-		m_format = Format::UNKNOWN;
-		m_iNumOpaquePixels = 0;
-		m_iNumTranslucentPixels = 0;
-		m_iNumTransparentPixels = 0;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// constructor using source image
-	// used to set state before Encode() is called
-	//
-	Image::Image(float *a_pafSourceRGBA, unsigned int a_uiSourceWidth,
-					unsigned int a_uiSourceHeight, 
-					ErrorMetric a_errormetric)
-	{
-		m_encodingStatus = EncodingStatus::SUCCESS;
-		m_warningsToCapture = EncodingStatus::SUCCESS;
-		m_pafrgbaSource = (ColorFloatRGBA *) a_pafSourceRGBA;
-		m_uiSourceWidth = a_uiSourceWidth;
-		m_uiSourceHeight = a_uiSourceHeight;
-
-		m_uiExtendedWidth = CalcExtendedDimension((unsigned short)m_uiSourceWidth);
-		m_uiExtendedHeight = CalcExtendedDimension((unsigned short)m_uiSourceHeight);
-
-		m_uiBlockColumns = m_uiExtendedWidth >> 2;
-		m_uiBlockRows = m_uiExtendedHeight >> 2;
-
-		m_pablock = new Block4x4[GetNumberOfBlocks()];
-		assert(m_pablock);
-
-		m_format = Format::UNKNOWN;
-
-		m_encodingbitsformat = Block4x4EncodingBits::Format::UNKNOWN;
-		m_uiEncodingBitsBytes = 0;
-		m_paucEncodingBits = nullptr;
-
-		m_errormetric = a_errormetric;
-		m_fEffort = 0.0f;
-
-		m_iEncodeTime_ms = -1;
-
-		m_iNumOpaquePixels = 0;
-		m_iNumTranslucentPixels = 0;
-		m_iNumTransparentPixels = 0;
-		m_bVerboseOutput = false;
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// constructor using encoding bits
-	// recreates encoding state using a previously encoded image
-	//
-	Image::Image(Format a_format,
-					unsigned int a_uiSourceWidth, unsigned int a_uiSourceHeight,
-					unsigned char *a_paucEncidingBits, unsigned int a_uiEncodingBitsBytes,
-					Image *a_pimageSource, ErrorMetric a_errormetric)
-	{
-		m_encodingStatus = EncodingStatus::SUCCESS;
-		m_pafrgbaSource = nullptr;
-		m_uiSourceWidth = a_uiSourceWidth;
-		m_uiSourceHeight = a_uiSourceHeight;
-
-		m_uiExtendedWidth = CalcExtendedDimension((unsigned short)m_uiSourceWidth);
-		m_uiExtendedHeight = CalcExtendedDimension((unsigned short)m_uiSourceHeight);
-
-		m_uiBlockColumns = m_uiExtendedWidth >> 2;
-		m_uiBlockRows = m_uiExtendedHeight >> 2;
-
-		unsigned int uiBlocks = GetNumberOfBlocks();
-
-		m_pablock = new Block4x4[uiBlocks];
-		assert(m_pablock);
-
-		m_format = a_format;
-
-		m_iNumOpaquePixels = 0;
-		m_iNumTranslucentPixels = 0;
-		m_iNumTransparentPixels = 0;
-		
-		m_encodingbitsformat = DetermineEncodingBitsFormat(m_format);
-		if (m_encodingbitsformat == Block4x4EncodingBits::Format::UNKNOWN)
-		{
-			AddToEncodingStatus(ERROR_UNKNOWN_FORMAT);
-			return;
-		}
-		m_uiEncodingBitsBytes = a_uiEncodingBitsBytes;
-		m_paucEncodingBits = a_paucEncidingBits;
-
-		m_errormetric = a_errormetric;
-		m_fEffort = 0.0f;
-		m_bVerboseOutput = false;
-		m_iEncodeTime_ms = -1;
-		
-		unsigned char *paucEncodingBits = m_paucEncodingBits;
-		unsigned int uiEncodingBitsBytesPerBlock = Block4x4EncodingBits::GetBytesPerBlock(m_encodingbitsformat);
-
-		unsigned int uiH = 0;
-		unsigned int uiV = 0;
-		for (unsigned int uiBlock = 0; uiBlock < uiBlocks; uiBlock++)
-		{
-			m_pablock[uiBlock].InitFromEtcEncodingBits(a_format, uiH, uiV, paucEncodingBits, 
-														a_pimageSource, a_errormetric);
-			paucEncodingBits += uiEncodingBitsBytesPerBlock;
-			uiH += 4;
-			if (uiH >= m_uiSourceWidth)
-			{
-				uiH = 0;
-				uiV += 4;
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	Image::~Image(void)
-	{
-		if (m_pablock != nullptr)
-		{
-			delete[] m_pablock;
-			m_pablock = nullptr;
-		}
-
-		/*if (m_paucEncodingBits != nullptr)
-		{
-			delete[] m_paucEncodingBits;
-			m_paucEncodingBits = nullptr;
-		}*/
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// encode an image
-	// create a set of encoding bits that conforms to a_format
-	// find best fit using a_errormetric
-	// explore a range of possible encodings based on a_fEffort (range = [0:100])
-	// speed up process using a_uiJobs as the number of process threads (a_uiJobs must not excede a_uiMaxJobs)
-	//
-	Image::EncodingStatus Image::Encode(Format a_format, ErrorMetric a_errormetric, float a_fEffort, unsigned int a_uiJobs, unsigned int a_uiMaxJobs)
-	{
-
-		auto start = std::chrono::steady_clock::now();
-		
-		m_encodingStatus = EncodingStatus::SUCCESS;
-
-		m_format = a_format;
-		m_errormetric = a_errormetric;
-		m_fEffort = a_fEffort;
-
-		if (m_errormetric < 0 || m_errormetric > ERROR_METRICS)
-		{
-			AddToEncodingStatus(ERROR_UNKNOWN_ERROR_METRIC);
-			return m_encodingStatus;
-		}
-
-		if (m_fEffort < ETCCOMP_MIN_EFFORT_LEVEL)
-		{
-			AddToEncodingStatus(WARNING_EFFORT_OUT_OF_RANGE);
-			m_fEffort = ETCCOMP_MIN_EFFORT_LEVEL;
-		}
-		else if (m_fEffort > ETCCOMP_MAX_EFFORT_LEVEL)
-		{
-			AddToEncodingStatus(WARNING_EFFORT_OUT_OF_RANGE);
-			m_fEffort = ETCCOMP_MAX_EFFORT_LEVEL;
-		}
-		if (a_uiJobs < 1)
-		{
-			a_uiJobs = 1;
-			AddToEncodingStatus(WARNING_JOBS_OUT_OF_RANGE);
-		}
-		else if (a_uiJobs > a_uiMaxJobs)
-		{
-			a_uiJobs = a_uiMaxJobs;
-			AddToEncodingStatus(WARNING_JOBS_OUT_OF_RANGE);
-		}
-
-		m_encodingbitsformat = DetermineEncodingBitsFormat(m_format);
-
-		if (m_encodingbitsformat == Block4x4EncodingBits::Format::UNKNOWN)
-		{
-			AddToEncodingStatus(ERROR_UNKNOWN_FORMAT);
-			return m_encodingStatus;
-		}
-
-		assert(m_paucEncodingBits == nullptr);
-		m_uiEncodingBitsBytes = GetNumberOfBlocks() * Block4x4EncodingBits::GetBytesPerBlock(m_encodingbitsformat);
-		m_paucEncodingBits = new unsigned char[m_uiEncodingBitsBytes];
-
-		InitBlocksAndBlockSorter();
-
-
-		std::future<void> *handle = new std::future<void>[a_uiMaxJobs];
-
-		unsigned int uiNumThreadsNeeded = 0;
-		unsigned int uiUnfinishedBlocks = GetNumberOfBlocks();
-
-		uiNumThreadsNeeded = (uiUnfinishedBlocks < a_uiJobs) ? uiUnfinishedBlocks : a_uiJobs;
-			
-		for (int i = 0; i < (int)uiNumThreadsNeeded - 1; i++)
-		{
-			handle[i] = async(std::launch::async, &Image::RunFirstPass, this, i, uiNumThreadsNeeded);
-		}
-
-		RunFirstPass(uiNumThreadsNeeded - 1, uiNumThreadsNeeded);
-
-		for (int i = 0; i < (int)uiNumThreadsNeeded - 1; i++)
-		{
-			handle[i].get();
-		}
-
-		// perform effort-based encoding
-		if (m_fEffort > ETCCOMP_MIN_EFFORT_LEVEL)
-		{
-			unsigned int uiFinishedBlocks = 0;
-			unsigned int uiTotalEffortBlocks = static_cast<unsigned int>(roundf(0.01f * m_fEffort  * GetNumberOfBlocks()));
-
-			if (m_bVerboseOutput)
-			{
-				printf("effortblocks = %d\n", uiTotalEffortBlocks);
-			}
-			unsigned int uiPass = 0;
-			while (1)
-			{
-				if (m_bVerboseOutput)
-				{
-					uiPass++;
-					printf("pass %u\n", uiPass);
-				}
-				m_psortedblocklist->Sort();
-				uiUnfinishedBlocks = m_psortedblocklist->GetNumberOfSortedBlocks();
-				uiFinishedBlocks = GetNumberOfBlocks() - uiUnfinishedBlocks;
-				if (m_bVerboseOutput)
-				{
-					printf("    %u unfinished blocks\n", uiUnfinishedBlocks);
-					// m_psortedblocklist->Print();
-				}
-
-				
-
-				//stop enocding when we did enough to satify the effort percentage
-				if (uiFinishedBlocks >= uiTotalEffortBlocks)
-				{
-					if (m_bVerboseOutput)
-					{
-						printf("Finished %d Blocks out of %d\n", uiFinishedBlocks, uiTotalEffortBlocks);
-					}
-					break;
-				}
-
-				unsigned int uiIteratedBlocks = 0;
-				unsigned int blocksToIterateThisPass = (uiTotalEffortBlocks - uiFinishedBlocks);
-				uiNumThreadsNeeded = (uiUnfinishedBlocks < a_uiJobs) ? uiUnfinishedBlocks : a_uiJobs;
-
-				if (uiNumThreadsNeeded <= 1)
-				{
-					//since we already how many blocks each thread will process
-					//cap the thread limit to do the proper amount of work, and not more
-					uiIteratedBlocks = IterateThroughWorstBlocks(blocksToIterateThisPass, 0, 1);
-				}
-				else
-				{
-					//we have a lot of work to do, so lets multi thread it
-					std::future<unsigned int> *handleToBlockEncoders = new std::future<unsigned int>[uiNumThreadsNeeded-1];
-
-					for (int i = 0; i < (int)uiNumThreadsNeeded - 1; i++)
-					{
-						handleToBlockEncoders[i] = async(std::launch::async, &Image::IterateThroughWorstBlocks, this, blocksToIterateThisPass, i, uiNumThreadsNeeded);
-					}
-					uiIteratedBlocks = IterateThroughWorstBlocks(blocksToIterateThisPass, uiNumThreadsNeeded - 1, uiNumThreadsNeeded);
-
-					for (int i = 0; i < (int)uiNumThreadsNeeded - 1; i++)
-					{
-						uiIteratedBlocks += handleToBlockEncoders[i].get();
-					}
-
-					delete[] handleToBlockEncoders;
-				}
-
-				if (m_bVerboseOutput)
-				{
-					printf("    %u iterated blocks\n", uiIteratedBlocks);
-				}
-			}
-		}
-
-		// generate Etc2-compatible bit-format 4x4 blocks
-		for (int i = 0; i < (int)a_uiJobs - 1; i++)
-		{
-			handle[i] = async(std::launch::async, &Image::SetEncodingBits, this, i, a_uiJobs);
-		}
-		SetEncodingBits(a_uiJobs - 1, a_uiJobs);
-
-		for (int i = 0; i < (int)a_uiJobs - 1; i++)
-		{
-			handle[i].get();
-		}
-
-		auto end = std::chrono::steady_clock::now();
-		std::chrono::milliseconds elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-		m_iEncodeTime_ms = (int)elapsed.count();
-
-		delete[] handle;
-		delete m_psortedblocklist;
-		return m_encodingStatus;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// iterate the encoding thru the blocks with the worst error
-	// stop when a_uiMaxBlocks blocks have been iterated
-	// split the blocks between the process threads using a_uiMultithreadingOffset and a_uiMultithreadingStride
-	//
-	unsigned int Image::IterateThroughWorstBlocks(unsigned int a_uiMaxBlocks, 
-													unsigned int a_uiMultithreadingOffset, 
-													unsigned int a_uiMultithreadingStride)
-	{
-		assert(a_uiMultithreadingStride > 0);
-		unsigned int uiIteratedBlocks = a_uiMultithreadingOffset;
-
-		SortedBlockList::Link *plink = m_psortedblocklist->GetLinkToFirstBlock();
-		for (plink = plink->Advance(a_uiMultithreadingOffset);
-				plink != nullptr;
-				plink = plink->Advance(a_uiMultithreadingStride) )
-		{
-			if (uiIteratedBlocks >= a_uiMaxBlocks)
-			{
-				break;
-			}
-
-			plink->GetBlock()->PerformEncodingIteration(m_fEffort);
-
-			uiIteratedBlocks += a_uiMultithreadingStride;	
-		}
-
-		return uiIteratedBlocks;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// determine which warnings to check for during Encode() based on encoding format
-	//
-	void Image::FindEncodingWarningTypesForCurFormat()
-	{
-		TrackEncodingWarning(WARNING_ALL_TRANSPARENT_PIXELS);
-		TrackEncodingWarning(WARNING_SOME_RGBA_NOT_0_TO_1);
-		switch (m_format)
-		{
-		case Image::Format::ETC1:
-		case Image::Format::RGB8:
-		case Image::Format::SRGB8:
-			TrackEncodingWarning(WARNING_SOME_NON_OPAQUE_PIXELS);
-			TrackEncodingWarning(WARNING_SOME_TRANSLUCENT_PIXELS);
-			break;
-
-		case Image::Format::RGB8A1:
-		case Image::Format::SRGB8A1:
-			TrackEncodingWarning(WARNING_SOME_TRANSLUCENT_PIXELS);
-			TrackEncodingWarning(WARNING_ALL_OPAQUE_PIXELS);
-			break;
-		case Image::Format::RGBA8:
-		case Image::Format::SRGBA8:
-			TrackEncodingWarning(WARNING_ALL_OPAQUE_PIXELS);
-			break;
-
-		case Image::Format::R11:
-		case Image::Format::SIGNED_R11:
-			TrackEncodingWarning(WARNING_SOME_NON_OPAQUE_PIXELS);
-			TrackEncodingWarning(WARNING_SOME_TRANSLUCENT_PIXELS);
-			TrackEncodingWarning(WARNING_SOME_GREEN_VALUES_ARE_NOT_ZERO);
-			TrackEncodingWarning(WARNING_SOME_BLUE_VALUES_ARE_NOT_ZERO);
-			break;
-
-		case Image::Format::RG11:
-		case Image::Format::SIGNED_RG11:
-			TrackEncodingWarning(WARNING_SOME_NON_OPAQUE_PIXELS);
-			TrackEncodingWarning(WARNING_SOME_TRANSLUCENT_PIXELS);
-			TrackEncodingWarning(WARNING_SOME_BLUE_VALUES_ARE_NOT_ZERO);
-			break;
-		case Image::Format::FORMATS:
-		case Image::Format::UNKNOWN:
-		default:
-			assert(0);
-			break;
-		}
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// examine source pixels to check for warnings
-	//
-	void Image::FindAndSetEncodingWarnings()
-	{
-		int numPixels = (m_uiBlockRows * 4) * (m_uiBlockColumns * 4);
-		if (m_iNumOpaquePixels == numPixels)
-		{
-			AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_ALL_OPAQUE_PIXELS);
-		}
-		if (m_iNumOpaquePixels < numPixels)
-		{
-			AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_SOME_NON_OPAQUE_PIXELS);
-		}
-		if (m_iNumTranslucentPixels > 0)
-		{
-			AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_SOME_TRANSLUCENT_PIXELS);
-		}
-		if (m_iNumTransparentPixels == numPixels)
-		{
-			AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_ALL_TRANSPARENT_PIXELS);
-		}
-		if (m_numColorValues.fB > 0.0f)
-		{
-			AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_SOME_BLUE_VALUES_ARE_NOT_ZERO);
-		}
-		if (m_numColorValues.fG > 0.0f) 
-		{
-			AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_SOME_GREEN_VALUES_ARE_NOT_ZERO);
-		}
-
-		if (m_numOutOfRangeValues.fR > 0.0f || m_numOutOfRangeValues.fG > 0.0f)
-		{
-			AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_SOME_RGBA_NOT_0_TO_1);
-		}
-		if (m_numOutOfRangeValues.fB > 0.0f || m_numOutOfRangeValues.fA > 0.0f)
-		{
-			AddToEncodingStatusIfSignfigant(Image::EncodingStatus::WARNING_SOME_RGBA_NOT_0_TO_1);
-		}
-	}
-	
-	// ----------------------------------------------------------------------------------------------------
-	// return a string name for a given image format
-	//
-	const char * Image::EncodingFormatToString(Image::Format a_format)
-	{
-		switch (a_format)
-		{
-		case Image::Format::ETC1:
-			return "ETC1";
-		case Image::Format::RGB8:
-			return "RGB8";
-		case Image::Format::SRGB8:
-			return "SRGB8";
-
-		case Image::Format::RGB8A1:
-			return "RGB8A1";
-		case Image::Format::SRGB8A1:
-			return "SRGB8A1";
-		case Image::Format::RGBA8:
-			return "RGBA8";
-		case Image::Format::SRGBA8:
-			return "SRGBA8";
-
-		case Image::Format::R11:
-			return "R11";
-		case Image::Format::SIGNED_R11:
-			return "SIGNED_R11";
-
-		case Image::Format::RG11:
-			return "RG11";
-		case Image::Format::SIGNED_RG11:
-			return "SIGNED_RG11";
-		case Image::Format::FORMATS:
-		case Image::Format::UNKNOWN:
-		default:
-			return "UNKNOWN";
-		}
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// return a string name for the image's format
-	//
-	const char * Image::EncodingFormatToString(void)
-	{
-		return EncodingFormatToString(m_format);
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// init image blocks prior to encoding
-	// init block sorter for subsequent sortings
-	// check for encoding warnings
-	//
-	void Image::InitBlocksAndBlockSorter(void)
-	{
-		
-		FindEncodingWarningTypesForCurFormat();
-
-		// init each block
-		Block4x4 *pblock = m_pablock;
-		unsigned char *paucEncodingBits = m_paucEncodingBits;
-		for (unsigned int uiBlockRow = 0; uiBlockRow < m_uiBlockRows; uiBlockRow++)
-		{
-			unsigned int uiBlockV = uiBlockRow * 4;
-
-			for (unsigned int uiBlockColumn = 0; uiBlockColumn < m_uiBlockColumns; uiBlockColumn++)
-			{
-				unsigned int uiBlockH = uiBlockColumn * 4;
-
-				pblock->InitFromSource(this, uiBlockH, uiBlockV, paucEncodingBits, m_errormetric);
-
-				paucEncodingBits += Block4x4EncodingBits::GetBytesPerBlock(m_encodingbitsformat);
-
-				pblock++;
-			}
-		}
-
-		FindAndSetEncodingWarnings();
-
-		// init block sorter
-		{
-			m_psortedblocklist = new SortedBlockList(GetNumberOfBlocks(), 100);
-
-			for (unsigned int uiBlock = 0; uiBlock < GetNumberOfBlocks(); uiBlock++)
-			{
-				pblock = &m_pablock[uiBlock];
-				m_psortedblocklist->AddBlock(pblock);
-			}
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// run the first pass of the encoder
-	// the encoder generally finds a reasonable, fast encoding
-	// this is run on all blocks regardless of effort to ensure that all blocks have a valid encoding
-	//
-	void Image::RunFirstPass(unsigned int a_uiMultithreadingOffset, unsigned int a_uiMultithreadingStride)
-	{
-		assert(a_uiMultithreadingStride > 0);
-
-		for (unsigned int uiBlock = a_uiMultithreadingOffset;
-				uiBlock < GetNumberOfBlocks(); 
-				uiBlock += a_uiMultithreadingStride)
-		{
-			Block4x4 *pblock = &m_pablock[uiBlock];
-			pblock->PerformEncodingIteration(m_fEffort);
-		}
-	}
-
-    // ----------------------------------------------------------------------------------------------------
-	// set the encoding bits (for the output file) based on the best encoding for each block
-	//
-	void Image::SetEncodingBits(unsigned int a_uiMultithreadingOffset,
-								unsigned int a_uiMultithreadingStride)
-	{
-		assert(a_uiMultithreadingStride > 0);
-
-		for (unsigned int uiBlock = a_uiMultithreadingOffset; 
-				uiBlock < GetNumberOfBlocks(); 
-				uiBlock += a_uiMultithreadingStride)
-		{
-			Block4x4 *pblock = &m_pablock[uiBlock];
-			pblock->SetEncodingBitsFromEncoding();
-		}
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// return the image error
-	// image error is the sum of all block errors
-	//
-	float Image::GetError(void)
-	{
-		float fError = 0.0f;
-
-		for (unsigned int uiBlock = 0; uiBlock < GetNumberOfBlocks(); uiBlock++)
-		{
-			Block4x4 *pblock = &m_pablock[uiBlock];
-			fError += pblock->GetError();
-		}
-
-		return fError;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// determine the encoding bits format based on the encoding format
-	// the encoding bits format is a family of bit encodings that are shared across various encoding formats
-	//
-	Block4x4EncodingBits::Format Image::DetermineEncodingBitsFormat(Format a_format)
-	{
-		Block4x4EncodingBits::Format encodingbitsformat;
-
-		// determine encoding bits format from image format
-		switch (a_format)
-		{
-		case Format::ETC1:
-		case Format::RGB8:
-		case Format::SRGB8:
-			encodingbitsformat = Block4x4EncodingBits::Format::RGB8;
-			break;
-
-		case Format::RGBA8:
-		case Format::SRGBA8:
-			encodingbitsformat = Block4x4EncodingBits::Format::RGBA8;
-			break;
-
-		case Format::R11:
-		case Format::SIGNED_R11:
-			encodingbitsformat = Block4x4EncodingBits::Format::R11;
-			break;
-
-		case Format::RG11:
-		case Format::SIGNED_RG11:
-			encodingbitsformat = Block4x4EncodingBits::Format::RG11;
-			break;
-
-		case Format::RGB8A1:
-		case Format::SRGB8A1:
-			encodingbitsformat = Block4x4EncodingBits::Format::RGB8A1;
-			break;
-
-		default:
-			encodingbitsformat = Block4x4EncodingBits::Format::UNKNOWN;
-			break;
-		}
-
-		return encodingbitsformat;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-
-}	// namespace Etc
diff --git a/thirdparty/etc2comp/EtcImage.h b/thirdparty/etc2comp/EtcImage.h
deleted file mode 100644
index bd807ac32e..0000000000
--- a/thirdparty/etc2comp/EtcImage.h
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-//#include "Etc.h"
-#include "EtcColorFloatRGBA.h"
-#include "EtcBlock4x4EncodingBits.h"
-#include "EtcErrorMetric.h"
-
-
-namespace Etc
-{
-	class Block4x4;
-	class EncoderSpec;
-	class SortedBlockList;
-
-    class Image
-    {
-    public:
-
-		//the differnt warning and errors that can come up during encoding
-		enum  EncodingStatus
-		{
-			SUCCESS = 0,
-			//
-			WARNING_THRESHOLD = 1 << 0,
-			//
-			WARNING_EFFORT_OUT_OF_RANGE = 1 << 1,
-			WARNING_JOBS_OUT_OF_RANGE = 1 << 2,
-			WARNING_SOME_NON_OPAQUE_PIXELS = 1 << 3,//just for opaque formats, etc1, rgb8, r11, rg11
-			WARNING_ALL_OPAQUE_PIXELS = 1 << 4,
-			WARNING_ALL_TRANSPARENT_PIXELS = 1 << 5,
-			WARNING_SOME_TRANSLUCENT_PIXELS = 1 << 6,//just for rgb8A1
-			WARNING_SOME_RGBA_NOT_0_TO_1 = 1 << 7,
-			WARNING_SOME_BLUE_VALUES_ARE_NOT_ZERO = 1 << 8,
-			WARNING_SOME_GREEN_VALUES_ARE_NOT_ZERO = 1 << 9,
-			//
-			ERROR_THRESHOLD = 1 << 16,
-			//
-			ERROR_UNKNOWN_FORMAT = 1 << 17,
-			ERROR_UNKNOWN_ERROR_METRIC = 1 << 18,
-			ERROR_ZERO_WIDTH_OR_HEIGHT = 1 << 19,
-			//
-		};
-		
-		enum class Format
-		{
-			UNKNOWN,
-			//
-			ETC1,
-			//
-			// ETC2 formats
-			RGB8,
-			SRGB8,
-			RGBA8,
-			SRGBA8,
-			R11,
-			SIGNED_R11,
-			RG11,
-			SIGNED_RG11,
-			RGB8A1,
-			SRGB8A1,
-			//
-			FORMATS,
-			//
-			DEFAULT = SRGB8
-		};
-
-		// constructor using source image
-		Image(float *a_pafSourceRGBA, unsigned int a_uiSourceWidth,
-				unsigned int a_uiSourceHeight,
-				ErrorMetric a_errormetric);
-
-		// constructor using encoding bits
-		Image(Format a_format, 
-				unsigned int a_uiSourceWidth, unsigned int a_uiSourceHeight,
-				unsigned char *a_paucEncidingBits, unsigned int a_uiEncodingBitsBytes,
-				Image *a_pimageSource,
-				ErrorMetric a_errormetric);
-
-		~Image(void);
-
-		EncodingStatus Encode(Format a_format, ErrorMetric a_errormetric, float a_fEffort, 
-			unsigned int a_uiJobs, unsigned int a_uiMaxJobs);
-
-		inline void AddToEncodingStatus(EncodingStatus a_encStatus)
-		{
-			m_encodingStatus = (EncodingStatus)((unsigned int)m_encodingStatus | (unsigned int)a_encStatus);
-		}
-		
-		inline unsigned int GetSourceWidth(void)
-		{
-			return m_uiSourceWidth;
-		}
-
-		inline unsigned int GetSourceHeight(void)
-		{
-			return m_uiSourceHeight;
-		}
-
-		inline unsigned int GetExtendedWidth(void)
-		{
-			return m_uiExtendedWidth;
-		}
-
-		inline unsigned int GetExtendedHeight(void)
-		{
-			return m_uiExtendedHeight;
-		}
-
-		inline unsigned int GetNumberOfBlocks()
-		{
-			return m_uiBlockColumns * m_uiBlockRows;
-		}
-
-		inline Block4x4 * GetBlocks()
-		{
-			return m_pablock;
-		}
-
-		inline unsigned char * GetEncodingBits(void)
-		{
-			return m_paucEncodingBits;
-		}
-
-		inline unsigned int GetEncodingBitsBytes(void)
-		{
-			return m_uiEncodingBitsBytes;
-		}
-
-		inline int GetEncodingTimeMs(void)
-		{
-			return m_iEncodeTime_ms;
-		}
-
-		float GetError(void);
-
-		inline ColorFloatRGBA * GetSourcePixel(unsigned int a_uiH, unsigned int a_uiV)
-		{
-			if (a_uiH >= m_uiSourceWidth || a_uiV >= m_uiSourceHeight)
-			{
-				return nullptr;
-			}
-
-			return &m_pafrgbaSource[a_uiV*m_uiSourceWidth + a_uiH];
-		}
-
-		inline Format GetFormat(void)
-		{
-			return m_format;
-		}
-
-		static Block4x4EncodingBits::Format DetermineEncodingBitsFormat(Format a_format);
-
-		inline static unsigned short CalcExtendedDimension(unsigned short a_ushOriginalDimension)
-		{
-			return (unsigned short)((a_ushOriginalDimension + 3) & ~3);
-		}
-
-		inline ErrorMetric GetErrorMetric(void)
-		{
-			return m_errormetric;
-		}
-
-		static const char * EncodingFormatToString(Image::Format a_format);
-		const char * EncodingFormatToString(void);
-		//used to get basic information about the image data
-		int m_iNumOpaquePixels;
-		int m_iNumTranslucentPixels;
-		int m_iNumTransparentPixels;
-
-		ColorFloatRGBA m_numColorValues;
-		ColorFloatRGBA m_numOutOfRangeValues;
-
-		bool m_bVerboseOutput;
-	private:
-		//add a warning or error to check for while encoding
-		inline void TrackEncodingWarning(EncodingStatus a_encStatus)
-		{
-			m_warningsToCapture = (EncodingStatus)((unsigned int)m_warningsToCapture | (unsigned int)a_encStatus);
-		}
-
-		//report the warning if it is something we care about for this encoding
-		inline void AddToEncodingStatusIfSignfigant(EncodingStatus a_encStatus)
-		{
-			if ((EncodingStatus)((unsigned int)m_warningsToCapture & (unsigned int)a_encStatus) == a_encStatus)
-			{
-				AddToEncodingStatus(a_encStatus);
-			}
-		}
-
-		Image(void);
-		void FindEncodingWarningTypesForCurFormat();
-		void FindAndSetEncodingWarnings();
-
-		void InitBlocksAndBlockSorter(void);
-
-		void RunFirstPass(unsigned int a_uiMultithreadingOffset, 
-							unsigned int a_uiMultithreadingStride);
-
-		void SetEncodingBits(unsigned int a_uiMultithreadingOffset,
-								unsigned int a_uiMultithreadingStride);
-
-		unsigned int IterateThroughWorstBlocks(unsigned int a_uiMaxBlocks,
-												unsigned int a_uiMultithreadingOffset,
-												unsigned int a_uiMultithreadingStride);
-
-		// inputs
-		ColorFloatRGBA *m_pafrgbaSource;
-		unsigned int m_uiSourceWidth;
-		unsigned int m_uiSourceHeight;
-		unsigned int m_uiExtendedWidth;
-		unsigned int m_uiExtendedHeight;
-		unsigned int m_uiBlockColumns;
-		unsigned int m_uiBlockRows;
-		// intermediate data
-		Block4x4 *m_pablock;
-		// encoding
-		Format m_format;
-		Block4x4EncodingBits::Format m_encodingbitsformat;
-		unsigned int m_uiEncodingBitsBytes;		// for entire image
-		unsigned char *m_paucEncodingBits;
-		ErrorMetric m_errormetric;
-		float m_fEffort;
-		// stats
-		int m_iEncodeTime_ms;
-		
-		SortedBlockList *m_psortedblocklist;
-		//this will hold any warning or errors that happen during encoding
-		EncodingStatus m_encodingStatus;
-		//these will be the warnings we are tracking
-		EncodingStatus m_warningsToCapture;
-	};
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcIndividualTrys.cpp b/thirdparty/etc2comp/EtcIndividualTrys.cpp
deleted file mode 100644
index 56ff4c65ec..0000000000
--- a/thirdparty/etc2comp/EtcIndividualTrys.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-EtcIndividualTrys.cpp
-
-Gathers the results of the various encoding trys for both halves of a 4x4 block for Individual mode
-
-*/
-
-#include "EtcConfig.h"
-#include "EtcIndividualTrys.h"
-
-#include <assert.h>
-
-namespace Etc
-{
-
-	// ----------------------------------------------------------------------------------------------------
-	// construct a list of trys (encoding attempts)
-	//
-	// a_frgbaColor1 is the basecolor for the first half
-	// a_frgbaColor2 is the basecolor for the second half
-	// a_pauiPixelMapping1 is the pixel order for the first half
-	// a_pauiPixelMapping2 is the pixel order for the second half
-	// a_uiRadius is the amount to vary the base colors
-	//
-	IndividualTrys::IndividualTrys(ColorFloatRGBA a_frgbaColor1, ColorFloatRGBA a_frgbaColor2,
-									const unsigned int *a_pauiPixelMapping1,
-									const unsigned int *a_pauiPixelMapping2,
-									unsigned int a_uiRadius)
-	{
-		assert(a_uiRadius <= MAX_RADIUS);
-
-		ColorFloatRGBA frgbaQuantizedColor1 = a_frgbaColor1.QuantizeR4G4B4();
-		ColorFloatRGBA frgbaQuantizedColor2 = a_frgbaColor2.QuantizeR4G4B4();
-
-		// quantize base colors
-		// ensure that trys with a_uiRadius don't overflow
-		int iRed1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntRed(15.0f), a_uiRadius);
-		int iGreen1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntGreen(15.0f), a_uiRadius);
-		int iBlue1 = MoveAwayFromEdge(frgbaQuantizedColor1.IntBlue(15.0f), a_uiRadius);
-		int iRed2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntRed(15.0f), a_uiRadius);
-		int iGreen2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntGreen(15.0f), a_uiRadius);
-		int iBlue2 = MoveAwayFromEdge(frgbaQuantizedColor2.IntBlue(15.0f), a_uiRadius);
-
-		m_half1.Init(iRed1, iGreen1, iBlue1, a_pauiPixelMapping1, a_uiRadius);
-		m_half2.Init(iRed2, iGreen2, iBlue2, a_pauiPixelMapping2, a_uiRadius);
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	void IndividualTrys::Half::Init(int a_iRed, int a_iGreen, int a_iBlue,
-									const unsigned int *a_pauiPixelMapping, unsigned int a_uiRadius)
-	{
-
-		m_iRed = a_iRed;
-		m_iGreen = a_iGreen;
-		m_iBlue = a_iBlue;
-
-		m_pauiPixelMapping = a_pauiPixelMapping;
-		m_uiRadius = a_uiRadius;
-
-		m_uiTrys = 0;
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcIndividualTrys.h b/thirdparty/etc2comp/EtcIndividualTrys.h
deleted file mode 100644
index 5fb12fbcf4..0000000000
--- a/thirdparty/etc2comp/EtcIndividualTrys.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "EtcColorFloatRGBA.h"
-
-namespace Etc
-{
-
-	class IndividualTrys
-	{
-	public:
-
-		static const unsigned int MAX_RADIUS = 1;
-
-		IndividualTrys(ColorFloatRGBA a_frgbaColor1,
-						ColorFloatRGBA a_frgbaColor2,
-						const unsigned int *a_pauiPixelMapping1,
-						const unsigned int *a_pauiPixelMapping2,
-						unsigned int a_uiRadius);
-
-		inline static int MoveAwayFromEdge(int a_i, int a_iDistance)
-		{
-			if (a_i < (0+ a_iDistance))
-			{
-				return (0 + a_iDistance);
-			}
-			else if (a_i > (15- a_iDistance))
-			{
-				return (15 - a_iDistance);
-			}
-
-			return a_i;
-		}
-
-		class Try
-		{
-        public :
-			static const unsigned int SELECTORS = 8;	// per half
-
-			int m_iRed;
-			int m_iGreen;
-			int m_iBlue;
-			unsigned int m_uiCW;
-			unsigned int m_auiSelectors[SELECTORS];
-			float m_fError;
-        };
-
-		class Half
-		{
-		public:
-
-			static const unsigned int MAX_TRYS = 27;
-
-			void Init(int a_iRed, int a_iGreen, int a_iBlue, 
-						const unsigned int *a_pauiPixelMapping,
-						unsigned int a_uiRadius);
-
-			// center of trys
-			int m_iRed;
-			int m_iGreen;
-			int m_iBlue;
-
-			const unsigned int *m_pauiPixelMapping;
-			unsigned int m_uiRadius;
-
-			unsigned int m_uiTrys;
-			Try m_atry[MAX_TRYS];
-
-			Try *m_ptryBest;
-		};
-
-		Half m_half1;
-		Half m_half2;
-
-	};
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcMath.cpp b/thirdparty/etc2comp/EtcMath.cpp
deleted file mode 100644
index 096d5f7ab9..0000000000
--- a/thirdparty/etc2comp/EtcMath.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "EtcConfig.h"
-#include "EtcMath.h"
-
-namespace Etc
-{
-
-	// ----------------------------------------------------------------------------------------------------
-	// calculate the line that best fits the set of XY points contained in a_afX[] and a_afY[]
-	// use a_fSlope and a_fOffset to define that line
-	//
-	bool Regression(float a_afX[], float a_afY[], unsigned int a_Points,
-					float *a_fSlope, float *a_fOffset)
-	{
-		float fPoints = (float)a_Points;
-
-		float fSumX = 0.0f;
-		float fSumY = 0.0f;
-		float fSumXY = 0.0f;
-		float fSumX2 = 0.0f;
-
-		for (unsigned int uiPoint = 0; uiPoint < a_Points; uiPoint++)
-		{
-			fSumX += a_afX[uiPoint];
-			fSumY += a_afY[uiPoint];
-			fSumXY += a_afX[uiPoint] * a_afY[uiPoint];
-			fSumX2 += a_afX[uiPoint] * a_afX[uiPoint];
-		}
-
-		float fDivisor = fPoints*fSumX2 - fSumX*fSumX;
-
-		// if vertical line
-		if (fDivisor == 0.0f)
-		{
-			*a_fSlope = 0.0f;
-			*a_fOffset = 0.0f;
-			return true;
-		}
-
-		*a_fSlope = (fPoints*fSumXY - fSumX*fSumY) / fDivisor;
-		*a_fOffset = (fSumY - (*a_fSlope)*fSumX) / fPoints;
-
-		return false;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/EtcMath.h b/thirdparty/etc2comp/EtcMath.h
deleted file mode 100644
index c58c9a91bc..0000000000
--- a/thirdparty/etc2comp/EtcMath.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <math.h>
-
-namespace Etc
-{
-
-	// ----------------------------------------------------------------------------------------------------
-	// return true if vertical line
-	bool Regression(float a_afX[], float a_afY[], unsigned int a_Points,
-					float *a_fSlope, float *a_fOffset);
-
-	inline float ConvertMSEToPSNR(float a_fMSE)
-	{
-		if (a_fMSE == 0.0f)
-		{
-			return INFINITY;
-		}
-
-		return 10.0f * log10f(1.0f / a_fMSE);
-	}
-
-
-}
diff --git a/thirdparty/etc2comp/EtcSortedBlockList.cpp b/thirdparty/etc2comp/EtcSortedBlockList.cpp
deleted file mode 100644
index bfa6b7b3fa..0000000000
--- a/thirdparty/etc2comp/EtcSortedBlockList.cpp
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-EtcSortedBlockList.cpp
-
-SortedBlockList is a list of 4x4 blocks that can be used by the "effort" system to prioritize
-the encoding of the 4x4 blocks.
-
-The sorting is done with buckets, where each bucket is an indication of how much error each 4x4 block has
-
-*/
-
-#include "EtcConfig.h"
-#include "EtcSortedBlockList.h"
-
-#include "EtcBlock4x4.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-
-namespace Etc
-{
-
-	// ----------------------------------------------------------------------------------------------------
-	// construct an empty list
-	//
-	// allocate enough memory to add all of the image's 4x4 blocks later
-	// allocate enough buckets to sort the blocks
-	//
-	SortedBlockList::SortedBlockList(unsigned int a_uiImageBlocks, unsigned int a_uiBuckets)
-	{
-		m_uiImageBlocks = a_uiImageBlocks;
-		m_iBuckets = (int)a_uiBuckets;
-
-		m_uiAddedBlocks = 0;
-		m_uiSortedBlocks = 0;
-		m_palinkPool = new Link[m_uiImageBlocks];
-		m_pabucket = new Bucket[m_iBuckets];
-		m_fMaxError = 0.0f;
-
-		InitBuckets();
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	//
-	SortedBlockList::~SortedBlockList(void)
-	{
-		delete[] m_palinkPool;
-		delete[] m_pabucket;
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-    // add a 4x4 block to the list
-	// the 4x4 block will be sorted later
-	//
-    void SortedBlockList::AddBlock(Block4x4 *a_pblock)
-    {
-        assert(m_uiAddedBlocks < m_uiImageBlocks);
-        Link *plink = &m_palinkPool[m_uiAddedBlocks++];
-		plink->Init(a_pblock);
-    }
-
-	// ----------------------------------------------------------------------------------------------------
-	// sort all of the 4x4 blocks that have been added to the list
-	//
-	// first, determine the maximum error, then assign an error range to each bucket
-	// next, determine which bucket each 4x4 block belongs to based on the 4x4 block's error
-	// add the 4x4 block to the appropriate bucket
-	// lastly, walk thru the buckets and add each bucket to a sorted linked list
-	//
-	// the resultant sorting is an approximate sorting from most to least error
-	//
-    void SortedBlockList::Sort(void)
-    {
-		assert(m_uiAddedBlocks == m_uiImageBlocks);
-        InitBuckets();
-
-        // find max block error
-        m_fMaxError = -1.0f;
-
-        for (unsigned int uiLink = 0; uiLink < m_uiAddedBlocks; uiLink++)
-        {
-            Link *plinkBlock = &m_palinkPool[uiLink];
-
-            float fBlockError = plinkBlock->GetBlock()->GetError();
-            if (fBlockError > m_fMaxError)
-            {
-                m_fMaxError = fBlockError;
-            }
-        }
-        // prevent divide by zero or divide by negative
-        if (m_fMaxError <= 0.0f)
-        {
-            m_fMaxError = 1.0f;
-        }
-		//used for debugging
-		//int numDone = 0;
-        // put all of the blocks with unfinished encodings into the appropriate bucket
-		m_uiSortedBlocks = 0;
-        for (unsigned int uiLink = 0; uiLink < m_uiAddedBlocks; uiLink++)
-        {
-            Link *plinkBlock = &m_palinkPool[uiLink];
-
-			// if the encoding is done, don't add it to the list
-			if (plinkBlock->GetBlock()->GetEncoding()->IsDone())
-			{
-				//numDone++;
-				continue;
-			}
-
-            // calculate the appropriate sort bucket
-            float fBlockError = plinkBlock->GetBlock()->GetError();
-            int iBucket = (int) floorf(m_iBuckets * fBlockError / m_fMaxError);
-            // clamp to bucket index
-            iBucket = iBucket < 0 ? 0 : iBucket >= m_iBuckets ? m_iBuckets - 1 : iBucket;
-
-            // add block to bucket
-			{
-				Bucket *pbucket = &m_pabucket[iBucket];
-				if (pbucket->plinkLast)
-				{
-					pbucket->plinkLast->SetNext(plinkBlock);
-					pbucket->plinkLast = plinkBlock;
-				}
-				else
-				{
-					pbucket->plinkFirst = pbucket->plinkLast = plinkBlock;
-				}
-				plinkBlock->SetNext(nullptr);
-			}
-
-			m_uiSortedBlocks++;
-
-            if (0)
-            {
-                printf("%u: e=%.3f\n", uiLink, fBlockError);
-                Print();
-                printf("\n\n\n");
-            }
-        }
-		//printf("num blocks already done: %d\n",numDone);
-		//link the blocks together across buckets
-		m_plinkFirst = nullptr;
-		m_plinkLast = nullptr;
-		for (int iBucket = m_iBuckets - 1; iBucket >= 0; iBucket--)
-		{
-			Bucket *pbucket = &m_pabucket[iBucket];
-
-			if (pbucket->plinkFirst)
-			{
-				if (m_plinkFirst == nullptr)
-				{
-					m_plinkFirst = pbucket->plinkFirst;
-				}
-				else
-				{
-					assert(pbucket->plinkLast->GetNext() == nullptr);
-					m_plinkLast->SetNext(pbucket->plinkFirst);
-				}
-
-				m_plinkLast = pbucket->plinkLast;
-			}
-		}
-
-
-	}
-
-	// ----------------------------------------------------------------------------------------------------
-	// clear all of the buckets.  normally done in preparation for a sort
-	//
-	void SortedBlockList::InitBuckets(void)
-    {
-        for (int iBucket = 0; iBucket < m_iBuckets; iBucket++)
-        {
-            Bucket *pbucket = &m_pabucket[iBucket];
-
-            pbucket->plinkFirst = 0;
-            pbucket->plinkLast = 0;
-        }
-    }
-
-    // ----------------------------------------------------------------------------------------------------
-    // print out the list of sorted 4x4 blocks
-	// normally used for debugging
-	//
-    void SortedBlockList::Print(void)
-    {
-        for (int iBucket = m_iBuckets-1; iBucket >= 0; iBucket--)
-        {
-            Bucket *pbucket = &m_pabucket[iBucket];
-
-            unsigned int uiBlocks = 0;
-            for (Link *plink = pbucket->plinkFirst; plink != nullptr; plink = plink->GetNext() )
-            {
-                uiBlocks++;
-
-				if (plink == pbucket->plinkLast)
-				{
-					break;
-				}
-            }
-
-            float fBucketError = m_fMaxError * iBucket / m_iBuckets;
-            float fBucketRMS = sqrtf(fBucketError / (4.0f*16.0f) );
-            printf("%3d: e=%.3f rms=%.6f %u\n", iBucket, fBucketError, fBucketRMS, uiBlocks);
-        }
-    }
-
-    // ----------------------------------------------------------------------------------------------------
-    //
-
-}   // namespace Etc
diff --git a/thirdparty/etc2comp/EtcSortedBlockList.h b/thirdparty/etc2comp/EtcSortedBlockList.h
deleted file mode 100644
index 960e8adc34..0000000000
--- a/thirdparty/etc2comp/EtcSortedBlockList.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright 2015 The Etc2Comp Authors.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-namespace Etc
-{
-	class Block4x4;
-
-    class SortedBlockList
-    {
-    public:
-
-		class Link
-		{
-		public:
-
-			inline void Init(Block4x4 *a_pblock)
-			{
-				m_pblock = a_pblock;
-				m_plinkNext = nullptr;
-			}
-
-			inline Block4x4 * GetBlock(void)
-			{
-				return m_pblock;
-			}
-
-			inline void SetNext(Link *a_plinkNext)
-			{
-				m_plinkNext = a_plinkNext;
-			}
-
-			inline Link * GetNext(void)
-			{
-				return m_plinkNext;
-			}
-
-			inline Link * Advance(unsigned int a_uiSteps = 1)
-			{
-				Link *plink = this;
-
-				for (unsigned int uiStep = 0; uiStep < a_uiSteps; uiStep++)
-				{
-					if (plink == nullptr)
-					{
-						break;
-					}
-
-					plink = plink->m_plinkNext;
-				}
-
-				return plink;
-			}
-
-		private:
-
-			Block4x4 *m_pblock;
-			Link *m_plinkNext;
-		};
-
-		SortedBlockList(unsigned int a_uiImageBlocks, unsigned int a_uiBuckets);
-		~SortedBlockList(void);
-
-        void AddBlock(Block4x4 *a_pblock);
-
-        void Sort(void);
-
-		inline Link * GetLinkToFirstBlock(void)
-		{
-			return m_plinkFirst;
-		}
-
-		inline unsigned int GetNumberOfAddedBlocks(void)
-		{
-			return m_uiAddedBlocks;
-		}
-
-		inline unsigned int GetNumberOfSortedBlocks(void)
-		{
-			return m_uiSortedBlocks;
-		}
-
-		void Print(void);
-
-	private:
-
-        void InitBuckets(void);
-
-        class Bucket
-        {
-        public:
-            Link *plinkFirst;
-            Link *plinkLast;
-        };
-
-        unsigned int m_uiImageBlocks;
-        int m_iBuckets;
-
-		unsigned int m_uiAddedBlocks;
-		unsigned int m_uiSortedBlocks;
-		Link *m_palinkPool;
-        Bucket *m_pabucket;
-        float m_fMaxError;
-
-		Link *m_plinkFirst;
-		Link *m_plinkLast;
-
-    };
-
-} // namespace Etc
diff --git a/thirdparty/etc2comp/LICENSE b/thirdparty/etc2comp/LICENSE
deleted file mode 100644
index d645695673..0000000000
--- a/thirdparty/etc2comp/LICENSE
+++ /dev/null
@@ -1,202 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/thirdparty/etc2comp/README.md b/thirdparty/etc2comp/README.md
deleted file mode 100644
index 2f4363d042..0000000000
--- a/thirdparty/etc2comp/README.md
+++ /dev/null
@@ -1,197 +0,0 @@
-# Etc2Comp - Texture to ETC2 compressor
-
-Etc2Comp is a command line tool that converts textures (e.g. bitmaps)
-into the [ETC2](https://en.wikipedia.org/wiki/Ericsson_Texture_Compression)
-format. The tool is built with a focus on encoding performance
-to reduce the amount of time required to compile asset heavy applications as
-well as reduce overall application size.
-
-This repo provides source code that can be compiled into a binary. The
-binary can then be used to convert textures to the ETC2 format.
-
-Important: This is not an official Google product. It is an experimental
-library published as-is. Please see the CONTRIBUTORS.md file for information
-about questions or issues.
-
-## Setup
-This project uses [CMake](https://cmake.org/) to generate platform-specific
-build files:
- - Linux: make files
- - OS X: Xcode workspace files
- - Microsoft Windows: Visual Studio solution files
- - Note: CMake supports other formats, but this doc only provides steps for
- one of each platform for brevity.
-
-Refer to each platform's setup section to setup your environment and build
-an Etc2Comp binary. Then skip to the usage section of this page for examples
-of how to use the library.
-
-### Setup for OS X
- build tested on this config:
-  OS X 10.9.5 i7 16GB RAM
-  Xcode 5.1.1
-  cmake 3.2.3
-  
-Start by downloading and installing the following components if they are not
-already installed on your development machine.
- - *Xcode* version 5.1.1, or greater
- - [CMake](https://cmake.org/download/) version 3.2.3, or greater
-
-To build the Etc2Comp binary:
- 1. Open a *Terminal* window and navigate to the project directory.
- 1. Run `mkdir build_xcode`
- 1. Run `cd build_xcode`
- 1. Run `cmake -G Xcode ../`
- 1. Open *Xcode* and import the `build_xcode/EtcTest.xcodeproj` file.
- 1. Open the Product menu and choose Build For -> Running.
- 1. Once the build succeeds the binary located at `build_xcode/EtcTool/Debug/EtcTool`
-can be executed.
-
-Optional
-Xcode EtcTool ‘Run’ preferences
-note: if the build_xcode/EtcTest.xcodeproj is manually deleted then some Xcode preferences 
-will need to be set by hand after cmake is run (these prefs are retained across 
-cmake updates if the .xcodeproj is not deleted/removed)
-
-1. Set the active scheme to ‘EtcTool’
-1. Edit the scheme
-1. Select option ‘Run EtcTool’, then tab ‘Arguments’. 
-Add this launch argument: ‘-argfile ../../EtcTool/args.txt’
-1. Select tab ‘Options’ and set a custom working directory to: ‘$(SRCROOT)/Build_Xcode/EtcTool’
-
-### SetUp for Windows
-
-1. Open a *Terminal* window and navigate to the project directory.
-1. Run `mkdir build_vs`
-1. Run `cd build_vs`
-1. Run CMAKE, noting what build version you need, and pointing to the parent directory as the source root; 
-  For VS 2013 : `cmake -G "Visual Studio 12 2013 Win64" ../`
-  For VS 2015 : `cmake -G "Visual Studio 14 2015 Win64" ../`
-  NOTE: To see what supported Visual Studio outputs there are, run `cmake -G`
-1. open the 'EtcTest' solution
-1. make the 'EtcTool' project the start up project 
-1. (optional) in the project properties, under 'Debugging ->command arguments' 
-add the argfile textfile thats included in the EtcTool directory. 
-example: -argfile C:\etc2\EtcTool\Args.txt
-
-### Setup For Linux
-The Linux build was tested on this config:
-  Ubuntu desktop 14.04
-  gcc/g++ 4.8
-  cmake 2.8.12.2
-
-1. Verify linux has cmake and C++-11 capable g++ installed
-1. Open shell
-1. Run `mkdir build_linux`
-1. Run `cd build_linux`
-1. Run `cmake ../`
-1. Run `make`
-1. navigate to the newly created EtcTool directory `cd EtcTool`
-1. run the executable: `./EtcTool -argfile ../../EtcTool/args.txt`
-
-Skip to the <a href="#usage">Usage</a> section for more information about using the
-tool.
-
-## Usage
-
-### Command Line Usage
-EtcTool can be run from the command line with the following usage:
-    etctool.exe source_image [options ...] -output encoded_image
-
-The encoder will use an array of RGBA floats read from the source_image to create 
-an ETC1 or ETC2 encoded image in encoded_image.  The RGBA floats should be in the 
-range [0:1].
-
-Options:
-
-    -analyze <analysis_folder>
-    -argfile <arg_file>           additional command line arguments read from a file
-    -blockAtHV <H V>              encodes a single block that contains the
-                                  pixel specified by the H V coordinates
-    -compare <comparison_image>   compares source_image to comparison_image
-    -effort <amount>              number between 0 and 100 to specify the encoding quality 
-                                  (100 is the highest quality)
-    -errormetric <error_metric>   specify the error metric, the options are
-                                  rgba, rgbx, rec709, numeric and normalxyz
-    -format <etc_format>          ETC1, RGB8, SRGB8, RGBA8, SRGB8, RGB8A1,
-                                  SRGB8A1 or R11
-    -help                         prints this message
-    -jobs or -j <thread_count>    specifies the number of threads (default=1)
-    -normalizexyz                 normalize RGB to have a length of 1
-    -verbose or -v                shows status information during the encoding
-                                  process
-	-mipmaps or -m <mip_count>    sets the maximum number of mipaps to generate (default=1)
-	-mipwrap or -w <x|y|xy>       sets the mipmap filter wrap mode (default=clamp)
-
-* -analyze will run an analysis of the encoding and place it in folder 
-"analysis_folder" (e.g. ../analysis/kodim05).  within the analysis_folder, a folder 
-will be created with a name of the current date/time (e.g. 20151204_153306).  this 
-date/time folder is used to compare encodings of the same texture over time.  
-within the date/time folder is a text file with several encoding stats and a 2x png 
-image showing the encoding mode for each 4x4 block.
-
-* -argfile allows additional command line arguments to be placed in a text file
-
-* -blockAtHV selects the 4x4 pixel subset of the source image at position (H,V).  
-This is mainly used for debugging
-
-* -compare compares the source image to the created encoded image. The encoding
-will dictate what error analysis is used in the comparison.
-
-* -effort uses an "amount" between 0 and 100 to determine how much additional effort 
-to apply during the encoding.
-
-* -errormetric selects the fitting algorithm used by the encoder.  "rgba" calculates 
-RMS error using RGB components that are weighted by A.  "rgbx" calculates RMS error 
-using RGBA components, where A is treated as an additional data channel, instead of 
-as alpha.  "rec709" is similar to "rgba", except the RGB components are also weighted 
-according to Rec709.  "numeric" calculates RMS error using unweighted RGBA components.  
-"normalize" calculates error based on dot product and vector length for RGB and RMS 
-error for A.
-
-* -help prints out the usage message
-
-* -jobs enables multi-threading to speed up image encoding
-
-* -normalizexyz normalizes the source RGB to have a length of 1.
-
-* -verbose shows information on the current encoding process. It will then display the 
-PSNR and time time it took to encode the image.
-
-* -mipmaps takes an argument that specifies how many mipmaps to generate from the 
-source image.  The mipmaps are generated with a lanczos3 filter using edge clamping.
-If the mipmaps option is not specified no mipmaps are created.
-
-* -mipwrap takes an argument that specifies the mipmap filter wrap mode.  The options 
-are "x", "y" and "xy" which specify wrapping in x only, y only or x and y respectively.
-The default options are clamping in both x and y.
-
-Note: Path names can use slashes or backslashes.  The tool will convert the 
-slashes to the appropriate polarity for the current platform.
-
-
-## API
-
-The library supports two different APIs - a C-like API that is not heavily 
-class-based and a class-based API.
-
-main() in EtcTool.cpp contains an example of both APIs.
-
-The Encode() method now returns an EncodingStatus that contains bit flags for
-reporting various warnings and flags encountered when encoding.
-
-
-## Copyright
-Copyright 2015 Etc2Comp Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
diff --git a/thirdparty/etc2comp/patches/fix-rgba8-max-channels.patch b/thirdparty/etc2comp/patches/fix-rgba8-max-channels.patch
deleted file mode 100644
index ea9b5640b6..0000000000
--- a/thirdparty/etc2comp/patches/fix-rgba8-max-channels.patch
+++ /dev/null
@@ -1,224 +0,0 @@
-diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp
-index 5656556db9..5c7ebed788 100644
---- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp
-+++ b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8.cpp
-@@ -508,7 +508,7 @@ namespace Etc
- 		int iMaxRed1 = iColor1Red + (int)a_uiRadius;
- 		if (iMaxRed1 > 15)
- 		{
--			iMinRed1 = 15;
-+			iMaxRed1 = 15;
- 		}
- 
- 		int iMinGreen1 = iColor1Green - (int)a_uiRadius;
-@@ -519,7 +519,7 @@ namespace Etc
- 		int iMaxGreen1 = iColor1Green + (int)a_uiRadius;
- 		if (iMaxGreen1 > 15)
- 		{
--			iMinGreen1 = 15;
-+			iMaxGreen1 = 15;
- 		}
- 
- 		int iMinBlue1 = iColor1Blue - (int)a_uiRadius;
-@@ -530,7 +530,7 @@ namespace Etc
- 		int iMaxBlue1 = iColor1Blue + (int)a_uiRadius;
- 		if (iMaxBlue1 > 15)
- 		{
--			iMinBlue1 = 15;
-+			iMaxBlue1 = 15;
- 		}
- 
- 		int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f);
-@@ -545,7 +545,7 @@ namespace Etc
- 		int iMaxRed2 = iColor2Red + (int)a_uiRadius;
- 		if (iMaxRed2 > 15)
- 		{
--			iMinRed2 = 15;
-+			iMaxRed2 = 15;
- 		}
- 
- 		int iMinGreen2 = iColor2Green - (int)a_uiRadius;
-@@ -556,7 +556,7 @@ namespace Etc
- 		int iMaxGreen2 = iColor2Green + (int)a_uiRadius;
- 		if (iMaxGreen2 > 15)
- 		{
--			iMinGreen2 = 15;
-+			iMaxGreen2 = 15;
- 		}
- 
- 		int iMinBlue2 = iColor2Blue - (int)a_uiRadius;
-@@ -567,7 +567,7 @@ namespace Etc
- 		int iMaxBlue2 = iColor2Blue + (int)a_uiRadius;
- 		if (iMaxBlue2 > 15)
- 		{
--			iMinBlue2 = 15;
-+			iMaxBlue2 = 15;
- 		}
- 
- 		for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++)
-@@ -761,7 +761,7 @@ namespace Etc
- 		int iMaxRed1 = iColor1Red + (int)a_uiRadius;
- 		if (iMaxRed1 > 15)
- 		{
--			iMinRed1 = 15;
-+			iMaxRed1 = 15;
- 		}
- 
- 		int iMinGreen1 = iColor1Green - (int)a_uiRadius;
-@@ -772,7 +772,7 @@ namespace Etc
- 		int iMaxGreen1 = iColor1Green + (int)a_uiRadius;
- 		if (iMaxGreen1 > 15)
- 		{
--			iMinGreen1 = 15;
-+			iMaxGreen1 = 15;
- 		}
- 
- 		int iMinBlue1 = iColor1Blue - (int)a_uiRadius;
-@@ -783,7 +783,7 @@ namespace Etc
- 		int iMaxBlue1 = iColor1Blue + (int)a_uiRadius;
- 		if (iMaxBlue1 > 15)
- 		{
--			iMinBlue1 = 15;
-+			iMaxBlue1 = 15;
- 		}
- 
- 		int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f);
-@@ -798,7 +798,7 @@ namespace Etc
- 		int iMaxRed2 = iColor2Red + (int)a_uiRadius;
- 		if (iMaxRed2 > 15)
- 		{
--			iMinRed2 = 15;
-+			iMaxRed2 = 15;
- 		}
- 
- 		int iMinGreen2 = iColor2Green - (int)a_uiRadius;
-@@ -809,7 +809,7 @@ namespace Etc
- 		int iMaxGreen2 = iColor2Green + (int)a_uiRadius;
- 		if (iMaxGreen2 > 15)
- 		{
--			iMinGreen2 = 15;
-+			iMaxGreen2 = 15;
- 		}
- 
- 		int iMinBlue2 = iColor2Blue - (int)a_uiRadius;
-@@ -820,7 +820,7 @@ namespace Etc
- 		int iMaxBlue2 = iColor2Blue + (int)a_uiRadius;
- 		if (iMaxBlue2 > 15)
- 		{
--			iMinBlue2 = 15;
-+			iMaxBlue2 = 15;
- 		}
- 
- 		for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++)
-diff --git a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp
-index ba2b42fb05..b94b64e68c 100644
---- a/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp
-+++ b/thirdparty/etc2comp/EtcBlock4x4Encoding_RGB8A1.cpp
-@@ -847,7 +847,7 @@ namespace Etc
- 		int iMaxRed1 = iColor1Red + (int)a_uiRadius;
- 		if (iMaxRed1 > 15)
- 		{
--			iMinRed1 = 15;
-+			iMaxRed1 = 15;
- 		}
- 
- 		int iMinGreen1 = iColor1Green - (int)a_uiRadius;
-@@ -858,7 +858,7 @@ namespace Etc
- 		int iMaxGreen1 = iColor1Green + (int)a_uiRadius;
- 		if (iMaxGreen1 > 15)
- 		{
--			iMinGreen1 = 15;
-+			iMaxGreen1 = 15;
- 		}
- 
- 		int iMinBlue1 = iColor1Blue - (int)a_uiRadius;
-@@ -869,7 +869,7 @@ namespace Etc
- 		int iMaxBlue1 = iColor1Blue + (int)a_uiRadius;
- 		if (iMaxBlue1 > 15)
- 		{
--			iMinBlue1 = 15;
-+			iMaxBlue1 = 15;
- 		}
- 
- 		int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f);
-@@ -884,7 +884,7 @@ namespace Etc
- 		int iMaxRed2 = iColor2Red + (int)a_uiRadius;
- 		if (iMaxRed2 > 15)
- 		{
--			iMinRed2 = 15;
-+			iMaxRed2 = 15;
- 		}
- 
- 		int iMinGreen2 = iColor2Green - (int)a_uiRadius;
-@@ -895,7 +895,7 @@ namespace Etc
- 		int iMaxGreen2 = iColor2Green + (int)a_uiRadius;
- 		if (iMaxGreen2 > 15)
- 		{
--			iMinGreen2 = 15;
-+			iMaxGreen2 = 15;
- 		}
- 
- 		int iMinBlue2 = iColor2Blue - (int)a_uiRadius;
-@@ -906,7 +906,7 @@ namespace Etc
- 		int iMaxBlue2 = iColor2Blue + (int)a_uiRadius;
- 		if (iMaxBlue2 > 15)
- 		{
--			iMinBlue2 = 15;
-+			iMaxBlue2 = 15;
- 		}
- 
- 		for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++)
-@@ -1108,7 +1108,7 @@ namespace Etc
- 		int iMaxRed1 = iColor1Red + (int)a_uiRadius;
- 		if (iMaxRed1 > 15)
- 		{
--			iMinRed1 = 15;
-+			iMaxRed1 = 15;
- 		}
- 
- 		int iMinGreen1 = iColor1Green - (int)a_uiRadius;
-@@ -1119,7 +1119,7 @@ namespace Etc
- 		int iMaxGreen1 = iColor1Green + (int)a_uiRadius;
- 		if (iMaxGreen1 > 15)
- 		{
--			iMinGreen1 = 15;
-+			iMaxGreen1 = 15;
- 		}
- 
- 		int iMinBlue1 = iColor1Blue - (int)a_uiRadius;
-@@ -1130,7 +1130,7 @@ namespace Etc
- 		int iMaxBlue1 = iColor1Blue + (int)a_uiRadius;
- 		if (iMaxBlue1 > 15)
- 		{
--			iMinBlue1 = 15;
-+			iMaxBlue1 = 15;
- 		}
- 
- 		int iColor2Red = m_frgbaOriginalColor2_TAndH.IntRed(15.0f);
-@@ -1145,7 +1145,7 @@ namespace Etc
- 		int iMaxRed2 = iColor2Red + (int)a_uiRadius;
- 		if (iMaxRed2 > 15)
- 		{
--			iMinRed2 = 15;
-+			iMaxRed2 = 15;
- 		}
- 
- 		int iMinGreen2 = iColor2Green - (int)a_uiRadius;
-@@ -1156,7 +1156,7 @@ namespace Etc
- 		int iMaxGreen2 = iColor2Green + (int)a_uiRadius;
- 		if (iMaxGreen2 > 15)
- 		{
--			iMinGreen2 = 15;
-+			iMaxGreen2 = 15;
- 		}
- 
- 		int iMinBlue2 = iColor2Blue - (int)a_uiRadius;
-@@ -1167,7 +1167,7 @@ namespace Etc
- 		int iMaxBlue2 = iColor2Blue + (int)a_uiRadius;
- 		if (iMaxBlue2 > 15)
- 		{
--			iMinBlue2 = 15;
-+			iMaxBlue2 = 15;
- 		}
- 
- 		for (unsigned int uiDistance = 0; uiDistance < TH_DISTANCES; uiDistance++)
diff --git a/thirdparty/etcpak/AUTHORS.txt b/thirdparty/etcpak/AUTHORS.txt
new file mode 100644
index 0000000000..e7bae62c85
--- /dev/null
+++ b/thirdparty/etcpak/AUTHORS.txt
@@ -0,0 +1,3 @@
+Bartosz Taudul <wolf@nereid.pl>
+Daniel Jungmann <el.3d.source@gmail.com>
+Florian Penzkofer <fp@nullptr.de>
diff --git a/thirdparty/etcpak/Dither.cpp b/thirdparty/etcpak/Dither.cpp
new file mode 100644
index 0000000000..355686f26b
--- /dev/null
+++ b/thirdparty/etcpak/Dither.cpp
@@ -0,0 +1,120 @@
+#include <algorithm>
+#include <string.h>
+
+#include "Dither.hpp"
+#include "Math.hpp"
+#ifdef __SSE4_1__
+#  ifdef _MSC_VER
+#    include <intrin.h>
+#    include <Windows.h>
+#  else
+#    include <x86intrin.h>
+#  endif
+#endif
+
+#ifdef __AVX2__
+void DitherAvx2( uint8_t* data, __m128i px0, __m128i px1, __m128i px2, __m128i px3 ) // Applies fixed per-byte offsets to px0..px3 and stores the 64 resulting bytes at data.
+{
+    static constexpr uint8_t a31[] = { 0, 0, 0, 1, 2, 0, 4, 0, 0, 2, 0, 0, 4, 0, 3, 0 }; // per-group amounts added to bytes 0 and 2 of each 4-byte group
+    static constexpr uint8_t a63[] = { 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 2, 0, 1, 0 }; // per-group amounts added to byte 1
+    static constexpr uint8_t s31[] = { 5, 0, 4, 0, 0, 2, 0, 1, 3, 0, 4, 0, 0, 0, 0, 2 }; // per-group amounts subtracted from bytes 0 and 2
+    static constexpr uint8_t s63[] = { 2, 0, 2, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1 }; // per-group amounts subtracted from byte 1
+
+    const __m256i BayerAdd0 = _mm256_setr_epi8( // groups 0-7 laid out as { a31, a63, a31, 0 }; byte 3 of every group gets 0
+        a31[0], a63[0], a31[0], 0, a31[1], a63[1], a31[1], 0, a31[2], a63[2], a31[2], 0, a31[3], a63[3], a31[3], 0,
+        a31[4], a63[4], a31[4], 0, a31[5], a63[5], a31[5], 0, a31[6], a63[6], a31[6], 0, a31[7], a63[7], a31[7], 0
+    );
+    const __m256i BayerAdd1 = _mm256_setr_epi8( // same layout for groups 8-15
+        a31[8],  a63[8],  a31[8],  0, a31[9],  a63[9],  a31[9],  0, a31[10], a63[10], a31[10], 0, a31[11], a63[11], a31[11], 0,
+        a31[12], a63[12], a31[12], 0, a31[13], a63[13], a31[13], 0, a31[14], a63[14], a31[14], 0, a31[15], a63[15], a31[15], 0
+    );
+    const __m256i BayerSub0 = _mm256_setr_epi8( // groups 0-7 laid out as { s31, s63, s31, 0 }
+        s31[0], s63[0], s31[0], 0, s31[1], s63[1], s31[1], 0, s31[2], s63[2], s31[2], 0, s31[3], s63[3], s31[3], 0,
+        s31[4], s63[4], s31[4], 0, s31[5], s63[5], s31[5], 0, s31[6], s63[6], s31[6], 0, s31[7], s63[7], s31[7], 0
+    );
+    const __m256i BayerSub1 = _mm256_setr_epi8( // same layout for groups 8-15
+        s31[8],  s63[8],  s31[8],  0, s31[9],  s63[9],  s31[9],  0, s31[10], s63[10], s31[10], 0, s31[11], s63[11], s31[11], 0,
+        s31[12], s63[12], s31[12], 0, s31[13], s63[13], s31[13], 0, s31[14], s63[14], s31[14], 0, s31[15], s63[15], s31[15], 0
+    );
+
+    __m256i l0 = _mm256_inserti128_si256( _mm256_castsi128_si256( px0 ), px1, 1 ); // px0 in the low 128 bits, px1 in the high 128 bits
+    __m256i l1 = _mm256_inserti128_si256( _mm256_castsi128_si256( px2 ), px3, 1 ); // px2 in the low 128 bits, px3 in the high 128 bits
+
+    __m256i a0 = _mm256_adds_epu8( l0, BayerAdd0 ); // unsigned saturating add, so bytes cap at 255
+    __m256i a1 = _mm256_adds_epu8( l1, BayerAdd1 );
+    __m256i s0 = _mm256_subs_epu8( a0, BayerSub0 ); // unsigned saturating subtract, so bytes floor at 0
+    __m256i s1 = _mm256_subs_epu8( a1, BayerSub1 );
+
+    _mm256_storeu_si256( (__m256i*)(data   ), s0 ); // unaligned store of bytes 0-31
+    _mm256_storeu_si256( (__m256i*)(data+32), s1 ); // unaligned store of bytes 32-63
+
+}
+#endif
+
+void Dither( uint8_t* data ) // Applies fixed per-byte offsets in place to the 64 bytes (16 four-byte groups) starting at data.
+{
+#ifdef __AVX2__
+    static constexpr uint8_t a31[] = { 0, 0, 0, 1, 2, 0, 4, 0, 0, 2, 0, 0, 4, 0, 3, 0 }; // per-group amounts added to bytes 0 and 2 of each 4-byte group
+    static constexpr uint8_t a63[] = { 0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 2, 0, 1, 0 }; // per-group amounts added to byte 1
+    static constexpr uint8_t s31[] = { 5, 0, 4, 0, 0, 2, 0, 1, 3, 0, 4, 0, 0, 0, 0, 2 }; // per-group amounts subtracted from bytes 0 and 2
+    static constexpr uint8_t s63[] = { 2, 0, 2, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1 }; // per-group amounts subtracted from byte 1
+
+    const __m256i BayerAdd0 = _mm256_setr_epi8( // groups 0-7 laid out as { a31, a63, a31, 0 }; byte 3 of every group gets 0
+        a31[0], a63[0], a31[0], 0, a31[1], a63[1], a31[1], 0, a31[2], a63[2], a31[2], 0, a31[3], a63[3], a31[3], 0,
+        a31[4], a63[4], a31[4], 0, a31[5], a63[5], a31[5], 0, a31[6], a63[6], a31[6], 0, a31[7], a63[7], a31[7], 0
+    );
+    const __m256i BayerAdd1 = _mm256_setr_epi8( // same layout for groups 8-15
+        a31[8],  a63[8],  a31[8],  0, a31[9],  a63[9],  a31[9],  0, a31[10], a63[10], a31[10], 0, a31[11], a63[11], a31[11], 0,
+        a31[12], a63[12], a31[12], 0, a31[13], a63[13], a31[13], 0, a31[14], a63[14], a31[14], 0, a31[15], a63[15], a31[15], 0
+    );
+    const __m256i BayerSub0 = _mm256_setr_epi8( // groups 0-7 laid out as { s31, s63, s31, 0 }
+        s31[0], s63[0], s31[0], 0, s31[1], s63[1], s31[1], 0, s31[2], s63[2], s31[2], 0, s31[3], s63[3], s31[3], 0,
+        s31[4], s63[4], s31[4], 0, s31[5], s63[5], s31[5], 0, s31[6], s63[6], s31[6], 0, s31[7], s63[7], s31[7], 0
+    );
+    const __m256i BayerSub1 = _mm256_setr_epi8( // same layout for groups 8-15
+        s31[8],  s63[8],  s31[8],  0, s31[9],  s63[9],  s31[9],  0, s31[10], s63[10], s31[10], 0, s31[11], s63[11], s31[11], 0,
+        s31[12], s63[12], s31[12], 0, s31[13], s63[13], s31[13], 0, s31[14], s63[14], s31[14], 0, s31[15], s63[15], s31[15], 0
+    );
+
+    __m256i px0 = _mm256_loadu_si256( (__m256i*)(data   ) ); // unaligned load of bytes 0-31
+    __m256i px1 = _mm256_loadu_si256( (__m256i*)(data+32) ); // unaligned load of bytes 32-63
+
+    __m256i a0 = _mm256_adds_epu8( px0, BayerAdd0 ); // unsigned saturating add, so bytes cap at 255
+    __m256i a1 = _mm256_adds_epu8( px1, BayerAdd1 );
+    __m256i s0 = _mm256_subs_epu8( a0, BayerSub0 ); // unsigned saturating subtract, so bytes floor at 0
+    __m256i s1 = _mm256_subs_epu8( a1, BayerSub1 );
+
+    _mm256_storeu_si256( (__m256i*)(data   ), s0 ); // write both halves back over the input
+    _mm256_storeu_si256( (__m256i*)(data+32), s1 );
+#else // scalar path used when __AVX2__ is not defined
+    static constexpr int8_t Bayer31[16] = { // signed offsets ( k-8 )*2/3, integer division; applied to bytes 0 and 2
+        ( 0-8)*2/3, ( 8-8)*2/3, ( 2-8)*2/3, (10-8)*2/3,
+        (12-8)*2/3, ( 4-8)*2/3, (14-8)*2/3, ( 6-8)*2/3,
+        ( 3-8)*2/3, (11-8)*2/3, ( 1-8)*2/3, ( 9-8)*2/3,
+        (15-8)*2/3, ( 7-8)*2/3, (13-8)*2/3, ( 5-8)*2/3
+    };
+    static constexpr int8_t Bayer63[16] = { // signed offsets ( k-8 )*2/6, integer division; applied to byte 1
+        ( 0-8)*2/6, ( 8-8)*2/6, ( 2-8)*2/6, (10-8)*2/6,
+        (12-8)*2/6, ( 4-8)*2/6, (14-8)*2/6, ( 6-8)*2/6,
+        ( 3-8)*2/6, (11-8)*2/6, ( 1-8)*2/6, ( 9-8)*2/6,
+        (15-8)*2/6, ( 7-8)*2/6, (13-8)*2/6, ( 5-8)*2/6
+    };
+
+    for( int i=0; i<16; i++ ) // one iteration per 4-byte group
+    {
+        uint32_t col;
+        memcpy( &col, data, 4 ); // memcpy avoids an unaligned/aliasing read through uint32_t*
+        uint8_t r = col & 0xFF; // names follow the source; channel order presumably RGBA in memory on little-endian - confirm with callers
+        uint8_t g = ( col >> 8 ) & 0xFF;
+        uint8_t b = ( col >> 16 ) & 0xFF;
+
+        r = clampu8( r + Bayer31[i] ); // offset, then clamp to 0..255
+        g = clampu8( g + Bayer63[i] );
+        b = clampu8( b + Bayer31[i] );
+
+        col = r | ( g << 8 ) | ( b << 16 ); // bits 24-31 of col become 0 here
+        memcpy( data, &col, 4 ); // NOTE(review): byte 3 is written back as 0, whereas the AVX2 path leaves byte 3 unchanged - confirm callers ignore it
+        data += 4; // advance to the next group
+    }
+#endif
+}
diff --git a/thirdparty/etcpak/Dither.hpp b/thirdparty/etcpak/Dither.hpp
new file mode 100644
index 0000000000..e43ce5676d
--- /dev/null
+++ b/thirdparty/etcpak/Dither.hpp
@@ -0,0 +1,21 @@
+#ifndef __DITHER_HPP__
+#define __DITHER_HPP__
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __AVX2__
+#  ifdef _MSC_VER
+#    include <intrin.h>
+#  else
+#    include <x86intrin.h>
+#  endif
+#endif
+
+void Dither( uint8_t* data );
+
+#ifdef __AVX2__
+void DitherAvx2( uint8_t* data, __m128i px0, __m128i px1, __m128i px2, __m128i px3 );
+#endif
+
+#endif
diff --git a/thirdparty/etcpak/ForceInline.hpp b/thirdparty/etcpak/ForceInline.hpp
new file mode 100644
index 0000000000..b6f012841b
--- /dev/null
+++ b/thirdparty/etcpak/ForceInline.hpp
@@ -0,0 +1,20 @@
+#ifndef __FORCEINLINE_HPP__
+#define __FORCEINLINE_HPP__
+
+#if defined(__GNUC__)
+#  define etcpak_force_inline __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#  define etcpak_force_inline __forceinline
+#else
+#  define etcpak_force_inline inline
+#endif
+
+#if defined(__GNUC__)
+#  define etcpak_no_inline __attribute__((noinline))
+#elif defined(_MSC_VER)
+#  define etcpak_no_inline __declspec(noinline)
+#else
+#  define etcpak_no_inline
+#endif
+
+#endif
diff --git a/thirdparty/etcpak/LICENSE.txt b/thirdparty/etcpak/LICENSE.txt
new file mode 100644
index 0000000000..59e85d6ea5
--- /dev/null
+++ b/thirdparty/etcpak/LICENSE.txt
@@ -0,0 +1,26 @@
+etcpak, an extremely fast ETC compression utility (https://github.com/wolfpld/etcpak)
+
+Copyright (c) 2013-2021, Bartosz Taudul <wolf@nereid.pl>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the <organization> nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/thirdparty/etcpak/Math.hpp b/thirdparty/etcpak/Math.hpp
new file mode 100644
index 0000000000..994e1ac4ea
--- /dev/null
+++ b/thirdparty/etcpak/Math.hpp
@@ -0,0 +1,92 @@
+#ifndef __DARKRL__MATH_HPP__
+#define __DARKRL__MATH_HPP__
+
+#include <algorithm>
+#include <cmath>
+#include <stdint.h>
+
+#include "ForceInline.hpp"
+
+template<typename T>
+static etcpak_force_inline T AlignPOT( T val ) // Rounds val up to a power of two; written for unsigned integer T (signed/negative inputs not handled here - confirm usage).
+{
+    if( val == 0 ) return 1; // zero maps to 1
+    val--; // so that an exact power of two maps to itself
+    for( unsigned int i=1; i<sizeof( T ) * 8; i <<= 1 ) // shift amounts 1, 2, 4, ... up to half the bit width of T
+    {
+        val |= val >> i; // copy the highest set bit into every lower bit
+    }
+    return val + 1; // all-ones below the top bit, plus one, gives the next power of two
+}
+
+static etcpak_force_inline int CountSetBits( uint32_t val ) // Returns the number of 1 bits in val (0..32), computed without a loop.
+{
+    val -= ( val >> 1 ) & 0x55555555; // each 2-bit field now holds the count of its two bits
+    val = ( ( val >> 2 ) & 0x33333333 ) + ( val & 0x33333333 ); // sum adjacent 2-bit counts into 4-bit fields
+    val = ( ( val >> 4 ) + val ) & 0x0f0f0f0f; // sum adjacent 4-bit counts into per-byte counts
+    val += val >> 8; // fold the byte counts together
+    val += val >> 16; // low byte now holds the total
+    return val & 0x0000003f; // keep the low 6 bits, enough to hold 32
+}
+
+static etcpak_force_inline int CountLeadingZeros( uint32_t val ) // Returns the number of zero bits above the highest set bit; 32 when val is 0.
+{
+    val |= val >> 1; // these five steps copy the highest set bit into every lower bit
+    val |= val >> 2;
+    val |= val >> 4;
+    val |= val >> 8;
+    val |= val >> 16;
+    return 32 - CountSetBits( val ); // set bits now run from the top bit down to bit 0
+}
+
+static etcpak_force_inline float sRGB2linear( float v ) // Piecewise sRGB-to-linear transfer function; input presumably normalized to 0..1 (not checked here).
+{
+    const float a = 0.055f; // offset used by the power segment
+    if( v <= 0.04045f ) // at or below this threshold the curve is a straight line
+    {
+        return v / 12.92f;
+    }
+    else
+    {
+        return pow( ( v + a ) / ( 1 + a ), 2.4f ); // power segment with exponent 2.4
+    }
+}
+
+static etcpak_force_inline float linear2sRGB( float v ) // Piecewise linear-to-sRGB transfer function; input presumably normalized to 0..1 (not checked here).
+{
+    const float a = 0.055f; // offset used by the power segment
+    if( v <= 0.0031308f ) // at or below this threshold the curve is a straight line
+    {
+        return 12.92f * v;
+    }
+    else
+    {
+        return ( 1 + a ) * pow( v, 1/2.4f ) - a; // power segment with exponent 1/2.4
+    }
+}
+
+template<class T>
+static etcpak_force_inline T SmoothStep( T x ) // Evaluates the cubic 3x^2 - 2x^3; x is used as given, with no clamping.
+{
+    return x*x*(3-2*x); // yields 0 at x == 0 and 1 at x == 1
+}
+
+static etcpak_force_inline uint8_t clampu8( int32_t val ) // Clamps val to the range 0..255.
+{
+    if( ( val & ~0xFF ) == 0 ) return val; // no bits outside the low byte: already in range
+    return ( ( ~val ) >> 31 ) & 0xFF; // negative val gives 0; val > 255 gives 255 (relies on arithmetic right shift of a negative int)
+}
+
+template<class T>
+static etcpak_force_inline T sq( T val ) // Returns val multiplied by itself.
+{
+    return val * val; // computed in T, so overflow behaviour is that of T
+}
+
+static etcpak_force_inline int mul8bit( int a, int b ) // Division-free scaled product; presumably a rounded a*b/255 for operands in 0..255 - confirm with callers.
+{
+    int t = a*b + 128; // bias added before the shifts
+    return ( t + ( t >> 8 ) ) >> 8; // shift-and-add in place of an integer divide
+}
+
+#endif
diff --git a/thirdparty/etcpak/ProcessCommon.hpp b/thirdparty/etcpak/ProcessCommon.hpp
new file mode 100644
index 0000000000..657d68888f
--- /dev/null
+++ b/thirdparty/etcpak/ProcessCommon.hpp
@@ -0,0 +1,50 @@
+#ifndef __PROCESSCOMMON_HPP__
+#define __PROCESSCOMMON_HPP__
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+template<class T>
+static size_t GetLeastError( const T* err, size_t num ) // Returns the index of the smallest of err[0..num-1]; the first one wins on ties.
+{
+    size_t idx = 0; // also the result when num is 0 or 1, in which case err is never read
+    for( size_t i=1; i<num; i++ )
+    {
+        if( err[i] < err[idx] ) // strict comparison keeps the earliest minimum
+        {
+            idx = i;
+        }
+    }
+    return idx;
+}
+
+static uint64_t FixByteOrder( uint64_t d ) // Keeps the low 32 bits of d and reverses the byte order of the high 32 bits.
+{
+    return ( ( d & 0x00000000FFFFFFFF ) ) | // bytes 0-3 unchanged
+           ( ( d & 0xFF00000000000000 ) >> 24 ) | // byte 7 moves to byte 4
+           ( ( d & 0x000000FF00000000 ) << 24 ) | // byte 4 moves to byte 7
+           ( ( d & 0x00FF000000000000 ) >> 8 ) | // byte 6 moves to byte 5
+           ( ( d & 0x0000FF0000000000 ) << 8 ); // byte 5 moves to byte 6
+}
+
+template<class T, class S>
+static uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id ) // ORs two 3-bit table indices and 16 two-bit selectors into d and returns it.
+{
+    size_t tidx[2]; // index of the least error in each of the two terr rows
+    tidx[0] = GetLeastError( terr[0], 8 );
+    tidx[1] = GetLeastError( terr[1], 8 );
+
+    d |= tidx[0] << 26; // first index goes to bits 26-28
+    d |= tidx[1] << 29; // second index goes to bits 29-31
+    for( int i=0; i<16; i++ ) // id must provide at least 16 entries
+    {
+        uint64_t t = tsel[i][tidx[id[i]%2]]; // id[i]%2 picks which of the two indices applies to entry i
+        d |= ( t & 0x1 ) << ( i + 32 ); // bit 0 of t lands on bit i+32
+        d |= ( t & 0x2 ) << ( i + 47 ); // bit 1 of t lands on bit i+48
+    }
+
+    return d;
+}
+
+#endif
diff --git a/thirdparty/etcpak/ProcessDxtc.cpp b/thirdparty/etcpak/ProcessDxtc.cpp
new file mode 100644
index 0000000000..508d55fd75
--- /dev/null
+++ b/thirdparty/etcpak/ProcessDxtc.cpp
@@ -0,0 +1,956 @@
+#include "Dither.hpp"
+#include "ForceInline.hpp"
+#include "ProcessDxtc.hpp"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#ifdef __ARM_NEON
+#  include <arm_neon.h>
+#endif
+
+#if defined __AVX__ && !defined __SSE4_1__
+#  define __SSE4_1__
+#endif
+
+#if defined __SSE4_1__ || defined __AVX2__
+#  ifdef _MSC_VER
+#    include <intrin.h>
+#  else
+#    include <x86intrin.h>
+#    ifndef _mm256_cvtsi256_si32
+#      define _mm256_cvtsi256_si32( v ) ( _mm_cvtsi128_si32( _mm256_castsi256_si128( v ) ) )
+#    endif
+#  endif
+#endif
+
+
+static etcpak_force_inline uint16_t to565( uint8_t r, uint8_t g, uint8_t b ) // packs three 8-bit channels into 16 bits by dropping their low bits
+{
+    return ( b >> 3 ) | ( ( g & 0xFC ) << 3 ) | ( ( r & 0xF8 ) << 8 ); // b -> bits 0-4, g -> bits 5-10, r -> bits 11-15
+}
+
+static etcpak_force_inline uint16_t to565( uint32_t c )
+{
+    // Packed-pixel overload: byte 0 of c lands in bits 11-15, byte 1 in bits 5-10, byte 2 in bits 0-4; byte 3 is ignored.
+    const uint32_t top = ( c & 0x0000F8 ) << 8;
+    const uint32_t middle = ( c & 0x00FC00 ) >> 5;
+    return top | middle | ( ( c & 0xF80000 ) >> 19 );
+}
+
+static const uint8_t DxtcIndexTable[256] = { // byte-wise remap of four packed 2-bit selectors: each field is translated 0->1, 1->3, 2->2, 3->0
+    85,     87,     86,     84,     93,     95,     94,     92,     89,     91,     90,     88,     81,     83,     82,     80,
+    117,    119,    118,    116,    125,    127,    126,    124,    121,    123,    122,    120,    113,    115,    114,    112,
+    101,    103,    102,    100,    109,    111,    110,    108,    105,    107,    106,    104,    97,     99,     98,     96,
+    69,     71,     70,     68,     77,     79,     78,     76,     73,     75,     74,     72,     65,     67,     66,     64,
+    213,    215,    214,    212,    221,    223,    222,    220,    217,    219,    218,    216,    209,    211,    210,    208,
+    245,    247,    246,    244,    253,    255,    254,    252,    249,    251,    250,    248,    241,    243,    242,    240,
+    229,    231,    230,    228,    237,    239,    238,    236,    233,    235,    234,    232,    225,    227,    226,    224,
+    197,    199,    198,    196,    205,    207,    206,    204,    201,    203,    202,    200,    193,    195,    194,    192,
+    149,    151,    150,    148,    157,    159,    158,    156,    153,    155,    154,    152,    145,    147,    146,    144,
+    181,    183,    182,    180,    189,    191,    190,    188,    185,    187,    186,    184,    177,    179,    178,    176,
+    165,    167,    166,    164,    173,    175,    174,    172,    169,    171,    170,    168,    161,    163,    162,    160,
+    133,    135,    134,    132,    141,    143,    142,    140,    137,    139,    138,    136,    129,    131,    130,    128,
+    21,     23,     22,     20,     29,     31,     30,     28,     25,     27,     26,     24,     17,     19,     18,     16,
+    53,     55,     54,     52,     61,     63,     62,     60,     57,     59,     58,     56,     49,     51,     50,     48,
+    37,     39,     38,     36,     45,     47,     46,     44,     41,     43,     42,     40,     33,     35,     34,     32,
+    5,      7,      6,      4,      13,     15,     14,     12,     9,      11,     10,     8,      1,      3,      2,      0
+};
+
+static const uint8_t AlphaIndexTable_SSE[64] = { // remaps two packed 3-bit fields at once: each field is translated 0->1, 1->7, 2->6, 3->5, 4->4, 5->3, 6->2, 7->0
+    9,      15,     14,     13,     12,     11,     10,     8,      57,     63,     62,     61,     60,     59,     58,     56,
+    49,     55,     54,     53,     52,     51,     50,     48,     41,     47,     46,     45,     44,     43,     42,     40,
+    33,     39,     38,     37,     36,     35,     34,     32,     25,     31,     30,     29,     28,     27,     26,     24,
+    17,     23,     22,     21,     20,     19,     18,     16,     1,      7,      6,      5,      4,      3,      2,      0,
+};
+
+static const uint16_t DivTable[255*3+1] = { // 16.16 reciprocal table: entry i is (4 << 16) / (i + 1), saturated to 0xffff; ProcessRGB indexes it with the summed per-channel range (0..765)
+    0xffff, 0xffff, 0xffff, 0xffff, 0xcccc, 0xaaaa, 0x9249, 0x8000, 0x71c7, 0x6666, 0x5d17, 0x5555, 0x4ec4, 0x4924, 0x4444, 0x4000,
+    0x3c3c, 0x38e3, 0x35e5, 0x3333, 0x30c3, 0x2e8b, 0x2c85, 0x2aaa, 0x28f5, 0x2762, 0x25ed, 0x2492, 0x234f, 0x2222, 0x2108, 0x2000,
+    0x1f07, 0x1e1e, 0x1d41, 0x1c71, 0x1bac, 0x1af2, 0x1a41, 0x1999, 0x18f9, 0x1861, 0x17d0, 0x1745, 0x16c1, 0x1642, 0x15c9, 0x1555,
+    0x14e5, 0x147a, 0x1414, 0x13b1, 0x1352, 0x12f6, 0x129e, 0x1249, 0x11f7, 0x11a7, 0x115b, 0x1111, 0x10c9, 0x1084, 0x1041, 0x1000,
+    0x0fc0, 0x0f83, 0x0f48, 0x0f0f, 0x0ed7, 0x0ea0, 0x0e6c, 0x0e38, 0x0e07, 0x0dd6, 0x0da7, 0x0d79, 0x0d4c, 0x0d20, 0x0cf6, 0x0ccc,
+    0x0ca4, 0x0c7c, 0x0c56, 0x0c30, 0x0c0c, 0x0be8, 0x0bc5, 0x0ba2, 0x0b81, 0x0b60, 0x0b40, 0x0b21, 0x0b02, 0x0ae4, 0x0ac7, 0x0aaa,
+    0x0a8e, 0x0a72, 0x0a57, 0x0a3d, 0x0a23, 0x0a0a, 0x09f1, 0x09d8, 0x09c0, 0x09a9, 0x0991, 0x097b, 0x0964, 0x094f, 0x0939, 0x0924,
+    0x090f, 0x08fb, 0x08e7, 0x08d3, 0x08c0, 0x08ad, 0x089a, 0x0888, 0x0876, 0x0864, 0x0853, 0x0842, 0x0831, 0x0820, 0x0810, 0x0800,
+    0x07f0, 0x07e0, 0x07d1, 0x07c1, 0x07b3, 0x07a4, 0x0795, 0x0787, 0x0779, 0x076b, 0x075d, 0x0750, 0x0743, 0x0736, 0x0729, 0x071c,
+    0x070f, 0x0703, 0x06f7, 0x06eb, 0x06df, 0x06d3, 0x06c8, 0x06bc, 0x06b1, 0x06a6, 0x069b, 0x0690, 0x0685, 0x067b, 0x0670, 0x0666,
+    0x065c, 0x0652, 0x0648, 0x063e, 0x0634, 0x062b, 0x0621, 0x0618, 0x060f, 0x0606, 0x05fd, 0x05f4, 0x05eb, 0x05e2, 0x05d9, 0x05d1,
+    0x05c9, 0x05c0, 0x05b8, 0x05b0, 0x05a8, 0x05a0, 0x0598, 0x0590, 0x0588, 0x0581, 0x0579, 0x0572, 0x056b, 0x0563, 0x055c, 0x0555,
+    0x054e, 0x0547, 0x0540, 0x0539, 0x0532, 0x052b, 0x0525, 0x051e, 0x0518, 0x0511, 0x050b, 0x0505, 0x04fe, 0x04f8, 0x04f2, 0x04ec,
+    0x04e6, 0x04e0, 0x04da, 0x04d4, 0x04ce, 0x04c8, 0x04c3, 0x04bd, 0x04b8, 0x04b2, 0x04ad, 0x04a7, 0x04a2, 0x049c, 0x0497, 0x0492,
+    0x048d, 0x0487, 0x0482, 0x047d, 0x0478, 0x0473, 0x046e, 0x0469, 0x0465, 0x0460, 0x045b, 0x0456, 0x0452, 0x044d, 0x0448, 0x0444,
+    0x043f, 0x043b, 0x0436, 0x0432, 0x042d, 0x0429, 0x0425, 0x0421, 0x041c, 0x0418, 0x0414, 0x0410, 0x040c, 0x0408, 0x0404, 0x0400,
+    0x03fc, 0x03f8, 0x03f4, 0x03f0, 0x03ec, 0x03e8, 0x03e4, 0x03e0, 0x03dd, 0x03d9, 0x03d5, 0x03d2, 0x03ce, 0x03ca, 0x03c7, 0x03c3,
+    0x03c0, 0x03bc, 0x03b9, 0x03b5, 0x03b2, 0x03ae, 0x03ab, 0x03a8, 0x03a4, 0x03a1, 0x039e, 0x039b, 0x0397, 0x0394, 0x0391, 0x038e,
+    0x038b, 0x0387, 0x0384, 0x0381, 0x037e, 0x037b, 0x0378, 0x0375, 0x0372, 0x036f, 0x036c, 0x0369, 0x0366, 0x0364, 0x0361, 0x035e,
+    0x035b, 0x0358, 0x0355, 0x0353, 0x0350, 0x034d, 0x034a, 0x0348, 0x0345, 0x0342, 0x0340, 0x033d, 0x033a, 0x0338, 0x0335, 0x0333,
+    0x0330, 0x032e, 0x032b, 0x0329, 0x0326, 0x0324, 0x0321, 0x031f, 0x031c, 0x031a, 0x0317, 0x0315, 0x0313, 0x0310, 0x030e, 0x030c,
+    0x0309, 0x0307, 0x0305, 0x0303, 0x0300, 0x02fe, 0x02fc, 0x02fa, 0x02f7, 0x02f5, 0x02f3, 0x02f1, 0x02ef, 0x02ec, 0x02ea, 0x02e8,
+    0x02e6, 0x02e4, 0x02e2, 0x02e0, 0x02de, 0x02dc, 0x02da, 0x02d8, 0x02d6, 0x02d4, 0x02d2, 0x02d0, 0x02ce, 0x02cc, 0x02ca, 0x02c8,
+    0x02c6, 0x02c4, 0x02c2, 0x02c0, 0x02be, 0x02bc, 0x02bb, 0x02b9, 0x02b7, 0x02b5, 0x02b3, 0x02b1, 0x02b0, 0x02ae, 0x02ac, 0x02aa,
+    0x02a8, 0x02a7, 0x02a5, 0x02a3, 0x02a1, 0x02a0, 0x029e, 0x029c, 0x029b, 0x0299, 0x0297, 0x0295, 0x0294, 0x0292, 0x0291, 0x028f,
+    0x028d, 0x028c, 0x028a, 0x0288, 0x0287, 0x0285, 0x0284, 0x0282, 0x0280, 0x027f, 0x027d, 0x027c, 0x027a, 0x0279, 0x0277, 0x0276,
+    0x0274, 0x0273, 0x0271, 0x0270, 0x026e, 0x026d, 0x026b, 0x026a, 0x0268, 0x0267, 0x0265, 0x0264, 0x0263, 0x0261, 0x0260, 0x025e,
+    0x025d, 0x025c, 0x025a, 0x0259, 0x0257, 0x0256, 0x0255, 0x0253, 0x0252, 0x0251, 0x024f, 0x024e, 0x024d, 0x024b, 0x024a, 0x0249,
+    0x0247, 0x0246, 0x0245, 0x0243, 0x0242, 0x0241, 0x0240, 0x023e, 0x023d, 0x023c, 0x023b, 0x0239, 0x0238, 0x0237, 0x0236, 0x0234,
+    0x0233, 0x0232, 0x0231, 0x0230, 0x022e, 0x022d, 0x022c, 0x022b, 0x022a, 0x0229, 0x0227, 0x0226, 0x0225, 0x0224, 0x0223, 0x0222,
+    0x0220, 0x021f, 0x021e, 0x021d, 0x021c, 0x021b, 0x021a, 0x0219, 0x0218, 0x0216, 0x0215, 0x0214, 0x0213, 0x0212, 0x0211, 0x0210,
+    0x020f, 0x020e, 0x020d, 0x020c, 0x020b, 0x020a, 0x0209, 0x0208, 0x0207, 0x0206, 0x0205, 0x0204, 0x0203, 0x0202, 0x0201, 0x0200,
+    0x01ff, 0x01fe, 0x01fd, 0x01fc, 0x01fb, 0x01fa, 0x01f9, 0x01f8, 0x01f7, 0x01f6, 0x01f5, 0x01f4, 0x01f3, 0x01f2, 0x01f1, 0x01f0,
+    0x01ef, 0x01ee, 0x01ed, 0x01ec, 0x01eb, 0x01ea, 0x01e9, 0x01e9, 0x01e8, 0x01e7, 0x01e6, 0x01e5, 0x01e4, 0x01e3, 0x01e2, 0x01e1,
+    0x01e0, 0x01e0, 0x01df, 0x01de, 0x01dd, 0x01dc, 0x01db, 0x01da, 0x01da, 0x01d9, 0x01d8, 0x01d7, 0x01d6, 0x01d5, 0x01d4, 0x01d4,
+    0x01d3, 0x01d2, 0x01d1, 0x01d0, 0x01cf, 0x01cf, 0x01ce, 0x01cd, 0x01cc, 0x01cb, 0x01cb, 0x01ca, 0x01c9, 0x01c8, 0x01c7, 0x01c7,
+    0x01c6, 0x01c5, 0x01c4, 0x01c3, 0x01c3, 0x01c2, 0x01c1, 0x01c0, 0x01c0, 0x01bf, 0x01be, 0x01bd, 0x01bd, 0x01bc, 0x01bb, 0x01ba,
+    0x01ba, 0x01b9, 0x01b8, 0x01b7, 0x01b7, 0x01b6, 0x01b5, 0x01b4, 0x01b4, 0x01b3, 0x01b2, 0x01b2, 0x01b1, 0x01b0, 0x01af, 0x01af,
+    0x01ae, 0x01ad, 0x01ad, 0x01ac, 0x01ab, 0x01aa, 0x01aa, 0x01a9, 0x01a8, 0x01a8, 0x01a7, 0x01a6, 0x01a6, 0x01a5, 0x01a4, 0x01a4,
+    0x01a3, 0x01a2, 0x01a2, 0x01a1, 0x01a0, 0x01a0, 0x019f, 0x019e, 0x019e, 0x019d, 0x019c, 0x019c, 0x019b, 0x019a, 0x019a, 0x0199,
+    0x0198, 0x0198, 0x0197, 0x0197, 0x0196, 0x0195, 0x0195, 0x0194, 0x0193, 0x0193, 0x0192, 0x0192, 0x0191, 0x0190, 0x0190, 0x018f,
+    0x018f, 0x018e, 0x018d, 0x018d, 0x018c, 0x018b, 0x018b, 0x018a, 0x018a, 0x0189, 0x0189, 0x0188, 0x0187, 0x0187, 0x0186, 0x0186,
+    0x0185, 0x0184, 0x0184, 0x0183, 0x0183, 0x0182, 0x0182, 0x0181, 0x0180, 0x0180, 0x017f, 0x017f, 0x017e, 0x017e, 0x017d, 0x017d,
+    0x017c, 0x017b, 0x017b, 0x017a, 0x017a, 0x0179, 0x0179, 0x0178, 0x0178, 0x0177, 0x0177, 0x0176, 0x0175, 0x0175, 0x0174, 0x0174,
+    0x0173, 0x0173, 0x0172, 0x0172, 0x0171, 0x0171, 0x0170, 0x0170, 0x016f, 0x016f, 0x016e, 0x016e, 0x016d, 0x016d, 0x016c, 0x016c,
+    0x016b, 0x016b, 0x016a, 0x016a, 0x0169, 0x0169, 0x0168, 0x0168, 0x0167, 0x0167, 0x0166, 0x0166, 0x0165, 0x0165, 0x0164, 0x0164,
+    0x0163, 0x0163, 0x0162, 0x0162, 0x0161, 0x0161, 0x0160, 0x0160, 0x015f, 0x015f, 0x015e, 0x015e, 0x015d, 0x015d, 0x015d, 0x015c,
+    0x015c, 0x015b, 0x015b, 0x015a, 0x015a, 0x0159, 0x0159, 0x0158, 0x0158, 0x0158, 0x0157, 0x0157, 0x0156, 0x0156
+};
+static const uint16_t DivTableNEON[255*3+1] = { // entry i is (2 << 16) / (i + 1) for i >= 17 and 0 for i <= 16; half of DivTable's scale, fed to vqdmulhq_s16 (a doubling multiply-high) in the AArch64 path of ProcessRGB
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x1c71, 0x1af2, 0x1999, 0x1861, 0x1745, 0x1642, 0x1555, 0x147a, 0x13b1, 0x12f6, 0x1249, 0x11a7, 0x1111, 0x1084, 0x1000,
+    0x0f83, 0x0f0f, 0x0ea0, 0x0e38, 0x0dd6, 0x0d79, 0x0d20, 0x0ccc, 0x0c7c, 0x0c30, 0x0be8, 0x0ba2, 0x0b60, 0x0b21, 0x0ae4, 0x0aaa,
+    0x0a72, 0x0a3d, 0x0a0a, 0x09d8, 0x09a9, 0x097b, 0x094f, 0x0924, 0x08fb, 0x08d3, 0x08ad, 0x0888, 0x0864, 0x0842, 0x0820, 0x0800,
+    0x07e0, 0x07c1, 0x07a4, 0x0787, 0x076b, 0x0750, 0x0736, 0x071c, 0x0703, 0x06eb, 0x06d3, 0x06bc, 0x06a6, 0x0690, 0x067b, 0x0666,
+    0x0652, 0x063e, 0x062b, 0x0618, 0x0606, 0x05f4, 0x05e2, 0x05d1, 0x05c0, 0x05b0, 0x05a0, 0x0590, 0x0581, 0x0572, 0x0563, 0x0555,
+    0x0547, 0x0539, 0x052b, 0x051e, 0x0511, 0x0505, 0x04f8, 0x04ec, 0x04e0, 0x04d4, 0x04c8, 0x04bd, 0x04b2, 0x04a7, 0x049c, 0x0492,
+    0x0487, 0x047d, 0x0473, 0x0469, 0x0460, 0x0456, 0x044d, 0x0444, 0x043b, 0x0432, 0x0429, 0x0421, 0x0418, 0x0410, 0x0408, 0x0400,
+    0x03f8, 0x03f0, 0x03e8, 0x03e0, 0x03d9, 0x03d2, 0x03ca, 0x03c3, 0x03bc, 0x03b5, 0x03ae, 0x03a8, 0x03a1, 0x039b, 0x0394, 0x038e,
+    0x0387, 0x0381, 0x037b, 0x0375, 0x036f, 0x0369, 0x0364, 0x035e, 0x0358, 0x0353, 0x034d, 0x0348, 0x0342, 0x033d, 0x0338, 0x0333,
+    0x032e, 0x0329, 0x0324, 0x031f, 0x031a, 0x0315, 0x0310, 0x030c, 0x0307, 0x0303, 0x02fe, 0x02fa, 0x02f5, 0x02f1, 0x02ec, 0x02e8,
+    0x02e4, 0x02e0, 0x02dc, 0x02d8, 0x02d4, 0x02d0, 0x02cc, 0x02c8, 0x02c4, 0x02c0, 0x02bc, 0x02b9, 0x02b5, 0x02b1, 0x02ae, 0x02aa,
+    0x02a7, 0x02a3, 0x02a0, 0x029c, 0x0299, 0x0295, 0x0292, 0x028f, 0x028c, 0x0288, 0x0285, 0x0282, 0x027f, 0x027c, 0x0279, 0x0276,
+    0x0273, 0x0270, 0x026d, 0x026a, 0x0267, 0x0264, 0x0261, 0x025e, 0x025c, 0x0259, 0x0256, 0x0253, 0x0251, 0x024e, 0x024b, 0x0249,
+    0x0246, 0x0243, 0x0241, 0x023e, 0x023c, 0x0239, 0x0237, 0x0234, 0x0232, 0x0230, 0x022d, 0x022b, 0x0229, 0x0226, 0x0224, 0x0222,
+    0x021f, 0x021d, 0x021b, 0x0219, 0x0216, 0x0214, 0x0212, 0x0210, 0x020e, 0x020c, 0x020a, 0x0208, 0x0206, 0x0204, 0x0202, 0x0200,
+    0x01fe, 0x01fc, 0x01fa, 0x01f8, 0x01f6, 0x01f4, 0x01f2, 0x01f0, 0x01ee, 0x01ec, 0x01ea, 0x01e9, 0x01e7, 0x01e5, 0x01e3, 0x01e1,
+    0x01e0, 0x01de, 0x01dc, 0x01da, 0x01d9, 0x01d7, 0x01d5, 0x01d4, 0x01d2, 0x01d0, 0x01cf, 0x01cd, 0x01cb, 0x01ca, 0x01c8, 0x01c7,
+    0x01c5, 0x01c3, 0x01c2, 0x01c0, 0x01bf, 0x01bd, 0x01bc, 0x01ba, 0x01b9, 0x01b7, 0x01b6, 0x01b4, 0x01b3, 0x01b2, 0x01b0, 0x01af,
+    0x01ad, 0x01ac, 0x01aa, 0x01a9, 0x01a8, 0x01a6, 0x01a5, 0x01a4, 0x01a2, 0x01a1, 0x01a0, 0x019e, 0x019d, 0x019c, 0x019a, 0x0199,
+    0x0198, 0x0197, 0x0195, 0x0194, 0x0193, 0x0192, 0x0190, 0x018f, 0x018e, 0x018d, 0x018b, 0x018a, 0x0189, 0x0188, 0x0187, 0x0186,
+    0x0184, 0x0183, 0x0182, 0x0181, 0x0180, 0x017f, 0x017e, 0x017d, 0x017b, 0x017a, 0x0179, 0x0178, 0x0177, 0x0176, 0x0175, 0x0174,
+    0x0173, 0x0172, 0x0171, 0x0170, 0x016f, 0x016e, 0x016d, 0x016c, 0x016b, 0x016a, 0x0169, 0x0168, 0x0167, 0x0166, 0x0165, 0x0164,
+    0x0163, 0x0162, 0x0161, 0x0160, 0x015f, 0x015e, 0x015d, 0x015c, 0x015b, 0x015a, 0x0159, 0x0158, 0x0158, 0x0157, 0x0156, 0x0155,
+    0x0154, 0x0153, 0x0152, 0x0151, 0x0150, 0x0150, 0x014f, 0x014e, 0x014d, 0x014c, 0x014b, 0x014a, 0x014a, 0x0149, 0x0148, 0x0147,
+    0x0146, 0x0146, 0x0145, 0x0144, 0x0143, 0x0142, 0x0142, 0x0141, 0x0140, 0x013f, 0x013e, 0x013e, 0x013d, 0x013c, 0x013b, 0x013b,
+    0x013a, 0x0139, 0x0138, 0x0138, 0x0137, 0x0136, 0x0135, 0x0135, 0x0134, 0x0133, 0x0132, 0x0132, 0x0131, 0x0130, 0x0130, 0x012f,
+    0x012e, 0x012e, 0x012d, 0x012c, 0x012b, 0x012b, 0x012a, 0x0129, 0x0129, 0x0128, 0x0127, 0x0127, 0x0126, 0x0125, 0x0125, 0x0124,
+    0x0123, 0x0123, 0x0122, 0x0121, 0x0121, 0x0120, 0x0120, 0x011f, 0x011e, 0x011e, 0x011d, 0x011c, 0x011c, 0x011b, 0x011b, 0x011a,
+    0x0119, 0x0119, 0x0118, 0x0118, 0x0117, 0x0116, 0x0116, 0x0115, 0x0115, 0x0114, 0x0113, 0x0113, 0x0112, 0x0112, 0x0111, 0x0111,
+    0x0110, 0x010f, 0x010f, 0x010e, 0x010e, 0x010d, 0x010d, 0x010c, 0x010c, 0x010b, 0x010a, 0x010a, 0x0109, 0x0109, 0x0108, 0x0108,
+    0x0107, 0x0107, 0x0106, 0x0106, 0x0105, 0x0105, 0x0104, 0x0104, 0x0103, 0x0103, 0x0102, 0x0102, 0x0101, 0x0101, 0x0100, 0x0100,
+    0x00ff, 0x00ff, 0x00fe, 0x00fe, 0x00fd, 0x00fd, 0x00fc, 0x00fc, 0x00fb, 0x00fb, 0x00fa, 0x00fa, 0x00f9, 0x00f9, 0x00f8, 0x00f8,
+    0x00f7, 0x00f7, 0x00f6, 0x00f6, 0x00f5, 0x00f5, 0x00f4, 0x00f4, 0x00f4, 0x00f3, 0x00f3, 0x00f2, 0x00f2, 0x00f1, 0x00f1, 0x00f0,
+    0x00f0, 0x00f0, 0x00ef, 0x00ef, 0x00ee, 0x00ee, 0x00ed, 0x00ed, 0x00ed, 0x00ec, 0x00ec, 0x00eb, 0x00eb, 0x00ea, 0x00ea, 0x00ea,
+    0x00e9, 0x00e9, 0x00e8, 0x00e8, 0x00e7, 0x00e7, 0x00e7, 0x00e6, 0x00e6, 0x00e5, 0x00e5, 0x00e5, 0x00e4, 0x00e4, 0x00e3, 0x00e3,
+    0x00e3, 0x00e2, 0x00e2, 0x00e1, 0x00e1, 0x00e1, 0x00e0, 0x00e0, 0x00e0, 0x00df, 0x00df, 0x00de, 0x00de, 0x00de, 0x00dd, 0x00dd,
+    0x00dd, 0x00dc, 0x00dc, 0x00db, 0x00db, 0x00db, 0x00da, 0x00da, 0x00da, 0x00d9, 0x00d9, 0x00d9, 0x00d8, 0x00d8, 0x00d7, 0x00d7,
+    0x00d7, 0x00d6, 0x00d6, 0x00d6, 0x00d5, 0x00d5, 0x00d5, 0x00d4, 0x00d4, 0x00d4, 0x00d3, 0x00d3, 0x00d3, 0x00d2, 0x00d2, 0x00d2,
+    0x00d1, 0x00d1, 0x00d1, 0x00d0, 0x00d0, 0x00d0, 0x00cf, 0x00cf, 0x00cf, 0x00ce, 0x00ce, 0x00ce, 0x00cd, 0x00cd, 0x00cd, 0x00cc,
+    0x00cc, 0x00cc, 0x00cb, 0x00cb, 0x00cb, 0x00ca, 0x00ca, 0x00ca, 0x00c9, 0x00c9, 0x00c9, 0x00c9, 0x00c8, 0x00c8, 0x00c8, 0x00c7,
+    0x00c7, 0x00c7, 0x00c6, 0x00c6, 0x00c6, 0x00c5, 0x00c5, 0x00c5, 0x00c5, 0x00c4, 0x00c4, 0x00c4, 0x00c3, 0x00c3, 0x00c3, 0x00c3,
+    0x00c2, 0x00c2, 0x00c2, 0x00c1, 0x00c1, 0x00c1, 0x00c1, 0x00c0, 0x00c0, 0x00c0, 0x00bf, 0x00bf, 0x00bf, 0x00bf, 0x00be, 0x00be,
+    0x00be, 0x00bd, 0x00bd, 0x00bd, 0x00bd, 0x00bc, 0x00bc, 0x00bc, 0x00bc, 0x00bb, 0x00bb, 0x00bb, 0x00ba, 0x00ba, 0x00ba, 0x00ba,
+    0x00b9, 0x00b9, 0x00b9, 0x00b9, 0x00b8, 0x00b8, 0x00b8, 0x00b8, 0x00b7, 0x00b7, 0x00b7, 0x00b7, 0x00b6, 0x00b6, 0x00b6, 0x00b6,
+    0x00b5, 0x00b5, 0x00b5, 0x00b5, 0x00b4, 0x00b4, 0x00b4, 0x00b4, 0x00b3, 0x00b3, 0x00b3, 0x00b3, 0x00b2, 0x00b2, 0x00b2, 0x00b2,
+    0x00b1, 0x00b1, 0x00b1, 0x00b1, 0x00b0, 0x00b0, 0x00b0, 0x00b0, 0x00af, 0x00af, 0x00af, 0x00af, 0x00ae, 0x00ae, 0x00ae, 0x00ae,
+    0x00ae, 0x00ad, 0x00ad, 0x00ad, 0x00ad, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ac, 0x00ab, 0x00ab, 0x00ab, 0x00ab,
+};
+
+static const uint16_t DivTableAlpha[256] = { // 16.16 reciprocal table: entry i is (8 << 16) / (i + 1), saturated to 0xffff; presumably indexed by an 8-bit alpha range - confirm at the use site
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xe38e, 0xcccc, 0xba2e, 0xaaaa, 0x9d89, 0x9249, 0x8888, 0x8000,
+    0x7878, 0x71c7, 0x6bca, 0x6666, 0x6186, 0x5d17, 0x590b, 0x5555, 0x51eb, 0x4ec4, 0x4bda, 0x4924, 0x469e, 0x4444, 0x4210, 0x4000,
+    0x3e0f, 0x3c3c, 0x3a83, 0x38e3, 0x3759, 0x35e5, 0x3483, 0x3333, 0x31f3, 0x30c3, 0x2fa0, 0x2e8b, 0x2d82, 0x2c85, 0x2b93, 0x2aaa,
+    0x29cb, 0x28f5, 0x2828, 0x2762, 0x26a4, 0x25ed, 0x253c, 0x2492, 0x23ee, 0x234f, 0x22b6, 0x2222, 0x2192, 0x2108, 0x2082, 0x2000,
+    0x1f81, 0x1f07, 0x1e91, 0x1e1e, 0x1dae, 0x1d41, 0x1cd8, 0x1c71, 0x1c0e, 0x1bac, 0x1b4e, 0x1af2, 0x1a98, 0x1a41, 0x19ec, 0x1999,
+    0x1948, 0x18f9, 0x18ac, 0x1861, 0x1818, 0x17d0, 0x178a, 0x1745, 0x1702, 0x16c1, 0x1681, 0x1642, 0x1605, 0x15c9, 0x158e, 0x1555,
+    0x151d, 0x14e5, 0x14af, 0x147a, 0x1446, 0x1414, 0x13e2, 0x13b1, 0x1381, 0x1352, 0x1323, 0x12f6, 0x12c9, 0x129e, 0x1273, 0x1249,
+    0x121f, 0x11f7, 0x11cf, 0x11a7, 0x1181, 0x115b, 0x1135, 0x1111, 0x10ec, 0x10c9, 0x10a6, 0x1084, 0x1062, 0x1041, 0x1020, 0x1000,
+    0x0fe0, 0x0fc0, 0x0fa2, 0x0f83, 0x0f66, 0x0f48, 0x0f2b, 0x0f0f, 0x0ef2, 0x0ed7, 0x0ebb, 0x0ea0, 0x0e86, 0x0e6c, 0x0e52, 0x0e38,
+    0x0e1f, 0x0e07, 0x0dee, 0x0dd6, 0x0dbe, 0x0da7, 0x0d90, 0x0d79, 0x0d62, 0x0d4c, 0x0d36, 0x0d20, 0x0d0b, 0x0cf6, 0x0ce1, 0x0ccc,
+    0x0cb8, 0x0ca4, 0x0c90, 0x0c7c, 0x0c69, 0x0c56, 0x0c43, 0x0c30, 0x0c1e, 0x0c0c, 0x0bfa, 0x0be8, 0x0bd6, 0x0bc5, 0x0bb3, 0x0ba2,
+    0x0b92, 0x0b81, 0x0b70, 0x0b60, 0x0b50, 0x0b40, 0x0b30, 0x0b21, 0x0b11, 0x0b02, 0x0af3, 0x0ae4, 0x0ad6, 0x0ac7, 0x0ab8, 0x0aaa,
+    0x0a9c, 0x0a8e, 0x0a80, 0x0a72, 0x0a65, 0x0a57, 0x0a4a, 0x0a3d, 0x0a30, 0x0a23, 0x0a16, 0x0a0a, 0x09fd, 0x09f1, 0x09e4, 0x09d8,
+    0x09cc, 0x09c0, 0x09b4, 0x09a9, 0x099d, 0x0991, 0x0986, 0x097b, 0x0970, 0x0964, 0x095a, 0x094f, 0x0944, 0x0939, 0x092f, 0x0924,
+    0x091a, 0x090f, 0x0905, 0x08fb, 0x08f1, 0x08e7, 0x08dd, 0x08d3, 0x08ca, 0x08c0, 0x08b7, 0x08ad, 0x08a4, 0x089a, 0x0891, 0x0888,
+    0x087f, 0x0876, 0x086d, 0x0864, 0x085b, 0x0853, 0x084a, 0x0842, 0x0839, 0x0831, 0x0828, 0x0820, 0x0818, 0x0810, 0x0808, 0x0800,
+};
+
+static etcpak_force_inline uint64_t ProcessRGB( const uint8_t* src ) // Packs 16 four-byte pixels (src[0..63]) into one 64-bit word: bits 0-15 = max endpoint (565), bits 16-31 = min endpoint (565), bits 32-63 = sixteen 2-bit selectors. Byte 3 of each pixel is ignored.
+{
+#ifdef __SSE4_1__
+    __m128i px0 = _mm_and_si128( _mm_loadu_si128(((__m128i*)src) + 0), _mm_set1_epi32( 0xFFFFFF ) ); // byte 3 is cleared on load: otherwise it feeds the SAD below and can push vrange past DivTable's 766 entries
+    __m128i px1 = _mm_and_si128( _mm_loadu_si128(((__m128i*)src) + 1), _mm_set1_epi32( 0xFFFFFF ) ); // same 0xFFFFFF mask the 32-bit NEON path applies
+    __m128i px2 = _mm_and_si128( _mm_loadu_si128(((__m128i*)src) + 2), _mm_set1_epi32( 0xFFFFFF ) );
+    __m128i px3 = _mm_and_si128( _mm_loadu_si128(((__m128i*)src) + 3), _mm_set1_epi32( 0xFFFFFF ) );
+
+    __m128i smask = _mm_set1_epi32( 0xF8FCF8 ); // keeps only the bits that survive 565 quantisation
+    __m128i sd0 = _mm_and_si128( px0, smask );
+    __m128i sd1 = _mm_and_si128( px1, smask );
+    __m128i sd2 = _mm_and_si128( px2, smask );
+    __m128i sd3 = _mm_and_si128( px3, smask );
+
+    __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0)); // broadcast the first quantised pixel to all lanes
+
+    __m128i sc0 = _mm_cmpeq_epi8(sd0, sc);
+    __m128i sc1 = _mm_cmpeq_epi8(sd1, sc);
+    __m128i sc2 = _mm_cmpeq_epi8(sd2, sc);
+    __m128i sc3 = _mm_cmpeq_epi8(sd3, sc);
+
+    __m128i sm0 = _mm_and_si128(sc0, sc1);
+    __m128i sm1 = _mm_and_si128(sc2, sc3);
+    __m128i sm = _mm_and_si128(sm0, sm1);
+
+    if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) ) // all 16 pixels quantise to the same 565 colour
+    {
+        uint32_t c;
+        memcpy( &c, src, 4 );
+        return uint64_t( to565( c ) ) << 16; // solid block: colour in bits 16-31, everything else zero
+    }
+
+    __m128i min0 = _mm_min_epu8( px0, px1 );
+    __m128i min1 = _mm_min_epu8( px2, px3 );
+    __m128i min2 = _mm_min_epu8( min0, min1 );
+
+    __m128i max0 = _mm_max_epu8( px0, px1 );
+    __m128i max1 = _mm_max_epu8( px2, px3 );
+    __m128i max2 = _mm_max_epu8( max0, max1 );
+
+    __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); // swap neighbouring lanes
+    __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+    __m128i min4 = _mm_min_epu8( min2, min3 );
+    __m128i max4 = _mm_max_epu8( max2, max3 );
+
+    __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); // swap the two halves
+    __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
+    __m128i rmin = _mm_min_epu8( min4, min5 ); // every 32-bit lane now holds the per-channel minimum of the block
+    __m128i rmax = _mm_max_epu8( max4, max5 ); // and likewise the per-channel maximum
+
+    __m128i range1 = _mm_subs_epu8( rmax, rmin );
+    __m128i range2 = _mm_sad_epu8( rmax, rmin );
+
+    uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1; // the SAD spans two identical lanes, so halving gives the summed channel range (0..765)
+    __m128i range = _mm_set1_epi16( DivTable[vrange] );
+
+    __m128i inset1 = _mm_srli_epi16( range1, 4 );
+    __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) ); // per-byte range >> 4; the mask drops bits shifted in from the neighbouring byte
+    __m128i min = _mm_adds_epu8( rmin, inset ); // endpoints are pulled inwards by 1/16 of the range
+    __m128i max = _mm_subs_epu8( rmax, inset );
+
+    __m128i c0 = _mm_subs_epu8( px0, rmin );
+    __m128i c1 = _mm_subs_epu8( px1, rmin );
+    __m128i c2 = _mm_subs_epu8( px2, rmin );
+    __m128i c3 = _mm_subs_epu8( px3, rmin );
+
+    __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) ); // multiplying by 1 just adds adjacent byte pairs
+    __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) );
+    __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) );
+    __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) );
+
+    __m128i s0 = _mm_hadd_epi16( is0, is1 ); // one 16-bit sum of (pixel - rmin) over the channels per pixel
+    __m128i s1 = _mm_hadd_epi16( is2, is3 );
+
+    __m128i m0 = _mm_mulhi_epu16( s0, range ); // ( sum * DivTable[vrange] ) >> 16, the selector for each pixel
+    __m128i m1 = _mm_mulhi_epu16( s1, range );
+
+    __m128i p0 = _mm_packus_epi16( m0, m1 ); // one selector per byte, 16 bytes
+
+    __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) );
+    __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 );
+    __m128i p3 = _mm_or_si128( p1, p2 ); // low byte of each lane now packs that lane's four selectors, 2 bits each
+    __m128i p =_mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) ); // gather those four low bytes into one 32-bit value
+
+    uint32_t vmin = _mm_cvtsi128_si32( min );
+    uint32_t vmax = _mm_cvtsi128_si32( max );
+    uint32_t vp = _mm_cvtsi128_si32( p );
+
+    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
+#elif defined __ARM_NEON
+#  ifdef __aarch64__
+    uint8x16x4_t px = vld4q_u8( src ); // de-interleaving load: val[0..2] are the three colour planes, val[3] is never read
+
+    uint8x16_t lr = px.val[0];
+    uint8x16_t lg = px.val[1];
+    uint8x16_t lb = px.val[2];
+
+    uint8_t rmaxr = vmaxvq_u8( lr );
+    uint8_t rmaxg = vmaxvq_u8( lg );
+    uint8_t rmaxb = vmaxvq_u8( lb );
+
+    uint8_t rminr = vminvq_u8( lr );
+    uint8_t rming = vminvq_u8( lg );
+    uint8_t rminb = vminvq_u8( lb );
+
+    int rr = rmaxr - rminr;
+    int rg = rmaxg - rming;
+    int rb = rmaxb - rminb;
+
+    int vrange1 = rr + rg + rb; // 0..765, always inside DivTableNEON
+    uint16_t vrange2 = DivTableNEON[vrange1];
+
+    uint8_t insetr = rr >> 4;
+    uint8_t insetg = rg >> 4;
+    uint8_t insetb = rb >> 4;
+
+    uint8_t minr = rminr + insetr;
+    uint8_t ming = rming + insetg;
+    uint8_t minb = rminb + insetb;
+
+    uint8_t maxr = rmaxr - insetr;
+    uint8_t maxg = rmaxg - insetg;
+    uint8_t maxb = rmaxb - insetb;
+
+    uint8x16_t cr = vsubq_u8( lr, vdupq_n_u8( rminr ) );
+    uint8x16_t cg = vsubq_u8( lg, vdupq_n_u8( rming ) );
+    uint8x16_t cb = vsubq_u8( lb, vdupq_n_u8( rminb ) );
+
+    uint16x8_t is0l = vaddl_u8( vget_low_u8( cr ), vget_low_u8( cg ) );
+    uint16x8_t is0h = vaddl_u8( vget_high_u8( cr ), vget_high_u8( cg ) );
+    uint16x8_t is1l = vaddw_u8( is0l, vget_low_u8( cb ) ); // widened r+g+b distance from the minimum, pixels 0-7
+    uint16x8_t is1h = vaddw_u8( is0h, vget_high_u8( cb ) ); // pixels 8-15
+
+    int16x8_t range = vdupq_n_s16( vrange2 );
+    uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1l ), range ) ); // doubling multiply-high, hence the half-scale table
+    uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( is1h ), range ) );
+
+    uint8x8_t p00 = vmovn_u16( m0 );
+    uint8x8_t p01 = vmovn_u16( m1 );
+    uint8x16_t p0 = vcombine_u8( p00, p01 ); // one selector per byte
+
+    uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) );
+    uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) );
+    uint32x4_t p3 = vaddq_u32( p1, p2 ); // low byte of each lane packs that lane's four selectors
+
+    uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) );
+    uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) ); // keep only the low byte of every lane
+
+    uint32_t vp;
+    vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 );
+
+    return uint64_t( ( uint64_t( to565( minr, ming, minb ) ) << 16 ) | to565( maxr, maxg, maxb ) | ( uint64_t( vp ) << 32 ) );
+#  else
+    uint32x4_t px0 = vld1q_u32( (uint32_t*)src );
+    uint32x4_t px1 = vld1q_u32( (uint32_t*)src + 4 );
+    uint32x4_t px2 = vld1q_u32( (uint32_t*)src + 8 );
+    uint32x4_t px3 = vld1q_u32( (uint32_t*)src + 12 );
+
+    uint32x4_t smask = vdupq_n_u32( 0xF8FCF8 ); // keeps only the bits that survive 565 quantisation
+    uint32x4_t sd0 = vandq_u32( smask, px0 );
+    uint32x4_t sd1 = vandq_u32( smask, px1 );
+    uint32x4_t sd2 = vandq_u32( smask, px2 );
+    uint32x4_t sd3 = vandq_u32( smask, px3 );
+
+    uint32x4_t sc = vdupq_n_u32( sd0[0] );
+
+    uint32x4_t sc0 = vceqq_u32( sd0, sc );
+    uint32x4_t sc1 = vceqq_u32( sd1, sc );
+    uint32x4_t sc2 = vceqq_u32( sd2, sc );
+    uint32x4_t sc3 = vceqq_u32( sd3, sc );
+
+    uint32x4_t sm0 = vandq_u32( sc0, sc1 );
+    uint32x4_t sm1 = vandq_u32( sc2, sc3 );
+    int64x2_t sm = vreinterpretq_s64_u32( vandq_u32( sm0, sm1 ) );
+
+    if( sm[0] == -1 && sm[1] == -1 ) // all 16 pixels quantise to the same 565 colour
+    {
+        return uint64_t( to565( src[0], src[1], src[2] ) ) << 16;
+    }
+
+    uint32x4_t mask = vdupq_n_u32( 0xFFFFFF ); // drop byte 3 of every pixel before the min/max search
+    uint8x16_t l0 = vreinterpretq_u8_u32( vandq_u32( mask, px0 ) );
+    uint8x16_t l1 = vreinterpretq_u8_u32( vandq_u32( mask, px1 ) );
+    uint8x16_t l2 = vreinterpretq_u8_u32( vandq_u32( mask, px2 ) );
+    uint8x16_t l3 = vreinterpretq_u8_u32( vandq_u32( mask, px3 ) );
+
+    uint8x16_t min0 = vminq_u8( l0, l1 );
+    uint8x16_t min1 = vminq_u8( l2, l3 );
+    uint8x16_t min2 = vminq_u8( min0, min1 );
+
+    uint8x16_t max0 = vmaxq_u8( l0, l1 );
+    uint8x16_t max1 = vmaxq_u8( l2, l3 );
+    uint8x16_t max2 = vmaxq_u8( max0, max1 );
+
+    uint8x16_t min3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( min2 ) ) );
+    uint8x16_t max3 = vreinterpretq_u8_u32( vrev64q_u32( vreinterpretq_u32_u8( max2 ) ) );
+
+    uint8x16_t min4 = vminq_u8( min2, min3 );
+    uint8x16_t max4 = vmaxq_u8( max2, max3 );
+
+    uint8x16_t min5 = vcombine_u8( vget_high_u8( min4 ), vget_low_u8( min4 ) );
+    uint8x16_t max5 = vcombine_u8( vget_high_u8( max4 ), vget_low_u8( max4 ) );
+
+    uint8x16_t rmin = vminq_u8( min4, min5 ); // every 32-bit lane holds the per-channel minimum
+    uint8x16_t rmax = vmaxq_u8( max4, max5 ); // and likewise the per-channel maximum
+
+    uint8x16_t range1 = vsubq_u8( rmax, rmin );
+    uint8x8_t range2 = vget_low_u8( range1 );
+    uint8x8x2_t range3 = vzip_u8( range2, vdup_n_u8( 0 ) ); // zero-extend the bytes to 16 bits
+    uint16x4_t range4 = vreinterpret_u16_u8( range3.val[0] );
+
+    uint16_t vrange1;
+    uint16x4_t range5 = vpadd_u16( range4, range4 );
+    uint16x4_t range6 = vpadd_u16( range5, range5 );
+    vst1_lane_u16( &vrange1, range6, 0 ); // summed channel range of the block
+
+    uint32_t vrange2 = ( 2 << 16 ) / uint32_t( vrange1 + 1 ); // computed directly here instead of read from DivTableNEON
+    uint16x8_t range = vdupq_n_u16( vrange2 );
+
+    uint8x16_t inset = vshrq_n_u8( range1, 4 );
+    uint8x16_t min = vaddq_u8( rmin, inset ); // endpoints are pulled inwards by 1/16 of the range
+    uint8x16_t max = vsubq_u8( rmax, inset );
+
+    uint8x16_t c0 = vsubq_u8( l0, rmin );
+    uint8x16_t c1 = vsubq_u8( l1, rmin );
+    uint8x16_t c2 = vsubq_u8( l2, rmin );
+    uint8x16_t c3 = vsubq_u8( l3, rmin );
+
+    uint16x8_t is0 = vpaddlq_u8( c0 );
+    uint16x8_t is1 = vpaddlq_u8( c1 );
+    uint16x8_t is2 = vpaddlq_u8( c2 );
+    uint16x8_t is3 = vpaddlq_u8( c3 );
+
+    uint16x4_t is4 = vpadd_u16( vget_low_u16( is0 ), vget_high_u16( is0 ) ); // one channel-sum per pixel
+    uint16x4_t is5 = vpadd_u16( vget_low_u16( is1 ), vget_high_u16( is1 ) );
+    uint16x4_t is6 = vpadd_u16( vget_low_u16( is2 ), vget_high_u16( is2 ) );
+    uint16x4_t is7 = vpadd_u16( vget_low_u16( is3 ), vget_high_u16( is3 ) );
+
+    uint16x8_t s0 = vcombine_u16( is4, is5 );
+    uint16x8_t s1 = vcombine_u16( is6, is7 );
+
+    uint16x8_t m0 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s0 ), vreinterpretq_s16_u16( range ) ) );
+    uint16x8_t m1 = vreinterpretq_u16_s16( vqdmulhq_s16( vreinterpretq_s16_u16( s1 ), vreinterpretq_s16_u16( range ) ) );
+
+    uint8x8_t p00 = vmovn_u16( m0 );
+    uint8x8_t p01 = vmovn_u16( m1 );
+    uint8x16_t p0 = vcombine_u8( p00, p01 ); // one selector per byte
+
+    uint32x4_t p1 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 6 ), vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 12 ) );
+    uint32x4_t p2 = vaddq_u32( vshrq_n_u32( vreinterpretq_u32_u8( p0 ), 18 ), vreinterpretq_u32_u8( p0 ) );
+    uint32x4_t p3 = vaddq_u32( p1, p2 ); // low byte of each lane packs that lane's four selectors
+
+    uint16x4x2_t p4 = vuzp_u16( vget_low_u16( vreinterpretq_u16_u32( p3 ) ), vget_high_u16( vreinterpretq_u16_u32( p3 ) ) );
+    uint8x8x2_t p = vuzp_u8( vreinterpret_u8_u16( p4.val[0] ), vreinterpret_u8_u16( p4.val[0] ) );
+
+    uint32_t vmin, vmax, vp;
+    vst1q_lane_u32( &vmin, vreinterpretq_u32_u8( min ), 0 );
+    vst1q_lane_u32( &vmax, vreinterpretq_u32_u8( max ), 0 );
+    vst1_lane_u32( &vp, vreinterpret_u32_u8( p.val[0] ), 0 );
+
+    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
+#  endif
+#else
+    uint32_t ref;
+    memcpy( &ref, src, 4 );
+    uint32_t refMask = ref & 0xF8FCF8; // first pixel reduced to the bits that survive 565 quantisation
+    auto stmp = src + 4;
+    for( int i=1; i<16; i++ )
+    {
+        uint32_t px;
+        memcpy( &px, stmp, 4 );
+        if( ( px & 0xF8FCF8 ) != refMask ) break;
+        stmp += 4;
+    }
+    if( stmp == src + 64 ) // the scan reached the end, so every pixel matched the first one
+    {
+        return uint64_t( to565( ref ) ) << 16; // solid block: colour in bits 16-31, everything else zero
+    }
+
+    uint8_t min[3] = { src[0], src[1], src[2] };
+    uint8_t max[3] = { src[0], src[1], src[2] };
+    auto tmp = src + 4;
+    for( int i=1; i<16; i++ )
+    {
+        for( int j=0; j<3; j++ )
+        {
+            if( tmp[j] < min[j] ) min[j] = tmp[j];
+            else if( tmp[j] > max[j] ) max[j] = tmp[j]; // else-if is enough because min and max start out equal
+        }
+        tmp += 4;
+    }
+
+    const uint32_t range = DivTable[max[0] - min[0] + max[1] - min[1] + max[2] - min[2]]; // index is the summed channel range, 0..765
+    const uint32_t rmin = min[0] + min[1] + min[2]; // captured before the inset below moves the endpoints
+    for( int i=0; i<3; i++ )
+    {
+        const uint8_t inset = ( max[i] - min[i] ) >> 4; // pull both endpoints inwards by 1/16 of the channel range
+        min[i] += inset;
+        max[i] -= inset;
+    }
+
+    uint32_t data = 0;
+    for( int i=0; i<16; i++ )
+    {
+        const uint32_t c = src[0] + src[1] + src[2] - rmin;
+        const uint8_t idx = ( c * range ) >> 16; // 16.16 multiply by the reciprocal gives the selector
+        data |= idx << (i*2); // two selector bits per pixel, pixel 0 in the lowest bits
+        src += 4;
+    }
+
+    return uint64_t( ( uint64_t( to565( min[0], min[1], min[2] ) ) << 16 ) | to565( max[0], max[1], max[2] ) | ( uint64_t( data ) << 32 ) );
+#endif
+}
+
+#ifdef __AVX2__
+// Encodes two 4x4 blocks at once and appends the two 8-byte results to dst
+// (dst is advanced by 16 bytes).
+//
+// src: 128 bytes read as four 256-bit rows of eight 4-byte pixels. Within each
+//      row the low 128-bit lane belongs to the first block and the high lane to
+//      the second (this is the layout CompressDxt1() assembles in its AVX2 path).
+// dst: output cursor, passed by reference.
+//
+// NOTE(review): unlike ProcessRGB_SSE(), byte 3 of each pixel is not masked off
+// before the min/max and sum steps below, so it takes part in the range
+// computation -- presumably callers pass pixels whose byte 3 is constant within
+// a block; confirm.
+static etcpak_force_inline void ProcessRGB_AVX( const uint8_t* src, char*& dst )
+{
+    __m256i px0 = _mm256_loadu_si256(((__m256i*)src) + 0);
+    __m256i px1 = _mm256_loadu_si256(((__m256i*)src) + 1);
+    __m256i px2 = _mm256_loadu_si256(((__m256i*)src) + 2);
+    __m256i px3 = _mm256_loadu_si256(((__m256i*)src) + 3);
+
+    // Keep only the bits of bytes 0..2 selected by 0xF8 / 0xFC / 0xF8 (byte 3 is
+    // cleared), so the solid-colour test below ignores the low-order bits.
+    __m256i smask = _mm256_set1_epi32( 0xF8FCF8 );
+    __m256i sd0 = _mm256_and_si256( px0, smask );
+    __m256i sd1 = _mm256_and_si256( px1, smask );
+    __m256i sd2 = _mm256_and_si256( px2, smask );
+    __m256i sd3 = _mm256_and_si256( px3, smask );
+
+    // Broadcast the first (masked) pixel of each 128-bit lane.
+    __m256i sc = _mm256_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));
+
+    __m256i sc0 = _mm256_cmpeq_epi8(sd0, sc);
+    __m256i sc1 = _mm256_cmpeq_epi8(sd1, sc);
+    __m256i sc2 = _mm256_cmpeq_epi8(sd2, sc);
+    __m256i sc3 = _mm256_cmpeq_epi8(sd3, sc);
+
+    // A lane of sm is all-ones only if every masked pixel of that block equals
+    // the block's first pixel.
+    __m256i sm0 = _mm256_and_si256(sc0, sc1);
+    __m256i sm1 = _mm256_and_si256(sc2, sc3);
+    __m256i sm = _mm256_and_si256(sm0, sm1);
+
+    // Note the polarity: solidN is 0 when block N is a single colour and 1
+    // otherwise (testc yields 1 for an all-ones lane).
+    const int64_t solid0 = 1 - _mm_testc_si128( _mm256_castsi256_si128( sm ), _mm_set1_epi32( -1 ) );
+    const int64_t solid1 = 1 - _mm_testc_si128( _mm256_extracti128_si256( sm, 1 ), _mm_set1_epi32( -1 ) );
+
+    // Both blocks are solid: write just the 565 value of each block's first
+    // pixel (src[16] is the first byte of the second block), all other bits zero.
+    if( solid0 + solid1 == 0 )
+    {
+        const auto c0 = uint64_t( to565( src[0], src[1], src[2] ) );
+        const auto c1 = uint64_t( to565( src[16], src[17], src[18] ) );
+        memcpy( dst, &c0, 8 );
+        memcpy( dst+8, &c1, 8 );
+        dst += 16;
+        return;
+    }
+
+    // Per-byte min/max over the four rows ...
+    __m256i min0 = _mm256_min_epu8( px0, px1 );
+    __m256i min1 = _mm256_min_epu8( px2, px3 );
+    __m256i min2 = _mm256_min_epu8( min0, min1 );
+
+    __m256i max0 = _mm256_max_epu8( px0, px1 );
+    __m256i max1 = _mm256_max_epu8( px2, px3 );
+    __m256i max2 = _mm256_max_epu8( max0, max1 );
+
+    // ... then across the four pixels of each lane. Afterwards every 32-bit
+    // element of a lane holds that block's per-byte minimum / maximum.
+    __m256i min3 = _mm256_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+    __m256i max3 = _mm256_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+    __m256i min4 = _mm256_min_epu8( min2, min3 );
+    __m256i max4 = _mm256_max_epu8( max2, max3 );
+
+    __m256i min5 = _mm256_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
+    __m256i max5 = _mm256_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
+    __m256i rmin = _mm256_min_epu8( min4, min5 );
+    __m256i rmax = _mm256_max_epu8( max4, max5 );
+
+    // range1: per-byte max-min. range2: per 64-bit half, the sum of the byte
+    // differences of two (identical) pixels -- hence the >> 1 below.
+    __m256i range1 = _mm256_subs_epu8( rmax, rmin );
+    __m256i range2 = _mm256_sad_epu8( rmax, rmin );
+
+    // Per-block 16-bit scale factor looked up from the summed range
+    // (16-bit element 8 is the first word of the high lane, i.e. block 1).
+    uint16_t vrange0 = DivTable[_mm256_cvtsi256_si32( range2 ) >> 1];
+    uint16_t vrange1 = DivTable[_mm256_extract_epi16( range2, 8 ) >> 1];
+    __m256i range00 = _mm256_set1_epi16( vrange0 );
+    __m256i range = _mm256_inserti128_si256( range00, _mm_set1_epi16( vrange1 ), 1 );
+
+    // inset = (max-min) >> 4 per byte; the AND removes the bits that the 16-bit
+    // shift carried in from the neighbouring byte.
+    __m256i inset1 = _mm256_srli_epi16( range1, 4 );
+    __m256i inset = _mm256_and_si256( inset1, _mm256_set1_epi8( 0xF ) );
+    __m256i min = _mm256_adds_epu8( rmin, inset );
+    __m256i max = _mm256_subs_epu8( rmax, inset );
+
+    // Per-byte offset of every pixel from the block minimum.
+    __m256i c0 = _mm256_subs_epu8( px0, rmin );
+    __m256i c1 = _mm256_subs_epu8( px1, rmin );
+    __m256i c2 = _mm256_subs_epu8( px2, rmin );
+    __m256i c3 = _mm256_subs_epu8( px3, rmin );
+
+    // Sum the four bytes of every pixel: maddubs adds byte pairs, hadd then adds
+    // the resulting word pairs.
+    __m256i is0 = _mm256_maddubs_epi16( c0, _mm256_set1_epi8( 1 ) );
+    __m256i is1 = _mm256_maddubs_epi16( c1, _mm256_set1_epi8( 1 ) );
+    __m256i is2 = _mm256_maddubs_epi16( c2, _mm256_set1_epi8( 1 ) );
+    __m256i is3 = _mm256_maddubs_epi16( c3, _mm256_set1_epi8( 1 ) );
+
+    __m256i s0 = _mm256_hadd_epi16( is0, is1 );
+    __m256i s1 = _mm256_hadd_epi16( is2, is3 );
+
+    // (sum * range) >> 16 is the per-pixel index.
+    __m256i m0 = _mm256_mulhi_epu16( s0, range );
+    __m256i m1 = _mm256_mulhi_epu16( s1, range );
+
+    __m256i p0 = _mm256_packus_epi16( m0, m1 );
+
+    // Fold the four index bytes of each row into one byte (2 bits per pixel),
+    // then gather the four row bytes of a block into one 32-bit element.
+    __m256i p1 = _mm256_or_si256( _mm256_srai_epi32( p0, 6 ), _mm256_srai_epi32( p0, 12 ) );
+    __m256i p2 = _mm256_or_si256( _mm256_srai_epi32( p0, 18 ), p0 );
+    __m256i p3 = _mm256_or_si256( p1, p2 );
+    __m256i p =_mm256_shuffle_epi8( p3, _mm256_set1_epi32( 0x0C080400 ) );
+
+    // Convert the inset endpoints to 5:6:5. After the final shuffle every 32-bit
+    // element of mm5 is (565 of max) | (565 of min) << 16.
+    __m256i mm0 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), min );
+    __m256i mm1 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), max );
+    __m256i mm2 = _mm256_unpacklo_epi64( mm1, mm0 );
+    __m256i mmr = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 11 ), 11 );
+    __m256i mmg = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 26 ), 5 );
+    __m256i mmb = _mm256_srli_epi64( _mm256_slli_epi64( mm2, 16 ), 59 );
+    __m256i mm3 = _mm256_or_si256( mmr, mmg );
+    __m256i mm4 = _mm256_or_si256( mm3, mmb );
+    __m256i mm5 = _mm256_shuffle_epi8( mm4, _mm256_set1_epi32( 0x09080100 ) );
+
+    // Interleave endpoints with indices, then bring block 0 (lane 0) and block 1
+    // (lane 1) together in the low 128 bits.
+    __m256i d0 = _mm256_unpacklo_epi32( mm5, p );
+    __m256i d1 = _mm256_permute4x64_epi64( d0, _MM_SHUFFLE( 3, 2, 2, 0 ) );
+    __m128i d2 = _mm256_castsi256_si128( d1 );
+
+    // For a solid block (solidN == 0) keep only bits 16..31 and zero the rest;
+    // for a non-solid block -solidN is all-ones and everything is kept.
+    __m128i mask = _mm_set_epi64x( 0xFFFF0000 | -solid1, 0xFFFF0000 | -solid0 );
+    __m128i d3 = _mm_and_si128( d2, mask );
+    _mm_storeu_si128( (__m128i*)dst, d3 );
+
+    // Remap the four index bytes of each block through DxtcIndexTable.
+    for( int j=4; j<8; j++ ) dst[j] = (char)DxtcIndexTable[(uint8_t)dst[j]];
+    for( int j=12; j<16; j++ ) dst[j] = (char)DxtcIndexTable[(uint8_t)dst[j]];
+
+    dst += 16;
+}
+#endif
+
+// Used by ProcessAlpha(): maps a pixel's linear position 0..7 within the block's
+// [min, max] alpha range to the 3-bit index stored in the output word.
+static const uint8_t AlphaIndexTable[8] = { 1, 7, 6, 5, 4, 3, 2, 0 };
+
+// Encodes 16 alpha bytes into one 64-bit word.
+//
+// src: 16 consecutive alpha values.
+// Returns: bits 0..7 = maximum, bits 8..15 = minimum, bits 16..63 = sixteen 3-bit
+//          indices (pixel k at bit 16 + 3*k). When all 16 values are equal the
+//          result is just that value (minimum and indices left at zero).
+static etcpak_force_inline uint64_t ProcessAlpha( const uint8_t* src )
+{
+    const uint8_t first = src[0];
+
+    // Uniform block: every value matches the first one.
+    bool uniform = true;
+    for( int k=1; k<16; k++ )
+    {
+        if( src[k] != first )
+        {
+            uniform = false;
+            break;
+        }
+    }
+    if( uniform ) return first;
+
+    // Value range of the block.
+    uint8_t lo = first;
+    uint8_t hi = first;
+    for( int k=1; k<16; k++ )
+    {
+        const uint8_t v = src[k];
+        if( v < lo ) lo = v;
+        if( v > hi ) hi = v;
+    }
+
+    // Fixed-point (13 fractional bits) factor mapping [0, hi-lo] onto 0..7.
+    const uint32_t scale = ( 8 << 13 ) / ( 1 + hi - lo );
+
+    uint64_t bits = 0;
+    for( int k=0; k<16; k++ )
+    {
+        const uint8_t delta = src[k] - lo;
+        const uint64_t code = AlphaIndexTable[( delta * scale ) >> 13];
+        bits |= code << ( k*3 );
+    }
+
+    return uint64_t( hi ) | ( uint64_t( lo ) << 8 ) | ( bits << 16 );
+}
+
+#ifdef __SSE4_1__
+// Encodes the colour part of one 4x4 block from four rows of four 4-byte pixels.
+//
+// px0..px3: the four rows of the block.
+// Returns: bits 0..15 = 565 of the (inset) maximum, bits 16..31 = 565 of the
+//          (inset) minimum, bits 32..63 = sixteen 2-bit indices. For a
+//          single-colour block only bits 16..31 are set (565 of the first pixel).
+//          The index bytes are not passed through DxtcIndexTable here;
+//          CompressDxt5() does that on the returned value.
+static etcpak_force_inline uint64_t ProcessRGB_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
+{
+    // Keep only the bits of bytes 0..2 selected by 0xF8 / 0xFC / 0xF8 (byte 3 is
+    // cleared), so the solid-colour test ignores the low-order bits.
+    __m128i smask = _mm_set1_epi32( 0xF8FCF8 );
+    __m128i sd0 = _mm_and_si128( px0, smask );
+    __m128i sd1 = _mm_and_si128( px1, smask );
+    __m128i sd2 = _mm_and_si128( px2, smask );
+    __m128i sd3 = _mm_and_si128( px3, smask );
+
+    // Broadcast the first masked pixel and compare every pixel against it.
+    __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));
+
+    __m128i sc0 = _mm_cmpeq_epi8(sd0, sc);
+    __m128i sc1 = _mm_cmpeq_epi8(sd1, sc);
+    __m128i sc2 = _mm_cmpeq_epi8(sd2, sc);
+    __m128i sc3 = _mm_cmpeq_epi8(sd3, sc);
+
+    __m128i sm0 = _mm_and_si128(sc0, sc1);
+    __m128i sm1 = _mm_and_si128(sc2, sc3);
+    __m128i sm = _mm_and_si128(sm0, sm1);
+
+    // All comparisons true: single-colour block.
+    if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) )
+    {
+        return uint64_t( to565( _mm_cvtsi128_si32( px0 ) ) ) << 16;
+    }
+
+    // Clear byte 3 of every pixel so it cannot influence the range or the sums.
+    px0 = _mm_and_si128( px0, _mm_set1_epi32( 0xFFFFFF ) );
+    px1 = _mm_and_si128( px1, _mm_set1_epi32( 0xFFFFFF ) );
+    px2 = _mm_and_si128( px2, _mm_set1_epi32( 0xFFFFFF ) );
+    px3 = _mm_and_si128( px3, _mm_set1_epi32( 0xFFFFFF ) );
+
+    // Per-byte min/max over the four rows ...
+    __m128i min0 = _mm_min_epu8( px0, px1 );
+    __m128i min1 = _mm_min_epu8( px2, px3 );
+    __m128i min2 = _mm_min_epu8( min0, min1 );
+
+    __m128i max0 = _mm_max_epu8( px0, px1 );
+    __m128i max1 = _mm_max_epu8( px2, px3 );
+    __m128i max2 = _mm_max_epu8( max0, max1 );
+
+    // ... then across the four pixels, leaving the block-wide per-byte
+    // minimum / maximum in every 32-bit element.
+    __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+    __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+    __m128i min4 = _mm_min_epu8( min2, min3 );
+    __m128i max4 = _mm_max_epu8( max2, max3 );
+
+    __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
+    __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
+    __m128i rmin = _mm_min_epu8( min4, min5 );
+    __m128i rmax = _mm_max_epu8( max4, max5 );
+
+    // range1: per-byte max-min. range2: sum of the byte differences of two
+    // (identical) pixels per 64-bit half -- hence the >> 1 below.
+    __m128i range1 = _mm_subs_epu8( rmax, rmin );
+    __m128i range2 = _mm_sad_epu8( rmax, rmin );
+
+    // 16-bit scale factor looked up from the summed range.
+    uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1;
+    __m128i range = _mm_set1_epi16( DivTable[vrange] );
+
+    // inset = (max-min) >> 4 per byte; the AND removes the bits that the 16-bit
+    // shift carried in from the neighbouring byte.
+    __m128i inset1 = _mm_srli_epi16( range1, 4 );
+    __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) );
+    __m128i min = _mm_adds_epu8( rmin, inset );
+    __m128i max = _mm_subs_epu8( rmax, inset );
+
+    // Per-byte offset of every pixel from the block minimum.
+    __m128i c0 = _mm_subs_epu8( px0, rmin );
+    __m128i c1 = _mm_subs_epu8( px1, rmin );
+    __m128i c2 = _mm_subs_epu8( px2, rmin );
+    __m128i c3 = _mm_subs_epu8( px3, rmin );
+
+    // Sum the bytes of every pixel: maddubs adds byte pairs, hadd then adds the
+    // resulting word pairs.
+    __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) );
+    __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) );
+    __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) );
+    __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) );
+
+    __m128i s0 = _mm_hadd_epi16( is0, is1 );
+    __m128i s1 = _mm_hadd_epi16( is2, is3 );
+
+    // (sum * range) >> 16 is the per-pixel index.
+    __m128i m0 = _mm_mulhi_epu16( s0, range );
+    __m128i m1 = _mm_mulhi_epu16( s1, range );
+
+    __m128i p0 = _mm_packus_epi16( m0, m1 );
+
+    // Fold the four index bytes of each row into one byte (2 bits per pixel),
+    // then gather the four row bytes into the low 32 bits.
+    __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) );
+    __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 );
+    __m128i p3 = _mm_or_si128( p1, p2 );
+    __m128i p =_mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) );
+
+    uint32_t vmin = _mm_cvtsi128_si32( min );
+    uint32_t vmax = _mm_cvtsi128_si32( max );
+    uint32_t vp = _mm_cvtsi128_si32( p );
+
+    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
+}
+
+// Encodes byte 3 (alpha) of the 16 pixels of one 4x4 block into one 64-bit word.
+//
+// px0..px3: the four rows of the block, four 4-byte pixels each.
+// Returns: bits 0..7 = maximum alpha, bits 8..15 = minimum alpha, bits 16..63 =
+//          index data built from AlphaIndexTable_SSE. When all 16 alpha values
+//          are equal, only bits 0..7 are set (that alpha value).
+static etcpak_force_inline uint64_t ProcessAlpha_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
+{
+    // Shuffle control selecting bytes 3, 7, 11, 15 into the first 32-bit element;
+    // the -1 elements have the high bit set and therefore produce zero bytes.
+    __m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 );
+
+    // Rotate the control so that row k's alpha bytes land in 32-bit element k,
+    // then OR the rows together: `a` holds all 16 alpha bytes in pixel order.
+    __m128i m0 = _mm_shuffle_epi8( px0, mask );
+    __m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
+    __m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
+    __m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
+    __m128i m4 = _mm_or_si128( m0, m1 );
+    __m128i m5 = _mm_or_si128( m2, m3 );
+    __m128i a = _mm_or_si128( m4, m5 );
+
+    // Broadcast byte 0 and compare: if every byte matches, the block is uniform.
+    __m128i solidCmp = _mm_shuffle_epi8( a, _mm_setzero_si128() );
+    __m128i cmpRes = _mm_cmpeq_epi8( a, solidCmp );
+    if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
+    {
+        return _mm_cvtsi128_si32( a ) & 0xFF;
+    }
+
+    // Horizontal min/max reduction over the 16 bytes: first across 32-bit
+    // elements, then by rotating 2 bytes and 1 byte.
+    __m128i a1 = _mm_shuffle_epi32( a, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+    __m128i max1 = _mm_max_epu8( a, a1 );
+    __m128i min1 = _mm_min_epu8( a, a1 );
+    __m128i amax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
+    __m128i amin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
+    __m128i max2 = _mm_max_epu8( max1, amax2 );
+    __m128i min2 = _mm_min_epu8( min1, amin2 );
+    __m128i amax3 = _mm_alignr_epi8( max2, max2, 2 );
+    __m128i amin3 = _mm_alignr_epi8( min2, min2, 2 );
+    __m128i max3 = _mm_max_epu8( max2, amax3 );
+    __m128i min3 = _mm_min_epu8( min2, amin3 );
+    __m128i amax4 = _mm_alignr_epi8( max3, max3, 1 );
+    __m128i amin4 = _mm_alignr_epi8( min3, min3, 1 );
+    __m128i max = _mm_max_epu8( max3, amax4 );
+    __m128i min = _mm_min_epu8( min3, amin4 );
+    // Byte 0 = max, byte 1 = min: the low 16 bits of the result word.
+    __m128i minmax = _mm_unpacklo_epi8( max, min );
+
+    // Scale factor for this block's alpha range, taken from DivTableAlpha.
+    __m128i r = _mm_sub_epi8( max, min );
+    int range = _mm_cvtsi128_si32( r ) & 0xFF;
+    __m128i rv = _mm_set1_epi16( DivTableAlpha[range] );
+
+    // Offset of every alpha value from the minimum, widened to 16 bits so it can
+    // be multiplied by the scale factor.
+    __m128i v = _mm_sub_epi8( a, min );
+
+    __m128i lo16 = _mm_unpacklo_epi8( v, _mm_setzero_si128() );
+    __m128i hi16 = _mm_unpackhi_epi8( v, _mm_setzero_si128() );
+
+    // (offset * scale) >> 16 per pixel.
+    __m128i lomul = _mm_mulhi_epu16( lo16, rv );
+    __m128i himul = _mm_mulhi_epu16( hi16, rv );
+
+    // Pack to one byte per pixel, then merge each pair of neighbouring pixels
+    // into a single 6-bit value (even pixel in bits 0..2, odd pixel in bits 3..5).
+    __m128i p0 = _mm_packus_epi16( lomul, himul );
+    __m128i p1 = _mm_or_si128( _mm_and_si128( p0, _mm_set1_epi16( 0x3F ) ), _mm_srai_epi16( _mm_and_si128( p0, _mm_set1_epi16( 0x3F00 ) ), 5 ) );
+    __m128i p2 = _mm_packus_epi16( p1, p1 );
+
+    // Translate each 6-bit pair through AlphaIndexTable_SSE and append 6 bits per
+    // pair (8 pairs, 48 bits in total).
+    uint64_t pi = _mm_cvtsi128_si64( p2 );
+    uint64_t data = 0;
+    for( int i=0; i<8; i++ )
+    {
+        uint64_t idx = AlphaIndexTable_SSE[(pi>>(i*8)) & 0x3F];
+        data |= idx << (i*6);
+    }
+    return (uint64_t)(uint16_t)_mm_cvtsi128_si32( minmax ) | ( data << 16 );
+}
+#endif
+
+// Compresses `blocks` 4x4 pixel blocks into 8-byte blocks written consecutively
+// to dst.
+//
+// src:    32-bit pixels, `width` pixels per row. Blocks are taken left to right;
+//         after the last block of a row of blocks the cursor skips three more
+//         pixel rows to reach the next row of blocks.
+// dst:    receives one uint64_t per block.
+// blocks: number of blocks to produce. Zero is a no-op.
+// width:  image width in pixels.
+//
+// With AVX2, blocks are processed in pairs when `width` is a multiple of 8 and
+// `blocks` is even; every other combination uses the one-block-at-a-time path,
+// so no block is dropped and the loop counter cannot wrap.
+void CompressDxt1( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+{
+    // The do/while loops below decrement before testing, so a zero count would
+    // wrap around and run ~2^32 iterations over out-of-bounds memory.
+    if( blocks == 0 ) return;
+
+#ifdef __AVX2__
+    // The paired path halves the count: an odd count would lose its last block,
+    // and a count of 1 would become 0 and wrap. Only take it for even counts.
+    if( width%8 == 0 && blocks%2 == 0 )
+    {
+        blocks /= 2;
+        uint32_t buf[8*4];
+        int i = 0;
+        char* dst8 = (char*)dst;
+
+        do
+        {
+            // Gather an 8x4 pixel tile (two adjacent blocks) into buf, row by row.
+            auto tmp = (char*)buf;
+            memcpy( tmp,        src + width * 0, 8*4 );
+            memcpy( tmp + 8*4,  src + width * 1, 8*4 );
+            memcpy( tmp + 16*4, src + width * 2, 8*4 );
+            memcpy( tmp + 24*4, src + width * 3, 8*4 );
+            src += 8;
+            if( ++i == width/8 )
+            {
+                // End of this row of blocks: skip the three pixel rows already consumed.
+                src += width * 3;
+                i = 0;
+            }
+
+            // Writes 16 bytes and advances dst8 itself.
+            ProcessRGB_AVX( (uint8_t*)buf, dst8 );
+        }
+        while( --blocks );
+    }
+    else
+#endif
+    {
+        uint32_t buf[4*4];
+        int i = 0;
+
+        auto ptr = dst;
+        do
+        {
+            // Gather one 4x4 block into buf, row by row.
+            auto tmp = (char*)buf;
+            memcpy( tmp,        src + width * 0, 4*4 );
+            memcpy( tmp + 4*4,  src + width * 1, 4*4 );
+            memcpy( tmp + 8*4,  src + width * 2, 4*4 );
+            memcpy( tmp + 12*4, src + width * 3, 4*4 );
+            src += 4;
+            if( ++i == width/4 )
+            {
+                // End of this row of blocks: skip the three pixel rows already consumed.
+                src += width * 3;
+                i = 0;
+            }
+
+            // Encode, then remap the four index bytes (bytes 4..7) through DxtcIndexTable.
+            const auto c = ProcessRGB( (uint8_t*)buf );
+            uint8_t fix[8];
+            memcpy( fix, &c, 8 );
+            for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]];
+            memcpy( ptr, fix, sizeof( uint64_t ) );
+            ptr++;
+        }
+        while( --blocks );
+    }
+}
+
+// Same as CompressDxt1() (scalar path), except that Dither() is applied to each
+// gathered 4x4 block before it is encoded.
+//
+// src:    32-bit pixels, `width` pixels per row.
+// dst:    receives one uint64_t per block.
+// blocks: number of 4x4 blocks to produce. Zero is a no-op.
+// width:  image width in pixels.
+void CompressDxt1Dither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+{
+    // The do/while loop below decrements before testing, so a zero count would
+    // wrap around and run ~2^32 iterations over out-of-bounds memory.
+    if( blocks == 0 ) return;
+
+    uint32_t buf[4*4];
+    int i = 0;
+
+    auto ptr = dst;
+    do
+    {
+        // Gather one 4x4 block into buf, row by row.
+        auto tmp = (char*)buf;
+        memcpy( tmp,        src + width * 0, 4*4 );
+        memcpy( tmp + 4*4,  src + width * 1, 4*4 );
+        memcpy( tmp + 8*4,  src + width * 2, 4*4 );
+        memcpy( tmp + 12*4, src + width * 3, 4*4 );
+        src += 4;
+        if( ++i == width/4 )
+        {
+            // End of this row of blocks: skip the three pixel rows already consumed.
+            src += width * 3;
+            i = 0;
+        }
+
+        // Dither the private copy in place; the source image is left untouched.
+        Dither( (uint8_t*)buf );
+
+        // Encode, then remap the four index bytes (bytes 4..7) through DxtcIndexTable.
+        const auto c = ProcessRGB( (uint8_t*)buf );
+        uint8_t fix[8];
+        memcpy( fix, &c, 8 );
+        for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]];
+        memcpy( ptr, fix, sizeof( uint64_t ) );
+        ptr++;
+    }
+    while( --blocks );
+}
+
+// Compresses `blocks` 4x4 pixel blocks into 16-byte blocks: for every block one
+// uint64_t of alpha data is written, followed by one uint64_t of colour data.
+//
+// src:    32-bit pixels, `width` pixels per row; the top byte of each pixel
+//         (bits 24..31) is the alpha value.
+// dst:    receives two uint64_t per block.
+// blocks: number of 4x4 blocks to produce. Zero is a no-op.
+// width:  image width in pixels.
+void CompressDxt5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+{
+    // The do/while loop below decrements before testing, so a zero count would
+    // wrap around and run ~2^32 iterations over out-of-bounds memory.
+    if( blocks == 0 ) return;
+
+    int i = 0;
+    auto ptr = dst;
+    do
+    {
+#ifdef __SSE4_1__
+        // Load the four rows of the block straight from the image.
+        __m128i px0 = _mm_loadu_si128( (__m128i*)( src + width * 0 ) );
+        __m128i px1 = _mm_loadu_si128( (__m128i*)( src + width * 1 ) );
+        __m128i px2 = _mm_loadu_si128( (__m128i*)( src + width * 2 ) );
+        __m128i px3 = _mm_loadu_si128( (__m128i*)( src + width * 3 ) );
+
+        src += 4;
+        if( ++i == width/4 )
+        {
+            // End of this row of blocks: skip the three pixel rows already consumed.
+            src += width * 3;
+            i = 0;
+        }
+
+        *ptr++ = ProcessAlpha_SSE( px0, px1, px2, px3 );
+
+        // Encode colour, then remap the four index bytes (bytes 4..7) through DxtcIndexTable.
+        const auto c = ProcessRGB_SSE( px0, px1, px2, px3 );
+        uint8_t fix[8];
+        memcpy( fix, &c, 8 );
+        for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]];
+        memcpy( ptr, fix, sizeof( uint64_t ) );
+        ptr++;
+#else
+        uint32_t rgba[4*4];
+        uint8_t alpha[4*4];
+
+        // Gather one 4x4 block into rgba, row by row.
+        auto tmp = (char*)rgba;
+        memcpy( tmp,        src + width * 0, 4*4 );
+        memcpy( tmp + 4*4,  src + width * 1, 4*4 );
+        memcpy( tmp + 8*4,  src + width * 2, 4*4 );
+        memcpy( tmp + 12*4, src + width * 3, 4*4 );
+        src += 4;
+        if( ++i == width/4 )
+        {
+            // End of this row of blocks: skip the three pixel rows already consumed.
+            src += width * 3;
+            i = 0;
+        }
+
+        // Split alpha from colour; a distinct index name avoids shadowing the
+        // block-column counter `i` above.
+        for( int k=0; k<16; k++ )
+        {
+            alpha[k] = rgba[k] >> 24;
+            rgba[k] &= 0xFFFFFF;
+        }
+        *ptr++ = ProcessAlpha( alpha );
+
+        // Encode colour, then remap the four index bytes (bytes 4..7) through DxtcIndexTable.
+        const auto c = ProcessRGB( (uint8_t*)rgba );
+        uint8_t fix[8];
+        memcpy( fix, &c, 8 );
+        for( int j=4; j<8; j++ ) fix[j] = DxtcIndexTable[fix[j]];
+        memcpy( ptr, fix, sizeof( uint64_t ) );
+        ptr++;
+#endif
+    }
+    while( --blocks );
+}
diff --git a/thirdparty/etcpak/ProcessDxtc.hpp b/thirdparty/etcpak/ProcessDxtc.hpp
new file mode 100644
index 0000000000..8e0b12e4bd
--- /dev/null
+++ b/thirdparty/etcpak/ProcessDxtc.hpp
@@ -0,0 +1,11 @@
+#ifndef __PROCESSDXT1_HPP__
+#define __PROCESSDXT1_HPP__
+
+#include <stddef.h>
+#include <stdint.h>
+
+// All three functions read 32-bit pixels from src (`width` = pixels per row) and
+// write `blocks` compressed 4x4 blocks to dst.
+
+// DXT1: one 64-bit word per block.
+void CompressDxt1( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+// As CompressDxt1, but each block is dithered before it is encoded.
+void CompressDxt1Dither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+// DXT5: two 64-bit words per block (alpha word first, then the colour word).
+void CompressDxt5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );
+
+#endif
diff --git a/thirdparty/etcpak/ProcessRGB.cpp b/thirdparty/etcpak/ProcessRGB.cpp
new file mode 100644
index 0000000000..7f4524d105
--- /dev/null
+++ b/thirdparty/etcpak/ProcessRGB.cpp
@@ -0,0 +1,3100 @@
+#include <array>
+#include <string.h>
+#include <limits>
+
+#ifdef __ARM_NEON
+#  include <arm_neon.h>
+#endif
+
+#include "Dither.hpp"
+#include "ForceInline.hpp"
+#include "Math.hpp"
+#include "ProcessCommon.hpp"
+#include "ProcessRGB.hpp"
+#include "Tables.hpp"
+#include "Vector.hpp"
+#if defined __SSE4_1__ || defined __AVX2__ || defined _MSC_VER
+#  ifdef _MSC_VER
+#    include <intrin.h>
+#    include <Windows.h>
+#    define _bswap(x) _byteswap_ulong(x)
+#    define _bswap64(x) _byteswap_uint64(x)
+#  else
+#    include <x86intrin.h>
+#  endif
+#endif
+
+#ifndef _bswap
+#  define _bswap(x) __builtin_bswap32(x)
+#  define _bswap64(x) __builtin_bswap64(x)
+#endif
+
+namespace
+{
+
+#if defined _MSC_VER && !defined __clang__
+// Shim that exposes the _BitScanForward intrinsic under the name
+// _bit_scan_forward: returns the index the intrinsic reports for `mask`.
+// The intrinsic's own return value is ignored.
+static etcpak_force_inline unsigned long _bit_scan_forward( unsigned long mask )
+{
+    unsigned long index;
+    _BitScanForward( &index, mask );
+    return index;
+}
+#endif
+
+typedef std::array<uint16_t, 4> v4i;
+
+#ifdef __AVX2__
+// Sums the first three bytes of each pixel over four different halves of a
+// 16-pixel (64-byte) block.
+//
+// data: 64 bytes, read as four 16-byte groups of four 4-byte pixels.
+// Returns: four 64-bit lanes, each holding four 16-bit sums (one per pixel byte;
+//          byte 3 is masked out and sums to zero), each taken over 8 pixels:
+//            lane 0: groups 2 and 3 (bytes 32..63)
+//            lane 1: groups 0 and 1 (bytes 0..31)
+//            lane 2: pixels 2 and 3 of every group
+//            lane 3: pixels 0 and 1 of every group
+// In the comments below 0..3 name the partial sums: 0 = groups 0+1, pixels 0,1;
+// 1 = groups 0+1, pixels 2,3; 2 = groups 2+3, pixels 0,1; 3 = groups 2+3,
+// pixels 2,3. Lanes are listed from lowest to highest.
+static etcpak_force_inline __m256i Sum4_AVX2( const uint8_t* data) noexcept
+{
+    __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
+    __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
+    __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
+    __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
+
+    // Clear byte 3 of every pixel so it does not contribute to the sums.
+    __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF));
+    __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF));
+    __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF));
+    __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF));
+
+    // Widen every byte to 16 bits: one pixel per 64-bit lane.
+    __m256i t0 = _mm256_cvtepu8_epi16(dm0);
+    __m256i t1 = _mm256_cvtepu8_epi16(dm1);
+    __m256i t2 = _mm256_cvtepu8_epi16(dm2);
+    __m256i t3 = _mm256_cvtepu8_epi16(dm3);
+
+    // Add the groups pairwise, pixel position by pixel position.
+    __m256i sum0 = _mm256_add_epi16(t0, t1);
+    __m256i sum1 = _mm256_add_epi16(t2, t3);
+
+    __m256i s0 = _mm256_permute2x128_si256(sum0, sum1, (0) | (3 << 4)); // 0, 0, 3, 3
+    __m256i s1 = _mm256_permute2x128_si256(sum0, sum1, (1) | (2 << 4)); // 1, 1, 2, 2
+
+    // Line up the two pixels of each partial sum in the same lane so one add
+    // completes it.
+    __m256i s2 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(1, 3, 0, 2));
+    __m256i s3 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(0, 2, 1, 3));
+    __m256i s4 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(3, 1, 0, 2));
+    __m256i s5 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(2, 0, 1, 3));
+
+    __m256i sum5 = _mm256_add_epi16(s2, s3); //   3,   0,   3,   0
+    __m256i sum6 = _mm256_add_epi16(s4, s5); //   2,   1,   1,   2
+    return _mm256_add_epi16(sum5, sum6);     // 3+2, 0+1, 3+1, 0+2
+}
+
+// Divides every 16-bit element by 8 with rounding: (x + 4) >> 3.
+static etcpak_force_inline __m256i Average_AVX2( const __m256i data) noexcept
+{
+    const __m256i rounding = _mm256_set1_epi16(4);
+    return _mm256_srli_epi16(_mm256_add_epi16(data, rounding), 3);
+}
+
+// Computes four 32-bit error terms from the block sums in `data` and the eight
+// candidate averages in `a`, following the formula quoted in the comments below.
+//
+// data: per-half sums as produced by Sum4_AVX2().
+// a:    eight v4i entries; read with aligned 256-bit loads, so a[0] must be
+//       32-byte aligned.
+// Returns: four 32-bit values in a 128-bit register (elements 0, 4, 1, 5 of the
+//          intermediate b4, in that order). Each carries a constant positive
+//          bias (see a8), so only relative comparisons are meaningful.
+static etcpak_force_inline __m128i CalcErrorBlock_AVX2( const __m256i data, const v4i a[8]) noexcept
+{
+    // a0 = a[0..3], a1 = a[4..7].
+    __m256i a0 = _mm256_load_si256((__m256i*)a[0].data());
+    __m256i a1 = _mm256_load_si256((__m256i*)a[4].data());
+
+    // err = 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) );
+    __m256i a4 = _mm256_madd_epi16(a0, a0);
+    __m256i a5 = _mm256_madd_epi16(a1, a1);
+
+    __m256i a6 = _mm256_hadd_epi32(a4, a5);
+    __m256i a7 = _mm256_slli_epi32(a6, 3);
+
+    __m256i a8 = _mm256_add_epi32(a7, _mm256_set1_epi32(0x3FFFFFFF)); // Big value to prevent negative values, but small enough to prevent overflow
+
+    // average is not swapped
+    // err -= block[0] * 2 * average[0];
+    // err -= block[1] * 2 * average[1];
+    // err -= block[2] * 2 * average[2];
+    __m256i a2 = _mm256_slli_epi16(a0, 1);
+    __m256i a3 = _mm256_slli_epi16(a1, 1);
+    __m256i b0 = _mm256_madd_epi16(a2, data);
+    __m256i b1 = _mm256_madd_epi16(a3, data);
+
+    __m256i b2 = _mm256_hadd_epi32(b0, b1);
+    __m256i b3 = _mm256_sub_epi32(a8, b2);
+    // Add neighbouring pairs of 32-bit terms.
+    __m256i b4 = _mm256_hadd_epi32(b3, b3);
+
+    // Collect elements 0, 4, 1, 5 of b4 into the low 128 bits.
+    __m256i b5 = _mm256_permutevar8x32_epi32(b4, _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0));
+
+    return _mm256_castsi256_si128(b5);
+}
+
+// Quantizes the averages in `d` in two ways and stores both, expanded back to
+// 8-bit range, into `a`:
+//   a[4..7]: 5-bit quantization, where the low half of each 128-bit lane is
+//            additionally limited to within [-4, 3] of the high half;
+//   a[0..3]: 4-bit quantization.
+//
+// d: sixteen 16-bit averages (presumably 0..255 each, as produced by
+//    Average_AVX2() -- confirm).
+// a: output; written with aligned 256-bit stores, so a[0] must be 32-byte aligned.
+static etcpak_force_inline void ProcessAverages_AVX2(const __m256i d, v4i a[8] ) noexcept
+{
+    // Scale to 5 bits: c = (t + (t >> 8)) >> 8 with t = d*31 + 128.
+    __m256i t = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(31)), _mm256_set1_epi16(128));
+
+    __m256i c = _mm256_srli_epi16(_mm256_add_epi16(t, _mm256_srli_epi16(t, 8)), 8);
+
+    // c1: the high 64 bits of each 128-bit lane copied to both halves.
+    __m256i c1 = _mm256_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2));
+    // Clamp the difference to the range -4..3 and rebuild the value from it.
+    __m256i diff = _mm256_sub_epi16(c, c1);
+    diff = _mm256_max_epi16(diff, _mm256_set1_epi16(-4));
+    diff = _mm256_min_epi16(diff, _mm256_set1_epi16(3));
+
+    __m256i co = _mm256_add_epi16(c1, diff);
+
+    // Low four words of each lane from the clamped values, high four unchanged.
+    c = _mm256_blend_epi16(co, c, 0xF0);
+
+    // Expand 5 bits to 8 by replicating the top bits into the low bits.
+    __m256i a0 = _mm256_or_si256(_mm256_slli_epi16(c, 3), _mm256_srli_epi16(c, 2));
+
+    _mm256_store_si256((__m256i*)a[4].data(), a0);
+
+    // Scale to 4 bits the same way (factor 15) ...
+    __m256i t0 = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(15)), _mm256_set1_epi16(128));
+    __m256i t1 = _mm256_srli_epi16(_mm256_add_epi16(t0, _mm256_srli_epi16(t0, 8)), 8);
+
+    // ... and expand to 8 bits by duplicating the nibble.
+    __m256i t2 = _mm256_or_si256(t1, _mm256_slli_epi16(t1, 4));
+
+    _mm256_store_si256((__m256i*)a[0].data(), t2);
+}
+
+// Packs the pair of averages selected by `idx` into the low 32 bits of a word.
+//
+// a:   averages as stored by ProcessAverages_AVX2(); a[2*idx] and a[2*idx + 1]
+//      are read with one aligned 128-bit load.
+// idx: selects the pair and is itself stored at bits 24 and up. When bit 1 of
+//      idx is clear, each byte holds two 4-bit values; when it is set, each byte
+//      holds a 5-bit value plus a 3-bit difference.
+// Returns: (idx << 24) | three packed bytes (bits 0..23).
+static etcpak_force_inline uint64_t EncodeAverages_AVX2( const v4i a[8], size_t idx ) noexcept
+{
+    uint64_t d = ( idx << 24 );
+    size_t base = idx << 1;
+
+    // Low 64 bits: a[base]; high 64 bits: a[base + 1].
+    __m128i a0 = _mm_load_si128((const __m128i*)a[base].data());
+
+    __m128i r0, r1;
+
+    if( ( idx & 0x2 ) == 0 )
+    {
+        // Two independent 4-bit values per channel: a[base] in the low nibble,
+        // a[base + 1] in the high nibble.
+        r0 = _mm_srli_epi16(a0, 4);
+
+        __m128i a1 = _mm_unpackhi_epi64(r0, r0);
+        r1 = _mm_slli_epi16(a1, 4);
+    }
+    else
+    {
+        // Drop the low three bits of each value (-8 == 0xFFF8).
+        __m128i a1 = _mm_and_si128(a0, _mm_set1_epi16(-8));
+
+        // r0: a[base + 1] (top five bits); r1: (a[base] - a[base + 1]) >> 3,
+        // kept as a 3-bit two's-complement value.
+        r0 = _mm_unpackhi_epi64(a1, a1);
+        __m128i a2 = _mm_sub_epi16(a1, r0);
+        __m128i a3 = _mm_srai_epi16(a2, 3);
+        r1 = _mm_and_si128(a3, _mm_set1_epi16(0x07));
+    }
+
+    __m128i r2 = _mm_or_si128(r0, r1);
+    // do missing swap for average values
+    // (reverses the order of the first three 16-bit channels)
+    __m128i r3 = _mm_shufflelo_epi16(r2, _MM_SHUFFLE(3, 0, 1, 2));
+    __m128i r4 = _mm_packus_epi16(r3, _mm_setzero_si128());
+    d |= _mm_cvtsi128_si32(r4);
+
+    return d;
+}
+
+// Tests whether all 16 pixels (64 bytes) at src are byte-for-byte identical to
+// the first pixel.
+//
+// Returns 0 if they are not. Otherwise returns 0x02000000 combined with
+// (src[0] & 0xF8) << 16, (src[1] & 0xF8) << 8 and (src[2] & 0xF8).
+static etcpak_force_inline uint64_t CheckSolid_AVX2( const uint8_t* src ) noexcept
+{
+    const __m256i firstHalf  = _mm256_loadu_si256(((__m256i*)src) + 0);
+    const __m256i secondHalf = _mm256_loadu_si256(((__m256i*)src) + 1);
+
+    // Replicate the first pixel into every 32-bit element.
+    const __m256i reference = _mm256_broadcastd_epi32(_mm256_castsi256_si128(firstHalf));
+
+    // All-ones only where both halves match the reference pixel.
+    const __m256i allEqual = _mm256_and_si256(
+        _mm256_cmpeq_epi8(firstHalf, reference),
+        _mm256_cmpeq_epi8(secondHalf, reference));
+
+    if (_mm256_testc_si256(allEqual, _mm256_set1_epi32(-1)))
+    {
+        return 0x02000000 |
+            ( (unsigned int)( src[0] & 0xF8 ) << 16 ) |
+            ( (unsigned int)( src[1] & 0xF8 ) << 8 ) |
+            ( (unsigned int)( src[2] & 0xF8 ) );
+    }
+
+    return 0;
+}
+
+// Computes the block sums for the pixels at src, fills `a` with the quantized
+// averages derived from them, and returns the error terms for those averages.
+static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const uint8_t* src) noexcept
+{
+    const __m256i blockSums = Sum4_AVX2( src );
+    const __m256i averages = Average_AVX2( blockSums );
+
+    ProcessAverages_AVX2( averages, a );
+
+    return CalcErrorBlock_AVX2( blockSums, a );
+}
+
+// Overload taking block sums that were already computed: fills `a` with the
+// quantized averages derived from sum4 and returns the error terms for them.
+static etcpak_force_inline __m128i PrepareAverages_AVX2( v4i a[8], const __m256i sum4) noexcept
+{
+    const __m256i averages = Average_AVX2( sum4 );
+
+    ProcessAverages_AVX2( averages, a );
+
+    return CalcErrorBlock_AVX2( sum4, a );
+}
+
+// For both halves of a block, measures how well each selector table fits the
+// pixels around the half's base colour and records the per-pixel selector bits.
+//
+// terr:   output; terr[1 - j] receives eight 32-bit squared-error sums for pass j.
+//         Written with aligned 256-bit stores (each row must be 32-byte aligned).
+// tsel:   output; eight 32-bit words of interleaved selector bits. Written with
+//         an aligned 256-bit store.
+// a:      base colours; a[offset + 1] is used for the first 32 bytes of `data`,
+//         a[offset] for the following 32 bytes.
+// offset: index of the first of the two base colours in `a`.
+// data:   64 bytes of pixels (4 bytes per pixel), consumed in two passes of
+//         8 pixels each.
+static etcpak_force_inline void FindBestFit_4x2_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept
+{
+    __m256i sel0 = _mm256_setzero_si256();
+    __m256i sel1 = _mm256_setzero_si256();
+
+    for (unsigned int j = 0; j < 2; ++j)
+    {
+        unsigned int bid = offset + 1 - j;
+
+        __m256i squareErrorSum = _mm256_setzero_si256();
+
+        // Base colour of this half, replicated into all four 64-bit lanes.
+        __m128i a0 = _mm_loadl_epi64((const __m128i*)a[bid].data());
+        __m256i a1 = _mm256_broadcastq_epi64(a0);
+
+        // Processing one full row each iteration
+        for (size_t i = 0; i < 8; i += 4)
+        {
+            __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4));
+
+            __m256i rgb16 = _mm256_cvtepu8_epi16(rgb);
+            __m256i d = _mm256_sub_epi16(a1, rgb16);
+
+            // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
+            // This produces slightly different results, but is significantly faster
+            __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14));
+            __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0);
+            __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1);
+            __m128i pixel3 = _mm256_castsi256_si128(pixel2);
+
+            // Replicate the weighted difference of pixel 0 across the low
+            // 128-bit lane and that of pixel 1 across the high lane.
+            __m128i pix0 = _mm_broadcastw_epi16(pixel3);
+            __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
+            __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
+
+            // Processing first two pixels of the row
+            {
+                __m256i pix = _mm256_abs_epi16(pixel);
+
+                // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
+                // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
+                __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
+                __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
+
+                __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
+                __m256i minError = _mm256_min_epi16(error0, error1);
+
+                // Exploiting symmetry of the selector table and use the sign bit
+                // This produces slightly different results, but is significantly faster
+                __m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
+
+                // Interleaving values so madd instruction can be used
+                __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
+                __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
+
+                __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
+                // Squaring the minimum error to produce correct values when adding
+                __m256i squareError = _mm256_madd_epi16(minError2, minError2);
+
+                squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError);
+
+                // Packing selector bits
+                __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8));
+                __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8));
+
+                sel0 = _mm256_or_si256(sel0, minIndexLo2);
+                sel1 = _mm256_or_si256(sel1, minIndexHi2);
+            }
+
+            // Same again for pixels 2 and 3, taken from the high 128-bit lane.
+            pixel3 = _mm256_extracti128_si256(pixel2, 1);
+            pix0 = _mm_broadcastw_epi16(pixel3);
+            pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
+            pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
+
+            // Processing second two pixels of the row
+            {
+                __m256i pix = _mm256_abs_epi16(pixel);
+
+                // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
+                // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
+                __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
+                __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
+
+                __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
+                __m256i minError = _mm256_min_epi16(error0, error1);
+
+                // Exploiting symmetry of the selector table and use the sign bit
+                __m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
+
+                // Interleaving values so madd instruction can be used
+                __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
+                __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
+
+                __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
+                // Squaring the minimum error to produce correct values when adding
+                __m256i squareError = _mm256_madd_epi16(minError2, minError2);
+
+                squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError);
+
+                // Packing selector bits
+                // (two positions further along than for the first two pixels)
+                __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8));
+                __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8));
+                __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2);
+                __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2);
+
+                sel0 = _mm256_or_si256(sel0, minIndexLo3);
+                sel1 = _mm256_or_si256(sel1, minIndexHi3);
+            }
+        }
+
+        // Advance to the 8 pixels belonging to the other half.
+        data += 8 * 4;
+
+        _mm256_store_si256((__m256i*)terr[1 - j], squareErrorSum);
+    }
+
+    // Interleave selector bits
+    __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1);
+    __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1);
+
+    __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4));
+    __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4));
+
+    __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1);
+
+    __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2);
+
+    _mm256_store_si256((__m256i*)tsel, sel);
+}
+
+static etcpak_force_inline void FindBestFit_2x4_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept
+{
+    __m256i sel0 = _mm256_setzero_si256();
+    __m256i sel1 = _mm256_setzero_si256();
+
+    __m256i squareErrorSum0 = _mm256_setzero_si256();
+    __m256i squareErrorSum1 = _mm256_setzero_si256();
+
+    __m128i a0 = _mm_loadl_epi64((const __m128i*)a[offset + 1].data());
+    __m128i a1 = _mm_loadl_epi64((const __m128i*)a[offset + 0].data());
+
+    __m128i a2 = _mm_broadcastq_epi64(a0);
+    __m128i a3 = _mm_broadcastq_epi64(a1);
+    __m256i a4 = _mm256_insertf128_si256(_mm256_castsi128_si256(a2), a3, 1);
+
+    // Processing one full row each iteration
+    for (size_t i = 0; i < 16; i += 4)
+    {
+        __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4));
+
+        __m256i rgb16 = _mm256_cvtepu8_epi16(rgb);
+        __m256i d = _mm256_sub_epi16(a4, rgb16);
+
+        // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
+        // This produces slightly different results, but is significantly faster
+        __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14));
+        __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0);
+        __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1);
+        __m128i pixel3 = _mm256_castsi256_si128(pixel2);
+
+        __m128i pix0 = _mm_broadcastw_epi16(pixel3);
+        __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
+        __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
+
+        // Processing first two pixels of the row
+        {
+            __m256i pix = _mm256_abs_epi16(pixel);
+
+            // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
+            // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
+            __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
+            __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
+
+            __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
+            __m256i minError = _mm256_min_epi16(error0, error1);
+
+            // Exploiting symmetry of the selector table and use the sign bit
+            __m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
+
+            // Interleaving values so madd instruction can be used
+            __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
+            __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
+
+            __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
+            // Squaring the minimum error to produce correct values when adding
+            __m256i squareError = _mm256_madd_epi16(minError2, minError2);
+
+            squareErrorSum0 = _mm256_add_epi32(squareErrorSum0, squareError);
+
+            // Packing selector bits
+            __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i));
+            __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i));
+
+            sel0 = _mm256_or_si256(sel0, minIndexLo2);
+            sel1 = _mm256_or_si256(sel1, minIndexHi2);
+        }
+
+        pixel3 = _mm256_extracti128_si256(pixel2, 1);
+        pix0 = _mm_broadcastw_epi16(pixel3);
+        pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16));
+        pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
+
+        // Processing second two pixels of the row
+        {
+            __m256i pix = _mm256_abs_epi16(pixel);
+
+            // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
+            // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
+            __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0])));
+            __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1])));
+
+            __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1));
+            __m256i minError = _mm256_min_epi16(error0, error1);
+
+            // Exploiting symmetry of the selector table and use the sign bit
+            __m256i minIndex1 = _mm256_srli_epi16(pixel, 15);
+
+            // Interleaving values so madd instruction can be used
+            __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0));
+            __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2));
+
+            __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi);
+            // Squaring the minimum error to produce correct values when adding
+            __m256i squareError = _mm256_madd_epi16(minError2, minError2);
+
+            squareErrorSum1 = _mm256_add_epi32(squareErrorSum1, squareError);
+
+            // Packing selector bits
+            __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i));
+            __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i));
+            __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2);
+            __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2);
+
+            sel0 = _mm256_or_si256(sel0, minIndexLo3);
+            sel1 = _mm256_or_si256(sel1, minIndexHi3);
+        }
+    }
+
+    _mm256_store_si256((__m256i*)terr[1], squareErrorSum0);
+    _mm256_store_si256((__m256i*)terr[0], squareErrorSum1);
+
+    // Interleave selector bits
+    __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1);
+    __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1);
+
+    __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4));
+    __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4));
+
+    __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1);
+
+    __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2);
+
+    _mm256_store_si256((__m256i*)tsel, sel);
+}
+
+static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate) noexcept
+{ // Finds the minimum-error index in terr[0] and in terr[1], ORs both indices and the matching selector bits into d, and returns the result.
+    size_t tidx[2];
+
+    // Get index of minimum error (terr[0] and terr[1])
+    __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]); // aligned load: terr rows must be 32-byte aligned
+    __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]);
+
+    __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4)); // low half of err0 | low half of err1
+    __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4)); // high half of err0 | high half of err1
+
+    __m256i errMin0 = _mm256_min_epu32(errLo, errHi);
+    // Horizontal minimum inside each 128-bit lane: afterwards lane 0 holds min(terr[0]) and lane 1 min(terr[1]) in every element.
+    __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1));
+    __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1);
+
+    __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2));
+    __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2);
+    // Broadcast each lane's minimum to a full register so it can be compared against the original eight errors.
+    __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4));
+    __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4));
+
+    __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0);
+    __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1);
+    // movemask_epi8 yields four bits per 32-bit element, so (first set bit) >> 2 is the index of the first minimal element.
+    uint32_t mask0 = _mm256_movemask_epi8(errMask0);
+    uint32_t mask1 = _mm256_movemask_epi8(errMask1);
+
+    tidx[0] = _bit_scan_forward(mask0) >> 2;
+    tidx[1] = _bit_scan_forward(mask1) >> 2;
+    // The two 3-bit indices are placed at bits 26-28 and 29-31 of d.
+    d |= tidx[0] << 26;
+    d |= tidx[1] << 29;
+
+    unsigned int t0 = tsel[tidx[0]];
+    unsigned int t1 = tsel[tidx[1]];
+    // Keep complementary bit groups from the two selector words; rotate chooses between byte-wise and 2-bit-wise interleaving.
+    if (!rotate)
+    {
+        t0 &= 0xFF00FF00;
+        t1 &= 0x00FF00FF;
+    }
+    else
+    {
+        t0 &= 0xCCCCCCCC;
+        t1 &= 0x33333333;
+    }
+
+    // Flip selectors from sign bit
+    unsigned int t2 = (t0 | t1) ^ 0xFFFF0000;
+
+    return d | static_cast<uint64_t>(_bswap(t2)) << 32; // byte-swapped selectors occupy the upper 32 bits
+}
+
+static etcpak_force_inline __m128i r6g7b6_AVX2(__m128 cof, __m128 chf, __m128 cvf) noexcept
+{ // Quantises three float colour vectors and returns them packed as bytes; presumably 6/7/6 bits per channel as the name suggests (confirm).
+    __m128i co = _mm_cvttps_epi32(cof); // float -> int32 with truncation
+    __m128i ch = _mm_cvttps_epi32(chf);
+    __m128i cv = _mm_cvttps_epi32(cvf);
+    // Pack to 16 bits with unsigned saturation, so negative inputs become 0.
+    __m128i coh = _mm_packus_epi32(co, ch);
+    __m128i cv0 = _mm_packus_epi32(cv, _mm_setzero_si128());
+
+    __m256i cohv0 = _mm256_inserti128_si256(_mm256_castsi128_si256(coh), cv0, 1); // elements: co[0..3], ch[0..3], cv[0..3], 0 x 4
+    __m256i cohv1 = _mm256_min_epu16(cohv0, _mm256_set1_epi16(1023)); // clamp to 0..1023
+
+    __m256i cohv2 = _mm256_sub_epi16(cohv1, _mm256_set1_epi16(15));
+    __m256i cohv3 = _mm256_srai_epi16(cohv2, 1);
+    // Two parallel integer rounding sequences: the "rb" one ends with >> 3, the "g" one with >> 2.
+    __m256i cohvrb0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(11));
+    __m256i cohvrb1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(4));
+    __m256i cohvg0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(9));
+    __m256i cohvg1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(6));
+
+    __m256i cohvrb2 = _mm256_srai_epi16(cohvrb0, 7);
+    __m256i cohvrb3 = _mm256_srai_epi16(cohvrb1, 7);
+    __m256i cohvg2 = _mm256_srai_epi16(cohvg0, 8);
+    __m256i cohvg3 = _mm256_srai_epi16(cohvg1, 8);
+
+    __m256i cohvrb4 = _mm256_sub_epi16(cohvrb0, cohvrb2);
+    __m256i cohvrb5 = _mm256_sub_epi16(cohvrb4, cohvrb3);
+    __m256i cohvg4 = _mm256_sub_epi16(cohvg0, cohvg2);
+    __m256i cohvg5 = _mm256_sub_epi16(cohvg4, cohvg3);
+
+    __m256i cohvrb6 = _mm256_srai_epi16(cohvrb5, 3);
+    __m256i cohvg6 = _mm256_srai_epi16(cohvg5, 2);
+    // Mask 0x55 takes even 16-bit elements (0 and 2 of each colour) from the rb result and odd elements from the g result.
+    __m256i cohv4 = _mm256_blend_epi16(cohvg6, cohvrb6, 0x55);
+
+    __m128i cohv5 = _mm_packus_epi16(_mm256_castsi256_si128(cohv4), _mm256_extracti128_si256(cohv4, 1)); // bytes 0-3 co, 4-7 ch, 8-11 cv
+    return _mm_shuffle_epi8(cohv5, _mm_setr_epi8(6, 5, 4, -1, 2, 1, 0, -1, 10, 9, 8, -1, -1, -1, -1, -1)); // output dwords: reversed ch, reversed co, reversed cv, 0
+}
+
+struct Plane // Result bundle returned by Planar_AVX2
+{
+    uint64_t plane; // encoded 64-bit block word built by Planar_AVX2
+    uint64_t error; // sum of squared weighted per-pixel differences for that encoding
+    __m256i sum4; // packed per-half channel sums, 64-bit elements in the order bottom rows, top rows, right columns, left columns
+};
+
+static etcpak_force_inline Plane Planar_AVX2(const uint8_t* src)
+{ // Fits three corner colours (cof0, chf0, cvf0) to the 16 four-byte pixels at src, packs them into a 64-bit word, measures the fit error and returns both with per-half channel sums.
+    __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0); // one row of four pixels per load
+    __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1);
+    __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2);
+    __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3);
+    // De-interleave each row: the four byte-0 values, then the byte-1 values, then the byte-2 values; byte 3 of every pixel is dropped.
+    __m128i rgb0 = _mm_shuffle_epi8(d0, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
+    __m128i rgb1 = _mm_shuffle_epi8(d1, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
+    __m128i rgb2 = _mm_shuffle_epi8(d2, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
+    __m128i rgb3 = _mm_shuffle_epi8(d3, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
+
+    __m128i rg0 = _mm_unpacklo_epi32(rgb0, rgb1);
+    __m128i rg1 = _mm_unpacklo_epi32(rgb2, rgb3);
+    __m128i b0 = _mm_unpackhi_epi32(rgb0, rgb1);
+    __m128i b1 = _mm_unpackhi_epi32(rgb2, rgb3);
+
+    // swap channels
+    __m128i b8 = _mm_unpacklo_epi64(rg0, rg1); // byte 0 of all 16 pixels, row by row
+    __m128i g8 = _mm_unpackhi_epi64(rg0, rg1); // byte 1 of all 16 pixels
+    __m128i r8 = _mm_unpacklo_epi64(b0, b1); // byte 2 of all 16 pixels
+    // SAD against zero sums bytes: low 64 bits = rows 0-1, high 64 bits = rows 2-3.
+    __m128i t0 = _mm_sad_epu8(r8, _mm_setzero_si128());
+    __m128i t1 = _mm_sad_epu8(g8, _mm_setzero_si128());
+    __m128i t2 = _mm_sad_epu8(b8, _mm_setzero_si128());
+    // Regroup so the low 8 bytes hold pixels 0-1 of every row and the high 8 bytes pixels 2-3.
+    __m128i r8s = _mm_shuffle_epi8(r8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
+    __m128i g8s = _mm_shuffle_epi8(g8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
+    __m128i b8s = _mm_shuffle_epi8(b8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
+
+    __m128i s0 = _mm_sad_epu8(r8s, _mm_setzero_si128());
+    __m128i s1 = _mm_sad_epu8(g8s, _mm_setzero_si128());
+    __m128i s2 = _mm_sad_epu8(b8s, _mm_setzero_si128());
+    // Per channel: 64-bit elements = sums over rows 0-1, rows 2-3, pixels 0-1 of each row, pixels 2-3 of each row.
+    __m256i sr0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t0), s0, 1);
+    __m256i sg0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t1), s1, 1);
+    __m256i sb0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t2), s2, 1);
+    // Pack the three sums of each half into one 64-bit element: r at bit 32, g at bit 16, b at bit 0.
+    __m256i sr1 = _mm256_slli_epi64(sr0, 32);
+    __m256i sg1 = _mm256_slli_epi64(sg0, 16);
+
+    __m256i srb = _mm256_or_si256(sr1, sb0);
+    __m256i srgb = _mm256_or_si256(srb, sg1);
+    // t5 = (sum of r8, sum of g8, sum of b8, 0) over all 16 pixels.
+    __m128i t3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t0), _mm_castsi128_ps(t1), _MM_SHUFFLE(2, 0, 2, 0)));
+    __m128i t4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 2, 0));
+    __m128i t5 = _mm_hadd_epi32(t3, t4);
+    __m128i t6 = _mm_shuffle_epi32(t5, _MM_SHUFFLE(1, 1, 1, 1));
+    __m128i t7 = _mm_shuffle_epi32(t5, _MM_SHUFFLE(2, 2, 2, 2));
+
+    __m256i sr = _mm256_broadcastw_epi16(t5);
+    __m256i sg = _mm256_broadcastw_epi16(t6);
+    __m256i sb = _mm256_broadcastw_epi16(t7);
+
+    __m256i r08 = _mm256_cvtepu8_epi16(r8);
+    __m256i g08 = _mm256_cvtepu8_epi16(g8);
+    __m256i b08 = _mm256_cvtepu8_epi16(b8);
+    // 16 * pixel - (sum of 16 pixels) = 16 * (pixel - mean), which keeps everything in integers.
+    __m256i r16 = _mm256_slli_epi16(r08, 4);
+    __m256i g16 = _mm256_slli_epi16(g08, 4);
+    __m256i b16 = _mm256_slli_epi16(b08, 4);
+
+    __m256i difR0 = _mm256_sub_epi16(r16, sr);
+    __m256i difG0 = _mm256_sub_epi16(g16, sg);
+    __m256i difB0 = _mm256_sub_epi16(b16, sb);
+    // Weights -255, -85, 85, 255 applied across the four pixels within each row.
+    __m256i difRyz = _mm256_madd_epi16(difR0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255));
+    __m256i difGyz = _mm256_madd_epi16(difG0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255));
+    __m256i difByz = _mm256_madd_epi16(difB0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255));
+    // The same weights, but constant within a row and varying from row to row.
+    __m256i difRxz = _mm256_madd_epi16(difR0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255));
+    __m256i difGxz = _mm256_madd_epi16(difG0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255));
+    __m256i difBxz = _mm256_madd_epi16(difB0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255));
+    // Horizontal reductions down to one weighted sum per channel and weight pattern.
+    __m256i difRGyz = _mm256_hadd_epi32(difRyz, difGyz);
+    __m256i difByzxz = _mm256_hadd_epi32(difByz, difBxz);
+
+    __m256i difRGxz = _mm256_hadd_epi32(difRxz, difGxz);
+
+    __m128i sumRGyz = _mm_add_epi32(_mm256_castsi256_si128(difRGyz), _mm256_extracti128_si256(difRGyz, 1));
+    __m128i sumByzxz = _mm_add_epi32(_mm256_castsi256_si128(difByzxz), _mm256_extracti128_si256(difByzxz, 1));
+    __m128i sumRGxz = _mm_add_epi32(_mm256_castsi256_si128(difRGxz), _mm256_extracti128_si256(difRGxz, 1));
+
+    __m128i sumRGByz = _mm_hadd_epi32(sumRGyz, sumByzxz);
+    __m128i sumRGByzxz = _mm_hadd_epi32(sumRGxz, sumByzxz);
+
+    __m128i sumRGBxz = _mm_shuffle_epi32(sumRGByzxz, _MM_SHUFFLE(2, 3, 1, 0));
+
+    __m128 sumRGByzf = _mm_cvtepi32_ps(sumRGByz);
+    __m128 sumRGBxzf = _mm_cvtepi32_ps(sumRGBxz);
+
+    const float value = (255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f; // sum of squared weights over 16 pixels, times the 16x pixel scaling
+
+    __m128 scale = _mm_set1_ps(-4.0f / value);
+
+    __m128 af = _mm_mul_ps(sumRGBxzf, scale);
+    __m128 bf = _mm_mul_ps(sumRGByzf, scale);
+
+    __m128 df = _mm_mul_ps(_mm_cvtepi32_ps(t5), _mm_set1_ps(4.0f / 16.0f)); // four times the per-channel mean
+
+    // calculating the three colors RGBO, RGBH, and RGBV.  RGB = df - af * x - bf * y;
+    __m128 cof0 = _mm_fnmadd_ps(af, _mm_set1_ps(-255.0f), _mm_fnmadd_ps(bf, _mm_set1_ps(-255.0f), df));
+    __m128 chf0 = _mm_fnmadd_ps(af, _mm_set1_ps( 425.0f), _mm_fnmadd_ps(bf, _mm_set1_ps(-255.0f), df));
+    __m128 cvf0 = _mm_fnmadd_ps(af, _mm_set1_ps(-255.0f), _mm_fnmadd_ps(bf, _mm_set1_ps( 425.0f), df));
+
+    // convert to r6g7b6
+    __m128i cohv = r6g7b6_AVX2(cof0, chf0, cvf0);
+
+    uint64_t rgbho = _mm_extract_epi64(cohv, 0); // low 32 bits: H colour, high 32 bits: O colour
+    uint32_t rgbv0 = _mm_extract_epi32(cohv, 2); // V colour
+
+    // Error calculation
+    auto ro0 = (rgbho >> 48) & 0x3F;
+    auto go0 = (rgbho >> 40) & 0x7F;
+    auto bo0 = (rgbho >> 32) & 0x3F;
+    auto ro1 = (ro0 >> 4) | (ro0 << 2); // expand 6 bits to 8 by replicating the top bits
+    auto go1 = (go0 >> 6) | (go0 << 1); // expand 7 bits to 8
+    auto bo1 = (bo0 >> 4) | (bo0 << 2);
+    auto ro2 = (ro1 << 2) + 2; // 4 * O + 2: the +2 rounds the later >> 2
+    auto go2 = (go1 << 2) + 2;
+    auto bo2 = (bo1 << 2) + 2;
+
+    __m256i ro3 = _mm256_set1_epi16(ro2);
+    __m256i go3 = _mm256_set1_epi16(go2);
+    __m256i bo3 = _mm256_set1_epi16(bo2);
+
+    auto rh0 = (rgbho >> 16) & 0x3F;
+    auto gh0 = (rgbho >>  8) & 0x7F;
+    auto bh0 = (rgbho >>  0) & 0x3F;
+    auto rh1 = (rh0 >> 4) | (rh0 << 2);
+    auto gh1 = (gh0 >> 6) | (gh0 << 1);
+    auto bh1 = (bh0 >> 4) | (bh0 << 2);
+    // Unsigned subtraction may wrap; set1_epi16 keeps the low 16 bits, which is the signed difference.
+    auto rh2 = rh1 - ro1;
+    auto gh2 = gh1 - go1;
+    auto bh2 = bh1 - bo1;
+
+    __m256i rh3 = _mm256_set1_epi16(rh2);
+    __m256i gh3 = _mm256_set1_epi16(gh2);
+    __m256i bh3 = _mm256_set1_epi16(bh2);
+
+    auto rv0 = (rgbv0 >> 16) & 0x3F;
+    auto gv0 = (rgbv0 >>  8) & 0x7F;
+    auto bv0 = (rgbv0 >>  0) & 0x3F;
+    auto rv1 = (rv0 >> 4) | (rv0 << 2);
+    auto gv1 = (gv0 >> 6) | (gv0 << 1);
+    auto bv1 = (bv0 >> 4) | (bv0 << 2);
+
+    auto rv2 = rv1 - ro1;
+    auto gv2 = gv1 - go1;
+    auto bv2 = bv1 - bo1;
+
+    __m256i rv3 = _mm256_set1_epi16(rv2);
+    __m256i gv3 = _mm256_set1_epi16(gv2);
+    __m256i bv3 = _mm256_set1_epi16(bv2);
+
+    __m256i x = _mm256_set_epi16(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0); // elements 0-3 = 0, 4-7 = 1, 8-11 = 2, 12-15 = 3
+
+    __m256i rh4 = _mm256_mullo_epi16(rh3, x);
+    __m256i gh4 = _mm256_mullo_epi16(gh3, x);
+    __m256i bh4 = _mm256_mullo_epi16(bh3, x);
+
+    __m256i y = _mm256_set_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); // elements cycle 0, 1, 2, 3
+
+    __m256i rv4 = _mm256_mullo_epi16(rv3, y);
+    __m256i gv4 = _mm256_mullo_epi16(gv3, y);
+    __m256i bv4 = _mm256_mullo_epi16(bv3, y);
+
+    __m256i rxy = _mm256_add_epi16(rh4, rv4);
+    __m256i gxy = _mm256_add_epi16(gh4, gv4);
+    __m256i bxy = _mm256_add_epi16(bh4, bv4);
+
+    __m256i rp0 = _mm256_add_epi16(rxy, ro3);
+    __m256i gp0 = _mm256_add_epi16(gxy, go3);
+    __m256i bp0 = _mm256_add_epi16(bxy, bo3);
+    // Reconstructed pixel = (x * (H - O) + y * (V - O) + 4 * O + 2) >> 2, clamped to 0..255.
+    __m256i rp1 = _mm256_srai_epi16(rp0, 2);
+    __m256i gp1 = _mm256_srai_epi16(gp0, 2);
+    __m256i bp1 = _mm256_srai_epi16(bp0, 2);
+
+    __m256i rp2 = _mm256_max_epi16(_mm256_min_epi16(rp1, _mm256_set1_epi16(255)), _mm256_setzero_si256());
+    __m256i gp2 = _mm256_max_epi16(_mm256_min_epi16(gp1, _mm256_set1_epi16(255)), _mm256_setzero_si256());
+    __m256i bp2 = _mm256_max_epi16(_mm256_min_epi16(bp1, _mm256_set1_epi16(255)), _mm256_setzero_si256());
+
+    __m256i rdif = _mm256_sub_epi16(r08, rp2);
+    __m256i gdif = _mm256_sub_epi16(g08, gp2);
+    __m256i bdif = _mm256_sub_epi16(b08, bp2);
+    // Channel weights 38 / 76 / 14 for the r8 / g8 / b8 planes.
+    __m256i rerr = _mm256_mullo_epi16(rdif, _mm256_set1_epi16(38));
+    __m256i gerr = _mm256_mullo_epi16(gdif, _mm256_set1_epi16(76));
+    __m256i berr = _mm256_mullo_epi16(bdif, _mm256_set1_epi16(14));
+
+    __m256i sum0 = _mm256_add_epi16(rerr, gerr);
+    __m256i sum1 = _mm256_add_epi16(sum0, berr);
+
+    __m256i sum2 = _mm256_madd_epi16(sum1, sum1); // squares, summed in adjacent pairs
+
+    __m128i sum3 = _mm_add_epi32(_mm256_castsi256_si128(sum2), _mm256_extracti128_si256(sum2, 1));
+
+    uint32_t err0 = _mm_extract_epi32(sum3, 0);
+    uint32_t err1 = _mm_extract_epi32(sum3, 1);
+    uint32_t err2 = _mm_extract_epi32(sum3, 2);
+    uint32_t err3 = _mm_extract_epi32(sum3, 3);
+    // NOTE(review): all four addends are uint32_t, so the sum is evaluated in 32 bits and only then widened to uint64_t.
+    uint64_t error = err0 + err1 + err2 + err3;
+    /**/
+    // Bit packing of the three colours; presumably the ETC2 planar-mode layout - TODO confirm against the format specification.
+    uint32_t rgbv = ( rgbv0 & 0x3F ) | ( ( rgbv0 >> 2 ) & 0x1FC0 ) | ( ( rgbv0 >> 3 ) & 0x7E000 );
+    uint64_t rgbho0_ = ( rgbho & 0x3F0000003F ) | ( ( rgbho >> 2 ) & 0x1FC000001FC0 ) | ( ( rgbho >> 3 ) & 0x7E0000007E000 );
+    uint64_t rgbho0 = ( rgbho0_ & 0x7FFFF ) | ( ( rgbho0_ >> 13 ) & 0x3FFFF80000 );
+
+    uint32_t hi = rgbv | ((rgbho0 & 0x1FFF) << 19);
+    rgbho0 >>= 13;
+    uint32_t lo = ( rgbho0 & 0x1 ) | ( ( rgbho0 & 0x1FE ) << 1 ) | ( ( rgbho0 & 0x600 ) << 2 ) | ( ( rgbho0 & 0x3F800 ) << 5 ) | ( ( rgbho0 & 0x1FC0000 ) << 6 );
+
+    uint32_t idx = ( ( rgbho >> 33 ) & 0xF ) | ( ( rgbho >> 41 ) & 0x10 ) | ( ( rgbho >> 48 ) & 0x20 ); // 6-bit index into g_flags built from bits of the O colour
+    lo |= g_flags[idx];
+    uint64_t result = static_cast<uint32_t>(_bswap(lo));
+    result |= static_cast<uint64_t>(static_cast<uint32_t>(_bswap(hi))) << 32;
+
+    Plane plane;
+
+    plane.plane = result;
+    plane.error = error;
+    plane.sum4 = _mm256_permute4x64_epi64(srgb, _MM_SHUFFLE(2, 3, 0, 1)); // swaps the two elements of each pair: rows 2-3, rows 0-1, pixels 2-3, pixels 0-1
+
+    return plane;
+}
+
+static etcpak_force_inline uint64_t EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate, const uint64_t value, const uint32_t error) noexcept
+{ // Like the four-argument overload, but returns value unchanged when the best combined error from terr is not below error.
+    size_t tidx[2];
+
+    // Get index of minimum error (terr[0] and terr[1])
+    __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]); // aligned load: terr rows must be 32-byte aligned
+    __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]);
+
+    __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4)); // low half of err0 | low half of err1
+    __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4)); // high half of err0 | high half of err1
+
+    __m256i errMin0 = _mm256_min_epu32(errLo, errHi);
+    // Horizontal minimum inside each 128-bit lane: afterwards lane 0 holds min(terr[0]) and lane 1 min(terr[1]) in every element.
+    __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1));
+    __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1);
+
+    __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2));
+    __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2);
+    // Broadcast each lane's minimum to a full register so it can be compared against the original eight errors.
+    __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4));
+    __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4));
+
+    __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0);
+    __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1);
+    // movemask_epi8 yields four bits per 32-bit element, so (first set bit) >> 2 is the index of the first minimal element.
+    uint32_t mask0 = _mm256_movemask_epi8(errMask0);
+    uint32_t mask1 = _mm256_movemask_epi8(errMask1);
+
+    tidx[0] = _bit_scan_forward(mask0) >> 2;
+    tidx[1] = _bit_scan_forward(mask1) >> 2;
+    // Early out: the alternative encoding passed in as value wins ties as well (>=).
+    if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error)
+    {
+        return value;
+    }
+
+    d |= tidx[0] << 26; // 3-bit index at bits 26-28
+    d |= tidx[1] << 29; // 3-bit index at bits 29-31
+
+    unsigned int t0 = tsel[tidx[0]];
+    unsigned int t1 = tsel[tidx[1]];
+    // Keep complementary bit groups from the two selector words; rotate chooses between byte-wise and 2-bit-wise interleaving.
+    if (!rotate)
+    {
+        t0 &= 0xFF00FF00;
+        t1 &= 0x00FF00FF;
+    }
+    else
+    {
+        t0 &= 0xCCCCCCCC;
+        t1 &= 0x33333333;
+    }
+
+    // Flip selectors from sign bit
+    unsigned int t2 = (t0 | t1) ^ 0xFFFF0000;
+
+    return d | static_cast<uint64_t>(_bswap(t2)) << 32; // byte-swapped selectors occupy the upper 32 bits
+}
+
+#endif
+
+static etcpak_force_inline void Average( const uint8_t* data, v4i* a )
+{ // Writes rounded 8-pixel channel averages of the 4x4 block at data: a[0] = rows 2-3, a[1] = rows 0-1, a[2] = pixels 2-3 of each row, a[3] = pixels 0-1 of each row.
+#ifdef __SSE4_1__
+    __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); // one row of four 4-byte pixels per load
+    __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
+    __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
+    __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
+    // Widen bytes to 16 bits: "l" holds pixels 0-1 of the row, "h" pixels 2-3.
+    __m128i d0l = _mm_unpacklo_epi8(d0, _mm_setzero_si128());
+    __m128i d0h = _mm_unpackhi_epi8(d0, _mm_setzero_si128());
+    __m128i d1l = _mm_unpacklo_epi8(d1, _mm_setzero_si128());
+    __m128i d1h = _mm_unpackhi_epi8(d1, _mm_setzero_si128());
+    __m128i d2l = _mm_unpacklo_epi8(d2, _mm_setzero_si128());
+    __m128i d2h = _mm_unpackhi_epi8(d2, _mm_setzero_si128());
+    __m128i d3l = _mm_unpacklo_epi8(d3, _mm_setzero_si128());
+    __m128i d3h = _mm_unpackhi_epi8(d3, _mm_setzero_si128());
+    // Add vertically adjacent rows.
+    __m128i sum0 = _mm_add_epi16(d0l, d1l);
+    __m128i sum1 = _mm_add_epi16(d0h, d1h);
+    __m128i sum2 = _mm_add_epi16(d2l, d3l);
+    __m128i sum3 = _mm_add_epi16(d2h, d3h);
+    // Widen to 32 bits so the two pixels in each register can be added per channel.
+    __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
+    __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
+    __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
+    __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
+    __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
+    __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
+    __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
+    __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
+    // b0..b3: per-channel sums of the four 2x2 quadrants (b0 rows 0-1 / pixels 0-1, b1 rows 0-1 / pixels 2-3, b2 rows 2-3 / pixels 0-1, b3 rows 2-3 / pixels 2-3).
+    __m128i b0 = _mm_add_epi32(sum0l, sum0h);
+    __m128i b1 = _mm_add_epi32(sum1l, sum1h);
+    __m128i b2 = _mm_add_epi32(sum2l, sum2h);
+    __m128i b3 = _mm_add_epi32(sum3l, sum3h);
+    // Two quadrants = 8 pixels; (sum + 4) >> 3 is the rounded average.
+    __m128i a0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b2, b3), _mm_set1_epi32(4)), 3);
+    __m128i a1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b1), _mm_set1_epi32(4)), 3);
+    __m128i a2 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b1, b3), _mm_set1_epi32(4)), 3);
+    __m128i a3 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b2), _mm_set1_epi32(4)), 3);
+    // The shuffle reverses the first three channels; each 16-byte store fills two consecutive v4i (assumes sizeof(v4i) == 8 - confirm). Element 3 carries the averaged fourth byte here, whereas the other branches store 0.
+    _mm_storeu_si128((__m128i*)&a[0], _mm_packus_epi32(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a1, _MM_SHUFFLE(3, 0, 1, 2))));
+    _mm_storeu_si128((__m128i*)&a[2], _mm_packus_epi32(_mm_shuffle_epi32(a2, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a3, _MM_SHUFFLE(3, 0, 1, 2))));
+#elif defined __ARM_NEON
+    uint8x16x2_t t0 = vzipq_u8(vld1q_u8(data +  0), uint8x16_t()); // zip with zeros = widen bytes to 16 bits
+    uint8x16x2_t t1 = vzipq_u8(vld1q_u8(data + 16), uint8x16_t());
+    uint8x16x2_t t2 = vzipq_u8(vld1q_u8(data + 32), uint8x16_t());
+    uint8x16x2_t t3 = vzipq_u8(vld1q_u8(data + 48), uint8x16_t());
+
+    uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) };
+    uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) };
+    uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) };
+    uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) };
+    // Add vertically adjacent rows, then zip with zeros again to widen to 32 bits.
+    uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ) ) ), uint16x8_t());
+    uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ) ) ), uint16x8_t());
+    uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ) ) ), uint16x8_t());
+    uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ) ) ), uint16x8_t());
+
+    uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) };
+    uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) };
+    uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) };
+    uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) };
+    // Quadrant sums, same meaning as b0..b3 in the SSE branch.
+    uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]);
+    uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]);
+    uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]);
+    uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]);
+
+    uint32x4_t a0 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b2, b3), vdupq_n_u32(4)), 3);
+    uint32x4_t a1 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b1), vdupq_n_u32(4)), 3);
+    uint32x4_t a2 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b1, b3), vdupq_n_u32(4)), 3);
+    uint32x4_t a3 = vshrq_n_u32(vqaddq_u32(vqaddq_u32(b0, b2), vdupq_n_u32(4)), 3);
+
+    uint16x8_t o0 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a0 )), vqmovun_s32(vreinterpretq_s32_u32( a1 )));
+    uint16x8_t o1 = vcombine_u16(vqmovun_s32(vreinterpretq_s32_u32( a2 )), vqmovun_s32(vreinterpretq_s32_u32( a3 )));
+    // Store with the first three channels reversed and element 3 forced to 0.
+    a[0] = v4i{o0[2], o0[1], o0[0], 0};
+    a[1] = v4i{o0[6], o0[5], o0[4], 0};
+    a[2] = v4i{o1[2], o1[1], o1[0], 0};
+    a[3] = v4i{o1[6], o1[5], o1[4], 0};
+#else
+    uint32_t r[4]; // per-quadrant sums of the third byte of each pixel
+    uint32_t g[4]; // per-quadrant sums of the second byte
+    uint32_t b[4]; // per-quadrant sums of the first byte
+
+    memset(r, 0, sizeof(r));
+    memset(g, 0, sizeof(g));
+    memset(b, 0, sizeof(b));
+
+    for( int j=0; j<4; j++ )
+    {
+        for( int i=0; i<4; i++ )
+        {
+            int index = (j & 2) + (i >> 1); // quadrant: bit 1 = rows 2-3, bit 0 = pixels 2-3
+            b[index] += *data++;
+            g[index] += *data++;
+            r[index] += *data++;
+            data++; // skip the fourth byte of the pixel
+        }
+    }
+
+    a[0] = v4i{ uint16_t( (r[2] + r[3] + 4) / 8 ), uint16_t( (g[2] + g[3] + 4) / 8 ), uint16_t( (b[2] + b[3] + 4) / 8 ), 0};
+    a[1] = v4i{ uint16_t( (r[0] + r[1] + 4) / 8 ), uint16_t( (g[0] + g[1] + 4) / 8 ), uint16_t( (b[0] + b[1] + 4) / 8 ), 0};
+    a[2] = v4i{ uint16_t( (r[1] + r[3] + 4) / 8 ), uint16_t( (g[1] + g[3] + 4) / 8 ), uint16_t( (b[1] + b[3] + 4) / 8 ), 0};
+    a[3] = v4i{ uint16_t( (r[0] + r[2] + 4) / 8 ), uint16_t( (g[0] + g[2] + 4) / 8 ), uint16_t( (b[0] + b[2] + 4) / 8 ), 0};
+#endif
+}
+
+static etcpak_force_inline void CalcErrorBlock( const uint8_t* data, unsigned int err[4][4] )
+{ // Writes per-channel byte sums of the 4x4 block at data: err[0] = rows 2-3, err[1] = rows 0-1, err[2] = pixels 2-3 of each row, err[3] = pixels 0-1 of each row; column 3 is always 0.
+#ifdef __SSE4_1__
+    __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); // one row of four 4-byte pixels per load
+    __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
+    __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
+    __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
+    // Clear the fourth byte of every pixel so it contributes nothing to the sums.
+    __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF));
+    __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF));
+    __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF));
+    __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF));
+    // Widen bytes to 16 bits: "l" holds pixels 0-1 of the row, "h" pixels 2-3.
+    __m128i d0l = _mm_unpacklo_epi8(dm0, _mm_setzero_si128());
+    __m128i d0h = _mm_unpackhi_epi8(dm0, _mm_setzero_si128());
+    __m128i d1l = _mm_unpacklo_epi8(dm1, _mm_setzero_si128());
+    __m128i d1h = _mm_unpackhi_epi8(dm1, _mm_setzero_si128());
+    __m128i d2l = _mm_unpacklo_epi8(dm2, _mm_setzero_si128());
+    __m128i d2h = _mm_unpackhi_epi8(dm2, _mm_setzero_si128());
+    __m128i d3l = _mm_unpacklo_epi8(dm3, _mm_setzero_si128());
+    __m128i d3h = _mm_unpackhi_epi8(dm3, _mm_setzero_si128());
+    // Add vertically adjacent rows.
+    __m128i sum0 = _mm_add_epi16(d0l, d1l);
+    __m128i sum1 = _mm_add_epi16(d0h, d1h);
+    __m128i sum2 = _mm_add_epi16(d2l, d3l);
+    __m128i sum3 = _mm_add_epi16(d2h, d3h);
+    // Widen to 32 bits so the two pixels in each register can be added per channel.
+    __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
+    __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
+    __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
+    __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
+    __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
+    __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
+    __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
+    __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
+    // b0..b3: per-channel sums of the four 2x2 quadrants.
+    __m128i b0 = _mm_add_epi32(sum0l, sum0h);
+    __m128i b1 = _mm_add_epi32(sum1l, sum1h);
+    __m128i b2 = _mm_add_epi32(sum2l, sum2h);
+    __m128i b3 = _mm_add_epi32(sum3l, sum3h);
+    // Combine quadrants into the four 8-pixel halves.
+    __m128i a0 = _mm_add_epi32(b2, b3);
+    __m128i a1 = _mm_add_epi32(b0, b1);
+    __m128i a2 = _mm_add_epi32(b1, b3);
+    __m128i a3 = _mm_add_epi32(b0, b2);
+
+    _mm_storeu_si128((__m128i*)&err[0], a0);
+    _mm_storeu_si128((__m128i*)&err[1], a1);
+    _mm_storeu_si128((__m128i*)&err[2], a2);
+    _mm_storeu_si128((__m128i*)&err[3], a3);
+#elif defined __ARM_NEON
+    uint8x16x2_t t0 = vzipq_u8(vandq_u8(vld1q_u8(data +  0), vreinterpretq_u8_u32(vdupq_n_u32(0x00FFFFFF))), uint8x16_t());
+    uint8x16x2_t t1 = vzipq_u8(vandq_u8(vld1q_u8(data + 16), vreinterpretq_u8_u32(vdupq_n_u32(0x00FFFFFF))), uint8x16_t());
+    uint8x16x2_t t2 = vzipq_u8(vandq_u8(vld1q_u8(data + 32), vreinterpretq_u8_u32(vdupq_n_u32(0x00FFFFFF))), uint8x16_t());
+    uint8x16x2_t t3 = vzipq_u8(vandq_u8(vld1q_u8(data + 48), vreinterpretq_u8_u32(vdupq_n_u32(0x00FFFFFF))), uint8x16_t());
+
+    uint16x8x2_t d0 = { vreinterpretq_u16_u8(t0.val[0]), vreinterpretq_u16_u8(t0.val[1]) };
+    uint16x8x2_t d1 = { vreinterpretq_u16_u8(t1.val[0]), vreinterpretq_u16_u8(t1.val[1]) };
+    uint16x8x2_t d2 = { vreinterpretq_u16_u8(t2.val[0]), vreinterpretq_u16_u8(t2.val[1]) };
+    uint16x8x2_t d3 = { vreinterpretq_u16_u8(t3.val[0]), vreinterpretq_u16_u8(t3.val[1]) };
+    // Add vertically adjacent rows, then zip with zeros to widen to 32 bits.
+    uint16x8x2_t s0 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[0] ), vreinterpretq_s16_u16( d1.val[0] ))), uint16x8_t());
+    uint16x8x2_t s1 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d0.val[1] ), vreinterpretq_s16_u16( d1.val[1] ))), uint16x8_t());
+    uint16x8x2_t s2 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[0] ), vreinterpretq_s16_u16( d3.val[0] ))), uint16x8_t());
+    uint16x8x2_t s3 = vzipq_u16(vreinterpretq_u16_s16( vaddq_s16(vreinterpretq_s16_u16( d2.val[1] ), vreinterpretq_s16_u16( d3.val[1] ))), uint16x8_t());
+
+    uint32x4x2_t sum0 = { vreinterpretq_u32_u16(s0.val[0]), vreinterpretq_u32_u16(s0.val[1]) };
+    uint32x4x2_t sum1 = { vreinterpretq_u32_u16(s1.val[0]), vreinterpretq_u32_u16(s1.val[1]) };
+    uint32x4x2_t sum2 = { vreinterpretq_u32_u16(s2.val[0]), vreinterpretq_u32_u16(s2.val[1]) };
+    uint32x4x2_t sum3 = { vreinterpretq_u32_u16(s3.val[0]), vreinterpretq_u32_u16(s3.val[1]) };
+
+    uint32x4_t b0 = vaddq_u32(sum0.val[0], sum0.val[1]);
+    uint32x4_t b1 = vaddq_u32(sum1.val[0], sum1.val[1]);
+    uint32x4_t b2 = vaddq_u32(sum2.val[0], sum2.val[1]);
+    uint32x4_t b3 = vaddq_u32(sum3.val[0], sum3.val[1]);
+    // The fourth byte was already cleared on load (as in the SSE branch); masking the sums here would leave column 3 non-zero.
+    uint32x4_t a0 = vqaddq_u32(b2, b3);
+    uint32x4_t a1 = vqaddq_u32(b0, b1);
+    uint32x4_t a2 = vqaddq_u32(b1, b3);
+    uint32x4_t a3 = vqaddq_u32(b0, b2);
+
+    vst1q_u32(err[0], a0);
+    vst1q_u32(err[1], a1);
+    vst1q_u32(err[2], a2);
+    vst1q_u32(err[3], a3);
+#else
+    unsigned int terr[4][4]; // [quadrant][byte position within the pixel]
+
+    memset(terr, 0, 16 * sizeof(unsigned int));
+
+    for( int j=0; j<4; j++ )
+    {
+        for( int i=0; i<4; i++ )
+        {
+            int index = (j & 2) + (i >> 1); // quadrant: bit 1 = rows 2-3, bit 0 = pixels 2-3
+            unsigned int d = *data++;
+            terr[index][0] += d;
+            d = *data++;
+            terr[index][1] += d;
+            d = *data++;
+            terr[index][2] += d;
+            data++; // skip the fourth byte of the pixel
+        }
+    }
+
+    for( int i=0; i<3; i++ )
+    {
+        err[0][i] = terr[2][i] + terr[3][i];
+        err[1][i] = terr[0][i] + terr[1][i];
+        err[2][i] = terr[1][i] + terr[3][i];
+        err[3][i] = terr[0][i] + terr[2][i];
+    }
+    for( int i=0; i<4; i++ )
+    {
+        err[i][3] = 0;
+    }
+#endif
+}
+
+static etcpak_force_inline unsigned int CalcError( const unsigned int block[4], const v4i& average )
+{
+    unsigned int err = 0x3FFFFFFF; // Big value to prevent negative values, but small enough to prevent overflow
+    err -= block[0] * 2 * average[2];
+    err -= block[1] * 2 * average[1];
+    err -= block[2] * 2 * average[0];
+    err += 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) );
+    return err;
+}
+
+// Quantizes the four averaged colors in a[0..3] in two different ways, in place:
+//   * a[4..7] receive a 5-bit quantization of a[0..3], handled in pairs
+//     (0,1) and (2,3). The second color of each pair is quantized directly; the
+//     first is expressed as that value plus a delta clamped to [-4, 3]. Both are
+//     then expanded back to 8 bits as (c << 3) | (c >> 2).
+//   * a[0..3] are then overwritten with a 4-bit quantization expanded back to
+//     8 bits (SIMD paths: t | (t << 4); scalar path: g_avg2 lookup).
+// The a[4..7] pass must run first because it reads the original a[0..3].
+// NOTE(review): the two forms presumably correspond to the ETC1 differential
+// and individual modes - confirm against the encoder that consumes them.
+static etcpak_force_inline void ProcessAverages( v4i* a )
+{
+#ifdef __SSE4_1__
+    // Each 128-bit load covers two consecutive v4i entries (a[i*2] and a[i*2+1]).
+    for( int i=0; i<2; i++ )
+    {
+        __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data());
+
+        // (d*31 + 128), then (t + (t >> 8)) >> 8: rounded scaling of 0..255 to 0..31.
+        __m128i t = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(31)), _mm_set1_epi16(128));
+
+        __m128i c = _mm_srli_epi16(_mm_add_epi16(t, _mm_srli_epi16(t, 8)), 8);
+
+        // Broadcast the second color of the pair into both halves and clamp the delta to [-4, 3].
+        __m128i c1 = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2));
+        __m128i diff = _mm_sub_epi16(c, c1);
+        diff = _mm_max_epi16(diff, _mm_set1_epi16(-4));
+        diff = _mm_min_epi16(diff, _mm_set1_epi16(3));
+
+        __m128i co = _mm_add_epi16(c1, diff);
+
+        // Low half: clamped first color; high half: unmodified second color.
+        c = _mm_blend_epi16(co, c, 0xF0);
+
+        // Expand 5 bits back to 8 bits.
+        __m128i a0 = _mm_or_si128(_mm_slli_epi16(c, 3), _mm_srli_epi16(c, 2));
+
+        _mm_storeu_si128((__m128i*)a[4+i*2].data(), a0);
+    }
+
+    for( int i=0; i<2; i++ )
+    {
+        __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data());
+
+        // Rounded scaling of 0..255 to 0..15, then nibble replication back to 8 bits.
+        __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(15)), _mm_set1_epi16(128));
+        __m128i t1 = _mm_srli_epi16(_mm_add_epi16(t0, _mm_srli_epi16(t0, 8)), 8);
+
+        __m128i t2 = _mm_or_si128(t1, _mm_slli_epi16(t1, 4));
+
+        _mm_storeu_si128((__m128i*)a[i*2].data(), t2);
+    }
+#elif defined __ARM_NEON
+    // Same steps as the SSE4.1 path, expressed with NEON intrinsics.
+    for( int i=0; i<2; i++ )
+    {
+        int16x8_t d = vld1q_s16((int16_t*)&a[i*2]);
+        int16x8_t t = vaddq_s16(vmulq_s16(d, vdupq_n_s16(31)), vdupq_n_s16(128));
+        int16x8_t c = vshrq_n_s16(vaddq_s16(t, vshrq_n_s16(t, 8)), 8);
+
+        // Second color of the pair duplicated into both halves.
+        int16x8_t c1 = vcombine_s16(vget_high_s16(c), vget_high_s16(c));
+        int16x8_t diff = vsubq_s16(c, c1);
+        diff = vmaxq_s16(diff, vdupq_n_s16(-4));
+        diff = vminq_s16(diff, vdupq_n_s16(3));
+
+        int16x8_t co = vaddq_s16(c1, diff);
+
+        c = vcombine_s16(vget_low_s16(co), vget_high_s16(c));
+
+        int16x8_t a0 = vorrq_s16(vshlq_n_s16(c, 3), vshrq_n_s16(c, 2));
+
+        vst1q_s16((int16_t*)&a[4+i*2], a0);
+    }
+
+    for( int i=0; i<2; i++ )
+    {
+        int16x8_t d = vld1q_s16((int16_t*)&a[i*2]);
+
+        int16x8_t t0 = vaddq_s16(vmulq_s16(d, vdupq_n_s16(15)), vdupq_n_s16(128));
+        int16x8_t t1 = vshrq_n_s16(vaddq_s16(t0, vshrq_n_s16(t0, 8)), 8);
+
+        int16x8_t t2 = vorrq_s16(t1, vshlq_n_s16(t1, 4));
+
+        vst1q_s16((int16_t*)&a[i*2], t2);
+    }
+#else
+    // Scalar fallback; only the first three channels of each entry are touched.
+    for( int i=0; i<2; i++ )
+    {
+        for( int j=0; j<3; j++ )
+        {
+            // NOTE(review): mul8bit is defined elsewhere; presumably a rounded (x * n) / 255 - confirm.
+            int32_t c1 = mul8bit( a[i*2+1][j], 31 );
+            int32_t c2 = mul8bit( a[i*2][j], 31 );
+
+            // Delta of the first color relative to the second, clamped to [-4, 3].
+            int32_t diff = c2 - c1;
+            if( diff > 3 ) diff = 3;
+            else if( diff < -4 ) diff = -4;
+
+            int32_t co = c1 + diff;
+
+            // Expand 5 bits back to 8 bits.
+            a[5+i*2][j] = ( c1 << 3 ) | ( c1 >> 2 );
+            a[4+i*2][j] = ( co << 3 ) | ( co >> 2 );
+        }
+    }
+
+    for( int i=0; i<4; i++ )
+    {
+        // NOTE(review): g_avg2 is defined elsewhere; presumably maps a 4-bit value to its 8-bit expansion - confirm.
+        a[i][0] = g_avg2[mul8bit( a[i][0], 15 )];
+        a[i][1] = g_avg2[mul8bit( a[i][1], 15 )];
+        a[i][2] = g_avg2[mul8bit( a[i][2], 15 )];
+    }
+#endif
+}
+
+// ORs the color header bits for encoding candidate idx into _d.
+// idx selects the color pair a[2*idx], a[2*idx+1] and is itself stored
+// starting at bit 24. When bit 1 of idx is clear, the top nibble of every
+// channel of both colors is packed side by side; otherwise the top five bits
+// of the second color are stored together with a 3-bit wrapped delta from the
+// second color to the first.
+static etcpak_force_inline void EncodeAverages( uint64_t& _d, const v4i* a, size_t idx )
+{
+    const size_t first = idx << 1;
+    const v4i& colA = a[first+0];
+    const v4i& colB = a[first+1];
+    const bool nibbleMode = ( idx & 0x2 ) == 0;
+
+    uint64_t bits = _d;
+    bits |= ( idx << 24 );
+
+    for( int ch=0; ch<3; ch++ )
+    {
+        const int shift = ch*8;
+        if( nibbleMode )
+        {
+            bits |= uint64_t( colA[ch] >> 4 ) << shift;
+            bits |= uint64_t( colB[ch] >> 4 ) << ( shift + 4 );
+        }
+        else
+        {
+            bits |= uint64_t( colB[ch] & 0xF8 ) << shift;
+            // Signed difference in 5-bit units; only its low three bits are kept.
+            int32_t delta = ( ( colA[ch] & 0xF8 ) - ( colB[ch] & 0xF8 ) ) >> 3;
+            delta &= ~0xFFFFFFF8;
+            bits |= ((uint64_t)delta) << shift;
+        }
+    }
+    _d = bits;
+}
+
+// Detects a block in which all 16 pixels are identical.
+// src points to 64 bytes: 16 pixels of 4 bytes each. All four bytes of every
+// pixel are compared against the first pixel.
+// Returns 0 when any pixel differs. Otherwise returns 0x02000000 combined with
+// the top five bits of src[0], src[1] and src[2], placed at bits 16..23,
+// 8..15 and 0..7 respectively.
+// NOTE(review): the caller presumably treats a non-zero result as a finished
+// encoding of the block - confirm at the call site.
+static etcpak_force_inline uint64_t CheckSolid( const uint8_t* src )
+{
+#ifdef __SSE4_1__
+    __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0);
+    __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1);
+    __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2);
+    __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3);
+
+    // Broadcast the first pixel to all four 32-bit lanes.
+    __m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0));
+
+    __m128i c0 = _mm_cmpeq_epi8(d0, c);
+    __m128i c1 = _mm_cmpeq_epi8(d1, c);
+    __m128i c2 = _mm_cmpeq_epi8(d2, c);
+    __m128i c3 = _mm_cmpeq_epi8(d3, c);
+
+    __m128i m0 = _mm_and_si128(c0, c1);
+    __m128i m1 = _mm_and_si128(c2, c3);
+    __m128i m = _mm_and_si128(m0, m1);
+
+    // m is all ones only if every byte of every row matched.
+    if (!_mm_testc_si128(m, _mm_set1_epi32(-1)))
+    {
+        return 0;
+    }
+#elif defined __ARM_NEON
+    int32x4_t d0 = vld1q_s32((int32_t*)src +  0);
+    int32x4_t d1 = vld1q_s32((int32_t*)src +  4);
+    int32x4_t d2 = vld1q_s32((int32_t*)src +  8);
+    int32x4_t d3 = vld1q_s32((int32_t*)src + 12);
+
+    // Broadcast the first pixel to all four 32-bit lanes.
+    int32x4_t c = vdupq_n_s32(d0[0]);
+
+    int32x4_t c0 = vreinterpretq_s32_u32(vceqq_s32(d0, c));
+    int32x4_t c1 = vreinterpretq_s32_u32(vceqq_s32(d1, c));
+    int32x4_t c2 = vreinterpretq_s32_u32(vceqq_s32(d2, c));
+    int32x4_t c3 = vreinterpretq_s32_u32(vceqq_s32(d3, c));
+
+    int32x4_t m0 = vandq_s32(c0, c1);
+    int32x4_t m1 = vandq_s32(c2, c3);
+    int64x2_t m = vreinterpretq_s64_s32(vandq_s32(m0, m1));
+
+    // Both 64-bit halves must be all ones for the block to be solid.
+    if (m[0] != -1 || m[1] != -1)
+    {
+        return 0;
+    }
+#else
+    // Scalar fallback: compare pixels 1..15 against pixel 0.
+    const uint8_t* ptr = src + 4;
+    for( int i=1; i<16; i++ )
+    {
+        if( memcmp( src, ptr, 4 ) != 0 )
+        {
+            return 0;
+        }
+        ptr += 4;
+    }
+#endif
+    return 0x02000000 |
+        ( (unsigned int)( src[0] & 0xF8 ) << 16 ) |
+        ( (unsigned int)( src[1] & 0xF8 ) << 8 ) |
+        ( (unsigned int)( src[2] & 0xF8 ) );
+}
+
+static etcpak_force_inline void PrepareAverages( v4i a[8], const uint8_t* src, unsigned int err[4] )
+{
+    Average( src, a );
+    ProcessAverages( a );
+
+    unsigned int errblock[4][4];
+    CalcErrorBlock( src, errblock );
+
+    for( int i=0; i<4; i++ )
+    {
+        err[i/2] += CalcError( errblock[i], a[i] );
+        err[2+i/2] += CalcError( errblock[i], a[i+4] );
+    }
+}
+
+// Reference selector search with 64-bit error accumulators.
+// For each of the 16 pixels in data (4 bytes per pixel, read as b, g, r; the
+// fourth byte is skipped):
+//   * id[i] selects the candidate color a[id[i]]; its parity selects which row
+//     of terr is accumulated into.
+//   * the per-channel difference to the pixel is collapsed into one weighted
+//     scalar (weights 77 / 151 / 28 for r / g / b).
+//   * for each of the 8 modifier tables, the best of its four entries is
+//     chosen; the entry index is written to tsel[i][t] and the squared error is
+//     added to terr[id[i] % 2][t].
+// terr is only added to, never cleared, so the caller must initialize it.
+// NOTE(review): in the SIMD paths each g_table256_SIMD / g_table256_NEON vector
+// presumably holds one modifier magnitude for four tables, with the negated
+// value covering entries 2 and 3 - confirm against the table definitions.
+static etcpak_force_inline void FindBestFit( uint64_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data )
+{
+    for( size_t i=0; i<16; i++ )
+    {
+        uint16_t* sel = tsel[i];
+        unsigned int bid = id[i];
+        uint64_t* ter = terr[bid%2];
+
+        uint8_t b = *data++;
+        uint8_t g = *data++;
+        uint8_t r = *data++;
+        data++;
+
+        int dr = a[bid][0] - r;
+        int dg = a[bid][1] - g;
+        int db = a[bid][2] - b;
+
+#ifdef __SSE4_1__
+        // Reference implementation
+
+        __m128i pix = _mm_set1_epi32(dr * 77 + dg * 151 + db * 28);
+        // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
+        __m128i error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[0]));
+        __m128i error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[1]));
+        __m128i error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[0]));
+        __m128i error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[1]));
+
+        // index0 is 0 or 1: the better of the two added candidates.
+        __m128i index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
+        __m128i minError0 = _mm_min_epi32(error0, error1);
+
+        // The compare yields 0 or -1, so index1 is 2 or 3: the better of the two subtracted candidates.
+        __m128i index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
+        __m128i minError1 = _mm_min_epi32(error2, error3);
+
+        __m128i minIndex0 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
+        __m128i minError = _mm_min_epi32(minError0, minError1);
+
+        // Squaring the minimum error to produce correct values when adding
+        // _mm_mul_epi32 multiplies the even 32-bit lanes into 64-bit products, hence the lane duplication.
+        __m128i minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
+        __m128i squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
+        squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
+        _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
+        __m128i minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
+        __m128i squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
+        squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
+        _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
+
+        // Second half: same procedure with table vectors 2 and 3, accumulated into ter[4..7].
+        // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
+        error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[2]));
+        error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[3]));
+        error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[2]));
+        error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[3]));
+
+        index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
+        minError0 = _mm_min_epi32(error0, error1);
+
+        index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
+        minError1 = _mm_min_epi32(error2, error3);
+
+        __m128i minIndex1 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
+        minError = _mm_min_epi32(minError0, minError1);
+
+        // Squaring the minimum error to produce correct values when adding
+        minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
+        squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
+        squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 2));
+        _mm_storeu_si128(((__m128i*)ter) + 2, squareErrorLow);
+        minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
+        squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
+        squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 3));
+        _mm_storeu_si128(((__m128i*)ter) + 3, squareErrorHigh);
+        // Narrow the eight 32-bit indices to 16 bits and store them as this pixel's selectors.
+        __m128i minIndex = _mm_packs_epi32(minIndex0, minIndex1);
+        _mm_storeu_si128((__m128i*)sel, minIndex);
+#elif defined __ARM_NEON
+        int32x4_t pix = vdupq_n_s32(dr * 77 + dg * 151 + db * 28);
+
+        // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
+        uint32x4_t error0 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[0])));
+        uint32x4_t error1 = vreinterpretq_u32_s32(vabsq_s32(vaddq_s32(pix, g_table256_NEON[1])));
+        uint32x4_t error2 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[0])));
+        uint32x4_t error3 = vreinterpretq_u32_s32(vabsq_s32(vsubq_s32(pix, g_table256_NEON[1])));
+
+        uint32x4_t index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1));
+        uint32x4_t minError0 = vminq_u32(error0, error1);
+
+        // The compare mask reinterpreted as signed is 0 or -1, so index1 is 2 or 3.
+        uint32x4_t index1 = vreinterpretq_u32_s32(vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2))));
+        uint32x4_t minError1 = vminq_u32(error2, error3);
+
+        // Manual blend: take index1 where blendMask is set, index0 elsewhere.
+        uint32x4_t blendMask = vcltq_u32(minError1, minError0);
+        uint32x4_t minIndex0 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask));
+        uint32x4_t minError = vminq_u32(minError0, minError1);
+
+        // Squaring the minimum error to produce correct values when adding
+        // Low and high 32-bit halves of each square are computed separately and zipped into 64-bit values.
+        uint32x4_t squareErrorLow = vmulq_u32(minError, minError);
+        uint32x4_t squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32(vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError))), 1);
+        uint32x4x2_t squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh);
+        uint64x2x2_t squareError = { vreinterpretq_u64_u32(squareErrorZip.val[0]), vreinterpretq_u64_u32(squareErrorZip.val[1]) };
+        squareError.val[0] = vaddq_u64(squareError.val[0], vld1q_u64(ter + 0));
+        squareError.val[1] = vaddq_u64(squareError.val[1], vld1q_u64(ter + 2));
+        vst1q_u64(ter + 0, squareError.val[0]);
+        vst1q_u64(ter + 2, squareError.val[1]);
+
+        // Second half: same procedure with table vectors 2 and 3, accumulated into ter[4..7].
+        // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
+        error0 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[2])));
+        error1 = vreinterpretq_u32_s32( vabsq_s32(vaddq_s32(pix, g_table256_NEON[3])));
+        error2 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[2])));
+        error3 = vreinterpretq_u32_s32( vabsq_s32(vsubq_s32(pix, g_table256_NEON[3])));
+
+        index0 = vandq_u32(vcltq_u32(error1, error0), vdupq_n_u32(1));
+        minError0 = vminq_u32(error0, error1);
+
+        index1 = vreinterpretq_u32_s32( vsubq_s32(vdupq_n_s32(2), vreinterpretq_s32_u32(vcltq_u32(error3, error2))) );
+        minError1 = vminq_u32(error2, error3);
+
+        blendMask = vcltq_u32(minError1, minError0);
+        uint32x4_t minIndex1 = vorrq_u32(vbicq_u32(index0, blendMask), vandq_u32(index1, blendMask));
+        minError = vminq_u32(minError0, minError1);
+
+        // Squaring the minimum error to produce correct values when adding
+        squareErrorLow = vmulq_u32(minError, minError);
+        squareErrorHigh = vshrq_n_u32(vreinterpretq_u32_s32( vqdmulhq_s32(vreinterpretq_s32_u32(minError), vreinterpretq_s32_u32(minError)) ), 1 );
+        squareErrorZip = vzipq_u32(squareErrorLow, squareErrorHigh);
+        squareError.val[0] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[0] ), vld1q_u64(ter + 4));
+        squareError.val[1] = vaddq_u64(vreinterpretq_u64_u32( squareErrorZip.val[1] ), vld1q_u64(ter + 6));
+        vst1q_u64(ter + 4, squareError.val[0]);
+        vst1q_u64(ter + 6, squareError.val[1]);
+
+        // Narrow the eight 32-bit indices to 16 bits and store them as this pixel's selectors.
+        uint16x8_t minIndex = vcombine_u16(vqmovn_u32(minIndex0), vqmovn_u32(minIndex1));
+        vst1q_u16(sel, minIndex);
+#else
+        // Scalar fallback: exhaustive search over the 4 entries of each of the 8 tables.
+        int pix = dr * 77 + dg * 151 + db * 28;
+
+        for( int t=0; t<8; t++ )
+        {
+            const int64_t* tab = g_table256[t];
+            unsigned int idx = 0;
+            uint64_t err = sq( tab[0] + pix );
+            for( int j=1; j<4; j++ )
+            {
+                uint64_t local = sq( tab[j] + pix );
+                // Strict comparison: on a tie the lowest entry index wins.
+                if( local < err )
+                {
+                    err = local;
+                    idx = j;
+                }
+            }
+            *sel++ = idx;
+            *ter++ += err;
+        }
+#endif
+    }
+}
+
+#if defined __SSE4_1__ || defined __ARM_NEON
+// Non-reference implementation, but faster. Produces same results as the AVX2 version
+// Overload with 32-bit error accumulators, available only on SIMD builds.
+// Same per-pixel walk as the 64-bit overload (16 pixels, 4 bytes each, read as
+// b, g, r with the fourth byte skipped), but the weighted difference uses
+// halved weights (38 / 76 / 14) so that all eight tables fit into one vector
+// of 16-bit lanes. The best-entry index for each table is written to tsel[i]
+// and the squared error is added to terr[id[i] % 2]. terr is only added to,
+// never cleared, so the caller must initialize it.
+// NOTE(review): the stored selector lanes also carry bits above bit 1
+// (0xFFFE / 0xFFFC patterns on SSE, 0xFFFE / 0 on NEON); presumably the
+// consumer masks to the low two bits - confirm at the call site.
+static etcpak_force_inline void FindBestFit( uint32_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data )
+{
+    for( size_t i=0; i<16; i++ )
+    {
+        uint16_t* sel = tsel[i];
+        unsigned int bid = id[i];
+        uint32_t* ter = terr[bid%2];
+
+        uint8_t b = *data++;
+        uint8_t g = *data++;
+        uint8_t r = *data++;
+        data++;
+
+        int dr = a[bid][0] - r;
+        int dg = a[bid][1] - g;
+        int db = a[bid][2] - b;
+
+#ifdef __SSE4_1__
+        // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
+        // This produces slightly different results, but is significantly faster
+        __m128i pixel = _mm_set1_epi16(dr * 38 + dg * 76 + db * 14);
+        __m128i pix = _mm_abs_epi16(pixel);
+
+        // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
+        // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
+        __m128i error0 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[0]));
+        __m128i error1 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[1]));
+
+        // index is 0 or 1: which of the two magnitudes is closer.
+        __m128i index = _mm_and_si128(_mm_cmplt_epi16(error1, error0), _mm_set1_epi16(1));
+        __m128i minError = _mm_min_epi16(error0, error1);
+
+        // Exploiting symmetry of the selector table and use the sign bit
+        // This produces slightly different results, but is needed to produce same results as AVX2 implementation
+        // Doubling indexBit puts the inverted sign of pixel into bit 1 of the selector.
+        __m128i indexBit = _mm_andnot_si128(_mm_srli_epi16(pixel, 15), _mm_set1_epi8(-1));
+        __m128i minIndex = _mm_or_si128(index, _mm_add_epi16(indexBit, indexBit));
+
+        // Squaring the minimum error to produce correct values when adding
+        __m128i squareErrorLo = _mm_mullo_epi16(minError, minError);
+        __m128i squareErrorHi = _mm_mulhi_epi16(minError, minError);
+
+        // Interleave the low and high 16-bit product halves into full 32-bit squares.
+        __m128i squareErrorLow = _mm_unpacklo_epi16(squareErrorLo, squareErrorHi);
+        __m128i squareErrorHigh = _mm_unpackhi_epi16(squareErrorLo, squareErrorHi);
+
+        squareErrorLow = _mm_add_epi32(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
+        _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
+        squareErrorHigh = _mm_add_epi32(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
+        _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
+
+        _mm_storeu_si128((__m128i*)sel, minIndex);
+#elif defined __ARM_NEON
+        // Same algorithm as the SSE4.1 branch, using halved weights and 16-bit lanes.
+        int16x8_t pixel = vdupq_n_s16( dr * 38 + dg * 76 + db * 14 );
+        int16x8_t pix = vabsq_s16( pixel );
+
+        int16x8_t error0 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[0] ) );
+        int16x8_t error1 = vabsq_s16( vsubq_s16( pix, g_table128_NEON[1] ) );
+
+        int16x8_t index = vandq_s16( vreinterpretq_s16_u16( vcltq_s16( error1, error0 ) ), vdupq_n_s16( 1 ) );
+        int16x8_t minError = vminq_s16( error0, error1 );
+
+        // Arithmetic shift gives 0 or -1 per lane; inverted and doubled, bit 1 is set when pixel is non-negative.
+        int16x8_t indexBit = vandq_s16( vmvnq_s16( vshrq_n_s16( pixel, 15 ) ), vdupq_n_s16( -1 ) );
+        int16x8_t minIndex = vorrq_s16( index, vaddq_s16( indexBit, indexBit ) );
+
+        int16x4_t minErrorLow = vget_low_s16( minError );
+        int16x4_t minErrorHigh = vget_high_s16( minError );
+
+        // Widening multiply yields the 32-bit squares directly.
+        int32x4_t squareErrorLow = vmull_s16( minErrorLow, minErrorLow );
+        int32x4_t squareErrorHigh = vmull_s16( minErrorHigh, minErrorHigh );
+
+        int32x4_t squareErrorSumLow = vaddq_s32( squareErrorLow, vld1q_s32( (int32_t*)ter ) );
+        int32x4_t squareErrorSumHigh = vaddq_s32( squareErrorHigh, vld1q_s32( (int32_t*)ter + 4 ) );
+
+        vst1q_s32( (int32_t*)ter, squareErrorSumLow );
+        vst1q_s32( (int32_t*)ter + 4, squareErrorSumHigh );
+
+        vst1q_s16( (int16_t*)sel, minIndex );
+#endif
+    }
+}
+#endif
+
+// Quantizes a float color value to a 6-bit code.
+// The value is truncated to int, clamped to [0, 1023], offset by -15 and
+// halved, then reduced by an integer rounding expression and a shift by 3.
+static etcpak_force_inline uint8_t convert6(float f)
+{
+    int clamped = static_cast<int>(f);
+    if (clamped < 0)
+    {
+        clamped = 0;
+    }
+    else if (clamped > 1023)
+    {
+        clamped = 1023;
+    }
+
+    const int half = (clamped - 15) >> 1;
+    const int biasedHi = half + 11;
+    const int biasedLo = half + 4;
+
+    return (biasedHi - (biasedHi >> 7) - (biasedLo >> 7)) >> 3;
+}
+
+// Quantizes a float color value to a 7-bit code.
+// The value is truncated to int, clamped to [0, 1023], offset by -15 and
+// halved, then reduced by an integer rounding expression and a shift by 2.
+static etcpak_force_inline uint8_t convert7(float f)
+{
+    int clamped = static_cast<int>(f);
+    if (clamped < 0)
+    {
+        clamped = 0;
+    }
+    else if (clamped > 1023)
+    {
+        clamped = 1023;
+    }
+
+    const int half = (clamped - 15) >> 1;
+    const int biasedHi = half + 9;
+    const int biasedLo = half + 6;
+
+    return (biasedHi - (biasedHi >> 8) - (biasedLo >> 8)) >> 2;
+}
+
+// Scalar planar-fit encoder for one 4x4 block.
+// src points to 16 pixels of 4 bytes each, read as b, g, r; the fourth byte is
+// ignored. The function fits a plane per channel through the block, quantizes
+// three corner colors (named O, H and V below) to 6/7/6 bits, measures the
+// error of the reconstructed block against the source and packs the colors
+// into a 64-bit word.
+// Returns { packed block bits, accumulated squared weighted error }.
+// NOTE(review): the bit layout presumably follows the ETC2 planar mode -
+// confirm against the format specification.
+static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar(const uint8_t* src)
+{
+    // Per-channel sums over all 16 pixels.
+    int32_t r = 0;
+    int32_t g = 0;
+    int32_t b = 0;
+
+    for (int i = 0; i < 16; ++i)
+    {
+        b += src[i * 4 + 0];
+        g += src[i * 4 + 1];
+        r += src[i * 4 + 2];
+    }
+
+    int32_t difRyz = 0;
+    int32_t difGyz = 0;
+    int32_t difByz = 0;
+    int32_t difRxz = 0;
+    int32_t difGxz = 0;
+    int32_t difBxz = 0;
+
+    // Position weights for the four steps along each axis.
+    const int32_t scaling[] = { -255, -85, 85, 255 };
+
+    for (int i = 0; i < 16; ++i)
+    {
+        // 16 * pixel - sum: the pixel's deviation from the mean, scaled by 16.
+        int32_t difB = (static_cast<int>(src[i * 4 + 0]) << 4) - b;
+        int32_t difG = (static_cast<int>(src[i * 4 + 1]) << 4) - g;
+        int32_t difR = (static_cast<int>(src[i * 4 + 2]) << 4) - r;
+
+        // "yz" terms weight by i % 4, "xz" terms by i / 4.
+        difRyz += difR * scaling[i % 4];
+        difGyz += difG * scaling[i % 4];
+        difByz += difB * scaling[i % 4];
+
+        difRxz += difR * scaling[i / 4];
+        difGxz += difG * scaling[i / 4];
+        difBxz += difB * scaling[i / 4];
+    }
+
+    // Normalization for the weighted sums above; the sign is folded into the scale.
+    const float scale = -4.0f / ((255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f);
+
+    float aR = difRxz * scale;
+    float aG = difGxz * scale;
+    float aB = difBxz * scale;
+
+    float bR = difRyz * scale;
+    float bG = difGyz * scale;
+    float bB = difByz * scale;
+
+    // Channel sum scaled by 4/16.
+    float dR = r * (4.0f / 16.0f);
+    float dG = g * (4.0f / 16.0f);
+    float dB = b * (4.0f / 16.0f);
+
+    // calculating the three colors RGBO, RGBH, and RGBV.  RGB = df - af * x - bf * y;
+    float cofR = std::fma(aR,  255.0f, std::fma(bR,  255.0f, dR));
+    float cofG = std::fma(aG,  255.0f, std::fma(bG,  255.0f, dG));
+    float cofB = std::fma(aB,  255.0f, std::fma(bB,  255.0f, dB));
+    float chfR = std::fma(aR, -425.0f, std::fma(bR,  255.0f, dR));
+    float chfG = std::fma(aG, -425.0f, std::fma(bG,  255.0f, dG));
+    float chfB = std::fma(aB, -425.0f, std::fma(bB,  255.0f, dB));
+    float cvfR = std::fma(aR,  255.0f, std::fma(bR, -425.0f, dR));
+    float cvfG = std::fma(aG,  255.0f, std::fma(bG, -425.0f, dG));
+    float cvfB = std::fma(aB,  255.0f, std::fma(bB, -425.0f, dB));
+
+    // convert to r6g7b6
+    int32_t coR = convert6(cofR);
+    int32_t coG = convert7(cofG);
+    int32_t coB = convert6(cofB);
+    int32_t chR = convert6(chfR);
+    int32_t chG = convert7(chfG);
+    int32_t chB = convert6(chfB);
+    int32_t cvR = convert6(cvfR);
+    int32_t cvG = convert7(cvfG);
+    int32_t cvB = convert6(cvfB);
+
+    // Error calculation
+    // *1 values: quantized colors expanded back to 8 bits by bit replication.
+    // *o2 values: origin color times 4 plus a rounding bias of 2.
+    auto ro0 = coR;
+    auto go0 = coG;
+    auto bo0 = coB;
+    auto ro1 = (ro0 >> 4) | (ro0 << 2);
+    auto go1 = (go0 >> 6) | (go0 << 1);
+    auto bo1 = (bo0 >> 4) | (bo0 << 2);
+    auto ro2 = (ro1 << 2) + 2;
+    auto go2 = (go1 << 2) + 2;
+    auto bo2 = (bo1 << 2) + 2;
+
+    auto rh0 = chR;
+    auto gh0 = chG;
+    auto bh0 = chB;
+    auto rh1 = (rh0 >> 4) | (rh0 << 2);
+    auto gh1 = (gh0 >> 6) | (gh0 << 1);
+    auto bh1 = (bh0 >> 4) | (bh0 << 2);
+
+    // Per-step delta from the O color towards the H color.
+    auto rh2 = rh1 - ro1;
+    auto gh2 = gh1 - go1;
+    auto bh2 = bh1 - bo1;
+
+    auto rv0 = cvR;
+    auto gv0 = cvG;
+    auto bv0 = cvB;
+    auto rv1 = (rv0 >> 4) | (rv0 << 2);
+    auto gv1 = (gv0 >> 6) | (gv0 << 1);
+    auto bv1 = (bv0 >> 4) | (bv0 << 2);
+
+    // Per-step delta from the O color towards the V color.
+    auto rv2 = rv1 - ro1;
+    auto gv2 = gv1 - go1;
+    auto bv2 = bv1 - bo1;
+
+    uint64_t error = 0;
+
+    for (int i = 0; i < 16; ++i)
+    {
+        // Reconstruct the pixel from the quantized plane and clamp to 8 bits.
+        int32_t cR = clampu8((rh2 * (i / 4) + rv2 * (i % 4) + ro2) >> 2);
+        int32_t cG = clampu8((gh2 * (i / 4) + gv2 * (i % 4) + go2) >> 2);
+        int32_t cB = clampu8((bh2 * (i / 4) + bv2 * (i % 4) + bo2) >> 2);
+
+        int32_t difB = static_cast<int>(src[i * 4 + 0]) - cB;
+        int32_t difG = static_cast<int>(src[i * 4 + 1]) - cG;
+        int32_t difR = static_cast<int>(src[i * 4 + 2]) - cR;
+
+        // Same halved channel weights (38 / 76 / 14) as the fast FindBestFit overload.
+        int32_t dif = difR * 38 + difG * 76 + difB * 14;
+
+        error += dif * dif;
+    }
+
+    // Pack the quantized O, H and V colors into the two 32-bit halves of the block.
+    uint32_t rgbv = cvB | (cvG << 6) | (cvR << 13);
+    uint32_t rgbh = chB | (chG << 6) | (chR << 13);
+    uint32_t hi = rgbv | ((rgbh & 0x1FFF) << 19);
+    uint32_t lo = (chR & 0x1) | 0x2 | ((chR << 1) & 0x7C);
+    lo |= ((coB & 0x07) <<  7) | ((coB & 0x18) <<  8) | ((coB & 0x20) << 11);
+    lo |= ((coG & 0x3F) << 17) | ((coG & 0x40) << 18);
+    lo |= coR << 25;
+
+    // NOTE(review): g_flags is defined elsewhere; presumably supplies the filler bits
+    // that steer the decoder into this mode for the given color bits - confirm.
+    const auto idx = (coR & 0x20) | ((coG & 0x20) >> 1) | ((coB & 0x1E) >> 1);
+
+    lo |= g_flags[idx];
+
+    // Each half is byte-swapped; lo ends up in the low 32 bits of the result, hi in the high 32 bits.
+    uint64_t result = static_cast<uint32_t>(_bswap(lo));
+    result |= static_cast<uint64_t>(static_cast<uint32_t>(_bswap(hi))) << 32;
+
+    return std::make_pair(result, error);
+}
+
+#ifdef __ARM_NEON
+
+// Weighted sum of 16 per-pixel differences where each group of four
+// consecutive lanes shares one weight (-255, -85, 85, 255 in order).
+// The scalar total is returned replicated in both lanes of the result.
+static etcpak_force_inline int32x2_t Planar_NEON_DifXZ( int16x8_t dif_lo, int16x8_t dif_hi )
+{
+    const int32x4_t group0 = vmull_n_s16( vget_low_s16( dif_lo ), -255 );
+    const int32x4_t group1 = vmull_n_s16( vget_high_s16( dif_lo ), -85 );
+    const int32x4_t group2 = vmull_n_s16( vget_low_s16( dif_hi ), 85 );
+    const int32x4_t group3 = vmull_n_s16( vget_high_s16( dif_hi ), 255 );
+
+    const int32x4_t lowerHalf = vaddq_s32( group0, group1 );
+    const int32x4_t upperHalf = vaddq_s32( group2, group3 );
+    const int32x4_t total = vaddq_s32( lowerHalf, upperHalf );
+
+#ifdef __aarch64__
+    // AArch64 has a single-instruction horizontal add.
+    return vdup_n_s32( vaddvq_s32( total ) );
+#else
+    // Two pairwise adds fold the four lanes into one sum present in both lanes.
+    const int32x2_t folded = vpadd_s32( vget_low_s32( total ), vget_high_s32( total ) );
+    return vpadd_s32( folded, folded );
+#endif
+}
+
+// Weighted sum of 16 per-pixel differences where the weight cycles through
+// -255, -85, 85, 255 within every group of four consecutive lanes.
+// The scalar total is returned replicated in both lanes of the result.
+static etcpak_force_inline int32x2_t Planar_NEON_DifYZ( int16x8_t dif_lo, int16x8_t dif_hi )
+{
+    const int16x4_t weights = { -255, -85, 85, 255 };
+
+    const int32x4_t group0 = vmull_s16( vget_low_s16( dif_lo ), weights );
+    const int32x4_t group1 = vmull_s16( vget_high_s16( dif_lo ), weights );
+    const int32x4_t group2 = vmull_s16( vget_low_s16( dif_hi ), weights );
+    const int32x4_t group3 = vmull_s16( vget_high_s16( dif_hi ), weights );
+
+    const int32x4_t lowerHalf = vaddq_s32( group0, group1 );
+    const int32x4_t upperHalf = vaddq_s32( group2, group3 );
+    const int32x4_t total = vaddq_s32( lowerHalf, upperHalf );
+
+#ifdef __aarch64__
+    // AArch64 has a single-instruction horizontal add.
+    return vdup_n_s32( vaddvq_s32( total ) );
+#else
+    // Two pairwise adds fold the four lanes into one sum present in both lanes.
+    const int32x2_t folded = vpadd_s32( vget_low_s32( total ), vget_high_s32( total ) );
+    return vpadd_s32( folded, folded );
+#endif
+}
+
+// Sums all 16 bytes of src and returns the total broadcast to every 16-bit
+// lane of the result.
+static etcpak_force_inline int16x8_t Planar_NEON_SumWide( uint8x16_t src )
+{
+    // Pairwise widening add: 16 bytes become 8 sums of 16 bits each.
+    const uint16x8_t pairs = vpaddlq_u8( src );
+#ifdef __aarch64__
+    return vdupq_n_s16( vaddvq_u16( pairs ) );
+#else
+    // Three rounds of pairwise adds leave the full sum in every lane.
+    const uint16x4_t quads = vpadd_u16( vget_low_u16( pairs ), vget_high_u16( pairs ) );
+    const uint16x4_t octets = vpadd_u16( quads, quads );
+    const uint16x4_t all = vpadd_u16( octets, octets );
+    return vreinterpretq_s16_u16( vcombine_u16( all, all ) );
+#endif
+}
+
+// Vector counterpart of convert6(): quantizes eight 32-bit values (lo holds
+// lanes 0-3, hi holds lanes 4-7) to 6-bit codes in 16-bit lanes.
+static etcpak_force_inline int16x8_t convert6_NEON( int32x4_t lo, int32x4_t hi )
+{
+    // Saturating narrow to unsigned 16 bits removes negatives.
+    const uint16x8_t narrowed = vcombine_u16( vqmovun_s32( lo ), vqmovun_s32( hi ) );
+    // Saturating shift left then shift right by 6: clamp 0-1023
+    const uint16x8_t clamped = vshrq_n_u16( vqshlq_n_u16( narrowed, 6 ), 6 );
+    // Halving subtract computes (value - 15) >> 1 in one step.
+    const int16x8_t half = vhsubq_s16( vreinterpretq_s16_u16( clamped ), vdupq_n_s16( 15 ) );
+
+    const int16x8_t biasedHi = vaddq_s16( half, vdupq_n_s16( 11 ) );
+    const int16x8_t biasedLo = vaddq_s16( half, vdupq_n_s16( 4 ) );
+
+    const int16x8_t step1 = vsubq_s16( biasedHi, vshrq_n_s16( biasedHi, 7 ) );
+    const int16x8_t step2 = vsubq_s16( step1, vshrq_n_s16( biasedLo, 7 ) );
+    return vshrq_n_s16( step2, 3 );
+}
+
+// Vector counterpart of convert7(): quantizes four 32-bit values to 7-bit
+// codes in 16-bit lanes.
+static etcpak_force_inline int16x4_t convert7_NEON( int32x4_t x )
+{
+    // Saturating narrow to unsigned 16 bits removes negatives.
+    const uint16x4_t narrowed = vqmovun_s32( x );
+    // Saturating shift left then shift right by 6: clamp 0-1023
+    const uint16x4_t clamped = vshr_n_u16( vqshl_n_u16( narrowed, 6 ), 6 );
+    // Halving subtract computes (value - 15) >> 1 in one step.
+    const int16x4_t half = vhsub_s16( vreinterpret_s16_u16( clamped ), vdup_n_s16( 15 ) );
+
+    const int16x4_t biasedHi = vadd_s16( half, vdup_n_s16( 9 ) );
+    const int16x4_t biasedLo = vadd_s16( half, vdup_n_s16( 6 ) );
+
+    const int16x4_t step1 = vsub_s16( biasedHi, vshr_n_s16( biasedHi, 8 ) );
+    const int16x4_t step2 = vsub_s16( step1, vshr_n_s16( biasedLo, 8 ) );
+    return vshr_n_s16( step2, 2 );
+}
+
+static etcpak_force_inline std::pair<uint64_t, uint64_t> Planar_NEON( const uint8_t* src )
+{
+    uint8x16x4_t srcBlock = vld4q_u8( src );
+
+    int16x8_t bSumWide = Planar_NEON_SumWide( srcBlock.val[0] );
+    int16x8_t gSumWide = Planar_NEON_SumWide( srcBlock.val[1] );
+    int16x8_t rSumWide = Planar_NEON_SumWide( srcBlock.val[2] );
+
+    int16x8_t dif_R_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[2] ), 4) ), rSumWide );
+    int16x8_t dif_R_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[2] ), 4) ), rSumWide );
+
+    int16x8_t dif_G_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[1] ), 4 ) ), gSumWide );
+    int16x8_t dif_G_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[1] ), 4 ) ), gSumWide );
+
+    int16x8_t dif_B_lo = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_low_u8( srcBlock.val[0] ), 4) ), bSumWide );
+    int16x8_t dif_B_hi = vsubq_s16( vreinterpretq_s16_u16( vshll_n_u8( vget_high_u8( srcBlock.val[0] ), 4) ), bSumWide );
+
+    int32x2x2_t dif_xz_z = vzip_s32( vzip_s32( Planar_NEON_DifXZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifXZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifXZ( dif_G_lo, dif_G_hi ) );
+    int32x4_t dif_xz = vcombine_s32( dif_xz_z.val[0], dif_xz_z.val[1] );
+    int32x2x2_t dif_yz_z = vzip_s32( vzip_s32( Planar_NEON_DifYZ( dif_B_lo, dif_B_hi ), Planar_NEON_DifYZ( dif_R_lo, dif_R_hi ) ).val[0], Planar_NEON_DifYZ( dif_G_lo, dif_G_hi ) );
+    int32x4_t dif_yz = vcombine_s32( dif_yz_z.val[0], dif_yz_z.val[1] );
+
+    const float fscale = -4.0f / ( (255 * 255 * 8.0f + 85 * 85 * 8.0f ) * 16.0f );
+    float32x4_t fa = vmulq_n_f32( vcvtq_f32_s32( dif_xz ), fscale );
+    float32x4_t fb = vmulq_n_f32( vcvtq_f32_s32( dif_yz ), fscale );
+    int16x4_t bgrgSum = vzip_s16( vzip_s16( vget_low_s16( bSumWide ), vget_low_s16( rSumWide ) ).val[0], vget_low_s16( gSumWide ) ).val[0];
+    float32x4_t fd = vmulq_n_f32( vcvtq_f32_s32( vmovl_s16( bgrgSum ) ), 4.0f / 16.0f);
+
+    float32x4_t cof = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, 255.0f );
+    float32x4_t chf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, 255.0f ), fa, -425.0f );
+    float32x4_t cvf = vmlaq_n_f32( vmlaq_n_f32( fd, fb, -425.0f ), fa, 255.0f );
+
+    int32x4_t coi = vcvtq_s32_f32( cof );
+    int32x4_t chi = vcvtq_s32_f32( chf );
+    int32x4_t cvi = vcvtq_s32_f32( cvf );
+
+    int32x4x2_t tr_hv = vtrnq_s32( chi, cvi );
+    int32x4x2_t tr_o = vtrnq_s32( coi, coi );
+
+    int16x8_t c_hvoo_br_6 = convert6_NEON( tr_hv.val[0], tr_o.val[0] );
+    int16x4_t c_hvox_g_7 = convert7_NEON( vcombine_s32( vget_low_s32( tr_hv.val[1] ), vget_low_s32( tr_o.val[1] ) ) );
+    int16x8_t c_hvoo_br_8 = vorrq_s16( vshrq_n_s16( c_hvoo_br_6, 4 ), vshlq_n_s16( c_hvoo_br_6, 2 ) );
+    int16x4_t c_hvox_g_8 = vorr_s16( vshr_n_s16( c_hvox_g_7, 6 ), vshl_n_s16( c_hvox_g_7, 1 ) );
+
+    int16x4_t rec_gxbr_o = vext_s16( c_hvox_g_8, vget_high_s16( c_hvoo_br_8 ), 3 );
+
+    rec_gxbr_o = vadd_s16( vshl_n_s16( rec_gxbr_o, 2 ), vdup_n_s16( 2 ) );
+    int16x8_t rec_ro_wide = vdupq_lane_s16( rec_gxbr_o, 3 );
+    int16x8_t rec_go_wide = vdupq_lane_s16( rec_gxbr_o, 0 );
+    int16x8_t rec_bo_wide = vdupq_lane_s16( rec_gxbr_o, 1 );
+
+    int16x4_t br_hv2 = vsub_s16( vget_low_s16( c_hvoo_br_8 ), vget_high_s16( c_hvoo_br_8 ) );
+    int16x4_t gg_hv2 = vsub_s16( c_hvox_g_8, vdup_lane_s16( c_hvox_g_8, 2 ) );
+
+    int16x8_t scaleh_lo = { 0, 0, 0, 0, 1, 1, 1, 1 };
+    int16x8_t scaleh_hi = { 2, 2, 2, 2, 3, 3, 3, 3 };
+    int16x8_t scalev = { 0, 1, 2, 3, 0, 1, 2, 3 };
+
+    int16x8_t rec_r_1 = vmlaq_lane_s16( rec_ro_wide, scalev, br_hv2, 3 );
+    int16x8_t rec_r_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_lo, br_hv2, 2 ), 2 ) ) );
+    int16x8_t rec_r_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_r_1, scaleh_hi, br_hv2, 2 ), 2 ) ) );
+
+    int16x8_t rec_b_1 = vmlaq_lane_s16( rec_bo_wide, scalev, br_hv2, 1 );
+    int16x8_t rec_b_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_lo, br_hv2, 0 ), 2 ) ) );
+    int16x8_t rec_b_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_b_1, scaleh_hi, br_hv2, 0 ), 2 ) ) );
+
+    int16x8_t rec_g_1 = vmlaq_lane_s16( rec_go_wide, scalev, gg_hv2, 1 );
+    int16x8_t rec_g_lo = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_lo, gg_hv2, 0 ), 2 ) ) );
+    int16x8_t rec_g_hi = vreinterpretq_s16_u16( vmovl_u8( vqshrun_n_s16( vmlaq_lane_s16( rec_g_1, scaleh_hi, gg_hv2, 0 ), 2 ) ) );
+
+    int16x8_t dif_r_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[2] ) ) ), rec_r_lo );
+    int16x8_t dif_r_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[2] ) ) ), rec_r_hi );
+
+    int16x8_t dif_g_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[1] ) ) ), rec_g_lo );
+    int16x8_t dif_g_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[1] ) ) ), rec_g_hi );
+
+    int16x8_t dif_b_lo = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_low_u8( srcBlock.val[0] ) ) ), rec_b_lo );
+    int16x8_t dif_b_hi = vsubq_s16( vreinterpretq_s16_u16( vmovl_u8( vget_high_u8( srcBlock.val[0] ) ) ), rec_b_hi );
+
+    int16x8_t dif_lo = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_lo, 38 ), dif_g_lo, 76 ), dif_b_lo, 14 );
+    int16x8_t dif_hi = vmlaq_n_s16( vmlaq_n_s16( vmulq_n_s16( dif_r_hi, 38 ), dif_g_hi, 76 ), dif_b_hi, 14 );
+
+    int16x4_t tmpDif = vget_low_s16( dif_lo );
+    int32x4_t difsq_0 = vmull_s16( tmpDif, tmpDif );
+    tmpDif = vget_high_s16( dif_lo );
+    int32x4_t difsq_1 = vmull_s16( tmpDif, tmpDif );
+    tmpDif = vget_low_s16( dif_hi );
+    int32x4_t difsq_2 = vmull_s16( tmpDif, tmpDif );
+    tmpDif = vget_high_s16( dif_hi );
+    int32x4_t difsq_3 = vmull_s16( tmpDif, tmpDif );
+
+    uint32x4_t difsq_5 = vaddq_u32( vreinterpretq_u32_s32( difsq_0 ), vreinterpretq_u32_s32( difsq_1 ) );
+    uint32x4_t difsq_6 = vaddq_u32( vreinterpretq_u32_s32( difsq_2 ), vreinterpretq_u32_s32( difsq_3) );
+
+    uint64x2_t difsq_7 = vaddl_u32( vget_low_u32( difsq_5 ), vget_high_u32( difsq_5 ) );
+    uint64x2_t difsq_8 = vaddl_u32( vget_low_u32( difsq_6 ), vget_high_u32( difsq_6 ) );
+
+    uint64x2_t difsq_9 = vaddq_u64( difsq_7, difsq_8 );
+
+#ifdef __aarch64__
+    uint64_t error = vaddvq_u64( difsq_9 );
+#else
+    uint64_t error = vgetq_lane_u64( difsq_9, 0 ) + vgetq_lane_u64( difsq_9, 1 );
+#endif
+
+    int32_t coR = c_hvoo_br_6[6];
+    int32_t coG = c_hvox_g_7[2];
+    int32_t coB = c_hvoo_br_6[4];
+
+    int32_t chR = c_hvoo_br_6[2];
+    int32_t chG = c_hvox_g_7[0];
+    int32_t chB = c_hvoo_br_6[0];
+
+    int32_t cvR = c_hvoo_br_6[3];
+    int32_t cvG = c_hvox_g_7[1];
+    int32_t cvB = c_hvoo_br_6[1];
+
+    uint32_t rgbv = cvB | ( cvG << 6 ) | ( cvR << 13 );
+    uint32_t rgbh = chB | ( chG << 6 ) | ( chR << 13 );
+    uint32_t hi = rgbv | ( ( rgbh & 0x1FFF ) << 19 );
+    uint32_t lo = ( chR & 0x1 ) | 0x2 | ( ( chR << 1 ) & 0x7C );
+    lo |= ( ( coB & 0x07 ) << 7 ) | ( ( coB & 0x18 ) << 8 ) | ( ( coB & 0x20 ) << 11 );
+    lo |= ( ( coG & 0x3F) << 17) | ( (coG & 0x40 ) << 18 );
+    lo |= coR << 25;
+
+    const auto idx = ( coR & 0x20 ) | ( ( coG & 0x20 ) >> 1 ) | ( ( coB & 0x1E ) >> 1 );
+
+    lo |= g_flags[idx];
+
+    uint64_t result = static_cast<uint32_t>( _bswap(lo) );
+    result |= static_cast<uint64_t>( static_cast<uint32_t>( _bswap( hi ) ) ) << 32;
+
+    return std::make_pair( result, error );
+}
+
+#endif
+
+// Chooses between the table-based encoding being assembled in `d` and an
+// already finished alternative block `value` whose error is `error`.
+// If the terr[0] and terr[1] entries picked by GetLeastError add up to `error`
+// or more, `value` is returned untouched; otherwise both table indices and the
+// 16 selectors are packed into `d`, which is returned through FixByteOrder.
+template<class T, class S>
+static etcpak_force_inline uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id, const uint64_t value, const uint64_t error)
+{
+    const size_t best0 = GetLeastError( terr[0], 8 );
+    const size_t best1 = GetLeastError( terr[1], 8 );
+    if( !( ( terr[0][best0] + terr[1][best1] ) < error ) ) return value;
+
+    const size_t best[2] = { best0, best1 };
+    d |= ( best0 << 26 ) | ( best1 << 29 );
+    for( int px = 0; px < 16; ++px )
+    {
+        // Selector bit 0 lands at bit px + 32, selector bit 1 at bit px + 48.
+        const uint64_t sel = tsel[px][best[id[px] % 2]];
+        d |= ( ( sel & 0x1 ) << ( px + 32 ) ) | ( ( sel & 0x2 ) << ( px + 47 ) );
+    }
+
+    return FixByteOrder( d );
+}
+
+}
+
+// Encodes one block into a 64-bit word; `src` is forwarded unchanged to every
+// helper. A non-zero solid-check result is returned as the finished block.
+static etcpak_force_inline uint64_t ProcessRGB( const uint8_t* src )
+{
+#ifdef __AVX2__
+    uint64_t d = CheckSolid_AVX2( src );
+    if( d != 0 ) return d;
+
+    alignas(32) v4i avg[8];
+    const __m128i errs = PrepareAverages_AVX2( avg, src );
+
+    // Reduce the four 32-bit errors to their minimum, broadcast in every lane.
+    const __m128i swapped = _mm_shuffle_epi32( errs, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+    const __m128i pairMin = _mm_min_epu32( errs, swapped );
+    const __m128i crossed = _mm_shuffle_epi32( pairMin, _MM_SHUFFLE( 1, 0, 3, 2 ) );
+    const __m128i allMin = _mm_min_epu32( crossed, pairMin );
+
+    // movemask yields four bits per 32-bit lane, so the first set bit / 4 is
+    // the lowest lane that holds the minimum.
+    const uint32_t laneBits = _mm_movemask_epi8( _mm_cmpeq_epi32( allMin, errs ) );
+    const uint32_t idx = _bit_scan_forward( laneBits ) >> 2;
+
+    d |= EncodeAverages_AVX2( avg, idx );
+
+    alignas(32) uint32_t terr[2][8] = {};
+    alignas(32) uint32_t tsel[8];
+    switch( idx )
+    {
+    case 0:
+    case 2:
+        FindBestFit_4x2_AVX2( terr, tsel, avg, idx * 2, src );
+        break;
+    default:
+        FindBestFit_2x4_AVX2( terr, tsel, avg, idx * 2, src );
+        break;
+    }
+
+    return EncodeSelectors_AVX2( d, terr, tsel, ( idx % 2 ) == 1 );
+#else
+    uint64_t d = CheckSolid( src );
+    if( d != 0 ) return d;
+
+    v4i avg[8];
+    unsigned int candidateErr[4] = {};
+    PrepareAverages( avg, src, candidateErr );
+    const size_t idx = GetLeastError( candidateErr, 4 );
+    EncodeAverages( d, avg, idx );
+
+#if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION
+    uint32_t terr[2][8] = {};
+#else
+    uint64_t terr[2][8] = {};
+#endif
+    uint16_t tsel[16][8];
+    const auto id = g_id[idx];
+    FindBestFit( terr, tsel, avg, id, src );
+
+    return FixByteOrder( EncodeSelectors( d, terr, tsel, id ) );
+#endif
+}
+
+// Encodes one block: solid check, averages and table fit as in ProcessRGB, plus
+// a planar candidate whose block and error are handed to the selector encoder.
+static etcpak_force_inline uint64_t ProcessRGB_ETC2( const uint8_t* src )
+{
+#ifdef __AVX2__
+    uint64_t d = CheckSolid_AVX2( src );
+    if( d != 0 ) return d;
+
+    auto planar = Planar_AVX2( src );
+
+    alignas(32) v4i avg[8];
+    const __m128i errs = PrepareAverages_AVX2( avg, planar.sum4 );
+
+    // Reduce the four 32-bit errors to their minimum, broadcast in every lane.
+    const __m128i swapped = _mm_shuffle_epi32( errs, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+    const __m128i pairMin = _mm_min_epu32( errs, swapped );
+    const __m128i crossed = _mm_shuffle_epi32( pairMin, _MM_SHUFFLE( 1, 0, 3, 2 ) );
+    const __m128i allMin = _mm_min_epu32( crossed, pairMin );
+
+    // Four mask bits per 32-bit lane: first set bit / 4 = lowest lane holding the minimum.
+    const uint32_t laneBits = _mm_movemask_epi8( _mm_cmpeq_epi32( allMin, errs ) );
+    const size_t idx = _bit_scan_forward( laneBits ) >> 2;
+
+    d = EncodeAverages_AVX2( avg, idx );
+
+    alignas(32) uint32_t terr[2][8] = {};
+    alignas(32) uint32_t tsel[8];
+    switch( idx )
+    {
+    case 0:
+    case 2:
+        FindBestFit_4x2_AVX2( terr, tsel, avg, idx * 2, src );
+        break;
+    default:
+        FindBestFit_2x4_AVX2( terr, tsel, avg, idx * 2, src );
+        break;
+    }
+
+    return EncodeSelectors_AVX2( d, terr, tsel, ( idx % 2 ) == 1, planar.plane, planar.error );
+#else
+    uint64_t d = CheckSolid( src );
+    if( d != 0 ) return d;
+
+#ifdef __ARM_NEON
+    auto planar = Planar_NEON( src );
+#else
+    auto planar = Planar( src );
+#endif
+
+    v4i avg[8];
+    unsigned int candidateErr[4] = {};
+    PrepareAverages( avg, src, candidateErr );
+    const size_t idx = GetLeastError( candidateErr, 4 );
+    EncodeAverages( d, avg, idx );
+
+#if ( defined __SSE4_1__ || defined __ARM_NEON ) && !defined REFERENCE_IMPLEMENTATION
+    uint32_t terr[2][8] = {};
+#else
+    uint64_t terr[2][8] = {};
+#endif
+    uint16_t tsel[16][8];
+    const auto id = g_id[idx];
+    FindBestFit( terr, tsel, avg, id, src );
+
+    // planar.first / planar.second feed EncodeSelectors' `value` and `error`.
+    return EncodeSelectors( d, terr, tsel, id, planar.first, planar.second );
+#endif
+}
+
+#ifdef __SSE4_1__
+// Broadcasts 16-bit element K of `src` into all eight 16-bit elements.
+//
+// Two shuffles are used: the wanted word is first replicated inside its own
+// 64-bit half (shufflelo for K = 0..3, shufflehi for K = 4..7), then the 32-bit
+// element that now holds two copies (element 0 for the low half, element 2 for
+// the high half) is replicated across the whole register.
+//
+// Example for K = 5, words listed from element 0 upwards:
+//   src                             a b c d  e f g h
+//   _mm_shufflehi_epi16( 1,1,1,1 )  a b c d  f f f f
+//   _mm_shuffle_epi32( 2,2,2,2 )    f f f f  f f f f
+template<int K>
+static etcpak_force_inline __m128i Widen( const __m128i src )
+{
+    static_assert( K >= 0 && K <= 7, "Index out of range" );
+
+    // Position of the word inside its 64-bit half, and the immediate that
+    // copies that position into all four slots of the half.
+    constexpr int word = K & 3;
+    constexpr int replicate = _MM_SHUFFLE( word, word, word, word );
+
+    if( K < 4 )
+    {
+        // K = 0..3: the copies end up in 32-bit element 0.
+        const __m128i lowFilled = _mm_shufflelo_epi16( src, replicate );
+        return _mm_shuffle_epi32( lowFilled, _MM_SHUFFLE( 0, 0, 0, 0 ) );
+    }
+    else
+    {
+        // K = 4..7: the copies end up in 32-bit element 2.
+        const __m128i highFilled = _mm_shufflehi_epi16( src, replicate );
+        return _mm_shuffle_epi32( highFilled, _MM_SHUFFLE( 2, 2, 2, 2 ) );
+    }
+}
+
+// Maps an EAC table selector to the index of the multiplier that goes with it
+// (ProcessAlpha_ETC2 indexes the stored `mul` vector with the result):
+//
+//   sel      0   1..3   4   5..7   8..13   14..15
+//   result   0   1      2   3      4       5
+//
+// ProcessAlpha_ETC2 only passes 0..15, but every other input must still return
+// a value: flowing off the end of a non-void function is undefined behaviour.
+// Such inputs saturate exactly like the NEON GetMulSel (below 0 -> 0, above
+// 15 -> 5), which also keeps the result inside the 8-entry multiplier array.
+static etcpak_force_inline int GetMulSel( int sel )
+{
+    switch( sel )
+    {
+    case 0:
+        return 0;
+    case 1: case 2: case 3:
+        return 1;
+    case 4:
+        return 2;
+    case 5: case 6: case 7:
+        return 3;
+    case 8: case 9: case 10: case 11: case 12: case 13:
+        return 4;
+    default:
+        return ( sel < 0 ) ? 0 : 5;
+    }
+}
+
+#endif
+
+#ifdef __ARM_NEON
+
+// Multiplier lane for EAC table `sel`: 0 -> 0, 1..3 -> 1, 4 -> 2, 5..7 -> 3,
+// 8..13 -> 4, 14 and above -> 5. Anything below 1 (negatives included) gives 0.
+static constexpr etcpak_force_inline int GetMulSel(int sel)
+{ return ( sel >= 14 ) ? 5 : ( sel >= 8 ) ? 4 : ( sel >= 5 ) ? 3 : ( sel == 4 ) ? 2 : ( sel >= 1 ) ? 1 : 0; }
+
+// constexpr clamp: returns min when x < min, otherwise max when x > max,
+// otherwise x. Used here to bound the lane immediates passed to intrinsics.
+static constexpr int ClampConstant( int x, int min, int max )
+{ return ( x >= min ) ? ( ( x <= max ) ? x : max ) : min; }
+
+// Squared absolute difference between source value `Index` of alphaBlock
+// (broadcast to eight lanes) and each of the eight lanes of recVal.
+template <int Index>
+etcpak_force_inline static uint16x8_t ErrorProbe_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock )
+{
+#ifndef __aarch64__
+    // Both arms are compiled for every Index and vdup_lane_u8 only accepts lanes 0..7, so clamp to 7.
+    const uint8x8_t srcValWide = ( Index < 8 )
+        ? vdup_lane_u8( vget_low_u8( alphaBlock ), ClampConstant( Index, 0, 7 ) )
+        : vdup_lane_u8( vget_high_u8( alphaBlock ), ClampConstant( Index - 8, 0, 7 ) );
+#else
+    const uint8x8_t srcValWide = vdup_laneq_u8( alphaBlock, Index );
+#endif
+    const uint8x8_t deltaVal = vabd_u8( srcValWide, recVal );
+    return vmull_u8( deltaVal, deltaVal );
+}
+
+etcpak_force_inline static uint16_t MinError_EAC_NEON( uint16x8_t errProbe )
+{   // Returns the smallest of the eight 16-bit lanes of errProbe.
+#ifdef __aarch64__
+    return vminvq_u16( errProbe );
+#else
+    const uint16x4_t min4 = vpmin_u16( vget_low_u16( errProbe ), vget_high_u16( errProbe ) );  // 8 -> 4 candidates
+    const uint16x4_t min2 = vpmin_u16( min4, min4 );  // 4 -> 2
+    return vget_lane_u16( vpmin_u16( min2, min2 ), 0 );  // 2 -> 1, read from lane 0
+#endif
+}
+
+// Finds a lane of recVal with the smallest ErrorProbe_EAC_NEON<Index> error and
+// returns its lane number placed at bit 45 - 3 * Index.
+template <int Index>
+etcpak_force_inline static uint64_t MinErrorIndex_EAC_NEON( uint8x8_t recVal, uint8x16_t alphaBlock )
+{
+    const uint16x8_t probe = ErrorProbe_EAC_NEON<Index>( recVal, alphaBlock );
+    const uint16x8_t isMin = vceqq_u16( probe, vdupq_n_u16( MinError_EAC_NEON( probe ) ) );
+    // Narrowed to one byte per lane, so trailing-zero count / 8 is the lane number.
+    const uint64_t byteMask = vget_lane_u64( vreinterpret_u64_u8( vqmovn_u16( isMin ) ), 0 );
+    return ( uint64_t( __builtin_ctzll( byteMask ) ) >> 3 ) << ( 45 - Index * 3 );
+}
+
+// Broadcasts the multiplier lane chosen by GetMulSel( Index ) to all eight lanes.
+template <int Index>
+etcpak_force_inline static int16x8_t WidenMultiplier_EAC_NEON( int16x8_t multipliers )
+{
+    constexpr int Lane = GetMulSel( Index );
+#ifndef __aarch64__
+    // vdupq_lane_s16 accepts lanes 0..3 only; clamp so the untaken arm still has a valid immediate.
+    return ( Lane < 4 ) ? vdupq_lane_s16( vget_low_s16( multipliers ), ClampConstant( Lane, 0, 3 ) )
+                        : vdupq_lane_s16( vget_high_s16( multipliers ), ClampConstant( Lane - 4, 0, 3 ) );
+#else
+    return vdupq_laneq_s16( multipliers, Lane );
+#endif
+}
+
+#endif
+
+static etcpak_force_inline uint64_t ProcessAlpha_ETC2( const uint8_t* src )
+{
+#if defined __SSE4_1__
+    // Check solid
+    __m128i s = _mm_loadu_si128( (__m128i*)src );
+    __m128i solidCmp = _mm_set1_epi8( src[0] );
+    __m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp );
+    if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
+    {
+        return src[0];
+    }
+
+    // Calculate min, max
+    __m128i s1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+    __m128i max1 = _mm_max_epu8( s, s1 );
+    __m128i min1 = _mm_min_epu8( s, s1 );
+    __m128i smax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
+    __m128i smin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
+    __m128i max2 = _mm_max_epu8( max1, smax2 );
+    __m128i min2 = _mm_min_epu8( min1, smin2 );
+    __m128i smax3 = _mm_alignr_epi8( max2, max2, 2 );
+    __m128i smin3 = _mm_alignr_epi8( min2, min2, 2 );
+    __m128i max3 = _mm_max_epu8( max2, smax3 );
+    __m128i min3 = _mm_min_epu8( min2, smin3 );
+    __m128i smax4 = _mm_alignr_epi8( max3, max3, 1 );
+    __m128i smin4 = _mm_alignr_epi8( min3, min3, 1 );
+    __m128i max = _mm_max_epu8( max3, smax4 );
+    __m128i min = _mm_min_epu8( min3, smin4 );
+    __m128i max16 = _mm_unpacklo_epi8( max, _mm_setzero_si128() );
+    __m128i min16 = _mm_unpacklo_epi8( min, _mm_setzero_si128() );
+
+    // src range, mid
+    __m128i srcRange = _mm_sub_epi16( max16, min16 );
+    __m128i srcRangeHalf = _mm_srli_epi16( srcRange, 1 );
+    __m128i srcMid = _mm_add_epi16( min16, srcRangeHalf );
+
+    // multiplier
+    __m128i mul1 = _mm_mulhi_epi16( srcRange, g_alphaRange_SIMD );
+    __m128i mul = _mm_add_epi16( mul1, _mm_set1_epi16( 1 ) );
+
+    // wide source
+    __m128i s16_1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 3, 2, 3, 2 ) );
+    __m128i s16[2] = { _mm_unpacklo_epi8( s, _mm_setzero_si128() ), _mm_unpacklo_epi8( s16_1, _mm_setzero_si128() ) };
+
+    __m128i sr[16] = {
+        Widen<0>( s16[0] ),
+        Widen<1>( s16[0] ),
+        Widen<2>( s16[0] ),
+        Widen<3>( s16[0] ),
+        Widen<4>( s16[0] ),
+        Widen<5>( s16[0] ),
+        Widen<6>( s16[0] ),
+        Widen<7>( s16[0] ),
+        Widen<0>( s16[1] ),
+        Widen<1>( s16[1] ),
+        Widen<2>( s16[1] ),
+        Widen<3>( s16[1] ),
+        Widen<4>( s16[1] ),
+        Widen<5>( s16[1] ),
+        Widen<6>( s16[1] ),
+        Widen<7>( s16[1] )
+    };
+
+#ifdef __AVX2__
+    __m256i srcRangeWide = _mm256_broadcastsi128_si256( srcRange );
+    __m256i srcMidWide = _mm256_broadcastsi128_si256( srcMid );
+
+    __m256i mulWide1 = _mm256_mulhi_epi16( srcRangeWide, g_alphaRange_AVX );
+    __m256i mulWide = _mm256_add_epi16( mulWide1, _mm256_set1_epi16( 1 ) );
+
+    __m256i modMul[8] = {
+        _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[0] ) ) ), _mm256_setzero_si256() ),
+        _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[1] ) ) ), _mm256_setzero_si256() ),
+        _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[2] ) ) ), _mm256_setzero_si256() ),
+        _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[3] ) ) ), _mm256_setzero_si256() ),
+        _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[4] ) ) ), _mm256_setzero_si256() ),
+        _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[5] ) ) ), _mm256_setzero_si256() ),
+        _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[6] ) ) ), _mm256_setzero_si256() ),
+        _mm256_unpacklo_epi8( _mm256_packus_epi16( _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ), _mm256_add_epi16( srcMidWide, _mm256_mullo_epi16( mulWide, g_alpha_AVX[7] ) ) ), _mm256_setzero_si256() ),
+    };
+
+    // find selector
+    __m256i mulErr = _mm256_setzero_si256();
+    for( int j=0; j<16; j++ )
+    {
+        __m256i s16Wide = _mm256_broadcastsi128_si256( sr[j] );
+        __m256i err1, err2;
+
+        err1 = _mm256_sub_epi16( s16Wide, modMul[0] );
+        __m256i localErr = _mm256_mullo_epi16( err1, err1 );
+
+        err1 = _mm256_sub_epi16( s16Wide, modMul[1] );
+        err2 = _mm256_mullo_epi16( err1, err1 );
+        localErr = _mm256_min_epu16( localErr, err2 );
+
+        err1 = _mm256_sub_epi16( s16Wide, modMul[2] );
+        err2 = _mm256_mullo_epi16( err1, err1 );
+        localErr = _mm256_min_epu16( localErr, err2 );
+
+        err1 = _mm256_sub_epi16( s16Wide, modMul[3] );
+        err2 = _mm256_mullo_epi16( err1, err1 );
+        localErr = _mm256_min_epu16( localErr, err2 );
+
+        err1 = _mm256_sub_epi16( s16Wide, modMul[4] );
+        err2 = _mm256_mullo_epi16( err1, err1 );
+        localErr = _mm256_min_epu16( localErr, err2 );
+
+        err1 = _mm256_sub_epi16( s16Wide, modMul[5] );
+        err2 = _mm256_mullo_epi16( err1, err1 );
+        localErr = _mm256_min_epu16( localErr, err2 );
+
+        err1 = _mm256_sub_epi16( s16Wide, modMul[6] );
+        err2 = _mm256_mullo_epi16( err1, err1 );
+        localErr = _mm256_min_epu16( localErr, err2 );
+
+        err1 = _mm256_sub_epi16( s16Wide, modMul[7] );
+        err2 = _mm256_mullo_epi16( err1, err1 );
+        localErr = _mm256_min_epu16( localErr, err2 );
+
+        // note that this can overflow, but since we're looking for the smallest error, it shouldn't matter
+        mulErr = _mm256_adds_epu16( mulErr, localErr );
+    }
+    uint64_t minPos1 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_castsi256_si128( mulErr ) ) );
+    uint64_t minPos2 = _mm_cvtsi128_si64( _mm_minpos_epu16( _mm256_extracti128_si256( mulErr, 1 ) ) );
+    int sel = ( ( minPos1 & 0xFFFF ) < ( minPos2 & 0xFFFF ) ) ? ( minPos1 >> 16 ) : ( 8 + ( minPos2 >> 16 ) );
+
+    __m128i recVal16;
+    switch( sel )
+    {
+    case 0:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() );
+        break;
+    case 1:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() );
+        break;
+    case 2:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() );
+        break;
+    case 3:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() );
+        break;
+    case 4:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() );
+        break;
+    case 5:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() );
+        break;
+    case 6:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() );
+        break;
+    case 7:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() );
+        break;
+    case 8:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() );
+        break;
+    case 9:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() );
+        break;
+    case 10:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() );
+        break;
+    case 11:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() );
+        break;
+    case 12:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() );
+        break;
+    case 13:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() );
+        break;
+    case 14:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() );
+        break;
+    case 15:
+        recVal16 = _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() );
+        break;
+    default:
+        assert( false );
+        break;
+    }
+#else
+    // wide multiplier
+    __m128i rangeMul[16] = {
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ),
+        _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() )
+    };
+
+    // find selector
+    int err = std::numeric_limits<int>::max();
+    int sel;
+    for( int r=0; r<16; r++ )
+    {
+        __m128i err1, err2, minerr;
+        __m128i recVal16 = rangeMul[r];
+        int rangeErr;
+
+        err1 = _mm_sub_epi16( sr[0], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr = _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[1], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[2], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[3], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[4], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[5], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[6], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[7], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[8], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[9], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[10], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[11], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[12], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[13], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[14], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        err1 = _mm_sub_epi16( sr[15], recVal16 );
+        err2 = _mm_mullo_epi16( err1, err1 );
+        minerr = _mm_minpos_epu16( err2 );
+        rangeErr += _mm_cvtsi128_si64( minerr ) & 0xFFFF;
+
+        if( rangeErr < err )
+        {
+            err = rangeErr;
+            sel = r;
+            if( err == 0 ) break;
+        }
+    }
+
+    __m128i recVal16 = rangeMul[sel];
+#endif
+
+    // find indices
+    __m128i err1, err2, minerr;
+    uint64_t idx = 0, tmp;
+
+    err1 = _mm_sub_epi16( sr[0], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 15*3;
+
+    err1 = _mm_sub_epi16( sr[1], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 14*3;
+
+    err1 = _mm_sub_epi16( sr[2], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 13*3;
+
+    err1 = _mm_sub_epi16( sr[3], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 12*3;
+
+    err1 = _mm_sub_epi16( sr[4], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 11*3;
+
+    err1 = _mm_sub_epi16( sr[5], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 10*3;
+
+    err1 = _mm_sub_epi16( sr[6], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 9*3;
+
+    err1 = _mm_sub_epi16( sr[7], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 8*3;
+
+    err1 = _mm_sub_epi16( sr[8], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 7*3;
+
+    err1 = _mm_sub_epi16( sr[9], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 6*3;
+
+    err1 = _mm_sub_epi16( sr[10], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 5*3;
+
+    err1 = _mm_sub_epi16( sr[11], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 4*3;
+
+    err1 = _mm_sub_epi16( sr[12], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 3*3;
+
+    err1 = _mm_sub_epi16( sr[13], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 2*3;
+
+    err1 = _mm_sub_epi16( sr[14], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 1*3;
+
+    err1 = _mm_sub_epi16( sr[15], recVal16 );
+    err2 = _mm_mullo_epi16( err1, err1 );
+    minerr = _mm_minpos_epu16( err2 );
+    tmp = _mm_cvtsi128_si64( minerr );
+    idx |= ( tmp >> 16 ) << 0*3;
+
+    uint16_t rm[8];
+    _mm_storeu_si128( (__m128i*)rm, mul );
+    uint16_t sm = _mm_cvtsi128_si64( srcMid );
+
+    uint64_t d = ( uint64_t( sm ) << 56 ) |
+        ( uint64_t( rm[GetMulSel( sel )] ) << 52 ) |
+        ( uint64_t( sel ) << 48 ) |
+        idx;
+
+    return _bswap64( d );
+#elif defined __ARM_NEON
+
+    int16x8_t srcMidWide, multipliers;
+    int srcMid;
+    uint8x16_t srcAlphaBlock = vld1q_u8( src );
+    {
+        uint8_t ref = src[0];
+        uint8x16_t a0 = vdupq_n_u8( ref );
+        uint8x16_t r = vceqq_u8( srcAlphaBlock, a0 );
+        int64x2_t m = vreinterpretq_s64_u8( r );
+        if( m[0] == -1 && m[1] == -1 )
+            return ref;
+
+        // srcRange
+#ifdef __aarch64__
+        uint8_t min = vminvq_u8( srcAlphaBlock );
+        uint8_t max = vmaxvq_u8( srcAlphaBlock );
+        uint8_t srcRange = max - min;
+        multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_n_s16( g_alphaRange_NEON, srcRange ), 1 ), vdupq_n_s16( 1 ) );
+        srcMid = min + srcRange / 2;
+        srcMidWide = vdupq_n_s16( srcMid );
+#else
+        uint8x8_t vmin = vpmin_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) );
+        vmin = vpmin_u8( vmin, vmin );
+        vmin = vpmin_u8( vmin, vmin );
+        vmin = vpmin_u8( vmin, vmin );
+        uint8x8_t vmax = vpmax_u8( vget_low_u8( srcAlphaBlock ), vget_high_u8( srcAlphaBlock ) );
+        vmax = vpmax_u8( vmax, vmax );
+        vmax = vpmax_u8( vmax, vmax );
+        vmax = vpmax_u8( vmax, vmax );
+
+        int16x8_t srcRangeWide = vreinterpretq_s16_u16( vsubl_u8( vmax, vmin ) );
+        multipliers = vqaddq_s16( vshrq_n_s16( vqdmulhq_s16( g_alphaRange_NEON, srcRangeWide ), 1 ), vdupq_n_s16( 1 ) );
+        srcMidWide = vsraq_n_s16( vreinterpretq_s16_u16(vmovl_u8(vmin)), srcRangeWide, 1);
+        srcMid = vgetq_lane_s16( srcMidWide, 0 );
+#endif
+    }
+
+    // calculate reconstructed values
+#define EAC_APPLY_16X( m ) m( 0 ) m( 1 ) m( 2 ) m( 3 ) m( 4 ) m( 5 ) m( 6 ) m( 7 ) m( 8 ) m( 9 ) m( 10 ) m( 11 ) m( 12 ) m( 13 ) m( 14 ) m( 15 )
+
+#define EAC_RECONSTRUCT_VALUE( n ) vqmovun_s16( vmlaq_s16( srcMidWide, g_alpha_NEON[n], WidenMultiplier_EAC_NEON<n>( multipliers ) ) ),
+    uint8x8_t recVals[16] = { EAC_APPLY_16X( EAC_RECONSTRUCT_VALUE ) };
+
+    // find selector
+    int err = std::numeric_limits<int>::max();
+    int sel = 0;
+    for( int r = 0; r < 16; r++ )
+    {
+        uint8x8_t recVal = recVals[r];
+
+        int rangeErr = 0;
+#define EAC_ACCUMULATE_ERROR( n ) rangeErr += MinError_EAC_NEON( ErrorProbe_EAC_NEON<n>( recVal, srcAlphaBlock ) );
+        EAC_APPLY_16X( EAC_ACCUMULATE_ERROR )
+
+        if( rangeErr < err )
+        {
+            err = rangeErr;
+            sel = r;
+            if ( err == 0 ) break;
+        }
+    }
+
+    // combine results
+    uint64_t d = ( uint64_t( srcMid ) << 56 ) |
+        ( uint64_t( multipliers[GetMulSel( sel )] ) << 52 ) |
+        ( uint64_t( sel ) << 48);
+
+    // generate indices
+    uint8x8_t recVal = recVals[sel];
+#define EAC_INSERT_INDEX(n) d |= MinErrorIndex_EAC_NEON<n>( recVal, srcAlphaBlock );
+    EAC_APPLY_16X( EAC_INSERT_INDEX )
+
+    return _bswap64( d );
+
+#undef EAC_APPLY_16X
+#undef EAC_INSERT_INDEX
+#undef EAC_ACCUMULATE_ERROR
+#undef EAC_RECONSTRUCT_VALUE
+
+#else
+    {
+        bool solid = true;
+        const uint8_t* ptr = src + 1;
+        const uint8_t ref = *src;
+        for( int i=1; i<16; i++ )
+        {
+            if( ref != *ptr++ )
+            {
+                solid = false;
+                break;
+            }
+        }
+        if( solid )
+        {
+            return ref;
+        }
+    }
+
+    uint8_t min = src[0];
+    uint8_t max = src[0];
+    for( int i=1; i<16; i++ )
+    {
+        if( min > src[i] ) min = src[i];
+        else if( max < src[i] ) max = src[i];
+    }
+    int srcRange = max - min;
+    int srcMid = min + srcRange / 2;
+
+    uint8_t buf[16][16];
+    int err = std::numeric_limits<int>::max();
+    int sel;
+    int selmul;
+    for( int r=0; r<16; r++ )
+    {
+        int mul = ( ( srcRange * g_alphaRange[r] ) >> 16 ) + 1;
+
+        int rangeErr = 0;
+        for( int i=0; i<16; i++ )
+        {
+            const auto srcVal = src[i];
+
+            int idx = 0;
+            const auto modVal = g_alpha[r][0] * mul;
+            const auto recVal = clampu8( srcMid + modVal );
+            int localErr = sq( srcVal - recVal );
+
+            if( localErr != 0 )
+            {
+                for( int j=1; j<8; j++ )
+                {
+                    const auto modVal = g_alpha[r][j] * mul;
+                    const auto recVal = clampu8( srcMid + modVal );
+                    const auto errProbe = sq( srcVal - recVal );
+                    if( errProbe < localErr )
+                    {
+                        localErr = errProbe;
+                        idx = j;
+                    }
+                }
+            }
+
+            buf[r][i] = idx;
+            rangeErr += localErr;
+        }
+
+        if( rangeErr < err )
+        {
+            err = rangeErr;
+            sel = r;
+            selmul = mul;
+            if( err == 0 ) break;
+        }
+    }
+
+    uint64_t d = ( uint64_t( srcMid ) << 56 ) |
+        ( uint64_t( selmul ) << 52 ) |
+        ( uint64_t( sel ) << 48 );
+
+    int offset = 45;
+    auto ptr = buf[sel];
+    for( int i=0; i<16; i++ )
+    {
+        d |= uint64_t( *ptr++ ) << offset;
+        offset -= 3;
+    }
+
+    return _bswap64( d );
+#endif
+}
+
+
+// Encodes only the alpha channel: each texel's alpha byte (bits 24-31) is copied into its colour bytes and the
+// grey 4x4 tile is passed to ProcessRGB(). Writes one uint64_t per tile to `dst`; `width` is the row pitch in pixels.
+void CompressEtc1Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+{
+    alignas(16) uint32_t buf[4*4];  // column-major tile; aligned because _mm_store_si128 needs a 16-byte boundary
+    for( size_t w = 0; blocks != 0; --blocks )  // blocks == 0 encodes nothing instead of wrapping the counter
+    {
+#ifdef __SSE4_1__
+        __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
+        __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
+        __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
+        __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
+
+        _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );  // four rows -> four columns
+
+        __m128i c0 = _mm_castps_si128( px0 );
+        __m128i c1 = _mm_castps_si128( px1 );
+        __m128i c2 = _mm_castps_si128( px2 );
+        __m128i c3 = _mm_castps_si128( px3 );
+
+        __m128i mask = _mm_setr_epi32( 0x03030303, 0x07070707, 0x0b0b0b0b, 0x0f0f0f0f );  // byte 3 of each dword, replicated
+        __m128i p0 = _mm_shuffle_epi8( c0, mask );
+        __m128i p1 = _mm_shuffle_epi8( c1, mask );
+        __m128i p2 = _mm_shuffle_epi8( c2, mask );
+        __m128i p3 = _mm_shuffle_epi8( c3, mask );
+
+        _mm_store_si128( (__m128i*)(buf + 0),  p0 );
+        _mm_store_si128( (__m128i*)(buf + 4),  p1 );
+        _mm_store_si128( (__m128i*)(buf + 8),  p2 );
+        _mm_store_si128( (__m128i*)(buf + 12), p3 );
+
+        src += 4;
+#else
+        auto ptr = buf;
+        for( int x=0; x<4; x++ )
+        {
+            unsigned int a = *src >> 24;
+            *ptr++ = a | ( a << 8 ) | ( a << 16 );
+            src += width;
+            a = *src >> 24;
+            *ptr++ = a | ( a << 8 ) | ( a << 16 );
+            src += width;
+            a = *src >> 24;
+            *ptr++ = a | ( a << 8 ) | ( a << 16 );
+            src += width;
+            a = *src >> 24;
+            *ptr++ = a | ( a << 8 ) | ( a << 16 );
+            src -= width * 3 - 1;  // back to the top row, one column to the right
+        }
+#endif
+        if( ++w == width/4 )  // finished a row of tiles: skip the three pixel rows already consumed
+        {
+            src += width * 3;
+            w = 0;
+        }
+        *dst++ = ProcessRGB( (uint8_t*)buf );
+    }
+}
+
+// Encodes only the alpha channel: each texel's alpha byte (bits 24-31) is copied into its colour bytes and the
+// grey 4x4 tile is passed to ProcessRGB_ETC2(). Writes one uint64_t per tile to `dst`; `width` is the row pitch in pixels.
+void CompressEtc2Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+{
+    alignas(16) uint32_t buf[4*4];  // column-major tile; aligned because _mm_store_si128 needs a 16-byte boundary
+    for( size_t w = 0; blocks != 0; --blocks )  // blocks == 0 encodes nothing instead of wrapping the counter
+    {
+#ifdef __SSE4_1__
+        __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
+        __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
+        __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
+        __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
+
+        _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );  // four rows -> four columns
+
+        __m128i c0 = _mm_castps_si128( px0 );
+        __m128i c1 = _mm_castps_si128( px1 );
+        __m128i c2 = _mm_castps_si128( px2 );
+        __m128i c3 = _mm_castps_si128( px3 );
+
+        __m128i mask = _mm_setr_epi32( 0x03030303, 0x07070707, 0x0b0b0b0b, 0x0f0f0f0f );  // byte 3 of each dword, replicated
+        __m128i p0 = _mm_shuffle_epi8( c0, mask );
+        __m128i p1 = _mm_shuffle_epi8( c1, mask );
+        __m128i p2 = _mm_shuffle_epi8( c2, mask );
+        __m128i p3 = _mm_shuffle_epi8( c3, mask );
+
+        _mm_store_si128( (__m128i*)(buf + 0),  p0 );
+        _mm_store_si128( (__m128i*)(buf + 4),  p1 );
+        _mm_store_si128( (__m128i*)(buf + 8),  p2 );
+        _mm_store_si128( (__m128i*)(buf + 12), p3 );
+
+        src += 4;
+#else
+        auto ptr = buf;
+        for( int x=0; x<4; x++ )
+        {
+            unsigned int a = *src >> 24;
+            *ptr++ = a | ( a << 8 ) | ( a << 16 );
+            src += width;
+            a = *src >> 24;
+            *ptr++ = a | ( a << 8 ) | ( a << 16 );
+            src += width;
+            a = *src >> 24;
+            *ptr++ = a | ( a << 8 ) | ( a << 16 );
+            src += width;
+            a = *src >> 24;
+            *ptr++ = a | ( a << 8 ) | ( a << 16 );
+            src -= width * 3 - 1;  // back to the top row, one column to the right
+        }
+#endif
+        if( ++w == width/4 )  // finished a row of tiles: skip the three pixel rows already consumed
+        {
+            src += width * 3;
+            w = 0;
+        }
+        *dst++ = ProcessRGB_ETC2( (uint8_t*)buf );
+    }
+}
+
+#include <chrono>
+#include <thread>
+
+// Encodes `blocks` 4x4 tiles of 32-bit texels with ProcessRGB(), writing one uint64_t per tile to `dst`.
+// Tiles are read left to right, then top to bottom; `width` is the row pitch in pixels (presumably a multiple of 4).
+void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+{
+    alignas(16) uint32_t buf[4*4];  // column-major tile; aligned because _mm_store_si128 needs a 16-byte boundary
+    for( size_t w = 0; blocks != 0; --blocks )  // blocks == 0 encodes nothing instead of wrapping the counter
+    {
+#ifdef __SSE4_1__
+        __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
+        __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
+        __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
+        __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
+
+        _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );  // four rows -> four columns
+
+        _mm_store_si128( (__m128i*)(buf + 0),  _mm_castps_si128( px0 ) );
+        _mm_store_si128( (__m128i*)(buf + 4),  _mm_castps_si128( px1 ) );
+        _mm_store_si128( (__m128i*)(buf + 8),  _mm_castps_si128( px2 ) );
+        _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) );
+
+        src += 4;
+#else
+        auto ptr = buf;
+        for( int x=0; x<4; x++ )
+        {
+            *ptr++ = *src;
+            src += width;
+            *ptr++ = *src;
+            src += width;
+            *ptr++ = *src;
+            src += width;
+            *ptr++ = *src;
+            src -= width * 3 - 1;  // back to the top row, one column to the right
+        }
+#endif
+        if( ++w == width/4 )  // finished a row of tiles: skip the three pixel rows already consumed
+        {
+            src += width * 3;
+            w = 0;
+        }
+        *dst++ = ProcessRGB( (uint8_t*)buf );
+    }
+}
+
+// Like CompressEtc1Rgb, but on the SSE4.1 / AVX2 paths each tile is dithered (Dither / DitherAvx2) before ProcessRGB().
+// Writes one uint64_t per tile to `dst`; `width` is the row pitch in pixels (presumably a multiple of 4).
+void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+{
+    alignas(16) uint32_t buf[4*4];  // column-major tile; aligned because _mm_store_si128 needs a 16-byte boundary
+    for( size_t w = 0; blocks != 0; --blocks )  // blocks == 0 encodes nothing instead of wrapping the counter
+    {
+#ifdef __SSE4_1__
+        __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
+        __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
+        __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
+        __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
+
+        _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );  // four rows -> four columns
+
+#  ifdef __AVX2__
+        DitherAvx2( (uint8_t*)buf, _mm_castps_si128( px0 ), _mm_castps_si128( px1 ), _mm_castps_si128( px2 ), _mm_castps_si128( px3 ) );
+#  else
+        _mm_store_si128( (__m128i*)(buf + 0),  _mm_castps_si128( px0 ) );
+        _mm_store_si128( (__m128i*)(buf + 4),  _mm_castps_si128( px1 ) );
+        _mm_store_si128( (__m128i*)(buf + 8),  _mm_castps_si128( px2 ) );
+        _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) );
+
+        Dither( (uint8_t*)buf );
+#  endif
+
+        src += 4;
+#else
+        auto ptr = buf;  // NOTE(review): this scalar path never calls Dither(); confirm that is intended
+        for( int x=0; x<4; x++ )
+        {
+            *ptr++ = *src;
+            src += width;
+            *ptr++ = *src;
+            src += width;
+            *ptr++ = *src;
+            src += width;
+            *ptr++ = *src;
+            src -= width * 3 - 1;  // back to the top row, one column to the right
+        }
+#endif
+        if( ++w == width/4 )  // finished a row of tiles: skip the three pixel rows already consumed
+        {
+            src += width * 3;
+            w = 0;
+        }
+        *dst++ = ProcessRGB( (uint8_t*)buf );
+    }
+}
+
+// Encodes `blocks` 4x4 tiles of 32-bit texels with ProcessRGB_ETC2(), writing one uint64_t per tile to `dst`.
+// Tiles are read left to right, then top to bottom; `width` is the row pitch in pixels (presumably a multiple of 4).
+void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+{
+    alignas(16) uint32_t buf[4*4];  // column-major tile; aligned because _mm_store_si128 needs a 16-byte boundary
+    for( size_t w = 0; blocks != 0; --blocks )  // blocks == 0 encodes nothing instead of wrapping the counter
+    {
+#ifdef __SSE4_1__
+        __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
+        __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
+        __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
+        __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
+
+        _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );  // four rows -> four columns
+
+        _mm_store_si128( (__m128i*)(buf + 0),  _mm_castps_si128( px0 ) );
+        _mm_store_si128( (__m128i*)(buf + 4),  _mm_castps_si128( px1 ) );
+        _mm_store_si128( (__m128i*)(buf + 8),  _mm_castps_si128( px2 ) );
+        _mm_store_si128( (__m128i*)(buf + 12), _mm_castps_si128( px3 ) );
+
+        src += 4;
+#else
+        auto ptr = buf;
+        for( int x=0; x<4; x++ )
+        {
+            *ptr++ = *src;
+            src += width;
+            *ptr++ = *src;
+            src += width;
+            *ptr++ = *src;
+            src += width;
+            *ptr++ = *src;
+            src -= width * 3 - 1;  // back to the top row, one column to the right
+        }
+#endif
+        if( ++w == width/4 )  // finished a row of tiles: skip the three pixel rows already consumed
+        {
+            src += width * 3;
+            w = 0;
+        }
+        *dst++ = ProcessRGB_ETC2( (uint8_t*)buf );
+    }
+}
+
+// Encodes `blocks` 4x4 tiles with alpha: per tile, `dst` receives the ProcessAlpha_ETC2() word for the 16 alpha bytes
+// followed by the ProcessRGB_ETC2() word for the texels (two uint64_t per tile). `width` is the row pitch in pixels.
+void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
+{
+    alignas(16) uint32_t rgba[4*4];  // column-major tile; aligned because _mm_store_si128 needs a 16-byte boundary
+    alignas(16) uint8_t alpha[4*4];  // alpha byte of each texel, same order as rgba; also written with an aligned store
+    for( size_t w = 0; blocks != 0; --blocks )  // blocks == 0 encodes nothing instead of wrapping the counter
+    {
+#ifdef __SSE4_1__
+        __m128 px0 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 0 ) ) );
+        __m128 px1 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 1 ) ) );
+        __m128 px2 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 2 ) ) );
+        __m128 px3 = _mm_castsi128_ps( _mm_loadu_si128( (__m128i*)( src + width * 3 ) ) );
+
+        _MM_TRANSPOSE4_PS( px0, px1, px2, px3 );  // four rows -> four columns
+
+        __m128i c0 = _mm_castps_si128( px0 );
+        __m128i c1 = _mm_castps_si128( px1 );
+        __m128i c2 = _mm_castps_si128( px2 );
+        __m128i c3 = _mm_castps_si128( px3 );
+
+        _mm_store_si128( (__m128i*)(rgba + 0),  c0 );
+        _mm_store_si128( (__m128i*)(rgba + 4),  c1 );
+        _mm_store_si128( (__m128i*)(rgba + 8),  c2 );
+        _mm_store_si128( (__m128i*)(rgba + 12), c3 );
+
+        __m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 );  // gathers the four alpha bytes into dword 0, zeroes the rest
+
+        __m128i a0 = _mm_shuffle_epi8( c0, mask );
+        __m128i a1 = _mm_shuffle_epi8( c1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );  // same gather into dword 1
+        __m128i a2 = _mm_shuffle_epi8( c2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );  // dword 2
+        __m128i a3 = _mm_shuffle_epi8( c3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );  // dword 3
+
+        __m128i s0 = _mm_or_si128( a0, a1 );
+        __m128i s1 = _mm_or_si128( a2, a3 );
+        __m128i s2 = _mm_or_si128( s0, s1 );
+
+        _mm_store_si128( (__m128i*)alpha, s2 );
+
+        src += 4;
+#else
+        auto ptr = rgba;
+        auto ptr8 = alpha;
+        for( int x=0; x<4; x++ )
+        {
+            auto v = *src;
+            *ptr++ = v;
+            *ptr8++ = v >> 24;
+            src += width;
+            v = *src;
+            *ptr++ = v;
+            *ptr8++ = v >> 24;
+            src += width;
+            v = *src;
+            *ptr++ = v;
+            *ptr8++ = v >> 24;
+            src += width;
+            v = *src;
+            *ptr++ = v;
+            *ptr8++ = v >> 24;
+            src -= width * 3 - 1;  // back to the top row, one column to the right
+        }
+#endif
+        if( ++w == width/4 )  // finished a row of tiles: skip the three pixel rows already consumed
+        {
+            src += width * 3;
+            w = 0;
+        }
+        *dst++ = ProcessAlpha_ETC2( alpha );
+        *dst++ = ProcessRGB_ETC2( (uint8_t*)rgba );
+    }
+}
diff --git a/thirdparty/etcpak/ProcessRGB.hpp b/thirdparty/etcpak/ProcessRGB.hpp
new file mode 100644
index 0000000000..c5555a5bb1
--- /dev/null
+++ b/thirdparty/etcpak/ProcessRGB.hpp
@@ -0,0 +1,13 @@
+#ifndef __PROCESSRGB_HPP__
+#define __PROCESSRGB_HPP__
+
+#include <stddef.h>  // size_t in the prototypes below; <stdint.h> is not guaranteed to declare it
+#include <stdint.h>
+
+void CompressEtc1Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );  // alpha channel only, via ProcessRGB
+void CompressEtc2Alpha( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );  // alpha channel only, via ProcessRGB_ETC2
+void CompressEtc1Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );  // texels via ProcessRGB
+void CompressEtc1RgbDither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );  // as above, dithered on the SSE4.1/AVX2 paths
+void CompressEtc2Rgb( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );  // texels via ProcessRGB_ETC2
+void CompressEtc2Rgba( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width );  // two words per tile: alpha word, then colour word
+#endif
diff --git a/thirdparty/etcpak/Tables.cpp b/thirdparty/etcpak/Tables.cpp
new file mode 100644
index 0000000000..5c7fd9cf61
--- /dev/null
+++ b/thirdparty/etcpak/Tables.cpp
@@ -0,0 +1,221 @@
+#include "Tables.hpp"
+
+const int32_t g_table[8][4] = {  // 8 rows of { small, large, -small, -large }; presumably the ETC1 intensity modifier table - confirm against the spec
+    {  2,  8,   -2,   -8 },
+    {  5, 17,   -5,  -17 },
+    {  9, 29,   -9,  -29 },
+    { 13, 42,  -13,  -42 },
+    { 18, 60,  -18,  -60 },
+    { 24, 80,  -24,  -80 },
+    { 33, 106, -33, -106 },
+    { 47, 183, -47, -183 }
+};
+
+const int64_t g_table256[8][4] = {  // g_table with every entry multiplied by 256, stored as 64-bit values
+    {  2*256,  8*256,   -2*256,   -8*256 },
+    {  5*256, 17*256,   -5*256,  -17*256 },
+    {  9*256, 29*256,   -9*256,  -29*256 },
+    { 13*256, 42*256,  -13*256,  -42*256 },
+    { 18*256, 60*256,  -18*256,  -60*256 },
+    { 24*256, 80*256,  -24*256,  -80*256 },
+    { 33*256, 106*256, -33*256, -106*256 },
+    { 47*256, 183*256, -47*256, -183*256 }
+};
+
+const uint32_t g_id[4][16] = {  // 4 rows x 16 entries, ids 0..7; NOTE(review): looks like a per-texel sub-block id for each partition layout - confirm at the use site
+    { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2 },
+    { 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 },
+    { 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6 }
+};
+
+const uint32_t g_avg2[16] = {  // entry n is n * 0x11, i.e. the 4-bit value n replicated into both nibbles of a byte
+    0x00,
+    0x11,
+    0x22,
+    0x33,
+    0x44,
+    0x55,
+    0x66,
+    0x77,
+    0x88,
+    0x99,
+    0xAA,
+    0xBB,
+    0xCC,
+    0xDD,
+    0xEE,
+    0xFF
+};
+
+const uint32_t g_flags[64] = {  // 64 packed flag words; NOTE(review): the meaning of the bit patterns is not visible in this file - document at the use site
+    0x80800402, 0x80800402, 0x80800402, 0x80800402,
+    0x80800402, 0x80800402, 0x80800402, 0x8080E002,
+    0x80800402, 0x80800402, 0x8080E002, 0x8080E002,
+    0x80800402, 0x8080E002, 0x8080E002, 0x8080E002,
+    0x80000402, 0x80000402, 0x80000402, 0x80000402,
+    0x80000402, 0x80000402, 0x80000402, 0x8000E002,
+    0x80000402, 0x80000402, 0x8000E002, 0x8000E002,
+    0x80000402, 0x8000E002, 0x8000E002, 0x8000E002,
+    0x00800402, 0x00800402, 0x00800402, 0x00800402,
+    0x00800402, 0x00800402, 0x00800402, 0x0080E002,
+    0x00800402, 0x00800402, 0x0080E002, 0x0080E002,
+    0x00800402, 0x0080E002, 0x0080E002, 0x0080E002,
+    0x00000402, 0x00000402, 0x00000402, 0x00000402,
+    0x00000402, 0x00000402, 0x00000402, 0x0000E002,
+    0x00000402, 0x00000402, 0x0000E002, 0x0000E002,
+    0x00000402, 0x0000E002, 0x0000E002, 0x0000E002
+};
+
+const int32_t g_alpha[16][8] = {  // 16 rows of 8 signed modifiers; ProcessAlpha_ETC2 reconstructs a value as clampu8( srcMid + g_alpha[r][j] * mul )
+    { -3, -6,  -9, -15, 2, 5, 8, 14 },
+    { -3, -7, -10, -13, 2, 6, 9, 12 },
+    { -2, -5,  -8, -13, 1, 4, 7, 12 },
+    { -2, -4,  -6, -13, 1, 3, 5, 12 },
+    { -3, -6,  -8, -12, 2, 5, 7, 11 },
+    { -3, -7,  -9, -11, 2, 6, 8, 10 },
+    { -4, -7,  -8, -11, 3, 6, 7, 10 },
+    { -3, -5,  -8, -11, 2, 4, 7, 10 },
+    { -2, -6,  -8, -10, 1, 5, 7,  9 },
+    { -2, -5,  -8, -10, 1, 4, 7,  9 },
+    { -2, -4,  -8, -10, 1, 3, 7,  9 },
+    { -2, -5,  -7, -10, 1, 4, 6,  9 },
+    { -3, -4,  -7, -10, 2, 3, 6,  9 },
+    { -1, -2,  -3, -10, 0, 1, 2,  9 },
+    { -4, -6,  -8,  -9, 3, 5, 7,  8 },
+    { -3, -5,  -7,  -9, 2, 4, 6,  8 }
+};
+
+const int32_t g_alphaRange[16] = {  // 0x100FF / ( 1 + largest - smallest modifier of row r ); consumed as ( ( srcRange * g_alphaRange[r] ) >> 16 ) + 1
+    0x100FF / ( 1 + g_alpha[0][7] - g_alpha[0][3] ),
+    0x100FF / ( 1 + g_alpha[1][7] - g_alpha[1][3] ),
+    0x100FF / ( 1 + g_alpha[2][7] - g_alpha[2][3] ),
+    0x100FF / ( 1 + g_alpha[3][7] - g_alpha[3][3] ),
+    0x100FF / ( 1 + g_alpha[4][7] - g_alpha[4][3] ),
+    0x100FF / ( 1 + g_alpha[5][7] - g_alpha[5][3] ),
+    0x100FF / ( 1 + g_alpha[6][7] - g_alpha[6][3] ),
+    0x100FF / ( 1 + g_alpha[7][7] - g_alpha[7][3] ),
+    0x100FF / ( 1 + g_alpha[8][7] - g_alpha[8][3] ),
+    0x100FF / ( 1 + g_alpha[9][7] - g_alpha[9][3] ),
+    0x100FF / ( 1 + g_alpha[10][7] - g_alpha[10][3] ),
+    0x100FF / ( 1 + g_alpha[11][7] - g_alpha[11][3] ),
+    0x100FF / ( 1 + g_alpha[12][7] - g_alpha[12][3] ),
+    0x100FF / ( 1 + g_alpha[13][7] - g_alpha[13][3] ),
+    0x100FF / ( 1 + g_alpha[14][7] - g_alpha[14][3] ),
+    0x100FF / ( 1 + g_alpha[15][7] - g_alpha[15][3] ),
+};
+
+#ifdef __SSE4_1__
+const __m128i g_table_SIMD[2] =  // [0] = column 0 and [1] = column 1 of g_table, one 16-bit lane per row
+{
+    _mm_setr_epi16(   2,   5,   9,  13,  18,  24,  33,  47),
+    _mm_setr_epi16(   8,  17,  29,  42,  60,  80, 106, 183)
+};
+const __m128i g_table128_SIMD[2] =  // g_table columns 0 and 1 scaled by 128; 183*128 = 23424 still fits a signed 16-bit lane
+{
+    _mm_setr_epi16(   2*128,   5*128,   9*128,  13*128,  18*128,  24*128,  33*128,  47*128),
+    _mm_setr_epi16(   8*128,  17*128,  29*128,  42*128,  60*128,  80*128, 106*128, 183*128)
+};
+const __m128i g_table256_SIMD[4] =  // g_table columns 0/1 scaled by 256 in 32-bit lanes: [0],[1] cover rows 0-3 and [2],[3] rows 4-7
+{
+    _mm_setr_epi32(  2*256,   5*256,   9*256,  13*256),
+    _mm_setr_epi32(  8*256,  17*256,  29*256,  42*256),
+    _mm_setr_epi32( 18*256,  24*256,  33*256,  47*256),
+    _mm_setr_epi32( 60*256,  80*256, 106*256, 183*256)
+};
+
+const __m128i g_alpha_SIMD[16] = {  // element r is row r of g_alpha packed into eight 16-bit lanes
+    _mm_setr_epi16( g_alpha[ 0][0], g_alpha[ 0][1], g_alpha[ 0][2], g_alpha[ 0][3], g_alpha[ 0][4], g_alpha[ 0][5], g_alpha[ 0][6], g_alpha[ 0][7] ),
+    _mm_setr_epi16( g_alpha[ 1][0], g_alpha[ 1][1], g_alpha[ 1][2], g_alpha[ 1][3], g_alpha[ 1][4], g_alpha[ 1][5], g_alpha[ 1][6], g_alpha[ 1][7] ),
+    _mm_setr_epi16( g_alpha[ 2][0], g_alpha[ 2][1], g_alpha[ 2][2], g_alpha[ 2][3], g_alpha[ 2][4], g_alpha[ 2][5], g_alpha[ 2][6], g_alpha[ 2][7] ),
+    _mm_setr_epi16( g_alpha[ 3][0], g_alpha[ 3][1], g_alpha[ 3][2], g_alpha[ 3][3], g_alpha[ 3][4], g_alpha[ 3][5], g_alpha[ 3][6], g_alpha[ 3][7] ),
+    _mm_setr_epi16( g_alpha[ 4][0], g_alpha[ 4][1], g_alpha[ 4][2], g_alpha[ 4][3], g_alpha[ 4][4], g_alpha[ 4][5], g_alpha[ 4][6], g_alpha[ 4][7] ),
+    _mm_setr_epi16( g_alpha[ 5][0], g_alpha[ 5][1], g_alpha[ 5][2], g_alpha[ 5][3], g_alpha[ 5][4], g_alpha[ 5][5], g_alpha[ 5][6], g_alpha[ 5][7] ),
+    _mm_setr_epi16( g_alpha[ 6][0], g_alpha[ 6][1], g_alpha[ 6][2], g_alpha[ 6][3], g_alpha[ 6][4], g_alpha[ 6][5], g_alpha[ 6][6], g_alpha[ 6][7] ),
+    _mm_setr_epi16( g_alpha[ 7][0], g_alpha[ 7][1], g_alpha[ 7][2], g_alpha[ 7][3], g_alpha[ 7][4], g_alpha[ 7][5], g_alpha[ 7][6], g_alpha[ 7][7] ),
+    _mm_setr_epi16( g_alpha[ 8][0], g_alpha[ 8][1], g_alpha[ 8][2], g_alpha[ 8][3], g_alpha[ 8][4], g_alpha[ 8][5], g_alpha[ 8][6], g_alpha[ 8][7] ),
+    _mm_setr_epi16( g_alpha[ 9][0], g_alpha[ 9][1], g_alpha[ 9][2], g_alpha[ 9][3], g_alpha[ 9][4], g_alpha[ 9][5], g_alpha[ 9][6], g_alpha[ 9][7] ),
+    _mm_setr_epi16( g_alpha[10][0], g_alpha[10][1], g_alpha[10][2], g_alpha[10][3], g_alpha[10][4], g_alpha[10][5], g_alpha[10][6], g_alpha[10][7] ),
+    _mm_setr_epi16( g_alpha[11][0], g_alpha[11][1], g_alpha[11][2], g_alpha[11][3], g_alpha[11][4], g_alpha[11][5], g_alpha[11][6], g_alpha[11][7] ),
+    _mm_setr_epi16( g_alpha[12][0], g_alpha[12][1], g_alpha[12][2], g_alpha[12][3], g_alpha[12][4], g_alpha[12][5], g_alpha[12][6], g_alpha[12][7] ),
+    _mm_setr_epi16( g_alpha[13][0], g_alpha[13][1], g_alpha[13][2], g_alpha[13][3], g_alpha[13][4], g_alpha[13][5], g_alpha[13][6], g_alpha[13][7] ),
+    _mm_setr_epi16( g_alpha[14][0], g_alpha[14][1], g_alpha[14][2], g_alpha[14][3], g_alpha[14][4], g_alpha[14][5], g_alpha[14][6], g_alpha[14][7] ),
+    _mm_setr_epi16( g_alpha[15][0], g_alpha[15][1], g_alpha[15][2], g_alpha[15][3], g_alpha[15][4], g_alpha[15][5], g_alpha[15][6], g_alpha[15][7] ),
+};
+
+const __m128i g_alphaRange_SIMD = _mm_setr_epi16(  // the six distinct g_alphaRange values (first seen at rows 0, 1, 4, 5, 8, 14) plus two zero lanes
+    g_alphaRange[0],
+    g_alphaRange[1],
+    g_alphaRange[4],
+    g_alphaRange[5],
+    g_alphaRange[8],
+    g_alphaRange[14],
+    0,
+    0 );
+#endif
+
+#ifdef __AVX2__
+const __m256i g_alpha_AVX[8] = {  // transpose of g_alpha: element j holds column j of all 16 rows as 16-bit lanes
+    _mm256_setr_epi16( g_alpha[ 0][0], g_alpha[ 1][0], g_alpha[ 2][0], g_alpha[ 3][0], g_alpha[ 4][0], g_alpha[ 5][0], g_alpha[ 6][0], g_alpha[ 7][0], g_alpha[ 8][0], g_alpha[ 9][0], g_alpha[10][0], g_alpha[11][0], g_alpha[12][0], g_alpha[13][0], g_alpha[14][0], g_alpha[15][0] ),
+    _mm256_setr_epi16( g_alpha[ 0][1], g_alpha[ 1][1], g_alpha[ 2][1], g_alpha[ 3][1], g_alpha[ 4][1], g_alpha[ 5][1], g_alpha[ 6][1], g_alpha[ 7][1], g_alpha[ 8][1], g_alpha[ 9][1], g_alpha[10][1], g_alpha[11][1], g_alpha[12][1], g_alpha[13][1], g_alpha[14][1], g_alpha[15][1] ),
+    _mm256_setr_epi16( g_alpha[ 0][2], g_alpha[ 1][2], g_alpha[ 2][2], g_alpha[ 3][2], g_alpha[ 4][2], g_alpha[ 5][2], g_alpha[ 6][2], g_alpha[ 7][2], g_alpha[ 8][2], g_alpha[ 9][2], g_alpha[10][2], g_alpha[11][2], g_alpha[12][2], g_alpha[13][2], g_alpha[14][2], g_alpha[15][2] ),
+    _mm256_setr_epi16( g_alpha[ 0][3], g_alpha[ 1][3], g_alpha[ 2][3], g_alpha[ 3][3], g_alpha[ 4][3], g_alpha[ 5][3], g_alpha[ 6][3], g_alpha[ 7][3], g_alpha[ 8][3], g_alpha[ 9][3], g_alpha[10][3], g_alpha[11][3], g_alpha[12][3], g_alpha[13][3], g_alpha[14][3], g_alpha[15][3] ),
+    _mm256_setr_epi16( g_alpha[ 0][4], g_alpha[ 1][4], g_alpha[ 2][4], g_alpha[ 3][4], g_alpha[ 4][4], g_alpha[ 5][4], g_alpha[ 6][4], g_alpha[ 7][4], g_alpha[ 8][4], g_alpha[ 9][4], g_alpha[10][4], g_alpha[11][4], g_alpha[12][4], g_alpha[13][4], g_alpha[14][4], g_alpha[15][4] ),
+    _mm256_setr_epi16( g_alpha[ 0][5], g_alpha[ 1][5], g_alpha[ 2][5], g_alpha[ 3][5], g_alpha[ 4][5], g_alpha[ 5][5], g_alpha[ 6][5], g_alpha[ 7][5], g_alpha[ 8][5], g_alpha[ 9][5], g_alpha[10][5], g_alpha[11][5], g_alpha[12][5], g_alpha[13][5], g_alpha[14][5], g_alpha[15][5] ),
+    _mm256_setr_epi16( g_alpha[ 0][6], g_alpha[ 1][6], g_alpha[ 2][6], g_alpha[ 3][6], g_alpha[ 4][6], g_alpha[ 5][6], g_alpha[ 6][6], g_alpha[ 7][6], g_alpha[ 8][6], g_alpha[ 9][6], g_alpha[10][6], g_alpha[11][6], g_alpha[12][6], g_alpha[13][6], g_alpha[14][6], g_alpha[15][6] ),
+    _mm256_setr_epi16( g_alpha[ 0][7], g_alpha[ 1][7], g_alpha[ 2][7], g_alpha[ 3][7], g_alpha[ 4][7], g_alpha[ 5][7], g_alpha[ 6][7], g_alpha[ 7][7], g_alpha[ 8][7], g_alpha[ 9][7], g_alpha[10][7], g_alpha[11][7], g_alpha[12][7], g_alpha[13][7], g_alpha[14][7], g_alpha[15][7] ),
+};
+
+const __m256i g_alphaRange_AVX = _mm256_setr_epi16(  // all 16 g_alphaRange entries in row order, one 16-bit lane each
+    g_alphaRange[ 0], g_alphaRange[ 1], g_alphaRange[ 2], g_alphaRange[ 3], g_alphaRange[ 4], g_alphaRange[ 5], g_alphaRange[ 6], g_alphaRange[ 7],
+    g_alphaRange[ 8], g_alphaRange[ 9], g_alphaRange[10], g_alphaRange[11], g_alphaRange[12], g_alphaRange[13], g_alphaRange[14], g_alphaRange[15]
+);
+#endif
+
+#ifdef __ARM_NEON
+const int16x8_t g_table128_NEON[2] =  // g_table columns 0 and 1 scaled by 128, one 16-bit lane per row
+{
+    { 2*128,   5*128,   9*128,  13*128,  18*128,  24*128,  33*128,  47*128 },
+    { 8*128,  17*128,  29*128,  42*128,  60*128,  80*128, 106*128, 183*128 }
+};
+
+const int32x4_t g_table256_NEON[4] =  // g_table columns 0/1 scaled by 256 in 32-bit lanes: [0],[1] cover rows 0-3 and [2],[3] rows 4-7
+{
+    {  2*256,   5*256,   9*256,  13*256 },
+    {  8*256,  17*256,  29*256,  42*256 },
+    { 18*256,  24*256,  33*256,  47*256 },
+    { 60*256,  80*256, 106*256, 183*256 }
+};
+
+const int16x8_t g_alpha_NEON[16] =  // literal copy of the g_alpha rows as 16-bit lanes; must be kept in sync with g_alpha by hand
+{
+    { -3, -6,  -9, -15, 2, 5, 8, 14 },
+    { -3, -7, -10, -13, 2, 6, 9, 12 },
+    { -2, -5,  -8, -13, 1, 4, 7, 12 },
+    { -2, -4,  -6, -13, 1, 3, 5, 12 },
+    { -3, -6,  -8, -12, 2, 5, 7, 11 },
+    { -3, -7,  -9, -11, 2, 6, 8, 10 },
+    { -4, -7,  -8, -11, 3, 6, 7, 10 },
+    { -3, -5,  -8, -11, 2, 4, 7, 10 },
+    { -2, -6,  -8, -10, 1, 5, 7,  9 },
+    { -2, -5,  -8, -10, 1, 4, 7,  9 },
+    { -2, -4,  -8, -10, 1, 3, 7,  9 },
+    { -2, -5,  -7, -10, 1, 4, 6,  9 },
+    { -3, -4,  -7, -10, 2, 3, 6,  9 },
+    { -1, -2,  -3, -10, 0, 1, 2,  9 },
+    { -4, -6,  -8,  -9, 3, 5, 7,  8 },
+    { -3, -5,  -7,  -9, 2, 4, 6,  8 }
+};
+
+const int16x8_t g_alphaRange_NEON =  // the six distinct g_alphaRange values (first seen at rows 0, 1, 4, 5, 8, 14) plus two zero lanes
+{
+    (int16_t)g_alphaRange[0],
+    (int16_t)g_alphaRange[1],
+    (int16_t)g_alphaRange[4],
+    (int16_t)g_alphaRange[5],
+    (int16_t)g_alphaRange[8],
+    (int16_t)g_alphaRange[14],
+    0,
+    0
+};
+#endif
diff --git a/thirdparty/etcpak/Tables.hpp b/thirdparty/etcpak/Tables.hpp
new file mode 100644
index 0000000000..69d7e8aa07
--- /dev/null
+++ b/thirdparty/etcpak/Tables.hpp
@@ -0,0 +1,49 @@
+#ifndef __TABLES_HPP__
+#define __TABLES_HPP__
+
+#include <stdint.h>
+
+#ifdef __AVX2__
+#  include <immintrin.h>  // __m256i for the *_AVX tables
+#endif
+#ifdef __SSE4_1__
+#  include <smmintrin.h>  // __m128i for the *_SIMD tables
+#endif
+#ifdef __ARM_NEON
+#  include <arm_neon.h>  // int16x8_t / int32x4_t for the *_NEON tables
+#endif
+
+extern const int32_t g_table[8][4];  // 8 rows of { small, large, -small, -large } modifiers
+extern const int64_t g_table256[8][4];  // g_table scaled by 256
+
+extern const uint32_t g_id[4][16];  // ids 0..7 per entry; NOTE(review): meaning not visible here
+
+extern const uint32_t g_avg2[16];  // n * 0x11 for n in 0..15
+
+extern const uint32_t g_flags[64];  // packed flag words; NOTE(review): bit meaning not visible here
+
+extern const int32_t g_alpha[16][8];  // alpha modifier rows used by ProcessAlpha_ETC2
+extern const int32_t g_alphaRange[16];  // 0x100FF / ( span of g_alpha row + 1 )
+
+#ifdef __SSE4_1__
+extern const __m128i g_table_SIMD[2];  // g_table columns 0 and 1
+extern const __m128i g_table128_SIMD[2];  // the same columns scaled by 128
+extern const __m128i g_table256_SIMD[4];  // the same columns scaled by 256, 32-bit lanes
+
+extern const __m128i g_alpha_SIMD[16];  // g_alpha rows as 16-bit lanes
+extern const __m128i g_alphaRange_SIMD;  // six distinct g_alphaRange values plus two zero lanes
+#endif
+
+#ifdef __AVX2__
+extern const __m256i g_alpha_AVX[8];  // g_alpha transposed: one vector per column
+extern const __m256i g_alphaRange_AVX;  // all 16 g_alphaRange values
+#endif
+
+#ifdef __ARM_NEON
+extern const int16x8_t g_table128_NEON[2];  // g_table columns 0 and 1 scaled by 128
+extern const int32x4_t g_table256_NEON[4];  // the same columns scaled by 256, 32-bit lanes
+extern const int16x8_t g_alpha_NEON[16];  // g_alpha rows as 16-bit lanes
+extern const int16x8_t g_alphaRange_NEON;  // six distinct g_alphaRange values plus two zero lanes
+#endif
+
+#endif
diff --git a/thirdparty/etcpak/Vector.hpp b/thirdparty/etcpak/Vector.hpp
new file mode 100644
index 0000000000..3370a88aea
--- /dev/null
+++ b/thirdparty/etcpak/Vector.hpp
@@ -0,0 +1,222 @@
+#ifndef __DARKRL__VECTOR_HPP__
+#define __DARKRL__VECTOR_HPP__
+
+#include <assert.h>
+#include <algorithm>
+#include <math.h>
+#include <stdint.h>
+
+#include "Math.hpp"
+
+template<class T>
+struct Vector2 // two-component vector of T with component-wise comparison and compound assignment
+{
+    Vector2() : x( 0 ), y( 0 ) {} // default: both components zero
+    Vector2( T v ) : x( v ), y( v ) {} // non-explicit: a single scalar is broadcast to both components
+    Vector2( T _x, T _y ) : x( _x ), y( _y ) {}
+
+    bool operator==( const Vector2<T>& rhs ) const { return x == rhs.x && y == rhs.y; } // exact component equality
+    bool operator!=( const Vector2<T>& rhs ) const { return !( *this == rhs ); }
+
+    Vector2<T>& operator+=( const Vector2<T>& rhs ) // component-wise add in place; returns *this by reference
+    {
+        x += rhs.x;
+        y += rhs.y;
+        return *this;
+    }
+    Vector2<T>& operator-=( const Vector2<T>& rhs ) // component-wise subtract in place; returns *this by reference
+    {
+        x -= rhs.x;
+        y -= rhs.y;
+        return *this;
+    }
+    Vector2<T>& operator*=( const Vector2<T>& rhs ) // component-wise multiply in place; returns *this by reference
+    {
+        x *= rhs.x;
+        y *= rhs.y;
+        return *this;
+    }
+
+    T x, y;
+};
+
+template<class T>
+Vector2<T> operator+( const Vector2<T>& lhs, const Vector2<T>& rhs ) // component-wise sum, returned as a new vector
+{
+    return Vector2<T>( lhs.x + rhs.x, lhs.y + rhs.y );
+}
+
+template<class T>
+Vector2<T> operator-( const Vector2<T>& lhs, const Vector2<T>& rhs ) // component-wise difference, returned as a new vector
+{
+    return Vector2<T>( lhs.x - rhs.x, lhs.y - rhs.y );
+}
+
+template<class T>
+Vector2<T> operator*( const Vector2<T>& lhs, const float& rhs ) // scales both components by a float
+{
+    return Vector2<T>( lhs.x * rhs, lhs.y * rhs ); // each product is implicitly converted back to T by the constructor
+}
+
+template<class T>
+Vector2<T> operator/( const Vector2<T>& lhs, const T& rhs ) // divides both components by a scalar of type T; no zero check
+{
+    return Vector2<T>( lhs.x / rhs, lhs.y / rhs );
+}
+
+
+typedef Vector2<int32_t> v2i;
+typedef Vector2<float> v2f;
+
+
+template<class T>
+struct Vector3 // three-component vector of T, stored as four T members (x, y, z, padding)
+{
+    Vector3() : x( 0 ), y( 0 ), z( 0 ) {} // default: all three components zero
+    Vector3( T v ) : x( v ), y( v ), z( v ) {} // non-explicit: a single scalar is broadcast to all components
+    Vector3( T _x, T _y, T _z ) : x( _x ), y( _y ), z( _z ) {}
+    template<class Y>
+    Vector3( const Vector3<Y>& v ) : x( T( v.x ) ), y( T( v.y ) ), z( T( v.z ) ) {} // converts component type with a plain T() cast
+
+    T Luminance() const { return T( x * 0.3f + y * 0.59f + z * 0.11f ); } // weighted sum of components, cast back to T
+    void Clamp() // clamps each component to the range [T(0), T(1)] in place
+    {
+        x = std::min( T(1), std::max( T(0), x ) );
+        y = std::min( T(1), std::max( T(0), y ) );
+        z = std::min( T(1), std::max( T(0), z ) );
+    }
+
+    bool operator==( const Vector3<T>& rhs ) const { return x == rhs.x && y == rhs.y && z == rhs.z; }
+    bool operator!=( const Vector3<T>& rhs ) const { return !( *this == rhs ); } // takes Vector3 so that it pairs with operator== above
+
+    T& operator[]( unsigned int idx ) { assert( idx < 3 ); return ((T*)this)[idx]; } // indexes the object as a T array; assumes x, y, z are laid out contiguously
+    const T& operator[]( unsigned int idx ) const { assert( idx < 3 ); return ((T*)this)[idx]; }
+
+    Vector3<T> operator+=( const Vector3<T>& rhs ) // component-wise add in place; returns a copy of *this (by value)
+    {
+        x += rhs.x;
+        y += rhs.y;
+        z += rhs.z;
+        return *this;
+    }
+
+    Vector3<T> operator*=( const Vector3<T>& rhs ) // component-wise multiply in place; returns a copy of *this (by value)
+    {
+        x *= rhs.x;
+        y *= rhs.y;
+        z *= rhs.z;
+        return *this;
+    }
+
+    Vector3<T> operator*=( const float& rhs ) // scales every component by a float in place; returns a copy of *this
+    {
+        x *= rhs;
+        y *= rhs;
+        z *= rhs;
+        return *this;
+    }
+
+    T x, y, z;
+    T padding; // fourth member; not initialised by any constructor above and not read by any member function here
+};
+
+template<class T>
+Vector3<T> operator+( const Vector3<T>& lhs, const Vector3<T>& rhs ) // component-wise sum, returned as a new vector
+{
+    return Vector3<T>( lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z );
+}
+
+template<class T>
+Vector3<T> operator-( const Vector3<T>& lhs, const Vector3<T>& rhs ) // component-wise difference, returned as a new vector
+{
+    return Vector3<T>( lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z );
+}
+
+template<class T>
+Vector3<T> operator*( const Vector3<T>& lhs, const Vector3<T>& rhs ) // component-wise product (not a dot or cross product)
+{
+    return Vector3<T>( lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z );
+}
+
+template<class T>
+Vector3<T> operator*( const Vector3<T>& lhs, const float& rhs ) // scales every component by a float
+{
+    return Vector3<T>( T( lhs.x * rhs ), T( lhs.y * rhs ), T( lhs.z * rhs ) ); // each product is explicitly cast back to T
+}
+
+template<class T>
+Vector3<T> operator/( const Vector3<T>& lhs, const T& rhs ) // divides every component by a scalar of type T; no zero check
+{
+    return Vector3<T>( lhs.x / rhs, lhs.y / rhs, lhs.z / rhs );
+}
+
+template<class T>
+bool operator<( const Vector3<T>& lhs, const Vector3<T>& rhs ) // orders vectors by Luminance() only, not component by component
+{
+    return lhs.Luminance() < rhs.Luminance();
+}
+
+typedef Vector3<int32_t> v3i;
+typedef Vector3<float> v3f;
+typedef Vector3<uint8_t> v3b;
+
+
+static inline v3b v3f_to_v3b( const v3f& v ) // maps each float channel to a byte: clamp to [0,1], multiply by 255, truncate
+{ // the lower clamp keeps negative or NaN input away from an out-of-range float-to-uint8_t conversion (undefined behaviour)
+    return v3b( uint8_t( std::min( 1.f, std::max( 0.f, v.x ) ) * 255 ), uint8_t( std::min( 1.f, std::max( 0.f, v.y ) ) * 255 ), uint8_t( std::min( 1.f, std::max( 0.f, v.z ) ) * 255 ) );
+}
+
+template<class T>
+Vector3<T> Mix( const Vector3<T>& v1, const Vector3<T>& v2, float amount ) // linear blend: v1 at amount 0, v2 at amount 1
+{
+    return v1 + ( v2 - v1 ) * amount; // amount is not clamped here
+}
+
+template<>
+inline v3b Mix( const v3b& v1, const v3b& v2, float amount ) // v3b specialisation: blends in float instead of uint8_t arithmetic
+{
+    return v3b( v3f( v1 ) + ( v3f( v2 ) - v3f( v1 ) ) * amount ); // result is cast back to uint8_t per component, without clamping
+}
+
+template<class T>
+Vector3<T> Desaturate( const Vector3<T>& v ) // returns a grey vector whose three components all equal v.Luminance()
+{
+    T l = v.Luminance();
+    return Vector3<T>( l, l, l );
+}
+
+template<class T>
+Vector3<T> Desaturate( const Vector3<T>& v, float mul ) // grey vector with all components equal to v.Luminance() * mul
+{
+    T l = T( v.Luminance() * mul ); // the scaled luminance is cast back to T before being broadcast
+    return Vector3<T>( l, l, l );
+}
+
+template<class T>
+Vector3<T> pow( const Vector3<T>& base, float exponent ) // raises each component to the same exponent
+{
+    return Vector3<T>(
+        pow( base.x, exponent ), // unqualified call on a scalar T; resolves to whichever scalar pow overload is in scope
+        pow( base.y, exponent ),
+        pow( base.z, exponent ) );
+}
+
+template<class T>
+Vector3<T> sRGB2linear( const Vector3<T>& v ) // applies the scalar sRGB2linear to each component independently
+{
+    return Vector3<T>(
+        sRGB2linear( v.x ), // scalar overload is not defined in this file; presumably it comes from Math.hpp (verify)
+        sRGB2linear( v.y ),
+        sRGB2linear( v.z ) );
+}
+
+template<class T>
+Vector3<T> linear2sRGB( const Vector3<T>& v ) // applies the scalar linear2sRGB to each component independently
+{
+    return Vector3<T>(
+        linear2sRGB( v.x ), // scalar overload is not defined in this file; presumably it comes from Math.hpp (verify)
+        linear2sRGB( v.y ),
+        linear2sRGB( v.z ) );
+}
+
+#endif
diff --git a/thirdparty/icu4c/APIChangeReport.md b/thirdparty/icu4c/APIChangeReport.md
deleted file mode 100644
index 5385904fd1..0000000000
--- a/thirdparty/icu4c/APIChangeReport.md
+++ /dev/null
@@ -1,396 +0,0 @@
-
-  
-<!--
- Copyright © 2019 and later: Unicode, Inc. and others.
- License & terms of use: http://www.unicode.org/copyright.html
--->
-
-# ICU4C API Comparison: ICU 67 with ICU 68
-
-> _Note_ Markdown format of this document is new for ICU 65.
-
-- [Removed from ICU 67](#removed)
-- [Deprecated or Obsoleted in ICU 68](#deprecated)
-- [Changed in  ICU 68](#changed)
-- [Promoted to stable in ICU 68](#promoted)
-- [Added in ICU 68](#added)
-- [Other existing drafts in ICU 68](#other)
-- [Signature Simplifications](#simplifications)
-
-## Removed
-
-Removed from ICU 67
-  
-| File | API | ICU 67 | ICU 68 |
-|---|---|---|---|
-| fmtable.h | const UFormattable* icu::Formattable::toUFormattable() |  StableICU 52 | (missing)
-| measunit.h | LocalArray&lt;MeasureUnit&gt; icu::MeasureUnit::splitToSingleUnits(int32_t&amp;, UErrorCode&amp;) const |  InternalICU 67 | (missing)
-| measunit.h | int32_t icu::MeasureUnit::getIndex() const |  Internal | (missing)
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::resolveUnitPerUnit(const MeasureUnit&amp;, const MeasureUnit&amp;, bool*) |  Internal | (missing)
-| measunit.h | <tt>static</tt> int32_t icu::MeasureUnit::getIndexCount() |  Internal | (missing)
-| measunit.h | <tt>static</tt> int32_t icu::MeasureUnit::internalGetIndexForTypeAndSubtype(const char*, const char*) |  Internal | (missing)
-| nounit.h | UClassID icu::NoUnit::getDynamicClassID() const |  DraftICU 60 | (missing)
-| nounit.h | icu::NoUnit::NoUnit(const NoUnit&amp;) |  DraftICU 60 | (missing)
-| nounit.h | icu::NoUnit::~NoUnit() |  DraftICU 60 | (missing)
-| nounit.h | <tt>static</tt> NoUnit icu::NoUnit::base() |  DraftICU 60 | (missing)
-| nounit.h | <tt>static</tt> NoUnit icu::NoUnit::percent() |  DraftICU 60 | (missing)
-| nounit.h | <tt>static</tt> NoUnit icu::NoUnit::permille() |  DraftICU 60 | (missing)
-| nounit.h | <tt>static</tt> UClassID icu::NoUnit::getStaticClassID() |  DraftICU 60 | (missing)
-| nounit.h | void* icu::NoUnit::clone() const |  DraftICU 60 | (missing)
-| uniset.h | const USet* icu::UnicodeSet::toUSet() |  StableICU 4.2 | (missing)
-
-## Deprecated
-
-Deprecated or Obsoleted in ICU 68
-  
-| File | API | ICU 67 | ICU 68 |
-|---|---|---|---|
-| numberrangeformatter.h | UnicodeString icu::number::FormattedNumberRange::getFirstDecimal(UErrorCode&amp;) const |  DraftICU 63 | DeprecatedICU 68
-| numberrangeformatter.h | UnicodeString icu::number::FormattedNumberRange::getSecondDecimal(UErrorCode&amp;) const |  DraftICU 63 | DeprecatedICU 68
-| umachine.h | <tt>#define</tt> FALSE |  StableICU 2.0 | DeprecatedICU 68
-| umachine.h | <tt>#define</tt> TRUE |  StableICU 2.0 | DeprecatedICU 68
-
-## Changed
-
-Changed in  ICU 68 (old, new)
-
-
-  
-| File | API | ICU 67 | ICU 68 |
-|---|---|---|---|
-| bytestrie.h | BytesTrie&amp; icu::BytesTrie::resetToState64(uint64_t) |  Draft→StableICU 65
-| bytestrie.h | uint64_t icu::BytesTrie::getState64() const |  Draft→StableICU 65
-| listformatter.h | <tt>static</tt> ListFormatter* icu::ListFormatter::createInstance(const Locale&amp;, UListFormatterType, UListFormatterWidth, UErrorCode&amp;) |  Draft→StableICU 67
-| localebuilder.h | UBool icu::LocaleBuilder::copyErrorTo(UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::addSupportedLocale(const Locale&amp;) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::operator=(Builder&amp;&amp;) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setDefaultLocale(const Locale*) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setDemotionPerDesiredLocale(ULocMatchDemotion) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setFavorSubtag(ULocMatchFavorSubtag) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setSupportedLocales(Iter, Iter) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator&amp;) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setSupportedLocalesFromListString(StringPiece) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setSupportedLocalesViaConverter(Iter, Iter, Conv) |  Draft→StableICU 65
-| localematcher.h | Locale icu::LocaleMatcher::Result::makeResolvedLocale(UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | LocaleMatcher icu::LocaleMatcher::Builder::build(UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | LocaleMatcher&amp; icu::LocaleMatcher::operator=(LocaleMatcher&amp;&amp;) |  Draft→StableICU 65
-| localematcher.h | Result icu::LocaleMatcher::getBestMatchResult(Locale::Iterator&amp;, UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | Result icu::LocaleMatcher::getBestMatchResult(const Locale&amp;, UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | Result&amp; icu::LocaleMatcher::Result::operator=(Result&amp;&amp;) |  Draft→StableICU 65
-| localematcher.h | UBool icu::LocaleMatcher::Builder::copyErrorTo(UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | const Locale* icu::LocaleMatcher::Result::getDesiredLocale() const |  Draft→StableICU 65
-| localematcher.h | const Locale* icu::LocaleMatcher::Result::getSupportedLocale() const |  Draft→StableICU 65
-| localematcher.h | const Locale* icu::LocaleMatcher::getBestMatch(Locale::Iterator&amp;, UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | const Locale* icu::LocaleMatcher::getBestMatch(const Locale&amp;, UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | const Locale* icu::LocaleMatcher::getBestMatchForListString(StringPiece, UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | <tt>enum</tt> ULocMatchDemotion::ULOCMATCH_DEMOTION_NONE |  Draft→StableICU 65
-| localematcher.h | <tt>enum</tt> ULocMatchDemotion::ULOCMATCH_DEMOTION_REGION |  Draft→StableICU 65
-| localematcher.h | <tt>enum</tt> ULocMatchFavorSubtag::ULOCMATCH_FAVOR_LANGUAGE |  Draft→StableICU 65
-| localematcher.h | <tt>enum</tt> ULocMatchFavorSubtag::ULOCMATCH_FAVOR_SCRIPT |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::Builder::Builder() |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::Builder::Builder(Builder&amp;&amp;) |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::Builder::~Builder() |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::LocaleMatcher(LocaleMatcher&amp;&amp;) |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::Result::Result(Result&amp;&amp;) |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::Result::~Result() |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::~LocaleMatcher() |  Draft→StableICU 65
-| localematcher.h | int32_t icu::LocaleMatcher::Result::getDesiredIndex() const |  Draft→StableICU 65
-| localematcher.h | int32_t icu::LocaleMatcher::Result::getSupportedIndex() const |  Draft→StableICU 65
-| locid.h | UBool icu::Locale::ConvertingIterator&lt; Iter, Conv &gt;::hasNext() const override |  Draft→StableICU 65
-| locid.h | UBool icu::Locale::Iterator::hasNext() const |  Draft→StableICU 65
-| locid.h | UBool icu::Locale::RangeIterator&lt; Iter &gt;::hasNext() const override |  Draft→StableICU 65
-| locid.h | const Locale&amp; icu::Locale::ConvertingIterator&lt; Iter, Conv &gt;::next() override |  Draft→StableICU 65
-| locid.h | const Locale&amp; icu::Locale::Iterator::next() |  Draft→StableICU 65
-| locid.h | const Locale&amp; icu::Locale::RangeIterator&lt; Iter &gt;::next() override |  Draft→StableICU 65
-| locid.h | icu::Locale::ConvertingIterator&lt; Iter, Conv &gt;::ConvertingIterator(Iter, Iter, Conv) |  Draft→StableICU 65
-| locid.h | icu::Locale::Iterator::~Iterator() |  Draft→StableICU 65
-| locid.h | icu::Locale::RangeIterator&lt; Iter &gt;::RangeIterator(Iter, Iter) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getBar() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDecade() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDotPerCentimeter() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDotPerInch() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getEm() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getMegapixel() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPascal() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPixel() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPixelPerCentimeter() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPixelPerInch() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getThermUs() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createBar(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDecade(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDotPerCentimeter(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDotPerInch(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createEm(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createMegapixel(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPascal(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPixel(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPixelPerCentimeter(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPixelPerInch(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createThermUs(UErrorCode&amp;) |  Draft→StableICU 65
-| numberformatter.h | StringClass icu::number::FormattedNumber::toDecimalNumber(UErrorCode&amp;) const |  Draft→StableICU 65
-| numberrangeformatter.h | UnicodeString icu::number::FormattedNumberRange::getFirstDecimal(UErrorCode&amp;) const |  DraftICU 63 | DeprecatedICU 68
-| numberrangeformatter.h | UnicodeString icu::number::FormattedNumberRange::getSecondDecimal(UErrorCode&amp;) const |  DraftICU 63 | DeprecatedICU 68
-| reldatefmt.h | <tt>enum</tt> UDateAbsoluteUnit::UDAT_ABSOLUTE_HOUR |  Draft→StableICU 65
-| reldatefmt.h | <tt>enum</tt> UDateAbsoluteUnit::UDAT_ABSOLUTE_MINUTE |  Draft→StableICU 65
-| stringpiece.h | icu::StringPiece::StringPiece(T) |  Draft→StableICU 65
-| ucal.h | int32_t ucal_getHostTimeZone(UChar*, int32_t, UErrorCode*) |  Draft→StableICU 65
-| ucharstrie.h | UCharsTrie&amp; icu::UCharsTrie::resetToState64(uint64_t) |  Draft→StableICU 65
-| ucharstrie.h | uint64_t icu::UCharsTrie::getState64() const |  Draft→StableICU 65
-| ulistformatter.h | UListFormatter* ulistfmt_openForType(const char*, UListFormatterType, UListFormatterWidth, UErrorCode*) |  Draft→StableICU 67
-| ulistformatter.h | <tt>enum</tt> UListFormatterType::ULISTFMT_TYPE_AND |  Draft→StableICU 67
-| ulistformatter.h | <tt>enum</tt> UListFormatterType::ULISTFMT_TYPE_OR |  Draft→StableICU 67
-| ulistformatter.h | <tt>enum</tt> UListFormatterType::ULISTFMT_TYPE_UNITS |  Draft→StableICU 67
-| ulistformatter.h | <tt>enum</tt> UListFormatterWidth::ULISTFMT_WIDTH_NARROW |  Draft→StableICU 67
-| ulistformatter.h | <tt>enum</tt> UListFormatterWidth::ULISTFMT_WIDTH_SHORT |  Draft→StableICU 67
-| ulistformatter.h | <tt>enum</tt> UListFormatterWidth::ULISTFMT_WIDTH_WIDE |  Draft→StableICU 67
-| uloc.h | UEnumeration* uloc_openAvailableByType(ULocAvailableType, UErrorCode*) |  Draft→StableICU 65
-| uloc.h | <tt>enum</tt> ULocAvailableType::ULOC_AVAILABLE_DEFAULT |  Draft→StableICU 65
-| uloc.h | <tt>enum</tt> ULocAvailableType::ULOC_AVAILABLE_ONLY_LEGACY_ALIASES |  Draft→StableICU 65
-| uloc.h | <tt>enum</tt> ULocAvailableType::ULOC_AVAILABLE_WITH_LEGACY_ALIASES |  Draft→StableICU 65
-| umachine.h | <tt>#define</tt> FALSE |  StableICU 2.0 | DeprecatedICU 68
-| umachine.h | <tt>#define</tt> TRUE |  StableICU 2.0 | DeprecatedICU 68
-| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_BUNDLE |  Draft→StableICU 65
-| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_DATA_FILE |  Draft→StableICU 65
-| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_RES_FILE |  Draft→StableICU 65
-| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_START |  Draft→StableICU 65
-
-## Promoted
-
-Promoted to stable in ICU 68
-  
-| File | API | ICU 67 | ICU 68 |
-|---|---|---|---|
-| bytestrie.h | BytesTrie&amp; icu::BytesTrie::resetToState64(uint64_t) |  Draft→StableICU 65
-| bytestrie.h | uint64_t icu::BytesTrie::getState64() const |  Draft→StableICU 65
-| fmtable.h | UFormattable* icu::Formattable::toUFormattable() |  (missing) | StableICU 52
-| listformatter.h | <tt>static</tt> ListFormatter* icu::ListFormatter::createInstance(const Locale&amp;, UListFormatterType, UListFormatterWidth, UErrorCode&amp;) |  Draft→StableICU 67
-| localebuilder.h | UBool icu::LocaleBuilder::copyErrorTo(UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::addSupportedLocale(const Locale&amp;) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::operator=(Builder&amp;&amp;) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setDefaultLocale(const Locale*) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setDemotionPerDesiredLocale(ULocMatchDemotion) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setFavorSubtag(ULocMatchFavorSubtag) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setSupportedLocales(Iter, Iter) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator&amp;) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setSupportedLocalesFromListString(StringPiece) |  Draft→StableICU 65
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setSupportedLocalesViaConverter(Iter, Iter, Conv) |  Draft→StableICU 65
-| localematcher.h | Locale icu::LocaleMatcher::Result::makeResolvedLocale(UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | LocaleMatcher icu::LocaleMatcher::Builder::build(UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | LocaleMatcher&amp; icu::LocaleMatcher::operator=(LocaleMatcher&amp;&amp;) |  Draft→StableICU 65
-| localematcher.h | Result icu::LocaleMatcher::getBestMatchResult(Locale::Iterator&amp;, UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | Result icu::LocaleMatcher::getBestMatchResult(const Locale&amp;, UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | Result&amp; icu::LocaleMatcher::Result::operator=(Result&amp;&amp;) |  Draft→StableICU 65
-| localematcher.h | UBool icu::LocaleMatcher::Builder::copyErrorTo(UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | const Locale* icu::LocaleMatcher::Result::getDesiredLocale() const |  Draft→StableICU 65
-| localematcher.h | const Locale* icu::LocaleMatcher::Result::getSupportedLocale() const |  Draft→StableICU 65
-| localematcher.h | const Locale* icu::LocaleMatcher::getBestMatch(Locale::Iterator&amp;, UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | const Locale* icu::LocaleMatcher::getBestMatch(const Locale&amp;, UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | const Locale* icu::LocaleMatcher::getBestMatchForListString(StringPiece, UErrorCode&amp;) const |  Draft→StableICU 65
-| localematcher.h | <tt>enum</tt> ULocMatchDemotion::ULOCMATCH_DEMOTION_NONE |  Draft→StableICU 65
-| localematcher.h | <tt>enum</tt> ULocMatchDemotion::ULOCMATCH_DEMOTION_REGION |  Draft→StableICU 65
-| localematcher.h | <tt>enum</tt> ULocMatchFavorSubtag::ULOCMATCH_FAVOR_LANGUAGE |  Draft→StableICU 65
-| localematcher.h | <tt>enum</tt> ULocMatchFavorSubtag::ULOCMATCH_FAVOR_SCRIPT |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::Builder::Builder() |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::Builder::Builder(Builder&amp;&amp;) |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::Builder::~Builder() |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::LocaleMatcher(LocaleMatcher&amp;&amp;) |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::Result::Result(Result&amp;&amp;) |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::Result::~Result() |  Draft→StableICU 65
-| localematcher.h | icu::LocaleMatcher::~LocaleMatcher() |  Draft→StableICU 65
-| localematcher.h | int32_t icu::LocaleMatcher::Result::getDesiredIndex() const |  Draft→StableICU 65
-| localematcher.h | int32_t icu::LocaleMatcher::Result::getSupportedIndex() const |  Draft→StableICU 65
-| locid.h | UBool icu::Locale::ConvertingIterator&lt; Iter, Conv &gt;::hasNext() const override |  Draft→StableICU 65
-| locid.h | UBool icu::Locale::Iterator::hasNext() const |  Draft→StableICU 65
-| locid.h | UBool icu::Locale::RangeIterator&lt; Iter &gt;::hasNext() const override |  Draft→StableICU 65
-| locid.h | const Locale&amp; icu::Locale::ConvertingIterator&lt; Iter, Conv &gt;::next() override |  Draft→StableICU 65
-| locid.h | const Locale&amp; icu::Locale::Iterator::next() |  Draft→StableICU 65
-| locid.h | const Locale&amp; icu::Locale::RangeIterator&lt; Iter &gt;::next() override |  Draft→StableICU 65
-| locid.h | icu::Locale::ConvertingIterator&lt; Iter, Conv &gt;::ConvertingIterator(Iter, Iter, Conv) |  Draft→StableICU 65
-| locid.h | icu::Locale::Iterator::~Iterator() |  Draft→StableICU 65
-| locid.h | icu::Locale::RangeIterator&lt; Iter &gt;::RangeIterator(Iter, Iter) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getBar() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDecade() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDotPerCentimeter() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDotPerInch() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getEm() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getMegapixel() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPascal() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPixel() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPixelPerCentimeter() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPixelPerInch() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getThermUs() |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createBar(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDecade(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDotPerCentimeter(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDotPerInch(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createEm(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createMegapixel(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPascal(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPixel(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPixelPerCentimeter(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPixelPerInch(UErrorCode&amp;) |  Draft→StableICU 65
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createThermUs(UErrorCode&amp;) |  Draft→StableICU 65
-| numberformatter.h | StringClass icu::number::FormattedNumber::toDecimalNumber(UErrorCode&amp;) const |  Draft→StableICU 65
-| reldatefmt.h | <tt>enum</tt> UDateAbsoluteUnit::UDAT_ABSOLUTE_HOUR |  Draft→StableICU 65
-| reldatefmt.h | <tt>enum</tt> UDateAbsoluteUnit::UDAT_ABSOLUTE_MINUTE |  Draft→StableICU 65
-| stringpiece.h | icu::StringPiece::StringPiece(T) |  Draft→StableICU 65
-| ucal.h | int32_t ucal_getHostTimeZone(UChar*, int32_t, UErrorCode*) |  Draft→StableICU 65
-| ucharstrie.h | UCharsTrie&amp; icu::UCharsTrie::resetToState64(uint64_t) |  Draft→StableICU 65
-| ucharstrie.h | uint64_t icu::UCharsTrie::getState64() const |  Draft→StableICU 65
-| ulistformatter.h | UListFormatter* ulistfmt_openForType(const char*, UListFormatterType, UListFormatterWidth, UErrorCode*) |  Draft→StableICU 67
-| ulistformatter.h | <tt>enum</tt> UListFormatterType::ULISTFMT_TYPE_AND |  Draft→StableICU 67
-| ulistformatter.h | <tt>enum</tt> UListFormatterType::ULISTFMT_TYPE_OR |  Draft→StableICU 67
-| ulistformatter.h | <tt>enum</tt> UListFormatterType::ULISTFMT_TYPE_UNITS |  Draft→StableICU 67
-| ulistformatter.h | <tt>enum</tt> UListFormatterWidth::ULISTFMT_WIDTH_NARROW |  Draft→StableICU 67
-| ulistformatter.h | <tt>enum</tt> UListFormatterWidth::ULISTFMT_WIDTH_SHORT |  Draft→StableICU 67
-| ulistformatter.h | <tt>enum</tt> UListFormatterWidth::ULISTFMT_WIDTH_WIDE |  Draft→StableICU 67
-| uloc.h | UEnumeration* uloc_openAvailableByType(ULocAvailableType, UErrorCode*) |  Draft→StableICU 65
-| uloc.h | <tt>enum</tt> ULocAvailableType::ULOC_AVAILABLE_DEFAULT |  Draft→StableICU 65
-| uloc.h | <tt>enum</tt> ULocAvailableType::ULOC_AVAILABLE_ONLY_LEGACY_ALIASES |  Draft→StableICU 65
-| uloc.h | <tt>enum</tt> ULocAvailableType::ULOC_AVAILABLE_WITH_LEGACY_ALIASES |  Draft→StableICU 65
-| uniset.h | USet* icu::UnicodeSet::toUSet() |  (missing) | StableICU 4.2
-| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_BUNDLE |  Draft→StableICU 65
-| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_DATA_FILE |  Draft→StableICU 65
-| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_RES_FILE |  Draft→StableICU 65
-| utrace.h | <tt>enum</tt> UTraceFunctionNumber::UTRACE_UDATA_START |  Draft→StableICU 65
-
-## Added
-
-Added in ICU 68
-  
-| File | API | ICU 67 | ICU 68 |
-|---|---|---|---|
-| dtitvfmt.h | UDisplayContext icu::DateIntervalFormat::getContext(UDisplayContextType, UErrorCode&amp;) const |  (missing) | DraftICU 68
-| dtitvfmt.h | void icu::DateIntervalFormat::setContext(UDisplayContext, UErrorCode&amp;) |  (missing) | DraftICU 68
-| dtptngen.h | <tt>static</tt> DateTimePatternGenerator* icu::DateTimePatternGenerator::createInstanceNoStdPat(const Locale&amp;, UErrorCode&amp;) |  (missing) | Internal
-| fmtable.h | UFormattable* icu::Formattable::toUFormattable() |  (missing) | StableICU 52
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setMaxDistance(const Locale&amp;, const Locale&amp;) |  (missing) | DraftICU 68
-| localematcher.h | Builder&amp; icu::LocaleMatcher::Builder::setNoDefaultLocale() |  (missing) | DraftICU 68
-| localematcher.h | UBool icu::LocaleMatcher::isMatch(const Locale&amp;, const Locale&amp;, UErrorCode&amp;) const |  (missing) | DraftICU 68
-| measunit.h | int32_t icu::MeasureUnit::getOffset() const |  (missing) | Internal
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getCandela() |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDessertSpoon() |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDessertSpoonImperial() |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDot() |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDram() |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getDrop() |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getEarthRadius() |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getGrain() |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getJigger() |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getLumen() |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getPinch() |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit icu::MeasureUnit::getQuartImperial() |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createCandela(UErrorCode&amp;) |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDessertSpoon(UErrorCode&amp;) |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDessertSpoonImperial(UErrorCode&amp;) |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDot(UErrorCode&amp;) |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDram(UErrorCode&amp;) |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createDrop(UErrorCode&amp;) |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createEarthRadius(UErrorCode&amp;) |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createGrain(UErrorCode&amp;) |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createJigger(UErrorCode&amp;) |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createLumen(UErrorCode&amp;) |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createPinch(UErrorCode&amp;) |  (missing) | DraftICU 68
-| measunit.h | <tt>static</tt> MeasureUnit* icu::MeasureUnit::createQuartImperial(UErrorCode&amp;) |  (missing) | DraftICU 68
-| measunit.h | std::pair&lt; LocalArray&lt; MeasureUnit &gt;, int32_t &gt; icu::MeasureUnit::splitToSingleUnits(UErrorCode&amp;) const |  (missing) | DraftICU 68
-| numberformatter.h | Derived icu::number::NumberFormatterSettings&lt; Derived &gt;::usage(StringPiece) const&amp; |  (missing) | DraftICU 68
-| numberformatter.h | Derived icu::number::NumberFormatterSettings&lt; Derived &gt;::usage(StringPiece)&amp;&amp; |  (missing) | DraftICU 68
-| numberformatter.h | MeasureUnit icu::number::FormattedNumber::getOutputUnit(UErrorCode&amp;) const |  (missing) | DraftICU 68
-| numberformatter.h | Usage&amp; icu::number::impl::Usage::operator=(Usage&amp;&amp;) |  (missing) | Internal
-| numberformatter.h | Usage&amp; icu::number::impl::Usage::operator=(const Usage&amp;) |  (missing) | Internal
-| numberformatter.h | bool icu::number::impl::Usage::isSet() const |  (missing) | Internal
-| numberformatter.h | icu::number::impl::Usage::Usage(Usage&amp;&amp;) |  (missing) | Internal
-| numberformatter.h | icu::number::impl::Usage::Usage(const Usage&amp;) |  (missing) | Internal
-| numberformatter.h | icu::number::impl::Usage::~Usage() |  (missing) | Internal
-| numberformatter.h | int16_t icu::number::impl::Usage::length() const |  (missing) | Internal
-| numberformatter.h | void icu::number::impl::Usage::set(StringPiece) |  (missing) | Internal
-| numberrangeformatter.h | std::pair&lt; StringClass, StringClass &gt; icu::number::FormattedNumberRange::getDecimalNumbers(UErrorCode&amp;) const |  (missing) | DraftICU 68
-| plurrule.h | UnicodeString icu::PluralRules::select(const number::FormattedNumberRange&amp;, UErrorCode&amp;) const |  (missing) | DraftICU 68
-| plurrule.h | UnicodeString icu::PluralRules::select(const number::impl::UFormattedNumberRangeData*, UErrorCode&amp;) const |  (missing) | Internal
-| plurrule.h | int32_t icu::PluralRules::getSamples(const UnicodeString&amp;, FixedDecimal*, int32_t, UErrorCode&amp;) |  (missing) | Internal
-| timezone.h | <tt>static</tt> TimeZone* icu::TimeZone::forLocaleOrDefault(const Locale&amp;) |  (missing) | Internal
-| ucurr.h | <tt>enum</tt> UCurrNameStyle::UCURR_FORMAL_SYMBOL_NAME |  (missing) | DraftICU 68
-| ucurr.h | <tt>enum</tt> UCurrNameStyle::UCURR_VARIANT_SYMBOL_NAME |  (missing) | DraftICU 68
-| udateintervalformat.h | UDisplayContext udtitvfmt_getContext(const UDateIntervalFormat*, UDisplayContextType, UErrorCode*) |  (missing) | DraftICU 68
-| udateintervalformat.h | void udtitvfmt_setContext(UDateIntervalFormat*, UDisplayContext, UErrorCode*) |  (missing) | DraftICU 68
-| umachine.h | <tt>#define</tt> U_DEFINE_FALSE_AND_TRUE |  (missing) | InternalICU 68
-| uniset.h | USet* icu::UnicodeSet::toUSet() |  (missing) | StableICU 4.2
-| unum.h | <tt>enum</tt> UNumberFormatMinimumGroupingDigits::UNUM_MINIMUM_GROUPING_DIGITS_AUTO |  (missing) | DraftICU 68
-| unum.h | <tt>enum</tt> UNumberFormatMinimumGroupingDigits::UNUM_MINIMUM_GROUPING_DIGITS_MIN2 |  (missing) | DraftICU 68
-| unumberformatter.h | <tt>enum</tt> UNumberUnitWidth::UNUM_UNIT_WIDTH_FORMAL |  (missing) | DraftICU 68
-| unumberformatter.h | <tt>enum</tt> UNumberUnitWidth::UNUM_UNIT_WIDTH_VARIANT |  (missing) | DraftICU 68
-| unumberformatter.h | int32_t unumf_resultToDecimalNumber(const UFormattedNumber*, char*, int32_t, UErrorCode*) |  (missing) | DraftICU 68
-| unumberrangeformatter.h | UFormattedNumberRange* unumrf_openResult(UErrorCode*) |  (missing) | DraftICU 68
-| unumberrangeformatter.h | UNumberRangeFormatter* unumrf_openForSkeletonWithCollapseAndIdentityFallback(const UChar*, int32_t, UNumberRangeCollapse, UNumberRangeIdentityFallback, const char*, UParseError*, UErrorCode*) |  (missing) | DraftICU 68
-| unumberrangeformatter.h | UNumberRangeIdentityResult unumrf_resultGetIdentityResult(const UFormattedNumberRange*, UErrorCode*) |  (missing) | DraftICU 68
-| unumberrangeformatter.h | const UFormattedValue* unumrf_resultAsValue(const UFormattedNumberRange*, UErrorCode*) |  (missing) | DraftICU 68
-| unumberrangeformatter.h | int32_t unumrf_resultGetFirstDecimalNumber(const UFormattedNumberRange*, char*, int32_t, UErrorCode*) |  (missing) | DraftICU 68
-| unumberrangeformatter.h | int32_t unumrf_resultGetSecondDecimalNumber(const UFormattedNumberRange*, char*, int32_t, UErrorCode*) |  (missing) | DraftICU 68
-| unumberrangeformatter.h | void unumrf_close(UNumberRangeFormatter*) |  (missing) | DraftICU 68
-| unumberrangeformatter.h | void unumrf_closeResult(UFormattedNumberRange*) |  (missing) | DraftICU 68
-| unumberrangeformatter.h | void unumrf_formatDecimalRange(const UNumberRangeFormatter*, const char*, int32_t, const char*, int32_t, UFormattedNumberRange*, UErrorCode*) |  (missing) | DraftICU 68
-| unumberrangeformatter.h | void unumrf_formatDoubleRange(const UNumberRangeFormatter*, double, double, UFormattedNumberRange*, UErrorCode*) |  (missing) | DraftICU 68
-| upluralrules.h | int32_t uplrules_selectForRange(const UPluralRules*, const struct UFormattedNumberRange*, UChar*, int32_t, UErrorCode*) |  (missing) | DraftICU 68
-
-## Other
-
-Other existing drafts in ICU 68
-
-| File | API | ICU 67 | ICU 68 |
-|---|---|---|---|
-| bytestream.h |  void icu::ByteSink::AppendU8(const char*, int32_t) | DraftICU 67 | 
-| bytestream.h |  void icu::ByteSink::AppendU8(const char8_t*, int32_t) | DraftICU 67 | 
-| dtptngen.h |  UDateFormatHourCycle icu::DateTimePatternGenerator::getDefaultHourCycle(UErrorCode&amp;) const | DraftICU 67 | 
-| localematcher.h |  Builder&amp; icu::LocaleMatcher::Builder::setDirection(ULocMatchDirection) | DraftICU 67 | 
-| localematcher.h |  <tt>enum</tt> ULocMatchDirection::ULOCMATCH_DIRECTION_ONLY_TWO_WAY | DraftICU 67 | 
-| localematcher.h |  <tt>enum</tt> ULocMatchDirection::ULOCMATCH_DIRECTION_WITH_ONE_WAY | DraftICU 67 | 
-| locid.h |  void icu::Locale::canonicalize(UErrorCode&amp;) | DraftICU 67 | 
-| measfmt.h |  void icu::MeasureFormat::parseObject(const UnicodeString&amp;, Formattable&amp;, ParsePosition&amp;) const | DraftICU 53 | 
-| measunit.h |  MeasureUnit icu::MeasureUnit::product(const MeasureUnit&amp;, UErrorCode&amp;) const | DraftICU 67 | 
-| measunit.h |  MeasureUnit icu::MeasureUnit::reciprocal(UErrorCode&amp;) const | DraftICU 67 | 
-| measunit.h |  MeasureUnit icu::MeasureUnit::withDimensionality(int32_t, UErrorCode&amp;) const | DraftICU 67 | 
-| measunit.h |  MeasureUnit icu::MeasureUnit::withSIPrefix(UMeasureSIPrefix, UErrorCode&amp;) const | DraftICU 67 | 
-| measunit.h |  MeasureUnit&amp; icu::MeasureUnit::operator=(MeasureUnit&amp;&amp;) noexcept | DraftICU 67 | 
-| measunit.h |  UMeasureSIPrefix icu::MeasureUnit::getSIPrefix(UErrorCode&amp;) const | DraftICU 67 | 
-| measunit.h |  UMeasureUnitComplexity icu::MeasureUnit::getComplexity(UErrorCode&amp;) const | DraftICU 67 | 
-| measunit.h |  const char* icu::MeasureUnit::getIdentifier() const | DraftICU 67 | 
-| measunit.h |  icu::MeasureUnit::MeasureUnit(MeasureUnit&amp;&amp;) noexcept | DraftICU 67 | 
-| measunit.h |  int32_t icu::MeasureUnit::getDimensionality(UErrorCode&amp;) const | DraftICU 67 | 
-| measunit.h |  <tt>static</tt> MeasureUnit icu::MeasureUnit::forIdentifier(StringPiece, UErrorCode&amp;) | DraftICU 67 | 
-| stringpiece.h |  icu::StringPiece::StringPiece(const char8_t*) | DraftICU 67 | 
-| stringpiece.h |  icu::StringPiece::StringPiece(const char8_t*, int32_t) | DraftICU 67 | 
-| stringpiece.h |  icu::StringPiece::StringPiece(const std::u8string&amp;) | DraftICU 67 | 
-| stringpiece.h |  icu::StringPiece::StringPiece(std::nullptr_t) | DraftICU 67 | 
-| stringpiece.h |  int32_t icu::StringPiece::compare(StringPiece) | DraftICU 67 | 
-| stringpiece.h |  int32_t icu::StringPiece::find(StringPiece, int32_t) | DraftICU 67 | 
-| stringpiece.h |  void icu::StringPiece::set(const char8_t*) | DraftICU 67 | 
-| stringpiece.h |  void icu::StringPiece::set(const char8_t*, int32_t) | DraftICU 67 | 
-| udat.h |  <tt>enum</tt> UDateFormatHourCycle::UDAT_HOUR_CYCLE_11 | DraftICU 67 | 
-| udat.h |  <tt>enum</tt> UDateFormatHourCycle::UDAT_HOUR_CYCLE_12 | DraftICU 67 | 
-| udat.h |  <tt>enum</tt> UDateFormatHourCycle::UDAT_HOUR_CYCLE_23 | DraftICU 67 | 
-| udat.h |  <tt>enum</tt> UDateFormatHourCycle::UDAT_HOUR_CYCLE_24 | DraftICU 67 | 
-| udateintervalformat.h |  void udtitvfmt_formatCalendarToResult(const UDateIntervalFormat*, UCalendar*, UCalendar*, UFormattedDateInterval*, UErrorCode*) | DraftICU 67 | 
-| udateintervalformat.h |  void udtitvfmt_formatToResult(const UDateIntervalFormat*, UDate, UDate, UFormattedDateInterval*, UErrorCode*) | DraftICU 67 | 
-| udatpg.h |  UDateFormatHourCycle udatpg_getDefaultHourCycle(const UDateTimePatternGenerator*, UErrorCode*) | DraftICU 67 | 
-| uregex.h |  <tt>enum</tt> URegexpFlag::UREGEX_CANON_EQ | DraftICU 2.4 | 
-| utrace.h |  <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_CREATE_BREAK_ENGINE | DraftICU 67 | 
-| utrace.h |  <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_CREATE_CHARACTER | DraftICU 67 | 
-| utrace.h |  <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_CREATE_LINE | DraftICU 67 | 
-| utrace.h |  <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_CREATE_SENTENCE | DraftICU 67 | 
-| utrace.h |  <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_CREATE_TITLE | DraftICU 67 | 
-| utrace.h |  <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_CREATE_WORD | DraftICU 67 | 
-| utrace.h |  <tt>enum</tt> UTraceFunctionNumber::UTRACE_UBRK_START | DraftICU 67 | 
-
-## Simplifications
-
-This section shows cases where the signature was "simplified" for the sake of comparison. The simplified form is in bold, followed by
-    all possible variations in "original" form.
-
-
-## Colophon
-
-Contents generated by StableAPI tool on Fri Oct 23 11:32:42 PDT 2020
-
-Copyright © 2019 and later: Unicode, Inc. and others.
-License & terms of use: http://www.unicode.org/copyright.html
-  
-\ No newline at end of file
diff --git a/thirdparty/icu4c/common/bytestriebuilder.cpp b/thirdparty/icu4c/common/bytestriebuilder.cpp
index ec1ab7d8f5..28256f272a 100644
--- a/thirdparty/icu4c/common/bytestriebuilder.cpp
+++ b/thirdparty/icu4c/common/bytestriebuilder.cpp
@@ -474,31 +474,39 @@ BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
     U_ASSERT(i>=0);
     if(i<=BytesTrie::kMaxOneByteDelta) {
         return write(i);
+    } else {
+        char intBytes[5];
+        return write(intBytes, internalEncodeDelta(i, intBytes));
     }
-    char intBytes[5];
-    int32_t length;
+}
+
+int32_t
+BytesTrieBuilder::internalEncodeDelta(int32_t i, char intBytes[]) {
+    U_ASSERT(i>=0);
+    if(i<=BytesTrie::kMaxOneByteDelta) {
+        intBytes[0]=(char)i;
+        return 1;
+    }
+    int32_t length=1;
     if(i<=BytesTrie::kMaxTwoByteDelta) {
         intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
-        length=1;
     } else {
         if(i<=BytesTrie::kMaxThreeByteDelta) {
             intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
-            length=2;
         } else {
             if(i<=0xffffff) {
                 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
-                length=3;
             } else {
                 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
                 intBytes[1]=(char)(i>>24);
-                length=4;
+                length=2;
             }
-            intBytes[1]=(char)(i>>16);
+            intBytes[length++]=(char)(i>>16);
         }
-        intBytes[1]=(char)(i>>8);
+        intBytes[length++]=(char)(i>>8);
     }
     intBytes[length++]=(char)i;
-    return write(intBytes, length);
+    return length;
 }
 
 U_NAMESPACE_END
diff --git a/thirdparty/icu4c/common/charstr.cpp b/thirdparty/icu4c/common/charstr.cpp
index 318a185b3f..c35622882c 100644
--- a/thirdparty/icu4c/common/charstr.cpp
+++ b/thirdparty/icu4c/common/charstr.cpp
@@ -14,6 +14,8 @@
 *   created by: Markus W. Scherer
 */
 
+#include <cstdlib>
+
 #include "unicode/utypes.h"
 #include "unicode/putil.h"
 #include "charstr.h"
@@ -141,6 +143,38 @@ CharString &CharString::append(const char *s, int32_t sLength, UErrorCode &error
     return *this;
 }
 
+CharString &CharString::appendNumber(int32_t number, UErrorCode &status) {
+    if (number < 0) {
+        this->append('-', status);
+        if (U_FAILURE(status)) {
+            return *this;
+        }
+    }
+
+    if (number == 0) {
+        this->append('0', status);
+        return *this;
+    }
+
+    int32_t numLen = 0;
+    while (number != 0) {
+        int32_t residue = number % 10;
+        number /= 10;
+        this->append(std::abs(residue) + '0', status);
+        numLen++;
+        if (U_FAILURE(status)) {
+            return *this;
+        }
+    }
+
+    int32_t start = this->length() - numLen, end = this->length() - 1;
+    while(start < end) {
+        std::swap(this->data()[start++], this->data()[end--]);
+    }
+
+    return *this;
+}
+
 char *CharString::getAppendBuffer(int32_t minCapacity,
                                   int32_t desiredCapacityHint,
                                   int32_t &resultCapacity,
diff --git a/thirdparty/icu4c/common/charstr.h b/thirdparty/icu4c/common/charstr.h
index 6619faac61..175acd1c0a 100644
--- a/thirdparty/icu4c/common/charstr.h
+++ b/thirdparty/icu4c/common/charstr.h
@@ -127,6 +127,9 @@ public:
         return append(s.data(), s.length(), errorCode);
     }
     CharString &append(const char *s, int32_t sLength, UErrorCode &status);
+
+    CharString &appendNumber(int32_t number, UErrorCode &status);
+
     /**
      * Returns a writable buffer for appending and writes the buffer's capacity to
      * resultCapacity. Guarantees resultCapacity>=minCapacity if U_SUCCESS().
diff --git a/thirdparty/icu4c/common/cmemory.h b/thirdparty/icu4c/common/cmemory.h
index a9d9424b4e..f03b7dcce6 100644
--- a/thirdparty/icu4c/common/cmemory.h
+++ b/thirdparty/icu4c/common/cmemory.h
@@ -31,14 +31,63 @@
 #include <stddef.h>
 #include <string.h>
 #include "unicode/localpointer.h"
+#include "uassert.h"
 
 #if U_DEBUG && defined(UPRV_MALLOC_COUNT)
 #include <stdio.h>
 #endif
 
-
-#define uprv_memcpy(dst, src, size) U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size)
-#define uprv_memmove(dst, src, size) U_STANDARD_CPP_NAMESPACE memmove(dst, src, size)
+// uprv_memcpy and uprv_memmove
+#if defined(__clang__)
+#define uprv_memcpy(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
+    /* Suppress warnings about addresses that will never be NULL */ \
+    _Pragma("clang diagnostic push") \
+    _Pragma("clang diagnostic ignored \"-Waddress\"") \
+    U_ASSERT(dst != NULL); \
+    U_ASSERT(src != NULL); \
+    _Pragma("clang diagnostic pop") \
+    U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size); \
+} UPRV_BLOCK_MACRO_END
+#define uprv_memmove(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
+    /* Suppress warnings about addresses that will never be NULL */ \
+    _Pragma("clang diagnostic push") \
+    _Pragma("clang diagnostic ignored \"-Waddress\"") \
+    U_ASSERT(dst != NULL); \
+    U_ASSERT(src != NULL); \
+    _Pragma("clang diagnostic pop") \
+    U_STANDARD_CPP_NAMESPACE memmove(dst, src, size); \
+} UPRV_BLOCK_MACRO_END
+#elif defined(__GNUC__)
+#define uprv_memcpy(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
+    /* Suppress warnings about addresses that will never be NULL */ \
+    _Pragma("GCC diagnostic push") \
+    _Pragma("GCC diagnostic ignored \"-Waddress\"") \
+    U_ASSERT(dst != NULL); \
+    U_ASSERT(src != NULL); \
+    _Pragma("GCC diagnostic pop") \
+    U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size); \
+} UPRV_BLOCK_MACRO_END
+#define uprv_memmove(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
+    /* Suppress warnings about addresses that will never be NULL */ \
+    _Pragma("GCC diagnostic push") \
+    _Pragma("GCC diagnostic ignored \"-Waddress\"") \
+    U_ASSERT(dst != NULL); \
+    U_ASSERT(src != NULL); \
+    _Pragma("GCC diagnostic pop") \
+    U_STANDARD_CPP_NAMESPACE memmove(dst, src, size); \
+} UPRV_BLOCK_MACRO_END
+#else
+#define uprv_memcpy(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
+    U_ASSERT(dst != NULL); \
+    U_ASSERT(src != NULL); \
+    U_STANDARD_CPP_NAMESPACE memcpy(dst, src, size); \
+} UPRV_BLOCK_MACRO_END
+#define uprv_memmove(dst, src, size) UPRV_BLOCK_MACRO_BEGIN { \
+    U_ASSERT(dst != NULL); \
+    U_ASSERT(src != NULL); \
+    U_STANDARD_CPP_NAMESPACE memmove(dst, src, size); \
+} UPRV_BLOCK_MACRO_END
+#endif
 
 /**
  * \def UPRV_LENGTHOF
diff --git a/thirdparty/icu4c/common/dictbe.cpp b/thirdparty/icu4c/common/dictbe.cpp
index b42cdf03fa..44285755f3 100644
--- a/thirdparty/icu4c/common/dictbe.cpp
+++ b/thirdparty/icu4c/common/dictbe.cpp
@@ -265,13 +265,9 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                 goto foundBest;
             }
             do {
-                int32_t wordsMatched = 1;
                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
-                    if (wordsMatched < 2) {
-                        // Followed by another dictionary word; mark first word as a good candidate
-                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
-                        wordsMatched = 2;
-                    }
+                    // Followed by another dictionary word; mark first word as a good candidate
+                    words[wordsFound%THAI_LOOKAHEAD].markCurrent();
                     
                     // If we're already at the end of the range, we're done
                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
@@ -503,13 +499,9 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text,
                 goto foundBest;
             }
             do {
-                int32_t wordsMatched = 1;
                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
-                    if (wordsMatched < 2) {
-                        // Followed by another dictionary word; mark first word as a good candidate
-                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
-                        wordsMatched = 2;
-                    }
+                    // Followed by another dictionary word; mark first word as a good candidate
+                    words[wordsFound%LAO_LOOKAHEAD].markCurrent();
                     
                     // If we're already at the end of the range, we're done
                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
@@ -699,13 +691,9 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
                 goto foundBest;
             }
             do {
-                int32_t wordsMatched = 1;
                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
-                    if (wordsMatched < 2) {
-                        // Followed by another dictionary word; mark first word as a good candidate
-                        words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
-                        wordsMatched = 2;
-                    }
+                    // Followed by another dictionary word; mark first word as a good candidate
+                    words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
                     
                     // If we're already at the end of the range, we're done
                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
@@ -908,13 +896,9 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                 goto foundBest;
             }
             do {
-                int32_t wordsMatched = 1;
                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
-                    if (wordsMatched < 2) {
-                        // Followed by another dictionary word; mark first word as a good candidate
-                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
-                        wordsMatched = 2;
-                    }
+                    // Followed by another dictionary word; mark first word as a good candidate
+                    words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
 
                     // If we're already at the end of the range, we're done
                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
diff --git a/thirdparty/icu4c/common/edits.cpp b/thirdparty/icu4c/common/edits.cpp
index 95f0c19a72..92ca36fb5d 100644
--- a/thirdparty/icu4c/common/edits.cpp
+++ b/thirdparty/icu4c/common/edits.cpp
@@ -86,6 +86,7 @@ Edits &Edits::moveArray(Edits &src) U_NOEXCEPT {
 }
 
 Edits &Edits::operator=(const Edits &other) {
+    if (this == &other) { return *this; }  // self-assignment: no-op
     length = other.length;
     delta = other.delta;
     numChanges = other.numChanges;
diff --git a/thirdparty/icu4c/common/filteredbrk.cpp b/thirdparty/icu4c/common/filteredbrk.cpp
index c07128cbce..25080f9d33 100644
--- a/thirdparty/icu4c/common/filteredbrk.cpp
+++ b/thirdparty/icu4c/common/filteredbrk.cpp
@@ -20,6 +20,7 @@
 #include "ubrkimpl.h" // U_ICUDATA_BRKITR
 #include "uvector.h"
 #include "cmemory.h"
+#include "umutex.h"
 
 U_NAMESPACE_BEGIN
 
@@ -139,13 +140,30 @@ class SimpleFilteredSentenceBreakData : public UMemory {
 public:
   SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) 
       : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
-  SimpleFilteredSentenceBreakData *incr() { refcount++;  return this; }
-  SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; }
-  virtual ~SimpleFilteredSentenceBreakData();
+    SimpleFilteredSentenceBreakData *incr() {
+        umtx_atomic_inc(&refcount);
+        return this;
+    }
+    SimpleFilteredSentenceBreakData *decr() {
+        if(umtx_atomic_dec(&refcount) <= 0) {
+            delete this;
+        }
+        return 0;
+    }
+    virtual ~SimpleFilteredSentenceBreakData();
+
+    bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); }
+    bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); }
 
-  LocalPointer<UCharsTrie>    fForwardsPartialTrie; //  Has ".a" for "a.M."
-  LocalPointer<UCharsTrie>    fBackwardsTrie; //  i.e. ".srM" for Mrs.
-  int32_t                     refcount;
+    const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; }
+    const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; }
+
+private:
+    // These tries own their data arrays.
+    // They are shared and must therefore not be modified.
+    LocalPointer<UCharsTrie>    fForwardsPartialTrie; //  Has ".a" for "a.M."
+    LocalPointer<UCharsTrie>    fBackwardsTrie; //  i.e. ".srM" for Mrs.
+    u_atomic_int32_t            refcount;
 };
 
 SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
@@ -244,7 +262,13 @@ SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIt
   fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
   fDelegate(adopt)
 {
-  // all set..
+    if (fData == nullptr) {
+        delete forwards;
+        delete backwards;
+        if (U_SUCCESS(status)) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+        }
+    }
 }
 
 SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
@@ -261,59 +285,62 @@ SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
     int32_t bestValue = -1;
     // loops while 'n' points to an exception.
     utext_setNativeIndex(fText.getAlias(), n); // from n..
-    fData->fBackwardsTrie->reset();
-    UChar32 uch;
 
     //if(debug2) u_printf(" n@ %d\n", n);
     // Assume a space is following the '.'  (so we handle the case:  "Mr. /Brown")
-    if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) {  // TODO: skip a class of chars here??
+    if(utext_previous32(fText.getAlias())==u' ') {  // TODO: skip a class of chars here??
       // TODO only do this the 1st time?
       //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
     } else {
       //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
-      uch = utext_next32(fText.getAlias());
+      utext_next32(fText.getAlias());
       //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
     }
 
-    UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
-
-    while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL  &&   // more to consume backwards and..
-          USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
-      if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
-        bestPosn = utext_getNativeIndex(fText.getAlias());
-        bestValue = fData->fBackwardsTrie->getValue();
-      }
-      //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
+    {
+        // Do not modify the shared trie!
+        UCharsTrie iter(fData->getBackwardsTrie());
+        UChar32 uch;
+        while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) {  // more to consume backwards
+            UStringTrieResult r = iter.nextForCodePoint(uch);
+            if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
+                bestPosn = utext_getNativeIndex(fText.getAlias());
+                bestValue = iter.getValue();
+            }
+            if(!USTRINGTRIE_HAS_NEXT(r)) {
+                break;
+            }
+            //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
+        }
     }
 
-    if(USTRINGTRIE_MATCHES(r)) { // exact match?
-      //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
-      bestValue = fData->fBackwardsTrie->getValue();
-      bestPosn = utext_getNativeIndex(fText.getAlias());
-      //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
-    }
+    //if(bestValue >= 0) {
+        //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
+    //}
 
     if(bestPosn>=0) {
       //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
 
       //if(USTRINGTRIE_MATCHES(r)) {  // matched - so, now what?
-      //int32_t bestValue = fBackwardsTrie->getValue();
+      //int32_t bestValue = iter.getValue();
       ////if(debug2) u_printf("rev< /%C/ matched, skip..%d  bestValue=%d\n", (UChar)uch, r, bestValue);
 
       if(bestValue == kMATCH) { // exact match!
         //if(debug2) u_printf(" exact backward match\n");
         return kExceptionHere; // See if the next is another exception.
       } else if(bestValue == kPARTIAL
-                && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
+                && fData->hasForwardsPartialTrie()) { // make sure there's a forward trie
         //if(debug2) u_printf(" partial backward match\n");
         // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
         // to see if it matches something going forward.
-        fData->fForwardsPartialTrie->reset();
         UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
         utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
         //if(debug2) u_printf("Retrying at %d\n", bestPosn);
+        // Do not modify the shared trie!
+        UCharsTrie iter(fData->getForwardsPartialTrie());
+        UChar32 uch;
         while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
-              USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) {
+              USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) {
           //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
         }
         if(USTRINGTRIE_MATCHES(rfwd)) {
@@ -339,7 +366,7 @@ SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
 int32_t
 SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
   if(n == UBRK_DONE || // at end  or
-    fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
+    !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
       return n;
   }
   // OK, do we need to break here?
@@ -369,7 +396,7 @@ SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
 int32_t
 SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
   if(n == 0 || n == UBRK_DONE || // at end  or
-    fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
+    !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
       return n;
   }
   // OK, do we need to break here?
@@ -420,7 +447,7 @@ SimpleFilteredSentenceBreakIterator::previous(void) {
 UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
   if (!fDelegate->isBoundary(offset)) return false; // no break to suppress
 
-  if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions
+  if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions
 
   UErrorCode status = U_ZERO_ERROR;
   resetState(status);
diff --git a/thirdparty/icu4c/common/hash.h b/thirdparty/icu4c/common/hash.h
index f02cb7087a..b927ddb3c3 100644
--- a/thirdparty/icu4c/common/hash.h
+++ b/thirdparty/icu4c/common/hash.h
@@ -85,16 +85,22 @@ public:
 
     inline int32_t puti(const UnicodeString& key, int32_t value, UErrorCode& status);
 
+    inline int32_t putiAllowZero(const UnicodeString& key, int32_t value, UErrorCode& status);
+
     inline void* get(const UnicodeString& key) const;
 
     inline int32_t geti(const UnicodeString& key) const;
 
+    inline int32_t getiAndFound(const UnicodeString& key, UBool &found) const;
+
     inline void* remove(const UnicodeString& key);
 
     inline int32_t removei(const UnicodeString& key);
 
     inline void removeAll(void);
 
+    inline UBool containsKey(const UnicodeString& key) const;
+
     inline const UHashElement* find(const UnicodeString& key) const;
 
     /**
@@ -203,6 +209,11 @@ inline int32_t Hashtable::puti(const UnicodeString& key, int32_t value, UErrorCo
     return uhash_puti(hash, new UnicodeString(key), value, &status);
 }
 
+inline int32_t Hashtable::putiAllowZero(const UnicodeString& key, int32_t value,
+                                        UErrorCode& status) {
+    return uhash_putiAllowZero(hash, new UnicodeString(key), value, &status);
+}
+
 inline void* Hashtable::get(const UnicodeString& key) const {
     return uhash_get(hash, &key);
 }
@@ -211,6 +222,10 @@ inline int32_t Hashtable::geti(const UnicodeString& key) const {
     return uhash_geti(hash, &key);
 }
 
+inline int32_t Hashtable::getiAndFound(const UnicodeString& key, UBool &found) const {
+    return uhash_getiAndFound(hash, &key, &found);
+}
+
 inline void* Hashtable::remove(const UnicodeString& key) {
     return uhash_remove(hash, &key);
 }
@@ -219,6 +234,10 @@ inline int32_t Hashtable::removei(const UnicodeString& key) {
     return uhash_removei(hash, &key);
 }
 
+inline UBool Hashtable::containsKey(const UnicodeString& key) const {
+    return uhash_containsKey(hash, &key);
+}
+
 inline const UHashElement* Hashtable::find(const UnicodeString& key) const {
     return uhash_find(hash, &key);
 }
diff --git a/thirdparty/icu4c/common/localematcher.cpp b/thirdparty/icu4c/common/localematcher.cpp
index 5795cbf87e..132aee290e 100644
--- a/thirdparty/icu4c/common/localematcher.cpp
+++ b/thirdparty/icu4c/common/localematcher.cpp
@@ -345,9 +345,8 @@ UBool compareLSRs(const UHashTok t1, const UHashTok t2) {
 int32_t LocaleMatcher::putIfAbsent(const LSR &lsr, int32_t i, int32_t suppLength,
                                    UErrorCode &errorCode) {
     if (U_FAILURE(errorCode)) { return suppLength; }
-    int32_t index = uhash_geti(supportedLsrToIndex, &lsr);
-    if (index == 0) {
-        uhash_puti(supportedLsrToIndex, const_cast<LSR *>(&lsr), i + 1, &errorCode);
+    if (!uhash_containsKey(supportedLsrToIndex, &lsr)) {
+        uhash_putiAllowZero(supportedLsrToIndex, const_cast<LSR *>(&lsr), i, &errorCode);
         if (U_SUCCESS(errorCode)) {
             supportedLSRs[suppLength] = &lsr;
             supportedIndexes[suppLength++] = i;
@@ -685,12 +684,11 @@ int32_t LocaleMatcher::getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remai
     int32_t bestSupportedLsrIndex = -1;
     for (int32_t bestShiftedDistance = LocaleDistance::shiftDistance(thresholdDistance);;) {
         // Quick check for exact maximized LSR.
-        // Returns suppIndex+1 where 0 means not found.
         if (supportedLsrToIndex != nullptr) {
             desiredLSR.setHashCode();
-            int32_t index = uhash_geti(supportedLsrToIndex, &desiredLSR);
-            if (index != 0) {
-                int32_t suppIndex = index - 1;
+            UBool found = false;
+            int32_t suppIndex = uhash_getiAndFound(supportedLsrToIndex, &desiredLSR, &found);
+            if (found) {
                 if (remainingIter != nullptr) {
                     remainingIter->rememberCurrent(desiredIndex, errorCode);
                 }
diff --git a/thirdparty/icu4c/common/localeprioritylist.cpp b/thirdparty/icu4c/common/localeprioritylist.cpp
index 8916b121be..4455eedb75 100644
--- a/thirdparty/icu4c/common/localeprioritylist.cpp
+++ b/thirdparty/icu4c/common/localeprioritylist.cpp
@@ -187,17 +187,18 @@ bool LocalePriorityList::add(const Locale &locale, int32_t weight, UErrorCode &e
         if (U_FAILURE(errorCode)) { return false; }
     }
     LocalPointer<Locale> clone;
-    int32_t index = uhash_geti(map, &locale);
-    if (index != 0) {
+    UBool found = false;
+    int32_t index = uhash_getiAndFound(map, &locale, &found);
+    if (found) {
         // Duplicate: Remove the old item and append it anew.
-        LocaleAndWeight &lw = list->array[index - 1];
+        LocaleAndWeight &lw = list->array[index];
         clone.adoptInstead(lw.locale);
         lw.locale = nullptr;
         lw.weight = 0;
         ++numRemoved;
     }
     if (weight <= 0) {  // do not add q=0
-        if (index != 0) {
+        if (found) {
             // Not strictly necessary but cleaner.
             uhash_removei(map, &locale);
         }
@@ -217,7 +218,7 @@ bool LocalePriorityList::add(const Locale &locale, int32_t weight, UErrorCode &e
             return false;
         }
     }
-    uhash_puti(map, clone.getAlias(), listLength + 1, &errorCode);
+    uhash_putiAllowZero(map, clone.getAlias(), listLength, &errorCode);
     if (U_FAILURE(errorCode)) { return false; }
     LocaleAndWeight &lw = list->array[listLength];
     lw.locale = clone.orphan();
diff --git a/thirdparty/icu4c/common/locdispnames.cpp b/thirdparty/icu4c/common/locdispnames.cpp
index 47c0667417..96af3f9aa8 100644
--- a/thirdparty/icu4c/common/locdispnames.cpp
+++ b/thirdparty/icu4c/common/locdispnames.cpp
@@ -698,7 +698,7 @@ uloc_getDisplayName(const char *locale,
                     } /* end switch */
 
                     if (len>0) {
-                        /* we addeed a component, so add separator and write it if there's room. */
+                        /* we added a component, so add separator and write it if there's room. */
                         if(len+sepLen<=cap) {
                             const UChar * plimit = p + len;
                             for (; p < plimit; p++) {
diff --git a/thirdparty/icu4c/common/locid.cpp b/thirdparty/icu4c/common/locid.cpp
index 874e4a7055..0d506293a9 100644
--- a/thirdparty/icu4c/common/locid.cpp
+++ b/thirdparty/icu4c/common/locid.cpp
@@ -254,7 +254,7 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Locale)
 
 Locale::~Locale()
 {
-    if (baseName != fullName) {
+    if ((baseName != fullName) && (baseName != fullNameBuffer)) {
         uprv_free(baseName);
     }
     baseName = NULL;
@@ -466,7 +466,7 @@ Locale& Locale::operator=(const Locale& other) {
 }
 
 Locale& Locale::operator=(Locale&& other) U_NOEXCEPT {
-    if (baseName != fullName) uprv_free(baseName);
+    if ((baseName != fullName) && (baseName != fullNameBuffer)) uprv_free(baseName);
     if (fullName != fullNameBuffer) uprv_free(fullName);
 
     if (other.fullName == other.fullNameBuffer) {
@@ -524,7 +524,7 @@ static const char* const KNOWN_CANONICALIZED[] = {
     "km", "km_KH", "kn", "kn_IN", "ko", "ko_KR", "ky", "ky_KG", "lo", "lo_LA",
     "lt", "lt_LT", "lv", "lv_LV", "mk", "mk_MK", "ml", "ml_IN", "mn", "mn_MN",
     "mr", "mr_IN", "ms", "ms_MY", "my", "my_MM", "nb", "nb_NO", "ne", "ne_NP",
-    "nl", "nl_NL", "or", "or_IN", "pa", "pa_IN", "pl", "pl_PL", "ps", "ps_AF",
+    "nl", "nl_NL", "no", "or", "or_IN", "pa", "pa_IN", "pl", "pl_PL", "ps", "ps_AF",
     "pt", "pt_BR", "pt_PT", "ro", "ro_RO", "ru", "ru_RU", "sd", "sd_IN", "si",
     "si_LK", "sk", "sk_SK", "sl", "sl_SI", "so", "so_SO", "sq", "sq_AL", "sr",
     "sr_Cyrl_RS", "sr_Latn", "sr_RS", "sv", "sv_SE", "sw", "sw_TZ", "ta",
@@ -627,6 +627,17 @@ private:
                           LocalMemory<const char*>& types,
                           LocalMemory<int32_t>& replacementIndexes,
                           int32_t &length, UErrorCode &status);
+
+    // Read the subdivisionAlias data from alias to
+    // strings+types+replacementIndexes.
+    // Allocate length items for types, to store the type field.
+    // Allocate length items for replacementIndexes,
+    // to store the index in the strings for the replacement subdivision.
+    void readSubdivisionAlias(UResourceBundle* alias,
+                          UniqueCharStrings* strings,
+                          LocalMemory<const char*>& types,
+                          LocalMemory<int32_t>& replacementIndexes,
+                          int32_t &length, UErrorCode &status);
 };
 
 /**
@@ -647,6 +658,7 @@ public:
     const CharStringMap& scriptMap() const { return script; }
     const CharStringMap& territoryMap() const { return territory; }
     const CharStringMap& variantMap() const { return variant; }
+    const CharStringMap& subdivisionMap() const { return subdivision; }
 
     static void U_CALLCONV loadData(UErrorCode &status);
     static UBool U_CALLCONV cleanup();
@@ -658,11 +670,13 @@ private:
               CharStringMap scriptMap,
               CharStringMap territoryMap,
               CharStringMap variantMap,
+              CharStringMap subdivisionMap,
               CharString* strings)
         : language(std::move(languageMap)),
           script(std::move(scriptMap)),
           territory(std::move(territoryMap)),
           variant(std::move(variantMap)),
+          subdivision(std::move(subdivisionMap)),
           strings(strings) {
     }
 
@@ -676,6 +690,7 @@ private:
     CharStringMap script;
     CharStringMap territory;
     CharStringMap variant;
+    CharStringMap subdivision;
     CharString* strings;
 
     friend class AliasDataBuilder;
@@ -867,6 +882,34 @@ AliasDataBuilder::readVariantAlias(
 }
 
 /**
+ * Read the subdivisionAlias data from alias to strings+types+replacementIndexes.
+ * Allocate length items for types, to store the type field. Allocate length
+ * items for replacementIndexes, to store the index in the strings for the
+ * replacement subdivisions.
+ */
+void
+AliasDataBuilder::readSubdivisionAlias(
+        UResourceBundle* alias,
+        UniqueCharStrings* strings,
+        LocalMemory<const char*>& types,
+        LocalMemory<int32_t>& replacementIndexes,
+        int32_t &length,
+        UErrorCode &status)
+{
+    return readAlias(  // All of the reading is delegated to readAlias().
+        alias, strings, types, replacementIndexes, length,
+#if U_DEBUG
+        [](const char* type) {
+            U_ASSERT(uprv_strlen(type) >= 3 && uprv_strlen(type) <= 8);  // type is 3..8 chars
+        },
+#else
+        [](const char*) {},  // Non-debug builds: the type is not checked.
+#endif
+        [](const UnicodeString&) { },  // No-op; presumably the replacement check (confirm in readAlias).
+        status);
+}
+
+/**
  * Initializes the alias data from the ICU resource bundles. The alias data
  * contains alias of language, country, script and variants.
  *
@@ -905,12 +948,14 @@ AliasDataBuilder::build(UErrorCode &status) {
         ures_getByKey(metadataAlias.getAlias(), "territory", nullptr, &status));
     LocalUResourceBundlePointer variantAlias(
         ures_getByKey(metadataAlias.getAlias(), "variant", nullptr, &status));
+    LocalUResourceBundlePointer subdivisionAlias(
+        ures_getByKey(metadataAlias.getAlias(), "subdivision", nullptr, &status));
 
     if (U_FAILURE(status)) {
         return nullptr;
     }
     int32_t languagesLength = 0, scriptLength = 0, territoryLength = 0,
-            variantLength = 0;
+            variantLength = 0, subdivisionLength = 0;
 
     // Read the languageAlias into languageTypes, languageReplacementIndexes
     // and strings
@@ -955,6 +1000,16 @@ AliasDataBuilder::build(UErrorCode &status) {
                      variantReplacementIndexes,
                      variantLength, status);
 
+    // Read the subdivisionAlias into subdivisionTypes, subdivisionReplacementIndexes
+    // and strings
+    LocalMemory<const char*> subdivisionTypes;
+    LocalMemory<int32_t> subdivisionReplacementIndexes;
+    readSubdivisionAlias(subdivisionAlias.getAlias(),
+                         &strings,
+                         subdivisionTypes,
+                         subdivisionReplacementIndexes,
+                         subdivisionLength, status);
+
     if (U_FAILURE(status)) {
         return nullptr;
     }
@@ -994,6 +1049,14 @@ AliasDataBuilder::build(UErrorCode &status) {
                        status);
     }
 
+    // Build the subdivisionMap from subdivisionTypes & subdivisionReplacementIndexes.
+    CharStringMap subdivisionMap(2, status);
+    for (int32_t i = 0; U_SUCCESS(status) && i < subdivisionLength; i++) {
+        subdivisionMap.put(subdivisionTypes[i],
+                       strings.get(subdivisionReplacementIndexes[i]),
+                       status);
+    }
+
     if (U_FAILURE(status)) {
         return nullptr;
     }
@@ -1004,6 +1067,7 @@ AliasDataBuilder::build(UErrorCode &status) {
         std::move(scriptMap),
         std::move(territoryMap),
         std::move(variantMap),
+        std::move(subdivisionMap),
         strings.orphanCharStrings());
 
     if (data == nullptr) {
@@ -1105,6 +1169,14 @@ private:
 
     // Replace by using variantAlias.
     bool replaceVariant(UErrorCode& status);
+
+    // Replace by using subdivisionAlias.
+    bool replaceSubdivision(StringPiece subdivision,
+                            CharString& output, UErrorCode& status);
+
+    // Replace transformed extensions.
+    bool replaceTransformedExtensions(
+        CharString& transformedExtensions, CharString& output, UErrorCode& status);
 };
 
 CharString&
@@ -1294,7 +1366,6 @@ AliasReplacer::replaceLanguage(
             }
         }
         if (replacedExtensions != nullptr) {
-            // TODO(ICU-21292)
             // DO NOTHING
             // UTS35 does not specifiy what should we do if we have extensions in the
             // replacement. Currently we know only the following 4 "BCP47 LegacyRules" have
@@ -1435,6 +1506,106 @@ AliasReplacer::replaceVariant(UErrorCode& status)
     return false;
 }
 
+// Looks up |subdivision| in the subdivision alias map. If an alias exists, the
+// first space-separated token of the replacement is appended to |output|.
+// Returns true whenever an alias entry is found, false otherwise or on failure.
+bool
+AliasReplacer::replaceSubdivision(
+    StringPiece subdivision, CharString& output, UErrorCode& status)
+{
+    if (U_FAILURE(status)) { return false; }
+    const char *alias = data->subdivisionMap().get(subdivision.data());
+    if (alias == nullptr) { return false; }  // No alias for this subdivision.
+    // Use only the text before the first space, or all of it if there is none.
+    const char* spacePos = uprv_strchr(alias, ' ');
+    size_t tokenLen = (spacePos == nullptr) ?
+        uprv_strlen(alias) : (spacePos - alias);
+    if (tokenLen < 2 || tokenLen > 8) {
+        return true;  // Alias found, but its length is out of range: append nothing.
+    }
+    output.append(alias, (int32_t)tokenLen, status);
+    if (tokenLen == 2) {
+        // Add 'zzzz' based on changes to UTS #35 for CLDR-14312.
+        output.append("zzzz", 4, status);
+    }
+    return true;
+}
+
+// Canonicalizes the value of a "t" (transformed content) extension into output:
+// the tlang is canonicalized and lowercased, and the tfields are sorted by tkey.
+bool
+AliasReplacer::replaceTransformedExtensions(
+    CharString& transformedExtensions, CharString& output, UErrorCode& status)
+{
+    // The content of the transformedExtensions will be modified in this
+    // function to NULL-terminating (tkey-tvalue) pairs.
+    if (U_FAILURE(status)) {
+        return false;
+    }
+    int32_t len = transformedExtensions.length();
+    const char* str = transformedExtensions.data();
+    const char* tkey = ultag_getTKeyStart(str);
+    int32_t tlangLen = (tkey == str) ? 0 :
+        ((tkey == nullptr) ? len : static_cast<int32_t>((tkey - str - 1)));
+    CharStringByteSink sink(&output);
+    if (tlangLen > 0) {
+        Locale tlang = LocaleBuilder()
+            .setLanguageTag(StringPiece(str, tlangLen))
+            .build(status);
+        tlang.canonicalize(status);
+        tlang.toLanguageTag(sink, status);
+        if (U_FAILURE(status)) {
+            return false;
+        }
+        T_CString_toLowerCase(output.data());
+    }
+    if (tkey != nullptr) {
+        // We need to sort the tfields by tkey
+        UVector tfields(status);
+        if (U_FAILURE(status)) {
+            return false;
+        }
+        do {
+            const char* tvalue = uprv_strchr(tkey, '-');
+            if (tvalue == nullptr) {
+                status = U_ILLEGAL_ARGUMENT_ERROR;
+                return false;  // Malformed tfield; tvalue must not be used below.
+            }
+            const char* nextTKey = ultag_getTKeyStart(tvalue);
+            if (nextTKey != nullptr) {
+                *((char*)(nextTKey-1)) = '\0';  // NULL terminate tvalue
+            }
+            tfields.insertElementAt((void*)tkey, tfields.size(), status);
+            if (U_FAILURE(status)) {
+                return false;
+            }
+            tkey = nextTKey;
+        } while (tkey != nullptr);
+        tfields.sort([](UElement e1, UElement e2) -> int8_t {
+            // uprv_strcmp returns an int that, on some platforms such as
+            // arm64-v8a, may exceed 127; casting it directly to int8_t could
+            // flip its sign, so map the result to -1, 0 or 1 explicitly.
+            int res = uprv_strcmp(
+                (const char*)e1.pointer, (const char*)e2.pointer);
+            return (res == 0) ? 0 : ((res > 0) ? 1 : -1);
+        }, status);
+        for (int32_t i = 0; i < tfields.size(); i++) {
+             if (output.length() > 0) {
+                 output.append('-', status);
+             }
+             const char* tfield = (const char*) tfields.elementAt(i);
+             const char* tvalue = uprv_strchr(tfield, '-');
+             // Split the "tkey-tvalue" pair string so that we can canonicalize the tvalue.
+             U_ASSERT(tvalue != nullptr);
+             *((char*)tvalue++) = '\0'; // NULL terminate tkey
+             output.append(tfield, status).append('-', status);
+             const char* bcpTValue = ulocimp_toBcpType(tfield, tvalue, nullptr, nullptr);
+             output.append((bcpTValue == nullptr) ? tvalue : bcpTValue, status);
+        }
+    }
+    return U_SUCCESS(status);
+}
+
 CharString&
 AliasReplacer::outputToString(
     CharString& out, UErrorCode status)
@@ -1453,8 +1624,12 @@ AliasReplacer::outputToString(
           out.append(SEP_CHAR, status);
         }
         variants.sort([](UElement e1, UElement e2) -> int8_t {
-            return uprv_strcmp(
+            // uprv_strcmp return int and in some platform, such as arm64-v8a,
+            // it may return positive values > 127 which cause the casted value
+            // of int8_t negative.
+            int res = uprv_strcmp(
                 (const char*)e1.pointer, (const char*)e2.pointer);
+            return (res == 0) ? 0 : ((res > 0) ? 1 : -1);
         }, status);
         int32_t variantsStart = out.length();
         for (int32_t i = 0; i < variants.size(); i++) {
@@ -1497,7 +1672,6 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status
         region = nullptr;
     }
     const char* variantsStr = locale.getVariant();
-    const char* extensionsStr = locale_getKeywordsStart(locale.getName());
     CharString variantsBuff(variantsStr, -1, status);
     if (!variantsBuff.isEmpty()) {
         if (U_FAILURE(status)) { return false; }
@@ -1516,8 +1690,12 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status
 
     // Sort the variants
     variants.sort([](UElement e1, UElement e2) -> int8_t {
-        return uprv_strcmp(
+        // uprv_strcmp return int and in some platform, such as arm64-v8a,
+        // it may return positive values > 127 which cause the casted value
+        // of int8_t negative.
+        int res = uprv_strcmp(
             (const char*)e1.pointer, (const char*)e2.pointer);
+        return (res == 0) ? 0 : ((res > 0) ? 1 : -1);
     }, status);
 
     // A changed count to assert when loop too many times.
@@ -1561,11 +1739,52 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status
     if (U_FAILURE(status)) { return false; }
     // Nothing changed and we know the order of the vaiants are not change
     // because we have no variant or only one.
-    if (changed == 0 && variants.size() <= 1) {
+    const char* extensionsStr = locale_getKeywordsStart(locale.getName());
+    if (changed == 0 && variants.size() <= 1 && extensionsStr == nullptr) {
         return false;
     }
     outputToString(out, status);
+    if (U_FAILURE(status)) {
+        return false;
+    }
     if (extensionsStr != nullptr) {
+        changed = 0;
+        Locale temp(locale);
+        LocalPointer<icu::StringEnumeration> iter(locale.createKeywords(status));
+        if (U_SUCCESS(status) && !iter.isNull()) {
+            const char* key;
+            while ((key = iter->next(nullptr, status)) != nullptr) {
+                if (uprv_strcmp("sd", key) == 0 || uprv_strcmp("rg", key) == 0 ||
+                        uprv_strcmp("t", key) == 0) {
+                    CharString value;
+                    CharStringByteSink valueSink(&value);
+                    locale.getKeywordValue(key, valueSink, status);
+                    if (U_FAILURE(status)) {
+                        status = U_ZERO_ERROR;
+                        continue;
+                    }
+                    CharString replacement;
+                    if (uprv_strlen(key) == 2) {
+                        if (replaceSubdivision(value.toStringPiece(), replacement, status)) {
+                            changed++;
+                            temp.setKeywordValue(key, replacement.data(), status);
+                        }
+                    } else {
+                        U_ASSERT(uprv_strcmp(key, "t") == 0);
+                        if (replaceTransformedExtensions(value, replacement, status)) {
+                            changed++;
+                            temp.setKeywordValue(key, replacement.data(), status);
+                        }
+                    }
+                    if (U_FAILURE(status)) {
+                        return false;
+                    }
+                }
+            }
+        }
+        if (changed != 0) {
+            extensionsStr = locale_getKeywordsStart(temp.getName());
+        }
         out.append(extensionsStr, status);
     }
     if (U_FAILURE(status)) {
@@ -1573,8 +1792,6 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status
     }
     // If the tag is not changed, return.
     if (uprv_strcmp(out.data(), locale.getName()) == 0) {
-        U_ASSERT(changed == 0);
-        U_ASSERT(variants.size() > 1);
         out.clear();
         return false;
     }
@@ -1636,7 +1853,7 @@ Locale& Locale::init(const char* localeID, UBool canonicalize)
 {
     fIsBogus = FALSE;
     /* Free our current storage */
-    if (baseName != fullName) {
+    if ((baseName != fullName) && (baseName != fullNameBuffer)) {
         uprv_free(baseName);
     }
     baseName = NULL;
@@ -1672,6 +1889,7 @@ Locale& Locale::init(const char* localeID, UBool canonicalize)
             uloc_getName(localeID, fullName, sizeof(fullNameBuffer), &err);
 
         if(err == U_BUFFER_OVERFLOW_ERROR || length >= (int32_t)sizeof(fullNameBuffer)) {
+            U_ASSERT(baseName == nullptr);
             /*Go to heap for the fullName if necessary*/
             fullName = (char *)uprv_malloc(sizeof(char)*(length + 1));
             if(fullName == 0) {
@@ -1825,7 +2043,7 @@ Locale::hashCode() const
 void
 Locale::setToBogus() {
     /* Free our current storage */
-    if(baseName != fullName) {
+    if((baseName != fullName) && (baseName != fullNameBuffer)) {
         uprv_free(baseName);
     }
     baseName = NULL;
diff --git a/thirdparty/icu4c/common/loclikelysubtags.cpp b/thirdparty/icu4c/common/loclikelysubtags.cpp
index a031bfa587..aa592e6ea8 100644
--- a/thirdparty/icu4c/common/loclikelysubtags.cpp
+++ b/thirdparty/icu4c/common/loclikelysubtags.cpp
@@ -320,7 +320,8 @@ XLikelySubtags::~XLikelySubtags() {
 LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const {
     const char *name = locale.getName();
     if (uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '=') {  // name.startsWith("@x=")
-        // Private use language tag x-subtag-subtag...
+        // Private use language tag x-subtag-subtag... which CLDR changes to
+        // und-x-subtag-subtag...
         return LSR(name, "", "", LSR::EXPLICIT_LSR);
     }
     return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
diff --git a/thirdparty/icu4c/common/norm2allmodes.h b/thirdparty/icu4c/common/norm2allmodes.h
index e8bd52c6ae..584835da57 100644
--- a/thirdparty/icu4c/common/norm2allmodes.h
+++ b/thirdparty/icu4c/common/norm2allmodes.h
@@ -38,7 +38,7 @@ public:
     virtual UnicodeString &
     normalize(const UnicodeString &src,
               UnicodeString &dest,
-              UErrorCode &errorCode) const {
+              UErrorCode &errorCode) const U_OVERRIDE {
         if(U_FAILURE(errorCode)) {
             dest.setToBogus();
             return dest;
@@ -64,13 +64,13 @@ public:
     virtual UnicodeString &
     normalizeSecondAndAppend(UnicodeString &first,
                              const UnicodeString &second,
-                             UErrorCode &errorCode) const {
+                             UErrorCode &errorCode) const U_OVERRIDE {
         return normalizeSecondAndAppend(first, second, true, errorCode);
     }
     virtual UnicodeString &
     append(UnicodeString &first,
            const UnicodeString &second,
-           UErrorCode &errorCode) const {
+           UErrorCode &errorCode) const U_OVERRIDE {
         return normalizeSecondAndAppend(first, second, false, errorCode);
     }
     UnicodeString &
@@ -107,7 +107,7 @@ public:
                        UnicodeString &safeMiddle,
                        ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;
     virtual UBool
-    getDecomposition(UChar32 c, UnicodeString &decomposition) const {
+    getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE {
         UChar buffer[4];
         int32_t length;
         const UChar *d=impl.getDecomposition(c, buffer, length);
@@ -122,7 +122,7 @@ public:
         return true;
     }
     virtual UBool
-    getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
+    getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE {
         UChar buffer[30];
         int32_t length;
         const UChar *d=impl.getRawDecomposition(c, buffer, length);
@@ -137,18 +137,18 @@ public:
         return true;
     }
     virtual UChar32
-    composePair(UChar32 a, UChar32 b) const {
+    composePair(UChar32 a, UChar32 b) const U_OVERRIDE {
         return impl.composePair(a, b);
     }
 
     virtual uint8_t
-    getCombiningClass(UChar32 c) const {
+    getCombiningClass(UChar32 c) const U_OVERRIDE {
         return impl.getCC(impl.getNorm16(c));
     }
 
     // quick checks
     virtual UBool
-    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
+    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE {
         if(U_FAILURE(errorCode)) {
             return false;
         }
@@ -161,11 +161,11 @@ public:
         return sLimit==spanQuickCheckYes(sArray, sLimit, errorCode);
     }
     virtual UNormalizationCheckResult
-    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
+    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE {
         return Normalizer2WithImpl::isNormalized(s, errorCode) ? UNORM_YES : UNORM_NO;
     }
     virtual int32_t
-    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
+    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE {
         if(U_FAILURE(errorCode)) {
             return 0;
         }
@@ -194,27 +194,57 @@ public:
 private:
     virtual void
     normalize(const UChar *src, const UChar *limit,
-              ReorderingBuffer &buffer, UErrorCode &errorCode) const {
+              ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
         impl.decompose(src, limit, &buffer, errorCode);
     }
     using Normalizer2WithImpl::normalize;  // Avoid warning about hiding base class function.
     virtual void
     normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
                        UnicodeString &safeMiddle,
-                       ReorderingBuffer &buffer, UErrorCode &errorCode) const {
+                       ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
         impl.decomposeAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode);
     }
+
+    // Decomposes the UTF-8 text in src into sink; edits, if given, is passed along.
+    void
+    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
+                  Edits *edits, UErrorCode &errorCode) const U_OVERRIDE {
+        if (U_FAILURE(errorCode)) { return; }
+        // Clear any previous edits unless U_EDITS_NO_RESET is set in options.
+        if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
+            edits->reset();
+        }
+        const uint8_t *begin = reinterpret_cast<const uint8_t *>(src.data());
+        impl.decomposeUTF8(options, begin, begin + src.length(), &sink, edits, errorCode);
+        sink.Flush();
+    }
+    // Returns true if decomposeUTF8(), called without a sink or edits, returns
+    // the end of the UTF-8 input in sp; returns false on a prior failure.
+    virtual UBool
+    isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const U_OVERRIDE {
+        if(U_FAILURE(errorCode)) { return false; }
+        const uint8_t *start = reinterpret_cast<const uint8_t *>(sp.data());
+        const uint8_t *end = start + sp.length();
+        return impl.decomposeUTF8(0, start, end, nullptr, nullptr, errorCode) == end;
+    }
+
     virtual const UChar *
-    spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const {
+    spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const U_OVERRIDE {
         return impl.decompose(src, limit, NULL, errorCode);
     }
     using Normalizer2WithImpl::spanQuickCheckYes;  // Avoid warning about hiding base class function.
-    virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const {
+    virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const U_OVERRIDE {
         return impl.isDecompYes(impl.getNorm16(c)) ? UNORM_YES : UNORM_NO;
     }
-    virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasDecompBoundaryBefore(c); }
-    virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasDecompBoundaryAfter(c); }
-    virtual UBool isInert(UChar32 c) const { return impl.isDecompInert(c); }
+    virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE {
+        return impl.hasDecompBoundaryBefore(c);
+    }
+    virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE {
+        return impl.hasDecompBoundaryAfter(c);
+    }
+    virtual UBool isInert(UChar32 c) const U_OVERRIDE {
+        return impl.isDecompInert(c);
+    }
 };
 
 class ComposeNormalizer2 : public Normalizer2WithImpl {
@@ -321,24 +351,30 @@ public:
 private:
     virtual void
     normalize(const UChar *src, const UChar *limit,
-              ReorderingBuffer &buffer, UErrorCode &errorCode) const {
+              ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
         impl.makeFCD(src, limit, &buffer, errorCode);
     }
     using Normalizer2WithImpl::normalize;  // Avoid warning about hiding base class function.
     virtual void
     normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
                        UnicodeString &safeMiddle,
-                       ReorderingBuffer &buffer, UErrorCode &errorCode) const {
+                       ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
         impl.makeFCDAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode);
     }
     virtual const UChar *
-    spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const {
+    spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const U_OVERRIDE {
         return impl.makeFCD(src, limit, NULL, errorCode);
     }
     using Normalizer2WithImpl::spanQuickCheckYes;  // Avoid warning about hiding base class function.
-    virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasFCDBoundaryBefore(c); }
-    virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasFCDBoundaryAfter(c); }
-    virtual UBool isInert(UChar32 c) const { return impl.isFCDInert(c); }
+    virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE {
+        return impl.hasFCDBoundaryBefore(c);
+    }
+    virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE {
+        return impl.hasFCDBoundaryAfter(c);
+    }
+    virtual UBool isInert(UChar32 c) const U_OVERRIDE {
+        return impl.isFCDInert(c);
+    }
 };
 
 struct Norm2AllModes : public UMemory {
diff --git a/thirdparty/icu4c/common/normalizer2impl.cpp b/thirdparty/icu4c/common/normalizer2impl.cpp
index cbf6b4d980..c0ad5c69f3 100644
--- a/thirdparty/icu4c/common/normalizer2impl.cpp
+++ b/thirdparty/icu4c/common/normalizer2impl.cpp
@@ -731,9 +731,131 @@ UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
     return buffer.append((const UChar *)mapping+1, length, TRUE, leadCC, trailCC, errorCode);
 }
 
+// Dual functionality:
+// sink != nullptr: normalize; unchanged and decomposed spans are appended to *sink
+// sink == nullptr: isNormalized/spanQuickCheckYes; nothing is written, only a position is returned
+const uint8_t *
+Normalizer2Impl::decomposeUTF8(uint32_t options,
+                               const uint8_t *src, const uint8_t *limit,
+                               ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
+    U_ASSERT(limit != nullptr);  // an explicit end pointer is required
+    UnicodeString s16;  // storage handed to each slow-path ReorderingBuffer
+    uint8_t minNoLead = leadByteForCP(minDecompNoCP);  // bytes below this skip the trie lookup
+
+    const uint8_t *prevBoundary = src;  // start of the span not yet passed to the sink
+    // only for quick check
+    uint8_t prevCC = 0;
+
+    for (;;) {
+        // Fast path: Scan over a sequence of characters below the minimum "no" code point,
+        // or with (decompYes && ccc==0) properties.
+        const uint8_t *fastStart = src;
+        const uint8_t *prevSrc;
+        uint16_t norm16 = 0;
+
+        for (;;) {
+            if (src == limit) {
+                if (prevBoundary != limit && sink != nullptr) {
+                    ByteSinkUtil::appendUnchanged(prevBoundary, limit,
+                                                  *sink, options, edits, errorCode);
+                }
+                return src;  // src == limit here
+            }
+            if (*src < minNoLead) {
+                ++src;
+            } else {
+                prevSrc = src;
+                UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
+                if (!isMostDecompYesAndZeroCC(norm16)) {
+                    break;  // leaves only the fast-path loop
+                }
+            }
+        }
+        // isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo,
+        // and the current character at [prevSrc..src[ is not a common case with cc=0
+        // (MIN_NORMAL_MAYBE_YES or JAMO_VT).
+        // It could still be a maybeYes with cc=0.
+        if (prevSrc != fastStart) {
+            // The fast path looped over yes/0 characters before the current one.
+            if (sink != nullptr &&
+                    !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
+                                                   *sink, options, edits, errorCode)) {
+                break;  // appendUnchanged() returned false; leaves the outer loop
+            }
+            prevBoundary = prevSrc;
+            prevCC = 0;
+        }
+
+        // Medium-fast path: Quick check.
+        if (isMaybeOrNonZeroCC(norm16)) {
+            // Does not decompose.
+            uint8_t cc = getCCFromYesOrMaybe(norm16);
+            if (prevCC <= cc || cc == 0) {
+                prevCC = cc;
+                if (cc <= 1) {
+                    if (sink != nullptr &&
+                            !ByteSinkUtil::appendUnchanged(prevBoundary, src,
+                                                           *sink, options, edits, errorCode)) {
+                        break;
+                    }
+                    prevBoundary = src;
+                }
+                continue;
+            }
+        }
+        if (sink == nullptr) {
+            return prevBoundary;  // quick check: "no" or cc out of order
+        }
+
+        // Slow path
+        // Decompose up to and including the current character.
+        if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) {
+            if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
+                                               *sink, options, edits, errorCode)) {
+                break;
+            }
+            prevBoundary = prevSrc;
+        }
+        ReorderingBuffer buffer(*this, s16, errorCode);
+        if (U_FAILURE(errorCode)) {
+            break;
+        }
+        decomposeShort(prevBoundary, src, STOP_AT_LIMIT, FALSE /* onlyContiguous */,
+                       buffer, errorCode);
+        // Decompose until the next boundary.
+        if (buffer.getLastCC() > 1) {
+            src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, FALSE /* onlyContiguous */,
+                                 buffer, errorCode);
+        }
+        if (U_FAILURE(errorCode)) {
+            break;
+        }
+        if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals()
+            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+            break;
+        }
+        // We already know there was a change if the original character decomposed;
+        // otherwise compare.
+        if (isMaybeOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) {
+            if (!ByteSinkUtil::appendUnchanged(prevBoundary, src,
+                                               *sink, options, edits, errorCode)) {
+                break;
+            }
+        } else {
+            if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(),
+                                            *sink, edits, errorCode)) {
+                break;
+            }
+        }
+        prevBoundary = src;
+        prevCC = 0;
+    }
+    return src;  // reached only via break: an append returned false or errorCode is a failure
+}
+
 const uint8_t *
 Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
-                                UBool stopAtCompBoundary, UBool onlyContiguous,
+                                StopAt stopAt, UBool onlyContiguous,
                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const {
     if (U_FAILURE(errorCode)) {
         return nullptr;
@@ -746,21 +868,28 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
         UChar32 c = U_SENTINEL;
         if (norm16 >= limitNoNo) {
             if (isMaybeOrNonZeroCC(norm16)) {
-                // No boundaries around this character.
+                // No comp boundaries around this character.
+                uint8_t cc = getCCFromYesOrMaybe(norm16);
+                if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
+                    return prevSrc;
+                }
                 c = codePointFromValidUTF8(prevSrc, src);
-                if (!buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode)) {
+                if (!buffer.append(c, cc, errorCode)) {
                     return nullptr;
                 }
+                if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1) {
+                    return src;
+                }
                 continue;
             }
             // Maps to an isCompYesAndZeroCC.
-            if (stopAtCompBoundary) {
+            if (stopAt != STOP_AT_LIMIT) {
                 return prevSrc;
             }
             c = codePointFromValidUTF8(prevSrc, src);
             c = mapAlgorithmic(c, norm16);
             norm16 = getRawNorm16(c);
-        } else if (stopAtCompBoundary && norm16 < minNoNoCompNoMaybeCC) {
+        } else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) {
             return prevSrc;
         }
         // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
@@ -768,7 +897,8 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
         // its norm16==INERT is normalization-inert,
         // so it gets copied unchanged in the fast path,
         // and we stop the slow path where invalid UTF-8 begins.
-        U_ASSERT(norm16 != INERT);
+        // c >= 0 is the result of an algorithmic mapping.
+        U_ASSERT(c >= 0 || norm16 != INERT);
         if (norm16 < minYesNo) {
             if (c < 0) {
                 c = codePointFromValidUTF8(prevSrc, src);
@@ -798,11 +928,15 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
             } else {
                 leadCC = 0;
             }
+            if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
+                return prevSrc;
+            }
             if (!buffer.append((const char16_t *)mapping+1, length, TRUE, leadCC, trailCC, errorCode)) {
                 return nullptr;
             }
         }
-        if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
+        if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) ||
+                (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1)) {
             return src;
         }
     }
@@ -1954,10 +2088,10 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
             break;
         }
         // We know there is not a boundary here.
-        decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
+        decomposeShort(prevSrc, src, STOP_AT_LIMIT, onlyContiguous,
                        buffer, errorCode);
         // Decompose until the next boundary.
-        src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
+        src = decomposeShort(src, limit, STOP_AT_COMP_BOUNDARY, onlyContiguous,
                              buffer, errorCode);
         if (U_FAILURE(errorCode)) {
             break;
diff --git a/thirdparty/icu4c/common/normalizer2impl.h b/thirdparty/icu4c/common/normalizer2impl.h
index 4218a30a34..bdb6767a92 100644
--- a/thirdparty/icu4c/common/normalizer2impl.h
+++ b/thirdparty/icu4c/common/normalizer2impl.h
@@ -491,6 +491,12 @@ public:
                             UnicodeString &safeMiddle,
                             ReorderingBuffer &buffer,
                             UErrorCode &errorCode) const;
+
+    /** sink==nullptr: isNormalized()/spanQuickCheckYes() */
+    const uint8_t *decomposeUTF8(uint32_t options,
+                                 const uint8_t *src, const uint8_t *limit,
+                                 ByteSink *sink, Edits *edits, UErrorCode &errorCode) const;
+
     UBool compose(const UChar *src, const UChar *limit,
                   UBool onlyContiguous,
                   UBool doCompose,
@@ -649,6 +655,9 @@ private:
                                                 UChar32 minNeedDataCP,
                                                 ReorderingBuffer *buffer,
                                                 UErrorCode &errorCode) const;
+
+    enum StopAt { STOP_AT_LIMIT, STOP_AT_DECOMP_BOUNDARY, STOP_AT_COMP_BOUNDARY };
+
     const UChar *decomposeShort(const UChar *src, const UChar *limit,
                                 UBool stopAtCompBoundary, UBool onlyContiguous,
                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const;
@@ -656,7 +665,7 @@ private:
                     ReorderingBuffer &buffer, UErrorCode &errorCode) const;
 
     const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit,
-                                  UBool stopAtCompBoundary, UBool onlyContiguous,
+                                  StopAt stopAt, UBool onlyContiguous,
                                   ReorderingBuffer &buffer, UErrorCode &errorCode) const;
 
     static int32_t combine(const uint16_t *list, UChar32 trail);
diff --git a/thirdparty/icu4c/common/pluralmap.h b/thirdparty/icu4c/common/pluralmap.h
index d898ac4671..2a14a07af1 100644
--- a/thirdparty/icu4c/common/pluralmap.h
+++ b/thirdparty/icu4c/common/pluralmap.h
@@ -24,7 +24,7 @@ class U_COMMON_API PluralMapBase : public UMemory {
 public:
     /**
      * The names of all the plural categories. NONE is not an actual plural
-     * category, but rather represents the absense of a plural category.
+     * category, but rather represents the absence of a plural category.
      */
     enum Category {
         NONE = -1,
diff --git a/thirdparty/icu4c/common/putil.cpp b/thirdparty/icu4c/common/putil.cpp
index 3ed6a05d22..ffcbbcce59 100644
--- a/thirdparty/icu4c/common/putil.cpp
+++ b/thirdparty/icu4c/common/putil.cpp
@@ -1139,7 +1139,7 @@ uprv_tzname(int n)
 #endif
     if (tzid != NULL && isValidOlsonID(tzid)
 #if U_PLATFORM == U_PF_SOLARIS
-    /* When TZ equals localtime on Solaris, check the /etc/localtime file. */
+    /* Don't misinterpret TZ "localtime" on Solaris as a time zone name. */
         && uprv_strcmp(tzid, TZ_ENV_CHECK) != 0
 #endif
     ) {
@@ -1361,7 +1361,7 @@ uprv_pathIsAbsolute(const char *path)
 
 /* Backup setting of ICU_DATA_DIR_PREFIX_ENV_VAR
    (needed for some Darwin ICU build environments) */
-#if U_PLATFORM_IS_DARWIN_BASED && TARGET_OS_SIMULATOR
+#if U_PLATFORM_IS_DARWIN_BASED && defined(TARGET_OS_SIMULATOR) && TARGET_OS_SIMULATOR
 # if !defined(ICU_DATA_DIR_PREFIX_ENV_VAR)
 #  define ICU_DATA_DIR_PREFIX_ENV_VAR "IPHONE_SIMULATOR_ROOT"
 # endif
diff --git a/thirdparty/icu4c/common/putilimp.h b/thirdparty/icu4c/common/putilimp.h
index a325c6c359..5b95a68418 100644
--- a/thirdparty/icu4c/common/putilimp.h
+++ b/thirdparty/icu4c/common/putilimp.h
@@ -527,7 +527,7 @@ U_CAPI void * U_EXPORT2 uprv_maximumPtr(void *base);
  * on the destination pointer and capacity cannot overflow.
  *
  * The pinned capacity must fulfill the following conditions (for positive capacities):
- *   - dest + capacity is a valid pointer according to the machine arcitecture (AS/400, 64-bit, etc.)
+ *   - dest + capacity is a valid pointer according to the machine architecture (AS/400, 64-bit, etc.)
  *   - (dest + capacity) >= dest
  *   - The size (in bytes) of T[capacity] does not exceed 0x7fffffff
  *
diff --git a/thirdparty/icu4c/common/rbbi.cpp b/thirdparty/icu4c/common/rbbi.cpp
index 9b7e70c3cf..b821ca4463 100644
--- a/thirdparty/icu4c/common/rbbi.cpp
+++ b/thirdparty/icu4c/common/rbbi.cpp
@@ -812,7 +812,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
         }
     #endif
 
-    // handleNext alway sets the break tag value.
+    // handleNext always sets the break tag value.
     // Set the default for it.
     fRuleStatusIndex = 0;
 
diff --git a/thirdparty/icu4c/common/rbbi_cache.cpp b/thirdparty/icu4c/common/rbbi_cache.cpp
index 63ff3001c7..44f19d8697 100644
--- a/thirdparty/icu4c/common/rbbi_cache.cpp
+++ b/thirdparty/icu4c/common/rbbi_cache.cpp
@@ -258,7 +258,7 @@ void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode
             previous(status);
         } else {
             // seek() leaves the BreakCache positioned at the preceding boundary
-            //        if the requested position is between two bounaries.
+            //        if the requested position is between two boundaries.
             // current() pushes the BreakCache position out to the BreakIterator itself.
             U_ASSERT(startPos > fTextIdx);
             current();
diff --git a/thirdparty/icu4c/common/rbbiscan.cpp b/thirdparty/icu4c/common/rbbiscan.cpp
index 9c406af671..45911b1cfe 100644
--- a/thirdparty/icu4c/common/rbbiscan.cpp
+++ b/thirdparty/icu4c/common/rbbiscan.cpp
@@ -284,7 +284,7 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
 
     case doEndAssign:
         {
-            // We have reached the end of an assignement statement.
+            // We have reached the end of an assignment statement.
             //   Current scan char is the ';' that terminates the assignment.
 
             // Terminate expression, leaves expression parse tree rooted in TOS node.
@@ -856,6 +856,10 @@ UChar32  RBBIRuleScanner::nextCharLL() {
         return (UChar32)-1;
     }
     ch         = fRB->fRules.char32At(fNextIndex);
+    if (U_IS_SURROGATE(ch)) {
+        error(U_ILLEGAL_CHAR_FOUND);
+        return U_SENTINEL;
+    }
     fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
 
     if (ch == chCR ||
diff --git a/thirdparty/icu4c/common/rbbitblb.cpp b/thirdparty/icu4c/common/rbbitblb.cpp
index 70e260fc08..dd76337bc6 100644
--- a/thirdparty/icu4c/common/rbbitblb.cpp
+++ b/thirdparty/icu4c/common/rbbitblb.cpp
@@ -151,7 +151,7 @@ void  RBBITableBuilder::buildForwardTable() {
     //
     // calculate the functions nullable, firstpos, lastpos and followpos on
     // nodes in the parse tree.
-    //    See the alogrithm description in Aho.
+    //    See the algorithm description in Aho.
     //    Understanding how this works by looking at the code alone will be
     //       nearly impossible.
     //
diff --git a/thirdparty/icu4c/common/resource.h b/thirdparty/icu4c/common/resource.h
index 3795694412..48f5b9fa6e 100644
--- a/thirdparty/icu4c/common/resource.h
+++ b/thirdparty/icu4c/common/resource.h
@@ -274,8 +274,10 @@ public:
      *
      * @param key The key string of the enumeration-start resource.
      *     Empty if the enumeration starts at the top level of the bundle.
-     * @param value Call getArray() or getTable() as appropriate.
-     *     Then reuse for output values from Array and Table getters.
+     * @param value Call getArray() or getTable() as appropriate. Then reuse for
+     *     output values from Array and Table getters. Note: ResourceTable and
+     *     ResourceArray instances must outlive the ResourceValue instance for
+     *     ResourceTracer to be happy.
      * @param noFallback true if the bundle has no parent;
      *     that is, its top-level table has the nofallback attribute,
      *     or it is the root bundle of a locale tree.
diff --git a/thirdparty/icu4c/common/restrace.cpp b/thirdparty/icu4c/common/restrace.cpp
index 5c6498850e..1f83372d68 100644
--- a/thirdparty/icu4c/common/restrace.cpp
+++ b/thirdparty/icu4c/common/restrace.cpp
@@ -54,6 +54,9 @@ void ResourceTracer::traceOpen() const {
 
 CharString& ResourceTracer::getFilePath(CharString& output, UErrorCode& status) const {
     if (fResB) {
+        // Note: if you get a segfault around here, check that ResourceTable and
+        // ResourceArray instances outlive ResourceValue instances referring to
+        // their contents:
         output.append(fResB->fData->fPath, status);
         output.append('/', status);
         output.append(fResB->fData->fName, status);
diff --git a/thirdparty/icu4c/common/servnotf.h b/thirdparty/icu4c/common/servnotf.h
index 305570c1e6..73ce38c772 100644
--- a/thirdparty/icu4c/common/servnotf.h
+++ b/thirdparty/icu4c/common/servnotf.h
@@ -82,7 +82,7 @@ public:
     /**
      * Add a listener to be notified when notifyChanged is called.
      * The listener must not be null. AcceptsListener must return
-     * true for the listener.  Attempts to concurrently
+     * true for the listener. Attempts to concurrently
      * register the identical listener more than once will be
      * silently ignored.  
      */
@@ -90,7 +90,7 @@ public:
     
     /**
      * Stop notifying this listener.  The listener must
-     * not be null.  Attemps to remove a listener that is
+     * not be null. Attempts to remove a listener that is
      * not registered will be silently ignored.
      */
     virtual void removeListener(const EventListener* l, UErrorCode& status);
diff --git a/thirdparty/icu4c/common/ubrk.cpp b/thirdparty/icu4c/common/ubrk.cpp
index f8bdf5a6b6..bb5bdd1b50 100644
--- a/thirdparty/icu4c/common/ubrk.cpp
+++ b/thirdparty/icu4c/common/ubrk.cpp
@@ -174,6 +174,18 @@ ubrk_safeClone(
     return (UBreakIterator *)newBI;
 }
 
+U_CAPI UBreakIterator * U_EXPORT2
+ubrk_clone(const UBreakIterator *bi, UErrorCode *status) {  // returns a clone of bi, or nullptr
+    if (U_FAILURE(*status)) {  // status is dereferenced without a null check
+        return nullptr;  // incoming failure: nothing is cloned
+    }
+    BreakIterator *newBI = ((BreakIterator *)bi)->clone();  // bi is used without a null check
+    if (newBI == nullptr) {
+        *status = U_MEMORY_ALLOCATION_ERROR;  // a null clone() result is reported as out-of-memory
+        return nullptr;
+    }
+    return (UBreakIterator *)newBI;  // NOTE(review): presumably released via ubrk_close() - confirm
+}
 
 
 U_CAPI void U_EXPORT2
diff --git a/thirdparty/icu4c/common/ucase.cpp b/thirdparty/icu4c/common/ucase.cpp
index 2b142f5bc2..4f4c274d60 100644
--- a/thirdparty/icu4c/common/ucase.cpp
+++ b/thirdparty/icu4c/common/ucase.cpp
@@ -681,7 +681,7 @@ ucase_isCaseSensitive(UChar32 c) {
  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
  *   - Given D = NFD(C), then it is not the case that:
  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
- *     (This third criterium does not add any characters to the list
+ *     (This third criterion does not add any characters to the list
  *      for Unicode 3.2. Ignored.)
  *
  * D2. A character C is defined to be case-ignorable
diff --git a/thirdparty/icu4c/common/uchar.cpp b/thirdparty/icu4c/common/uchar.cpp
index eb14e4c75d..61e9c3d900 100644
--- a/thirdparty/icu4c/common/uchar.cpp
+++ b/thirdparty/icu4c/common/uchar.cpp
@@ -194,7 +194,7 @@ u_isISOControl(UChar32 c) {
 
 /* Some control characters that are used as space. */
 #define IS_THAT_CONTROL_SPACE(c) \
-    (c<=0x9f && ((c>=TAB && c<=CR) || (c>=0x1c && c <=0x1f) || c==NL))
+    (c<=0x9f && ((c>=TAB && c<=CR) || (c>=0x1c && c <=0x1f) || c==0x85))
 
 /* Java has decided that U+0085 New Line is not whitespace any more. */
 #define IS_THAT_ASCII_CONTROL_SPACE(c) \
@@ -677,14 +677,14 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
     sa->add(sa->set, CR+1); /* range TAB..CR */
     sa->add(sa->set, 0x1c);
     sa->add(sa->set, 0x1f+1);
-    USET_ADD_CP_AND_NEXT(sa, NL);
+    USET_ADD_CP_AND_NEXT(sa, 0x85);  // NEXT LINE (NEL)
 
     /* add for u_isIDIgnorable() what was not added above */
-    sa->add(sa->set, DEL); /* range DEL..NBSP-1, NBSP added below */
+    sa->add(sa->set, 0x7f); /* range DEL..NBSP-1, NBSP added below */
     sa->add(sa->set, HAIRSP);
     sa->add(sa->set, RLM+1);
-    sa->add(sa->set, INHSWAP);
-    sa->add(sa->set, NOMDIG+1);
+    sa->add(sa->set, 0x206a);  // INHIBIT SYMMETRIC SWAPPING
+    sa->add(sa->set, 0x206f+1);  // NOMINAL DIGIT SHAPES
     USET_ADD_CP_AND_NEXT(sa, ZWNBSP);
 
     /* add no-break spaces for u_isWhitespace() what was not added above */
@@ -693,23 +693,25 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
     USET_ADD_CP_AND_NEXT(sa, NNBSP);
 
     /* add for u_digit() */
-    sa->add(sa->set, U_a);
-    sa->add(sa->set, U_z+1);
-    sa->add(sa->set, U_A);
-    sa->add(sa->set, U_Z+1);
-    sa->add(sa->set, U_FW_a);
-    sa->add(sa->set, U_FW_z+1);
-    sa->add(sa->set, U_FW_A);
-    sa->add(sa->set, U_FW_Z+1);
+    sa->add(sa->set, u'a');
+    sa->add(sa->set, u'z'+1);
+    sa->add(sa->set, u'A');
+    sa->add(sa->set, u'Z'+1);
+    // fullwidth
+    sa->add(sa->set, u'ａ');
+    sa->add(sa->set, u'ｚ'+1);
+    sa->add(sa->set, u'Ａ');
+    sa->add(sa->set, u'Ｚ'+1);
 
     /* add for u_isxdigit() */
-    sa->add(sa->set, U_f+1);
-    sa->add(sa->set, U_F+1);
-    sa->add(sa->set, U_FW_f+1);
-    sa->add(sa->set, U_FW_F+1);
+    sa->add(sa->set, u'f'+1);
+    sa->add(sa->set, u'F'+1);
+    // fullwidth
+    sa->add(sa->set, u'ｆ'+1);
+    sa->add(sa->set, u'Ｆ'+1);
 
     /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
-    sa->add(sa->set, WJ); /* range WJ..NOMDIG */
+    sa->add(sa->set, 0x2060); /* range 2060..206f */
     sa->add(sa->set, 0xfff0);
     sa->add(sa->set, 0xfffb+1);
     sa->add(sa->set, 0xe0000);
diff --git a/thirdparty/icu4c/common/ucnv2022.cpp b/thirdparty/icu4c/common/ucnv2022.cpp
index 169ad4c526..1726440b94 100644
--- a/thirdparty/icu4c/common/ucnv2022.cpp
+++ b/thirdparty/icu4c/common/ucnv2022.cpp
@@ -820,7 +820,7 @@ getKey_2022(char c,int32_t* key,int32_t* offset){
     return INVALID_2022;
 }
 
-/*runs through a state machine to determine the escape sequence - codepage correspondance
+/*runs through a state machine to determine the escape sequence - codepage correspondence
  */
 static void
 changeState_2022(UConverter* _this,
@@ -1424,7 +1424,7 @@ toUnicodeCallback(UConverter *cnv,
 *          KSC5601 : alias to ibm-949 mapping table
 *          GB2312 : alias to ibm-1386 mapping table
 *          ISO-8859-1 : Algorithmic implemented as LATIN1 case
-*          ISO-8859-7 : alisas to ibm-9409 mapping table
+*          ISO-8859-7 : alias to ibm-9409 mapping table
 */
 
 /* preference order of JP charsets */
@@ -2324,7 +2324,7 @@ endloop:
 /***************************************************************
 *   Rules for ISO-2022-KR encoding
 *   i) The KSC5601 designator sequence should appear only once in a file,
-*      at the begining of a line before any KSC5601 characters. This usually
+*      at the beginning of a line before any KSC5601 characters. This usually
 *      means that it appears by itself on the first line of the file
 *  ii) There are only 2 shifting sequences SO to shift into double byte mode
 *      and SI to shift into single byte mode
diff --git a/thirdparty/icu4c/common/ucnv_bld.cpp b/thirdparty/icu4c/common/ucnv_bld.cpp
index 0e198892f1..d08eec7369 100644
--- a/thirdparty/icu4c/common/ucnv_bld.cpp
+++ b/thirdparty/icu4c/common/ucnv_bld.cpp
@@ -427,7 +427,7 @@ getAlgorithmicTypeFromName(const char *realName)
 #define UCNV_CACHE_LOAD_FACTOR 2
 
 /* Puts the shared data in the static hashtable SHARED_DATA_HASHTABLE */
-/*   Will always be called with the cnvCacheMutex alrady being held   */
+/*   Will always be called with the cnvCacheMutex already being held   */
 /*     by the calling function.                                       */
 /* Stores the shared data in the SHARED_DATA_HASHTABLE
  * @param data The shared data
diff --git a/thirdparty/icu4c/common/ucnv_err.cpp b/thirdparty/icu4c/common/ucnv_err.cpp
index 6b738face5..e1f2b934aa 100644
--- a/thirdparty/icu4c/common/ucnv_err.cpp
+++ b/thirdparty/icu4c/common/ucnv_err.cpp
@@ -321,7 +321,7 @@ UCNV_FROM_U_CALLBACK_ESCAPE (
       case UCNV_PRV_ESCAPE_CSS2:
           valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
           valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
-          /* Always add space character, becase the next character might be whitespace,
+          /* Always add space character, because the next character might be whitespace,
              which would erroneously be considered the termination of the escape sequence. */
           valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT;
           break;
diff --git a/thirdparty/icu4c/common/ucnv_lmb.cpp b/thirdparty/icu4c/common/ucnv_lmb.cpp
index 168392837b..41317d1cc0 100644
--- a/thirdparty/icu4c/common/ucnv_lmb.cpp
+++ b/thirdparty/icu4c/common/ucnv_lmb.cpp
@@ -81,7 +81,7 @@
   [G] D1 [D2]
 
   That is, a sometimes-optional 'group' byte, followed by 1 and sometimes 2
-  data bytes. The maximum size of a LMBCS chjaracter is 3 bytes:
+  data bytes. The maximum size of a LMBCS character is 3 bytes:
 */
 #define ULMBCS_CHARSIZE_MAX      3
 /*
@@ -164,7 +164,7 @@ beginning of internal 'system' range names: */
 /* Then we needed a place to put all the other ansi control characters 
 that must be moved to different values because LMBCS reserves those 
 values for other purposes. To represent the control characters, we start 
-with a first byte of 0xF & add the control chaarcter value as the 
+with a first byte of 0xF & add the control character value as the 
 second byte */
 #define ULMBCS_GRP_CTRL       0x0F   
 
diff --git a/thirdparty/icu4c/common/ucnv_u7.cpp b/thirdparty/icu4c/common/ucnv_u7.cpp
index 87ba8cf37e..de9f3f42ec 100644
--- a/thirdparty/icu4c/common/ucnv_u7.cpp
+++ b/thirdparty/icu4c/common/ucnv_u7.cpp
@@ -814,7 +814,7 @@ const UConverterSharedData _UTF7Data=
  *       the use of "~" in some servers as a home directory indicator.
  *
  *    5) UTF-7 permits multiple alternate forms to represent the same
- *       string; in particular, printable US-ASCII chararacters can be
+ *       string; in particular, printable US-ASCII characters can be
  *       represented in encoded form.
  *
  * In modified UTF-7, printable US-ASCII characters except for "&"
diff --git a/thirdparty/icu4c/common/ucnvisci.cpp b/thirdparty/icu4c/common/ucnvisci.cpp
index 44a7c05a3c..ffb8c7ac3e 100644
--- a/thirdparty/icu4c/common/ucnvisci.cpp
+++ b/thirdparty/icu4c/common/ucnvisci.cpp
@@ -992,7 +992,7 @@ UConverter_fromUnicode_ISCII_OFFSETS_LOGIC(
                     
                     if (converterData->currentDeltaFromUnicode == PNJ_DELTA) { 
                         if (sourceChar == PNJ_TIPPI) {
-                            /* Make sure Tippi is converterd to Bindi. */
+                            /* Make sure Tippi is converted to Bindi. */
                             sourceChar = PNJ_BINDI;
                         } else if (sourceChar == PNJ_ADHAK) {
                             /* This is for consonant cluster handling. */
@@ -1147,7 +1147,7 @@ static const uint16_t lookupTable[][2]={
     /* is the code point valid in current script? */                                     \
     if(sourceChar> ASCII_END &&                                                          \
             (validityTable[(targetUniChar & 0x7F)] & data->currentMaskToUnicode)==0){    \
-        /* Vocallic RR is assigne in ISCII Telugu and Unicode */                         \
+        /* Vocallic RR is assigned in ISCII Telugu and Unicode */                         \
         if(data->currentDeltaToUnicode!=(TELUGU_DELTA) ||                                \
                     targetUniChar!=VOCALLIC_RR){                                         \
             targetUniChar=missingCharMarker;                                             \
@@ -1272,7 +1272,7 @@ UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, UErrorCo
                 goto CALLBACK;
             } else if (*contextCharToUnicode==ISCII_INV) {
                 if (sourceChar==ISCII_HALANT) {
-                    targetUniChar = 0x0020; /* replace with space accoding to Indic FAQ */
+                    targetUniChar = 0x0020; /* replace with space according to Indic FAQ */
                 } else {
                     targetUniChar = ZWJ;
                 }
diff --git a/thirdparty/icu4c/common/ucurr.cpp b/thirdparty/icu4c/common/ucurr.cpp
index 0e14cddcff..20bbd51488 100644
--- a/thirdparty/icu4c/common/ucurr.cpp
+++ b/thirdparty/icu4c/common/ucurr.cpp
@@ -844,7 +844,7 @@ typedef struct {
 #endif
 
 
-// Comparason function used in quick sort.
+// Comparison function used in quick sort.
 static int U_CALLCONV currencyNameComparator(const void* a, const void* b) {
     const CurrencyNameStruct* currName_1 = (const CurrencyNameStruct*)a;
     const CurrencyNameStruct* currName_2 = (const CurrencyNameStruct*)b;
@@ -1530,7 +1530,7 @@ uprv_parseCurrency(const char* locale,
 
     int32_t max = 0;
     int32_t matchIndex = -1;
-    // case in-sensitive comparision against currency names
+    // case-insensitive comparison against currency names
     searchCurrencyName(currencyNames, total_currency_name_count, 
                        upperText, textLen, partialMatchLen, &max, &matchIndex);
 
diff --git a/thirdparty/icu4c/common/uhash.cpp b/thirdparty/icu4c/common/uhash.cpp
index 86311ceb0b..67c7c36354 100644
--- a/thirdparty/icu4c/common/uhash.cpp
+++ b/thirdparty/icu4c/common/uhash.cpp
@@ -133,8 +133,10 @@ static const float RESIZE_POLICY_RATIO_TABLE[6] = {
  * or a pointer.  If a hint bit is zero, then the associated
  * token is assumed to be an integer.
  */
+#define HINT_BOTH_INTEGERS (0)
 #define HINT_KEY_POINTER   (1)
 #define HINT_VALUE_POINTER (2)
+#define HINT_ALLOW_ZERO    (4)
 
 /********************************************************************
  * PRIVATE Implementation
@@ -479,8 +481,9 @@ _uhash_put(UHashtable *hash,
         goto err;
     }
     U_ASSERT(hash != NULL);
-    /* Cannot always check pointer here or iSeries sees NULL every time. */
-    if ((hint & HINT_VALUE_POINTER) && value.pointer == NULL) {
+    if ((hint & HINT_VALUE_POINTER) ?
+            value.pointer == NULL :
+            value.integer == 0 && (hint & HINT_ALLOW_ZERO) == 0) {
         /* Disallow storage of NULL values, since NULL is returned by
          * get() to indicate an absent key.  Storing NULL == removing.
          */
@@ -687,6 +690,28 @@ uhash_igeti(const UHashtable *hash,
     return _uhash_find(hash, keyholder, hash->keyHasher(keyholder))->value.integer;
 }
 
+U_CAPI int32_t U_EXPORT2
+uhash_getiAndFound(const UHashtable *hash,
+                   const void *key,
+                   UBool *found) {
+    UHashTok keyholder;
+    keyholder.pointer = (void *)key;
+    const UHashElement *e = _uhash_find(hash, keyholder, hash->keyHasher(keyholder));
+    *found = !IS_EMPTY_OR_DELETED(e->hashcode);
+    return e->value.integer;
+}
+
+U_CAPI int32_t U_EXPORT2
+uhash_igetiAndFound(const UHashtable *hash,
+                    int32_t key,
+                    UBool *found) {
+    UHashTok keyholder;
+    keyholder.integer = key;
+    const UHashElement *e = _uhash_find(hash, keyholder, hash->keyHasher(keyholder));
+    *found = !IS_EMPTY_OR_DELETED(e->hashcode);
+    return e->value.integer;
+}
+
 U_CAPI void* U_EXPORT2
 uhash_put(UHashtable *hash,
           void* key,
@@ -736,7 +761,34 @@ uhash_iputi(UHashtable *hash,
     keyholder.integer = key;
     valueholder.integer = value;
     return _uhash_put(hash, keyholder, valueholder,
-                      0, /* neither is a ptr */
+                      HINT_BOTH_INTEGERS,
+                      status).integer;
+}
+
+U_CAPI int32_t U_EXPORT2
+uhash_putiAllowZero(UHashtable *hash,
+                    void *key,
+                    int32_t value,
+                    UErrorCode *status) {
+    UHashTok keyholder, valueholder;
+    keyholder.pointer = key;
+    valueholder.integer = value;
+    return _uhash_put(hash, keyholder, valueholder,
+                      HINT_KEY_POINTER | HINT_ALLOW_ZERO,
+                      status).integer;
+}
+
+
+U_CAPI int32_t U_EXPORT2
+uhash_iputiAllowZero(UHashtable *hash,
+                     int32_t key,
+                     int32_t value,
+                     UErrorCode *status) {
+    UHashTok keyholder, valueholder;
+    keyholder.integer = key;
+    valueholder.integer = value;
+    return _uhash_put(hash, keyholder, valueholder,
+                      HINT_BOTH_INTEGERS | HINT_ALLOW_ZERO,
                       status).integer;
 }
 
@@ -785,6 +837,29 @@ uhash_removeAll(UHashtable *hash) {
     U_ASSERT(hash->count == 0);
 }
 
+U_CAPI UBool U_EXPORT2
+uhash_containsKey(const UHashtable *hash, const void *key) {
+    UHashTok keyholder;
+    keyholder.pointer = (void *)key;
+    const UHashElement *e = _uhash_find(hash, keyholder, hash->keyHasher(keyholder));
+    return !IS_EMPTY_OR_DELETED(e->hashcode);
+}
+
+/**
+ * Returns true if the UHashtable contains an item with this integer key.
+ *
+ * @param hash The target UHashtable.
+ * @param key An integer key stored in a hashtable
+ * @return true if the key is found.
+ */
+U_CAPI UBool U_EXPORT2
+uhash_icontainsKey(const UHashtable *hash, int32_t key) {
+    UHashTok keyholder;
+    keyholder.integer = key;
+    const UHashElement *e = _uhash_find(hash, keyholder, hash->keyHasher(keyholder));
+    return !IS_EMPTY_OR_DELETED(e->hashcode);
+}
+
 U_CAPI const UHashElement* U_EXPORT2
 uhash_find(const UHashtable *hash, const void* key) {
     UHashTok keyholder;
diff --git a/thirdparty/icu4c/common/uhash.h b/thirdparty/icu4c/common/uhash.h
index b59d2711bb..af75999860 100644
--- a/thirdparty/icu4c/common/uhash.h
+++ b/thirdparty/icu4c/common/uhash.h
@@ -23,7 +23,7 @@
 /**
  * UHashtable stores key-value pairs and does moderately fast lookup
  * based on keys.  It provides a good tradeoff between access time and
- * storage space.  As elements are added to it, it grows to accomodate
+ * storage space.  As elements are added to it, it grows to accommodate
  * them.  By default, the table never shrinks, even if all elements
  * are removed from it.
  *
@@ -54,6 +54,13 @@
  * uhash_remove() on that key.  This keeps uhash_get(), uhash_count(),
  * and uhash_nextElement() consistent with one another.
  *
+ * Keys and values can be integers.
+ * Functions that work with an integer key have an "i" prefix.
+ * Functions that work with an integer value have an "i" suffix.
+ * As with putting a NULL value pointer, putting a zero integer value removes the item,
+ * except when using the "AllowZero" put functions, which store zero values;
+ * the matching "AndFound" get functions return (value, found) pairs.
+ *
  * To see everything in a hashtable, use uhash_nextElement() to
  * iterate through its contents.  Each call to this function returns a
  * UHashElement pointer.  A hash element contains a key, value, and
@@ -406,6 +413,44 @@ uhash_iputi(UHashtable *hash,
            UErrorCode *status);
 
 /**
+ * Put a (key=pointer, value=integer) item in a UHashtable.  If the
+ * keyDeleter is non-NULL, then the hashtable owns 'key' after this
+ * call.  valueDeleter must be NULL.
+ * Storing a 0 value is possible; call uhash_getiAndFound() to retrieve values including zero.
+ *
+ * @param hash The target UHashtable.
+ * @param key The key to store.
+ * @param value The integer value to store.
+ * @param status A pointer to an UErrorCode to receive any errors.
+ * @return The previous value, or 0 if none.
+ * @see uhash_getiAndFound
+ */
+U_CAPI int32_t U_EXPORT2
+uhash_putiAllowZero(UHashtable *hash,
+                    void *key,
+                    int32_t value,
+                    UErrorCode *status);
+
+/**
+ * Put a (key=integer, value=integer) item in a UHashtable.
+ * The key is an integer rather than an owned pointer, so keyDeleter must be NULL.
+ * valueDeleter must be NULL.
+ * Storing a 0 value is possible; call uhash_igetiAndFound() to retrieve values including zero.
+ *
+ * @param hash The target UHashtable.
+ * @param key The key to store.
+ * @param value The integer value to store.
+ * @param status A pointer to an UErrorCode to receive any errors.
+ * @return The previous value, or 0 if none.
+ * @see uhash_igetiAndFound
+ */
+U_CAPI int32_t U_EXPORT2
+uhash_iputiAllowZero(UHashtable *hash,
+                     int32_t key,
+                     int32_t value,
+                     UErrorCode *status);
+
+/**
  * Retrieve a pointer value from a UHashtable using a pointer key,
  * as previously stored by uhash_put().
  * @param hash The target UHashtable.
@@ -449,6 +494,34 @@ uhash_igeti(const UHashtable *hash,
            int32_t key);
 
 /**
+ * Retrieves an integer value from a UHashtable using a pointer key,
+ * as previously stored by uhash_putiAllowZero() or uhash_puti().
+ *
+ * @param hash The target UHashtable.
+ * @param key A pointer key stored in a hashtable
+ * @param found A pointer to a boolean which will be set to true if the key was found, false otherwise.
+ * @return The requested item, or 0 if not found.
+ */
+U_CAPI int32_t U_EXPORT2
+uhash_getiAndFound(const UHashtable *hash,
+                   const void *key,
+                   UBool *found);
+
+/**
+ * Retrieves an integer value from a UHashtable using an integer key,
+ * as previously stored by uhash_iputiAllowZero() or uhash_iputi().
+ *
+ * @param hash The target UHashtable.
+ * @param key An integer key stored in a hashtable
+ * @param found A pointer to a boolean which will be set to true if the key was found, false otherwise.
+ * @return The requested item, or 0 if not found.
+ */
+U_CAPI int32_t U_EXPORT2
+uhash_igetiAndFound(const UHashtable *hash,
+                    int32_t key,
+                    UBool *found);
+
+/**
  * Remove an item from a UHashtable stored by uhash_put().
  * @param hash The target UHashtable.
  * @param key A key stored in a hashtable
@@ -496,6 +569,26 @@ U_CAPI void U_EXPORT2
 uhash_removeAll(UHashtable *hash);
 
 /**
+ * Returns true if the UHashtable contains an item with this pointer key.
+ *
+ * @param hash The target UHashtable.
+ * @param key A pointer key stored in a hashtable
+ * @return true if the key is found.
+ */
+U_CAPI UBool U_EXPORT2
+uhash_containsKey(const UHashtable *hash, const void *key);
+
+/**
+ * Returns true if the UHashtable contains an item with this integer key.
+ *
+ * @param hash The target UHashtable.
+ * @param key An integer key stored in a hashtable
+ * @return true if the key is found.
+ */
+U_CAPI UBool U_EXPORT2
+uhash_icontainsKey(const UHashtable *hash, int32_t key);
+
+/**
  * Locate an element of a UHashtable.  The caller must not modify the
  * returned object.  The primary use of this function is to obtain the
  * stored key when it may not be identical to the search key.  For
diff --git a/thirdparty/icu4c/common/uloc.cpp b/thirdparty/icu4c/common/uloc.cpp
index ebfbb50650..d96e79b8fd 100644
--- a/thirdparty/icu4c/common/uloc.cpp
+++ b/thirdparty/icu4c/common/uloc.cpp
@@ -143,7 +143,7 @@ static const char * const LANGUAGES[] = {
     "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
     "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
     "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
-    "ml",  "mn",  "mnc", "mni", "mo",
+    "ml",  "mn",  "mnc", "mni",
     "moh", "mos", "mr",  "mrj",
     "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
     "my",  "mye", "myv", "mzn",
@@ -166,9 +166,9 @@ static const char * const LANGUAGES[] = {
     "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
     "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
     "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
-    "sv",  "sw",  "swb", "swc", "syc", "syr", "szl",
+    "sv",  "sw",  "swb", "syc", "syr", "szl",
     "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
-    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr", "tl",
+    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr",
     "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
     "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
     "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
@@ -181,7 +181,7 @@ static const char * const LANGUAGES[] = {
     "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
     "zun", "zxx", "zza",
 NULL,
-    "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
+    "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  /* obsolete language codes */
 NULL
 };
 
@@ -260,7 +260,7 @@ static const char * const LANGUAGES_3[] = {
     "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
     "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
     "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
-    "mal", "mon", "mnc", "mni", "mol",
+    "mal", "mon", "mnc", "mni",
     "moh", "mos", "mar", "mrj",
     "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
     "mya", "mye", "myv", "mzn",
@@ -283,9 +283,9 @@ static const char * const LANGUAGES_3[] = {
     "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
     "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
     "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
-    "swe", "swa", "swb", "swc", "syc", "syr", "szl",
+    "swe", "swa", "swb", "syc", "syr", "szl",
     "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
-    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
+    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
     "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
     "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
     "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
@@ -298,8 +298,8 @@ static const char * const LANGUAGES_3[] = {
     "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
     "zun", "zxx", "zza",
 NULL,
-/*  "in",  "iw",  "ji",  "jw",  "sh",                          */
-    "ind", "heb", "yid", "jaw", "srp",
+/*  "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  */
+    "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl",
 NULL
 };
 
@@ -334,13 +334,13 @@ static const char * const COUNTRIES[] = {
     "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
     "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
     "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
-    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
-    "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
+    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",
+    "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",
     "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
     "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
     "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
     "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
-    "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
+    "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
     "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
     "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
     "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
@@ -357,7 +357,7 @@ static const char * const COUNTRIES[] = {
     "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
     "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
     "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
-    "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
+    "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
 NULL,
     "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
 NULL
@@ -397,10 +397,10 @@ static const char * const COUNTRIES_3[] = {
     "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
 /*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
     "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
-/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
-    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
-/*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
-    "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
+/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
+    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
+/*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
+    "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI",
 /*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
     "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
 /*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
@@ -409,8 +409,8 @@ static const char * const COUNTRIES_3[] = {
     "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
 /*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
     "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
-/*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
-    "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
+/*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
+    "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
 /*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
     "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
 /*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
@@ -443,8 +443,8 @@ static const char * const COUNTRIES_3[] = {
     "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
 /*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
     "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
-/*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
-    "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
+/*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
+    "WSM", "XXK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
 NULL,
 /*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
     "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
diff --git a/thirdparty/icu4c/common/uloc_keytype.cpp b/thirdparty/icu4c/common/uloc_keytype.cpp
index 019da058cf..c289ebe76f 100644
--- a/thirdparty/icu4c/common/uloc_keytype.cpp
+++ b/thirdparty/icu4c/common/uloc_keytype.cpp
@@ -271,7 +271,7 @@ initFromResourceBundle(UErrorCode& sts) {
                         if (U_FAILURE(sts)) {
                             break;
                         }
-                        // check if this is an alias of canoncal legacy type
+                        // check if this is an alias of canonical legacy type
                         if (uprv_compareInvWithUChar(NULL, legacyTypeId, -1, to, toLen) == 0) {
                             const char* from = ures_getKey(typeAliasDataEntry.getAlias());
                             if (isTZ) {
diff --git a/thirdparty/icu4c/common/uloc_tag.cpp b/thirdparty/icu4c/common/uloc_tag.cpp
index 7f7fd9119e..1235081bf3 100644
--- a/thirdparty/icu4c/common/uloc_tag.cpp
+++ b/thirdparty/icu4c/common/uloc_tag.cpp
@@ -129,7 +129,6 @@ static const char* const LEGACY[] = {
     // Legacy tags with no preferred value in the IANA
     // registry. Kept for now for the backward compatibility
     // because ICU has mapped them this way.
-    "cel-gaulish",  "xtg-x-cel-gaulish",
     "i-default",    "en-x-i-default",
     "i-enochian",   "und-x-i-enochian",
     "i-mingo",      "see-x-i-mingo",
@@ -647,6 +646,22 @@ _isTKey(const char* s, int32_t len)
     return FALSE;
 }
 
+U_CAPI const char * U_EXPORT2
+ultag_getTKeyStart(const char *localeID) {
+    const char *result = localeID;
+    const char *sep;
+    while((sep = uprv_strchr(result, SEP)) != nullptr) {
+        if (_isTKey(result, static_cast<int32_t>(sep - result))) {
+            return result;
+        }
+        result = ++sep;
+    }
+    if (_isTKey(result, -1)) {
+        return result;
+    }
+    return nullptr;
+}
+
 static UBool
 _isTValue(const char* s, int32_t len)
 {
@@ -671,9 +686,13 @@ _isTransformedExtensionSubtag(int32_t& state, const char* s, int32_t len)
     const int32_t kGotTKey = -1;    // Got tkey, wait for tvalue. ERROR if stop here.
     const int32_t kGotTValue = 6;   // Got tvalue, wait for tkey, tvalue or end
 
+
+    if (len < 0) {
+        len = (int32_t)uprv_strlen(s);
+    }
     switch (state) {
         case kStart:
-            if (ultag_isLanguageSubtag(s, len)) {
+            if (ultag_isLanguageSubtag(s, len) && len != 4) {
                 state = kGotLanguage;
                 return TRUE;
             }
@@ -1775,11 +1794,6 @@ _appendKeywords(ULanguageTag* langtag, icu::ByteSink& sink, UErrorCode* status)
         return;
     }
 
-    /* Determine if variants already exists */
-    if (ultag_getVariantsSize(langtag)) {
-        posixVariant = TRUE;
-    }
-
     n = ultag_getExtensionsSize(langtag);
 
     /* resolve locale keywords and reordering keys */
@@ -1787,6 +1801,11 @@ _appendKeywords(ULanguageTag* langtag, icu::ByteSink& sink, UErrorCode* status)
         key = ultag_getExtensionKey(langtag, i);
         type = ultag_getExtensionValue(langtag, i);
         if (*key == LDMLEXT) {
+            /* Determine if variants already exist */
+            if (ultag_getVariantsSize(langtag)) {
+                posixVariant = TRUE;
+            }
+
             _appendLDMLExtensionAsKeywords(type, &kwdFirst, extPool, kwdBuf, &posixVariant, status);
             if (U_FAILURE(*status)) {
                 break;
@@ -2028,7 +2047,10 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
         *status = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
-    uprv_memcpy(tagBuf, tag, tagLen);
+    
+    if (tagLen > 0) {
+        uprv_memcpy(tagBuf, tag, tagLen);
+    }
     *(tagBuf + tagLen) = 0;
 
     /* create a ULanguageTag */
@@ -2692,8 +2714,7 @@ ulocimp_toLanguageTag(const char* localeID,
                     if (U_SUCCESS(tmpStatus)) {
                         if (ultag_isPrivateuseValueSubtags(buf.data(), buf.length())) {
                             /* return private use only tag */
-                            static const char PREFIX[] = { PRIVATEUSE, SEP };
-                            sink.Append(PREFIX, sizeof(PREFIX));
+                            sink.Append("und-x-", 6);
                             sink.Append(buf.data(), buf.length());
                             done = TRUE;
                         } else if (strict) {
diff --git a/thirdparty/icu4c/common/ulocimp.h b/thirdparty/icu4c/common/ulocimp.h
index 5691fe9a77..1f796aa213 100644
--- a/thirdparty/icu4c/common/ulocimp.h
+++ b/thirdparty/icu4c/common/ulocimp.h
@@ -286,6 +286,9 @@ ultag_isUnicodeLocaleType(const char* s, int32_t len);
 U_CFUNC UBool
 ultag_isVariantSubtags(const char* s, int32_t len);
 
+U_CAPI const char * U_EXPORT2
+ultag_getTKeyStart(const char *localeID);
+
 U_CFUNC const char*
 ulocimp_toBcpKey(const char* key);
 
diff --git a/thirdparty/icu4c/common/unicode/bytestream.h b/thirdparty/icu4c/common/unicode/bytestream.h
index 044f7a77e7..9735ee0bf8 100644
--- a/thirdparty/icu4c/common/unicode/bytestream.h
+++ b/thirdparty/icu4c/common/unicode/bytestream.h
@@ -71,7 +71,6 @@ public:
    */
   virtual void Append(const char* bytes, int32_t n) = 0;
 
-#ifndef U_HIDE_DRAFT_API
   /**
    * Appends n bytes to this. Same as Append().
    * Call AppendU8() with u8"string literals" which are const char * in C++11
@@ -81,7 +80,7 @@ public:
    *
    * @param bytes the pointer to the bytes
    * @param n the number of bytes; must be non-negative
-   * @draft ICU 67
+   * @stable ICU 67
    */
   inline void AppendU8(const char* bytes, int32_t n) {
     Append(bytes, n);
@@ -97,13 +96,12 @@ public:
    *
    * @param bytes the pointer to the bytes
    * @param n the number of bytes; must be non-negative
-   * @draft ICU 67
+   * @stable ICU 67
    */
   inline void AppendU8(const char8_t* bytes, int32_t n) {
     Append(reinterpret_cast<const char*>(bytes), n);
   }
 #endif
-#endif  // U_HIDE_DRAFT_API
 
   /**
    * Returns a writable buffer for appending and writes the buffer's capacity to
diff --git a/thirdparty/icu4c/common/unicode/bytestrie.h b/thirdparty/icu4c/common/unicode/bytestrie.h
index 85f802df42..271a81d1b4 100644
--- a/thirdparty/icu4c/common/unicode/bytestrie.h
+++ b/thirdparty/icu4c/common/unicode/bytestrie.h
@@ -30,6 +30,8 @@
 #include "unicode/uobject.h"
 #include "unicode/ustringtrie.h"
 
+class BytesTrieTest;
+
 U_NAMESPACE_BEGIN
 
 class ByteSink;
@@ -378,6 +380,7 @@ public:
 
 private:
     friend class BytesTrieBuilder;
+    friend class ::BytesTrieTest;
 
     /**
      * Constructs a BytesTrie reader instance.
diff --git a/thirdparty/icu4c/common/unicode/bytestriebuilder.h b/thirdparty/icu4c/common/unicode/bytestriebuilder.h
index cae16e48b4..3cff89e443 100644
--- a/thirdparty/icu4c/common/unicode/bytestriebuilder.h
+++ b/thirdparty/icu4c/common/unicode/bytestriebuilder.h
@@ -30,6 +30,8 @@
 #include "unicode/stringpiece.h"
 #include "unicode/stringtriebuilder.h"
 
+class BytesTrieTest;
+
 U_NAMESPACE_BEGIN
 
 class BytesTrieElement;
@@ -125,6 +127,8 @@ public:
     BytesTrieBuilder &clear();
 
 private:
+    friend class ::BytesTrieTest;
+
     BytesTrieBuilder(const BytesTrieBuilder &other);  // no copy constructor
     BytesTrieBuilder &operator=(const BytesTrieBuilder &other);  // no assignment operator
 
@@ -168,6 +172,7 @@ private:
     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal);
     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node);
     virtual int32_t writeDeltaTo(int32_t jumpTarget);
+    static int32_t internalEncodeDelta(int32_t i, char intBytes[]);
 
     CharString *strings;  // Pointer not object so we need not #include internal charstr.h.
     BytesTrieElement *elements;
diff --git a/thirdparty/icu4c/common/unicode/docmain.h b/thirdparty/icu4c/common/unicode/docmain.h
index edcb5d4e83..e82678c95f 100644
--- a/thirdparty/icu4c/common/unicode/docmain.h
+++ b/thirdparty/icu4c/common/unicode/docmain.h
@@ -15,7 +15,7 @@
  * \file
  * \brief (Non API- contains Doxygen definitions)
  *
- * This file contains documentation for Doxygen and doesnot have
+ * This file contains documentation for Doxygen and does not have
  * any significance with respect to C or C++ API
  */
 
@@ -74,7 +74,7 @@
  *   </tr>
  *   <tr>
  *     <td>Strings and Character Iteration</td>
- *     <td>ustring.h, utf8.h, utf16.h, UText, UCharIterator</td>
+ *     <td>ustring.h, utf8.h, utf16.h, icu::StringPiece, UText, UCharIterator, icu::ByteSink</td>
  *     <td>icu::UnicodeString, icu::CharacterIterator, icu::Appendable, icu::StringPiece,icu::ByteSink</td>
  *   </tr>
  *   <tr>
@@ -128,9 +128,9 @@
  *     <td>icu::Normalizer2</td>
  *   </tr>
  *   <tr>
- *     <td>Calendars</td>
+ *     <td>Calendars and Time Zones</td>
  *     <td>ucal.h</td>
- *     <td>icu::Calendar</td>
+ *     <td>icu::Calendar, icu::TimeZone</td>
  *   </tr>
  *   <tr>
  *     <td>Date and Time Formatting</td>
diff --git a/thirdparty/icu4c/common/unicode/icuplug.h b/thirdparty/icu4c/common/unicode/icuplug.h
index 52f810da57..205af360d4 100644
--- a/thirdparty/icu4c/common/unicode/icuplug.h
+++ b/thirdparty/icu4c/common/unicode/icuplug.h
@@ -117,14 +117,13 @@
 /* === Basic types === */
 
 #ifndef U_HIDE_INTERNAL_API
+struct UPlugData;
 /**
  * @{
- * Opaque structure passed to/from a plugin. 
- * use the APIs to access it.
+ * Typedef for opaque structure passed to/from a plugin. 
+ * Use the APIs to access it.
  * @internal ICU 4.4 Technology Preview
  */
-
-struct UPlugData;
 typedef struct UPlugData UPlugData;
 
 /** @} */
diff --git a/thirdparty/icu4c/common/unicode/localematcher.h b/thirdparty/icu4c/common/unicode/localematcher.h
index 63a68b0b7f..0cd068ef32 100644
--- a/thirdparty/icu4c/common/unicode/localematcher.h
+++ b/thirdparty/icu4c/common/unicode/localematcher.h
@@ -91,8 +91,6 @@ enum ULocMatchDemotion {
 typedef enum ULocMatchDemotion ULocMatchDemotion;
 #endif
 
-#ifndef U_FORCE_HIDE_DRAFT_API
-
 /**
  * Builder option for whether to include or ignore one-way (fallback) match data.
  * The LocaleMatcher uses CLDR languageMatch data which includes fallback (oneway=true) entries.
@@ -108,20 +106,20 @@ typedef enum ULocMatchDemotion ULocMatchDemotion;
  * but not if it is merely a fallback.
  *
  * @see LocaleMatcher::Builder#setDirection(ULocMatchDirection)
- * @draft ICU 67
+ * @stable ICU 67
  */
 enum ULocMatchDirection {
     /**
      * Locale matching includes one-way matches such as Breton→French. (default)
      *
-     * @draft ICU 67
+     * @stable ICU 67
      */
     ULOCMATCH_DIRECTION_WITH_ONE_WAY,
     /**
      * Locale matching limited to two-way matches including e.g. Danish↔Norwegian
      * but ignoring one-way matches.
      *
-     * @draft ICU 67
+     * @stable ICU 67
      */
     ULOCMATCH_DIRECTION_ONLY_TWO_WAY
 };
@@ -129,8 +127,6 @@ enum ULocMatchDirection {
 typedef enum ULocMatchDirection ULocMatchDirection;
 #endif
 
-#endif  // U_FORCE_HIDE_DRAFT_API
-
 struct UHashtable;
 
 U_NAMESPACE_BEGIN
@@ -463,14 +459,13 @@ public:
          */
         Builder &setDemotionPerDesiredLocale(ULocMatchDemotion demotion);
 
-#ifndef U_HIDE_DRAFT_API
         /**
          * Option for whether to include or ignore one-way (fallback) match data.
          * By default, they are included.
          *
          * @param direction the match direction to set.
          * @return this Builder object
-         * @draft ICU 67
+         * @stable ICU 67
          */
         Builder &setDirection(ULocMatchDirection direction) {
             if (U_SUCCESS(errorCode_)) {
@@ -478,7 +473,6 @@ public:
             }
             return *this;
         }
-#endif  // U_HIDE_DRAFT_API
 
 #ifndef U_HIDE_DRAFT_API
         /**
@@ -704,7 +698,7 @@ private:
     LSR *lsrs;
     int32_t supportedLocalesLength;
     // These are in preference order: 1. Default locale 2. paradigm locales 3. others.
-    UHashtable *supportedLsrToIndex;  // Map<LSR, Integer> stores index+1 because 0 is "not found"
+    UHashtable *supportedLsrToIndex;  // Map<LSR, Integer>
     // Array versions of the supportedLsrToIndex keys and values.
     // The distance lookup loops over the supportedLSRs and returns the index of the best match.
     const LSR **supportedLSRs;
diff --git a/thirdparty/icu4c/common/unicode/locid.h b/thirdparty/icu4c/common/unicode/locid.h
index ba858d702a..81f4685d65 100644
--- a/thirdparty/icu4c/common/unicode/locid.h
+++ b/thirdparty/icu4c/common/unicode/locid.h
@@ -571,15 +571,13 @@ public:
      */
     void minimizeSubtags(UErrorCode& status);
 
-#ifndef U_HIDE_DRAFT_API
     /**
      * Canonicalize the locale ID of this object according to CLDR.
      * @param status the status code
-     * @draft ICU 67
+     * @stable ICU 67
      * @see createCanonical
      */
     void canonicalize(UErrorCode& status);
-#endif  // U_HIDE_DRAFT_API
 
     /**
      * Gets the list of keywords for the specified locale.
diff --git a/thirdparty/icu4c/common/unicode/normalizer2.h b/thirdparty/icu4c/common/unicode/normalizer2.h
index 5eb1d95caf..2d355250c2 100644
--- a/thirdparty/icu4c/common/unicode/normalizer2.h
+++ b/thirdparty/icu4c/common/unicode/normalizer2.h
@@ -225,10 +225,8 @@ public:
      * Normalizes a UTF-8 string and optionally records how source substrings
      * relate to changed and unchanged result substrings.
      *
-     * Currently implemented completely only for "compose" modes,
-     * such as for NFC, NFKC, and NFKC_Casefold
-     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
-     * Otherwise currently converts to & from UTF-16 and does not support edits.
+     * Implemented completely for all built-in modes except for FCD.
+     * The base class implementation converts to & from UTF-16 and does not support edits.
      *
      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
      * @param src       Source UTF-8 string.
@@ -381,11 +379,9 @@ public:
      * resolves to "yes" or "no" to provide a definitive result,
      * at the cost of doing more work in those cases.
      *
-     * This works for all normalization modes,
-     * but it is currently optimized for UTF-8 only for "compose" modes,
-     * such as for NFC, NFKC, and NFKC_Casefold
-     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
-     * For other modes it currently converts to UTF-16 and calls isNormalized().
+     * This works for all normalization modes.
+     * It is optimized for UTF-8 for all built-in modes except for FCD.
+     * The base class implementation converts to UTF-16 and calls isNormalized().
      *
      * @param s UTF-8 input string
      * @param errorCode Standard ICU error code. Its input value must
@@ -543,10 +539,8 @@ public:
      * Normalizes a UTF-8 string and optionally records how source substrings
      * relate to changed and unchanged result substrings.
      *
-     * Currently implemented completely only for "compose" modes,
-     * such as for NFC, NFKC, and NFKC_Casefold
-     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
-     * Otherwise currently converts to & from UTF-16 and does not support edits.
+     * Implemented completely for most built-in modes except for FCD.
+     * The base class implementation converts to & from UTF-16 and does not support edits.
      *
      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
      * @param src       Source UTF-8 string.
@@ -676,11 +670,9 @@ public:
      * resolves to "yes" or "no" to provide a definitive result,
      * at the cost of doing more work in those cases.
      *
-     * This works for all normalization modes,
-     * but it is currently optimized for UTF-8 only for "compose" modes,
-     * such as for NFC, NFKC, and NFKC_Casefold
-     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
-     * For other modes it currently converts to UTF-16 and calls isNormalized().
+     * This works for all normalization modes.
+     * It is optimized for UTF-8 for all built-in modes except for FCD.
+     * The base class implementation converts to UTF-16 and calls isNormalized().
      *
      * @param s UTF-8 input string
      * @param errorCode Standard ICU error code. Its input value must
diff --git a/thirdparty/icu4c/common/unicode/platform.h b/thirdparty/icu4c/common/unicode/platform.h
index 2bb2f8b318..cb3a833fef 100644
--- a/thirdparty/icu4c/common/unicode/platform.h
+++ b/thirdparty/icu4c/common/unicode/platform.h
@@ -880,6 +880,6 @@ namespace std {
 #else
 #    define U_CALLCONV_FPTR
 #endif
-/* @} */
+/** @} */
 
 #endif  // _PLATFORM_H
diff --git a/thirdparty/icu4c/common/unicode/stringpiece.h b/thirdparty/icu4c/common/unicode/stringpiece.h
index 7d7d871e1f..8c96789e73 100644
--- a/thirdparty/icu4c/common/unicode/stringpiece.h
+++ b/thirdparty/icu4c/common/unicode/stringpiece.h
@@ -75,12 +75,11 @@ class U_COMMON_API StringPiece : public UMemory {
    * @stable ICU 4.2
    */
   StringPiece(const char* str);
-#ifndef U_HIDE_DRAFT_API
 #if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
   /**
    * Constructs from a NUL-terminated const char8_t * pointer.
    * @param str a NUL-terminated const char8_t * pointer
-   * @draft ICU 67
+   * @stable ICU 67
    */
   StringPiece(const char8_t* str) : StringPiece(reinterpret_cast<const char*>(str)) {}
 #endif
@@ -88,10 +87,9 @@ class U_COMMON_API StringPiece : public UMemory {
    * Constructs an empty StringPiece.
    * Needed for type disambiguation from multiple other overloads.
    * @param p nullptr
-   * @draft ICU 67
+   * @stable ICU 67
    */
   StringPiece(std::nullptr_t p) : ptr_(p), length_(0) {}
-#endif  // U_HIDE_DRAFT_API
 
   /**
    * Constructs from a std::string.
@@ -99,17 +97,15 @@ class U_COMMON_API StringPiece : public UMemory {
    */
   StringPiece(const std::string& str)
     : ptr_(str.data()), length_(static_cast<int32_t>(str.size())) { }
-#ifndef U_HIDE_DRAFT_API
 #if defined(__cpp_lib_char8_t) || defined(U_IN_DOXYGEN)
   /**
    * Constructs from a std::u8string.
-   * @draft ICU 67
+   * @stable ICU 67
    */
   StringPiece(const std::u8string& str)
     : ptr_(reinterpret_cast<const char*>(str.data())),
       length_(static_cast<int32_t>(str.size())) { }
 #endif
-#endif  // U_HIDE_DRAFT_API
 
   /**
    * Constructs from some other implementation of a string piece class, from any
@@ -152,18 +148,16 @@ class U_COMMON_API StringPiece : public UMemory {
    * @stable ICU 4.2
    */
   StringPiece(const char* offset, int32_t len) : ptr_(offset), length_(len) { }
-#ifndef U_HIDE_DRAFT_API
 #if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
   /**
    * Constructs from a const char8_t * pointer and a specified length.
    * @param str a const char8_t * pointer (need not be terminated)
    * @param len the length of the string; must be non-negative
-   * @draft ICU 67
+   * @stable ICU 67
    */
   StringPiece(const char8_t* str, int32_t len) :
       StringPiece(reinterpret_cast<const char*>(str), len) {}
 #endif
-#endif  // U_HIDE_DRAFT_API
 
   /**
    * Substring of another StringPiece.
@@ -233,13 +227,12 @@ class U_COMMON_API StringPiece : public UMemory {
    */
   void set(const char* str);
 
-#ifndef U_HIDE_DRAFT_API
 #if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
   /**
    * Resets the stringpiece to refer to new data.
    * @param xdata pointer the new string data. Need not be NUL-terminated.
    * @param len the length of the new data
-   * @draft ICU 67
+   * @stable ICU 67
    */
   inline void set(const char8_t* xdata, int32_t len) {
       set(reinterpret_cast<const char*>(xdata), len);
@@ -248,13 +241,12 @@ class U_COMMON_API StringPiece : public UMemory {
   /**
    * Resets the stringpiece to refer to new data.
    * @param str a pointer to a NUL-terminated string.
-   * @draft ICU 67
+   * @stable ICU 67
    */
   inline void set(const char8_t* str) {
       set(reinterpret_cast<const char*>(str));
   }
 #endif
-#endif  // U_HIDE_DRAFT_API
 
   /**
    * Removes the first n string units.
@@ -286,13 +278,12 @@ class U_COMMON_API StringPiece : public UMemory {
     }
   }
 
-#ifndef U_HIDE_DRAFT_API
   /**
    * Searches the StringPiece for the given search string (needle);
    * @param needle The string for which to search.
    * @param offset Where to start searching within this string (haystack).
    * @return The offset of needle in haystack, or -1 if not found.
-   * @draft ICU 67
+   * @stable ICU 67
    */
   int32_t find(StringPiece needle, int32_t offset);
 
@@ -301,10 +292,9 @@ class U_COMMON_API StringPiece : public UMemory {
    * similar to std::string::compare().
    * @param other The string to compare to.
    * @return below zero if this < other; above zero if this > other; 0 if this == other.
-   * @draft ICU 67
+   * @stable ICU 67
    */
   int32_t compare(StringPiece other);
-#endif  // U_HIDE_DRAFT_API
 
   /**
    * Maximum integer, used as a default value for substring methods.
diff --git a/thirdparty/icu4c/common/unicode/ubrk.h b/thirdparty/icu4c/common/unicode/ubrk.h
index 37189a8598..1249b0b160 100644
--- a/thirdparty/icu4c/common/unicode/ubrk.h
+++ b/thirdparty/icu4c/common/unicode/ubrk.h
@@ -296,6 +296,8 @@ ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength,
                      const UChar *  text, int32_t textLength,
                      UErrorCode *   status);
 
+#ifndef U_HIDE_DEPRECATED_API
+
 /**
  * Thread safe cloning operation
  * @param bi iterator to be cloned
@@ -312,7 +314,7 @@ ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength,
  * @param status to indicate whether the operation went on smoothly or there were errors
  *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
  * @return pointer to the new clone
- * @stable ICU 2.0
+ * @deprecated ICU 69 Use ubrk_clone() instead.
  */
 U_CAPI UBreakIterator * U_EXPORT2
 ubrk_safeClone(
@@ -321,6 +323,23 @@ ubrk_safeClone(
           int32_t *pBufferSize,
           UErrorCode *status);
 
+#endif /* U_HIDE_DEPRECATED_API */
+
+#ifndef U_HIDE_DRAFT_API
+
+/**
+ * Thread safe cloning operation.
+ * @param bi iterator to be cloned
+ * @param status to indicate whether the operation went on smoothly or there were errors
+ * @return pointer to the new clone
+ * @draft ICU 69
+ */
+U_CAPI UBreakIterator * U_EXPORT2
+ubrk_clone(const UBreakIterator *bi,
+           UErrorCode *status);
+
+#endif  // U_HIDE_DRAFT_API
+
 #ifndef U_HIDE_DEPRECATED_API
 
 /**
diff --git a/thirdparty/icu4c/common/unicode/ucnv.h b/thirdparty/icu4c/common/unicode/ucnv.h
index 58f271cfb5..5d784990f2 100644
--- a/thirdparty/icu4c/common/unicode/ucnv.h
+++ b/thirdparty/icu4c/common/unicode/ucnv.h
@@ -1699,10 +1699,10 @@ ucnv_countAvailable(void);
 
 /**
  * Gets the canonical converter name of the specified converter from a list of
- * all available converters contaied in the alias file. All converters
+ * all available converters contained in the alias file. All converters
  * in this list can be opened.
  *
- * @param n the index to a converter available on the system (in the range <TT>[0..ucnv_countAvaiable()]</TT>)
+ * @param n the index to a converter available on the system (in the range <TT>[0..ucnv_countAvailable()]</TT>)
  * @return a pointer a string (library owned), or <TT>NULL</TT> if the index is out of bounds.
  * @see ucnv_countAvailable
  * @stable ICU 2.0
diff --git a/thirdparty/icu4c/common/unicode/ucnvsel.h b/thirdparty/icu4c/common/unicode/ucnvsel.h
index 5e0a71cf35..3d7d3327f7 100644
--- a/thirdparty/icu4c/common/unicode/ucnvsel.h
+++ b/thirdparty/icu4c/common/unicode/ucnvsel.h
@@ -45,11 +45,11 @@
  * from the serialized form.
  */
 
+struct UConverterSelector;
 /**
  * @{
- * The selector data structure
+ * Typedef for selector data structure.
  */
-struct UConverterSelector;
 typedef struct UConverterSelector UConverterSelector;
 /** @} */
 
diff --git a/thirdparty/icu4c/common/unicode/unifilt.h b/thirdparty/icu4c/common/unicode/unifilt.h
index 420e1a1905..7870b55939 100644
--- a/thirdparty/icu4c/common/unicode/unifilt.h
+++ b/thirdparty/icu4c/common/unicode/unifilt.h
@@ -40,8 +40,8 @@ U_NAMESPACE_BEGIN
  *
  * <code>UnicodeFilter</code> defines a protocol for selecting a
  * subset of the full range (U+0000 to U+10FFFF) of Unicode characters.
- * Currently, filters are used in conjunction with classes like {@link
- * Transliterator} to only process selected characters through a
+ * Currently, filters are used in conjunction with classes like
+ * {@link Transliterator} to only process selected characters through a
  * transformation.
  *
  * <p>Note: UnicodeFilter currently stubs out two pure virtual methods
diff --git a/thirdparty/icu4c/common/unicode/uniset.h b/thirdparty/icu4c/common/unicode/uniset.h
index 50b6360f3a..8403c4026c 100644
--- a/thirdparty/icu4c/common/unicode/uniset.h
+++ b/thirdparty/icu4c/common/unicode/uniset.h
@@ -178,8 +178,6 @@ class RuleCharacterIterator;
  * Unicode property
  * </table>
  *
- * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
- *
  * <p><b>Formal syntax</b></p>
  *
  * \htmlonly<blockquote>\endhtmlonly
@@ -601,7 +599,7 @@ public:
 
     /**
      * Make this object represent the range `start - end`.
-     * If `end > start` then this object is set to an empty range.
+     * If `start > end` then this object is set to an empty range.
      * A frozen set will not be modified.
      *
      * @param start first character in the set, inclusive
@@ -1077,7 +1075,7 @@ public:
     /**
      * Adds the specified range to this set if it is not already
      * present.  If this set already contains the specified range,
-     * the call leaves this set unchanged.  If <code>end > start</code>
+     * the call leaves this set unchanged.  If <code>start > end</code>
      * then an empty range is added, leaving the set unchanged.
      * This is equivalent to a boolean logic OR, or a set UNION.
      * A frozen set will not be modified.
@@ -1095,6 +1093,9 @@ public:
      * present.  If this set already contains the specified character,
      * the call leaves this set unchanged.
      * A frozen set will not be modified.
+     *
+     * @param c the character (code point)
+     * @return this object, for chaining
      * @stable ICU 2.0
      */
     UnicodeSet& add(UChar32 c);
@@ -1104,8 +1105,8 @@ public:
      * present.  If this set already contains the multicharacter,
      * the call leaves this set unchanged.
      * Thus "ch" => {"ch"}
-     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
      * A frozen set will not be modified.
+     *
      * @param s the source string
      * @return this object, for chaining
      * @stable ICU 2.4
@@ -1124,8 +1125,8 @@ public:
 
  public:
     /**
-     * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
-     * If this set already any particular character, it has no effect on that character.
+     * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
+     * If this set already contains any particular character, it has no effect on that character.
      * A frozen set will not be modified.
      * @param s the source string
      * @return this object, for chaining
@@ -1135,7 +1136,6 @@ public:
 
     /**
      * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
-     * If this set already any particular character, it has no effect on that character.
      * A frozen set will not be modified.
      * @param s the source string
      * @return this object, for chaining
@@ -1145,7 +1145,6 @@ public:
 
     /**
      * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
-     * If this set already any particular character, it has no effect on that character.
      * A frozen set will not be modified.
      * @param s the source string
      * @return this object, for chaining
@@ -1155,7 +1154,6 @@ public:
 
     /**
      * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
-     * If this set already any particular character, it has no effect on that character.
      * A frozen set will not be modified.
      * @param s the source string
      * @return this object, for chaining
@@ -1165,7 +1163,7 @@ public:
 
     /**
      * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
-     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
+     *
      * @param s the source string
      * @return a newly created set containing the given string.
      * The caller owns the return object and is responsible for deleting it.
@@ -1185,15 +1183,13 @@ public:
 
     /**
      * Retain only the elements in this set that are contained in the
-     * specified range.  If <code>end > start</code> then an empty range is
+     * specified range.  If <code>start > end</code> then an empty range is
      * retained, leaving the set empty.  This is equivalent to
      * a boolean logic AND, or a set INTERSECTION.
      * A frozen set will not be modified.
      *
-     * @param start first character, inclusive, of range to be retained
-     * to this set.
-     * @param end last character, inclusive, of range to be retained
-     * to this set.
+     * @param start first character, inclusive, of range
+     * @param end last character, inclusive, of range
      * @stable ICU 2.0
      */
     virtual UnicodeSet& retain(UChar32 start, UChar32 end);
@@ -1202,14 +1198,31 @@ public:
     /**
      * Retain the specified character from this set if it is present.
      * A frozen set will not be modified.
+     *
+     * @param c the character (code point)
+     * @return this object, for chaining
      * @stable ICU 2.0
      */
     UnicodeSet& retain(UChar32 c);
 
+#ifndef U_HIDE_DRAFT_API
+    /**
+     * Retains only the specified string from this set if it is present.
+     * Upon return this set will be empty if it did not contain s, or
+     * will only contain s if it did contain s.
+     * A frozen set will not be modified.
+     *
+     * @param s the source string
+     * @return this object, for chaining
+     * @draft ICU 69
+     */
+    UnicodeSet& retain(const UnicodeString &s);
+#endif  // U_HIDE_DRAFT_API
+
     /**
      * Removes the specified range from this set if it is present.
      * The set will not contain the specified range once the call
-     * returns.  If <code>end > start</code> then an empty range is
+     * returns.  If <code>start > end</code> then an empty range is
      * removed, leaving the set unchanged.
      * A frozen set will not be modified.
      *
@@ -1226,6 +1239,9 @@ public:
      * The set will not contain the specified range once the call
      * returns.
      * A frozen set will not be modified.
+     *
+     * @param c the character (code point)
+     * @return this object, for chaining
      * @stable ICU 2.0
      */
     UnicodeSet& remove(UChar32 c);
@@ -1253,15 +1269,13 @@ public:
     /**
      * Complements the specified range in this set.  Any character in
      * the range will be removed if it is in this set, or will be
-     * added if it is not in this set.  If <code>end > start</code>
+     * added if it is not in this set.  If <code>start > end</code>
      * then an empty range is complemented, leaving the set unchanged.
      * This is equivalent to a boolean logic XOR.
      * A frozen set will not be modified.
      *
-     * @param start first character, inclusive, of range to be removed
-     * from this set.
-     * @param end last character, inclusive, of range to be removed
-     * from this set.
+     * @param start first character, inclusive, of range
+     * @param end last character, inclusive, of range
      * @stable ICU 2.0
      */
     virtual UnicodeSet& complement(UChar32 start, UChar32 end);
@@ -1271,16 +1285,18 @@ public:
      * will be removed if it is in this set, or will be added if it is
      * not in this set.
      * A frozen set will not be modified.
+     *
+     * @param c the character (code point)
+     * @return this object, for chaining
      * @stable ICU 2.0
      */
     UnicodeSet& complement(UChar32 c);
 
     /**
      * Complement the specified string in this set.
-     * The set will not contain the specified string once the call
-     * returns.
-     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
+     * The string will be removed if it is in this set, or will be added if it is not in this set.
      * A frozen set will not be modified.
+     *
      * @param s the string to complement
      * @return this object, for chaining
      * @stable ICU 2.4
diff --git a/thirdparty/icu4c/common/unicode/unistr.h b/thirdparty/icu4c/common/unicode/unistr.h
index 456389f265..85bd964951 100644
--- a/thirdparty/icu4c/common/unicode/unistr.h
+++ b/thirdparty/icu4c/common/unicode/unistr.h
@@ -44,9 +44,10 @@ struct UConverter;          // unicode/ucnv.h
 #ifndef USTRING_H
 /**
  * \ingroup ustring_ustrlen
+ * @param s Pointer to sequence of UChars.
+ * @return Length of sequence.
  */
-U_CAPI int32_t U_EXPORT2
-u_strlen(const UChar *s);
+U_CAPI int32_t U_EXPORT2 u_strlen(const UChar *s);
 #endif
 
 U_NAMESPACE_BEGIN
@@ -2766,7 +2767,6 @@ public:
    * @param options   Options bit set, usually 0. See U_TITLECASE_NO_LOWERCASE,
    *                  U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
    *                  U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
-   * @param options Options bit set, see ucasemap_open().
    * @return A reference to this.
    * @stable ICU 3.8
    */
@@ -3614,7 +3614,7 @@ private:
   // turn a bogus string into an empty one
   void unBogus();
 
-  // implements assigment operator, copy constructor, and fastCopyFrom()
+  // implements assignment operator, copy constructor, and fastCopyFrom()
   UnicodeString &copyFrom(const UnicodeString &src, UBool fastCopy=false);
 
   // Copies just the fields without memory management.
diff --git a/thirdparty/icu4c/common/unicode/urename.h b/thirdparty/icu4c/common/unicode/urename.h
index fe59fdd893..737f4b308e 100644
--- a/thirdparty/icu4c/common/unicode/urename.h
+++ b/thirdparty/icu4c/common/unicode/urename.h
@@ -482,6 +482,7 @@
 #define ubiditransform_open U_ICU_ENTRY_POINT_RENAME(ubiditransform_open)
 #define ubiditransform_transform U_ICU_ENTRY_POINT_RENAME(ubiditransform_transform)
 #define ublock_getCode U_ICU_ENTRY_POINT_RENAME(ublock_getCode)
+#define ubrk_clone U_ICU_ENTRY_POINT_RENAME(ubrk_clone)
 #define ubrk_close U_ICU_ENTRY_POINT_RENAME(ubrk_close)
 #define ubrk_countAvailable U_ICU_ENTRY_POINT_RENAME(ubrk_countAvailable)
 #define ubrk_current U_ICU_ENTRY_POINT_RENAME(ubrk_current)
@@ -534,6 +535,7 @@
 #define ucal_getTimeZoneDisplayName U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneDisplayName)
 #define ucal_getTimeZoneID U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneID)
 #define ucal_getTimeZoneIDForWindowsID U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneIDForWindowsID)
+#define ucal_getTimeZoneOffsetFromLocal U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneOffsetFromLocal)
 #define ucal_getTimeZoneTransitionDate U_ICU_ENTRY_POINT_RENAME(ucal_getTimeZoneTransitionDate)
 #define ucal_getType U_ICU_ENTRY_POINT_RENAME(ucal_getType)
 #define ucal_getWeekendTransition U_ICU_ENTRY_POINT_RENAME(ucal_getWeekendTransition)
@@ -962,6 +964,7 @@
 #define uhash_compareScriptSet U_ICU_ENTRY_POINT_RENAME(uhash_compareScriptSet)
 #define uhash_compareUChars U_ICU_ENTRY_POINT_RENAME(uhash_compareUChars)
 #define uhash_compareUnicodeString U_ICU_ENTRY_POINT_RENAME(uhash_compareUnicodeString)
+#define uhash_containsKey U_ICU_ENTRY_POINT_RENAME(uhash_containsKey)
 #define uhash_count U_ICU_ENTRY_POINT_RENAME(uhash_count)
 #define uhash_deleteHashtable U_ICU_ENTRY_POINT_RENAME(uhash_deleteHashtable)
 #define uhash_deleteScriptSet U_ICU_ENTRY_POINT_RENAME(uhash_deleteScriptSet)
@@ -970,6 +973,7 @@
 #define uhash_find U_ICU_ENTRY_POINT_RENAME(uhash_find)
 #define uhash_get U_ICU_ENTRY_POINT_RENAME(uhash_get)
 #define uhash_geti U_ICU_ENTRY_POINT_RENAME(uhash_geti)
+#define uhash_getiAndFound U_ICU_ENTRY_POINT_RENAME(uhash_getiAndFound)
 #define uhash_hashCaselessUnicodeString U_ICU_ENTRY_POINT_RENAME(uhash_hashCaselessUnicodeString)
 #define uhash_hashChars U_ICU_ENTRY_POINT_RENAME(uhash_hashChars)
 #define uhash_hashIChars U_ICU_ENTRY_POINT_RENAME(uhash_hashIChars)
@@ -977,12 +981,15 @@
 #define uhash_hashScriptSet U_ICU_ENTRY_POINT_RENAME(uhash_hashScriptSet)
 #define uhash_hashUChars U_ICU_ENTRY_POINT_RENAME(uhash_hashUChars)
 #define uhash_hashUnicodeString U_ICU_ENTRY_POINT_RENAME(uhash_hashUnicodeString)
+#define uhash_icontainsKey U_ICU_ENTRY_POINT_RENAME(uhash_icontainsKey)
 #define uhash_iget U_ICU_ENTRY_POINT_RENAME(uhash_iget)
 #define uhash_igeti U_ICU_ENTRY_POINT_RENAME(uhash_igeti)
+#define uhash_igetiAndFound U_ICU_ENTRY_POINT_RENAME(uhash_igetiAndFound)
 #define uhash_init U_ICU_ENTRY_POINT_RENAME(uhash_init)
 #define uhash_initSize U_ICU_ENTRY_POINT_RENAME(uhash_initSize)
 #define uhash_iput U_ICU_ENTRY_POINT_RENAME(uhash_iput)
 #define uhash_iputi U_ICU_ENTRY_POINT_RENAME(uhash_iputi)
+#define uhash_iputiAllowZero U_ICU_ENTRY_POINT_RENAME(uhash_iputiAllowZero)
 #define uhash_iremove U_ICU_ENTRY_POINT_RENAME(uhash_iremove)
 #define uhash_iremovei U_ICU_ENTRY_POINT_RENAME(uhash_iremovei)
 #define uhash_nextElement U_ICU_ENTRY_POINT_RENAME(uhash_nextElement)
@@ -990,6 +997,7 @@
 #define uhash_openSize U_ICU_ENTRY_POINT_RENAME(uhash_openSize)
 #define uhash_put U_ICU_ENTRY_POINT_RENAME(uhash_put)
 #define uhash_puti U_ICU_ENTRY_POINT_RENAME(uhash_puti)
+#define uhash_putiAllowZero U_ICU_ENTRY_POINT_RENAME(uhash_putiAllowZero)
 #define uhash_remove U_ICU_ENTRY_POINT_RENAME(uhash_remove)
 #define uhash_removeAll U_ICU_ENTRY_POINT_RENAME(uhash_removeAll)
 #define uhash_removeElement U_ICU_ENTRY_POINT_RENAME(uhash_removeElement)
@@ -1150,6 +1158,8 @@
 #define ultag_isUnicodeLocaleKey U_ICU_ENTRY_POINT_RENAME(ultag_isUnicodeLocaleKey)
 #define ultag_isUnicodeLocaleType U_ICU_ENTRY_POINT_RENAME(ultag_isUnicodeLocaleType)
 #define ultag_isVariantSubtags U_ICU_ENTRY_POINT_RENAME(ultag_isVariantSubtags)
+#define umeas_getPrefixBase U_ICU_ENTRY_POINT_RENAME(umeas_getPrefixBase)
+#define umeas_getPrefixPower U_ICU_ENTRY_POINT_RENAME(umeas_getPrefixPower)
 #define umsg_applyPattern U_ICU_ENTRY_POINT_RENAME(umsg_applyPattern)
 #define umsg_autoQuoteApostrophe U_ICU_ENTRY_POINT_RENAME(umsg_autoQuoteApostrophe)
 #define umsg_clone U_ICU_ENTRY_POINT_RENAME(umsg_clone)
@@ -1672,6 +1682,9 @@
 #define uset_compact U_ICU_ENTRY_POINT_RENAME(uset_compact)
 #define uset_complement U_ICU_ENTRY_POINT_RENAME(uset_complement)
 #define uset_complementAll U_ICU_ENTRY_POINT_RENAME(uset_complementAll)
+#define uset_complementAllCodePoints U_ICU_ENTRY_POINT_RENAME(uset_complementAllCodePoints)
+#define uset_complementRange U_ICU_ENTRY_POINT_RENAME(uset_complementRange)
+#define uset_complementString U_ICU_ENTRY_POINT_RENAME(uset_complementString)
 #define uset_contains U_ICU_ENTRY_POINT_RENAME(uset_contains)
 #define uset_containsAll U_ICU_ENTRY_POINT_RENAME(uset_containsAll)
 #define uset_containsAllCodePoints U_ICU_ENTRY_POINT_RENAME(uset_containsAllCodePoints)
@@ -1695,12 +1708,15 @@
 #define uset_openPatternOptions U_ICU_ENTRY_POINT_RENAME(uset_openPatternOptions)
 #define uset_remove U_ICU_ENTRY_POINT_RENAME(uset_remove)
 #define uset_removeAll U_ICU_ENTRY_POINT_RENAME(uset_removeAll)
+#define uset_removeAllCodePoints U_ICU_ENTRY_POINT_RENAME(uset_removeAllCodePoints)
 #define uset_removeAllStrings U_ICU_ENTRY_POINT_RENAME(uset_removeAllStrings)
 #define uset_removeRange U_ICU_ENTRY_POINT_RENAME(uset_removeRange)
 #define uset_removeString U_ICU_ENTRY_POINT_RENAME(uset_removeString)
 #define uset_resemblesPattern U_ICU_ENTRY_POINT_RENAME(uset_resemblesPattern)
 #define uset_retain U_ICU_ENTRY_POINT_RENAME(uset_retain)
 #define uset_retainAll U_ICU_ENTRY_POINT_RENAME(uset_retainAll)
+#define uset_retainAllCodePoints U_ICU_ENTRY_POINT_RENAME(uset_retainAllCodePoints)
+#define uset_retainString U_ICU_ENTRY_POINT_RENAME(uset_retainString)
 #define uset_serialize U_ICU_ENTRY_POINT_RENAME(uset_serialize)
 #define uset_serializedContains U_ICU_ENTRY_POINT_RENAME(uset_serializedContains)
 #define uset_set U_ICU_ENTRY_POINT_RENAME(uset_set)
diff --git a/thirdparty/icu4c/common/unicode/uset.h b/thirdparty/icu4c/common/unicode/uset.h
index 502ea8dc14..1d0daf9d09 100644
--- a/thirdparty/icu4c/common/unicode/uset.h
+++ b/thirdparty/icu4c/common/unicode/uset.h
@@ -582,8 +582,8 @@ U_CAPI void U_EXPORT2
 uset_addString(USet* set, const UChar* str, int32_t strLen);
 
 /**
- * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
- * If this set already any particular character, it has no effect on that character.
+ * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
+ * If this set already contains any particular character, it has no effect on that character.
  * A frozen set will not be modified.
  * @param set the object to which to add the character
  * @param str the source string
@@ -628,6 +628,20 @@ uset_removeRange(USet* set, UChar32 start, UChar32 end);
 U_CAPI void U_EXPORT2
 uset_removeString(USet* set, const UChar* str, int32_t strLen);
 
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"}
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length);
+#endif  // U_HIDE_DRAFT_API
+
 /**
  * Removes from this set all of its elements that are contained in the
  * specified set.  This operation effectively modifies this
@@ -650,15 +664,41 @@ uset_removeAll(USet* set, const USet* removeSet);
  * A frozen set will not be modified.
  *
  * @param set the object for which to retain only the specified range
- * @param start first character, inclusive, of range to be retained
- * to this set.
- * @param end last character, inclusive, of range to be retained
- * to this set.
+ * @param start first character, inclusive, of range
+ * @param end last character, inclusive, of range
  * @stable ICU 3.2
  */
 U_CAPI void U_EXPORT2
 uset_retain(USet* set, UChar32 start, UChar32 end);
 
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Retains only the specified string from this set if it is present.
+ * Upon return this set will be empty if it did not contain str, or
+ * will only contain str if it did contain str.
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_retainString(USet *set, const UChar *str, int32_t length);
+
+/**
+ * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length);
+#endif  // U_HIDE_DRAFT_API
+
 /**
  * Retains only the elements in this set that are contained in the
  * specified set.  In other words, removes from this set all of
@@ -696,6 +736,49 @@ uset_compact(USet* set);
 U_CAPI void U_EXPORT2
 uset_complement(USet* set);
 
+#ifndef U_HIDE_DRAFT_API
+/**
+ * Complements the specified range in this set.  Any character in
+ * the range will be removed if it is in this set, or will be
+ * added if it is not in this set.  If <code>start > end</code>
+ * then an empty range is complemented, leaving the set unchanged.
+ * This is equivalent to a boolean logic XOR.
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param start first character, inclusive, of range
+ * @param end last character, inclusive, of range
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_complementRange(USet *set, UChar32 start, UChar32 end);
+
+/**
+ * Complements the specified string in this set.
+ * The string will be removed if it is in this set, or will be added if it is not in this set.
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_complementString(USet *set, const UChar *str, int32_t length);
+
+/**
+ * Complements EACH of the characters in this string. Note: "ch" == {"c", "h"}
+ * A frozen set will not be modified.
+ *
+ * @param set the object to be modified
+ * @param str the string
+ * @param length the length of the string, or -1 if NUL-terminated
+ * @draft ICU 69
+ */
+U_CAPI void U_EXPORT2
+uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length);
+#endif  // U_HIDE_DRAFT_API
+
 /**
  * Complements in this set all elements contained in the specified
  * set.  Any character in the other set will be removed if it is
diff --git a/thirdparty/icu4c/common/unicode/ushape.h b/thirdparty/icu4c/common/unicode/ushape.h
index fed4869abd..14371edc8f 100644
--- a/thirdparty/icu4c/common/unicode/ushape.h
+++ b/thirdparty/icu4c/common/unicode/ushape.h
@@ -323,7 +323,7 @@ u_shapeArabic(const UChar *source, int32_t sourceLength,
 #define U_SHAPE_PRESERVE_PRESENTATION           0x8000
 /** Presentation form option: 
  * Replace Arabic Presentation Forms-A and Arabic Presentationo Forms-B with 
- * their unshaped correspondants in range 0+06xx, before shaping.
+ * their unshaped correspondents in range U+06xx, before shaping.
  * @stable ICU 3.6 
  */
 #define U_SHAPE_PRESERVE_PRESENTATION_NOOP      0
diff --git a/thirdparty/icu4c/common/unicode/utrace.h b/thirdparty/icu4c/common/unicode/utrace.h
index 28c313c582..677486f473 100644
--- a/thirdparty/icu4c/common/unicode/utrace.h
+++ b/thirdparty/icu4c/common/unicode/utrace.h
@@ -173,24 +173,23 @@ typedef enum UTraceFunctionNumber {
     UTRACE_RES_DATA_LIMIT,
 #endif  // U_HIDE_INTERNAL_API
 
-#ifndef U_HIDE_DRAFT_API
     /**
      * The lowest break iterator location.
-     * @draft ICU 67
+     * @stable ICU 67
      */
     UTRACE_UBRK_START=0x4000,
 
     /**
      * Indicates that a character instance of break iterator was created.
      *
-     * @draft ICU 67
+     * @stable ICU 67
      */
     UTRACE_UBRK_CREATE_CHARACTER = UTRACE_UBRK_START,
 
     /**
      * Indicates that a word instance of break iterator was created.
      *
-     * @draft ICU 67
+     * @stable ICU 67
      */
     UTRACE_UBRK_CREATE_WORD,
 
@@ -200,21 +199,21 @@ typedef enum UTraceFunctionNumber {
      * Provides one C-style string to UTraceData: the lb value ("",
      * "loose", "strict", or "normal").
      *
-     * @draft ICU 67
+     * @stable ICU 67
      */
     UTRACE_UBRK_CREATE_LINE,
 
     /**
      * Indicates that a sentence instance of break iterator was created.
      *
-     * @draft ICU 67
+     * @stable ICU 67
      */
     UTRACE_UBRK_CREATE_SENTENCE,
 
     /**
      * Indicates that a title instance of break iterator was created.
      *
-     * @draft ICU 67
+     * @stable ICU 67
      */
     UTRACE_UBRK_CREATE_TITLE,
 
@@ -224,12 +223,10 @@ typedef enum UTraceFunctionNumber {
      * Provides one C-style string to UTraceData: the script code of what
      * the break engine cover ("Hani", "Khmr", "Laoo", "Mymr", or "Thai").
      *
-     * @draft ICU 67
+     * @stable ICU 67
      */
     UTRACE_UBRK_CREATE_BREAK_ENGINE,
 
-#endif  // U_HIDE_DRAFT_API
-
 #ifndef U_HIDE_INTERNAL_API
     /**
      * One more than the highest normal break iterator trace location.
diff --git a/thirdparty/icu4c/common/unicode/uvernum.h b/thirdparty/icu4c/common/unicode/uvernum.h
index a46481a3fe..b09d4943c1 100644
--- a/thirdparty/icu4c/common/unicode/uvernum.h
+++ b/thirdparty/icu4c/common/unicode/uvernum.h
@@ -60,13 +60,13 @@
  *  This value will change in the subsequent releases of ICU
  *  @stable ICU 2.4
  */
-#define U_ICU_VERSION_MAJOR_NUM 68
+#define U_ICU_VERSION_MAJOR_NUM 69
 
 /** The current ICU minor version as an integer.
  *  This value will change in the subsequent releases of ICU
  *  @stable ICU 2.6
  */
-#define U_ICU_VERSION_MINOR_NUM 2
+#define U_ICU_VERSION_MINOR_NUM 1
 
 /** The current ICU patchlevel version as an integer.
  *  This value will change in the subsequent releases of ICU
@@ -86,7 +86,7 @@
  *  This value will change in the subsequent releases of ICU
  *  @stable ICU 2.6
  */
-#define U_ICU_VERSION_SUFFIX _68
+#define U_ICU_VERSION_SUFFIX _69
 
 /**
  * \def U_DEF2_ICU_ENTRY_POINT_RENAME
@@ -139,7 +139,7 @@
  *  This value will change in the subsequent releases of ICU
  *  @stable ICU 2.4
  */
-#define U_ICU_VERSION "68.2"
+#define U_ICU_VERSION "69.1"
 
 /**
  * The current ICU library major version number as a string, for library name suffixes.
@@ -152,13 +152,13 @@
  *
  * @stable ICU 2.6
  */
-#define U_ICU_VERSION_SHORT "68"
+#define U_ICU_VERSION_SHORT "69"
 
 #ifndef U_HIDE_INTERNAL_API
 /** Data version in ICU4C.
  * @internal ICU 4.4 Internal Use Only
  **/
-#define U_ICU_DATA_VERSION "68.2"
+#define U_ICU_DATA_VERSION "69.1"
 #endif  /* U_HIDE_INTERNAL_API */
 
 /*===========================================================================
diff --git a/thirdparty/icu4c/common/uniset.cpp b/thirdparty/icu4c/common/uniset.cpp
index b73d612f24..461e5a7197 100644
--- a/thirdparty/icu4c/common/uniset.cpp
+++ b/thirdparty/icu4c/common/uniset.cpp
@@ -30,24 +30,6 @@
 #include "bmpset.h"
 #include "unisetspan.h"
 
-// Define UChar constants using hex for EBCDIC compatibility
-// Used #define to reduce private static exports and memory access time.
-#define SET_OPEN        ((UChar)0x005B) /*[*/
-#define SET_CLOSE       ((UChar)0x005D) /*]*/
-#define HYPHEN          ((UChar)0x002D) /*-*/
-#define COMPLEMENT      ((UChar)0x005E) /*^*/
-#define COLON           ((UChar)0x003A) /*:*/
-#define BACKSLASH       ((UChar)0x005C) /*\*/
-#define INTERSECTION    ((UChar)0x0026) /*&*/
-#define UPPER_U         ((UChar)0x0055) /*U*/
-#define LOWER_U         ((UChar)0x0075) /*u*/
-#define OPEN_BRACE      ((UChar)123)    /*{*/
-#define CLOSE_BRACE     ((UChar)125)    /*}*/
-#define UPPER_P         ((UChar)0x0050) /*P*/
-#define LOWER_P         ((UChar)0x0070) /*p*/
-#define UPPER_N         ((UChar)78)     /*N*/
-#define EQUALS          ((UChar)0x003D) /*=*/
-
 // HIGH_VALUE > all valid values. 110000 for codepoints
 #define UNICODESET_HIGH 0x0110000
 
@@ -444,7 +426,6 @@ UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
  * @return <tt>true</tt> if this set contains the specified string
  */
 UBool UnicodeSet::contains(const UnicodeString& s) const {
-    if (s.length() == 0) return FALSE;
     int32_t cp = getSingleCP(s);
     if (cp < 0) {
         return stringsContains(s);
@@ -559,11 +540,9 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
     if (hasStrings()) {
         for (i=0; i<strings->size(); ++i) {
             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
-            //if (s.length() == 0) {
-            //    // Empty strings match everything
-            //    return TRUE;
-            //}
-            // assert(s.length() != 0); // We enforce this elsewhere
+            if (s.isEmpty()) {
+                continue;  // skip the empty string
+            }
             UChar32 c = s.char32At(0);
             if ((c & 0xFF) == v) {
                 return TRUE;
@@ -582,9 +561,6 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text,
                                  int32_t limit,
                                  UBool incremental) {
     if (offset == limit) {
-        // Strings, if any, have length != 0, so we don't worry
-        // about them here.  If we ever allow zero-length strings
-        // we much check for them here.
         if (contains(U_ETHER)) {
             return incremental ? U_PARTIAL_MATCH : U_MATCH;
         } else {
@@ -614,11 +590,9 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text,
 
             for (i=0; i<strings->size(); ++i) {
                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
-
-                //if (trial.length() == 0) {
-                //    return U_MATCH; // null-string always matches
-                //}
-                // assert(trial.length() != 0); // We ensure this elsewhere
+                if (trial.isEmpty()) {
+                    continue;  // skip the empty string
+                }
 
                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
 
@@ -971,12 +945,12 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
  * present.  If this set already contains the multicharacter,
  * the call leaves this set unchanged.
  * Thus "ch" => {"ch"}
- * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
+ *
  * @param s the source string
  * @return the modified set, for chaining
  */
 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
-    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
+    if (isFrozen() || isBogus()) return *this;
     int32_t cp = getSingleCP(s);
     if (cp < 0) {
         if (!stringsContains(s)) {
@@ -991,8 +965,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
 
 /**
  * Adds the given string, in order, to 'strings'.  The given string
- * must have been checked by the caller to not be empty and to not
- * already be in 'strings'.
+ * must have been checked by the caller to not already be in 'strings'.
  */
 void UnicodeSet::_add(const UnicodeString& s) {
     if (isFrozen() || isBogus()) {
@@ -1021,16 +994,13 @@ void UnicodeSet::_add(const UnicodeString& s) {
  * @param string to test
  */
 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
-    //if (s.length() < 1) {
-    //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
-    //}
-    if (s.length() > 2) return -1;
-    if (s.length() == 1) return s.charAt(0);
-
-    // at this point, len = 2
-    UChar32 cp = s.char32At(0);
-    if (cp > 0xFFFF) { // is surrogate pair
-        return cp;
+    int32_t sLength = s.length();  // any length other than 1 or 2 (including 0) ends at the final return -1
+    if (sLength == 1) return s.charAt(0);  // a lone code unit is returned as the code point
+    if (sLength == 2) {  // two units count as one code point only when char32At(0) is above 0xFFFF
+        UChar32 cp = s.char32At(0);
+        if (cp > 0xFFFF) { // is surrogate pair
+            return cp;
+        }
     }
     return -1;
 }
@@ -1150,6 +1120,26 @@ UnicodeSet& UnicodeSet::retain(UChar32 c) {
     return retain(c, c);
 }
 
+UnicodeSet& UnicodeSet::retain(const UnicodeString &s) {  // keeps at most the element s; returns *this for chaining
+    if (isFrozen() || isBogus()) { return *this; }  // frozen or bogus set: left untouched
+    UChar32 cp = getSingleCP(s);  // negative when s is not exactly one code point
+    if (cp < 0) {  // s is handled as a string element
+        bool isIn = stringsContains(s);
+        // Check for getRangeCount() first to avoid somewhat-expensive size()
+        // when there are single code points.
+        if (isIn && getRangeCount() == 0 && size() == 1) {
+            return *this;  // presumably s is already the only element, so nothing to do
+        }
+        clear();
+        if (isIn) {
+            _add(s);  // put s back after clear() so it is the sole remaining element
+        }
+    } else {
+        retain(cp, cp);  // single code point: delegate to the (start, end) overload
+    }
+    return *this;
+}
+
 /**
  * Removes the specified range from this set if it is present.
  * The set will not contain the specified range once the call
@@ -1186,7 +1176,7 @@ UnicodeSet& UnicodeSet::remove(UChar32 c) {
  * @return the modified set, for chaining
  */
 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
-    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
+    if (isFrozen() || isBogus()) return *this;
     int32_t cp = getSingleCP(s);
     if (cp < 0) {
         if (strings != nullptr && strings->removeElement((void*) &s)) {
@@ -1252,12 +1242,12 @@ UnicodeSet& UnicodeSet::complement(void) {
  * Complement the specified string in this set.
  * The set will not contain the specified string once the call
  * returns.
- * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
+ *
  * @param s the string to complement
  * @return this object, for chaining
  */
 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
-    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
+    if (isFrozen() || isBogus()) return *this;
     int32_t cp = getSingleCP(s);
     if (cp < 0) {
         if (stringsContains(s)) {
@@ -2001,22 +1991,22 @@ escapeUnprintable) {
     }
     // Okay to let ':' pass through
     switch (c) {
-    case SET_OPEN:
-    case SET_CLOSE:
-    case HYPHEN:
-    case COMPLEMENT:
-    case INTERSECTION:
-    case BACKSLASH:
-    case OPEN_BRACE:
-    case CLOSE_BRACE:
-    case COLON:
+    case u'[':
+    case u']':
+    case u'-':
+    case u'^':
+    case u'&':
+    case u'\\':
+    case u'{':
+    case u'}':
+    case u':':
     case SymbolTable::SYMBOL_REF:
-        buf.append(BACKSLASH);
+        buf.append(u'\\');
         break;
     default:
         // Escape whitespace
         if (PatternProps::isWhiteSpace(c)) {
-            buf.append(BACKSLASH);
+            buf.append(u'\\');
         }
         break;
     }
@@ -2049,7 +2039,7 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
                 backslashCount = 0;
             } else {
                 result.append(c);
-                if (c == BACKSLASH) {
+                if (c == u'\\') {
                     ++backslashCount;
                 } else {
                     backslashCount = 0;
@@ -2082,13 +2072,13 @@ UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
                                             UBool escapeUnprintable) const
 {
-    result.append(SET_OPEN);
+    result.append(u'[');
 
 //  // Check against the predefined categories.  We implicitly build
 //  // up ALL category sets the first time toPattern() is called.
 //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
 //      if (*this == getCategorySet(cat)) {
-//          result.append(COLON);
+//          result.append(u':');
 //          result.append(CATEGORY_NAMES, cat*2, 2);
 //          return result.append(CATEGORY_CLOSE);
 //      }
@@ -2104,7 +2094,7 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
         getRangeEnd(count-1) == MAX_VALUE) {
 
         // Emit the inverse
-        result.append(COMPLEMENT);
+        result.append(u'^');
 
         for (int32_t i = 1; i < count; ++i) {
             UChar32 start = getRangeEnd(i-1)+1;
@@ -2112,7 +2102,7 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
             _appendToPat(result, start, escapeUnprintable);
             if (start != end) {
                 if ((start+1) != end) {
-                    result.append(HYPHEN);
+                    result.append(u'-');
                 }
                 _appendToPat(result, end, escapeUnprintable);
             }
@@ -2127,7 +2117,7 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
             _appendToPat(result, start, escapeUnprintable);
             if (start != end) {
                 if ((start+1) != end) {
-                    result.append(HYPHEN);
+                    result.append(u'-');
                 }
                 _appendToPat(result, end, escapeUnprintable);
             }
@@ -2136,14 +2126,14 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
 
     if (strings != nullptr) {
         for (int32_t i = 0; i<strings->size(); ++i) {
-            result.append(OPEN_BRACE);
+            result.append(u'{');
             _appendToPat(result,
                          *(const UnicodeString*) strings->elementAt(i),
                          escapeUnprintable);
-            result.append(CLOSE_BRACE);
+            result.append(u'}');
         }
     }
-    return result.append(SET_CLOSE);
+    return result.append(u']');
 }
 
 /**
diff --git a/thirdparty/icu4c/common/uniset_props.cpp b/thirdparty/icu4c/common/uniset_props.cpp
index 37277fcb75..8fde5abcdd 100644
--- a/thirdparty/icu4c/common/uniset_props.cpp
+++ b/thirdparty/icu4c/common/uniset_props.cpp
@@ -47,31 +47,6 @@
 
 U_NAMESPACE_USE
 
-// Define UChar constants using hex for EBCDIC compatibility
-// Used #define to reduce private static exports and memory access time.
-#define SET_OPEN        ((UChar)0x005B) /*[*/
-#define SET_CLOSE       ((UChar)0x005D) /*]*/
-#define HYPHEN          ((UChar)0x002D) /*-*/
-#define COMPLEMENT      ((UChar)0x005E) /*^*/
-#define COLON           ((UChar)0x003A) /*:*/
-#define BACKSLASH       ((UChar)0x005C) /*\*/
-#define INTERSECTION    ((UChar)0x0026) /*&*/
-#define UPPER_U         ((UChar)0x0055) /*U*/
-#define LOWER_U         ((UChar)0x0075) /*u*/
-#define OPEN_BRACE      ((UChar)123)    /*{*/
-#define CLOSE_BRACE     ((UChar)125)    /*}*/
-#define UPPER_P         ((UChar)0x0050) /*P*/
-#define LOWER_P         ((UChar)0x0070) /*p*/
-#define UPPER_N         ((UChar)78)     /*N*/
-#define EQUALS          ((UChar)0x003D) /*=*/
-
-//static const UChar POSIX_OPEN[]  = { SET_OPEN,COLON,0 };  // "[:"
-static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 };  // ":]"
-//static const UChar PERL_OPEN[]   = { BACKSLASH,LOWER_P,0 }; // "\\p"
-//static const UChar PERL_CLOSE[]  = { CLOSE_BRACE,0 };    // "}"
-//static const UChar NAME_OPEN[]   = { BACKSLASH,UPPER_N,0 };  // "\\N"
-static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
-
 // Special property set IDs
 static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
 static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
@@ -81,12 +56,6 @@ static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
 #define NAME_PROP "na"
 #define NAME_PROP_LENGTH 2
 
-/**
- * Delimiter string used in patterns to close a category reference:
- * ":]".  Example: "[:Lu:]".
- */
-//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
-
 // Cached sets ------------------------------------------------------------- ***
 
 U_CDECL_BEGIN
@@ -140,27 +109,27 @@ uniset_getUnicode32Instance(UErrorCode &errorCode) {
 static inline UBool
 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
     UChar c;
-    return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
+    return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');  // true when "\p" or "\P" starts at pos
 }
 
 /*static inline UBool
 isPerlClose(const UnicodeString &pattern, int32_t pos) {
-    return pattern.charAt(pos)==CLOSE_BRACE;
+    return pattern.charAt(pos)==u'}';
 }*/
 
 static inline UBool
 isNameOpen(const UnicodeString &pattern, int32_t pos) {
-    return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
+    return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';  // true when "\N" starts at pos
 }
 
 static inline UBool
 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
-    return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
+    return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';  // true when "[:" starts at pos
 }
 
 /*static inline UBool
 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
-    return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
+    return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
 }*/
 
 // TODO memory debugging provided inside uniset.cpp
@@ -326,9 +295,8 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
 
     while (mode != 2 && !chars.atEnd()) {
         U_ASSERT((lastItem == 0 && op == 0) ||
-                 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
-                 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
-                                    op == INTERSECTION /*'&'*/)));
+                 (lastItem == 1 && (op == 0 || op == u'-')) ||
+                 (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));
 
         UChar32 c = 0;
         UBool literal = FALSE;
@@ -356,27 +324,27 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
             c = chars.next(opts, literal, ec);
             if (U_FAILURE(ec)) return;
 
-            if (c == 0x5B /*'['*/ && !literal) {
+            if (c == u'[' && !literal) {
                 if (mode == 1) {
                     chars.setPos(backup); // backup
                     setMode = 1;
                 } else {
                     // Handle opening '[' delimiter
                     mode = 1;
-                    patLocal.append((UChar) 0x5B /*'['*/);
+                    patLocal.append(u'[');
                     chars.getPos(backup); // prepare to backup
                     c = chars.next(opts, literal, ec); 
                     if (U_FAILURE(ec)) return;
-                    if (c == 0x5E /*'^'*/ && !literal) {
+                    if (c == u'^' && !literal) {
                         invert = TRUE;
-                        patLocal.append((UChar) 0x5E /*'^'*/);
+                        patLocal.append(u'^');
                         chars.getPos(backup); // prepare to backup
                         c = chars.next(opts, literal, ec);
                         if (U_FAILURE(ec)) return;
                     }
                     // Fall through to handle special leading '-';
                     // otherwise restart loop for nested [], \p{}, etc.
-                    if (c == HYPHEN /*'-'*/) {
+                    if (c == u'-') {
                         literal = TRUE;
                         // Fall through to handle literal '-' below
                     } else {
@@ -418,7 +386,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                 op = 0;
             }
 
-            if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
+            if (op == u'-' || op == u'&') {
                 patLocal.append(op);
             }
 
@@ -454,10 +422,10 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
             }
 
             switch (op) {
-            case HYPHEN: /*'-'*/
+            case u'-':
                 removeAll(*nested);
                 break;
-            case INTERSECTION: /*'&'*/
+            case u'&':
                 retainAll(*nested);
                 break;
             case 0:
@@ -483,24 +451,24 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
 
         if (!literal) {
             switch (c) {
-            case 0x5D /*']'*/:
+            case u']':
                 if (lastItem == 1) {
                     add(lastChar, lastChar);
                     _appendToPat(patLocal, lastChar, FALSE);
                 }
                 // Treat final trailing '-' as a literal
-                if (op == HYPHEN /*'-'*/) {
+                if (op == u'-') {
                     add(op, op);
                     patLocal.append(op);
-                } else if (op == INTERSECTION /*'&'*/) {
+                } else if (op == u'&') {
                     // syntaxError(chars, "Trailing '&'");
                     ec = U_MALFORMED_SET;
                     return;
                 }
-                patLocal.append((UChar) 0x5D /*']'*/);
+                patLocal.append(u']');
                 mode = 2;
                 continue;
-            case HYPHEN /*'-'*/:
+            case u'-':
                 if (op == 0) {
                     if (lastItem != 0) {
                         op = (UChar) c;
@@ -510,8 +478,8 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                         add(c, c);
                         c = chars.next(opts, literal, ec);
                         if (U_FAILURE(ec)) return;
-                        if (c == 0x5D /*']'*/ && !literal) {
-                            patLocal.append(HYPHEN_RIGHT_BRACE, 2);
+                        if (c == u']' && !literal) {
+                            patLocal.append(u"-]", 2);
                             mode = 2;
                             continue;
                         }
@@ -520,7 +488,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                 // syntaxError(chars, "'-' not after char or set");
                 ec = U_MALFORMED_SET;
                 return;
-            case INTERSECTION /*'&'*/:
+            case u'&':
                 if (lastItem == 2 && op == 0) {
                     op = (UChar) c;
                     continue;
@@ -528,11 +496,11 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                 // syntaxError(chars, "'&' not after set");
                 ec = U_MALFORMED_SET;
                 return;
-            case 0x5E /*'^'*/:
+            case u'^':
                 // syntaxError(chars, "'^' not after '['");
                 ec = U_MALFORMED_SET;
                 return;
-            case 0x7B /*'{'*/:
+            case u'{':
                 if (op != 0) {
                     // syntaxError(chars, "Missing operand after operator");
                     ec = U_MALFORMED_SET;
@@ -549,13 +517,13 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                     while (!chars.atEnd()) {
                         c = chars.next(opts, literal, ec);
                         if (U_FAILURE(ec)) return;
-                        if (c == 0x7D /*'}'*/ && !literal) {
+                        if (c == u'}' && !literal) {
                             ok = TRUE;
                             break;
                         }
                         buf.append(c);
                     }
-                    if (buf.length() < 1 || !ok) {
+                    if (!ok) {
                         // syntaxError(chars, "Invalid multicharacter string");
                         ec = U_MALFORMED_SET;
                         return;
@@ -565,9 +533,9 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                 // we don't need to drop through to the further
                 // processing
                 add(buf);
-                patLocal.append((UChar) 0x7B /*'{'*/);
+                patLocal.append(u'{');
                 _appendToPat(patLocal, buf, FALSE);
-                patLocal.append((UChar) 0x7D /*'}'*/);
+                patLocal.append(u'}');
                 continue;
             case SymbolTable::SYMBOL_REF:
                 //         symbols  nosymbols
@@ -580,7 +548,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                     chars.getPos(backup);
                     c = chars.next(opts, literal, ec);
                     if (U_FAILURE(ec)) return;
-                    UBool anchor = (c == 0x5D /*']'*/ && !literal);
+                    UBool anchor = (c == u']' && !literal);
                     if (symbols == 0 && !anchor) {
                         c = SymbolTable::SYMBOL_REF;
                         chars.setPos(backup);
@@ -594,7 +562,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                         add(U_ETHER);
                         usePat = TRUE;
                         patLocal.append((UChar) SymbolTable::SYMBOL_REF);
-                        patLocal.append((UChar) 0x5D /*']'*/);
+                        patLocal.append(u']');
                         mode = 2;
                         continue;
                     }
@@ -617,7 +585,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
             lastChar = c;
             break;
         case 1:
-            if (op == HYPHEN /*'-'*/) {
+            if (op == u'-') {
                 if (lastChar >= c) {
                     // Don't allow redundant (a-a) or empty (b-a) ranges;
                     // these are most likely typos.
@@ -1036,11 +1004,11 @@ UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
     RuleCharacterIterator::Pos pos;
     chars.getPos(pos);
     UChar32 c = chars.next(iterOpts, literal, ec);
-    if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
+    if (c == u'[' || c == u'\\') {
         UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
                                literal, ec);
-        result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
-                 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
+        result = (c == u'[') ? (d == u':') :
+                               (d == u'N' || d == u'p' || d == u'P');
     }
     chars.setPos(pos);
     return result && U_SUCCESS(ec);
@@ -1071,17 +1039,17 @@ UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
         posix = TRUE;
         pos += 2;
         pos = ICU_Utility::skipWhitespace(pattern, pos);
-        if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
+        if (pos < pattern.length() && pattern.charAt(pos) == u'^') {
             ++pos;
             invert = TRUE;
         }
     } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
         UChar c = pattern.charAt(pos+1);
-        invert = (c == UPPER_P);
-        isName = (c == UPPER_N);
+        invert = (c == u'P');
+        isName = (c == u'N');
         pos += 2;
         pos = ICU_Utility::skipWhitespace(pattern, pos);
-        if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
+        if (pos == pattern.length() || pattern.charAt(pos++) != u'{') {
             // Syntax error; "\p" or "\P" not followed by "{"
             FAIL(ec);
         }
@@ -1093,9 +1061,9 @@ UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
     // Look for the matching close delimiter, either :] or }
     int32_t close;
     if (posix) {
-      close = pattern.indexOf(POSIX_CLOSE, 2, pos);
+      close = pattern.indexOf(u":]", 2, pos);
     } else {
-      close = pattern.indexOf(CLOSE_BRACE, pos);
+      close = pattern.indexOf(u'}', pos);
     }
     if (close < 0) {
         // Syntax error; close delimiter missing
@@ -1105,7 +1073,7 @@ UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
     // Look for an '=' sign.  If this is present, we will parse a
     // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
     // pattern.
-    int32_t equals = pattern.indexOf(EQUALS, pos);
+    int32_t equals = pattern.indexOf(u'=', pos);
     UnicodeString propName, valueName;
     if (equals >= 0 && equals < close && !isName) {
         // Equals seen; parse medium/long pattern
diff --git a/thirdparty/icu4c/common/unisetspan.cpp b/thirdparty/icu4c/common/unisetspan.cpp
index 68e44d91ee..fe0d74f5b2 100644
--- a/thirdparty/icu4c/common/unisetspan.cpp
+++ b/thirdparty/icu4c/common/unisetspan.cpp
@@ -231,6 +231,9 @@ UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
         const UChar *s16=string.getBuffer();
         int32_t length16=string.length();
+        if (length16==0) {
+            continue;  // skip the empty string
+        }
         UBool thisRelevant;
         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
         if(spanLength<length16) {  // Relevant string.
@@ -312,7 +315,7 @@ UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
         const UChar *s16=string.getBuffer();
         int32_t length16=string.length();
         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
-        if(spanLength<length16) {  // Relevant string.
+        if(spanLength<length16 && length16>0) {  // Relevant string.
             if(which&UTF16) {
                 if(which&CONTAINED) {
                     if(which&FWD) {
@@ -362,7 +365,7 @@ UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
                     addToSpanNotSet(c);
                 }
             }
-        } else {  // Irrelevant string.
+        } else {  // Irrelevant string. (Also the empty string.)
             if(which&UTF8) {
                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
                     uint8_t *s8=utf8+utf8Count;
@@ -653,11 +656,12 @@ int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondi
             for(i=0; i<stringsLength; ++i) {
                 int32_t overlap=spanLengths[i];
                 if(overlap==ALL_CP_CONTAINED) {
-                    continue;  // Irrelevant string.
+                    continue;  // Irrelevant string. (Also the empty string.)
                 }
                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
                 const UChar *s16=string.getBuffer();
                 int32_t length16=string.length();
+                U_ASSERT(length>0);
 
                 // Try to match this string at pos-overlap..pos.
                 if(overlap>=LONG_SPAN) {
@@ -697,6 +701,9 @@ int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondi
                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
                 const UChar *s16=string.getBuffer();
                 int32_t length16=string.length();
+                if (length16==0) {
+                    continue;  // skip the empty string
+                }
 
                 // Try to match this string at pos-overlap..pos.
                 if(overlap>=LONG_SPAN) {
@@ -817,11 +824,12 @@ int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanC
             for(i=0; i<stringsLength; ++i) {
                 int32_t overlap=spanBackLengths[i];
                 if(overlap==ALL_CP_CONTAINED) {
-                    continue;  // Irrelevant string.
+                    continue;  // Irrelevant string. (Also the empty string.)
                 }
                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
                 const UChar *s16=string.getBuffer();
                 int32_t length16=string.length();
+                U_ASSERT(length>0);
 
                 // Try to match this string at pos-(length16-overlap)..pos-length16.
                 if(overlap>=LONG_SPAN) {
@@ -863,6 +871,9 @@ int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanC
                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
                 const UChar *s16=string.getBuffer();
                 int32_t length16=string.length();
+                if (length16==0) {
+                    continue;  // skip the empty string
+                }
 
                 // Try to match this string at pos-(length16-overlap)..pos-length16.
                 if(overlap>=LONG_SPAN) {
@@ -1358,11 +1369,12 @@ int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
         // Try to match the strings at pos.
         for(i=0; i<stringsLength; ++i) {
             if(spanLengths[i]==ALL_CP_CONTAINED) {
-                continue;  // Irrelevant string.
+                continue;  // Irrelevant string. (Also the empty string.)
             }
             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
             const UChar *s16=string.getBuffer();
             int32_t length16=string.length();
+            U_ASSERT(length>0);
             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
                 return pos;  // There is a set element at pos.
             }
@@ -1401,11 +1413,12 @@ int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const
             // it is easier and we only need to know whether the string is irrelevant
             // which is the same in either array.
             if(spanLengths[i]==ALL_CP_CONTAINED) {
-                continue;  // Irrelevant string.
+                continue;  // Irrelevant string. (Also the empty string.)
             }
             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
             const UChar *s16=string.getBuffer();
             int32_t length16=string.length();
+            U_ASSERT(length>0);
             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
                 return pos;  // There is a set element at pos.
             }
diff --git a/thirdparty/icu4c/common/uprops.h b/thirdparty/icu4c/common/uprops.h
index 8bf929919f..09830bdeb9 100644
--- a/thirdparty/icu4c/common/uprops.h
+++ b/thirdparty/icu4c/common/uprops.h
@@ -310,55 +310,12 @@ u_isgraphPOSIX(UChar32 c);
 U_CFUNC UBool
 u_isprintPOSIX(UChar32 c);
 
-/** Turn a bit index into a bit flag. @internal */
-#define FLAG(n) ((uint32_t)1<<(n))
-
-/** Flags for general categories in the order of UCharCategory. @internal */
-#define _Cn     FLAG(U_GENERAL_OTHER_TYPES)
-#define _Lu     FLAG(U_UPPERCASE_LETTER)
-#define _Ll     FLAG(U_LOWERCASE_LETTER)
-#define _Lt     FLAG(U_TITLECASE_LETTER)
-#define _Lm     FLAG(U_MODIFIER_LETTER)
-/* #define _Lo     FLAG(U_OTHER_LETTER) -- conflicts with MS Visual Studio 9.0 xiosbase */
-#define _Mn     FLAG(U_NON_SPACING_MARK)
-#define _Me     FLAG(U_ENCLOSING_MARK)
-#define _Mc     FLAG(U_COMBINING_SPACING_MARK)
-#define _Nd     FLAG(U_DECIMAL_DIGIT_NUMBER)
-#define _Nl     FLAG(U_LETTER_NUMBER)
-#define _No     FLAG(U_OTHER_NUMBER)
-#define _Zs     FLAG(U_SPACE_SEPARATOR)
-#define _Zl     FLAG(U_LINE_SEPARATOR)
-#define _Zp     FLAG(U_PARAGRAPH_SEPARATOR)
-#define _Cc     FLAG(U_CONTROL_CHAR)
-#define _Cf     FLAG(U_FORMAT_CHAR)
-#define _Co     FLAG(U_PRIVATE_USE_CHAR)
-#define _Cs     FLAG(U_SURROGATE)
-#define _Pd     FLAG(U_DASH_PUNCTUATION)
-#define _Ps     FLAG(U_START_PUNCTUATION)
-/* #define _Pe     FLAG(U_END_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 xlocnum */
-/* #define _Pc     FLAG(U_CONNECTOR_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 streambuf */
-#define _Po     FLAG(U_OTHER_PUNCTUATION)
-#define _Sm     FLAG(U_MATH_SYMBOL)
-#define _Sc     FLAG(U_CURRENCY_SYMBOL)
-#define _Sk     FLAG(U_MODIFIER_SYMBOL)
-#define _So     FLAG(U_OTHER_SYMBOL)
-#define _Pi     FLAG(U_INITIAL_PUNCTUATION)
-/* #define _Pf     FLAG(U_FINAL_PUNCTUATION) -- conflicts with MS Visual Studio 9.0 streambuf */
-
 /** Some code points. @internal */
 enum {
     TAB     =0x0009,
     LF      =0x000a,
     FF      =0x000c,
     CR      =0x000d,
-    U_A     =0x0041,
-    U_F     =0x0046,
-    U_Z     =0x005a,
-    U_a     =0x0061,
-    U_f     =0x0066,
-    U_z     =0x007a,
-    DEL     =0x007f,
-    NL      =0x0085,
     NBSP    =0x00a0,
     CGJ     =0x034f,
     FIGURESP=0x2007,
@@ -367,15 +324,6 @@ enum {
     ZWJ     =0x200d,
     RLM     =0x200f,
     NNBSP   =0x202f,
-    WJ      =0x2060,
-    INHSWAP =0x206a,
-    NOMDIG  =0x206f,
-    U_FW_A  =0xff21,
-    U_FW_F  =0xff26,
-    U_FW_Z  =0xff3a,
-    U_FW_a  =0xff41,
-    U_FW_f  =0xff46,
-    U_FW_z  =0xff5a,
     ZWNBSP  =0xfeff
 };
 
diff --git a/thirdparty/icu4c/common/uresbund.cpp b/thirdparty/icu4c/common/uresbund.cpp
index 2ece87897d..5ea4187100 100644
--- a/thirdparty/icu4c/common/uresbund.cpp
+++ b/thirdparty/icu4c/common/uresbund.cpp
@@ -92,6 +92,15 @@ static UBool chopLocale(char *name) {
 }
 
 /**
+ *  Called to check whether a name without '_' still needs to be checked for a parent.
+ *  Some code had assumed that locale IDs without '_' could not have a non-root parent.
+ *  Only "nb" and "nn" qualify; names are compared exactly (a substring search in "nb nn" also accepts "n", "b", " "). We may want a better way of doing this.
+ */
+static UBool mayHaveParent(char *name) {
+    return (uprv_strcmp(name, "nb") == 0 || uprv_strcmp(name, "nn") == 0);
+}
+
+/**
  *  Internal function
  */
 static void entryIncrease(UResourceDataEntry *entry) {
@@ -529,8 +538,8 @@ loadParentsExceptRoot(UResourceDataEntry *&t1,
                       char name[], int32_t nameCapacity,
                       UBool usingUSRData, char usrDataPath[], UErrorCode *status) {
     if (U_FAILURE(*status)) { return FALSE; }
-    UBool hasChopped = TRUE;
-    while (hasChopped && t1->fParent == NULL && !t1->fData.noFallback &&
+    UBool checkParent = TRUE;
+    while (checkParent && t1->fParent == NULL && !t1->fData.noFallback &&
             res_getResource(&t1->fData,"%%ParentIsRoot") == RES_BOGUS) {
         Resource parentRes = res_getResource(&t1->fData, "%%Parent");
         if (parentRes != RES_BOGUS) {  // An explicit parent was found.
@@ -573,7 +582,7 @@ loadParentsExceptRoot(UResourceDataEntry *&t1,
             }
         }
         t1 = t2;
-        hasChopped = chopLocale(name);
+        checkParent = chopLocale(name) || mayHaveParent(name);
     }
     return TRUE;
 }
@@ -692,7 +701,7 @@ static UResourceDataEntry *entryOpen(const char* path, const char* localeID,
                 }
             }
         }
-        if (hasChopped && !isRoot) {
+        if ((hasChopped || mayHaveParent(name)) && !isRoot) {
             if (!loadParentsExceptRoot(t1, name, UPRV_LENGTHOF(name), usingUSRData, usrDataPath, status)) {
                 goto finish;
             }
@@ -716,7 +725,7 @@ static UResourceDataEntry *entryOpen(const char* path, const char* localeID,
             hasRealData = TRUE;
             isDefault = TRUE;
             // TODO: Why not if (usingUSRData) { ... } like in the non-default-locale code path?
-            if (hasChopped && !isRoot) {
+            if ((hasChopped || mayHaveParent(name)) && !isRoot) {
                 if (!loadParentsExceptRoot(t1, name, UPRV_LENGTHOF(name), usingUSRData, usrDataPath, status)) {
                     goto finish;
                 }
@@ -1908,6 +1917,8 @@ ures_getByKeyWithFallback(const UResourceBundle *resB,
                             } else {
                               break;
                             }
+                        } else if (res == RES_BOGUS) {
+                            break;
                         }
                     } while(*myPath); /* Continue until the whole path is consumed */
                 }
@@ -3019,7 +3030,7 @@ ures_getKeywordValues(const char *path, const char *keyword, UErrorCode *status)
 U_CAPI UBool U_EXPORT2
 ures_equal(const UResourceBundle* res1, const UResourceBundle* res2){
     if(res1==NULL || res2==NULL){
-        return res1==res2; /* pointer comparision */
+        return res1==res2; /* pointer comparison */
     }
     if(res1->fKey==NULL||  res2->fKey==NULL){
         return (res1->fKey==res2->fKey);
diff --git a/thirdparty/icu4c/common/uresdata.cpp b/thirdparty/icu4c/common/uresdata.cpp
index ae731e4544..9af081be40 100644
--- a/thirdparty/icu4c/common/uresdata.cpp
+++ b/thirdparty/icu4c/common/uresdata.cpp
@@ -960,14 +960,6 @@ res_findResource(const ResourceData *pResData, Resource r, char** path, const ch
     if(URES_IS_TABLE(type)) {
       *key = pathP;
       t2 = res_getTableItemByKey(pResData, t1, &indexR, key);
-      if(t2 == RES_BOGUS) { 
-        /* if we fail to get the resource by key, maybe we got an index */
-        indexR = uprv_strtol(pathP, &closeIndex, 10);
-        if(indexR >= 0 && *closeIndex == 0 && (*pathP != '0' || closeIndex - pathP == 1)) {
-          /* if we indeed have an index, try to get the item by index */
-          t2 = res_getTableItemByIndex(pResData, t1, indexR, key);
-        } // else t2 is already RES_BOGUS
-      }
     } else if(URES_IS_ARRAY(type)) {
       indexR = uprv_strtol(pathP, &closeIndex, 10);
       if(indexR >= 0 && *closeIndex == 0) {
diff --git a/thirdparty/icu4c/common/uresimp.h b/thirdparty/icu4c/common/uresimp.h
index 69d82566fe..f038dedace 100644
--- a/thirdparty/icu4c/common/uresimp.h
+++ b/thirdparty/icu4c/common/uresimp.h
@@ -270,11 +270,13 @@ ures_getByKeyWithFallback(const UResourceBundle *resB,
  * function can perform fallback on the sub-resources of the table.
  * @param resB              a resource
  * @param inKey             a key associated with the requested resource
+ * @param len               if not NULL, used to return the length of the string
  * @param status: fills in the outgoing error code
  *                could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
  *                could be a non-failing error 
  *                e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
- * @return                  a pointer to a UResourceBundle struct. If fill in param was NULL, caller must delete it
+ * @return a pointer to a zero-terminated UChar array which lives in a
+ *         memory mapped/DLL file.
  */
 U_CAPI const UChar* U_EXPORT2 
 ures_getStringByKeyWithFallback(const UResourceBundle *resB, 
diff --git a/thirdparty/icu4c/common/uset.cpp b/thirdparty/icu4c/common/uset.cpp
index eae7981d52..a7e3046dbf 100644
--- a/thirdparty/icu4c/common/uset.cpp
+++ b/thirdparty/icu4c/common/uset.cpp
@@ -117,6 +117,12 @@ uset_removeString(USet* set, const UChar* str, int32_t strLen) {
 }
 
 U_CAPI void U_EXPORT2
+uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length) {
+    UnicodeSet *target = (UnicodeSet*) set;
+    target->UnicodeSet::removeAll(UnicodeString(length==-1, str, length));  // length==-1 is passed as the isTerminated flag
+}
+
+U_CAPI void U_EXPORT2
 uset_removeAll(USet* set, const USet* remove) {
     ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove);
 }
@@ -127,6 +133,18 @@ uset_retain(USet* set, UChar32 start, UChar32 end) {
 }
 
 U_CAPI void U_EXPORT2
+uset_retainString(USet *set, const UChar *str, int32_t length) {
+    UnicodeSet *target = (UnicodeSet*) set;
+    target->UnicodeSet::retain(UnicodeString(length==-1, str, length));  // length==-1 is passed as the isTerminated flag
+}
+
+U_CAPI void U_EXPORT2
+uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length) {
+    UnicodeSet *target = (UnicodeSet*) set;
+    target->UnicodeSet::retainAll(UnicodeString(length==-1, str, length));  // length==-1 is passed as the isTerminated flag
+}
+
+U_CAPI void U_EXPORT2
 uset_retainAll(USet* set, const USet* retain) {
     ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain);
 }
@@ -142,6 +160,23 @@ uset_complement(USet* set) {
 }
 
 U_CAPI void U_EXPORT2
+uset_complementRange(USet *set, UChar32 start, UChar32 end) {
+    UnicodeSet *target = (UnicodeSet*) set; target->UnicodeSet::complement(start, end);  // forwards to the C++ range overload
+}
+
+U_CAPI void U_EXPORT2
+uset_complementString(USet *set, const UChar *str, int32_t length) {
+    UnicodeSet *target = (UnicodeSet*) set;
+    target->UnicodeSet::complement(UnicodeString(length==-1, str, length));  // length==-1 is passed as the isTerminated flag
+}
+
+U_CAPI void U_EXPORT2
+uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length) {
+    UnicodeSet *target = (UnicodeSet*) set;
+    target->UnicodeSet::complementAll(UnicodeString(length==-1, str, length));  // length==-1 is passed as the isTerminated flag
+}
+
+U_CAPI void U_EXPORT2
 uset_complementAll(USet* set, const USet* complement) {
     ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement);
 }
diff --git a/thirdparty/icu4c/common/usprep.cpp b/thirdparty/icu4c/common/usprep.cpp
index 8351a77370..874ffc63a8 100644
--- a/thirdparty/icu4c/common/usprep.cpp
+++ b/thirdparty/icu4c/common/usprep.cpp
@@ -575,7 +575,7 @@ usprep_map(  const UStringPrepProfile* profile,
             }
 
         }else if(type==USPREP_DELETE){
-             // just consume the codepoint and contine
+             // just consume the codepoint and continue
             continue;
         }
         //copy the code point into destination
diff --git a/thirdparty/icu4c/common/ustr_wcs.cpp b/thirdparty/icu4c/common/ustr_wcs.cpp
index e9f278e969..89d0762480 100644
--- a/thirdparty/icu4c/common/ustr_wcs.cpp
+++ b/thirdparty/icu4c/common/ustr_wcs.cpp
@@ -364,7 +364,7 @@ _strFromWCS( UChar   *dest,
                 }
 
                 /* we have found a null  so convert the 
-                 * chunk from begining of non-null char to null
+                 * chunk from beginning of non-null char to null
                  */
                 retVal = uprv_wcstombs(pCSrc,pSrc,remaining);
 
@@ -387,7 +387,7 @@ _strFromWCS( UChar   *dest,
                  * null terminate it and convert wchar_ts to chars
                  */
                 if(nulLen >= _STACK_BUFFER_CAPACITY){
-                    /* Should rarely occcur */
+                    /* Should rarely occur */
                     /* allocate new buffer buffer */
                     pWStack =(wchar_t*) uprv_malloc(sizeof(wchar_t) * (nulLen + 1));
                     if(pWStack==NULL){
diff --git a/thirdparty/icu4c/common/utext.cpp b/thirdparty/icu4c/common/utext.cpp
index 763b6684fb..d79f8141bb 100644
--- a/thirdparty/icu4c/common/utext.cpp
+++ b/thirdparty/icu4c/common/utext.cpp
@@ -382,7 +382,7 @@ utext_previous32From(UText *ut, int64_t index) {
     //
     UChar32     cPrev;    // The character preceding cCurr, which is what we will return.
 
-    // Address the chunk containg the position preceding the incoming index
+    // Address the chunk containing the position preceding the incoming index
     // A tricky edge case:
     //   We try to test the requested native index against the chunkNativeStart to determine
     //    whether the character preceding the one at the index is in the current chunk.
@@ -894,7 +894,7 @@ struct UTF8Buf {
                                                      //    one for a supplementary starting in the last normal position,
                                                      //    and one for an entry for the buffer limit position.
     uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
-                                                     //   correspoding offset in filled part of buf.
+                                                     //   corresponding offset in filled part of buf.
     int32_t   align;
 };
 
@@ -1545,7 +1545,7 @@ utf8TextMapOffsetToNative(const UText *ut) {
 }
 
 //
-// Map a native index to the corrsponding chunk offset
+// Map a native index to the corresponding chunk offset
 //
 static int32_t U_CALLCONV
 utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) {
diff --git a/thirdparty/icu4c/common/util.h b/thirdparty/icu4c/common/util.h
index 9c3b76d9ed..b5fac383a2 100644
--- a/thirdparty/icu4c/common/util.h
+++ b/thirdparty/icu4c/common/util.h
@@ -13,10 +13,10 @@
 #ifndef ICU_UTIL_H
 #define ICU_UTIL_H
 
-#include "unicode/utypes.h"
-#include "unicode/uobject.h"
+#include "charstr.h"
 #include "unicode/unistr.h"
-
+#include "unicode/uobject.h"
+#include "unicode/utypes.h"
 //--------------------------------------------------------------------
 // class ICU_Utility
 // i18n utility functions, scoped into the class ICU_Utility.
diff --git a/thirdparty/icu4c/common/utracimp.h b/thirdparty/icu4c/common/utracimp.h
index f32fe1db39..945540d25a 100644
--- a/thirdparty/icu4c/common/utracimp.h
+++ b/thirdparty/icu4c/common/utracimp.h
@@ -193,7 +193,7 @@ UPRV_BLOCK_MACRO_BEGIN { \
  * Trace statement for each exit point of a function that has a UTRACE_ENTRY()
  * statement, and that returns a value.
  *
- * @param val       The function's return value, int32_t or comatible type.
+ * @param val       The function's return value, int32_t or compatible type.
  *
  * @internal 
  */
diff --git a/thirdparty/icu4c/common/uvector.cpp b/thirdparty/icu4c/common/uvector.cpp
index cf19edf646..9c7e74c6d5 100644
--- a/thirdparty/icu4c/common/uvector.cpp
+++ b/thirdparty/icu4c/common/uvector.cpp
@@ -312,7 +312,7 @@ int32_t UVector::indexOf(UElement key, int32_t startIndex, int8_t hint) const {
     } else {
         for (i=startIndex; i<count; ++i) {
             /* Pointers are not always the same size as ints so to perform
-             * a valid comparision we need to know whether we are being
+             * a valid comparison we need to know whether we are being
              * provided an int or a pointer. */
             if (hint & HINT_KEY_POINTER) {
                 if (key.pointer == elements[i].pointer) {
@@ -518,7 +518,7 @@ sortiComparator(const void * /*context */, const void *left, const void *right)
 }
 
 /**
-  * Sort the vector, assuming it constains ints.
+  * Sort the vector, assuming it contains ints.
   *     (A more general sort would take a comparison function, but it's
   *     not clear whether UVector's UElementComparator or
   *     UComparator from uprv_sortAray would be more appropriate.)
diff --git a/thirdparty/icu4c/common/wintz.cpp b/thirdparty/icu4c/common/wintz.cpp
index 580cedadb6..ebf31650c2 100644
--- a/thirdparty/icu4c/common/wintz.cpp
+++ b/thirdparty/icu4c/common/wintz.cpp
@@ -124,10 +124,26 @@ uprv_detectWindowsTimeZone()
         // No way to support when DST is turned off and the offset in minutes is not a multiple of 60.
         if (utcOffsetMins % 60 == 0) {
             char gmtOffsetTz[11] = {}; // "Etc/GMT+dd" is 11-char long with a terminal null.
-            // Note '-' before 'utcOffsetMin'. The timezone ID's sign convention
-            // is that a timezone ahead of UTC is Etc/GMT-<offset> and a timezone
-            // behind UTC is Etc/GMT+<offset>.
-            int ret = snprintf(gmtOffsetTz, UPRV_LENGTHOF(gmtOffsetTz), "Etc/GMT%+ld", -utcOffsetMins / 60);
+            // Important note on the sign convention for zones:
+            //
+            // From https://en.wikipedia.org/wiki/Tz_database#Area
+            //   "In order to conform with the POSIX style, those zone names beginning with "Etc/GMT" have their sign reversed
+            //   from the standard ISO 8601 convention. In the "Etc" area, zones west of GMT have a positive sign and those
+            //   east have a negative sign in their name (e.g "Etc/GMT-14" is 14 hours ahead of GMT)."
+            //
+            // Regarding the POSIX style, from https://www.gnu.org/software/libc/manual/html_node/TZ-Variable.html
+            //   "The offset specifies the time value you must add to the local time to get a Coordinated Universal Time value."
+            //
+            // However, the Bias value in DYNAMIC_TIME_ZONE_INFORMATION *already* follows the POSIX convention.
+            // 
+            // From https://docs.microsoft.com/en-us/windows/win32/api/timezoneapi/ns-timezoneapi-dynamic_time_zone_information
+            //   "The bias is the difference, in minutes, between Coordinated Universal Time (UTC) and
+            //   local time. All translations between UTC and local time are based on the following formula:
+            //      UTC = local time + bias"
+            //
+            // For example, a time zone that is 3 hours ahead of UTC (UTC+03:00) would have a Bias value of -180, and the
+            // corresponding time zone ID would be "Etc/GMT-3". (So there is no need to negate utcOffsetMins below.)
+            int ret = snprintf(gmtOffsetTz, UPRV_LENGTHOF(gmtOffsetTz), "Etc/GMT%+ld", utcOffsetMins / 60);
             if (ret > 0 && ret < UPRV_LENGTHOF(gmtOffsetTz)) {
                 return uprv_strdup(gmtOffsetTz);
             }
diff --git a/thirdparty/icu4c/icudt68l.dat b/thirdparty/icu4c/icudt69l.dat
index 9ecea5d548..3101a49695 100644
--- a/thirdparty/icu4c/icudt68l.dat
+++ b/thirdparty/icu4c/icudt69l.dat
diff --git a/thirdparty/meshoptimizer/LICENSE.md b/thirdparty/meshoptimizer/LICENSE.md
index 4fcd766d22..3c52415f62 100644
--- a/thirdparty/meshoptimizer/LICENSE.md
+++ b/thirdparty/meshoptimizer/LICENSE.md
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2016-2020 Arseny Kapoulkine
+Copyright (c) 2016-2021 Arseny Kapoulkine
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/thirdparty/meshoptimizer/clusterizer.cpp b/thirdparty/meshoptimizer/clusterizer.cpp
index f7d88c5136..f8aad7b49c 100644
--- a/thirdparty/meshoptimizer/clusterizer.cpp
+++ b/thirdparty/meshoptimizer/clusterizer.cpp
@@ -2,6 +2,7 @@
 #include "meshoptimizer.h"
 
 #include <assert.h>
+#include <float.h>
 #include <math.h>
 #include <string.h>
 
@@ -12,6 +13,68 @@
 namespace meshopt
 {
 
+// This must be <= 255 since index 0xff is used internally to indicate a vertex that doesn't belong to a meshlet
+const size_t kMeshletMaxVertices = 255;
+
+// A reasonable limit is around 2*max_vertices or less
+const size_t kMeshletMaxTriangles = 512;
+
+struct TriangleAdjacency2
+{
+	unsigned int* counts; // counts[v]: number of index entries (triangle corners) that reference vertex v
+	unsigned int* offsets; // offsets[v]: position in data where the run of triangles for vertex v starts
+	unsigned int* data; // triangle (face) indices grouped per vertex; index_count entries in total
+};
+
+static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{ // fills adjacency with a vertex -> incident triangle table; all three arrays are obtained from allocator
+	size_t face_count = index_count / 3;
+
+	// allocate arrays
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+	// fill triangle counts
+	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		assert(indices[i] < vertex_count);
+
+		adjacency.counts[indices[i]]++;
+	}
+
+	// fill offset table (exclusive prefix sum of counts)
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		adjacency.offsets[i] = offset;
+		offset += adjacency.counts[i];
+	}
+
+	assert(offset == index_count);
+
+	// fill triangle data; offsets[] temporarily doubles as a per-vertex write cursor
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+	}
+
+	// fix offsets that have been disturbed by the previous pass (each was advanced by exactly counts[i])
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		assert(adjacency.offsets[i] >= adjacency.counts[i]);
+
+		adjacency.offsets[i] -= adjacency.counts[i];
+	}
+}
+
 static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
 {
 	assert(count > 0);
@@ -82,13 +145,310 @@ static void computeBoundingSphere(float result[4], const float points[][3], size
 	result[3] = radius;
 }
 
+struct Cone
+{
+	float px, py, pz; // position; computeTriangleCones stores the triangle centroid here
+	float nx, ny, nz; // direction; computeTriangleCones stores the unit triangle normal here (zero for degenerate triangles)
+};
+
+static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius)
+{ // combines a distance term and a cone term into one score; distance2 is a squared distance (sqrtf is applied below)
+	float cone = 1.f - spread * cone_weight;
+	float cone_clamped = cone < 1e-3f ? 1e-3f : cone; // never let the cone factor drop below 1e-3
+
+	return (1 + sqrtf(distance2) / expected_radius * (1 - cone_weight)) * cone_clamped;
+}
+
+static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
+{ // returns a copy of acc with the position divided by triangle_count and the axis scaled to unit length
+	Cone result = acc;
+
+	float center_scale = triangle_count == 0 ? 0.f : 1.f / float(triangle_count); // 0 avoids dividing by zero for an empty accumulator
+
+	result.px *= center_scale;
+	result.py *= center_scale;
+	result.pz *= center_scale;
+
+	float axis_length = result.nx * result.nx + result.ny * result.ny + result.nz * result.nz; // squared length; sqrtf is applied on the next line
+	float axis_scale = axis_length == 0.f ? 0.f : 1.f / sqrtf(axis_length); // a zero axis stays zero
+
+	result.nx *= axis_scale;
+	result.ny *= axis_scale;
+	result.nz *= axis_scale;
+
+	return result;
+}
+
+static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{ // fills triangles[i] with the centroid and unit normal of face i; returns the sum of the per-face 'area' values computed below
+	(void)vertex_count; // otherwise only referenced by the assert below
+
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float); // stride is given in bytes, indexing below is in floats
+	size_t face_count = index_count / 3;
+
+	float mesh_area = 0;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		const float* p0 = vertex_positions + vertex_stride_float * a;
+		const float* p1 = vertex_positions + vertex_stride_float * b;
+		const float* p2 = vertex_positions + vertex_stride_float * c;
+
+		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]}; // edge p0 -> p1
+		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]}; // edge p0 -> p2
+
+		float normalx = p10[1] * p20[2] - p10[2] * p20[1]; // cross product p10 x p20
+		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+		float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz); // length of the cross product, i.e. twice the geometric triangle area
+		float invarea = (area == 0.f) ? 0.f : 1.f / area; // degenerate triangles get a zero normal
+
+		triangles[i].px = (p0[0] + p1[0] + p2[0]) / 3.f;
+		triangles[i].py = (p0[1] + p1[1] + p2[1]) / 3.f;
+		triangles[i].pz = (p0[2] + p1[2] + p2[2]) / 3.f;
+
+		triangles[i].nx = normalx * invarea;
+		triangles[i].ny = normaly * invarea;
+		triangles[i].nz = normalz * invarea;
+
+		mesh_area += area;
+	}
+
+	return mesh_area;
+}
+
+static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles)
+{
+	// zero the bytes between the end of the meshlet's triangle data and the next multiple of 4
+	size_t data_end = meshlet.triangle_offset + meshlet.triangle_count * 3;
+
+	for (size_t cursor = data_end; (cursor & 3) != 0; ++cursor)
+		meshlet_triangles[cursor] = 0;
+}
+
+static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles)
+{ // adds triangle (a, b, c) to the meshlet under construction; returns true if the current meshlet was stored to meshlets[meshlet_offset] and restarted first
+	unsigned char& av = used[a]; // used[v]: local index of vertex v in the current meshlet, 0xff if it is not part of it
+	unsigned char& bv = used[b];
+	unsigned char& cv = used[c];
+
+	bool result = false;
+
+	unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); // how many corners are not in the meshlet yet
+
+	if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+	{
+		meshlets[meshlet_offset] = meshlet; // the triangle does not fit: store the finished meshlet
+
+		for (size_t j = 0; j < meshlet.vertex_count; ++j)
+			used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff; // its vertices no longer belong to the current meshlet
+
+		finishMeshlet(meshlet, meshlet_triangles);
+
+		meshlet.vertex_offset += meshlet.vertex_count;
+		meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding
+		meshlet.vertex_count = 0;
+		meshlet.triangle_count = 0;
+
+		result = true;
+	}
+
+	if (av == 0xff) // av/bv/cv are references, so the assignments below write into used[]
+	{
+		av = (unsigned char)meshlet.vertex_count;
+		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;
+	}
+
+	if (bv == 0xff)
+	{
+		bv = (unsigned char)meshlet.vertex_count;
+		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;
+	}
+
+	if (cv == 0xff)
+	{
+		cv = (unsigned char)meshlet.vertex_count;
+		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;
+	}
+
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av; // triangles are stored as three meshlet-local vertex indices
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv;
+	meshlet.triangle_count++;
+
+	return result;
+}
+
+struct KDNode
+{
+	union
+	{
+		float split; // branches: coordinate along 'axis' that separates the two subtrees
+		unsigned int index; // leaves: index of the point stored in this node
+	};
+
+	// leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point)
+	// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
+	unsigned int axis : 2;
+	unsigned int children : 30;
+};
+
+static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot)
+{ // reorders indices[0..count) in place so entries with coordinate < pivot come first; returns how many there are
+	size_t m = 0;
+
+	// invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot
+	for (size_t i = 0; i < count; ++i)
+	{
+		float v = points[indices[i] * stride + axis]; // stride is counted in floats here
+
+		// swap(m, i) unconditionally
+		unsigned int t = indices[m];
+		indices[m] = indices[i];
+		indices[i] = t;
+
+		// when v >= pivot, we swap i with m without advancing it, preserving invariants
+		m += v < pivot;
+	}
+
+	return m;
+}
+
+static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count)
+{ // writes count consecutive leaf nodes (one per point) starting at nodes[offset]; returns the offset just past them
+	assert(offset + count <= node_count);
+	(void)node_count; // otherwise only referenced by the assert above
+
+	KDNode& result = nodes[offset];
+
+	result.index = indices[0];
+	result.axis = 3;
+	result.children = unsigned(count - 1);
+
+	// all remaining points are stored in nodes immediately following the leaf
+	for (size_t i = 1; i < count; ++i)
+	{
+		KDNode& tail = nodes[offset + i];
+
+		tail.index = indices[i];
+		tail.axis = 3;
+		tail.children = ~0u >> 2; // bogus value to prevent misuse
+	}
+
+	return offset + count;
+}
+
+static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
+{ // recursively builds the subtree for indices[0..count) starting at nodes[offset]; returns the offset just past the subtree
+	assert(count > 0);
+	assert(offset < node_count);
+
+	if (count <= leaf_size)
+		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
+
+	float mean[3] = {};
+	float vars[3] = {};
+	float runc = 1, runs = 1; // runc: 1-based count of points seen including the current one; runs: its reciprocal
+
+	// gather statistics on the points in the subtree using Welford's algorithm
+	for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
+	{
+		const float* point = points + indices[i] * stride;
+
+		for (int k = 0; k < 3; ++k)
+		{
+			float delta = point[k] - mean[k];
+			mean[k] += delta * runs;
+			vars[k] += delta * (point[k] - mean[k]); // accumulates the sum of squared deviations (not divided by count)
+		}
+	}
+
+	// split axis is one where the variance is largest
+	unsigned int axis = vars[0] >= vars[1] && vars[0] >= vars[2] ? 0 : vars[1] >= vars[2] ? 1
+	                                                                                      : 2;
+
+	float split = mean[axis];
+	size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
+
+	// when the partition is degenerate simply consolidate the points into a single node
+	if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
+		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
+
+	KDNode& result = nodes[offset];
+
+	result.split = split;
+	result.axis = axis;
+
+	// left subtree is right after our node
+	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
+
+	// distance to the right subtree is represented explicitly
+	result.children = unsigned(next_offset - offset - 1);
+
+	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
+}
+
+static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
+{ // searches the subtree at nodes[root] for the point closest to position whose emitted_flags entry is zero; updates result and limit only when a closer point is found
+	const KDNode& node = nodes[root];
+
+	if (node.axis == 3)
+	{
+		// leaf
+		for (unsigned int i = 0; i <= node.children; ++i)
+		{
+			unsigned int index = nodes[root + i].index;
+
+			if (emitted_flags[index])
+				continue;
+
+			const float* point = points + index * stride;
+
+			float distance2 =
+			    (point[0] - position[0]) * (point[0] - position[0]) +
+			    (point[1] - position[1]) * (point[1] - position[1]) +
+			    (point[2] - position[2]) * (point[2] - position[2]);
+			float distance = sqrtf(distance2); // limit is kept as a plain (not squared) distance
+
+			if (distance < limit)
+			{
+				result = index;
+				limit = distance;
+			}
+		}
+	}
+	else
+	{
+		// branch; we order recursion to process the node that search position is in first
+		float delta = position[node.axis] - node.split;
+		unsigned int first = (delta <= 0) ? 0 : node.children;
+		unsigned int second = first ^ node.children; // the other value of {0, node.children}
+
+		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
+
+		// only process the other node if it can have a match based on closest distance so far
+		if (fabsf(delta) <= limit)
+			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit);
+	}
+}
+
 } // namespace meshopt
 
 size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
 {
+	using namespace meshopt;
+
 	assert(index_count % 3 == 0);
-	assert(max_vertices >= 3);
-	assert(max_triangles >= 1);
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
+	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+
+	(void)kMeshletMaxVertices;
+	(void)kMeshletMaxTriangles;
 
 	// meshlet construction is limited by max vertices and max triangles per meshlet
 	// the worst case is that the input is an unindexed stream since this equally stresses both limits
@@ -100,77 +460,226 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_
 	return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
 }
 
-size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
 {
+	using namespace meshopt;
+	// greedy builder: each iteration picks the best-ranked triangle adjacent to the current meshlet, or the kd-tree nearest one when none is adjacent; returns the meshlet count
 	assert(index_count % 3 == 0);
-	assert(max_vertices >= 3);
-	assert(max_triangles >= 1);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
+	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
 
 	meshopt_Allocator allocator;
 
-	meshopt_Meshlet meshlet;
-	memset(&meshlet, 0, sizeof(meshlet));
+	TriangleAdjacency2 adjacency = {};
+	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
+	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int)); // per-vertex count of not-yet-emitted triangles; decremented as triangles are emitted below
+
+	size_t face_count = index_count / 3;
+
+	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
+	memset(emitted_flags, 0, face_count);
+
+	// for each triangle, precompute centroid & normal to use for scoring
+	Cone* triangles = allocator.allocate<Cone>(face_count);
+	float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+
+	// assuming each meshlet is a square patch, expected radius is half the side length, i.e. sqrt(expected area) * 0.5
+	float triangle_area_avg = face_count == 0 ? 0.f : mesh_area / float(face_count) * 0.5f;
+	float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f;
+
+	// build a kd-tree for nearest neighbor lookup
+	unsigned int* kdindices = allocator.allocate<unsigned int>(face_count);
+	for (size_t i = 0; i < face_count; ++i)
+		kdindices[i] = unsigned(i);
 
-	assert(max_vertices <= sizeof(meshlet.vertices) / sizeof(meshlet.vertices[0]));
-	assert(max_triangles <= sizeof(meshlet.indices) / 3);
+	KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
+	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);
 
 	// index of the vertex in the meshlet, 0xff if the vertex isn't used
 	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
 	memset(used, -1, vertex_count);
 
-	size_t offset = 0;
+	meshopt_Meshlet meshlet = {};
+	size_t meshlet_offset = 0;
 
-	for (size_t i = 0; i < index_count; i += 3)
-	{
-		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
-		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+	Cone meshlet_cone_acc = {};
 
-		unsigned char& av = used[a];
-		unsigned char& bv = used[b];
-		unsigned char& cv = used[c];
+	for (;;)
+	{
+		unsigned int best_triangle = ~0u;
+		unsigned int best_extra = 5; // larger than any rank computed below (at most 4), so the first candidate is always accepted
+		float best_score = FLT_MAX;
 
-		unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);
+		Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);
 
-		if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+		for (size_t i = 0; i < meshlet.vertex_count; ++i)
 		{
-			destination[offset++] = meshlet;
+			unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
+
+			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
+			size_t neighbours_size = adjacency.counts[index];
+
+			for (size_t j = 0; j < neighbours_size; ++j)
+			{
+				unsigned int triangle = neighbours[j];
+				assert(!emitted_flags[triangle]);
+
+				unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+				assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+				unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);
+
+				// triangles that don't add new vertices to meshlets are max. priority
+				if (extra != 0)
+				{
+					// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
+					if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
+						extra = 0;
+
+					extra++; // resulting ranks: 0 = no new vertices, 1 = dangling triangle, 2..4 = one to three new vertices
+				}
+
+				// since topology-based priority is always more important than the score, we can skip scoring in some cases
+				if (extra > best_extra)
+					continue;
+
+				const Cone& tri_cone = triangles[triangle];
+
+				float distance2 =
+				    (tri_cone.px - meshlet_cone.px) * (tri_cone.px - meshlet_cone.px) +
+				    (tri_cone.py - meshlet_cone.py) * (tri_cone.py - meshlet_cone.py) +
+				    (tri_cone.pz - meshlet_cone.pz) * (tri_cone.pz - meshlet_cone.pz);
 
-			for (size_t j = 0; j < meshlet.vertex_count; ++j)
-				used[meshlet.vertices[j]] = 0xff;
+				float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;
 
-			memset(&meshlet, 0, sizeof(meshlet));
+				float score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
+
+				// note that topology-based priority is always more important than the score
+				// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
+				if (extra < best_extra || score < best_score)
+				{
+					best_triangle = triangle;
+					best_extra = extra;
+					best_score = score;
+				}
+			}
 		}
 
-		if (av == 0xff)
+		if (best_triangle == ~0u) // no candidate adjacent to the current meshlet: fall back to the triangle nearest to the meshlet cone position
 		{
-			av = meshlet.vertex_count;
-			meshlet.vertices[meshlet.vertex_count++] = a;
+			float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
+			unsigned int index = ~0u;
+			float limit = FLT_MAX;
+			// NOTE(review): for an empty index buffer (face_count == 0) this call still reads nodes[0]; confirm callers never pass index_count == 0 or that kdtreeBuild covers it
+			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit);
+
+			best_triangle = index;
 		}
 
-		if (bv == 0xff)
+		if (best_triangle == ~0u) // the kd-tree search skips emitted triangles, so nothing found means no triangles are left
+			break;
+
+		unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		// add meshlet to the output; when the current meshlet is full we reset the accumulated bounds
+		if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles))
 		{
-			bv = meshlet.vertex_count;
-			meshlet.vertices[meshlet.vertex_count++] = b;
+			meshlet_offset++;
+			memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
 		}
 
-		if (cv == 0xff)
+		live_triangles[a]--;
+		live_triangles[b]--;
+		live_triangles[c]--;
+
+		// remove emitted triangle from adjacency data
+		// this makes sure that we spend less time traversing these lists on subsequent iterations
+		for (size_t k = 0; k < 3; ++k)
 		{
-			cv = meshlet.vertex_count;
-			meshlet.vertices[meshlet.vertex_count++] = c;
+			unsigned int index = indices[best_triangle * 3 + k];
+
+			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
+			size_t neighbours_size = adjacency.counts[index];
+
+			for (size_t i = 0; i < neighbours_size; ++i)
+			{
+				unsigned int tri = neighbours[i];
+
+				if (tri == best_triangle)
+				{
+					neighbours[i] = neighbours[neighbours_size - 1]; // unordered removal: overwrite with the last entry, then shrink the list
+					adjacency.counts[index]--;
+					break;
+				}
+			}
 		}
 
-		meshlet.indices[meshlet.triangle_count][0] = av;
-		meshlet.indices[meshlet.triangle_count][1] = bv;
-		meshlet.indices[meshlet.triangle_count][2] = cv;
-		meshlet.triangle_count++;
+		// update aggregated meshlet cone data for scoring subsequent triangles
+		meshlet_cone_acc.px += triangles[best_triangle].px;
+		meshlet_cone_acc.py += triangles[best_triangle].py;
+		meshlet_cone_acc.pz += triangles[best_triangle].pz;
+		meshlet_cone_acc.nx += triangles[best_triangle].nx;
+		meshlet_cone_acc.ny += triangles[best_triangle].ny;
+		meshlet_cone_acc.nz += triangles[best_triangle].nz;
+
+		emitted_flags[best_triangle] = 1;
+	}
+
+	if (meshlet.triangle_count)
+	{
+		finishMeshlet(meshlet, meshlet_triangles);
+
+		meshlets[meshlet_offset++] = meshlet;
+	}
+
+	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	return meshlet_offset;
+}
+
+size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{ // linear-scan builder: triangles are appended strictly in index buffer order; returns the number of meshlets written to meshlets[]
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
+	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+
+	meshopt_Allocator allocator;
+
+	// index of the vertex in the meshlet, 0xff if the vertex isn't used
+	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
+	memset(used, -1, vertex_count);
+
+	meshopt_Meshlet meshlet = {};
+	size_t meshlet_offset = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		// appends triangle to the meshlet and writes previous meshlet to the output if full
+		meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles); // result is added to the count, so presumably 1 when a meshlet was flushed and 0 otherwise (appendMeshlet is outside this hunk)
 	}
 
 	if (meshlet.triangle_count)
-		destination[offset++] = meshlet;
+	{ // flush the trailing, partially filled meshlet
+		finishMeshlet(meshlet, meshlet_triangles);
 
-	assert(offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+		meshlets[meshlet_offset++] = meshlet;
+	}
 
-	return offset;
+	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	return meshlet_offset;
 }
 
 meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
@@ -178,18 +687,17 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
+	assert(index_count / 3 <= kMeshletMaxTriangles);
 	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
-	assert(index_count / 3 <= 256);
-
 	(void)vertex_count;
 
 	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
 
 	// compute triangle normals and gather triangle corners
-	float normals[256][3];
-	float corners[256][3][3];
+	float normals[kMeshletMaxTriangles][3];
+	float corners[kMeshletMaxTriangles][3][3];
 	size_t triangles = 0;
 
 	for (size_t i = 0; i < index_count; i += 3)
@@ -327,25 +835,23 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
 	return bounds;
 }
 
-meshopt_Bounds meshopt_computeMeshletBounds(const meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
+	using namespace meshopt;
+	// expands the meshlet-local triangle indices into mesh-wide vertex indices and forwards them to meshopt_computeClusterBounds
+	assert(triangle_count <= kMeshletMaxTriangles);
 	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
-	unsigned int indices[sizeof(meshlet->indices) / sizeof(meshlet->indices[0][0])];
+	unsigned int indices[kMeshletMaxTriangles * 3]; // stack scratch buffer; the triangle_count assert above keeps the loop below within it
 
-	for (size_t i = 0; i < meshlet->triangle_count; ++i)
+	for (size_t i = 0; i < triangle_count * 3; ++i)
 	{
-		unsigned int a = meshlet->vertices[meshlet->indices[i][0]];
-		unsigned int b = meshlet->vertices[meshlet->indices[i][1]];
-		unsigned int c = meshlet->vertices[meshlet->indices[i][2]];
-
-		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+		unsigned int index = meshlet_vertices[meshlet_triangles[i]]; // both arrays are read from element 0, so callers presumably pass pointers already offset to this meshlet - confirm
+		assert(index < vertex_count);
 
-		indices[i * 3 + 0] = a;
-		indices[i * 3 + 1] = b;
-		indices[i * 3 + 2] = c;
+		indices[i] = index;
 	}
 
-	return meshopt_computeClusterBounds(indices, meshlet->triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
+	return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
 }
diff --git a/thirdparty/meshoptimizer/indexgenerator.cpp b/thirdparty/meshoptimizer/indexgenerator.cpp
index aa4a30efa4..f60db0dc4f 100644
--- a/thirdparty/meshoptimizer/indexgenerator.cpp
+++ b/thirdparty/meshoptimizer/indexgenerator.cpp
@@ -4,6 +4,8 @@
 #include <assert.h>
 #include <string.h>
 
+// This work is based on:
+// John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010
 namespace meshopt
 {
 
@@ -83,10 +85,49 @@ struct VertexStreamHasher
 	}
 };
 
+struct EdgeHasher
+{ // hash/equality policy for directed edges packed as (v0 << 32) | v1; both endpoints go through remap, so edges are compared by remapped vertex ids
+	const unsigned int* remap;
+
+	size_t hash(unsigned long long edge) const
+	{
+		unsigned int e0 = unsigned(edge >> 32);
+		unsigned int e1 = unsigned(edge);
+
+		unsigned int h1 = remap[e0];
+		unsigned int h2 = remap[e1];
+
+		const unsigned int m = 0x5bd1e995;
+
+		// MurmurHash64B finalizer
+		h1 ^= h2 >> 18;
+		h1 *= m;
+		h2 ^= h1 >> 22;
+		h2 *= m;
+		h1 ^= h2 >> 17;
+		h1 *= m;
+		h2 ^= h1 >> 19;
+		h2 *= m;
+
+		return h2;
+	}
+
+	bool equal(unsigned long long lhs, unsigned long long rhs) const
+	{
+		unsigned int l0 = unsigned(lhs >> 32);
+		unsigned int l1 = unsigned(lhs);
+
+		unsigned int r0 = unsigned(rhs >> 32);
+		unsigned int r1 = unsigned(rhs);
+
+		return remap[l0] == remap[r0] && remap[l1] == remap[r1]; // direction-sensitive: high is matched with high and low with low, so (a, b) differs from (b, a)
+	}
+};
+
 static size_t hashBuckets(size_t count)
 {
 	size_t buckets = 1;
-	while (buckets < count)
+	while (buckets < count + count / 4)
 		buckets *= 2;
 
 	return buckets;
@@ -119,6 +160,26 @@ static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, c
 	return 0;
 }
 
+static void buildPositionRemap(unsigned int* remap, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
+{ // fills remap[i] with the vertex index stored in the table slot that hashLookup returns for i; the hasher is set up with a 3 * sizeof(float) key size and the caller's stride
+	VertexHasher vertex_hasher = {reinterpret_cast<const unsigned char*>(vertex_positions), 3 * sizeof(float), vertex_positions_stride};
+
+	size_t vertex_table_size = hashBuckets(vertex_count);
+	unsigned int* vertex_table = allocator.allocate<unsigned int>(vertex_table_size);
+	memset(vertex_table, -1, vertex_table_size * sizeof(unsigned int)); // all-ones bytes: every slot starts out equal to the ~0u empty marker passed to hashLookup
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int index = unsigned(i);
+		unsigned int* entry = hashLookup(vertex_table, vertex_table_size, vertex_hasher, index, ~0u);
+
+		if (*entry == ~0u)
+			*entry = index; // slot was empty: this vertex becomes the one later matches are remapped to
+
+		remap[index] = *entry; // equals index itself when the slot was just claimed
+	}
+}
+
 } // namespace meshopt
 
 size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
@@ -345,3 +406,146 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns
 		destination[i] = remap[index];
 	}
 }
+
+void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{ // writes a 6-index patch per input triangle: the three corners interleaved with the vertex opposite each edge in the triangle that stores the reversed edge
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	static const int next[4] = {1, 2, 0, 1}; // next[e] is the corner after e; the 4th entry lets next[e + 1] reach two corners ahead without a modulo
+
+	// build position remap: for each vertex, which other (canonical) vertex does it map to?
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator);
+
+	// build edge set; this stores all triangle edges but we can look these up by any other wedge
+	EdgeHasher edge_hasher = {remap};
+
+	size_t edge_table_size = hashBuckets(index_count);
+	unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size);
+	unsigned int* edge_vertex_table = allocator.allocate<unsigned int>(edge_table_size);
+
+	memset(edge_table, -1, edge_table_size * sizeof(unsigned long long));
+	memset(edge_vertex_table, -1, edge_table_size * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			unsigned int i2 = indices[i + next[e + 1]];
+			assert(i0 < vertex_count && i1 < vertex_count && i2 < vertex_count);
+
+			unsigned long long edge = ((unsigned long long)i0 << 32) | i1;
+			unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			if (*entry == ~0ull) // only the first edge seen for a given remapped (i0, i1) pair is recorded; later duplicates leave the slot untouched
+			{
+				*entry = edge;
+
+				// store vertex opposite to the edge
+				edge_vertex_table[entry - edge_table] = i2;
+			}
+		}
+	}
+
+	// build resulting index buffer: 6 indices for each input triangle
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int patch[6];
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			assert(i0 < vertex_count && i1 < vertex_count);
+
+			// note: this refers to the opposite edge!
+			unsigned long long edge = ((unsigned long long)i1 << 32) | i0;
+			unsigned long long* oppe = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			patch[e * 2 + 0] = i0;
+			patch[e * 2 + 1] = (*oppe == ~0ull) ? i0 : edge_vertex_table[oppe - edge_table]; // border edge (no reversed edge stored): repeat i0
+		}
+
+		memcpy(destination + i * 2, patch, sizeof(patch)); // 6 outputs per 3 inputs, hence the output offset of i * 2
+	}
+}
+
+void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{ // writes a 12-index patch per input triangle: 3 corners, 3 opposing edges (2 indices each) and 3 remapped corner vertices
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	static const int next[3] = {1, 2, 0}; // corner that follows corner e within a triangle
+
+	// build position remap: for each vertex, which other (canonical) vertex does it map to?
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator);
+
+	// build edge set; this stores all triangle edges but we can look these up by any other wedge
+	EdgeHasher edge_hasher = {remap};
+
+	size_t edge_table_size = hashBuckets(index_count);
+	unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size);
+	memset(edge_table, -1, edge_table_size * sizeof(unsigned long long));
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			assert(i0 < vertex_count && i1 < vertex_count);
+
+			unsigned long long edge = ((unsigned long long)i0 << 32) | i1;
+			unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			if (*entry == ~0ull) // first edge stored for a remapped (i0, i1) pair wins
+				*entry = edge;
+		}
+	}
+
+	// build resulting index buffer: 12 indices for each input triangle
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int patch[12];
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			assert(i0 < vertex_count && i1 < vertex_count);
+
+			// note: this refers to the opposite edge!
+			unsigned long long edge = ((unsigned long long)i1 << 32) | i0;
+			unsigned long long oppe = *hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			// use the same edge if opposite edge doesn't exist (border)
+			oppe = (oppe == ~0ull) ? edge : oppe;
+
+			// triangle index (0, 1, 2)
+			patch[e] = i0;
+
+			// opposite edge (3, 4; 5, 6; 7, 8)
+			patch[3 + e * 2 + 0] = unsigned(oppe); // low 32 bits: the stored endpoint that remaps like i0 (i0 itself on a border)
+			patch[3 + e * 2 + 1] = unsigned(oppe >> 32); // high 32 bits: the stored endpoint that remaps like i1 (i1 itself on a border)
+
+			// dominant vertex (9, 10, 11)
+			patch[9 + e] = remap[i0];
+		}
+
+		memcpy(destination + i * 4, patch, sizeof(patch)); // 12 outputs per 3 inputs, hence the output offset of i * 4
+	}
+}
diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h
index 1714000384..fe8d349731 100644
--- a/thirdparty/meshoptimizer/meshoptimizer.h
+++ b/thirdparty/meshoptimizer/meshoptimizer.h
@@ -1,7 +1,7 @@
 /**
- * meshoptimizer - version 0.15
+ * meshoptimizer - version 0.16
  *
- * Copyright (C) 2016-2020, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2016-2021, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
  * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
  *
  * This library is distributed under the MIT License. See notice at the end of this file.
@@ -12,7 +12,7 @@
 #include <stddef.h>
 
 /* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 150 /* 0.15 */
+#define MESHOPTIMIZER_VERSION 160 /* 0.16 */
 
 /* If no API is defined, assume default */
 #ifndef MESHOPTIMIZER_API
@@ -98,6 +98,35 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati
 MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
 
 /**
+ * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology
+ * Each triangle is converted into a 6-vertex patch with the following layout:
+ * - 0, 2, 4: original triangle vertices
+ * - 1, 3, 5: vertices adjacent to edges 02, 24 and 40
+ * The resulting patch can be rendered with geometry shaders using e.g. VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY.
+ * This can be used to implement algorithms like silhouette detection/expansion and other forms of GS-driven rendering.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count*2 elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Generate index buffer that can be used for PN-AEN tessellation with crack-free displacement
+ * Each triangle is converted into a 12-vertex patch with the following layout:
+ * - 0, 1, 2: original triangle vertices
+ * - 3, 4: opposing edge for edge 0, 1
+ * - 5, 6: opposing edge for edge 1, 2
+ * - 7, 8: opposing edge for edge 2, 0
+ * - 9, 10, 11: dominant vertices for corners 0, 1, 2
+ * The resulting patch can be rendered with hardware tessellation using PN-AEN and displacement mapping.
+ * See "Tessellation on Any Budget" (John McDonald, GDC 2011) for implementation details.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count*4 elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
  * Vertex transform cache optimizer
  * Reorders indices to reduce the number of GPU vertex shader invocations
  * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
@@ -373,22 +402,31 @@ MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetc
 
 struct meshopt_Meshlet
 {
-	unsigned int vertices[64];
-	unsigned char indices[126][3];
-	unsigned char triangle_count;
-	unsigned char vertex_count;
+	/* offsets within meshlet_vertices and meshlet_triangles arrays with meshlet data */
+	unsigned int vertex_offset;
+	unsigned int triangle_offset; /* NOTE(review): the unit (array entries vs. triangles) is set by builder code outside this hunk - confirm */
+
+	/* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */
+	unsigned int vertex_count;
+	unsigned int triangle_count; /* meshlet_triangles holds 3 unsigned char entries per triangle */
 };
 
 /**
  * Experimental: Meshlet builder
  * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
  * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
- * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ * When using buildMeshlets, vertex positions need to be provided to minimize the size of the resulting clusters.
+ * When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
  *
- * destination must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
- * max_vertices and max_triangles can't exceed limits statically declared in meshopt_Meshlet (max_vertices <= 64, max_triangles <= 126)
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
+ * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
+ * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512)
+ * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
 MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
 
 struct meshopt_Bounds
@@ -426,10 +464,10 @@ struct meshopt_Bounds
  * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable.
  *
  * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
- * index_count should be less than or equal to 256*3 (the function assumes clusters of limited size)
+ * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size)
  */
 MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
-MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const struct meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
 /**
  * Experimental: Spatial sorter
@@ -513,6 +551,10 @@ inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices,
 template <typename T>
 inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
 template <typename T>
+inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
 inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
 template <typename T>
 inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
@@ -547,7 +589,9 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size
 template <typename T>
 inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
 template <typename T>
-inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
+template <typename T>
+inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
 template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
@@ -762,6 +806,24 @@ inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indi
 }
 
 template <typename T>
+inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{ // NOTE(review): meshopt_IndexAdapter is declared outside this hunk; presumably it bridges T-typed index buffers to the unsigned int overload - confirm
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count * 2); // sized for the index_count*2 elements the non-template overload is documented to write
+
+	meshopt_generateAdjacencyIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{ // NOTE(review): meshopt_IndexAdapter is declared outside this hunk; presumably it bridges T-typed index buffers to the unsigned int overload - confirm
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	meshopt_IndexAdapter<T> out(destination, 0, index_count * 4); // sized for the index_count*4 elements the non-template overload is documented to write
+
+	meshopt_generateTessellationIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
 inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
 {
 	meshopt_IndexAdapter<T> in(0, indices, index_count);
@@ -908,11 +970,19 @@ inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices
 }
 
 template <typename T>
-inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+	return meshopt_buildMeshlets(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, cone_weight);
+}
+
+template <typename T>
+inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
 {
 	meshopt_IndexAdapter<T> in(0, indices, index_count);
 
-	return meshopt_buildMeshlets(destination, in.data, index_count, vertex_count, max_vertices, max_triangles);
+	return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles);
 }
 
 template <typename T>
@@ -934,7 +1004,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_
 #endif
 
 /**
- * Copyright (c) 2016-2020 Arseny Kapoulkine
+ * Copyright (c) 2016-2021 Arseny Kapoulkine
  *
  * Permission is hereby granted, free of charge, to any person
  * obtaining a copy of this software and associated documentation
diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
index 942db14461..b2cb589462 100644
--- a/thirdparty/meshoptimizer/simplifier.cpp
+++ b/thirdparty/meshoptimizer/simplifier.cpp
@@ -131,7 +131,7 @@ struct PositionHasher
 static size_t hashBuckets2(size_t count)
 {
 	size_t buckets = 1;
-	while (buckets < count)
+	while (buckets < count + count / 4)
 		buckets *= 2;
 
 	return buckets;
diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp
index 2cbfaac367..5f3ec204ab 100644
--- a/thirdparty/meshoptimizer/vertexcodec.cpp
+++ b/thirdparty/meshoptimizer/vertexcodec.cpp
@@ -710,18 +710,12 @@ static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
 SIMD_TARGET
 static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
 {
-	v128_t mask_0 = wasm_v32x4_shuffle(mask, mask, 0, 2, 1, 3);
-
-	uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull;
-	uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull;
+	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+	const uint64_t magic = 0x000103070f1f3f80ull;
 
 	// TODO: This can use v8x16_bitmask in the future
-	uint64_t mask_2 = mask_1a | mask_1b;
-	uint64_t mask_4 = mask_2 | (mask_2 >> 16);
-	uint64_t mask_8 = mask_4 | (mask_4 >> 8);
-
-	mask0 = uint8_t(mask_8);
-	mask1 = uint8_t(mask_8 >> 32);
+	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
+	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
 }
 
 SIMD_TARGET
diff --git a/thirdparty/oidn/core/transfer_function.cpp b/thirdparty/oidn/core/transfer_function.cpp
index 487f0a9f75..ce5deca56b 100644
--- a/thirdparty/oidn/core/transfer_function.cpp
+++ b/thirdparty/oidn/core/transfer_function.cpp
@@ -24,10 +24,6 @@ namespace oidn {
   float AutoexposureNode::autoexposure(const Image& color)
   {
     assert(color.format == Format::Float3);
-// -- GODOT start --
-// We don't want to mess with TTB and we don't use autoexposure, so we disable this code
-#if 0
-// -- GODOT end --
 
     constexpr float key = 0.18f;
     constexpr float eps = 1e-8f;
@@ -42,61 +38,66 @@ namespace oidn {
     // Compute the average log luminance of the downsampled image
     using Sum = std::pair<float, int>;
 
-    Sum sum =
-      tbb::parallel_reduce(
-        tbb::blocked_range2d<int>(0, HK, 0, WK),
-        Sum(0.f, 0),
-        [&](const tbb::blocked_range2d<int>& r, Sum sum) -> Sum
+    // -- GODOT start --
+    // Sum sum =
+    //   tbb::parallel_reduce(
+    //     tbb::blocked_range2d<int>(0, HK, 0, WK),
+    //     Sum(0.f, 0),
+    //     [&](const tbb::blocked_range2d<int>& r, Sum sum) -> Sum
+    //     {
+    //       // Iterate over blocks
+    //       for (int i = r.rows().begin(); i != r.rows().end(); ++i)
+    //       {
+    //         for (int j = r.cols().begin(); j != r.cols().end(); ++j)
+    //         {
+
+    Sum sum = Sum(0.0f, 0);
+
+    for (int i = 0; i != HK; ++i)
+    {
+      for (int j = 0; j != WK; ++j)
+      {
+        // Compute the average luminance in the current block
+        const int beginH = int(ptrdiff_t(i)   * H / HK);
+        const int beginW = int(ptrdiff_t(j)   * W / WK);
+        const int endH   = int(ptrdiff_t(i+1) * H / HK);
+        const int endW   = int(ptrdiff_t(j+1) * W / WK);
+
+        float L = 0.f;
+
+        for (int h = beginH; h < endH; ++h)
         {
-          // Iterate over blocks
-          for (int i = r.rows().begin(); i != r.rows().end(); ++i)
+          for (int w = beginW; w < endW; ++w)
           {
-            for (int j = r.cols().begin(); j != r.cols().end(); ++j)
-            {
-              // Compute the average luminance in the current block
-              const int beginH = int(ptrdiff_t(i)   * H / HK);
-              const int beginW = int(ptrdiff_t(j)   * W / WK);
-              const int endH   = int(ptrdiff_t(i+1) * H / HK);
-              const int endW   = int(ptrdiff_t(j+1) * W / WK);
-
-              float L = 0.f;
-
-              for (int h = beginH; h < endH; ++h)
-              {
-                for (int w = beginW; w < endW; ++w)
-                {
-                  const float* rgb = (const float*)color.get(h, w);
-
-                  const float r = maxSafe(rgb[0], 0.f);
-                  const float g = maxSafe(rgb[1], 0.f);
-                  const float b = maxSafe(rgb[2], 0.f);
-
-                  L += luminance(r, g, b);
-                }
-              }
-
-              L /= (endH - beginH) * (endW - beginW);
-
-              // Accumulate the log luminance
-              if (L > eps)
-              {
-                sum.first += log2(L);
-                sum.second++;
-              }
-            }
+            const float* rgb = (const float*)color.get(h, w);
+
+            const float r = maxSafe(rgb[0], 0.f);
+            const float g = maxSafe(rgb[1], 0.f);
+            const float b = maxSafe(rgb[2], 0.f);
+
+            L += luminance(r, g, b);
           }
+        }
 
-          return sum;
-        },
-        [](Sum a, Sum b) -> Sum { return Sum(a.first+b.first, a.second+b.second); },
-        tbb::static_partitioner()
-      );
+        L /= (endH - beginH) * (endW - beginW);
+
+        // Accumulate the log luminance
+        if (L > eps)
+        {
+          sum.first += log2(L);
+          sum.second++;
+        }
+      }
+    }
+
+    //     return sum;
+    //   },
+    //   [](Sum a, Sum b) -> Sum { return Sum(a.first+b.first, a.second+b.second); },
+    //   tbb::static_partitioner()
+    // );
+    // -- GODOT end --
 
     return (sum.second > 0) ? (key / exp2(sum.first / float(sum.second))) : 1.f;
-// -- GODOT start --
-#endif
-    return 1.0;
-// -- GODOT end --
   }
 
 } // namespace oidn
diff --git a/thirdparty/oidn/patches/godot-changes-c58c5216.patch b/thirdparty/oidn/patches/godot-changes-c58c5216.patch
index 6a54703064..c01f00187b 100644
--- a/thirdparty/oidn/patches/godot-changes-c58c5216.patch
+++ b/thirdparty/oidn/patches/godot-changes-c58c5216.patch
@@ -280,28 +280,58 @@ index 8c2de09..ed8328c 100644
  namespace oidn {
  
 diff --git a/core/transfer_function.cpp b/core/transfer_function.cpp
-index 601f814..487f0a9 100644
+index 601f814..ce5deca 100644
 --- a/core/transfer_function.cpp
 +++ b/core/transfer_function.cpp
-@@ -24,6 +24,10 @@ namespace oidn {
-   float AutoexposureNode::autoexposure(const Image& color)
-   {
-     assert(color.format == Format::Float3);
-+// -- GODOT start --
-+// We don't want to mess with TTB and we don't use autoexposure, so we disable this code
-+#if 0
-+// -- GODOT end --
+@@ -38,16 +38,24 @@ namespace oidn {
+     // Compute the average log luminance of the downsampled image
+     using Sum = std::pair<float, int>;
+ 
+-    Sum sum =
+-      tbb::parallel_reduce(
+-        tbb::blocked_range2d<int>(0, HK, 0, WK),
+-        Sum(0.f, 0),
+-        [&](const tbb::blocked_range2d<int>& r, Sum sum) -> Sum
++    // -- GODOT start --
++    // Sum sum =
++    //   tbb::parallel_reduce(
++    //     tbb::blocked_range2d<int>(0, HK, 0, WK),
++    //     Sum(0.f, 0),
++    //     [&](const tbb::blocked_range2d<int>& r, Sum sum) -> Sum
++    //     {
++    //       // Iterate over blocks
++    //       for (int i = r.rows().begin(); i != r.rows().end(); ++i)
++    //       {
++    //         for (int j = r.cols().begin(); j != r.cols().end(); ++j)
++    //         {
++
++    Sum sum = Sum(0.0f, 0);
++
++    for (int i = 0; i != HK; ++i)
+     {
+-          // Iterate over blocks
+-          for (int i = r.rows().begin(); i != r.rows().end(); ++i)
+-          {
+-            for (int j = r.cols().begin(); j != r.cols().end(); ++j)
++      for (int j = 0; j != WK; ++j)
+       {
+         // Compute the average luminance in the current block
+         const int beginH = int(ptrdiff_t(i)   * H / HK);
+@@ -82,11 +90,12 @@ namespace oidn {
+       }
+     }
  
-     constexpr float key = 0.18f;
-     constexpr float eps = 1e-8f;
-@@ -89,6 +93,10 @@ namespace oidn {
-       );
+-          return sum;
+-        },
+-        [](Sum a, Sum b) -> Sum { return Sum(a.first+b.first, a.second+b.second); },
+-        tbb::static_partitioner()
+-      );
++    //     return sum;
++    //   },
++    //   [](Sum a, Sum b) -> Sum { return Sum(a.first+b.first, a.second+b.second); },
++    //   tbb::static_partitioner()
++    // );
++    // -- GODOT end --
  
      return (sum.second > 0) ? (key / exp2(sum.first / float(sum.second))) : 1.f;
-+// -- GODOT start --
-+#endif
-+    return 1.0;
-+// -- GODOT end --
    }
- 
- } // namespace oidn